In [1]:
import datetime
import os
from functools import reduce

import matplotlib.pyplot as plt
import networkx as nx
from pyspark import SparkFiles
from pyspark.sql.functions import col, lit, when
from graphframes import *

In [2]:
def PlotGraph(edge_list):
    """Draw an undirected NetworkX graph from the first rows of ``edge_list``.

    ``edge_list`` must expose ``select('src', 'dst').take(n)`` and yield rows
    indexable by ``'src'`` and ``'dst'``. At most 1000 rows are drawn.
    Nothing is returned; the figure is drawn with matplotlib.
    """
    sampled_rows = edge_list.select('src', 'dst').take(1000)

    Gplot = nx.Graph()
    Gplot.add_edges_from((row['src'], row['dst']) for row in sampled_rows)

    # Draw into the first cell of a 1-row, 2-column subplot grid.
    plt.subplot(121)
    nx.draw(Gplot, with_labels=True, font_weight='bold')

# Graph Construction

## Loading edges and vertices from files
* The edge-construction query takes a long time to run, so loading the saved files saves a lot of time if the cluster is terminated

In [5]:
# CSV reader settings shared by the edge and vertex loads below.
infer_schema = "false"
first_row_is_header = "true"
delimiter = ","

# Previously saved edge list for the graph.
edges_df = (
    spark.read.format("csv")
    .option("inferSchema", infer_schema)
    .option("header", first_row_is_header)
    .option("sep", delimiter)
    .load("/FileStore/tables/q2_edges.csv")
)

# Previously saved vertex list for the graph.
vertices_df = (
    spark.read.format("csv")
    .option("inferSchema", infer_schema)
    .option("header", first_row_is_header)
    .option("sep", delimiter)
    .load("/FileStore/tables/q2_vertices.csv")
)

### create the graph

In [7]:
# Build the GraphFrame from the current `vertices_df` and `edges_df`;
# every analysis cell below queries this graph through `g`.
g = GraphFrame(vertices_df, edges_df)

## Connect to the remote CPDP database

In [9]:
# Connection settings for the remote CPDP PostgreSQL database.
# Each value can be overridden with an environment variable so credentials do
# not have to be edited into the notebook; the literals are only fallbacks that
# keep existing runs working.
# NOTE(review): the fallback password is still committed in source -- move it to
# a secret store (e.g. Databricks secrets), drop the fallback and rotate it.
url = os.environ.get(
    "CPDP_JDBC_URL",
    "jdbc:postgresql://cpdb-databricks.cgod7egsd6vr.us-east-2.rds.amazonaws.com/cpdb",
)
user = os.environ.get("CPDP_DB_USER", "data_sci")
pwd = os.environ.get("CPDP_DB_PASSWORD", "dataSci4lyf")
driver = "org.postgresql.Driver"

# Base JDBC reader shared by every query in this notebook. The chain is
# parenthesised so no trailing line-continuation backslash is needed.
reader = (
    spark.read.format("jdbc")
    .option("driver", driver)
    .option("url", url)
    .option("user", user)
    .option("password", pwd)
)

def cpdp_read(query):
  """Load ``query`` from the CPDP database and return the resulting DataFrame.

  ``query`` is passed to the shared JDBC ``reader`` as its ``dbtable`` option;
  the queries in this notebook are parenthesised, aliased sub-selects.
  """
  table_reader = reader.option("dbtable", query)
  return table_reader.load()

## Query for awards
* Filtering by officers in our subset
  - subset is officers appointed between `01-01-2000` and `12-31-2007`
* Filtering awards by status `Final`

In [11]:
# Awards received by officers in the study subset.
# - The `officer_subset` CTE keeps officers with active = 'Yes' whose
#   appointed_date lies between 2000-01-01 and 2007-12-31.
# - The outer SELECT joins data_award to that subset and keeps only awards
#   whose current_status is 'Final'; only the subset's `id` is used there.
# - The statement is wrapped in parentheses and aliased (`awards`) because
#   cpdp_read passes it as the JDBC `dbtable` option.
awards_query = """
(WITH officer_subset AS (
    SELECT o.id, o.first_name, o.last_name, o.birth_year, o.appointed_date,
          date_part('year', '2018-01-01'::DATE) - o.birth_year as estimated_age,
          ('2018-01-01'::DATE - o.appointed_date) / 365 as years_on_force,
          COUNT(a.id) as allegation_count
    FROM data_officer o
    LEFT JOIN data_officerallegation a on o.id = a.officer_id
    WHERE active = 'Yes'
        AND appointed_date BETWEEN '2000-01-01' AND '2007-12-31'
    GROUP BY o.id
    ORDER BY years_on_force DESC
) SELECT
       award.id as award_id,
       award.award_type,
       award.start_date as award_date,
       subset.id as officer_id
FROM data_award award
INNER JOIN officer_subset subset ON subset.id = award.officer_id
WHERE award.current_status = 'Final') awards
"""
# Result columns: award_id, award_type, award_date, officer_id.
awards_df = cpdp_read(awards_query)

## Query for allegations
- filtering for allegations committed by officers in the subset
- only looking at allegation categories of violent misconduct

In [13]:
# Allegations filed against officers in the study subset.
# - The `officer_subset` CTE keeps officers with active = 'Yes' whose
#   appointed_date lies between 2000-01-01 and 2007-12-31.
# - The outer SELECT returns one row per data_officerallegation record, joined
#   to its allegation and category, for officers in that subset.
# - Rows are kept only when the incident happened on or after the officer's
#   appointed_date and the category is not one of the six excluded categories
#   listed in the NOT IN clause.
# - Wrapped in parentheses and aliased (`allegations`) because cpdp_read passes
#   it as the JDBC `dbtable` option.
allegations_query = """
(WITH officer_subset AS (
    SELECT o.id, o.first_name, o.last_name, o.birth_year, o.appointed_date,
          date_part('year', '2018-01-01'::DATE) - o.birth_year as estimated_age,
          ('2018-01-01'::DATE - o.appointed_date) / 365 as years_on_force,
          COUNT(a.id) as allegation_count
    FROM data_officer o
    LEFT JOIN data_officerallegation a on o.id = a.officer_id
    WHERE active = 'Yes'
        AND appointed_date BETWEEN '2000-01-01' AND '2007-12-31'
    GROUP BY o.id
    ORDER BY years_on_force DESC
) SELECT
       officer_allegation.id as officer_allegation_id,
       allegation.id as allegation_id,
       subset.id as officer_id,
       allegation.incident_date as incident_date,
       category.category as category_type,
       category.allegation_name as allegation_name
FROM data_officerallegation officer_allegation
INNER JOIN data_allegation allegation on officer_allegation.allegation_id = allegation.id
INNER JOIN data_allegationcategory category on officer_allegation.allegation_category_id = category.id
INNER JOIN officer_subset subset ON subset.id = officer_allegation.officer_id
WHERE
      allegation.incident_date >= subset.appointed_date
      AND category.category NOT IN ('Operation/Personnel Violations',
                       'Lockup Procedures',
                       'Traffic',
                       'Supervisory Responsibilities',
                       'Unknown',
                       'Medical')) allegations
"""
# Result columns: officer_allegation_id, allegation_id, officer_id,
# incident_date, category_type, allegation_name.
allegations_df = cpdp_read(allegations_query)

## Create Vertices
- CSV files can be found in `/src/data/`
- Get distinct `award_type` and `allegation_category`
- assemble vertices `(id, vertex_type, value)`
  - type is either `allegation_category` or `award_type`

In [15]:
# Every distinct award_type present in data_award (not restricted to the
# officer subset). The query has no ORDER BY, so the row order is whatever the
# database returns.
award_type_query = """
(SELECT DISTINCT award_type
FROM data_award) award_type
"""
award_type_df = cpdp_read(award_type_query)

In [16]:
# Every distinct allegation category except the six excluded ones -- the same
# NOT IN list that allegations_query uses. The query has no ORDER BY, so the
# row order is whatever the database returns.
allegation_categories_query = """
(SELECT DISTINCT category
FROM data_allegationcategory
WHERE category NOT IN ('Operation/Personnel Violations',
                       'Lockup Procedures',
                       'Traffic',
                       'Supervisory Responsibilities',
                       'Unknown',
                       'Medical')) allegation_categories
"""
allegation_categories_df = cpdp_read(allegation_categories_query)

In [17]:
# create vertices
# Award types are numbered 100, 101, ... and allegation categories 200, 201, ...
# in the order the rows come back from the database.
# NOTE(review): the DISTINCT queries have no ORDER BY, so these ids are only
# stable if the database happens to return rows in the same order each run.
award_types = award_type_df.rdd.collect()
award_types_list = [row['award_type'] for row in award_types]
award_types_id_map = {100 + idx: row for idx, row in enumerate(award_types)}

allegation_categories = allegation_categories_df.rdd.collect()
allegation_categories_list = [row['category'] for row in allegation_categories]
allegation_categories_id_map = {
    200 + idx: row for idx, row in enumerate(allegation_categories)
}

# Vertex tuples are (id, vertex_type, value): award types first, then categories.
verts = [
    (vertex_id, 'award_type', row['award_type'])
    for vertex_id, row in award_types_id_map.items()
] + [
    (vertex_id, 'allegation_category', row['category'])
    for vertex_id, row in allegation_categories_id_map.items()
]

vertices = sqlContext.createDataFrame(verts, ['id', 'vertex_type', 'value'])
vertices.write.mode('overwrite').save("vertices")

## Create Edges
- **NOTE**: These cells take a very long time to run (hours)
  * CSV files can be found in `/src/data/`
- an edge is an award that is given within 60 days of an allegation
  * `(src, dst, award_id, officer_id, award_date, allegation_id, allegation_incident_date)`

In [19]:
# create edges
def _as_datetime(value):
  """Promote a ``date`` to a midnight ``datetime`` so dates and timestamps compare."""
  if isinstance(value, datetime.datetime):
    return value
  return datetime.datetime.combine(value, datetime.time.min)


def _build_edges(allegation_rows, award_rows, category_ids, award_type_ids, window_days=60):
  """Pair each allegation with the same officer's awards that follow it.

  An award matches an allegation when both belong to the same officer and
  ``incident_date < award_date <= incident_date + window_days``.

  Parameters
  ----------
  allegation_rows : iterable of rows indexable by ``officer_id``,
      ``incident_date``, ``category_type`` and ``allegation_id``.
  award_rows : iterable of rows indexable by ``officer_id``, ``award_date``,
      ``award_type`` and ``award_id``.
  category_ids : dict mapping allegation category name -> source vertex id.
  award_type_ids : dict mapping award type name -> destination vertex id.
  window_days : length of the matching window in days (default 60).

  Returns
  -------
  list of tuples ``(src, dst, award_id, officer_id, award_date,
  allegation_id, allegation_incident_date)``.

  Raises
  ------
  KeyError if a category or award type has no vertex id.
  """
  # Index the awards by officer once, so each allegation only scans that
  # officer's awards. Awards without a date can never fall in a window.
  awards_by_officer = {}
  for award in award_rows:
    if award['award_date'] is not None:
      awards_by_officer.setdefault(award['officer_id'], []).append(award)

  window = datetime.timedelta(days=window_days)
  built = []
  for allegation in allegation_rows:
    incident_date = allegation['incident_date']
    window_start = _as_datetime(incident_date)
    window_end = window_start + window
    src_id = category_ids[allegation['category_type']]
    for award in awards_by_officer.get(allegation['officer_id'], ()):
      # NOTE(review): dates are promoted to midnight datetimes to mirror the
      # date-vs-timestamp comparison the Spark filter performed; assumes the
      # collected values are timezone-naive -- spot-check edge counts against
      # a previous run.
      if window_start < _as_datetime(award['award_date']) <= window_end:
        built.append((src_id, award_type_ids[award['award_type']], award['award_id'],
                      award['officer_id'], award['award_date'],
                      allegation['allegation_id'], incident_date))
  return built


# Name -> vertex id lookups; setdefault keeps the first position of a name,
# matching what list.index() returned.
category_ids = {}
for category_idx, category_name in enumerate(allegation_categories_list):
  category_ids.setdefault(category_name, 200 + category_idx)

award_type_ids = {}
for award_type_idx, award_type_name in enumerate(award_types_list):
  award_type_ids.setdefault(award_type_name, 100 + award_type_idx)

allegations = allegations_df.rdd.collect()
# Read the awards once instead of launching one filtered Spark/JDBC job per
# allegation, which is what made this cell take hours.
awards = awards_df.rdd.collect()
edges = _build_edges(allegations, awards, category_ids, award_type_ids)

edges_df = sqlContext.createDataFrame(edges, ['src', 'dst', 'award_id', 'officer_id', 'award_date', 'allegation_id', 'allegation_incident_date'])
edges_df.write.mode('overwrite').save("edges")

# Analysis

## Total Counts of award
- for reference in the analysis

In [22]:
# Number of awards per award_type for officers in the study subset, sorted by
# count in descending order.
# - The `officer_subset` CTE keeps officers with active = 'Yes' whose
#   appointed_date lies between 2000-01-01 and 2007-12-31.
# - The CTE also computes `repeater_start_date` via a correlated sub-select,
#   but the outer SELECT never references it.
# - NOTE(review): unlike awards_query, there is no current_status = 'Final'
#   filter here -- confirm that counting all statuses is intended.
award_grouping_query = """
(WITH officer_subset AS (SELECT o.id, o.first_name, o.last_name, o.birth_year, o.appointed_date,
          date_part('year', '2018-01-01'::DATE) - o.birth_year as estimated_age,
          ('2018-01-01'::DATE - o.appointed_date) / 365 as years_on_force,
          COUNT(a.id) as allegation_count,
           (SELECT da.incident_date
               FROM data_officerallegation oa
               JOIN data_allegation da on oa.allegation_id = da.id
                JOIN data_allegationcategory category on oa.allegation_category_id = category.id
               WHERE o.id = oa.officer_id
                AND category.category NOT IN ('Operation/Personnel Violations',
                       'Lockup Procedures',
                       'Traffic',
                       'Supervisory Responsibilities',
                       'Unknown',
                       'Medical')
               LIMIT 1
               OFFSET 10
            ) as repeater_start_date
    FROM data_officer o
    LEFT JOIN data_officerallegation a on o.id = a.officer_id
    WHERE active = 'Yes'
        AND appointed_date BETWEEN '2000-01-01' AND '2007-12-31'
    GROUP BY o.id
    ORDER BY years_on_force DESC)
SELECT award_type, count(*)
FROM data_award
INNER JOIN officer_subset subset ON subset.id = data_award.officer_id
GROUP BY award_type
ORDER BY count(*) DESC) award_grouping
"""
award_grouping_df = cpdp_read(award_grouping_query)

## In-Degrees
* Analyzing which awards have the most incoming edges

In [24]:
# Rank vertices by in-degree, highest first, and label each id with its award
# type. Ids come back from the graph and are converted with int() before the
# lookup in award_types_id_map.
inDegrees_df = g.inDegrees.orderBy('inDegree', ascending=False)
inDegrees = [
    (award_types_id_map[int(row.id)].award_type, row.id, row.inDegree)
    for row in inDegrees_df.rdd.collect()
]

# Rebuild as a labelled DataFrame and show the ten most-connected award types.
inDegrees_df = sqlContext.createDataFrame(inDegrees, ['award_type', 'id', 'inDegree'])
display(inDegrees_df.limit(10))

award_type,id,inDegree
Honorable Mention,143,19986
Department Commendation,137,1297
Complimentary Letter,130,1065
Emblem Of Recognition - Physical Fitness,122,587
Attendance Recognition Award,126,542
2009 Crime Reduction Award,104,321
Presidential Election Deployment Award 2008,124,274
Deployment Operations Center Award,140,263
Nato Summit Service Award,128,233
Unit Meritorious Performance Award,135,180


## Out-Degree
- Analyzing which allegations link to the most awards

In [26]:
# Rank vertices by out-degree, highest first, and label each id with its
# allegation category via allegation_categories_id_map.
outDegrees_df = g.outDegrees.orderBy('outDegree', ascending=False)
outDegrees = [
    (allegation_categories_id_map[int(row.id)].category, row.id, row.outDegree)
    for row in outDegrees_df.rdd.collect()
]

# Rebuild as a labelled DataFrame and show the ten most-connected categories.
outDegrees_df = sqlContext.createDataFrame(outDegrees, ['allegation_category', 'id', 'outDegree'])
display(outDegrees_df.limit(10))

allegation_category,id,outDegree
Illegal Search,208,10134
Use Of Force,205,8865
False Arrest,202,4148
Verbal Abuse,210,850
Domestic,204,715
Criminal Misconduct,201,421
Conduct Unbecoming (Off-Duty),200,222
Bribery / Official Corruption,212,197
Drug / Alcohol Abuse,211,23
Excessive Force,203,10


In [27]:
from pyspark.sql.functions import *

def edges_for_src(src):
  """Return the edges that leave vertex ``src``, labelled with their award type.

  Parameters
  ----------
  src : int or str
      Id of the source vertex, e.g. ``208`` or ``"208"``.

  Returns
  -------
  DataFrame with columns ``dst``, ``award_type`` and ``award_id``, one row per
  matching edge.

  Raises
  ------
  ValueError if ``src`` is not an integer id.
  KeyError if an edge's ``dst`` is not in ``award_types_id_map``.
  """
  # int() lets callers pass either an int or a numeric string and guarantees
  # that only a number is spliced into the filter expression.
  src_id = int(src)
  subg = g.filterEdges("src = " + str(src_id)).dropIsolatedVertices()
  edges = [
      (row.dst, award_types_id_map[int(row.dst)].award_type, int(row.award_id))
      for row in subg.edges.rdd.collect()
  ]

  edges_df = sqlContext.createDataFrame(edges, ['dst', 'award_type', 'award_id'])
  return edges_df

### Illegal Search to Awards
- Analyzing which awards are given within 60 days of an illegal search allegation

In [29]:
# Edges leaving vertex 208, which the out-degree table lists as "Illegal Search".
is_edges = edges_for_src("208")
display(is_edges)

dst,award_type,award_id
143,Honorable Mention,514253
143,Honorable Mention,514254
130,Complimentary Letter,514255
143,Honorable Mention,514247
143,Honorable Mention,514244
143,Honorable Mention,514245
143,Honorable Mention,514244
143,Honorable Mention,514245
143,Honorable Mention,514244
143,Honorable Mention,514245


In [30]:
# Five most frequent award types among the edges in is_edges.
display(
    is_edges
    .groupBy('award_type')
    .agg(count('award_id').alias('count'))
    .orderBy('count', ascending=False)
    .limit(5)
)

award_type,count
Honorable Mention,8055
Department Commendation,518
Complimentary Letter,347
Emblem Of Recognition - Physical Fitness,224
Attendance Recognition Award,194


### Use of Force to Awards
- Analyzing which awards are given within 60 days of a use of force allegation

In [32]:
# Edges leaving vertex 205, which the out-degree table lists as "Use Of Force".
uof_edges = edges_for_src("205")
display(uof_edges)

dst,award_type,award_id
143,Honorable Mention,514259
130,Complimentary Letter,514260
143,Honorable Mention,514253
143,Honorable Mention,514254
130,Complimentary Letter,514255
143,Honorable Mention,514241
143,Honorable Mention,514237
143,Honorable Mention,514238
143,Honorable Mention,514239
143,Honorable Mention,646155


In [33]:
# Five most frequent award types among the edges in uof_edges.
display(
    uof_edges
    .groupBy('award_type')
    .agg(count('award_id').alias('count'))
    .orderBy('count', ascending=False)
    .limit(5)
)

award_type,count
Honorable Mention,6845
Department Commendation,461
Complimentary Letter,415
Attendance Recognition Award,197
Emblem Of Recognition - Physical Fitness,177


### Allegation Category to Department Commendation awards
- Analyzing which allegations precede Department Commendation awards

In [35]:
from pyspark.sql.functions import *

def edges_for_dst(dst):
  """Return the edges that arrive at vertex ``dst``, labelled with their allegation category.

  Parameters
  ----------
  dst : int or str
      Id of the destination vertex, e.g. ``137`` or ``"137"``.

  Returns
  -------
  DataFrame with columns ``dst``, ``category`` and ``allegation_id``, one row
  per matching edge.

  Raises
  ------
  ValueError if ``dst`` is not an integer id.
  KeyError if an edge's ``src`` is not in ``allegation_categories_id_map``.
  """
  # int() lets callers pass either an int or a numeric string and guarantees
  # that only a number is spliced into the filter expression.
  dst_id = int(dst)
  subg = g.filterEdges("dst = " + str(dst_id)).dropIsolatedVertices()
  edges = [
      (row.dst, allegation_categories_id_map[int(row.src)].category, int(row.allegation_id))
      for row in subg.edges.rdd.collect()
  ]

  edges_df = sqlContext.createDataFrame(edges, ['dst', 'category', 'allegation_id'])
  return edges_df

In [36]:
# Edges arriving at vertex 137, which the in-degree table lists as
# "Department Commendation".
dc_df = edges_for_dst("137")
display(dc_df)

dst,category,allegation_id
137,Illegal Search,63698
137,Use Of Force,62798
137,Use Of Force,95188
137,Use Of Force,73072
137,Use Of Force,57770
137,Illegal Search,47702
137,Illegal Search,82244
137,False Arrest,82583
137,Illegal Search,81168
137,False Arrest,75666


In [37]:
# Five most frequent allegation categories among the edges in dc_df.
display(
    dc_df
    .groupBy('category')
    .agg(count('allegation_id').alias('count'))
    .orderBy('count', ascending=False)
    .limit(5)
)

category,count
Illegal Search,518
Use Of Force,461
False Arrest,217
Domestic,46
Verbal Abuse,31


### Allegation Category to Complimentary Letter
- Analyzing which allegations precede Complimentary Letter awards

In [39]:
# Edges arriving at vertex 130, which the in-degree table lists as
# "Complimentary Letter".
cl_df = edges_for_dst("130")
display(cl_df)

dst,category,allegation_id
130,Conduct Unbecoming (Off-Duty),84637
130,Use Of Force,84225
130,Verbal Abuse,69140
130,Use Of Force,68608
130,Illegal Search,68105
130,Illegal Search,79008
130,Illegal Search,65321
130,Illegal Search,65321
130,Use Of Force,90551
130,Use Of Force,95547


In [40]:
# Five most frequent allegation categories among the edges in cl_df.
display(
    cl_df
    .groupBy('category')
    .agg(count('allegation_id').alias('count'))
    .orderBy('count', ascending=False)
    .limit(5)
)

category,count
Use Of Force,415
Illegal Search,347
False Arrest,162
Verbal Abuse,50
Domestic,42


### Allegation Category to 2009 Crime Reduction Award

In [42]:
# Edges arriving at vertex 104, which the in-degree table lists as
# "2009 Crime Reduction Award", summarised as the five most frequent
# allegation categories.
cra_df = edges_for_dst("104")
display(
    cra_df
    .groupBy('category')
    .agg(count('allegation_id').alias('count'))
    .orderBy('count', ascending=False)
    .limit(5)
)

category,count
Use Of Force,139
Illegal Search,76
False Arrest,60
Verbal Abuse,18
Domestic,13
