In [1]:
# install java
!apt-get install openjdk-8-jdk-headless -qq > /dev/null

# install spark (change the version number if needed)
!wget -q https://archive.apache.org/dist/spark/spark-3.2.0/spark-3.2.0-bin-hadoop3.2.tgz

# unzip the spark file to the current folder
!tar xf spark-3.2.0-bin-hadoop3.2.tgz

# set your spark folder to your system path environment. 
import os
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"
os.environ["SPARK_HOME"] = "/content/spark-3.2.0-bin-hadoop3.2"

# install findspark using pip
!pip install -q findspark

# install pyspark
!pip3 install pyspark==3.2.0

# install graphframes
!pip3 install graphframes

Collecting pyspark==3.2.0
  Downloading pyspark-3.2.0.tar.gz (281.3 MB)
[K     |████████████████████████████████| 281.3 MB 34 kB/s 
[?25hCollecting py4j==0.10.9.2
  Downloading py4j-0.10.9.2-py2.py3-none-any.whl (198 kB)
[K     |████████████████████████████████| 198 kB 52.8 MB/s 
[?25hBuilding wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... [?25l[?25hdone
  Created wheel for pyspark: filename=pyspark-3.2.0-py2.py3-none-any.whl size=281805912 sha256=17294dfef5f173cfdf8ea5b7e947dc1fd753fc73413c226f76333dec7f2f4ceb
  Stored in directory: /root/.cache/pip/wheels/0b/de/d2/9be5d59d7331c6c2a7c1b6d1a4f463ce107332b1ecd4e80718
Successfully built pyspark
Installing collected packages: py4j, pyspark
Successfully installed py4j-0.10.9.2 pyspark-3.2.0
Collecting graphframes
  Downloading graphframes-0.6-py2.py3-none-any.whl (18 kB)
Collecting nose
  Downloading nose-1.3.7-py3-none-any.whl (154 kB)
[K     |████████████████████████████████| 154 kB 4.3 MB/s 
In

In [20]:
# Copy the GraphFrames jar into Spark's jars/ folder ($SPARK_HOME is the
# environment variable set in the setup cell).
# NOTE(review): no visible cell downloads this jar, so it presumably has to be
# uploaded to /content by hand before this cell runs — confirm.
!cp -v /content/graphframes-0.8.2-spark3.2-s_2.12.jar $SPARK_HOME/jars/

'/content/graphframes-0.8.2-spark3.2-s_2.12.jar' -> '/content/spark-3.2.0-bin-hadoop3.2/jars/graphframes-0.8.2-spark3.2-s_2.12.jar'


In [21]:
# import the packages
import findspark
import pandas as pd

# findspark's documented usage is to call init() BEFORE importing pyspark;
# calling it after the import (as this cell used to) cannot affect which
# pyspark gets loaded.
findspark.init()

# NOTE(review): wildcard imports are kept as-is because other cells may rely
# on names they bring in (e.g. SparkSession, GraphFrame).
from pyspark import *
from pyspark.sql import *
from graphframes import *

# Start a local Spark session (master "local[*]"), reusing one if it exists
spark = SparkSession.builder.master("local[*]").getOrCreate()

In [22]:
import psycopg2

In [23]:
# access the postgresql server
# Each connection setting can be overridden through an environment variable
# (CPDB_HOST, CPDB_PORT, CPDB_DATABASE, CPDB_USER, CPDB_PASSWORD) so that
# credentials do not have to be edited into the notebook. The fallbacks are
# the values this cell originally hard-coded, so behaviour is unchanged when
# none of the variables are set.
# NOTE(review): drop the fallback password once the variables are in use.
import os

conn = psycopg2.connect(
    host=os.environ.get("CPDB_HOST", "codd04.research.northwestern.edu"),
    port=os.environ.get("CPDB_PORT", "5433"),
    database=os.environ.get("CPDB_DATABASE", "postgres"),
    user=os.environ.get("CPDB_USER", "cpdbstudent"),
    password=os.environ.get("CPDB_PASSWORD", "DataSci4AI"),
)

In [26]:
# Open a cursor on the database connection; the query cells below execute
# their SQL and fetch rows through it.
cursor = conn.cursor()

In [27]:
# SQL that builds the edge list of the co-accusal graph.
# Statement by statement, as written below:
#   da_category_ids - ids of allegation categories whose category is
#                     'Drug / Alcohol Abuse' or 'Medical', whose
#                     allegation_name starts with 'Medical Roll', or whose
#                     category_code is in the listed set
#   da_cohort       - distinct officer_ids with an allegation in one of
#                     those categories
#   class_example   - officer pairs (src < dst) that share an allegation;
#                     relationship = number of distinct shared allegation ids
#   edgesq1         - class_example rows whose src is in da_cohort
# The final SELECT returns every edgesq1 row (src, dst, relationship).
# NOTE(review): the SQL header comment mentions "above average salary", but
# no statement below references salary — confirm which is intended.
# NOTE(review): only src is matched against da_cohort, so a pair in which
# only dst belongs to the cohort is dropped — confirm this is intended.
edges_query = '''--Co-accusals for officers with DAM allegations earning above average salary
DROP TABLE IF EXISTS da_category_ids;
CREATE TEMP TABLE da_category_ids AS (
    SELECT id
    FROM data_allegationcategory
    WHERE data_allegationcategory.category = 'Drug / Alcohol Abuse' OR data_allegationcategory.category = 'Medical' or allegation_name LIKE 'Medical Roll%'
    OR data_allegationcategory.category_code IN ('08J', '024', '003', '003A', '003B', '003C', '003D', '003E'));

DROP TABLE IF EXISTS da_cohort;
CREATE TEMP TABLE da_cohort AS (
    SELECT DISTINCT officer_id
    FROM data_officerallegation
    WHERE allegation_category_id IN (SELECT * from da_category_ids));

DROP TABLE IF EXISTS class_example;
CREATE TEMP TABLE class_example AS (
SELECT da1.officer_id src, da2.officer_id dst, COUNT(DISTINCT da1.allegation_id) relationship
FROM data_officerallegation da1
JOIN data_officerallegation da2 ON da1.allegation_id = da2.allegation_id AND da1.officer_id < da2.officer_id
GROUP BY da1.officer_id, da2.officer_id ORDER BY count(*) DESC);

DROP TABLE IF EXISTS edgesq1;
CREATE TEMP TABLE edgesq1 AS (
SELECT src,dst,relationship FROM class_example
JOIN da_cohort ON da_cohort.officer_id = class_example.src);

select * from edgesq1
'''

In [16]:
# SQL that builds the vertex list of the co-accusal graph.
# It repeats the same temp-table setup as edges_query (da_category_ids,
# da_cohort, class_example, edgesq1), so it can run on its own, and then
# selects id, "first_name last_name" as officer_name, and allegation_count
# from data_officer for every officer id that appears as src or dst in edgesq1.
# NOTE(review): the SQL header comment mentions "above average salary", but
# no statement below references salary — confirm which is intended.
nodes_query = '''
--Co-accusals for officers with DAM allegations earning above average salary
DROP TABLE IF EXISTS da_category_ids;
CREATE TEMP TABLE da_category_ids AS (
    SELECT id
    FROM data_allegationcategory
    WHERE data_allegationcategory.category = 'Drug / Alcohol Abuse' OR data_allegationcategory.category = 'Medical' or allegation_name LIKE 'Medical Roll%'
    OR data_allegationcategory.category_code IN ('08J', '024', '003', '003A', '003B', '003C', '003D', '003E'));

DROP TABLE IF EXISTS da_cohort;
CREATE TEMP TABLE da_cohort AS (
    SELECT DISTINCT officer_id
    FROM data_officerallegation
    WHERE allegation_category_id IN (SELECT * from da_category_ids));

DROP TABLE IF EXISTS class_example;
CREATE TEMP TABLE class_example AS (
SELECT da1.officer_id src, da2.officer_id dst, COUNT(DISTINCT da1.allegation_id) relationship
FROM data_officerallegation da1
JOIN data_officerallegation da2 ON da1.allegation_id = da2.allegation_id AND da1.officer_id < da2.officer_id
GROUP BY da1.officer_id, da2.officer_id ORDER BY count(*) DESC);

DROP TABLE IF EXISTS edgesq1;
CREATE TEMP TABLE edgesq1 AS (
SELECT src,dst,relationship FROM class_example
JOIN da_cohort ON da_cohort.officer_id = class_example.src);

SELECT  id, first_name || ' ' || last_name officer_name, allegation_count FROM data_officer
where id in (select src from edgesq1) or id in (select dst from edgesq1);


'''

In [28]:
import pandas as pd

In [29]:
# Run the edge query and pull every row of its final SELECT into memory
cursor.execute(edges_query)
edges = cursor.fetchall()
print("shape is: " + str(len(edges)))  # number of edge rows fetched

# Take the column names from the cursor metadata and hand them to the
# DataFrame constructor. Assigning .columns afterwards fails with a length
# mismatch when the query returns no rows, because the empty frame has no
# columns to rename.
colnames = [desc[0] for desc in cursor.description]
df_edges = pd.DataFrame(edges, columns=colnames)

print(df_edges)

shape is: 22018
         src    dst  relationship
0          9  12641             1
1         17  26456             2
2         17  10656             2
3         17  22523             1
4         17  20246             1
...      ...    ...           ...
22013  32346  32351             2
22014  32346  32377             3
22015  32397  32419             1
22016  32397  32406             1
22017  32527  33722             2

[22018 rows x 3 columns]


In [30]:
# Run the node query and pull every row of its final SELECT into memory
cursor.execute(nodes_query)
nodes = cursor.fetchall()
print("shape is: " + str(len(nodes)))  # number of officer rows fetched

# Take the column names from the cursor metadata and hand them to the
# DataFrame constructor. Assigning .columns afterwards fails with a length
# mismatch when the query returns no rows, because the empty frame has no
# columns to rename.
colnames = [desc[0] for desc in cursor.description]
df_nodes = pd.DataFrame(nodes, columns=colnames)

print(df_nodes)

shape is: 8167
         id       officer_name  allegation_count
0     32312       Randall Ryan                10
1     32358        Kevin Stoll                24
2     33700        Terra Ester                 4
3     32461     Albert Krueger                 9
4     33051       Doris Haynes                11
...     ...                ...               ...
8162  32398     Thomas Waldera                 8
8163  32098  Christoph Kennedy                 8
8164  25962     Joseph Seinitz                31
8165  25732     Andrew Schoeff                24
8166  27439  Robert Stegmiller                62

[8167 rows x 3 columns]


In [31]:
# Convert the pandas edge table (src, dst, relationship) into a Spark DataFrame
edges_ = spark.createDataFrame(df_edges)

In [32]:
# Convert the pandas vertex table (id, officer_name, allegation_count) into a
# Spark DataFrame. Note: this rebinds `nodes`, which previously held the raw
# row list returned by cursor.fetchall().
nodes = spark.createDataFrame(df_nodes)

In [33]:
# Build the graph from the vertex DataFrame (`nodes`, keyed by `id`) and the
# edge DataFrame (`edges_`, with `src`/`dst` columns).
cpdb = GraphFrame(nodes, edges_)

In [34]:
# Print a preview of the graph's vertex table
cpdb.vertices.show()

+-----+--------------------+----------------+
|   id|        officer_name|allegation_count|
+-----+--------------------+----------------+
|32312|        Randall Ryan|              10|
|32358|         Kevin Stoll|              24|
|33700|         Terra Ester|               4|
|32461|      Albert Krueger|               9|
|33051|        Doris Haynes|              11|
|32536|       Edmund Leracz|               4|
|25219|      Linda Salustro|               8|
| 3613|       Patricia Cain|               4|
|31793|      Adam Aleszczyk|               3|
|32527|         Vance Henry|               6|
|32005|          Tony Green|              11|
|32188|     Elmore Metcalfe|              27|
|32308|        Giselle Ruiz|               2|
| 2735|Charles Breckenridge|              10|
|17891|      Billy Mc Bride|               1|
|13065|       James Jackson|              22|
|32549|  Michael Williamson|               1|
|33056|     Luevorne Brooks|              10|
|28428|       Robert Thomas|      

In [35]:
# Print a preview of the graph's edge table
cpdb.edges.show()

+---+-----+------------+
|src|  dst|relationship|
+---+-----+------------+
|  9|12641|           1|
| 17|26456|           2|
| 17|10656|           2|
| 17|22523|           1|
| 17|20246|           1|
| 17|15351|           1|
| 17|12521|           1|
| 17| 9373|           1|
| 17| 6885|           1|
| 17| 4494|           1|
| 17|32102|           1|
| 17|28925|           1|
| 17|27976|           1|
| 54|27802|           1|
| 54|29189|           1|
| 54|30398|           1|
| 54|31550|           1|
| 54|31756|           1|
| 54| 4168|           1|
| 54|11252|           1|
+---+-----+------------+
only showing top 20 rows



In [36]:
# Preview the vertex table ordered by officer id, lowest first
cpdb.vertices.orderBy("id").show()

+---+------------------+----------------+
| id|      officer_name|allegation_count|
+---+------------------+----------------+
|  9|     Michael Abbey|               3|
| 17|   Moulay Abdullah|              25|
| 54|      Jeff Acevedo|              13|
| 59|    Edward Acevedo|              14|
|117|       James Adams|              18|
|156|      John Adreani|              32|
|160|     Mahir Affaneh|              12|
|197|   Joseph Aguinaga|              18|
|198|     John Aguinaga|              21|
|200|  Richard Aguinaga|              16|
|223| Deborah Ahmad Bey|               9|
|237|     Arlene Ajello|              19|
|241|    Willie Akerson|              29|
|325| Darlene Alexander|               7|
|335|    John Alexander|               1|
|361|        John Allan|              21|
|364|Michael Allegretti|              28|
|368|     Valerie Allen|               8|
|381|      Sherry Allen|               1|
|396|     Michael Allen|               7|
+---+------------------+----------

In [37]:
# Run GraphFrames' triangleCount on the graph; the result carries a `count`
# column per vertex (presumably the number of triangles through that vertex —
# confirm against the GraphFrames docs).
tc_cpdb = cpdb.triangleCount()
# Show only the vertex id next to its count
tc_cpdb.select("id", "count").show()

+-----+-----+
|   id|count|
+-----+-----+
|32188|    7|
|32529|    1|
|32536|    0|
|32308|    1|
|32005|    0|
|33056|    3|
|31793|    0|
|32527|    0|
|17891|   78|
|13065|    0|
| 2735|   10|
|28428|    0|
|32549|    0|
|33049|   17|
|32461|    0|
|25219|    0|
|32312|    0|
|32358|    0|
| 3613|    0|
|33051|    0|
+-----+-----+
only showing top 20 rows



In [38]:
# PageRank over the co-accusal graph (reset probability 0.15, tolerance 0.01).
# NOTE(review): edges_query only emits pairs with src < dst, so every edge
# points from the lower to the higher officer id. If pageRank follows edge
# direction, scores are skewed toward high ids — confirm, and consider adding
# the reversed edges before ranking.
pr_cpdb = cpdb.pageRank(tol=0.01, resetProbability=0.15)

# vertices listed from highest to lowest pagerank score
pr_cpdb.vertices.sort("pagerank", ascending=False).show()
# edge table of the PageRank result graph
pr_cpdb.edges.show()

+-----+-----------------+----------------+------------------+
|   id|     officer_name|allegation_count|          pagerank|
+-----+-----------------+----------------+------------------+
|29955|     Carl Walston|               1| 8.530227585167191|
|23005|       Paul Price|               1|4.6109338298201035|
|32032|    Brian Hawkins|              49| 4.524007245073958|
|32405|     Henry Walton|              16| 3.426605063309718|
|29035|     Edward Tures|              14| 3.315799479059275|
|32198|  Raphael Mitchem|              52|3.2925073407632053|
|22480|   Edwin Phillips|               1| 3.251604314428484|
|31265|      Lee Wozniak|              12| 3.220324514177345|
|31906|      James Davis|              76|2.8454226202732715|
|31563|Lawrence Zaragoza|              13| 2.740164442446008|
|31115|  Andrew Wojnicki|               3|2.6051990192559016|
|31497|        John Zago|               7|2.5702751555354864|
|19154|    Raymond Mills|               1| 2.536867354149699|
|30603| 