In [1]:
from pyspark import SparkContext
sc = SparkContext.getOrCreate()

In [2]:
from pyspark.sql import SparkSession
ss = SparkSession.builder.getOrCreate()

In [3]:
import os
import boto3

In [4]:
sfpd_filename = "data/sfpd.csv"
sfpd_rdd = sc.textFile(sfpd_filename)

In [5]:
# # aws
# aws_filename = "my_aws.txt"
# aws_id_key = sc.textFile(aws_filename).map(lambda x: x.split(',')).collect()[0]
# access_id = aws_id_key[0]
# access_key = aws_id_key[1]

In [6]:
# bucket_name = 'msds697jonross.and.friends' # Add your bucket name
# file_name = 'sfpd.csv' # select file
# s3 = boto3.resource('s3',
#                      aws_access_key_id=access_id,
#                      aws_secret_access_key=access_key)
# bucket = s3.Bucket(bucket_name) 
# obj = bucket.Object(key=file_name) # S3 uses key-value structure where key is your file name
# file_content = obj.get()["Body"].read().decode("utf-8") # Read the Body which is the contents of the file.

In [7]:
# file_content[1:100] # test if I get the file

In [8]:
# content_list = file_content.split("\n")
# content_list[0]

In [9]:
# sfpd_rdd = sc.parallelize(content_list) # get the rdd

In [10]:
# # This cell of code converts Unicode to ASCII
# rdd = rdd.map(lambda line : line.encode('ascii', 'ignore'))

In [11]:
# Get column list
columns = sfpd_rdd.take(1)
column_names_list = columns[0].split(",")
column_names_list

['unique_key',
 'category',
 'descript',
 'dayofweek',
 'pddistrict',
 'resolution',
 'address',
 'longitude',
 'latitude',
 'location',
 'pdid',
 'timestamp']

In [12]:
# Load the data
sfpd_data_rdd = sfpd_rdd.filter(lambda x: x.split(',')[0] != 'unique_key')
sfpd_data_rdd.take(10)

['166018573,LARCENY/THEFT,GRAND THEFT FROM LOCKED AUTO,Sunday,,NONE,100 Block of VELASCO AV,-122.4133519852842,37.70820245849022,"(37.70820245849022, -122.4133519852842)",16601857306244,2016-01-17 23:54:00+00:00',
 '160874408,DRIVING UNDER THE INFLUENCE,DRIVING WHILE UNDER THE INFLUENCE OF DRUGS,Thursday,PARK,NONE,DUBOCE AV / NOE ST,-122.43357509728241,37.769176747627725,"(37.769176747627725, -122.43357509728241)",16087440865060,2016-10-27 04:15:00+00:00',
 '120574836,ROBBERY,ATTEMPTED ROBBERY ON THE STREET WITH A KNIFE,Saturday,PARK,NONE,HAIGHT ST / DIVISADERO ST,-122.437048523435,37.7712678186367,"(37.7712678186367, -122.437048523435)",12057483603412,2012-07-21 05:55:00+00:00',
 '160573898,SECONDARY CODES,JUVENILE INVOLVED,Saturday,PARK,NONE,1200 Block of PAGE ST,-122.44139370111428,37.77172731417711,"(37.77172731417711, -122.44139370111428)",16057389815500,2016-07-16 11:15:00+00:00',
 '40502947,DRUG/NARCOTIC,POSSESSION OF COCAINE,Sunday,PARK,"ARREST, BOOKED",1300 Block of FULTON ST,

In [13]:
# total count
sfpd_data_rdd.count()

2170785

## Data Cleaning

In [14]:
# Clean the data
sfpd_data_rdd = sfpd_data_rdd.map(lambda x: x.split(",", 2))

In [15]:
print([int(x) for x in sfpd_data_rdd.map(lambda x: x[0]).collect()][0]) # test for unique_key
print(sfpd_data_rdd.count())
print(sfpd_data_rdd.map(lambda x: x[0]).distinct().count()) # The # of distinct key != # of rows

166018573
2170785
1713368


In [16]:
sfpd_data_rdd.map(lambda x: x[1]).distinct().collect() # test for category

['OTHER OFFENSES',
 'DISORDERLY CONDUCT',
 'PROSTITUTION',
 'DRUG/NARCOTIC',
 'VANDALISM',
 'FRAUD',
 'LIQUOR LAWS',
 'GAMBLING',
 'TREA',
 'SECONDARY CODES',
 'NON-CRIMINAL',
 'KIDNAPPING',
 'PORNOGRAPHY/OBSCENE MAT',
 'STOLEN PROPERTY',
 'DRUNKENNESS',
 'BRIBERY',
 'EMBEZZLEMENT',
 'ARSON',
 'TRESPASS',
 'FORGERY/COUNTERFEITING',
 '"SEX OFFENSES',
 'LARCENY/THEFT',
 'DRIVING UNDER THE INFLUENCE',
 'FAMILY OFFENSES',
 'LOITERING',
 'BURGLARY',
 'EXTORTION',
 'MISSING PERSON',
 'RUNAWAY',
 'SUSPICIOUS OCC',
 'WEAPON LAWS',
 'BAD CHECKS',
 'VEHICLE THEFT',
 'SUICIDE',
 'ROBBERY',
 'RECOVERED VEHICLE',
 'WARRANTS',
 'ASSAULT']

In [17]:
weekdays = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']

def weekdaySplit(string):
    for day in weekdays:
        if day in string:
            return([string[:string.find(day)-1], day, string[string.find(day)+len(day)+1:]])

# Test
testString = 'POSSESSION OF METH-AMPHETAMINE,Saturday,PARK,"ARREST, BOOKED",15TH ST / NOE ST,-122.433256948064,37.7658609302161,"(37.7658609302161, -122.433256948064)",14039187016650,2014-05-10 05:43:00+00:00'
weekdaySplit(testString)           

['POSSESSION OF METH-AMPHETAMINE',
 'Saturday',
 'PARK,"ARREST, BOOKED",15TH ST / NOE ST,-122.433256948064,37.7658609302161,"(37.7658609302161, -122.433256948064)",14039187016650,2014-05-10 05:43:00+00:00']

In [18]:
# Clean to unique_id, category, descript, weekofday, others
sfpd_data_rdd = sfpd_data_rdd.map(lambda x: [x[0], x[1], weekdaySplit(x[2])])\
                             .map(lambda x: [x[0], x[1], x[2][0], x[2][1], x[2][2]])
sfpd_data_rdd.take(1)

[['166018573',
  'LARCENY/THEFT',
  'GRAND THEFT FROM LOCKED AUTO',
  'Sunday',
  ',NONE,100 Block of VELASCO AV,-122.4133519852842,37.70820245849022,"(37.70820245849022, -122.4133519852842)",16601857306244,2016-01-17 23:54:00+00:00']]

In [19]:
sfpd_data_rdd.map(lambda x: x[3]).distinct().collect() # test for weekday

['Wednesday', 'Friday', 'Sunday', 'Saturday', 'Monday', 'Thursday', 'Tuesday']

In [20]:
sfpd_data_rdd.map(lambda x: x[4].count(',')).distinct().collect() 
# [8, 9] indicate further cleaning

[8, 9]

In [53]:
# Actual SF Police Department District
sf_pd_district_list = ['', 'Central', 'Southern', 'Bayview', 'Mission', 'Northern',
                       'Park', 'Richmond', 'Ingleside', 'Taraval', 'Tenderloin']
sf_pd_district_list = sorted([pd.upper() for pd in sf_pd_district_list])
sf_pd_district_list

['',
 'BAYVIEW',
 'CENTRAL',
 'INGLESIDE',
 'MISSION',
 'NORTHERN',
 'PARK',
 'RICHMOND',
 'SOUTHERN',
 'TARAVAL',
 'TENDERLOIN']

In [22]:
sorted(sfpd_data_rdd.map(lambda x: x[4].split(',')[0]).distinct().collect()) # Test if pd is correct

['',
 'BAYVIEW',
 'CENTRAL',
 'INGLESIDE',
 'MISSION',
 'NORTHERN',
 'PARK',
 'RICHMOND',
 'SOUTHERN',
 'TARAVAL',
 'TENDERLOIN']

In [23]:
# Clean to unique_id, category, descript, weekofday, pddistrict, others
sfpd_data_rdd = sfpd_data_rdd.map(lambda x: [x[0], x[1], x[2], x[3],
                                             x[4].split(',', 1)])\
                             .map(lambda x: [x[0], x[1], x[2], x[3],
                                             x[4][0], x[4][1]])
sfpd_data_rdd.take(1)

[['166018573',
  'LARCENY/THEFT',
  'GRAND THEFT FROM LOCKED AUTO',
  'Sunday',
  '',
  'NONE,100 Block of VELASCO AV,-122.4133519852842,37.70820245849022,"(37.70820245849022, -122.4133519852842)",16601857306244,2016-01-17 23:54:00+00:00']]

In [24]:
sfpd_data_rdd.map(lambda x: x[5].count(',')).distinct().collect() 
# [7, 8] indicate further cleaning

[7, 8]

In [25]:
# Clean to unique_id, category, descript, weekofday, pddistrict,
# latitude, longitude, timestamp, address and resolution
sfpd_data_rdd = sfpd_data_rdd.map(lambda x: [x[0], x[1], x[2], x[3], x[4],
                                             x[5].split(",")])\
                             .map(lambda x: [x[0], x[1], x[2], x[3], x[4], 
                                             list(reversed(x[5]))])\
                             .map(lambda x: [x[0], x[1], x[2], x[3], x[4], 
                                             x[5][4], x[5][5], x[5][0], x[5][6], list(reversed(x[5][7:]))])
sfpd_data_rdd.take(10)

[['166018573',
  'LARCENY/THEFT',
  'GRAND THEFT FROM LOCKED AUTO',
  'Sunday',
  '',
  '37.70820245849022',
  '-122.4133519852842',
  '2016-01-17 23:54:00+00:00',
  '100 Block of VELASCO AV',
  ['NONE']],
 ['160874408',
  'DRIVING UNDER THE INFLUENCE',
  'DRIVING WHILE UNDER THE INFLUENCE OF DRUGS',
  'Thursday',
  'PARK',
  '37.769176747627725',
  '-122.43357509728241',
  '2016-10-27 04:15:00+00:00',
  'DUBOCE AV / NOE ST',
  ['NONE']],
 ['120574836',
  'ROBBERY',
  'ATTEMPTED ROBBERY ON THE STREET WITH A KNIFE',
  'Saturday',
  'PARK',
  '37.7712678186367',
  '-122.437048523435',
  '2012-07-21 05:55:00+00:00',
  'HAIGHT ST / DIVISADERO ST',
  ['NONE']],
 ['160573898',
  'SECONDARY CODES',
  'JUVENILE INVOLVED',
  'Saturday',
  'PARK',
  '37.77172731417711',
  '-122.44139370111428',
  '2016-07-16 11:15:00+00:00',
  '1200 Block of PAGE ST',
  ['NONE']],
 ['40502947',
  'DRUG/NARCOTIC',
  'POSSESSION OF COCAINE',
  'Sunday',
  'PARK',
  '37.7768021540039',
  '-122.439037573428',
  '200

In [26]:
def resolutionTransform(alist):
    if len(alist) == 2:
        return ",".join(alist)
    else:
        return alist[0]

# test
print(resolutionTransform(['None']))
print(type(resolutionTransform(['None'])))
print(resolutionTransform(['"ARREST', ' BOOKED"']))
print(type(resolutionTransform(['"ARREST', ' BOOKED"'])))

None
<class 'str'>
"ARREST, BOOKED"
<class 'str'>


In [27]:
def toFloatSafe(inval):
  try:
    return float(inval)
  except ValueError:
    return None

from datetime import datetime
def toTimeSafe(inval):
  inval = inval.strip("\"") # Timestamp starting and ending with a double quotation mark.
  try:
    return datetime.strptime(inval, "%Y-%m-%d %H:%M:%S")
  except ValueError:
    return None
print(toTimeSafe('2016-01-17 23:54:00+00:00'[:-6]))

2016-01-17 23:54:00


In [28]:
# Finally, clean to unique_id, category, descript, weekofday, pddistrict,
# latitude, longitude, timestamp, address and resolution
sfpd_data_rdd = sfpd_data_rdd.map(lambda x: [x[0], x[1], x[2], x[3], x[4],
                                             float(x[5]), float(x[6]), 
                                             toTimeSafe(x[7][:-6]), x[8], resolutionTransform(x[9])])
sfpd_data_rdd.take(1)

[['166018573',
  'LARCENY/THEFT',
  'GRAND THEFT FROM LOCKED AUTO',
  'Sunday',
  '',
  37.70820245849022,
  -122.4133519852842,
  datetime.datetime(2016, 1, 17, 23, 54),
  '100 Block of VELASCO AV',
  'NONE']]

# Create DataFrame

In [29]:
from pyspark.sql.types import *
schema = StructType([ StructField("unique_key", StringType(), False),
                      StructField("category", StringType(), True),
                      StructField("descript", StringType(), True),
                      StructField("dayofweek", StringType(), True),
                      StructField("pddistrict", StringType(), True),
                      StructField("latitude", FloatType(), True),
                      StructField("longitude", FloatType(), True),
                      StructField("timestamp", TimestampType(), True),
                      StructField("address", StringType(), True),
                      StructField("resolution", StringType(), True)
                    ])

In [32]:
sfpd_data_df = ss.createDataFrame(sfpd_data_rdd, schema)
sfpd_data_df.show(5)

+----------+--------------------+--------------------+---------+----------+---------+----------+-------------------+--------------------+----------------+
|unique_key|            category|            descript|dayofweek|pddistrict| latitude| longitude|          timestamp|             address|      resolution|
+----------+--------------------+--------------------+---------+----------+---------+----------+-------------------+--------------------+----------------+
| 166018573|       LARCENY/THEFT|GRAND THEFT FROM ...|   Sunday|          |37.708202|-122.41335|2016-01-17 23:54:00|100 Block of VELA...|            NONE|
| 160874408|DRIVING UNDER THE...|DRIVING WHILE UND...| Thursday|      PARK|37.769176|-122.43358|2016-10-27 04:15:00|  DUBOCE AV / NOE ST|            NONE|
| 120574836|             ROBBERY|ATTEMPTED ROBBERY...| Saturday|      PARK|37.771267|-122.43705|2012-07-21 05:55:00|HAIGHT ST / DIVIS...|            NONE|
| 160573898|     SECONDARY CODES|   JUVENILE INVOLVED| Saturday|      

# Heat map

In [31]:
sfpd_data_rdd.take(1)

[['166018573',
  'LARCENY/THEFT',
  'GRAND THEFT FROM LOCKED AUTO',
  'Sunday',
  '',
  37.70820245849022,
  -122.4133519852842,
  datetime.datetime(2016, 1, 17, 23, 54),
  '100 Block of VELASCO AV',
  'NONE']]

In [33]:
sfpd_data_rdd.map(lambda x: x[4]).distinct().collect()

['',
 'PARK',
 'MISSION',
 'RICHMOND',
 'TENDERLOIN',
 'BAYVIEW',
 'CENTRAL',
 'TARAVAL',
 'SOUTHERN',
 'INGLESIDE',
 'NORTHERN']

In [55]:
sf_pd_district_list.remove("")
sf_pd_district_list

['BAYVIEW',
 'CENTRAL',
 'INGLESIDE',
 'MISSION',
 'NORTHERN',
 'PARK',
 'RICHMOND',
 'SOUTHERN',
 'TARAVAL',
 'TENDERLOIN']

In [58]:
sfpd_data_rdd.filter(lambda x: x[4] == "PARK").take(1)

[['160874408',
  'DRIVING UNDER THE INFLUENCE',
  'DRIVING WHILE UNDER THE INFLUENCE OF DRUGS',
  'Thursday',
  'PARK',
  37.769176747627725,
  -122.43357509728241,
  datetime.datetime(2016, 10, 27, 4, 15),
  'DUBOCE AV / NOE ST',
  'NONE']]

In [80]:
print(sfpd_data_rdd.filter(lambda x: x[4] == "PARK")\
.map(lambda x: [f"Unique Key: {x[0]}",
                f"Category: {x[1]}",
                f"Descript: {x[2]}",
                f"Day of Week: {x[3]}",
                f"Police Department: {x[4]}",
                f"Time: {str(x[7])}",
                f"Address: {x[8]}",
                f"Resolution: {x[9]}"])\
                .map(lambda x: "\n".join(x)).take(1)[0])


Unique Key: 160874408
Category: DRIVING UNDER THE INFLUENCE
Descript: DRIVING WHILE UNDER THE INFLUENCE OF DRUGS
Day of Week: Thursday
Police Department: PARK
Time: 2016-10-27 04:15:00
Address: DUBOCE AV / NOE ST
Resolution: NONE


In [87]:
color_by_district_dict = {}
color_list = ['rgb(88,169,137)', 
              'rgb(230,58,154)', 
              'rgb(230,51,10)', 
              'rgb(192,228,163)', 
              'rgb(88,50,240)', 
              'rgb(145,87,36)', 
              'rgb(4,14,57)', 
              'rgb(205,183,12)', 
              'rgb(225,103,137)', 
              'rgb(81,106,147)']
for i in range(len(sf_pd_district_list)):
    color_by_district_dict[sf_pd_district_list[i]] = color_list[i]
    
color_by_district_dict

{'BAYVIEW': 'rgb(88,169,137)',
 'CENTRAL': 'rgb(230,58,154)',
 'INGLESIDE': 'rgb(230,51,10)',
 'MISSION': 'rgb(192,228,163)',
 'NORTHERN': 'rgb(88,50,240)',
 'PARK': 'rgb(145,87,36)',
 'RICHMOND': 'rgb(4,14,57)',
 'SOUTHERN': 'rgb(205,183,12)',
 'TARAVAL': 'rgb(225,103,137)',
 'TENDERLOIN': 'rgb(81,106,147)'}

In [88]:
def info_getter(district):
    district_rdd = sfpd_data_rdd.filter(lambda x: x[4] == district)
    lat_list = district_rdd.map(lambda x: str(x[5])).collect()
    lon_list = district_rdd.map(lambda x: str(x[6])).collect()
    text_list = district_rdd.map(lambda x: [f"Unique Key: {x[0]}",
                                            f"Category: {x[1]}",
                                            f"Descript: {x[2]}",
                                            f"Day of Week: {x[3]}",
                                            f"Police Department: {x[4]}",
                                            f"Time: {str(x[7])}",
                                            f"Address: {x[8]}",
                                            f"Resolution: {x[9]}"])\
                                            .map(lambda x: "\n".join(x))\
                                            .collect()
    return(lat_list, lon_list, text_list, color_by_district_dict[district])

# Test
temp = info_getter("PARK")
print(temp[0][0])
print(temp[1][0])
print(temp[2][0])
print(temp[3])

37.769176747627725
-122.43357509728241
Unique Key: 160874408
Category: DRIVING UNDER THE INFLUENCE
Descript: DRIVING WHILE UNDER THE INFLUENCE OF DRUGS
Day of Week: Thursday
Police Department: PARK
Time: 2016-10-27 04:15:00
Address: DUBOCE AV / NOE ST
Resolution: NONE
rgb(145,87,36)


In [89]:
import plotly.plotly as py
import plotly.graph_objs as go

In [94]:
my_mapbox_api = sc.textFile("mapbox_api.txt").collect()[0]
mapbox_access_token = my_mapbox_api

In [98]:
district_data = []
for district in sf_pd_district_list:
    dump = info_getter(district)
    trace = go.Scattermapbox(
                lat=dump[0],
                lon=dump[1],
                mode='markers',
                marker=dict(
                            size=2.5,
                            color=dump[3]
                           ),
                text=dump[2],
            )
    district_data.append(trace)

In [113]:
import pickle
# Dump the district_data
filename = 'district_data_sfpd'
outfile = open(filename, 'wb')
pickle.dump(district_data, outfile)
outfile.close()

In [None]:
# # Load the district_data
# infile = open(filename, 'rb')
# district_data = pickle.load(infile)
# infile.close()

In [104]:
layout = go.Layout(
    autosize=True,
    hovermode='closest',
    mapbox=dict(
        accesstoken=mapbox_access_token,
        bearing=0,
        center=dict(
            lat=37.77,
            lon=-122.42
        ),
        pitch=0,
        zoom=10
    ),
)

# fig = dict(data=district_data, layout=layout) # This fails since too many to plot
# py.iplot(fig, filename='SFPD by district')

In [103]:
for district in sf_pd_district_list:
    print(f"{district} : {sfpd_data_rdd.filter(lambda x: x[4] == district).count()}")

BAYVIEW : 217097
CENTRAL : 220420
INGLESIDE : 190593
MISSION : 293940
NORTHERN : 266869
PARK : 123110
RICHMOND : 114449
SOUTHERN : 391669
TARAVAL : 163620
TENDERLOIN : 189017


In [109]:
# BAYVIEW
fig = dict(data=[district_data[0]], layout=layout)
py.iplot(fig, filename='SFPD BAYVIEW')

In [110]:
# MISSION
fig = dict(data=[district_data[3]], layout=layout)
py.iplot(fig, filename='SFPD MISSION')

In [112]:
# PARK
fig = dict(data=[district_data[5]], layout=layout)
py.iplot(fig, filename='SFPD PARK')