In [1]:
from pyspark import SparkContext
sc = SparkContext.getOrCreate()

In [2]:
from pyspark.sql import SparkSession
ss = SparkSession.builder.getOrCreate()

In [3]:
sfpd_filename = "data/sfpd.csv"
sfpd_rdd = sc.textFile(sfpd_filename)

In [4]:
# Get column list
columns = sfpd_rdd.take(1)
column_names_list = columns[0].split(",")
column_names_list

['unique_key',
 'category',
 'descript',
 'dayofweek',
 'pddistrict',
 'resolution',
 'address',
 'longitude',
 'latitude',
 'location',
 'pdid',
 'timestamp']

In [5]:
# Load the data
sfpd_data_rdd = sfpd_rdd.filter(lambda x: x.split(',')[0] != 'unique_key')
sfpd_data_rdd.take(10)

['166018573,LARCENY/THEFT,GRAND THEFT FROM LOCKED AUTO,Sunday,,NONE,100 Block of VELASCO AV,-122.4133519852842,37.70820245849022,"(37.70820245849022, -122.4133519852842)",16601857306244,2016-01-17 23:54:00+00:00',
 '160874408,DRIVING UNDER THE INFLUENCE,DRIVING WHILE UNDER THE INFLUENCE OF DRUGS,Thursday,PARK,NONE,DUBOCE AV / NOE ST,-122.43357509728241,37.769176747627725,"(37.769176747627725, -122.43357509728241)",16087440865060,2016-10-27 04:15:00+00:00',
 '120574836,ROBBERY,ATTEMPTED ROBBERY ON THE STREET WITH A KNIFE,Saturday,PARK,NONE,HAIGHT ST / DIVISADERO ST,-122.437048523435,37.7712678186367,"(37.7712678186367, -122.437048523435)",12057483603412,2012-07-21 05:55:00+00:00',
 '160573898,SECONDARY CODES,JUVENILE INVOLVED,Saturday,PARK,NONE,1200 Block of PAGE ST,-122.44139370111428,37.77172731417711,"(37.77172731417711, -122.44139370111428)",16057389815500,2016-07-16 11:15:00+00:00',
 '40502947,DRUG/NARCOTIC,POSSESSION OF COCAINE,Sunday,PARK,"ARREST, BOOKED",1300 Block of FULTON ST,

In [6]:
# total count
sfpd_data_rdd.count()

2170785

## Data Cleaning

In [7]:
# Clean the data
sfpd_data_rdd = sfpd_data_rdd.map(lambda x: x.split(",", 2))

In [8]:
print([int(x) for x in sfpd_data_rdd.map(lambda x: x[0]).collect()][0]) # test for unique_key
print(sfpd_data_rdd.count())
print(sfpd_data_rdd.map(lambda x: x[0]).distinct().count()) # The # of distinct key != # of rows

166018573
2170785
1713368


In [9]:
sfpd_data_rdd.map(lambda x: x[1]).distinct().collect() # test for category

['OTHER OFFENSES',
 'DISORDERLY CONDUCT',
 'PROSTITUTION',
 'DRUG/NARCOTIC',
 'VANDALISM',
 'FRAUD',
 'LIQUOR LAWS',
 'GAMBLING',
 'TREA',
 'SECONDARY CODES',
 'NON-CRIMINAL',
 'KIDNAPPING',
 'PORNOGRAPHY/OBSCENE MAT',
 'STOLEN PROPERTY',
 'DRUNKENNESS',
 'BRIBERY',
 'EMBEZZLEMENT',
 'ARSON',
 'TRESPASS',
 'FORGERY/COUNTERFEITING',
 '"SEX OFFENSES',
 'LARCENY/THEFT',
 'DRIVING UNDER THE INFLUENCE',
 'FAMILY OFFENSES',
 'LOITERING',
 'BURGLARY',
 'EXTORTION',
 'MISSING PERSON',
 'RUNAWAY',
 'SUSPICIOUS OCC',
 'WEAPON LAWS',
 'BAD CHECKS',
 'VEHICLE THEFT',
 'SUICIDE',
 'ROBBERY',
 'RECOVERED VEHICLE',
 'WARRANTS',
 'ASSAULT']

In [11]:
weekdays = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']

def weekdaySplit(string):
    for day in weekdays:
        if day in string:
            return([string[:string.find(day)-1], day, string[string.find(day)+len(day)+1:]])

# Test
testString = 'POSSESSION OF METH-AMPHETAMINE,Saturday,PARK,"ARREST, BOOKED",15TH ST / NOE ST,-122.433256948064,37.7658609302161,"(37.7658609302161, -122.433256948064)",14039187016650,2014-05-10 05:43:00+00:00'
weekdaySplit(testString)           

['POSSESSION OF METH-AMPHETAMINE',
 'Saturday',
 'PARK,"ARREST, BOOKED",15TH ST / NOE ST,-122.433256948064,37.7658609302161,"(37.7658609302161, -122.433256948064)",14039187016650,2014-05-10 05:43:00+00:00']

In [12]:
# Clean to unique_id, category, descript, weekofday, others
sfpd_data_rdd = sfpd_data_rdd.map(lambda x: [x[0], x[1], weekdaySplit(x[2])])\
                             .map(lambda x: [x[0], x[1], x[2][0], x[2][1], x[2][2]])
sfpd_data_rdd.take(1)

[['166018573',
  'LARCENY/THEFT',
  'GRAND THEFT FROM LOCKED AUTO',
  'Sunday',
  ',NONE,100 Block of VELASCO AV,-122.4133519852842,37.70820245849022,"(37.70820245849022, -122.4133519852842)",16601857306244,2016-01-17 23:54:00+00:00']]

In [13]:
sfpd_data_rdd.map(lambda x: x[3]).distinct().collect() # test for weekday

['Wednesday', 'Friday', 'Sunday', 'Saturday', 'Monday', 'Thursday', 'Tuesday']

In [14]:
sfpd_data_rdd.map(lambda x: x[4].count(',')).distinct().collect() 
# [8, 9] indicate further cleaning

[8, 9]

In [15]:
# Actual SF Police Department District
sf_pd_district_list = ['', 'Central', 'Southern', 'Bayview', 'Mission', 'Northern',
                       'Park', 'Richmond', 'Ingleside', 'Taraval', 'Tenderloin']
sf_pd_district_list = sorted([pd.upper() for pd in sf_pd_district_list])
sf_pd_district_list

['',
 'BAYVIEW',
 'CENTRAL',
 'INGLESIDE',
 'MISSION',
 'NORTHERN',
 'PARK',
 'RICHMOND',
 'SOUTHERN',
 'TARAVAL',
 'TENDERLOIN']

In [16]:
sorted(sfpd_data_rdd.map(lambda x: x[4].split(',')[0]).distinct().collect()) # Test if pd is correct

['',
 'BAYVIEW',
 'CENTRAL',
 'INGLESIDE',
 'MISSION',
 'NORTHERN',
 'PARK',
 'RICHMOND',
 'SOUTHERN',
 'TARAVAL',
 'TENDERLOIN']

In [17]:
# Clean to unique_id, category, descript, weekofday, pddistrict, others
sfpd_data_rdd = sfpd_data_rdd.map(lambda x: [x[0], x[1], x[2], x[3],
                                             x[4].split(',', 1)])\
                             .map(lambda x: [x[0], x[1], x[2], x[3],
                                             x[4][0], x[4][1]])
sfpd_data_rdd.take(1)

[['166018573',
  'LARCENY/THEFT',
  'GRAND THEFT FROM LOCKED AUTO',
  'Sunday',
  '',
  'NONE,100 Block of VELASCO AV,-122.4133519852842,37.70820245849022,"(37.70820245849022, -122.4133519852842)",16601857306244,2016-01-17 23:54:00+00:00']]

In [18]:
sfpd_data_rdd.map(lambda x: x[5].count(',')).distinct().collect() 
# [7, 8] indicate further cleaning

[7, 8]

In [19]:
# Clean to unique_id, category, descript, weekofday, pddistrict,
# latitude, longitude, timestamp, address and resolution
sfpd_data_rdd = sfpd_data_rdd.map(lambda x: [x[0], x[1], x[2], x[3], x[4],
                                             x[5].split(",")])\
                             .map(lambda x: [x[0], x[1], x[2], x[3], x[4], 
                                             list(reversed(x[5]))])\
                             .map(lambda x: [x[0], x[1], x[2], x[3], x[4], 
                                             x[5][4], x[5][5], x[5][0], x[5][6], list(reversed(x[5][7:]))])
sfpd_data_rdd.take(10)

[['166018573',
  'LARCENY/THEFT',
  'GRAND THEFT FROM LOCKED AUTO',
  'Sunday',
  '',
  '37.70820245849022',
  '-122.4133519852842',
  '2016-01-17 23:54:00+00:00',
  '100 Block of VELASCO AV',
  ['NONE']],
 ['160874408',
  'DRIVING UNDER THE INFLUENCE',
  'DRIVING WHILE UNDER THE INFLUENCE OF DRUGS',
  'Thursday',
  'PARK',
  '37.769176747627725',
  '-122.43357509728241',
  '2016-10-27 04:15:00+00:00',
  'DUBOCE AV / NOE ST',
  ['NONE']],
 ['120574836',
  'ROBBERY',
  'ATTEMPTED ROBBERY ON THE STREET WITH A KNIFE',
  'Saturday',
  'PARK',
  '37.7712678186367',
  '-122.437048523435',
  '2012-07-21 05:55:00+00:00',
  'HAIGHT ST / DIVISADERO ST',
  ['NONE']],
 ['160573898',
  'SECONDARY CODES',
  'JUVENILE INVOLVED',
  'Saturday',
  'PARK',
  '37.77172731417711',
  '-122.44139370111428',
  '2016-07-16 11:15:00+00:00',
  '1200 Block of PAGE ST',
  ['NONE']],
 ['40502947',
  'DRUG/NARCOTIC',
  'POSSESSION OF COCAINE',
  'Sunday',
  'PARK',
  '37.7768021540039',
  '-122.439037573428',
  '200

In [20]:
def resolutionTransform(alist):
    if len(alist) == 2:
        return ",".join(alist)
    else:
        return alist[0]

# test
print(resolutionTransform(['None']))
print(type(resolutionTransform(['None'])))
print(resolutionTransform(['"ARREST', ' BOOKED"']))
print(type(resolutionTransform(['"ARREST', ' BOOKED"'])))

None
<class 'str'>
"ARREST, BOOKED"
<class 'str'>


In [21]:
def toFloatSafe(inval):
  try:
    return float(inval)
  except ValueError:
    return None

from datetime import datetime
def toTimeSafe(inval):
  inval = inval.strip("\"") # Timestamp starting and ending with a double quotation mark.
  try:
    return datetime.strptime(inval, "%Y-%m-%d %H:%M:%S")
  except ValueError:
    return None
print(toTimeSafe('2016-01-17 23:54:00+00:00'[:-6]))

2016-01-17 23:54:00


In [34]:
# Finally, clean to unique_id, category, descript, weekofday, pddistrict,
# latitude, longitude, timestamp, address and resolution
sfpd_data_rdd = sfpd_data_rdd.map(lambda x: [x[0], x[1], x[2], x[3], x[4],
                                             float(x[5]), float(x[6]), 
                                             toTimeSafe(x[7][:-6]), x[8], resolutionTransform(x[9])])
sfpd_data_rdd.take(1)

[['166018573',
  'LARCENY/THEFT',
  'GRAND THEFT FROM LOCKED AUTO',
  'Sunday',
  '',
  37.70820245849022,
  -122.4133519852842,
  datetime.datetime(2016, 1, 17, 23, 54),
  '100 Block of VELASCO AV',
  'NONE']]

# Create DataFrame

In [35]:
from pyspark.sql.types import *
schema = StructType([ StructField("unique_key", StringType(), False),
                      StructField("category", StringType(), True),
                      StructField("descript", StringType(), True),
                      StructField("dayofweek", StringType(), True),
                      StructField("pddistrict", StringType(), True),
                      StructField("latitude", FloatType(), True),
                      StructField("longitude", FloatType(), True),
                      StructField("timestamp", TimestampType(), True),
                      StructField("address", StringType(), True),
                      StructField("resolution", StringType(), True)
                    ])

In [45]:
ss.createDataFrame(sfpd_data_rdd, schema).show(5)

+----------+--------------------+--------------------+---------+----------+---------+----------+-------------------+--------------------+----------------+
|unique_key|            category|            descript|dayofweek|pddistrict| latitude| longitude|          timestamp|             address|      resolution|
+----------+--------------------+--------------------+---------+----------+---------+----------+-------------------+--------------------+----------------+
| 166018573|       LARCENY/THEFT|GRAND THEFT FROM ...|   Sunday|          |37.708202|-122.41335|2016-01-17 23:54:00|100 Block of VELA...|            NONE|
| 160874408|DRIVING UNDER THE...|DRIVING WHILE UND...| Thursday|      PARK|37.769176|-122.43358|2016-10-27 04:15:00|  DUBOCE AV / NOE ST|            NONE|
| 120574836|             ROBBERY|ATTEMPTED ROBBERY...| Saturday|      PARK|37.771267|-122.43705|2012-07-21 05:55:00|HAIGHT ST / DIVIS...|            NONE|
| 160573898|     SECONDARY CODES|   JUVENILE INVOLVED| Saturday|      