In [1]:
import os
from pyspark.sql import SparkSession
from pyspark import SparkContext
from pyspark.sql.types import *

In [5]:
sc = SparkContext.getOrCreate()

In [6]:
pyspark_submit_args = '--packages org.mongodo.spark:mongo-spark-connector_2.11:2.4.0 pyspark-shell'
os.environ['PYSPARK_SUBMIT_ARGS'] = pyspark_submit_args

In [7]:
# Build (or fetch) the SparkSession used for all DataFrame work. The
# 'spark.mongodb.input.uri' setting is the default source the MongoDB
# connector reads from when load() is called without further options.
ss = (
    SparkSession.builder
    .appName('jonrossandfriends')
    .config('spark.mongodb.input.uri', 'mongodb://34.214.80.18/msds697.sffd')
    .getOrCreate()
)

In [8]:
# Read the data: point a reader at the MongoDB connector's data source and
# load it using the input URI configured on the session.
mongo_reader = ss.read.format('com.mongodb.spark.sql.DefaultSource')
sffd_df = mongo_reader.load()

In [9]:
# Print the schema of the loaded DataFrame to check column names and types.
sffd_df.printSchema()

root
 |-- _id: struct (nullable = true)
 |    |-- oid: string (nullable = true)
 |-- address: string (nullable = true)
 |-- available_timestamp: string (nullable = true)
 |-- battalion: string (nullable = true)
 |-- call_final_disposition: string (nullable = true)
 |-- call_type: string (nullable = true)
 |-- dispatch_timestamp: string (nullable = true)
 |-- entry_timestamp: string (nullable = true)
 |-- hospital_timestamp: string (nullable = true)
 |-- on_scene_timestamp: string (nullable = true)
 |-- received_timestamp: string (nullable = true)
 |-- response_timestamp: string (nullable = true)
 |-- station_area: string (nullable = true)
 |-- transport_timestamp: string (nullable = true)
 |-- zipcode_of_incident: string (nullable = true)



In [13]:
# Preview the first two rows to sanity-check the loaded values.
sffd_df.show(2)

+--------------------+--------------------+--------------------+---------+----------------------+----------------+--------------------+--------------------+------------------+--------------------+--------------------+--------------------+------------+-------------------+-------------------+
|                 _id|             address| available_timestamp|battalion|call_final_disposition|       call_type|  dispatch_timestamp|     entry_timestamp|hospital_timestamp|  on_scene_timestamp|  received_timestamp|  response_timestamp|station_area|transport_timestamp|zipcode_of_incident|
+--------------------+--------------------+--------------------+---------+----------------------+----------------+--------------------+--------------------+------------------+--------------------+--------------------+--------------------+------------+-------------------+-------------------+
|[5c413fb4b760937d...|200 Block of WILL...|2004-09-13 22:45:...|      B10|                 Other|Medical Incident|2004-09-13

In [15]:
# Check if the data is dirty: count the rows per zipcode and inspect the keys
# for blank or malformed values. DataFrame.collect() already returns a list of
# Row objects, so converting to an RDD first is unnecessary.
sffd_df.groupBy("zipcode_of_incident").count().collect()

[Row(zipcode_of_incident=u'94102', count=570087),
 Row(zipcode_of_incident=u'94107', count=180127),
 Row(zipcode_of_incident=u'94104', count=34641),
 Row(zipcode_of_incident=u'94131', count=83845),
 Row(zipcode_of_incident=u'94112', count=215612),
 Row(zipcode_of_incident=u'94103', count=543100),
 Row(zipcode_of_incident=u'94130', count=29617),
 Row(zipcode_of_incident=u'94118', count=136152),
 Row(zipcode_of_incident=u'94117', count=154335),
 Row(zipcode_of_incident=u'94129', count=13879),
 Row(zipcode_of_incident=u'94109', count=378663),
 Row(zipcode_of_incident=u'94132', count=109934),
 Row(zipcode_of_incident=u'94123', count=94463),
 Row(zipcode_of_incident=u'94158', count=20320),
 Row(zipcode_of_incident=u'94105', count=108715),
 Row(zipcode_of_incident=u'94111', count=76258),
 Row(zipcode_of_incident=u'94134', count=126582),
 Row(zipcode_of_incident=u'94116', count=98739),
 Row(zipcode_of_incident=u'94127', count=47437),
 Row(zipcode_of_incident=u'94122', count=163262),
 Row(zipc

In [24]:
# Names of the columns that hold timestamps stored as strings.
timestamp_list = [
    'received_timestamp',
    'entry_timestamp',
    'dispatch_timestamp',
    'response_timestamp',
    'on_scene_timestamp',
    'transport_timestamp',
    'hospital_timestamp',
    'available_timestamp',
]

# Matching temporary names ("<column>_adj") used while the columns are converted.
timestamp_adj_list = ['{}_adj'.format(column_name) for column_name in timestamp_list]

# Test: display the first generated name
timestamp_adj_list[0]

'received_timestamp_adj'

In [25]:
# Trial run: cast one string column to a timestamp under a new column name
# and show a single row, to see whether the cast yields usable values.
available_as_timestamp = sffd_df["available_timestamp"].cast("timestamp")
cast_preview = sffd_df.withColumn("available_timestamp_adj", available_as_timestamp)
cast_preview.select("available_timestamp_adj").show(1)

# Result: the cast works, so the same approach is applied to every timestamp column.

+-----------------------+
|available_timestamp_adj|
+-----------------------+
|    2000-06-03 15:39:18|
+-----------------------+
only showing top 1 row



In [26]:
# Convert timestamp from StringType to TimestampType.
# For every column: add a casted copy under its temporary "_adj" name, drop
# the original string column, then rename the copy back to the original name.
# Because each copy is appended, the converted columns end up last in the schema.
for source_name, casted_name in zip(timestamp_list, timestamp_adj_list):
    casted_column = sffd_df[source_name].cast("timestamp")
    sffd_df = sffd_df.withColumn(casted_name, casted_column)
    sffd_df = sffd_df.drop(source_name)
    sffd_df = sffd_df.withColumnRenamed(casted_name, source_name)
sffd_df.printSchema()

root
 |-- _id: struct (nullable = true)
 |    |-- oid: string (nullable = true)
 |-- address: string (nullable = true)
 |-- battalion: string (nullable = true)
 |-- call_final_disposition: string (nullable = true)
 |-- call_type: string (nullable = true)
 |-- station_area: string (nullable = true)
 |-- zipcode_of_incident: string (nullable = true)
 |-- received_timestamp: timestamp (nullable = true)
 |-- entry_timestamp: timestamp (nullable = true)
 |-- dispatch_timestamp: timestamp (nullable = true)
 |-- response_timestamp: timestamp (nullable = true)
 |-- on_scene_timestamp: timestamp (nullable = true)
 |-- transport_timestamp: timestamp (nullable = true)
 |-- hospital_timestamp: timestamp (nullable = true)
 |-- available_timestamp: timestamp (nullable = true)



In [28]:
# Keep only the rows whose zipcode is not the empty string.
has_zipcode = sffd_df["zipcode_of_incident"] != ''
sffd_df = sffd_df.filter(has_zipcode)

In [29]:
# Test if we filtered out the empty zipcode: recount the rows per zipcode and
# check that no blank key remains. DataFrame.collect() already returns a list
# of Row objects, so converting to an RDD first is unnecessary.
sffd_df.groupBy("zipcode_of_incident").count().collect()

[Row(zipcode_of_incident=u'94102', count=570087),
 Row(zipcode_of_incident=u'94107', count=180127),
 Row(zipcode_of_incident=u'94104', count=34641),
 Row(zipcode_of_incident=u'94131', count=83845),
 Row(zipcode_of_incident=u'94112', count=215612),
 Row(zipcode_of_incident=u'94103', count=543100),
 Row(zipcode_of_incident=u'94130', count=29617),
 Row(zipcode_of_incident=u'94118', count=136152),
 Row(zipcode_of_incident=u'94117', count=154335),
 Row(zipcode_of_incident=u'94129', count=13879),
 Row(zipcode_of_incident=u'94109', count=378663),
 Row(zipcode_of_incident=u'94132', count=109934),
 Row(zipcode_of_incident=u'94123', count=94463),
 Row(zipcode_of_incident=u'94158', count=20320),
 Row(zipcode_of_incident=u'94105', count=108715),
 Row(zipcode_of_incident=u'94111', count=76258),
 Row(zipcode_of_incident=u'94134', count=126582),
 Row(zipcode_of_incident=u'94116', count=98739),
 Row(zipcode_of_incident=u'94127', count=47437),
 Row(zipcode_of_incident=u'94122', count=163262),
 Row(zipc