In [1]:
import pyspark
from pyspark.sql.types import *
import os
import boto3
import numpy as np

In [2]:
# Attach to the already-running Spark context/session when there is one,
# otherwise start new ones.
sc = pyspark.SparkContext.getOrCreate()
session_builder = pyspark.sql.SparkSession.builder
ss = session_builder.getOrCreate()

In [3]:
# Location of the dataset in S3
bucket_name = 'msds697jonross.and.friends'  # bucket that holds the file
file_name = 'sffd.csv'  # object key, i.e. the file name inside the bucket

s3 = boto3.resource('s3')
bucket = s3.Bucket(bucket_name)
obj = bucket.Object(key=file_name)

# Download the object and decode its body into a single string.
body_stream = obj.get()["Body"]
file_content = body_stream.read().decode("utf-8")

In [4]:
# Number of data rows: split on newlines, then subtract the header line and
# the empty string left after the file's final newline.
rows = file_content.split('\n')
n_lines = len(rows)
n_lines - 2

4557045

In [5]:
# Number of columns, read off the header line (one more than its comma count)
column_names = rows[0]
column_names.count(',') + 1

14

In [6]:
# Randomly sample `sz` data rows without replacement.
# Row *indices* are drawn rather than passing rows[1:] to np.random.choice,
# which would first copy every line of the file into one fixed-width unicode
# array before picking anything.
sz = 10000
sample_seed = 42  # fixed so that a re-run draws the same sample
# Index 0 is the header; a trailing '' (left by a final newline) is not data,
# so it is kept out of the candidate pool.
stop = len(rows) - 1 if rows[-1] == '' else len(rows)
picked = 1 + np.random.RandomState(sample_seed).choice(stop - 1, size=sz, replace=False)
# Kept as a numpy array so later cells see the same type as before.
samples = np.array([rows[i] for i in picked])
samples[:2]

array(['Structure Fire,2004-04-15 23:49:38+00:00,2004-04-15 23:50:00+00:00,2004-04-15 23:50:19+00:00,2004-04-15 23:54:28+00:00,2004-04-15 23:57:46+00:00,,,Other,2004-04-16 00:18:31+00:00,500 Block of GEARY ST,94102,B01,03',
       'Medical Incident,2013-04-02 21:47:07+00:00,2013-04-02 21:47:30+00:00,2013-04-02 21:47:56+00:00,2013-04-02 21:49:23+00:00,2013-04-02 21:52:03+00:00,,,Other,2013-04-02 22:04:42+00:00,200 Block of SENECA AVE,94112,B09,15'],
      dtype='<U315')

In [7]:
# Build an RDD of field lists, keeping only records whose field count matches
# the header. The expected count comes from the header (column_names), not
# from a randomly sampled row, which could itself be malformed.
# NOTE(review): splitting on ',' does not honour CSV quoting, so records with
# a comma inside a field are dropped by the filter.
expected_cols = len(column_names.split(','))  # plain int: cheap to ship in the closure
sffd_rdd = (
    sc.parallelize(list(samples))
    .map(lambda line: line.split(','))  # split each line once
    .filter(lambda fields: len(fields) == expected_cols)
)

In [8]:
# Number of sampled rows dropped by the column-count filter
kept = sffd_rdd.count()
sz - kept

12

In [9]:
# Every column is declared as a non-nullable string; the names are listed in
# the same order as the CSV header.
sffd_columns = [
    "call_type",
    "received_timestamp",
    "entry_timestamp",
    "dispatch_timestamp",
    "response_timestamp",
    "on_scene_timestamp",
    "transport_timestamp",
    "hospital_timestamp",
    "call_final_disposition",
    "available_timestamp",
    "address",
    "zipcode_of_incident",
    "battalion",
    "station_area",
]
sffd_schema = StructType(
    [StructField(col, StringType(), False) for col in sffd_columns]
)

In [10]:
# Turn the RDD of field lists into a DataFrame with the explicit schema.
sffd_df = ss.createDataFrame(data=sffd_rdd, schema=sffd_schema)

In [11]:
# Print the header's column names, one per line, with a dashed rule between them.
header_fields = column_names.split(',')
print('\n----------------------\n'.join(header_fields))

call_type
----------------------
received_timestamp
----------------------
entry_timestamp
----------------------
dispatch_timestamp
----------------------
response_timestamp
----------------------
on_scene_timestamp
----------------------
transport_timestamp
----------------------
hospital_timestamp
----------------------
call_final_disposition
----------------------
available_timestamp
----------------------
address
----------------------
zipcode_of_incident
----------------------
battalion
----------------------
station_area


In [12]:
# Number of calls per call type, most frequent first
call_type_counts = sffd_df.groupBy("call_type").count()
call_type_counts.orderBy("count", ascending=False).show()

+--------------------+-----+
|           call_type|count|
+--------------------+-----+
|    Medical Incident| 6527|
|      Structure Fire| 1279|
|              Alarms| 1062|
|   Traffic Collision|  409|
|Citizen Assist / ...|  187|
|               Other|  157|
|        Outside Fire|  109|
|Gas Leak (Natural...|   45|
|        Water Rescue|   44|
|        Vehicle Fire|   30|
|Elevator / Escala...|   30|
|Odor (Strange / U...|   28|
|   Electrical Hazard|   22|
|Smoke Investigati...|   22|
|          Fuel Spill|    9|
|              HazMat|    7|
|           Explosion|    5|
|Industrial Accidents|    4|
|  Aircraft Emergency|    3|
|   High Angle Rescue|    2|
+--------------------+-----+
only showing top 20 rows

