In [364]:
import pyspark
from pyspark.sql.types import *
import os
import json
import requests
import boto3
import numpy as np
from pyspark.sql.functions import *

In [7]:
sc = pyspark.SparkContext.getOrCreate()
ss = pyspark.sql.SparkSession.builder.getOrCreate()

In [8]:
bucket_name = 'msds697jonross.and.friends' # Add your bucket name
file_name = 'sffd.csv' # select file
s3 = boto3.resource('s3')
bucket = s3.Bucket(bucket_name) 
obj = bucket.Object(key=file_name) # S3 uses key-value structure where key is your file name
file_content = obj.get()["Body"].read().decode("utf-8") # Read the Body which is the contents of the file.

In [9]:
# number of rows (subract header and empty line at end)
rows = file_content.split('\n')
len(rows)-2

4557045

In [387]:
rows[:3]

['call_type,received_timestamp,entry_timestamp,dispatch_timestamp,response_timestamp,on_scene_timestamp,transport_timestamp,hospital_timestamp,call_final_disposition,available_timestamp,address,zipcode_of_incident,battalion,station_area',
 'Explosion,2017-02-07 20:00:53+00:00,2017-02-07 20:00:53+00:00,2017-02-07 20:03:38+00:00,2017-02-07 20:05:16+00:00,2017-02-07 20:08:11+00:00,,,Fire,2017-02-07 20:13:41+00:00,0 Block of FRANCIS ST,94112,B09,',
 'Elevator / Escalator Rescue,2000-06-03 15:32:02+00:00,2000-06-03 15:32:17+00:00,2000-06-03 15:32:33+00:00,,,,,Other,2000-06-03 15:39:18+00:00,2700 Block of VAN NESS AVE,94123,B04,04']

In [63]:
# number of  columns
column_names = rows[0].split(',')
n_cols = sc.broadcast(len(column_names))
n_cols.value

14

In [64]:
print('   '.join(x for x in column_names))

call_type   received_timestamp   entry_timestamp   dispatch_timestamp   response_timestamp   on_scene_timestamp   transport_timestamp   hospital_timestamp   call_final_disposition   available_timestamp   address   zipcode_of_incident   battalion   station_area


In [65]:
# randomly sample rows
sz=10000
samples = np.random.choice(rows[1:], size=sz, replace=False)
samples[:2]

array(['Medical Incident,2009-12-21 14:38:56+00:00,2009-12-21 14:39:41+00:00,2009-12-21 14:41:41+00:00,2009-12-21 14:41:59+00:00,2009-12-21 14:51:09+00:00,2009-12-21 15:07:26+00:00,2009-12-21 15:21:41+00:00,Code 2 Transport,2009-12-21 15:47:56+00:00,200 Block of 6TH ST,94103,B03,01',
       'Medical Incident,2007-10-06 22:07:39+00:00,2007-10-06 22:09:05+00:00,2007-10-06 22:09:46+00:00,2007-10-06 22:10:43+00:00,2007-10-06 22:13:46+00:00,2007-10-06 22:27:28+00:00,2007-10-06 22:35:57+00:00,Code 2 Transport,2007-10-06 23:01:56+00:00,FRANKLIN ST/GROVE ST,94102,B02,36'],
      dtype='<U315')

In [68]:
def filter_fire(x):
    return len(x.split(',')) == n_cols.value

rdd = sc.parallelize(list(samples))\
    .filter(filter_fire)\
    .map(lambda x: x.split(','))

In [71]:
# number of rows removed
sz - rdd.count()

14

In [97]:
schema = StructType([StructField("call_type", StringType(), False),
                    StructField("received_timestamp", StringType(), False),
                    StructField("entry_timestamp", StringType(), False),
                    StructField("dispatch_timestamp", StringType(), False),
                    StructField("response_timestamp", StringType(), False),
                    StructField("on_scene_timestamp", StringType(), False),
                    StructField("transport_timestamp", StringType(), False),
                    StructField("hospital_timestamp", StringType(), False),
                    StructField("call_final_disposition", StringType(), False),
                    StructField("available_timestamp", StringType(), False),
                    StructField("address", StringType(), False),
                    StructField("zipcode_of_incident", StringType(), False),
                    StructField("battalion", StringType(), False),
                    StructField("station_area", StringType(), False)
                    ])

In [98]:
df = ss.createDataFrame(rdd, schema)
# df = df.withColumn('date', to_timestamp(df.timestamp, 'yyyy-MM-dd HH:mm:00+00:00'))\
#     .drop("timestamp")\
#     .withColumnRenamed('date', 'timestamp')

In [99]:
# print('\n----------------------\n'.join(x for x in column_names))
df.printSchema()

root
 |-- call_type: string (nullable = false)
 |-- received_timestamp: string (nullable = false)
 |-- entry_timestamp: string (nullable = false)
 |-- dispatch_timestamp: string (nullable = false)
 |-- response_timestamp: string (nullable = false)
 |-- on_scene_timestamp: string (nullable = false)
 |-- transport_timestamp: string (nullable = false)
 |-- hospital_timestamp: string (nullable = false)
 |-- call_final_disposition: string (nullable = false)
 |-- available_timestamp: string (nullable = false)
 |-- address: string (nullable = false)
 |-- zipcode_of_incident: string (nullable = false)
 |-- battalion: string (nullable = false)
 |-- station_area: string (nullable = false)



In [100]:
df.columns

['call_type',
 'received_timestamp',
 'entry_timestamp',
 'dispatch_timestamp',
 'response_timestamp',
 'on_scene_timestamp',
 'transport_timestamp',
 'hospital_timestamp',
 'call_final_disposition',
 'available_timestamp',
 'address',
 'zipcode_of_incident',
 'battalion',
 'station_area']

In [101]:
df.select('received_timestamp').show(20, False)

+-------------------------+
|received_timestamp       |
+-------------------------+
|2009-12-21 14:38:56+00:00|
|2007-10-06 22:07:39+00:00|
|2009-03-26 23:01:24+00:00|
|2002-11-22 08:01:25+00:00|
|2017-01-19 11:03:38+00:00|
|2017-08-01 08:37:32+00:00|
|2015-11-13 03:00:54+00:00|
|2001-12-19 19:24:08+00:00|
|2001-02-11 11:07:36+00:00|
|2000-08-30 13:30:11+00:00|
|2002-05-11 14:02:42+00:00|
|2013-06-02 20:20:53+00:00|
|2012-02-29 00:10:29+00:00|
|2013-04-15 13:17:29+00:00|
|2013-01-21 14:19:19+00:00|
|2016-06-13 02:29:07+00:00|
|2009-10-08 03:11:14+00:00|
|2005-03-24 17:23:10+00:00|
|2010-10-11 17:27:39+00:00|
|2002-11-03 00:58:21+00:00|
+-------------------------+
only showing top 20 rows



In [116]:
my_rows = ['received_timestamp',
          'entry_timestamp',
          'dispatch_timestamp',
          'response_timestamp',
          'on_scene_timestamp',
          'transport_timestamp',
          'hospital_timestamp',
          'available_timestamp']

df_w_time = df
for row in my_rows:
    df_w_time = df_w_time.withColumn(row, to_timestamp(df[row], format = 'yyyy-MM-dd HH:mm:ss+00:00'))

In [117]:
df_w_time.show(2)

+----------------+-------------------+-------------------+-------------------+-------------------+-------------------+-------------------+-------------------+----------------------+-------------------+--------------------+-------------------+---------+------------+
|       call_type| received_timestamp|    entry_timestamp| dispatch_timestamp| response_timestamp| on_scene_timestamp|transport_timestamp| hospital_timestamp|call_final_disposition|available_timestamp|             address|zipcode_of_incident|battalion|station_area|
+----------------+-------------------+-------------------+-------------------+-------------------+-------------------+-------------------+-------------------+----------------------+-------------------+--------------------+-------------------+---------+------------+
|Medical Incident|2009-12-21 14:38:56|2009-12-21 14:39:41|2009-12-21 14:41:41|2009-12-21 14:41:59|2009-12-21 14:51:09|2009-12-21 15:07:26|2009-12-21 15:21:41|      Code 2 Transport|2009-12-21 15:47:56| 

In [137]:
small_df = df_w_time.select('call_type',
                 'received_timestamp',
                 'on_scene_timestamp',
                 'address',
                 'zipcode_of_incident',
                 'battalion',
                 'station_area')\
    .withColumn("response_time", 
                (unix_timestamp('on_scene_timestamp') - unix_timestamp('received_timestamp')) / 60)\
    .orderBy('received_timestamp', ascending=[0])

In [149]:
small_df.select('call_type').groupBy('call_type').count().orderBy('count', ascending=[0]).show(30,False)

+-----------------------------------+-----+
|call_type                          |count|
+-----------------------------------+-----+
|Medical Incident                   |6453 |
|Structure Fire                     |1319 |
|Alarms                             |1086 |
|Traffic Collision                  |460  |
|Other                              |162  |
|Citizen Assist / Service Call      |143  |
|Outside Fire                       |106  |
|Water Rescue                       |40   |
|Vehicle Fire                       |37   |
|Gas Leak (Natural and LP Gases)    |35   |
|Electrical Hazard                  |32   |
|Odor (Strange / Unknown)           |23   |
|Elevator / Escalator Rescue        |22   |
|Smoke Investigation (Outside)      |14   |
|HazMat                             |13   |
|Aircraft Emergency                 |8    |
|Fuel Spill                         |6    |
|Explosion                          |6    |
|Industrial Accidents               |5    |
|Assist Police                  

In [209]:
small_df.select('response_time').show(5, False)

+------------------+
|response_time     |
+------------------+
|6.65              |
|4.85              |
|21.733333333333334|
|7.466666666666667 |
|3.1166666666666667|
+------------------+
only showing top 5 rows



In [232]:
# might need to be careful about what we are grouping by
station_area_aggregates = small_df.select('station_area','battalion','response_time')\
    .filter('response_time is not null')\
    .groupBy('battalion', 'station_area')\
    .agg(mean('response_time'),
         variance('response_time'), 
         count('response_time'),
         kurtosis('response_time'),
         skewness('response_time'))\
    .orderBy('battalion')

col_names = station_area_aggregates.columns
for i in range(len(col_names[2:])):
    station_area_aggregates = station_area_aggregates\
        .withColumnRenamed(col_names[i+2], col_names[2+i].split('(')[0])
station_area_aggregates.columns

['battalion',
 'station_area',
 'avg',
 'var_samp',
 'count',
 'kurtosis',
 'skewness']

In [233]:
print(station_area_aggregates.count())
station_area_aggregates.show(10)

114
+---------+------------+------------------+-------------------+-----+--------------------+------------------+
|battalion|station_area|               avg|           var_samp|count|            kurtosis|          skewness|
+---------+------------+------------------+-------------------+-----+--------------------+------------------+
|      B01|          A1| 6.878571428571429|  7.353769841269842|    7|-0.23939421900317814|1.0206350661830932|
|      B01|          13| 7.716446402349489| 52.668870403579675|  227|   51.43983045642755| 5.985938690039854|
|      B01|          03| 8.799285714285714| 41.722732976765585|   70|  2.7434081814501114|1.8328941831480656|
|      B01|          16| 6.811904761904763|  4.844788359788359|    7|  0.6304996578427877|0.7919111812697118|
|      B01|          41|6.8463696369636935|  20.90235891089108|  101|  15.791962004879405| 3.376354744483347|
|      B01|          28| 9.063314711359407| 36.504327028365374|  179|   5.675194401469296| 2.202920064094221|
|     

In [337]:
df.select("timestamp").show(1, False)

+-------------------+
|timestamp          |
+-------------------+
|2006-10-17 17:20:00|
+-------------------+
only showing top 1 row



In [359]:
def t_series(interval='day'):
    df.select(date_trunc(interval, 'timestamp'))\
        .withColumnRenamed(f'date_trunc({interval}, timestamp)', interval)\
        .groupBy(interval)\
        .count()\
        .orderBy(interval, ascending=False).show()

In [361]:
t_series('year')

+-------------------+-----+
|               year|count|
+-------------------+-----+
|2018-01-01 00:00:00|   20|
|2017-01-01 00:00:00|  730|
|2016-01-01 00:00:00|  684|
|2015-01-01 00:00:00|  676|
|2014-01-01 00:00:00|  684|
|2013-01-01 00:00:00|  706|
|2012-01-01 00:00:00|  627|
|2011-01-01 00:00:00|  606|
|2010-01-01 00:00:00|  614|
|2009-01-01 00:00:00|  638|
|2008-01-01 00:00:00|  651|
|2007-01-01 00:00:00|  651|
|2006-01-01 00:00:00|  645|
|2005-01-01 00:00:00|  673|
|2004-01-01 00:00:00|  693|
|2003-01-01 00:00:00|  702|
+-------------------+-----+

