## Data Processing

In [1]:
import pyspark
from pyspark.sql.types import *
import os
import boto3
import numpy as np
from pyspark.sql.functions import *

In [2]:
sc = pyspark.SparkContext.getOrCreate()
ss = pyspark.sql.SparkSession.builder.getOrCreate()

In [3]:
bucket_name = 'msds697jonross.and.friends' # Add your bucket name
file_name = 'sffd.csv' # select file
s3 = boto3.resource('s3')
bucket = s3.Bucket(bucket_name) 
obj = bucket.Object(key=file_name) # S3 uses key-value structure where key is your file name
file_content = obj.get()["Body"].read().decode("utf-8") # Read the Body which is the contents of the file.

In [4]:
# number of rows (subract header and empty line at end)
rows = file_content.split('\n')
len(rows)-2

4557045

In [5]:
# number of  columns
column_names = rows[0].split(',')
n_cols = sc.broadcast(len(column_names))
n_cols.value

14

In [6]:
print('\n'.join(x for x in column_names))

call_type
received_timestamp
entry_timestamp
dispatch_timestamp
response_timestamp
on_scene_timestamp
transport_timestamp
hospital_timestamp
call_final_disposition
available_timestamp
address
zipcode_of_incident
battalion
station_area


In [30]:
# randomly sample rows
sz=100000
samples = np.random.choice(rows[1:], size=sz, replace=False)
samples[:2]

array(['Alarms,2004-10-24 11:16:45+00:00,2004-10-24 11:18:15+00:00,2004-10-24 11:18:23+00:00,2004-10-24 11:18:31+00:00,2004-10-24 11:21:52+00:00,,,Other,2004-10-24 11:24:41+00:00,0 Block of SAN FERNANDO WAY,94127,B08,A3',
       'Medical Incident,2012-02-18 01:36:32+00:00,2012-02-18 01:39:11+00:00,2012-02-18 01:39:48+00:00,2012-02-18 01:39:57+00:00,2012-02-18 01:50:19+00:00,2012-02-18 02:02:45+00:00,2012-02-18 02:08:54+00:00,Other,2012-02-18 02:31:20+00:00,3100 Block of FILLMORE ST,94123,B04,16'],
      dtype='<U315')

In [31]:
def filter_fire(x):
    return len(x.split(',')) == n_cols.value

rdd = sc.parallelize(list(samples))\
    .filter(filter_fire)\
    .map(lambda x: x.split(','))

In [32]:
# number of rows removed
sz - rdd.count()

141

In [33]:
schema = StructType([StructField("call_type", StringType(), False),
                    StructField("received_timestamp", StringType(), False),
                    StructField("entry_timestamp", StringType(), False),
                    StructField("dispatch_timestamp", StringType(), False),
                    StructField("response_timestamp", StringType(), False),
                    StructField("on_scene_timestamp", StringType(), False),
                    StructField("transport_timestamp", StringType(), False),
                    StructField("hospital_timestamp", StringType(), False),
                    StructField("call_final_disposition", StringType(), False),
                    StructField("available_timestamp", StringType(), False),
                    StructField("address", StringType(), False),
                    StructField("zipcode_of_incident", StringType(), False),
                    StructField("battalion", StringType(), False),
                    StructField("station_area", StringType(), False)
                    ])

In [34]:
df = ss.createDataFrame(rdd, schema) # .cache()

In [35]:
df.printSchema()

root
 |-- call_type: string (nullable = false)
 |-- received_timestamp: string (nullable = false)
 |-- entry_timestamp: string (nullable = false)
 |-- dispatch_timestamp: string (nullable = false)
 |-- response_timestamp: string (nullable = false)
 |-- on_scene_timestamp: string (nullable = false)
 |-- transport_timestamp: string (nullable = false)
 |-- hospital_timestamp: string (nullable = false)
 |-- call_final_disposition: string (nullable = false)
 |-- available_timestamp: string (nullable = false)
 |-- address: string (nullable = false)
 |-- zipcode_of_incident: string (nullable = false)
 |-- battalion: string (nullable = false)
 |-- station_area: string (nullable = false)



In [36]:
# convert to timestamps
my_rows = ['received_timestamp',
          'entry_timestamp',
          'dispatch_timestamp',
          'response_timestamp',
          'on_scene_timestamp',
          'transport_timestamp',
          'hospital_timestamp',
          'available_timestamp']

df_w_time = df
for row in my_rows:
    df_w_time = df_w_time.withColumn(row, to_timestamp(df[row], format = 'yyyy-MM-dd HH:mm:ss+00:00'))

In [37]:
small_df = df_w_time.select('call_type',
                 'received_timestamp',
                 'on_scene_timestamp',
                 'zipcode_of_incident',
                 'battalion',
                 'station_area')\
    .withColumn("label", 
                (unix_timestamp('on_scene_timestamp') - unix_timestamp('received_timestamp')) / 60)\
    .orderBy('received_timestamp', ascending=[0])\
    .na.drop(subset=["label"])\
    .select('call_type',
            'zipcode_of_incident',
            'battalion',
            'station_area',
            'label')\


In [38]:
small_df.show(5)
print(small_df.count())

+----------------+-------------------+---------+------------+-----------------+
|       call_type|zipcode_of_incident|battalion|station_area|            label|
+----------------+-------------------+---------+------------+-----------------+
|Medical Incident|              94131|      B06|          11|5.616666666666666|
|Medical Incident|              94110|      B06|          09|              4.1|
|Medical Incident|              94110|      B06|          07|5.233333333333333|
|Medical Incident|              94108|      B01|          13|              5.6|
|Medical Incident|              94111|      B01|          13|8.883333333333333|
+----------------+-------------------+---------+------------+-----------------+
only showing top 5 rows

76624


## Machine Learning

In [39]:
from pyspark.ml.feature import StringIndexer
from pyspark.ml.feature import OneHotEncoder

from pyspark.ml.feature import VectorAssembler
from pyspark.ml.regression import RandomForestRegressor
from pyspark.ml.evaluation import RegressionEvaluator

In [40]:
def remove_empty_strings(df):
    return df.replace('','unknown')
dfnonas = remove_empty_strings(small_df)

In [41]:
#converting strings to numeric values
def indexStringColumns(df, cols):
    #variable newdf will be updated several times
    newdf = df
    
    for c in cols:
        #For each given colum, fits StringIndexerModel.
        si = StringIndexer(inputCol=c, outputCol=c+"-num")
        sm = si.fit(newdf)
        #Creates a DataFame by putting the transformed values in the new colum with suffix "-num" 
        #and then drops the original columns.
        #and drop the "-num" suffix. 
        newdf = sm.transform(newdf).drop(c)
        newdf = newdf.withColumnRenamed(c+"-num", c)
    return newdf

dfnumeric = indexStringColumns(dfnonas, ["call_type", "zipcode_of_incident", "battalion", "station_area"])

In [42]:
dfnumeric.show()

+------------------+---------+-------------------+---------+------------+
|             label|call_type|zipcode_of_incident|battalion|station_area|
+------------------+---------+-------------------+---------+------------+
| 5.616666666666666|      0.0|               20.0|      7.0|        11.0|
|               4.1|      0.0|                2.0|      7.0|        36.0|
| 5.233333333333333|      0.0|                2.0|      7.0|         3.0|
|               5.6|      0.0|               17.0|      2.0|         5.0|
| 8.883333333333333|      0.0|               21.0|      2.0|         5.0|
|              5.25|      3.0|                1.0|      1.0|        28.0|
| 7.166666666666667|      2.0|               18.0|      0.0|        24.0|
| 6.916666666666667|      0.0|               20.0|      7.0|        38.0|
| 6.683333333333334|      1.0|               10.0|      8.0|         4.0|
| 9.033333333333333|      0.0|                3.0|      3.0|         1.0|
|               0.0|      5.0|        

In [43]:
def oneHotEncodeColumns(df, cols):
    newdf = df
    for c in cols:
        #For each given colum, create OneHotEncoder. 
        #dropLast : Whether to drop the last category in the encoded vector (default: true)
        onehotenc = OneHotEncoder(inputCol=c, outputCol=c+"-onehot", dropLast=False)
        #Creates a DataFame by putting the transformed values in the new colum with suffix "-onehot" 
        #and then drops the original columns.
        #and drop the "-onehot" suffix. 
        newdf = onehotenc.transform(newdf).drop(c)
        newdf = newdf.withColumnRenamed(c+"-onehot", c)
    return newdf

dfhot = oneHotEncodeColumns(dfnumeric, ["call_type", "zipcode_of_incident", "battalion", "station_area"])

In [44]:
dfhot.show()

+------------------+--------------+-------------------+--------------+---------------+
|             label|     call_type|zipcode_of_incident|     battalion|   station_area|
+------------------+--------------+-------------------+--------------+---------------+
| 5.616666666666666|(29,[0],[1.0])|    (28,[20],[1.0])|(11,[7],[1.0])|(51,[11],[1.0])|
|               4.1|(29,[0],[1.0])|     (28,[2],[1.0])|(11,[7],[1.0])|(51,[36],[1.0])|
| 5.233333333333333|(29,[0],[1.0])|     (28,[2],[1.0])|(11,[7],[1.0])| (51,[3],[1.0])|
|               5.6|(29,[0],[1.0])|    (28,[17],[1.0])|(11,[2],[1.0])| (51,[5],[1.0])|
| 8.883333333333333|(29,[0],[1.0])|    (28,[21],[1.0])|(11,[2],[1.0])| (51,[5],[1.0])|
|              5.25|(29,[3],[1.0])|     (28,[1],[1.0])|(11,[1],[1.0])|(51,[28],[1.0])|
| 7.166666666666667|(29,[2],[1.0])|    (28,[18],[1.0])|(11,[0],[1.0])|(51,[24],[1.0])|
| 6.916666666666667|(29,[0],[1.0])|    (28,[20],[1.0])|(11,[7],[1.0])|(51,[38],[1.0])|
| 6.683333333333334|(29,[1],[1.0])|    (28,

In [45]:
# Create Training and Test data.
sets = dfhot.randomSplit([0.8, 0.2])
train = sets[0] # .cache()
test = sets[1] # .cache()

In [46]:
in_cols = small_df.columns[:-1]
in_cols

['call_type', 'zipcode_of_incident', 'battalion', 'station_area']

In [47]:
va = VectorAssembler(outputCol="features",
                     inputCols=in_cols)

In [48]:
train_for_model = va.transform(train)\
    .select("features", "label")
test_for_model = va.transform(test)\
    .select("features", "label")

In [49]:
train_for_model.show(1,False)

+------------------------------------+-----+
|features                            |label|
+------------------------------------+-----+
|(119,[0,30,57,68],[1.0,1.0,1.0,1.0])|0.0  |
+------------------------------------+-----+
only showing top 1 row



In [50]:
# Fit Random Forest Regressor
rf = RandomForestRegressor()
rf_fitted = rf.fit(train_for_model)

In [51]:
# Evaluate model results
test_pred = rf_fitted.transform(test_for_model)
r2_ev = RegressionEvaluator(metricName='r2')
rmse_ev = RegressionEvaluator(metricName='rmse')
mae_ev = RegressionEvaluator(metricName='mae')
r2 = r2_ev.evaluate(test_pred)
rmse = rmse_ev.evaluate(test_pred)
mae = mae_ev.evaluate(test_pred)

In [52]:
print(f"R^2:  {r2}\nRMSE: {rmse}\nMAE:  {mae}")

R^2:  0.0038455312058562896
RMSE: 9.660442607036526
MAE:  4.3166788231828415
