## Data Processing

In [1]:
import pyspark
from pyspark.sql.types import *
import os
import boto3
import numpy as np
from pyspark.sql.functions import *
from pyspark.ml import Pipeline

In [2]:
# Attach to the already-running Spark context/session if there is one,
# otherwise create them. `sc` is the RDD entry point, `ss` the DataFrame one.
sc = pyspark.SparkContext.getOrCreate()
ss = pyspark.sql.SparkSession.builder.getOrCreate()

In [3]:
# Handle on the S3 bucket that holds both input CSV files.
# No credentials are passed explicitly; boto3 is left to locate them itself.
bucket_name = 'msds697jonross.and.friends' # Add your bucket name
s3 = boto3.resource('s3')
bucket = s3.Bucket(bucket_name) 

In [4]:
# Download the fire-department CSV and hold the whole file in memory as one
# UTF-8 decoded string (it is split into lines in a later cell).
file_name_fire = 'sffd.csv' # select file
obj_fire = bucket.Object(key=file_name_fire) # S3 uses key-value structure where key is your file name
file_content_fire = obj_fire.get()["Body"].read().decode("utf-8") # Read the Body which is the contents of the file.

In [5]:
# Download the Google distance/duration CSV the same way: whole file, one
# UTF-8 decoded string in memory.
file_name_goo = 'google_data.csv' # select file
obj_goo = bucket.Object(key=file_name_goo) # S3 uses key-value structure where key is your file name
file_content_goo = obj_goo.get()["Body"].read().decode("utf-8") # Read the Body which is the contents of the file.

In [6]:
# number of data rows (subtract the header line and the empty line at the end)
# rows_fire itself still contains both of those lines.
rows_fire = file_content_fire.split('\n')
len(rows_fire)-2

4557045

In [7]:
# number of rows (subtract the empty line at the end)
# NOTE(review): no header line is subtracted here — presumably this file has
# no header row; confirm against the file.
rows_goo = file_content_goo.split('\n')
len(rows_goo)-1

32074

In [8]:
# number of columns, taken from the header line of the fire data
column_names_fire = rows_fire[0].split(',')
# Broadcast the expected field count; filter_fire reads n_cols.value to
# reject rows that do not split into exactly this many fields.
n_cols = sc.broadcast(len(column_names_fire))
n_cols.value

14

In [9]:
# List the fire-data column names, one per line.
print(*column_names_fire, sep='\n')

call_type
received_timestamp
entry_timestamp
dispatch_timestamp
response_timestamp
on_scene_timestamp
transport_timestamp
hospital_timestamp
call_final_disposition
available_timestamp
address
zipcode_of_incident
battalion
station_area


In [10]:
# randomly sample rows
# A dedicated fixed-seed generator makes the sample identical on every
# Restart & Run All. Row *indices* are drawn rather than the strings
# themselves, so numpy never has to copy all of rows_fire into one
# fixed-width unicode array just to pick `sz` of them.
sz=100000
sample_rng = np.random.RandomState(42)
# rows_fire[0] is the header, so draw from positions 1 .. len(rows_fire)-1.
# As before, the trailing empty line can be drawn; filter_fire rejects it
# later because it does not split into n_cols fields.
sample_idx = sample_rng.choice(len(rows_fire) - 1, size=sz, replace=False) + 1
samples_fire = np.array([rows_fire[i] for i in sample_idx])
samples_fire[:2]

array(['HazMat,2010-09-01 07:53:36+00:00,2010-09-01 07:54:46+00:00,2010-09-01 07:55:07+00:00,2010-09-01 07:55:55+00:00,,,,Other,2010-09-01 08:00:31+00:00,400 Block of HOWARD ST,94105,B03,35',
       'Medical Incident,2014-12-21 15:40:07+00:00,2014-12-21 15:43:30+00:00,2014-12-21 15:43:57+00:00,2014-12-21 15:45:06+00:00,2014-12-21 15:48:26+00:00,,,Unable to Locate,2014-12-21 15:52:32+00:00,YORK ST/CESAR CHAVEZ ST,94110,B06,09'],
      dtype='<U315')

In [11]:
# Station-area codes kept in the analysis, broadcast as zero-padded
# two-character strings ('01' ... '51') for use inside filter_fire.
station_numbers = [*range(1, 27), 28, 29, *range(31, 45), 48, 49, 51]
stations = sc.broadcast([f"{n:02d}" for n in station_numbers])
# inspect with: stations.value

In [12]:
def filter_fire(x):
    """Decide whether one raw fire-data CSV line is kept.

    A line is kept only if it splits on ',' into exactly ``n_cols.value``
    fields and its field 13 (``station_area`` in the header) is one of the
    broadcast ``stations``.

    Parameters
    ----------
    x : str
        One line of the fire CSV.

    Returns
    -------
    bool
        True to keep the line, False to drop it. (Always a real bool; the
        wrong-field-count case no longer falls through to an implicit None.)
    """
    fields = x.split(',')  # split once and reuse
    if len(fields) != n_cols.value:
        # Wrong field count: malformed line, or a comma inside a field.
        return False
    return fields[13] in stations.value

def map_fire(x):
    """Key one fire-data CSV line for the join with the Google data.

    Returns ``((station_area, address), [call_type, received_timestamp,
    on_scene_timestamp, zipcode_of_incident])``, i.e. fields (13, 10) as the
    key and fields 0, 1, 5 and 11 as the value list.
    """
    fields = x.split(',')
    join_key = (fields[13], fields[10])
    payload = fields[:2] + [fields[5], fields[11]]
    return (join_key, payload)

def map_google(x):
    """Key one Google-data CSV line for the join with the fire data.

    The first two comma-separated fields become the key tuple and the
    remaining fields the value list. A first field that parses as an integer
    in 0..9 is rewritten as a two-character zero-padded string so it matches
    the zero-padded station codes ('01'..'09') used on the fire side.

    Parameters
    ----------
    x : str
        One line of the Google CSV.

    Returns
    -------
    tuple
        ``(tuple_of_first_two_fields, list_of_remaining_fields)``.

    Notes
    -----
    A first field that is not an integer (header text, an empty line) is
    passed through unchanged instead of raising. Values that are already
    padded ('05') stay '05' rather than becoming '005', and negative numbers
    are left alone.
    """
    s = x.split(',')
    try:
        station = int(s[0])
    except ValueError:
        # Not a number: nothing to normalise, keep the field as it came in.
        station = None
    if station is not None and 0 <= station < 10:
        s[0] = f"{station:02d}"
    return (tuple(s[:2]), s[2:])

def map_joined(x):
    """Flatten one joined record into a single row list.

    Parameters
    ----------
    x : tuple
        ``(key, (fire_fields, google_fields))`` as produced by the left outer
        join; ``google_fields`` is None when the key had no Google match.

    Returns
    -------
    list
        ``list(key) + fire_fields + [float(v) for v in google_fields]``.
        When the Google side is missing or contains a value that cannot be
        converted to float, the numeric tail is ``[None, None]`` instead.

    Only the float conversion is guarded, and only against TypeError
    (missing side / None element) and ValueError (non-numeric text), so any
    other unexpected error surfaces instead of being silently turned into
    missing data.
    """
    prefix = list(x[0]) + x[1][0]
    try:
        metrics = [float(y) for y in x[1][1]]
    except (TypeError, ValueError):
        metrics = [None, None]
    return prefix + metrics

In [13]:
# Distribute the sampled fire rows, drop rows with the wrong field count or
# an unlisted station (filter_fire), then key each row by
# (station_area, address) for the join (map_fire).
rdd_fire = sc.parallelize(list(samples_fire))\
    .filter(filter_fire)\
    .map(map_fire)

In [14]:
# Distribute every line of the Google file and key it with map_google.
# rows_goo is not sliced, so the trailing empty line is mapped as well.
rdd_google = sc.parallelize(rows_goo)\
    .map(map_google)

In [15]:
# Peek at two keyed fire records: ((station_area, address), [fields...]).
rdd_fire.take(2)

[(('35', '400 Block of HOWARD ST'),
  ['HazMat', '2010-09-01 07:53:36+00:00', '', '94105']),
 (('09', 'YORK ST/CESAR CHAVEZ ST'),
  ['Medical Incident',
   '2014-12-21 15:40:07+00:00',
   '2014-12-21 15:48:26+00:00',
   '94110'])]

In [16]:
# Peek at two keyed Google records; the value list holds the fields that
# later become `distance` and `duration`.
rdd_google.take(2)

[(('01', '0 Block of 0NB OCTAVIA OF'), ['3724', '12.883333333333333']),
 (('01', '0 Block of 101 NB OCTAVIA OF'), ['4290', '9.416666666666666'])]

In [17]:
# Left outer join on the (station_area, address) key: every fire record is
# kept, with None on the right-hand side when there is no Google match
# (map_joined turns that into [None, None]).
joined = rdd_fire.leftOuterJoin(rdd_google)

In [18]:
# Peek at two joined records: (key, (fire_fields, google_fields)).
joined.take(2)

[(('01', 'GRANT AV/POST ST'),
  (['Medical Incident', '2001-03-19 12:09:43+00:00', '', '94108'],
   ['1531', '6.816666666666666'])),
 (('01', 'GRANT AV/POST ST'),
  (['Structure Fire', '2014-07-26 05:12:24+00:00', '', '94108'],
   ['1531', '6.816666666666666']))]

In [19]:
# Flatten each joined record into one row list matching the schema below.
rdd = joined.map(map_joined)

In [20]:
# Peek at two flattened rows (six strings followed by two floats or Nones).
rdd.take(2)

[['01',
  'GRANT AV/POST ST',
  'Medical Incident',
  '2001-03-19 12:09:43+00:00',
  '',
  '94108',
  1531.0,
  6.816666666666666],
 ['01',
  'GRANT AV/POST ST',
  'Structure Fire',
  '2014-07-26 05:12:24+00:00',
  '',
  '94108',
  1531.0,
  6.816666666666666]]

In [21]:
# number of rows removed: sample size minus the row count after filtering and joining
# NOTE(review): this is a net figure — it assumes each fire row matches at
# most one Google row; duplicate Google keys would add rows. Confirm.
sz - rdd.count()

1495

In [22]:
# Column layout of the flattened rows: six non-nullable string columns
# followed by the two nullable float columns that come from the Google data.
schema = StructType(
    [StructField(name, StringType(), False)
     for name in ("station_area",
                  "address",
                  "call_type",
                  "received_timestamp",
                  "on_scene_timestamp",
                  "zipcode_of_incident")]
    + [StructField(name, FloatType(), True)
       for name in ("distance", "duration")]
)

In [23]:
# Build the DataFrame from the flattened RDD using the explicit schema above.
df = ss.createDataFrame(rdd, schema) # .cache()

In [24]:
# Confirm the column names, types and nullability that were applied.
df.printSchema()

root
 |-- station_area: string (nullable = false)
 |-- address: string (nullable = false)
 |-- call_type: string (nullable = false)
 |-- received_timestamp: string (nullable = false)
 |-- on_scene_timestamp: string (nullable = false)
 |-- zipcode_of_incident: string (nullable = false)
 |-- distance: float (nullable = true)
 |-- duration: float (nullable = true)



In [25]:
# convert to timestamps
# `my_rows` holds the *column names* to convert (not rows); each one is
# replaced in place by its parsed timestamp.
my_rows = ['received_timestamp',
          'on_scene_timestamp']

df_w_time = df
for row in my_rows:
    # The pattern spells out the literal "+00:00" suffix seen in the data.
    # NOTE(review): values that do not match the pattern (e.g. the empty
    # on_scene strings) presumably become null — confirm.
    df_w_time = df_w_time.withColumn(row, to_timestamp(df[row], format = 'yyyy-MM-dd HH:mm:ss+00:00'))

In [89]:
# Build the modelling table.
#  - label = (on_scene_timestamp - received_timestamp) in minutes.
#  - rows with a null label and rows whose label is exactly 0 are dropped.
#  - the first select lists the same eight columns df_w_time already has;
#    the final select keeps the five feature columns plus `label`.
# NOTE(review): the orderBy on received_timestamp runs before that column is
# dropped, so it presumably only affects the row order of later show() calls
# — confirm it is still wanted.
data_full = df_w_time.select('station_area',
                 'address',
                 'call_type',
                 'received_timestamp',
                 'on_scene_timestamp',
                 'zipcode_of_incident',
                 'distance',
                 'duration')\
    .withColumn("label", 
                (unix_timestamp('on_scene_timestamp') - unix_timestamp('received_timestamp')) / 60)\
    .orderBy('received_timestamp', ascending=[0])\
    .na.drop(subset=["label"])\
    .where('label != 0')\
    .select('station_area',
        'call_type',
        'zipcode_of_incident',
        'distance',
        'duration',
        'label')

# data_1 is the largest dataset
# has all columns and only removes rows with missing google data
# Odd-numbered sets (1, 3, 7, 9) plus data_2 keep all columns; data_4 and the
# sets derived from it (5, 6, 8, 10) keep only distance, duration and label.
# data_7..data_10 are derived from data_1 / data_4 directly, so they apply a
# label cut-off but NOT the distance < 8000 filter.
data_1 = data_full.na.drop(subset=["distance", "duration"]) # (removes about 50 rows)
data_2 = data_1.where('distance < 8000') # approximately 5 miles (removes about 700 rows)
data_3 = data_2.where('label < 30') # less than 30 minutes (removes about 1100 rows)
data_4 = data_1.select('distance','duration','label') # just the duration and distance columns
data_5 = data_4.where('distance < 8000')
data_6 = data_5.where('label < 30')
data_7 = data_1.where('label < 20')
data_8 = data_4.where('label < 20')
data_9 = data_1.where('label < 10')
data_10 = data_4.where('label < 10')

In [96]:
# Show a few rows of the widest dataset, then print the row count of every
# variant in order (data_full, data_1, ..., data_10). Each count() is a
# separate Spark action.
data_1.show(5)
for d in (data_full, data_1, data_2, data_3, data_4, data_5, data_6, data_7, data_8,data_9,data_10):
    print(d.count())

+------------+-----------------+-------------------+--------+---------+------------------+
|station_area|        call_type|zipcode_of_incident|distance| duration|             label|
+------------+-----------------+-------------------+--------+---------+------------------+
|          18|            Other|              94116|  1820.0| 4.483333|12.916666666666666|
|          21| Medical Incident|              94117|  1439.0|      4.5| 6.833333333333333|
|          11|Traffic Collision|              94110|   429.0|      1.5|               4.0|
|          42|Traffic Collision|              94134|    92.0|     0.55|               3.0|
|          08| Medical Incident|              94107|   906.0|4.1666665|             15.15|
+------------+-----------------+-------------------+--------+---------+------------------+
only showing top 5 rows

75309
75250
74510
73357
75250
74510
73357
71507
71507
58061
58061


In [28]:
# Mean and sample variance of the observed response time (`label`) versus
# the Google `duration` on data_6, for a quick side-by-side comparison.
data_6.select(mean('label'), mean('duration'),
             variance('label'), variance('duration')).show()

+-----------------+------------------+------------------+------------------+
|       avg(label)|     avg(duration)|   var_samp(label)|var_samp(duration)|
+-----------------+------------------+------------------+------------------+
|7.900106556520394|3.9686669308964615|21.445902259004406| 6.090160642913485|
+-----------------+------------------+------------------+------------------+



In [29]:
# First ten rows of the distance/duration-only dataset.
data_6.show(10)

+--------+----------+------------------+
|distance|  duration|             label|
+--------+----------+------------------+
|  1820.0|  4.483333|12.916666666666666|
|  1439.0|       4.5| 6.833333333333333|
|   429.0|       1.5|               4.0|
|    92.0|      0.55|               3.0|
|   906.0| 4.1666665|             15.15|
|  1212.0| 3.3333333|              11.8|
|   163.0|0.96666664| 5.766666666666667|
|  1283.0|      6.35|              5.95|
|  1747.0| 5.9666667|              6.55|
|  1441.0|  4.233333| 9.266666666666667|
+--------+----------+------------------+
only showing top 10 rows



## Machine Learning

In [30]:
from pyspark.ml.feature import StringIndexer
from pyspark.ml.feature import OneHotEncoder

from pyspark.ml.feature import VectorAssembler
from pyspark.ml.regression import RandomForestRegressor
from pyspark.ml.regression import LinearRegression
from pyspark.ml.evaluation import RegressionEvaluator

In [32]:
def remove_empty_strings(df):
    """Return `df` with every empty-string value replaced by 'unknown'."""
    cleaned = df.replace('', 'unknown')
    return cleaned
# dfnonas = remove_empty_strings(small_df)

In [33]:
#converting strings to numeric values
def indexStringColumns(df, cols):
    """Replace each column in `cols` by its StringIndexer-encoded version.

    For every name in `cols`, a StringIndexer is fitted on the current frame
    and its output is written to a temporary "<name>-num" column; the
    original column is then dropped and the temporary one renamed back to
    `name`. Columns are processed in the order given.

    Returns the resulting DataFrame (`df` itself when `cols` is empty).
    """
    indexed = df
    for name in cols:
        tmp_name = name + "-num"
        model = StringIndexer(inputCol=name, outputCol=tmp_name).fit(indexed)
        indexed = (
            model.transform(indexed)
            .drop(name)
            .withColumnRenamed(tmp_name, name)
        )
    return indexed

# dfnumeric = indexStringColumns(dfnonas, ["call_type", "zipcode_of_incident", "station_area"])
# dfnumeric = indexStringColumns(dfnonas, [])

In [34]:
def oneHotEncodeColumns(df, cols):
    """Replace each (already indexed) column in `cols` by its one-hot vector.

    For every name in `cols`, the column is encoded into a temporary
    "<name>-onehot" column (dropLast=False, so every category keeps its own
    slot); the original column is then dropped and the temporary one renamed
    back to `name`.

    Parameters
    ----------
    df : DataFrame
        Input frame; the columns in `cols` must hold numeric category indices.
    cols : list of str
        Names of the columns to encode, processed in order.

    Returns
    -------
    DataFrame
        The encoded frame (`df` itself when `cols` is empty).

    Notes
    -----
    In older Spark releases OneHotEncoder is a plain transformer; in newer
    ones it is an estimator that must be fitted first. Both are supported:
    when the encoder exposes `fit`, the fitted model does the transform.
    """
    newdf = df
    for c in cols:
        tmp_name = c + "-onehot"
        onehotenc = OneHotEncoder(inputCol=c, outputCol=tmp_name, dropLast=False)
        # Estimator-style encoders have no transform() of their own.
        encoder = onehotenc.fit(newdf) if hasattr(onehotenc, "fit") else onehotenc
        newdf = encoder.transform(newdf).drop(c)
        newdf = newdf.withColumnRenamed(tmp_name, c)
    return newdf

# dfhot = oneHotEncodeColumns(dfnumeric, ["call_type", "zipcode_of_incident", "station_area"])
# dfhot = oneHotEncodeColumns(dfnumeric, [])

In [35]:
def process(df, cat_cols, hot_cols):
    """Run the full feature-preparation chain on `df`.

    Steps, in order: replace empty strings, string-index the columns in
    `cat_cols`, then one-hot encode the columns in `hot_cols`.
    """
    return oneHotEncodeColumns(
        indexStringColumns(remove_empty_strings(df), cat_cols),
        hot_cols,
    )

### Choose data

In [106]:
# Dataset used for this run. Everything below (split, fit, evaluation) works
# on `processed_data`, so change the dataset and the column lists here to
# reproduce a different results cell. data_10 has only numeric columns,
# hence the empty cat_cols / hot_cols.
processed_data = process(data_10,
                         cat_cols=[],
                         hot_cols=[])

# processed_data = process(data_9,
#                          cat_cols=["call_type", "zipcode_of_incident", "station_area"],
#                          hot_cols=["call_type", "zipcode_of_incident", "station_area"])

In [107]:
# Create training and test data.
# The fixed seed keeps the 80/20 assignment stable across re-runs (for the
# same processed_data), so metrics from different runs are comparable.
sets = processed_data.randomSplit([0.8, 0.2], seed=42)
train = sets[0] # .cache()
test = sets[1] # .cache()

In [108]:
# make pipelines
# Every column of processed_data except `label` is assembled into the
# `features` vector; both models are built with their default parameters.
va = VectorAssembler(outputCol="features",
                     inputCols=[x for x in processed_data.columns if x != 'label'])

rf = RandomForestRegressor()
lr = LinearRegression()

# Each pipeline is: assemble features -> fit regressor.
pipeline_lr = Pipeline(stages=[va,lr])
pipeline_rf = Pipeline(stages=[va,rf])

In [109]:
# fit models
# Both pipelines are fitted on the same training split; the fitted objects
# are what the results cells below evaluate.
lr_fitted = pipeline_lr.fit(train)
rf_fitted = pipeline_rf.fit(train)

## Results - careful about re-running cells
In each section, the first cell shows results using all columns and the second cell uses just `distance` and `duration`. Once we start restricting response times, the metrics are consistently better when we use all of the columns rather than just these two.

In [94]:
# Evaluate model results
def evaluate_regression(fitted_model, test_set):
    """Score a fitted model on `test_set`.

    The model's predictions are computed once and then evaluated with three
    RegressionEvaluator metrics (using the evaluator's default label and
    prediction column names).

    Returns a tuple ``(r2, rmse, mae)``.
    """
    scored = fitted_model.transform(test_set)
    return tuple(
        RegressionEvaluator(metricName=metric).evaluate(scored)
        for metric in ('r2', 'rmse', 'mae')
    )

### All data

Random Forest has much lower RMSE, other metrics about the same

In [41]:
# data_1 — all columns, only missing Google data removed.
# This cell scores whatever lr_fitted / rf_fitted / test are currently in
# memory; the output below presumably comes from a run with process(data_1, ...).
ev_lr = evaluate_regression(fitted_model=lr_fitted, test_set=test)
print(f"Linear Regression:\nR^2:  {ev_lr[0]}\nRMSE: {ev_lr[1]}\nMAE:  {ev_lr[2]}")
ev_rf = evaluate_regression(fitted_model=rf_fitted, test_set=test)
print(f"Random Forest\nR^2:  {ev_rf[0]}\nRMSE: {ev_rf[1]}\nMAE:  {ev_rf[2]}")

Linear Regression:
R^2:  0.024123781896566276
RMSE: 16.91553255119037
MAE:  4.1326953987440325
Random Forest
R^2:  0.007411846391023991
RMSE: 8.521495726129338
MAE:  4.15831882230389


In [65]:
# data_4 — distance and duration only, only missing Google data removed.
# This cell scores whatever lr_fitted / rf_fitted / test are currently in
# memory; the output below presumably comes from a run with process(data_4, ...).
ev_lr = evaluate_regression(fitted_model=lr_fitted, test_set=test)
print(f"Linear Regression:\nR^2:  {ev_lr[0]}\nRMSE: {ev_lr[1]}\nMAE:  {ev_lr[2]}")
ev_rf = evaluate_regression(fitted_model=rf_fitted, test_set=test)
print(f"Random Forest\nR^2:  {ev_rf[0]}\nRMSE: {ev_rf[1]}\nMAE:  {ev_rf[2]}")

Linear Regression:
R^2:  0.0025279094655130674
RMSE: 15.984160416015817
MAE:  4.4516522225800355
Random Forest
R^2:  0.002132135890805853
RMSE: 11.290663267309872
MAE:  4.2393099674474914


### Distance under 5 miles

Random Forest has larger RMSE when using just `distance` and `duration`

In [47]:
# data_2 — all columns, distance < 8000.
# This cell scores whatever lr_fitted / rf_fitted / test are currently in
# memory; the output below presumably comes from a run with process(data_2, ...).
ev_lr = evaluate_regression(fitted_model=lr_fitted, test_set=test)
print(f"Linear Regression:\nR^2:  {ev_lr[0]}\nRMSE: {ev_lr[1]}\nMAE:  {ev_lr[2]}")
ev_rf = evaluate_regression(fitted_model=rf_fitted, test_set=test)
print(f"Random Forest\nR^2:  {ev_rf[0]}\nRMSE: {ev_rf[1]}\nMAE:  {ev_rf[2]}")

Linear Regression:
R^2:  0.011747170232487392
RMSE: 18.237697208031083
MAE:  4.138500431421881
Random Forest
R^2:  0.07787912148991272
RMSE: 11.024637278044311
MAE:  4.17742435452288


In [71]:
# data_5 — distance and duration only, distance < 8000.
# This cell scores whatever lr_fitted / rf_fitted / test are currently in
# memory; the output below presumably comes from a run with process(data_5, ...).
ev_lr = evaluate_regression(fitted_model=lr_fitted, test_set=test)
print(f"Linear Regression:\nR^2:  {ev_lr[0]}\nRMSE: {ev_lr[1]}\nMAE:  {ev_lr[2]}")
ev_rf = evaluate_regression(fitted_model=rf_fitted, test_set=test)
print(f"Random Forest\nR^2:  {ev_rf[0]}\nRMSE: {ev_rf[1]}\nMAE:  {ev_rf[2]}")

Linear Regression:
R^2:  0.004887448693409291
RMSE: 9.302058481339335
MAE:  4.339269175512891
Random Forest
R^2:  0.003757202013706773
RMSE: 12.16717568442183
MAE:  4.267489319780205


### Response time under 30 minutes and distance under 5 miles
Linear regression on all columns shows best results so far, but rf and lr are about even

In [53]:
# data_3 — all columns, distance < 8000 and label < 30.
# This cell scores whatever lr_fitted / rf_fitted / test are currently in
# memory; the output below presumably comes from a run with process(data_3, ...).
ev_lr = evaluate_regression(fitted_model=lr_fitted, test_set=test)
print(f"Linear Regression:\nR^2:  {ev_lr[0]}\nRMSE: {ev_lr[1]}\nMAE:  {ev_lr[2]}")
ev_rf = evaluate_regression(fitted_model=rf_fitted, test_set=test)
print(f"Random Forest\nR^2:  {ev_rf[0]}\nRMSE: {ev_rf[1]}\nMAE:  {ev_rf[2]}")

Linear Regression:
R^2:  0.05760830583437737
RMSE: 4.448589286539655
MAE:  3.157774939960122
Random Forest
R^2:  0.05139482839197307
RMSE: 4.570550521980881
MAE:  3.199242685090251


In [77]:
# data_6 — distance and duration only, distance < 8000 and label < 30.
# This cell scores whatever lr_fitted / rf_fitted / test are currently in
# memory; the output below presumably comes from a run with process(data_6, ...).
ev_lr = evaluate_regression(fitted_model=lr_fitted, test_set=test)
print(f"Linear Regression:\nR^2:  {ev_lr[0]}\nRMSE: {ev_lr[1]}\nMAE:  {ev_lr[2]}")
ev_rf = evaluate_regression(fitted_model=rf_fitted, test_set=test)
print(f"Random Forest\nR^2:  {ev_rf[0]}\nRMSE: {ev_rf[1]}\nMAE:  {ev_rf[2]}")

Linear Regression:
R^2:  0.007980575876808005
RMSE: 4.645194347474092
MAE:  3.284769959525284
Random Forest
R^2:  0.012756496234489823
RMSE: 4.5969485899603635
MAE:  3.284296656386548


### Response time under 20 minutes

In [84]:
# data_7 — all columns, label < 20 (no distance filter).
# This cell scores whatever lr_fitted / rf_fitted / test are currently in
# memory; the output below presumably comes from a run with process(data_7, ...).
ev_lr = evaluate_regression(fitted_model=lr_fitted, test_set=test)
print(f"Linear Regression:\nR^2:  {ev_lr[0]}\nRMSE: {ev_lr[1]}\nMAE:  {ev_lr[2]}")
ev_rf = evaluate_regression(fitted_model=rf_fitted, test_set=test)
print(f"Random Forest\nR^2:  {ev_rf[0]}\nRMSE: {ev_rf[1]}\nMAE:  {ev_rf[2]}")

Linear Regression:
R^2:  0.07058101144354822
RMSE: 3.4348941696083926
MAE:  2.59477181595019
Random Forest
R^2:  0.0636771025288424
RMSE: 3.475151985473244
MAE:  2.6325462510540683


In [95]:
# data_8 — distance and duration only, label < 20 (no distance filter).
# This cell scores whatever lr_fitted / rf_fitted / test are currently in
# memory; the output below presumably comes from a run with process(data_8, ...).
ev_lr = evaluate_regression(fitted_model=lr_fitted, test_set=test)
print(f"Linear Regression:\nR^2:  {ev_lr[0]}\nRMSE: {ev_lr[1]}\nMAE:  {ev_lr[2]}")
ev_rf = evaluate_regression(fitted_model=rf_fitted, test_set=test)
print(f"Random Forest\nR^2:  {ev_rf[0]}\nRMSE: {ev_rf[1]}\nMAE:  {ev_rf[2]}")

Linear Regression:
R^2:  0.005668059927685931
RMSE: 3.5687820031549062
MAE:  2.706931626315683
Random Forest
R^2:  0.01587843454160398
RMSE: 3.511942347124428
MAE:  2.708119644041678


### Response time under 10 minutes

In [105]:
# data_9 — all columns, label < 10 (no distance filter).
# This cell scores whatever lr_fitted / rf_fitted / test are currently in
# memory; the output below presumably comes from a run with process(data_9, ...).
ev_lr = evaluate_regression(fitted_model=lr_fitted, test_set=test)
print(f"Linear Regression:\nR^2:  {ev_lr[0]}\nRMSE: {ev_lr[1]}\nMAE:  {ev_lr[2]}")
ev_rf = evaluate_regression(fitted_model=rf_fitted, test_set=test)
print(f"Random Forest\nR^2:  {ev_rf[0]}\nRMSE: {ev_rf[1]}\nMAE:  {ev_rf[2]}")

Linear Regression:
R^2:  0.07534076155725988
RMSE: 1.8215848776812464
MAE:  1.462095511225529
Random Forest
R^2:  0.06409058397892797
RMSE: 1.8136400902373682
MAE:  1.4644012341860397


In [110]:
# data_10 — distance and duration only, label < 10 (no distance filter).
# This cell scores whatever lr_fitted / rf_fitted / test are currently in
# memory; the output below presumably comes from a run with process(data_10, ...).
ev_lr = evaluate_regression(fitted_model=lr_fitted, test_set=test)
print(f"Linear Regression:\nR^2:  {ev_lr[0]}\nRMSE: {ev_lr[1]}\nMAE:  {ev_lr[2]}")
ev_rf = evaluate_regression(fitted_model=rf_fitted, test_set=test)
print(f"Random Forest\nR^2:  {ev_rf[0]}\nRMSE: {ev_rf[1]}\nMAE:  {ev_rf[2]}")

Linear Regression:
R^2:  0.004414109096016583
RMSE: 1.8919566430244967
MAE:  1.5052075248679457
Random Forest
R^2:  0.02789106584729928
RMSE: 1.8336034161065586
MAE:  1.5060200355496516
