In [364]:
import pyspark
from pyspark.sql.types import *
import os
import json
import requests
import boto3
import numpy as np
from pyspark.sql.functions import *

In [7]:
# Get handles to Spark: getOrCreate() reuses an already-running context/session
# when there is one and starts a new one otherwise, so this cell is safe to re-run.
sc = pyspark.SparkContext.getOrCreate()
ss = pyspark.sql.SparkSession.builder.getOrCreate()

In [8]:
# Location of the CSV in S3.
bucket_name = 'msds697jonross.and.friends'  # replace with your own bucket name
file_name = 'sffd.csv'                      # object key, i.e. the file to read

# S3 is a key-value store: look the object up by its key inside the bucket.
s3 = boto3.resource('s3')
bucket = s3.Bucket(bucket_name)
obj = bucket.Object(key=file_name)

# Download the object's body and decode it, so the whole file ends up in
# `file_content` as one string held in the notebook's memory.
body = obj.get()["Body"]
file_content = body.read().decode("utf-8")

In [9]:
# Number of data rows: split the file into lines, then subtract the header line
# and the empty line at the end of the file.
rows = file_content.split('\n')
len(rows)-2

4557045

In [63]:
# Number of columns, taken from the header line (rows[0]).
# The count is wrapped in a Spark broadcast variable; code that needs it reads
# it back through n_cols.value.
column_names = rows[0].split(',')
n_cols = sc.broadcast(len(column_names))
n_cols.value

14

In [64]:
# Show all header fields on one line, separated by three spaces.
print('   '.join(column_names))

call_type   received_timestamp   entry_timestamp   dispatch_timestamp   response_timestamp   on_scene_timestamp   transport_timestamp   hospital_timestamp   call_final_disposition   available_timestamp   address   zipcode_of_incident   battalion   station_area


In [65]:
# Randomly sample `sz` data rows (header excluded), without replacement.
#
# Row *indices* are sampled instead of handing the row strings to
# np.random.choice: choice() first converts its input to a NumPy array, and for
# a list of strings that is a fixed-width unicode array padded to the longest
# row -- a second, much larger copy of the whole file in memory.
sz=10000
seed = 42  # fixed seed so the sample, and every count derived from it, is reproducible
n_data_rows = len(rows) - 1  # rows[0] is the header
picked = np.random.RandomState(seed).choice(n_data_rows, size=sz, replace=False)
# +1 maps an index into rows[1:] back onto `rows`; the result stays a NumPy
# string array so downstream cells see the same type as before.
samples = np.array([rows[i + 1] for i in picked])
samples[:2]

array(['Medical Incident,2009-12-21 14:38:56+00:00,2009-12-21 14:39:41+00:00,2009-12-21 14:41:41+00:00,2009-12-21 14:41:59+00:00,2009-12-21 14:51:09+00:00,2009-12-21 15:07:26+00:00,2009-12-21 15:21:41+00:00,Code 2 Transport,2009-12-21 15:47:56+00:00,200 Block of 6TH ST,94103,B03,01',
       'Medical Incident,2007-10-06 22:07:39+00:00,2007-10-06 22:09:05+00:00,2007-10-06 22:09:46+00:00,2007-10-06 22:10:43+00:00,2007-10-06 22:13:46+00:00,2007-10-06 22:27:28+00:00,2007-10-06 22:35:57+00:00,Code 2 Transport,2007-10-06 23:01:56+00:00,FRANKLIN ST/GROVE ST,94102,B02,36'],
      dtype='<U315')

In [68]:
def filter_fire(x):
    """Return True when the CSV line `x` has exactly as many comma-separated
    fields as the header (the broadcast `n_cols` value)."""
    fields = x.split(',')
    return len(fields) == n_cols.value

# Distribute the sampled lines, drop the ones with the wrong field count, and
# split the survivors into lists of field strings.
rdd = (
    sc.parallelize(list(samples))
      .filter(filter_fire)
      .map(lambda line: line.split(','))
)

In [71]:
# Number of sampled rows removed by the field-count filter
# (sample size minus the rows that survived into `rdd`).
sz - rdd.count()

14

In [97]:
# Every column is loaded as a non-nullable string; the timestamp columns are
# parsed into real timestamps in a later step.
schema = StructType([
    StructField(field_name, StringType(), False)
    for field_name in (
        "call_type",
        "received_timestamp",
        "entry_timestamp",
        "dispatch_timestamp",
        "response_timestamp",
        "on_scene_timestamp",
        "transport_timestamp",
        "hospital_timestamp",
        "call_final_disposition",
        "available_timestamp",
        "address",
        "zipcode_of_incident",
        "battalion",
        "station_area",
    )
])

In [98]:
# Build the DataFrame from the parsed sample rows, applying the all-string schema.
df = ss.createDataFrame(rdd, schema)

In [99]:
# Check the column names, types and nullability that were applied.
df.printSchema()

root
 |-- call_type: string (nullable = false)
 |-- received_timestamp: string (nullable = false)
 |-- entry_timestamp: string (nullable = false)
 |-- dispatch_timestamp: string (nullable = false)
 |-- response_timestamp: string (nullable = false)
 |-- on_scene_timestamp: string (nullable = false)
 |-- transport_timestamp: string (nullable = false)
 |-- hospital_timestamp: string (nullable = false)
 |-- call_final_disposition: string (nullable = false)
 |-- available_timestamp: string (nullable = false)
 |-- address: string (nullable = false)
 |-- zipcode_of_incident: string (nullable = false)
 |-- battalion: string (nullable = false)
 |-- station_area: string (nullable = false)



In [116]:
# Names of the timestamp columns, which are still plain strings in `df`
# (despite the variable name, these are columns, not rows).
my_rows = [
    'received_timestamp',
    'entry_timestamp',
    'dispatch_timestamp',
    'response_timestamp',
    'on_scene_timestamp',
    'transport_timestamp',
    'hospital_timestamp',
    'available_timestamp',
]

# Replace each of those string columns with a parsed timestamp column of the
# same name; every other column is carried over from `df` unchanged.
ts_format = 'yyyy-MM-dd HH:mm:ss+00:00'
df_w_time = df
for row in my_rows:
    parsed = to_timestamp(df[row], format=ts_format)
    df_w_time = df_w_time.withColumn(row, parsed)

In [137]:
# Keep only the columns needed here, add `response_time` -- the minutes between
# the received and on-scene timestamps -- and list the most recently received
# calls first.
kept_columns = [
    'call_type',
    'received_timestamp',
    'on_scene_timestamp',
    'address',
    'zipcode_of_incident',
    'battalion',
    'station_area',
]
# unix_timestamp() yields seconds, so the difference is divided by 60.
minutes_to_scene = (unix_timestamp('on_scene_timestamp') - unix_timestamp('received_timestamp')) / 60

small_df = (
    df_w_time.select(*kept_columns)
    .withColumn("response_time", minutes_to_scene)
    .orderBy('received_timestamp', ascending=False)
)

In [334]:
# Station/address pairs, leaving out rows whose station_area is blank or one of
# the codes A1, A2, A3.
df_ = (
    df.select('station_area', 'address')
      .filter('station_area not in ("","A1","A2","A3")')
)

In [335]:
# Preview the first 20 pairs without truncating long addresses.
df_.show(n=20, truncate=False)

+------------+-------------------------------+
|station_area|address                        |
+------------+-------------------------------+
|01          |200 Block of 6TH ST            |
|36          |FRANKLIN ST/GROVE ST           |
|03          |1200 Block of SUTTER ST        |
|15          |1000 Block of CAYUGA AVE       |
|08          |BRYANT ST/MORRIS ST            |
|28          |JEFFERSON ST/POWELL ST         |
|43          |0 Block of BLYTHDALE AVE       |
|43          |600 Block of RUSSIA AVE        |
|32          |3800 Block of MISSION ST       |
|43          |100 Block of BLYTHDALE AVE     |
|05          |1200 Block of GOLDEN GATE AVE  |
|07          |500 Block of SOUTH VAN NESS AVE|
|13          |200 Block of KEARNY ST         |
|01          |800 Block of MARKET ST         |
|01          |LEAVENWORTH ST/GOLDEN GATE AV  |
|01          |0 Block of 6TH ST              |
|05          |1400 Block of MCALLISTER ST    |
|06          |200 Block of HARTFORD ST       |
|33          

In [352]:
# Build both query-string lists from ONE collect(), so fire_stations[i] and
# addresses[i] are guaranteed to come from the same row and Spark runs a single
# job instead of two.
station_address_rows = df_.select('station_area', 'address').collect()

# Origin: a search phrase naming the station. int() drops the zero padding
# ("01" -> 1), so it also assumes every remaining station_area is numeric.
fire_stations = ["San+Francisco+fire+department+station+" + str(int(r['station_area']))
                 for r in station_address_rows]
# Destination: the incident address with spaces replaced by '+'.
addresses = [r['address'].replace(' ', '+') for r in station_address_rows]

In [353]:
# Preview the first ten origin/destination query strings, one pair per line.
for pair in zip(fire_stations[:10], addresses[:10]):
    print(*pair)

San+Francisco+fire+department+station+1 200+Block+of+6TH+ST
San+Francisco+fire+department+station+36 FRANKLIN+ST/GROVE+ST
San+Francisco+fire+department+station+3 1200+Block+of+SUTTER+ST
San+Francisco+fire+department+station+15 1000+Block+of+CAYUGA+AVE
San+Francisco+fire+department+station+8 BRYANT+ST/MORRIS+ST
San+Francisco+fire+department+station+28 JEFFERSON+ST/POWELL+ST
San+Francisco+fire+department+station+43 0+Block+of+BLYTHDALE+AVE
San+Francisco+fire+department+station+43 600+Block+of+RUSSIA+AVE
San+Francisco+fire+department+station+32 3800+Block+of+MISSION+ST
San+Francisco+fire+department+station+43 100+Block+of+BLYTHDALE+AVE


In [354]:
# Number of origin query strings built (one per row of df_).
len(fire_stations)

9860

In [356]:
# Google Maps API key. It is read from the GOOGLE_MAPS_API_KEY environment
# variable so the real key never has to be typed into (and saved with) the
# notebook; the '<insert>' placeholder is kept as the fallback when the
# variable is not set.
apikey = os.environ.get('GOOGLE_MAPS_API_KEY', '<insert>')

In [357]:
# Shape of the Distance Matrix request URL, shown for reference only: this is a
# plain string (not an f-string), so {apikey} is not substituted here and the
# origins/destinations parameters are left empty.
'https://maps.googleapis.com/maps/api/distancematrix/json?origins=&destinations=&key={apikey}'

'https://maps.googleapis.com/maps/api/distancematrix/json?origins=&destinations=&key={apikey}'

In [386]:
%%time

n = 50
distances = ['']*n
durations = ['']*n
for i, origin, destination in zip(range(n), fire_stations[:n],addresses[:n]):
    r = requests.get(f'https://maps.googleapis.com/maps/api/distancematrix/json?origins={origin}&destinations={destination}&key={apikey}')
    j = r.json()
    e = j['rows'][0]['elements']
    distances[i] = e[0]['distance']['value']
    durations[i] = e[0]['duration']['value']  / 60

KeyError: 'distance'

In [378]:
# Inspect the collected results as (distance, duration) pairs.
list(zip(distances, durations))

[(2827560, 1543.4333333333334), (4960850, 2705.55), (345, 1.6666666666666667)]

In [379]:
import pandas as pd

In [383]:
# Raw (un-encoded) station ids and addresses, taken from ONE collect() so that
# station_area[i] and address[i] are guaranteed to belong to the same row and
# Spark runs a single job instead of two.
station_address_rows = df_.select('station_area', 'address').collect()
station_area = [r['station_area'] for r in station_address_rows]
address = [r['address'] for r in station_address_rows]

In [384]:
# Table of the first n station/address pairs next to their API results.
pd.DataFrame({
    'station_area': station_area[:n],
    'address': address[:n],
    'distance': distances[:n],
    'duration': durations[:n],
})

Unnamed: 0,station_area,address,distance,duration
0,1,200 Block of 6TH ST,2827560,1543.433333
1,36,FRANKLIN ST/GROVE ST,4960850,2705.55
2,3,1200 Block of SUTTER ST,345,1.666667


In [271]:
# Distinct station_area values in the sample, sorted ascending and collected
# to the driver as a list of Row objects.
stations = df.select('station_area').distinct().orderBy('station_area').collect()

In [277]:
# Column-style dict of the distinct station areas, skipping the first
# (lowest-sorting) value -- presumably the blank station_area; verify.
d = {'station_area': [row['station_area'] for row in stations[1:]]}

In [294]:
# Station ids "01" .. "51" as two-character, zero-padded strings.
stations = [f"{number:02d}" for number in range(1, 52)]

In [298]:
# Map each station id to the entry at the same position in `addresses`.
# NOTE(review): the recorded output for this cell is a station street address,
# which matches the commented-out `addresses` list further down rather than the
# '+'-encoded incident addresses built earlier in the notebook -- confirm which
# `addresses` this cell is meant to use before relying on a top-to-bottom run.
d = {station_id: addresses[pos] for pos, station_id in enumerate(stations)}
print(d['03'])

1067 Post Street at Polk Street


In [293]:
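# Street addresses of the fire stations, listed in station-number order ('' where a
# number has no address listed). We thought we needed this, but we probably don't.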
# addresses = ['935 Folsom at 5th Street',
#             '1340 Powell Street at Broadway',
#             '1067 Post Street at Polk Street',
#             '449 Mission Rock at 3rd Street ',
#             '1301 Turk Street at Webster Street',
#             '135 Sanchez Street at Henry Street',
#             '2300 Folsom Street at 19th Street',
#             '36 Bluxome Street at 4th Street',
#             '2245 Jerrold Avenue at Upton Street',
#             '655 Presidio Avenue at Bush Street',
#             '3880 26th Street at Church Street',
#             '1145 Stanyan Street at Grattan Street',
#             '530 Sansome Street at Washington Street',
#             '551 26th Avenue at Geary Boulevard',
#             '1000 Ocean Avenue at Phelan Avenue',
#             '2251 Greenwich Street at Fillmore Street',
#             '1295 Shafter Avenue at Ingalls Street',
#             '1935 32nd Avenue at Ortega Street',
#             '390 Buckingham Way at Winston Street',
#             '285 Olympia Way at Clarendon Avenue',
#             '1443 Grove Street at Broderick Street',
#             '1290 16th Avenue at Irving Street',
#             '1348 45th Avenue at Judah Street',
#             '100 Hoffman Avenue at Alvarado Street',
#             '3305 3rd Street at Cargo Way',
#             '80 Digby Street at Addison Street',
#             '',
#             '1814 Stockton Street at Greenwich Street',
#             '299 Vermont Street at 16th Street',
#             '',
#             '441 12th Avenue at Geary Boulevard',
#             '194 Park Street at Holly Park Circle',
#             '8 Capitol Avenue at Sagamore Street',
#             '499 41st Avenue at Geary Boulevard',
#             'Pier 22½, The Embarcadero at Harrison Street',
#             '109 Oak Street at Franklin Street',
#             '798 Wisconsin Street at 22nd Street',
#             '2150 California Street at Laguna Street',
#             '1091 Portola Drive at Miraloma Drive',
#             '2155 18th Avenue at Rivera Street',
#             '1325 Leavenworth Street at Jackson Street',
#             '2430 San Bruno Avenue at Silver Avenue',
#             '720 Moscow Street at France Avenue',
#             '1298 Girard Street at Wilde Avenue',
#             '',
#             '',
#             '',
#             '800 Avenue I at 10th Street, Treasure Island',
#             '1415 Evans Avenue at Mendell Street',
#             '',
#             '218 Lincoln Blvd at Keyes Avenue']