In [182]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.ticker import NullFormatter

In [183]:
# convert csv data into pandas dataframe
file = 'sfpd_dispatch_data_subset.csv'
pdf = pd.read_csv(file)

In [184]:
# work with new copy of dataframe for analysis of address, time, dispatch type
dfg2 = pdf


In [185]:
# work with new copy of dataframe to analyze relationship between location & time taken to dispatch

# analyze locations based on zip code
dfg3 = pdf
cols = list(dfg3)
l = ['zipcode_of_incident', 'station_area', 'entry_timestamp', 'on_scene_timestamp']
for i in range(0, len(l)): cols.remove(l[i])
# cols is now a list of all the columns in the df that must be removed

# cleanse data to only include: zip code, station_area, entry_timestamp, on_scene_timestamp
dfg3 = dfg3.drop(cols, axis=1)
# cleansing data of time values that are NaN
dfg3 = dfg3.dropna(subset=['on_scene_timestamp', 'entry_timestamp'])

# create column of time taken to dispatch (difference between received and on_scene timestamps)
dfg3['entry_timestamp'] = pd.to_datetime(dfg3['entry_timestamp'])
dfg3['on_scene_timestamp'] = pd.to_datetime(dfg3['on_scene_timestamp'])
dfg3['dispatch_time'] = (dfg3['on_scene_timestamp'] - dfg3['entry_timestamp'])
dfg3['dispatch_time'] = dfg3['dispatch_time'].astype('timedelta64[m]')


bp = dfg3.boxplot(by='station_area', column='dispatch_time', fontsize=20, grid=False, figsize=(30,20))
fig = np.asarray(bp).reshape(-1)[0].get_figure()
fig.suptitle('', fontsize=30)
plt.title("Dispatch Time by Station Areas", fontsize=30)
plt.xlabel('Station Area', fontsize=30)
plt.ylabel('Dispatch Time in Minutes', fontsize=30)
#eliminating extreme outliers to better visualize greater quantity of data
plt.ylim(0, 80)
plt.savefig("station_time", dpi='figure')

dfg3.boxplot(column='dispatch_time', by='zipcode_of_incident', fontsize=18, grid=False, figsize=(40,15))
fig = np.asarray(bp).reshape(-1)[0].get_figure()
fig.suptitle('')
plt.title("Dispatch Time by Zip Code", fontsize=30)
plt.xlabel('Zip Code of Incident', fontsize=30)
plt.ylabel('Dispatch Time in Minutes', fontsize=30)
#eliminating extreme outliers to better visualize greater quantity of data
plt.ylim(0, 100)
plt.savefig("zip_time", dpi='figure')


  return getattr(obj, method)(*args, **kwds)


In [188]:
# work with new copy of dataframe to analyze relationship between location, dispatch type, increase in dispatch calls
# areas experiencing the greatest increase in dispatch calls

dfgb = pdf
cols = list(dfgb)
l = ['zipcode_of_incident', 'entry_timestamp', 'box', 'address', 'station_area', 'unit_type']
for i in range(0, len(l)): cols.remove(l[i])
# cols is now a list of all the columns in the df that must be removed

# cleanse data to only include: 'zipcode_of_incident', 'entrey_timestamp', 'box', 'address', 'station_area', 'unit_type', 'call_type_group'
dfgb = dfgb.drop(cols, axis=1)
dfgb['entry_timestamp'] = pd.to_datetime(dfgb['entry_timestamp'])
increasez = pd.DataFrame(columns=['zip_code', 'day', 'frequency'])
increasez['day'] = (dfgb['entry_timestamp']).dt.date
increasez['zip_code'] = dfgb['zipcode_of_incident']
increasez = increasez.groupby(['zip_code','day']).size().to_frame(name='frequency')
changez = pd.DataFrame(columns=['zip_code', 'increase'])
prevz = 1
curz = 0
curlow = 0
curhigh = 0
for index, row in increasez.iterrows():
    curz = index[0]
    if (curz==prevz):
        cur = row.frequency
        if (cur<curlow): curlow = cur
        elif (cur>curhigh): curhigh=cur
    else: 
        changez.set_value(prevz, 'zip_code', prevz)
        if (curlow!=0): changez.set_value(prevz, 'increase', (((curhigh-curlow)/curlow)*100))
        else: changez.set_value(prevz, 'increase', 0)
        curlow = row.frequency
        curhigh = row.frequency
    prevz = curz
changez = changez.drop(labels=1, axis=0)
changez = changez.drop(changez[changez.increase < 100].index)
zips = changez['zip_code'].tolist()
# changez.plot(x='zip_code', y='increase', kind='bar', figsize=(50,30), fontsize=40)
# plt.xlabel('San Francisco Zip Code', fontsize=50, labelpad=30)
# plt.ylabel('Increase in Dispatch Calls (%)', fontsize=50, labelpad=30)
# plt.suptitle('Increase in Dispatch Calls by Zip Code', fontsize=50, verticalalignment='top')
# plt.savefig("increase_zip")

increaseb = pd.DataFrame(columns=['box', 'day', 'frequency'])
increaseb['day'] = (dfgb['entry_timestamp']).dt.date
increaseb['box'] = dfgb['box']
increaseb = increaseb.groupby(['box','day']).size().to_frame(name='frequency')
changeb = pd.DataFrame(columns=['box', 'increase'])
prevb = 1
curb = 0
curlow = 0
curhigh = 0
for index, row in increaseb.iterrows():
    curb = index[0]
    if (curb==prevb):
        cur = row.frequency
        if (cur<curlow): curlow = cur
        elif (cur>curhigh): curhigh=cur
    else: 
        changeb.set_value(prevb, 'box', prevb)
        if (curlow!=0): changeb.set_value(prevb, 'increase', (((curhigh-curlow)/curlow)*100))
        else: changeb.set_value(prevb, 'increase', 0)
        curlow = row.frequency
        curhigh = row.frequency
    prevb = curb
changeb = changeb.drop(labels=1, axis=0)
changeb = changeb.drop(changeb[changeb.increase < 1000].index)
boxs = changeb['box'].tolist()
changeb.plot(x='box', y='increase', kind='bar', figsize=(50,30), fontsize=40)
plt.xlabel('San Francisco Box', fontsize=50, labelpad=30)
plt.ylabel('Increase in Dispatch Calls (%)', fontsize=50, labelpad=30)
plt.suptitle('Increase in Dispatch Calls by Box Area', fontsize=50, verticalalignment='top')
plt.savefig("increase_box")

# based on highest zip frequency, what is highest unit_type frequency
typez = pd.DataFrame(columns=['zip_code', 'day', 'frequency', 'type'])
typez['day'] = (dfgb['entry_timestamp']).dt.date
typez['zip_code'] = dfgb['zipcode_of_incident']
typez['type']=dfgb['unit_type']
typez = typez.groupby(['zip_code','type']).size().to_frame(name='frequency')
for i,r in typez.iterrows():
    if (i[0] not in zips): typez = typez.drop(i)
cur = 0
prev = 1
high = 0
mostz = pd.DataFrame(columns=['zip_code', 'type'])
t=0
for i,r in typez.iterrows():  
    cur = i[0]
    if (cur==prev):
        if (r.frequency>high): 
            high=r.frequency
            t = i[1]
    else:
        mostz.set_value(prev, 'zip_code', prev)
        mostz.set_value(prev, 'type', t)
        
        high = r.frequency
        t = i[1]
    prev=cur
mostz = mostz.drop(labels=1, axis=0)
x=mostz.style

# based on highest box call frequency, what is highest unit_type frequency
typeb = pd.DataFrame(columns=['box', 'day', 'frequency', 'type'])
typeb['day'] = (dfgb['entry_timestamp']).dt.date
typeb['box'] = dfgb['box']
typeb['type']=dfgb['unit_type']
typeb = typeb.groupby(['box','type']).size().to_frame(name='frequency')
for i,r in typeb.iterrows():
    if (i[0] not in boxs): typeb = typeb.drop(i)
cur = 0
prev = 1
high = 0
mostb = pd.DataFrame(columns=['box', 'type'])
t=0
for i,r in typeb.iterrows():  
    cur = i[0]
    if (cur==prev):
        if (r.frequency>high): 
            high=r.frequency
            t = i[1]
    else:
        mostb.set_value(prev, 'box', prev)
        mostb.set_value(prev, 'type', t)
        high = r.frequency
        t = i[1]
    prev=cur
mostb = mostb.drop(labels=1, axis=0)
y=mostb.style




In [239]:
# work with new copy of dataframe for analysis of lat, lon, dispatch time, dispatch type (unit_type)

dfg2 = pdf
cols = list(dfg2)
l = ['latitude', 'longitude', 'entry_timestamp', 'on_scene_timestamp', 'unit_type']
for i in range(0, len(l)): cols.remove(l[i]) 
# cols is now a list of all the columns in the df that must be removed

# cleanse data 
dfg2 = dfg2.drop(cols, axis=1)
# cleansing data of time values that are NaN
dfg2 = dfg2.dropna(axis=0)
    
# create column of time taken to dispatch (difference between received and on_scene timestamps)
dfg2['entry_timestamp'] = pd.to_datetime(dfg2['entry_timestamp'])
dfg2['on_scene_timestamp'] = pd.to_datetime(dfg2['on_scene_timestamp'])
dfg2['dispatch_time'] = (dfg2['on_scene_timestamp'] - dfg2['entry_timestamp'])
dfg2['dispatch_time'] = dfg2['dispatch_time'].astype('timedelta64[m]')
dfg2 = dfg2.drop(axis=1, labels=['entry_timestamp', 'on_scene_timestamp'])

#dataframe for scatter plot
dfs = dfg2.drop(axis=1, labels='dispatch_time', inplace=False)
#dataframe for heat map
dfh = dfg2.drop(axis=1, labels='unit_type', inplace=False)
dfs['unit_type'] = dfs['unit_type'].astype('category')
dfh = dfh.drop(dfh[dfh.dispatch_time>20].index)
# changeb = changeb.drop(changeb[changeb.increase < 1000].index)
print(set(np.asarray(dfs['unit_type'])))
print(dfh['latitude'].values.max())
print(dfh['longitude'].values.max())
print(dfh['latitude'].values.min())
print(dfh['longitude'].values.min())
print(dfh['dispatch_time'].tolist)

{'TRUCK', 'ENGINE', 'RESCUE CAPTAIN', 'RESCUE SQUAD', 'PRIVATE', 'INVESTIGATION', 'SUPPORT', 'CHIEF', 'MEDIC'}
37.8316662304
-122.365138272
37.7086491718
-122.513648359
<bound method Series.tolist of 0       15.0
1       13.0
2        4.0
3        3.0
4        3.0
5        4.0
6       10.0
7        6.0
8        5.0
9        4.0
10      17.0
11      14.0
12       5.0
13       9.0
14       5.0
15       5.0
16       5.0
17       8.0
18       6.0
20       5.0
21       3.0
22       3.0
23      10.0
24       6.0
25       5.0
26       4.0
28       7.0
29       2.0
30       3.0
31       3.0
        ... 
9959    15.0
9960     4.0
9962    14.0
9963     4.0
9965    17.0
9967     3.0
9968     4.0
9970     3.0
9971     4.0
9974     3.0
9975     9.0
9976     5.0
9977    11.0
9978     6.0
9979     3.0
9980     3.0
9981     7.0
9983     4.0
9984     8.0
9985     7.0
9987     3.0
9988     8.0
9989     5.0
9990     0.0
9991     5.0
9992     0.0
9993     2.0
9995     3.0
9996     3.0
9999     7.0
Name: d

In [241]:
import plotly
plotly.tools.set_credentials_file(username='aarushiw', api_key='ZhzGXF6zXnqwnxYQZGIe')
import plotly.graph_objs as go
import plotly.tools as tls
import plotly.plotly as py
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
init_notebook_mode(connected=True)



data = [go.Heatmap( z=dfh['dispatch_time'].values.tolist(), 
                   x=dfh['latitude'].values.tolist(),
                   y=dfh['longitude'].values.tolist(),
                   colorscale='Viridis')]
py.iplot(data, filename='pandas-heatmap', title='Dispatch Time')


High five! You successfully sent some data to your account on plotly. View your plot in your browser at https://plot.ly/~aarushiw/0 or inside your plot.ly account where it is named 'pandas-heatmap'
