# Process Data
#### Daniel Fay

In [1]:
# Import necessary packages
import pandas as pd
import numpy as np

In [2]:
# Load the interim per-trip bus records. The first CSV column is used as the
# row index; low_memory=False has pandas parse the file in a single pass
# instead of in chunks, which avoids mixed-dtype inference within a column.
bus = pd.read_csv('data/interim/bus_times.csv', low_memory=False, index_col=0)

### Create Temporal Features

In [3]:
# Discard the columns that are not used downstream and parse the remaining
# text columns into datetime / timedelta dtypes so the .dt accessor works.
bus = bus.drop(['trip', 'end_time'], axis=1).assign(
    begin_time=lambda frame: pd.to_datetime(frame['begin_time']),
    trip_time=lambda frame: pd.to_timedelta(frame['trip_time']),
)
bus.head()

Unnamed: 0,bus_line,direction,trip_time,begin_time
0,B11,2.0,00:39:54,2016-01-01 09:11:05
1,B11,2.0,00:38:27,2016-01-01 09:44:02
2,B11,1.0,00:53:54,2016-01-01 09:52:02
3,B11,2.0,00:34:42,2016-01-01 10:10:50
4,B11,1.0,00:41:03,2016-01-01 10:23:00


In [4]:
# Break the departure timestamp into calendar and clock components that are
# later used as grouping / merge keys.
departure = bus['begin_time'].dt
# Series.dt.weekday_name was removed in pandas 1.0; day_name() returns the
# same English day names ('Monday', 'Tuesday', ...).
bus['day_of_week'] = departure.day_name()
bus['month'] = departure.month
bus['hour'] = departure.hour
bus['date'] = departure.date
bus['day'] = departure.day
bus['minute'] = departure.minute
# Express the trip duration as float minutes instead of a timedelta.
bus['trip_time'] = bus['trip_time'].dt.total_seconds() / 60
bus.head()

Unnamed: 0,bus_line,direction,trip_time,begin_time,day_of_week,month,hour,date,day,minute
0,B11,2.0,39.9,2016-01-01 09:11:05,Friday,1,9,2016-01-01,1,11
1,B11,2.0,38.45,2016-01-01 09:44:02,Friday,1,9,2016-01-01,1,44
2,B11,1.0,53.9,2016-01-01 09:52:02,Friday,1,9,2016-01-01,1,52
3,B11,2.0,34.7,2016-01-01 10:10:50,Friday,1,10,2016-01-01,1,10
4,B11,1.0,41.05,2016-01-01 10:23:00,Friday,1,10,2016-01-01,1,23


In [5]:
# Lookup table: hour of day -> traffic period label.
# Each entry is (first hour, one past the last hour, label). The last range
# stops at 25, so key 24 is present in the table as well.
period_bounds = [
    (0, 6, 'Night'),
    (6, 10, 'PeakAM'),
    (10, 16, 'MidDay'),
    (16, 20, 'PeakPM'),
    (20, 25, 'Night'),
]
traf_int = {
    hour: label
    for first_hour, stop_hour, label in period_bounds
    for hour in range(first_hour, stop_hour)
}

In [6]:
# Label every trip with its traffic period, order trips chronologically
# within each bus line, and keep only trips whose duration (in minutes) is
# strictly between 5 and 150.
bus['time_period'] = bus['hour'].map(traf_int)
bus = bus.sort_values(['bus_line', 'begin_time'])
plausible_duration = (bus['trip_time'] > 5) & (bus['trip_time'] < 150)
bus = bus.loc[plausible_duration]
bus.head()

Unnamed: 0,bus_line,direction,trip_time,begin_time,day_of_week,month,hour,date,day,minute,time_period
115,B11,1.0,67.633333,2016-01-01 00:00:13,Friday,1,0,2016-01-01,1,0,Night
107,B11,2.0,36.066667,2016-01-01 00:00:14,Friday,1,0,2016-01-01,1,0,Night
112,B11,2.0,31.0,2016-01-01 00:00:15,Friday,1,0,2016-01-01,1,0,Night
100,B11,1.0,7.2,2016-01-01 00:00:17,Friday,1,0,2016-01-01,1,0,Night
109,B11,1.0,23.9,2016-01-01 00:00:18,Friday,1,0,2016-01-01,1,0,Night


In [7]:
# Row count of `bus` at this point (after the trip_time range filter above).
len(bus)

836951

### Aggregate Trips to 1-Minute Intervals

In [8]:
# Average trip_time over all trips that share the same line, direction and
# departure minute, yielding one row per (line, direction, minute).
# trip_time is selected explicitly: calling .mean() on every remaining column
# depends on pandas silently discarding begin_time, whereas newer pandas
# versions average datetimes and would keep it as an extra column.
minute_keys = ['bus_line', 'direction', 'date', 'month', 'day', 'day_of_week',
               'time_period', 'hour', 'minute']
bus = bus.groupby(minute_keys)['trip_time'].mean().reset_index()
bus.head()

Unnamed: 0,bus_line,direction,date,month,day,day_of_week,time_period,hour,minute,trip_time
0,B11,1.0,2016-01-01,1,1,Friday,MidDay,10,23,41.05
1,B11,1.0,2016-01-01,1,1,Friday,MidDay,10,46,54.783333
2,B11,1.0,2016-01-01,1,1,Friday,MidDay,11,13,54.466667
3,B11,1.0,2016-01-01,1,1,Friday,MidDay,11,35,40.616667
4,B11,1.0,2016-01-01,1,1,Friday,MidDay,11,54,51.15


### Calculate Mean and Standard Deviation for Trip Times in Each Category

In [9]:
# Strip the fine-grained time columns so that trip_time is the only value
# column left, then take its mean per (line, direction, month, weekday, period).
bus_temp = bus.drop(['hour', 'date', 'day', 'minute'], axis=1)
avg_times = (
    bus_temp
    .groupby(['bus_line', 'direction', 'month', 'day_of_week', 'time_period'])
    .mean()
    .reset_index()
    .rename(columns={'trip_time': 'avg_trip_time'})
)

In [10]:
# Standard deviation of trip_time for the same categories used for the mean.
std_times = (
    bus_temp
    .groupby(['bus_line', 'direction', 'month', 'day_of_week', 'time_period'])
    .std()
    .reset_index()
    .rename(columns={'trip_time': 'std_trip_time'})
)

In [11]:
# Preview the per-category mean trip times.
avg_times.head()

Unnamed: 0,bus_line,direction,month,day_of_week,time_period,avg_trip_time
0,B11,1.0,1,Friday,MidDay,64.289123
1,B11,1.0,1,Friday,Night,62.360454
2,B11,1.0,1,Friday,PeakAM,55.033333
3,B11,1.0,1,Friday,PeakPM,69.339542
4,B11,1.0,1,Monday,MidDay,67.660468


In [12]:
# Preview the per-category trip-time standard deviations.
std_times.head()

Unnamed: 0,bus_line,direction,month,day_of_week,time_period,std_trip_time
0,B11,1.0,1,Friday,MidDay,16.003063
1,B11,1.0,1,Friday,Night,20.746053
2,B11,1.0,1,Friday,PeakAM,7.915746
3,B11,1.0,1,Friday,PeakPM,13.657906
4,B11,1.0,1,Monday,MidDay,17.047391


In [13]:
# Attach each category's mean and standard deviation to every per-minute row.
category_keys = ['bus_line', 'direction', 'month', 'day_of_week', 'time_period']
bus_times = (
    bus
    .merge(avg_times, how='left', on=category_keys)
    .merge(std_times, how='left', on=category_keys)
)

In [14]:
# Deviation of each trip from its category mean, in the same unit as
# trip_time; positive values mean the trip ran longer than the average.
bus_times['delay_time'] = bus_times['trip_time'].sub(bus_times['avg_trip_time'])

In [15]:
# Preview trips together with their category statistics and delay_time.
bus_times.head()

Unnamed: 0,bus_line,direction,date,month,day,day_of_week,time_period,hour,minute,trip_time,avg_trip_time,std_trip_time,delay_time
0,B11,1.0,2016-01-01,1,1,Friday,MidDay,10,23,41.05,64.289123,16.003063,-23.239123
1,B11,1.0,2016-01-01,1,1,Friday,MidDay,10,46,54.783333,64.289123,16.003063,-9.50579
2,B11,1.0,2016-01-01,1,1,Friday,MidDay,11,13,54.466667,64.289123,16.003063,-9.822457
3,B11,1.0,2016-01-01,1,1,Friday,MidDay,11,35,40.616667,64.289123,16.003063,-23.672457
4,B11,1.0,2016-01-01,1,1,Friday,MidDay,11,54,51.15,64.289123,16.003063,-13.139123


### Calculate Delay

In [50]:
# Classify every trip by how far delay_time sits above the category mean,
# measured in category standard deviations:
#   1 -> 1*std <= delay_time < 2*std
#   2 -> delay_time > 2*std
#   0 -> everything else (early / on-time trips, rows whose std is NaN, and
#        the exact delay_time == 2*std boundary, as in the row-wise version)
# Done column-wise with np.select instead of a Python loop over iterrows(),
# which is very slow on a frame of this size.
late_by = bus_times['delay_time']
spread = bus_times['std_trip_time']
delay = np.select(
    [(late_by >= spread) & (late_by < 2 * spread), late_by > 2 * spread],
    [1, 2],
    default=0,
)

In [52]:
# Store the delay class computed above as a new column and preview the result.
bus_times = bus_times.assign(delay=delay)
bus_times.head()

Unnamed: 0,bus_line,direction,date,month,day,day_of_week,time_period,hour,minute,trip_time,avg_trip_time,std_trip_time,delay_time,delay
0,B11,1.0,2016-01-01,1,1,Friday,MidDay,10,23,41.05,64.289123,16.003063,-23.239123,0
1,B11,1.0,2016-01-01,1,1,Friday,MidDay,10,46,54.783333,64.289123,16.003063,-9.50579,0
2,B11,1.0,2016-01-01,1,1,Friday,MidDay,11,13,54.466667,64.289123,16.003063,-9.822457,0
3,B11,1.0,2016-01-01,1,1,Friday,MidDay,11,35,40.616667,64.289123,16.003063,-23.672457,0
4,B11,1.0,2016-01-01,1,1,Friday,MidDay,11,54,51.15,64.289123,16.003063,-13.139123,0


In [67]:
# Print, for every bus line, the share of its rows in each delay class.
# print() with a single argument behaves identically on Python 2 and 3, and a
# single groupby replaces filtering the whole frame twice per line.
# sort=False keeps the lines in order of first appearance, like unique().
for line_name, line_trips in bus_times.groupby('bus_line', sort=False):
    print(line_name)
    print(line_trips['delay'].value_counts() / len(line_trips))

B11
0    0.869405
1    0.105897
2    0.024698
Name: delay, dtype: float64
B15
0    0.858053
1    0.118489
2    0.023458
Name: delay, dtype: float64
B3
0    0.882456
1    0.067392
2    0.050152
Name: delay, dtype: float64
B37
0    0.864711
1    0.101606
2    0.033683
Name: delay, dtype: float64
B38
0    0.880555
1    0.079701
2    0.039745
Name: delay, dtype: float64
B4
0    0.875155
1    0.101425
2    0.023420
Name: delay, dtype: float64
B41
0    0.871778
1    0.099978
2    0.028244
Name: delay, dtype: float64
B43
0    0.879599
1    0.084327
2    0.036074
Name: delay, dtype: float64
B54
0    0.878549
1    0.089935
2    0.031516
Name: delay, dtype: float64
B57
0    0.859987
1    0.111087
2    0.028926
Name: delay, dtype: float64
B62
0    0.867978
1    0.104946
2    0.027075
Name: delay, dtype: float64
B8
0    0.870234
1    0.113233
2    0.016534
Name: delay, dtype: float64


In [69]:
# NOTE(review): `delay_dist` was never assigned anywhere in this notebook, so
# this cell raised NameError on a fresh kernel. The saved output (per-line
# column totals, with delay_time summing to ~0) is consistent with a
# per-bus_line sum of the numeric columns, so it is rebuilt that way here --
# confirm against the original intent.
delay_dist = bus_times.groupby('bus_line').sum(numeric_only=True)
delay_dist.head()

Unnamed: 0_level_0,direction,month,day,hour,minute,trip_time,avg_trip_time,std_trip_time,delay_time,delay
bus_line,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
B11,91954.0,404208,974954,845805,1804280,3844157.0,3844157.0,875801.9,5.454126e-11,9614
B15,153145.0,664800,1606264,1294858,2973594,8119225.0,8119225.0,2294975.0,-1.127205e-10,16845
B3,103896.0,450103,1092862,870962,2046167,3613882.0,3613882.0,1028502.0,2.884804e-11,11653
B37,57536.0,192750,455466,374050,824566,1674920.0,1674920.0,453770.3,5.55076e-11,4861
B38,159773.0,689334,1644354,1345737,3053873,6506369.0,6506369.0,1539434.0,-1.33582e-12,16626


### Merge Weather Data

In [70]:
# Load the external weather observations; the first CSV column is the index.
weather = pd.read_csv('data/external/nyc_weather_2016.csv', index_col=0)

In [71]:
# Keep only the weather fields used as model features plus the timestamp.
# .copy() makes the subset an independent frame, so the column assignments
# that follow do not trigger SettingWithCopyWarning.
weather = weather[[u'Conditions', u'DateUTC', u'Humidity', u'PrecipitationIn',
                   u'TemperatureF', u'VisibilityMPH', u'Wind SpeedMPH']].copy()

In [72]:
# Derive the (hour, month, day) keys used to join weather onto bus trips, then
# drop the raw timestamp column.
# NOTE(review): the keys come from a column named DateUTC; if the bus
# begin_time values are local time, the hour/day keys are offset -- confirm.
# NOTE(review): the year is not part of the keys; presumably both sources
# cover 2016 only -- verify.
observed_at = pd.to_datetime(weather['DateUTC'])
for part in ('hour', 'month', 'day'):
    weather[part] = getattr(observed_at.dt, part)
weather = weather.drop('DateUTC', axis=1)
weather.head()

Unnamed: 0,Conditions,Humidity,PrecipitationIn,TemperatureF,VisibilityMPH,Wind SpeedMPH,hour,month,day
0,Overcast,51.0,,42.1,10.0,4.6,5,1,1
1,Overcast,53.0,,41.0,10.0,3.5,6,1,1
2,Overcast,55.0,,41.0,10.0,4.6,7,1,1
3,Overcast,55.0,,41.0,10.0,9.2,8,1,1
4,Overcast,58.0,,39.9,10.0,10.4,9,1,1


In [73]:
# Keep a single observation (the first encountered) per hour/month/day key so
# that the join onto bus trips cannot multiply rows.
weather = weather.drop_duplicates(subset=['hour', 'month', 'day'])

In [74]:
# Row count of bus_times, shown for comparison with the merged frame below.
len(bus_times)

793681

In [75]:
# Left-join the hourly weather observation onto every trip; trips with no
# matching observation keep NaN in the weather columns.
bus_weather = bus_times.merge(
    weather,
    how='left',
    on=['month', 'day', 'hour'],
)

In [76]:
# A left join against weather keys that were de-duplicated must not add or
# remove rows; fail loudly instead of relying on comparing printed counts.
assert len(bus_weather) == len(bus_times), 'weather merge changed the row count'
len(bus_weather)

793681

In [77]:
# Treat missing precipitation readings as zero. The filled column is assigned
# back rather than calling fillna(inplace=True) on the selected column:
# that form is chained assignment, which is deprecated and has no effect on
# the parent frame under pandas copy-on-write.
bus_weather['PrecipitationIn'] = bus_weather['PrecipitationIn'].fillna(0)
# Drop rows that still have a missing value in any column.
bus_weather.dropna(inplace=True)
# Order trips chronologically within each bus line and renumber the index.
bus_weather.sort_values(['bus_line', 'month', 'day', 'hour', 'minute'], inplace=True)
bus_weather.reset_index(inplace=True, drop=True)
bus_weather.head()

Unnamed: 0,bus_line,direction,date,month,day,day_of_week,time_period,hour,minute,trip_time,avg_trip_time,std_trip_time,delay_time,delay,Conditions,Humidity,PrecipitationIn,TemperatureF,VisibilityMPH,Wind SpeedMPH
0,B11,1.0,2016-01-01,1,1,Friday,Night,5,27,39.816667,62.360454,20.746053,-22.543787,0,Overcast,51.0,0.0,42.1,10.0,4.6
1,B11,2.0,2016-01-01,1,1,Friday,PeakAM,9,11,39.9,37.801282,9.328973,2.098718,0,Overcast,58.0,0.0,39.9,10.0,10.4
2,B11,2.0,2016-01-01,1,1,Friday,PeakAM,9,44,38.45,37.801282,9.328973,0.648718,0,Overcast,58.0,0.0,39.9,10.0,10.4
3,B11,1.0,2016-01-01,1,1,Friday,PeakAM,9,52,53.9,55.033333,7.915746,-1.133333,0,Overcast,58.0,0.0,39.9,10.0,10.4
4,B11,2.0,2016-01-01,1,1,Friday,MidDay,10,10,34.7,58.695643,13.242097,-23.995643,0,Overcast,58.0,0.0,39.9,10.0,6.9


### Merge Spatial Data

In [78]:
# Load the external spatial features and preview them.
spatial = pd.read_csv('data/external/spatial_features.csv')
spatial.head()

Unnamed: 0,month,totalInjuries,route_id,pavementScore,potholeCount
0,1,162,B11,2110.0,237
1,10,110,B11,2110.0,237
2,11,148,B11,2110.0,237
3,12,160,B11,2110.0,237
4,2,114,B11,2110.0,237


In [79]:
# Row count of bus_weather, shown for comparison with the merged frame below.
len(bus_weather)

788624

In [80]:
# Left-join the spatial features onto every trip, matching the trip's
# bus_line to the spatial table's route_id within the same month.
bus_weather_spatial = bus_weather.merge(
    spatial,
    how='left',
    left_on=['bus_line', 'month'],
    right_on=['route_id', 'month'],
)

In [81]:
# route_id duplicates bus_line after the join, so it is dropped.
bus_weather_spatial = bus_weather_spatial.drop('route_id', axis=1)
# The spatial table is expected to hold one row per (route_id, month); if it
# does not, the left join duplicates trips. Fail loudly instead of relying on
# comparing printed counts.
assert len(bus_weather_spatial) == len(bus_weather), 'spatial merge changed the row count'
len(bus_weather_spatial)

788624

In [82]:
# Preview trips with weather and spatial features attached.
bus_weather_spatial.head()

Unnamed: 0,bus_line,direction,date,month,day,day_of_week,time_period,hour,minute,trip_time,...,delay,Conditions,Humidity,PrecipitationIn,TemperatureF,VisibilityMPH,Wind SpeedMPH,totalInjuries,pavementScore,potholeCount
0,B11,1.0,2016-01-01,1,1,Friday,Night,5,27,39.816667,...,0,Overcast,51.0,0.0,42.1,10.0,4.6,162,2110.0,237
1,B11,2.0,2016-01-01,1,1,Friday,PeakAM,9,11,39.9,...,0,Overcast,58.0,0.0,39.9,10.0,10.4,162,2110.0,237
2,B11,2.0,2016-01-01,1,1,Friday,PeakAM,9,44,38.45,...,0,Overcast,58.0,0.0,39.9,10.0,10.4,162,2110.0,237
3,B11,1.0,2016-01-01,1,1,Friday,PeakAM,9,52,53.9,...,0,Overcast,58.0,0.0,39.9,10.0,10.4,162,2110.0,237
4,B11,2.0,2016-01-01,1,1,Friday,MidDay,10,10,34.7,...,0,Overcast,58.0,0.0,39.9,10.0,6.9,162,2110.0,237


### Calculate Previous Trip Time Ratio (2nd prior bus departure)

In [83]:
# Ratio of actual to category-average trip time, taken from the row two
# positions earlier. The shift is applied within each bus_line: a plain
# shift(2) over the whole frame gives the first two rows of every line the
# ratios of the previous line's last two trips. Rows are expected to be
# ordered chronologically within each line at this point.
# NOTE(review): the shift still crosses directions and calendar days within a
# line, as before -- confirm whether that is intended.
trip_ratio = bus_weather_spatial['trip_time'] / bus_weather_spatial['avg_trip_time']
bus_weather_spatial['prev_trip_ratio'] = (
    trip_ratio.groupby(bus_weather_spatial['bus_line']).shift(2)
)

In [84]:
# Preview the frame with prev_trip_ratio; rows with no row two positions
# earlier to draw from show NaN.
bus_weather_spatial.head()

Unnamed: 0,bus_line,direction,date,month,day,day_of_week,time_period,hour,minute,trip_time,...,Conditions,Humidity,PrecipitationIn,TemperatureF,VisibilityMPH,Wind SpeedMPH,totalInjuries,pavementScore,potholeCount,prev_trip_ratio
0,B11,1.0,2016-01-01,1,1,Friday,Night,5,27,39.816667,...,Overcast,51.0,0.0,42.1,10.0,4.6,162,2110.0,237,
1,B11,2.0,2016-01-01,1,1,Friday,PeakAM,9,11,39.9,...,Overcast,58.0,0.0,39.9,10.0,10.4,162,2110.0,237,
2,B11,2.0,2016-01-01,1,1,Friday,PeakAM,9,44,38.45,...,Overcast,58.0,0.0,39.9,10.0,10.4,162,2110.0,237,0.638492
3,B11,1.0,2016-01-01,1,1,Friday,PeakAM,9,52,53.9,...,Overcast,58.0,0.0,39.9,10.0,10.4,162,2110.0,237,1.05552
4,B11,2.0,2016-01-01,1,1,Friday,MidDay,10,10,34.7,...,Overcast,58.0,0.0,39.9,10.0,6.9,162,2110.0,237,1.017161


In [85]:
# Discard every row that has a missing value in any column, which includes
# the rows left without a prev_trip_ratio by the shift.
bus_weather_spatial = bus_weather_spatial.dropna()

In [86]:
# Persist the final feature table; with to_csv's defaults the row index is
# written as the first (unnamed) CSV column.
bus_weather_spatial.to_csv('data/interim/bus_times_weather_spatial.csv')