# Filter MBTA Special Days #

## Setup ##

In [1]:
import sys
import datetime
import numpy as np
import pandas as pd

## Import Data ##

In [4]:
# Read the daily MBTA table; low_memory=False makes pandas read the file in
# one pass instead of inferring column types chunk by chunk.
daily_csv_path = "../../../data/mbta_daily.csv"
data = pd.read_csv(daily_csv_path, low_memory=False)
data.head()

Unnamed: 0,locationid,service_day,entries,name,line_1,line_2,lat,lon,service_datetime,fog,...,entries_weeks_ago_1,entries_weeks_ago_2,entries_weeks_ago_3,rain_predict,rain_fall_predict,snow_predict,snow_fall_predict,snow_accum,snow_accum_predict,dist_to_center
0,1002,2013-01-01 00:00:00,1892,Andrew Square,Red,,42.32955,-71.05696,2013-01-01 03:00:00,0,...,,,,0,0,0,0,0,0,3.404767
1,1002,2013-01-02 00:00:00,5134,Andrew Square,Red,,42.32955,-71.05696,2013-01-02 04:45:00,0,...,,,,0,0,0,0,0,0,3.404767
2,1002,2013-01-03 00:00:00,5733,Andrew Square,Red,,42.32955,-71.05696,2013-01-03 05:00:00,0,...,,,,0,0,0,0,0,0,3.404767
3,1002,2013-01-04 00:00:00,6125,Andrew Square,Red,,42.32955,-71.05696,2013-01-04 05:00:00,0,...,,,,0,0,0,0,0,0,3.404767
4,1002,2013-01-05 00:00:00,3410,Andrew Square,Red,,42.32955,-71.05696,2013-01-05 04:15:00,0,...,,,,0,0,1,0,0,0,3.404767


## Remove Known Special Days ##

In [5]:
# Row count before filtering.
print(len(data))

# Calendar dates whose rows are removed from the data set.
bad_days = [

    # New Year's Day.
    datetime.date(2013, 1, 1),
    datetime.date(2014, 1, 1),
    datetime.date(2015, 1, 1),

    # Day after New Year's Day.
    datetime.date(2013, 1, 2),
    datetime.date(2014, 1, 2),
    datetime.date(2015, 1, 2),

    # Boston marathon.
    datetime.date(2013, 4, 15),
    datetime.date(2014, 4, 21),

    # Fourth of July.
    datetime.date(2013, 7, 4),
    datetime.date(2014, 7, 4),

    # Day after the Fourth of July.
    datetime.date(2013, 7, 5),
    datetime.date(2014, 7, 5),

    # Boston move out day.
    datetime.date(2013, 9, 1),
    datetime.date(2014, 9, 1),

    # Labor Day (2014-09-01 is also listed above as move out day; the
    # duplicate is harmless because the dates are matched as a set).
    datetime.date(2013, 9, 2),
    datetime.date(2014, 9, 1),

    # Veterans Day.
    datetime.date(2013, 11, 11),
    datetime.date(2014, 11, 11),

    # Thanksgiving.
    datetime.date(2013, 11, 28),
    datetime.date(2014, 11, 27),

    # Black Friday.
    datetime.date(2013, 11, 29),
    datetime.date(2014, 11, 28),

    # Christmas week.
    datetime.date(2013, 12, 24),
    datetime.date(2014, 12, 24),

    datetime.date(2013, 12, 25),
    datetime.date(2014, 12, 25),

    datetime.date(2013, 12, 26),
    datetime.date(2014, 12, 26),

    datetime.date(2013, 12, 27),
    datetime.date(2014, 12, 27),

    # New Year's Eve.
    datetime.date(2013, 12, 31),
    datetime.date(2014, 12, 31),
]

# Parse the service_day column once and drop every listed date in a single
# pass, instead of re-parsing the whole column and re-slicing the frame once
# per listed date.
service_dates = pd.Series(pd.DatetimeIndex(data['service_day']).date)
# .to_numpy() yields a positional mask, so the frame's own index (whatever it
# is) plays no part in the row selection.
is_bad_day = service_dates.isin(set(bad_days)).to_numpy()
data = data[~is_bad_day]

# Row count after filtering.
print(len(data))

47901
45968


In [11]:
# Work on an independent (deep) copy so the filtered `data` frame stays intact.
tmpdata = data.copy(deep=True)

In [12]:
def add_previous_week(data, weeks_ago=1, day_col='service_datetime',
                      max_extra_weeks=4):
    """
    Adds a column holding each station's entries from a previous week.

    For every row, the value of 'entries' is looked up for the same
    'locationid' exactly `weeks_ago` weeks before the row's `day_col` value.
    If there is not exactly one such row (none, or several), the search steps
    back one more week at a time, up to `max_extra_weeks` additional weeks.
    Rows for which no unambiguous match is found get NaN.

    The result is stored in a column named 'entries_weeks_ago_<weeks_ago>'.

    Key arguments:
    data            -- The current dataframe. Must contain the columns
                       'locationid', 'entries' and `day_col`.
    weeks_ago       -- The number of weeks to look back.
    day_col         -- The name of the column holding each row's day; its
                       values must support subtracting a pandas Timedelta.
    max_extra_weeks -- How many further weeks back to try when the requested
                       week has no unambiguous match (default 4).

    Returns the dataframe produced by a row-wise apply over `data`, with the
    new column added.
    """

    column = 'entries_weeks_ago_' + str(weeks_ago)

    # A row-wise apply over an empty frame never calls the row function, so
    # add the (all-NaN) column explicitly to keep the output schema stable.
    if len(data.index) == 0:
        result = data.copy()
        result[column] = np.nan
        return result

    # Build the (day, station) -> entries lookup once instead of scanning the
    # whole frame for every row and every candidate week.  Keys with missing
    # parts can never match, and keys that occur more than once are ambiguous
    # (the original check was "exactly one matching row"), so both are left
    # out of the lookup.
    keyed = data[[day_col, 'locationid', 'entries']].dropna(
        subset=[day_col, 'locationid'])
    unambiguous = keyed.drop_duplicates(
        subset=[day_col, 'locationid'], keep=False)
    lookup = dict(zip(
        zip(unambiguous[day_col], unambiguous['locationid']),
        unambiguous['entries'],
    ))

    def apply(row):
        # Default when no earlier week yields a single matching row.
        row[column] = np.nan

        for curr_weeks_ago in range(weeks_ago, weeks_ago + max_extra_weeks + 1):
            # Find the previous day x weeks ago.
            previous_datetime = row[day_col] - pd.Timedelta(weeks=curr_weeks_ago)

            # Use the entries for this station exactly x weeks ago, if there
            # is exactly one such row; otherwise try one week further back.
            key = (previous_datetime, row['locationid'])
            if key in lookup:
                row[column] = lookup[key]
                break

        return row

    return data.apply(apply, axis=1)

In [13]:
# Remove the lag columns loaded from the CSV, one at a time and in place.
for stale_column in ('entries_weeks_ago_1',
                     'entries_weeks_ago_2',
                     'entries_weeks_ago_3'):
    tmpdata.drop(stale_column, axis=1, inplace=True)
tmpdata.head()

Unnamed: 0,locationid,service_day,entries,name,line_1,line_2,lat,lon,service_datetime,fog,...,month_10,month_11,month_12,rain_predict,rain_fall_predict,snow_predict,snow_fall_predict,snow_accum,snow_accum_predict,dist_to_center
2,1002,2013-01-03 00:00:00,5733,Andrew Square,Red,,42.32955,-71.05696,2013-01-03 05:00:00,0,...,0,0,0,0,0,0,0,0,0,3.404767
3,1002,2013-01-04 00:00:00,6125,Andrew Square,Red,,42.32955,-71.05696,2013-01-04 05:00:00,0,...,0,0,0,0,0,0,0,0,0,3.404767
4,1002,2013-01-05 00:00:00,3410,Andrew Square,Red,,42.32955,-71.05696,2013-01-05 04:15:00,0,...,0,0,0,0,0,1,0,0,0,3.404767
5,1002,2013-01-06 00:00:00,2311,Andrew Square,Red,,42.32955,-71.05696,2013-01-06 05:00:00,0,...,0,0,0,0,0,0,0,0,0,3.404767
6,1002,2013-01-07 00:00:00,5998,Andrew Square,Red,,42.32955,-71.05696,2013-01-07 04:00:00,0,...,0,0,0,0,0,0,0,0,0,3.404767


In [16]:
# Make sure service_day holds timestamps, since add_previous_week subtracts
# week offsets from it.
parsed_service_days = pd.to_datetime(tmpdata['service_day'])
tmpdata['service_day'] = parsed_service_days
# Attach the entries_weeks_ago_1 column.
tmpdata = add_previous_week(tmpdata, day_col='service_day', weeks_ago=1)
tmpdata.head()

Unnamed: 0,locationid,service_day,entries,name,line_1,line_2,lat,lon,service_datetime,fog,...,month_11,month_12,rain_predict,rain_fall_predict,snow_predict,snow_fall_predict,snow_accum,snow_accum_predict,dist_to_center,entries_weeks_ago_1
2,1002,2013-01-03,5733,Andrew Square,Red,,42.32955,-71.05696,2013-01-03 05:00:00,0,...,0,0,0,0,0,0,0,0,3.404767,
3,1002,2013-01-04,6125,Andrew Square,Red,,42.32955,-71.05696,2013-01-04 05:00:00,0,...,0,0,0,0,0,0,0,0,3.404767,
4,1002,2013-01-05,3410,Andrew Square,Red,,42.32955,-71.05696,2013-01-05 04:15:00,0,...,0,0,0,0,1,0,0,0,3.404767,
5,1002,2013-01-06,2311,Andrew Square,Red,,42.32955,-71.05696,2013-01-06 05:00:00,0,...,0,0,0,0,0,0,0,0,3.404767,
6,1002,2013-01-07,5998,Andrew Square,Red,,42.32955,-71.05696,2013-01-07 04:00:00,0,...,0,0,0,0,0,0,0,0,3.404767,


In [17]:
# Make sure service_day holds timestamps, since add_previous_week subtracts
# week offsets from it.
parsed_service_days = pd.to_datetime(tmpdata['service_day'])
tmpdata['service_day'] = parsed_service_days
# Attach the entries_weeks_ago_2 column.
tmpdata = add_previous_week(tmpdata, day_col='service_day', weeks_ago=2)
tmpdata.head()

Unnamed: 0,locationid,service_day,entries,name,line_1,line_2,lat,lon,service_datetime,fog,...,month_12,rain_predict,rain_fall_predict,snow_predict,snow_fall_predict,snow_accum,snow_accum_predict,dist_to_center,entries_weeks_ago_1,entries_weeks_ago_2
2,1002,2013-01-03,5733,Andrew Square,Red,,42.32955,-71.05696,2013-01-03 05:00:00,0,...,0,0,0,0,0,0,0,3.404767,,
3,1002,2013-01-04,6125,Andrew Square,Red,,42.32955,-71.05696,2013-01-04 05:00:00,0,...,0,0,0,0,0,0,0,3.404767,,
4,1002,2013-01-05,3410,Andrew Square,Red,,42.32955,-71.05696,2013-01-05 04:15:00,0,...,0,0,0,1,0,0,0,3.404767,,
5,1002,2013-01-06,2311,Andrew Square,Red,,42.32955,-71.05696,2013-01-06 05:00:00,0,...,0,0,0,0,0,0,0,3.404767,,
6,1002,2013-01-07,5998,Andrew Square,Red,,42.32955,-71.05696,2013-01-07 04:00:00,0,...,0,0,0,0,0,0,0,3.404767,,


In [20]:
# Make sure service_day holds timestamps, since add_previous_week subtracts
# week offsets from it.
parsed_service_days = pd.to_datetime(tmpdata['service_day'])
tmpdata['service_day'] = parsed_service_days
# Attach the entries_weeks_ago_3 column.
tmpdata = add_previous_week(tmpdata, day_col='service_day', weeks_ago=3)
tmpdata.head()

Unnamed: 0,locationid,service_day,entries,name,line_1,line_2,lat,lon,service_datetime,fog,...,rain_predict,rain_fall_predict,snow_predict,snow_fall_predict,snow_accum,snow_accum_predict,dist_to_center,entries_weeks_ago_1,entries_weeks_ago_2,entries_weeks_ago_3
2,1002,2013-01-03,5733,Andrew Square,Red,,42.32955,-71.05696,2013-01-03 05:00:00,0,...,0,0,0,0,0,0,3.404767,,,
3,1002,2013-01-04,6125,Andrew Square,Red,,42.32955,-71.05696,2013-01-04 05:00:00,0,...,0,0,0,0,0,0,3.404767,,,
4,1002,2013-01-05,3410,Andrew Square,Red,,42.32955,-71.05696,2013-01-05 04:15:00,0,...,0,0,1,0,0,0,3.404767,,,
5,1002,2013-01-06,2311,Andrew Square,Red,,42.32955,-71.05696,2013-01-06 05:00:00,0,...,0,0,0,0,0,0,3.404767,,,
6,1002,2013-01-07,5998,Andrew Square,Red,,42.32955,-71.05696,2013-01-07 04:00:00,0,...,0,0,0,0,0,0,3.404767,,,


In [21]:
# Write the filtered table, without the dataframe index, to a new CSV file.
noholidays_csv_path = "../../../data/mbta_daily_noholidays.csv"
tmpdata.to_csv(noholidays_csv_path, index=False)