## Estimating Workday Commuters

### Objective

* Produce a data file containing the estimated number of commuters per weekday, for the same month in two consecutive years

### Rationale

* These data are key to describing the benefits of the system, because there are studies that specifically examine the benefits of commuting to work by bicycle

### Requirements

* Pandas (0.24.2)
* Numpy (1.16.4)
* Scikit-learn (0.21.2)

### Input / Output

* Input files named according to `{yyyy}{mm}-hourly-tripdata.csv` where yyyy is the current year, and mm is the month of interest.  The system will look for two files, one with year yyyy, and another with year yyyy - 1.

* Output file named `{yyyy}{mm}-estimated-commuters.csv`

* These data files should be in a sub-directory called "trips" within the "data" directory, located in the folder above the one containing this notebook. 

* The required input files are generated by running the notebook called `process_NYC_bike_data_for_tableau.ipynb`, which should accompany this notebook.  The notebook is available on GitHub at https://github.com/andrewguenthner/nyc_bikeshare.  Note that to run the precursor notebook, trip data must also be downloaded from https://s3.amazonaws.com/tripdata/index.html.


### <span style="color:blue">Required User Input</span>

In [1]:
# Prompt for the month / year to analyse.  On success this cell defines the
# notebook-level variables `mm` (month as int) and `yyyy` (year as int).
# The prompt is shown at most five times; on the sixth pass through the loop a
# message is printed and the loop exits without defining `mm` / `yyyy`.
valid = False
attempts = 0  # Tracked to avoid infinite loops
while (not valid) and (attempts < 6):
    attempts += 1
    if attempts == 6:
        print('Too many invalid attempts.  Exiting loop.  *This notebook will not process data*.')
        break
    try:
        mmyyyy = input('Please enter the two-digit month and four-digit year for analysis (e.g. 05-2019)')
        # The month is read from the first two characters and the year from the
        # last four, so anything shorter than six characters would make the two
        # slices overlap (e.g. "2019" would otherwise parse as month 20, year 2019).
        if len(mmyyyy) < 6:
            raise ValueError('Entry is too short to contain a month and a year.')
        entered_month = int(mmyyyy[0:2])
        entered_year = int(mmyyyy[-4:])
        # Reject values that parse as integers but cannot be a real month / four-digit year
        if not 1 <= entered_month <= 12:
            raise ValueError('Month must be between 01 and 12.')
        if not 1000 <= entered_year <= 9999:
            raise ValueError('Year must have four digits.')
        # Only publish the values once they are known to be valid
        mm, yyyy = entered_month, entered_year
        print('Thank you.')
        valid = True
    except ValueError:
        print('Invalid entry.  Please try again.')

Please enter the two-digit month and four-digit year for analysis (e.g. 05-2019)05-2019
Thank you.


### Set-Up / Import

In [23]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression 
%matplotlib inline

In [12]:
# Build the paths of the two hourly trip files: the selected month in the
# selected year and the same month one year earlier.
trip_data_dir = '../data/trips/'
# Single-digit months are zero-padded so the name matches {yyyy}{mm}-hourly-tripdata.csv
month_str = '0' + str(mm) if mm < 10 else str(mm)
this_year_str, last_year_str = str(yyyy), str(yyyy - 1)
this_year_in_filename = ''.join([trip_data_dir, this_year_str, month_str, '-hourly-tripdata.csv'])
last_year_in_filename = ''.join([trip_data_dir, last_year_str, month_str, '-hourly-tripdata.csv'])
# Load both files, parsing the 'date' column as datetimes
this_yr_hourly = pd.read_csv(this_year_in_filename, parse_dates=['date'])
last_yr_hourly = pd.read_csv(last_year_in_filename, parse_dates=['date'])

In [13]:
# Check the data: preview the first rows of the current-year hourly table
this_yr_hourly.head()

Unnamed: 0.1,Unnamed: 0,date,hour,start_id,median_duration,trip_count,station_lat,station_long,frac_subscriber,frac_male,frac_under_25,frac_25_34,frac_35_44,frac_45_54,frac_55_over,return_count,comp_net_inven
0,0,2019-05-01,0,83,1802.0,3,40.683826,-73.976323,0.0,0.666667,1.0,0.0,0.0,0.0,0.0,0.0,-3.0
1,1,2019-05-01,0,127,581.0,1,40.731724,-74.006744,1.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
2,2,2019-05-01,0,128,548.0,1,40.727103,-74.002971,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,-1.0
3,3,2019-05-01,0,146,467.0,1,40.71625,-74.009106,1.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
4,4,2019-05-01,0,151,541.0,1,40.722104,-73.997249,1.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0


In [14]:
# Preview the first rows of the previous-year hourly table
last_yr_hourly.head()

Unnamed: 0.1,Unnamed: 0,date,hour,start_id,median_duration,trip_count,station_lat,station_long,frac_subscriber,frac_male,frac_under_25,frac_25_34,frac_35_44,frac_45_54,frac_55_over,return_count,comp_net_inven
0,0,2018-05-01,0,128,550.0,3,40.727103,-74.002971,1.0,1.0,0.0,0.333333,0.333333,0.0,0.333333,0.0,-3.0
1,1,2018-05-01,0,146,1182.5,2,40.71625,-74.009106,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-2.0
2,2,2018-05-01,0,157,188.0,1,40.690893,-73.996123,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,-1.0
3,3,2018-05-01,0,161,480.5,2,40.72917,-73.998102,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,-2.0
4,4,2018-05-01,0,168,662.0,1,40.739713,-73.994564,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-1.0


### Process Data

In [16]:
# Current-year data, step 1: sum the trip counts over all rows that share the
# same (date, hour), giving one total per hour of each day.
this_yr_hourly_totals = (
    this_yr_hourly[['date', 'hour', 'trip_count']]
    .groupby(['date', 'hour'])
    .sum()
    .reset_index()
)
this_yr_hourly_totals.head()

Unnamed: 0,date,hour,trip_count
0,2019-05-01,0,359
1,2019-05-01,1,158
2,2019-05-01,2,88
3,2019-05-01,3,67
4,2019-05-01,4,110


In [19]:
# Keep only rows whose date falls on Monday-Friday (dayofweek 0-4)
# NOTE(review): weekdays that are public holidays are not excluded by this filter -- confirm that is intended
this_yr_weekday_hourly_totals = this_yr_hourly_totals.loc[
    lambda totals: totals['date'].dt.dayofweek < 5
]
# Sanity check: expect 20 - 25 distinct dates, each with one row per hour present
this_yr_weekday_hourly_totals['date'].value_counts()

2019-05-28    24
2019-05-14    24
2019-05-16    24
2019-05-27    24
2019-05-10    24
2019-05-21    24
2019-05-15    24
2019-05-09    24
2019-05-20    24
2019-05-03    24
2019-05-31    24
2019-05-08    24
2019-05-17    24
2019-05-02    24
2019-05-30    24
2019-05-13    24
2019-05-24    24
2019-05-07    24
2019-05-01    24
2019-05-29    24
2019-05-23    24
2019-05-06    24
2019-05-22    24
Name: date, dtype: int64

In [40]:
def prep_baseline_data(hours_list: list, trip_counts: list, fit_hours: tuple = (3, 4, 5, 10, 11, 12)) -> tuple:
    """Build the X and y arrays needed to fit a scikit-learn linear regression of trips vs. hour.

    Parameters
    ----------
    hours_list : sequence of int
        Hours of the day (0-23) for which a trip count is available.
    trip_counts : sequence of int
        Trip counts, aligned position-by-position with ``hours_list``.
    fit_hours : sequence of int, optional
        Hours whose data points are used for the regression.  Defaults to
        3, 4, 5 (3 AM - 5 AM) and 10, 11, 12 (10 AM - noon).

    Returns
    -------
    (X, y) : tuple of numpy.ndarray
        ``X`` holds the hour values and ``y`` the matching trip counts, each
        shaped (len(fit_hours), 1).

    Raises
    ------
    ValueError
        If one of ``fit_hours`` is not present in ``hours_list``.
    """
    hours_list = list(hours_list)
    trip_counts = list(trip_counts)
    X_list = []
    y_list = []
    for hour in fit_hours:
        try:
            position = hours_list.index(hour)
        except ValueError:
            raise ValueError('No trip data found for hour {}; cannot build the baseline.'.format(hour))
        # Use the hour itself as the regressor, not its position in the list:
        # the two only coincide when every earlier hour is present in the data.
        X_list.append(hour)
        y_list.append(trip_counts[position])
    X = np.array(X_list).reshape(-1, 1)
    y = np.array(y_list).reshape(-1, 1)
    return X, y

In [48]:
def estimate_baseline(hours_list: list, trip_counts: list) -> tuple:
    """Estimate the baseline (non-commuter) trips for the 6 AM - 9 AM hours.

    A straight line is fitted to the data points returned by
    ``prep_baseline_data`` and then evaluated at hours 6, 7, 8 and 9.  The
    prediction inputs are hour values, so the X array used for fitting is
    treated as being on the same hour axis.

    Parameters
    ----------
    hours_list : sequence of int
        Hours of the day for which a trip count is available.
    trip_counts : sequence of int
        Trip counts aligned with ``hours_list``.

    Returns
    -------
    (hours_list_out, baseline_out) : tuple of numpy.ndarray
        ``hours_list_out`` is the column vector [[6], [7], [8], [9]] and
        ``baseline_out`` the predicted baseline trips for those hours; both
        have shape (4, 1).
    """
    X, y = prep_baseline_data(hours_list, trip_counts)
    # fit() returns the fitted estimator, so construction and fitting can be chained
    regressor = LinearRegression().fit(X, y)
    hours_list_out = np.array([6, 7, 8, 9]).reshape(-1, 1)
    baseline_out = regressor.predict(hours_list_out)
    return hours_list_out, baseline_out

In [57]:
def estimate_daily_commuters(df, date_to_check) -> int:
    """Estimate the number of commuters on one date.

    For each of the hours 6, 7, 8 and 9 (6 AM - 9 AM) the estimated baseline is
    subtracted from the actual trip total; the four differences are summed and
    the result is truncated to an int.

    Parameters
    ----------
    df : DataFrame with 'date', 'hour' and 'trip_count' columns (hourly trip totals).
    date_to_check : the date whose rows are selected from ``df``.
    """
    # Select this date's rows once and pull out the aligned hour / count lists
    day_rows = df[df['date'] == date_to_check]
    hours_list = day_rows['hour'].to_list()
    tripcount_list = day_rows['trip_count'].to_list()

    baseline_hours, baseline_trips = estimate_baseline(hours_list, tripcount_list)
    # Flatten the column vectors returned for the regression into an hour -> baseline lookup
    baseline_by_hour = dict(
        zip(np.ravel(baseline_hours).tolist(), np.ravel(baseline_trips).tolist())
    )

    # Accumulate actual-minus-baseline over the morning commute hours
    estimate = 0
    for hour_to_check in (6, 7, 8, 9):
        actual_trips = tripcount_list[hours_list.index(hour_to_check)]
        estimate += actual_trips - baseline_by_hour[hour_to_check]
    return int(estimate)

In [58]:
# Estimate commuters for every distinct weekday in the current-year data and
# collect the results in a two-column table (date, commuters).
datelist = this_yr_weekday_hourly_totals['date'].unique()
commuters = list(
    map(lambda day: estimate_daily_commuters(this_yr_weekday_hourly_totals, day), datelist)
)
this_yr_commuter_data = pd.DataFrame({'date': datelist, 'commuters': commuters})
this_yr_commuter_data.head()

Unnamed: 0,date,commuters
0,2019-05-01,11262
1,2019-05-02,10868
2,2019-05-03,9414
3,2019-05-06,10939
4,2019-05-07,12415


In [60]:
# Previous year: same pipeline as for the current year
# 1) total trips per (date, hour)
last_yr_hourly_totals = (
    last_yr_hourly[['date', 'hour', 'trip_count']]
    .groupby(['date', 'hour'])
    .sum()
    .reset_index()
)
# 2) keep Monday-Friday only (dayofweek 0-4)
last_yr_weekday_hourly_totals = last_yr_hourly_totals.loc[
    lambda totals: totals['date'].dt.dayofweek < 5
]
# 3) one commuter estimate per distinct date
datelist = last_yr_weekday_hourly_totals['date'].unique()
commuters = list(
    map(lambda day: estimate_daily_commuters(last_yr_weekday_hourly_totals, day), datelist)
)
last_yr_commuter_data = pd.DataFrame({'date': datelist, 'commuters': commuters})
last_yr_commuter_data.head()

Unnamed: 0,date,commuters
0,2018-05-01,10605
1,2018-05-02,12313
2,2018-05-03,11668
3,2018-05-04,10011
4,2018-05-07,10983


### Save Data

In [62]:
# Build the output paths ({yyyy}{mm}-estimated-commuters.csv) for both years
trip_data_dir = '../data/trips/'
# Single-digit months are zero-padded to two characters
month_str = '0' + str(mm) if mm < 10 else str(mm)
this_year_str, last_year_str = str(yyyy), str(yyyy - 1)
this_year_out_filename = ''.join([trip_data_dir, this_year_str, month_str, '-estimated-commuters.csv'])
last_year_out_filename = ''.join([trip_data_dir, last_year_str, month_str, '-estimated-commuters.csv'])
# Write one CSV per year
# NOTE(review): to_csv is called with its defaults, so the row index is written as an
# extra unnamed first column -- confirm downstream consumers expect that
this_yr_commuter_data.to_csv(this_year_out_filename)
last_yr_commuter_data.to_csv(last_year_out_filename)