In [2]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib
import matplotlib.pyplot as plt
%matplotlib inline

from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import LogisticRegressionCV
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score
from sklearn.metrics import mean_squared_error
from sklearn.neighbors import KNeighborsClassifier,KNeighborsRegressor
from sklearn.model_selection import cross_val_score
from sklearn.metrics import accuracy_score
from sklearn.model_selection import KFold
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import PolynomialFeatures
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.decomposition import PCA

In [3]:
#import all 3 routes
route1 = pd.read_csv('data/route1processeddata.csv')
route1['route'] = 1
route2 = pd.read_csv('data/route2processeddata.csv')
route2['route'] = 2
route3 = pd.read_csv('data/route3processeddata.csv')
route3['route'] = 3

In [4]:
#combine all 3 routes
df = pd.concat([route1,route2,route3],axis = 0)

In [5]:
#filter for Dockability & Rideability
df = df[df['Dockability']*df['Releasability'] == 1]
#df.describe()

In [6]:
#functions for creating categories
def utcicat(row):
    if row['utcivar1'] < -40:
        val = -5
    elif (-40 <= row['utcivar1'] and row['utcivar1'] <-27):
        val = -4
    elif (-27 <= row['utcivar1'] and row['utcivar1'] <-13):
        val = -3 
    elif (-13 <= row['utcivar1'] and row['utcivar1'] <0):
        val = -2 
    elif (0<= row['utcivar1'] and row['utcivar1'] <9):
        val = -1
    elif (9 <= row['utcivar1'] and row['utcivar1'] <26):
        val = 0
    elif (26 <= row['utcivar1'] and row['utcivar1'] <28):
        val = 1 
    elif (28 <= row['utcivar1'] and row['utcivar1'] <32):
        val = 2
    elif (32 <= row['utcivar1'] and row['utcivar1'] <38):
        val = 3
    elif (38 <= row['utcivar1'] and row['utcivar1'] <46):
        val = 4
    elif (row['utcivar1'] >= 46):
        val = 5
    return val

def precipitationcat(row):
    if row['Precipitation'] == 1:
        val = 1
    else:
        val = 0
    return val

def hourcat(row):
    if row['Hour'] <= 6: 
        val = 'midnight'
    elif row['Hour'] <= 10: 
        val = 'morning'
    elif row['Hour'] <= 3:
        val = 'early_afternoon'
    elif row['Hour'] <= 7:
        val = 'late_afternoon'
    elif row['Hour'] <= 23:
        val = 'evening'
    return val

def countcat(row): 
    if row['count'] == 0: 
        val = 0 
    elif row['count'] <= 2:
        val = 1
    elif row['count'] >= 3: 
        val = 2
    return val

def weekendcat(row):
    year = str(int(row['Year']))
    
    month = int(row['Month'])
    month = str(("0" if month < 10 else "")) + str(month)
    
    day = int(row['Day'])
    day = str(("0" if day < 10 else "")) + str(day)
    
    hour = int(row['Hour'])
    hour = str(("0" if hour < 10 else "")) + str(hour)
    
    inputstring = year + month + day + hour
    datetime = pd.to_datetime(inputstring,format='%Y%m%d%H')
    day = datetime.dayofweek
    isWeekend = 1 if day > 4 else 0
    return isWeekend
    

In [7]:
#set up categorical predictors
df['utcivar1_cat'] = df.apply(utcicat, axis=1)
df['precipitation_cat'] = df.apply(precipitationcat, axis=1)
df['hour_cat'] = df.apply(hourcat, axis=1)
df['count_cat'] = df.apply(countcat, axis=1)
df['weekend'] = df.apply(weekendcat, axis=1)

In [8]:
#drop unecessary columns
df = df.drop(['DateTime','Dockability','Releasability'],axis =1)
df.head()

Unnamed: 0,Year,Month,Day,Hour,DBT,RelHum,WSp,Precipitation,Elev,utcivar1,utcivar2,utcivar3,count,specialdate,route,utcivar1_cat,precipitation_cat,hour_cat,count_cat,weekend
2649,2017.0,4.0,21.0,9.0,6.7,100.0,7.7,0.0,52.392654,-5.488704,-9.169357,-9.233817,0.0,0,1,-2,0,morning,0,0
2650,2017.0,4.0,21.0,10.0,6.7,96.0,8.2,0.0,58.334083,-6.296706,-9.999895,-10.068702,1.0,0,1,-2,0,morning,1,0
2651,2017.0,4.0,21.0,11.0,7.2,97.0,7.2,0.0,59.503749,-3.105039,-6.808441,-6.900785,0.0,0,1,-2,0,evening,0,0
2652,2017.0,4.0,21.0,12.0,6.7,100.0,8.2,0.0,55.398772,-5.578307,-9.260705,-9.355655,0.0,0,1,-2,0,evening,0,0
2653,2017.0,4.0,21.0,13.0,6.7,100.0,7.2,0.0,47.574727,-3.979534,-7.688882,-7.775989,0.0,0,1,-2,0,evening,0,0


#descriptors for all columns

'Year' : year of data point

'Month': month of data point

'Day': day of data point

'Hour': hour of data point

'DBT': dry bulb temperature (degree celcius) of hour from weather file

'RelHum': relative humidity (%) of hour from weather file

'WSp': wind speed (meters per second) of hour from weather file

'Precipitation': number of inches of precipitation of hour from weather file

'Elev': sun elevation (in degrees) of hour from weather file

'utcivar1': utci (degrees celcius) value calculated from first variant (purely weather file data)

'utcivar2': utci (degrees celcius) value calculated from second variant (accounting for local building shade)

'utcivar3': utci (degrees celcius) value calculated from third variant (accounting for local building and trees shade)

'count': number of bluebike trips from start to end station within that hour 

'specialdate': 1 for special date, 0 for non-special date. Special date in route 1 and 3 include public hols + MIT non-term dates. 
                special dates in route 2 include public hols.
'route': indicates which route this data point belongs to. route 1 - bridge route (highly exposed). route 2 - linear park route (dense foliage). route 3 - urban canyon route in MIT

'utcivar1_cat': categorized values of utcivar1

'precipitation_cat': binary values for precipitation (0 for no precipitation, 1 for precipitation)

'hour_cat': hour category that the data point falls into 

'count_cat': categorized bike trip count (0 for no trips, 1 for few trips, 2 for many trips)

'weekend': 0 for weekday, 1 for weekend


<b>Models</b> <br>
Sarah K <br>
Linear Regression / Logistic Classifier 

Sarah M <br>
RF Regression / RF Classifier / 209A Model?

Liz <br>
Boosting Regression / Boosting Classifier / 209A Model?

Cam <br>
KNN Regression / KNN Classifier

<b> Discussion  </b> <br>
Sarah K and I assumed that boosting and random forest classifiers will be the most predictive models but most intensive hyper-parameter tuning so we assigned these models to Liz and Sarah M. <br>
Each person has 2 assigned models with 3 sets of predictors and 3 unique UTCs. Therefore each person is responsible for 18 models, 72 models in all. Each person will explore regularazation, standardization / normalization, hyper parameter tuning, PCA, as well as interaction terms and polynomial terms. <br>
We were unsure what Liz and Sarah's plans were for the 209a model, so we just put in a placeholder.

In [9]:
df.to_csv('data/data.csv',index=False)