In [83]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import KFold, cross_val_score
from pandas.tseries.holiday import USFederalHolidayCalendar
from sklearn.linear_model import Ridge, Lasso
from sklearn.pipeline import make_pipeline
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.model_selection import RandomizedSearchCV
"""
import warnings
warnings.filterwarnings('ignore')
"""
#maybe?
#from sklearn.metrics import mean_squared_error

#get bicycle data
counts = pd.read_csv('data/FremontBridge.csv', index_col='Date', parse_dates=True)
weather = pd.read_csv('data/BicycleWeather.csv', index_col='DATE', parse_dates=True)

#create a data frame with the total traffic for each day
daily = counts.resample('d').sum()
daily['Total'] = daily.sum(axis=1)
daily = daily[['Total']] # remove other columns

# add an indicator about Mon - Sun
days = ['Mon', 'Tue', 'Wed', 'Thu', 'Fri', 'Sat', 'Sun']
for i in range(7):
    daily[days[i]] = (daily.index.dayofweek == i).astype(float)
    
# add an indicator about holiday
cal = USFederalHolidayCalendar()
holidays = cal.holidays('2012', '2016')
daily = daily.join(pd.Series(1, index=holidays, name='holiday'))
# replace missing data with 0
daily['holiday'].fillna(0, inplace=True)

#get the amount of daylight hours each day
def hours_of_daylight(date, axis=23.44, latitude=47.61):
    """Compute the hours of daylight for the given date"""
    days = (date - pd.Timestamp(2000, 12, 21)).days
    m = (1. - np.tan(np.radians(latitude))
         * np.tan(np.radians(axis) * np.cos(days * 2 * np.pi / 365.25)))
    return 24. * np.degrees(np.arccos(1 - np.clip(m, 0, 2))) / 180.

daily['daylight_hrs'] = list(map(hours_of_daylight, daily.index))

#get the temperature
weather['TMIN'] /= 10
weather['TMAX'] /= 10
weather['Temp (C)'] = 0.5 * (weather['TMIN'] + weather['TMAX'])

#get the amount  of rain and set a flag to see if it was a dry day
weather['PRCP'] /= 254
weather['dry day'] = (weather['PRCP'] == 0).astype(int)

daily = daily.join(weather[['PRCP', 'Temp (C)', 'dry day']])

# number of years passed
daily['annual'] = (daily.index - daily.index[0]).days / 365.

# Drop any rows with null values
daily.dropna(axis=0, how='any', inplace=True)

#linear prediction
column_names = ['Mon', 'Tue', 'Wed', 'Thu', 'Fri', 'Sat', 'Sun', 'holiday',
                'daylight_hrs', 'PRCP', 'dry day', 'Temp (C)', 'annual']
X = daily[column_names]
y = daily['Total']

#create linear model
linModel = LinearRegression(fit_intercept=False)
linScores = cross_val_score(linModel,X,y,cv=KFold(n_splits = 10))
linScores = testModelKF(linModel)
linScore = linScores.mean()
print("Linear:",linScore)

aRange = np.arange(0,6.001,0.001)
aRange = [round(x,3) for x in aRange]
params = {'alpha':aRange}

#create Ridge model
ridModel = Ridge()
randSearchRid = RandomizedSearchCV(estimator = ridModel, param_distributions = params, cv = 10)
randSearchRid.fit(X, y)
ridScore = randSearchRid.best_score_
ridPar = randSearchRid.best_params_
print("Ridge Score:",ridScore)
print("Ridge Alpha:",ridPar)

#create Lasso model
lasModel = Lasso()
randSearchLas = RandomizedSearchCV(estimator = lasModel, param_distributions = params, cv = 10)
randSearchLas.fit(X, y)
lasScore = randSearchLas.best_score_
lasPar = randSearchLas.best_params_
print("Lasso Score:", lasScore)
print("Lasso Alpha:",lasPar)

if linScore > ridScore and linScore > lasScore:
    print("Linear Model Best")
elif ridScore > lasScore:
    print("Ridge Model Best")
else:
    print("Lasso Model Best")

  counts = pd.read_csv('data/FremontBridge.csv', index_col='Date', parse_dates=True)


Linear: 0.7691752923789887
Ridge Score: 0.771064924585374
Ridge Alpha: {'alpha': 4.346}
Lasso Score: 0.7708911890622756
Lasso Alpha: {'alpha': 3.698}
Ridge Model Best
