In [5]:
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns; sns.set()
import numpy as np
import pandas as pd
import datetime

from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.model_selection import RandomizedSearchCV, cross_val_score
from sklearn.metrics import accuracy_score
from scipy.stats import uniform

counts = pd.read_csv('data/FremontBridge.csv', index_col='Date', parse_dates=True)
weather = pd.read_csv('data/BicycleWeather.csv', index_col='DATE', parse_dates=True)
daily = counts.resample('d').sum()
daily['Total'] = daily.sum(axis=1)
daily = daily[['Total']]

day_names = ['Mon', 'Tue', 'Wed', 'Thu', 'Fri', 'Sat', 'Sun']
for i in range(7):
    daily[day_names[i]] = (daily.index.dayofweek == i).astype(float)

from pandas.tseries.holiday import USFederalHolidayCalendar as calendar
cal = calendar()
holidays = cal.holidays('2012', '2016')
daily = daily.join(pd.Series(1, index=holidays, name='holiday'))

daily['holiday'].fillna(0, inplace=True)

def hours_of_daylight(date, axis=23.44, latitude=47.61):
    """Compute the hours of daylight for the given date"""
    days = (date - datetime(2000, 12, 21)).days
    m = (1. - np.tan(np.radians(latitude))
         * np.tan(np.radians(axis) * np.cos(days * 2 * np.pi / 365.25)))
    return 24. * np.degrees(np.arccos(1 - np.clip(m, 0, 2))) / 180.

daily['daylight_hrs'] = list(map(hours_of_daylight, daily.index))

weather['TMin'] /= 10
weather['TMax'] /= 10
weather['Temp (C)'] = (weather['TMin'] + weather['TMax']) / 2

weather['PRCP'] /= 254
weather['dry day'] = (weather['PRCP'] == 0).astype(int)

daily = daily.join(weather[['PRCP', 'Temp (C)', 'dry day']])

daily['annual'] = (daily.index - daily.index[0]).days / 365.

daily.dropna(axis=0, how='any', inplace=True)
column_names = ['Mon', 'Tue', 'Wed', 'Thu', 'Fri', 'Sat', 'Sun', 'holiday', 'daylight_hrs', 'PRCP', 'Temp (C)', 'dry day', 'annual']

X = daily[column_names]
y = daily['Total']

  days = (date - pd.datetime(2000, 12, 21)).days


KeyError: 'TMin'

In [None]:
results = {}

param_grid = {'alpha': uniform()}
rsearch = RandomizedSearchCV(estimator=Ridge(), param_distributions=param_grid, cv=10)
rsearch.fit(X, y)
rbest_alpha = rsearch.best_params_['alpha']
rmean_score = rsearch.best_score_
print('Ridge: alpha = {:.3f}, score = {:.3f}'.format(rbest_alpha, rmean_score))
results['Ridge'] = rmean_score

lsearch = RandomizedSearchCV(estimator=Lasso(tol = .01), param_distributions=param_grid, cv=10)
lsearch.fit(X, y)
lbest_alpha = lsearch.best_params_['alpha']
lmean_score = lsearch.best_score_
print('Lasso: alpha = {:.3f}, score = {:.3f}'.format(lbest_alpha, lmean_score))
results['Lasso'] = lmean_score

model = LinearRegression(fit_intercept=False)
model.fit(X, y)

linear_cv = cross_val_score(model, X, y, cv=10)
lrmean_score = linear_cv.mean()
print('Linear: score = {:.3f}'.format(lrmean_score))
results['Linear'] = lrmean_score

max_score = max(results.values())
print('Best model: {}'.format([k for k, v in results.items() if v == max_score][0]))