In [36]:
import pandas as pd
import numpy as np

import datetime as dt

from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from sklearn.utils import resample

import statsmodels.api as sm
import scipy.stats as stats

import pickle


import os
import sys
module_path = os.path.abspath(os.path.join('..'))
if module_path not in sys.path:
    sys.path.append(module_path)

    
from src.scrapers import scrape_huskies, scrape_seahawks
from src.data_retrievers import DataRetrieval
from src.holiday_calendars import SeattleHolidays
from src.featurizers import CountCalls, FeaturizeCalls, DateDummies, HolidayDummies, EventDummies, MakeDummies, JoinDataFrames

In [5]:
import pandas as pd
import numpy as np

from sklearn.pipeline import Pipeline

calls_pipe = Pipeline(steps=[
    ('counter', CountCalls(how='neighborhood')),
    ('feturizer', FeaturizeCalls()),
    ('date_dummifier', DateDummies())
])

In [6]:
retriever = DataRetrieval()

In [7]:
calls = retriever.get_calls_data()

In [8]:
data = calls_pipe.fit(calls)

In [9]:
calls_w_features = data.transform(calls)

In [147]:
calls_w_features

Unnamed: 0,date,neighborhood,num_calls,dt_time,day_seq,year,month,day,day_of_week,month_day,...,spec_day_11/15,spec_day_11/16,spec_day_11/23,spec_day_11/27,spec_day_12/04,spec_day_12/12,spec_day_12/19,spec_day_12/21,spec_day_12/23,spec_day_12/29
0,2010-01-01,ALASKA JUNCTION,1.0,2010-01-01,0,2010,01/01,1,4,01/01,...,0,0,0,0,0,0,0,0,0,0
1,2010-01-02,ALKI,0.0,2010-01-02,1,2010,01/02,2,5,01/02,...,0,0,0,0,0,0,0,0,0,0
2,2010-01-03,BALLARD NORTH,1.0,2010-01-03,2,2010,01/03,3,6,01/03,...,0,0,0,0,0,0,0,0,0,0
3,2010-01-04,BALLARD SOUTH,0.0,2010-01-04,3,2010,01/04,4,0,01/04,...,0,0,0,0,0,0,0,0,0,0
4,2010-01-05,BELLTOWN,0.0,2010-01-05,4,2010,01/05,5,1,01/05,...,0,0,0,0,0,0,0,0,0,0
5,2010-01-06,BITTERLAKE,1.0,2010-01-06,5,2010,01/06,6,2,01/06,...,0,0,0,0,0,0,0,0,0,0
6,2010-01-07,BRIGHTON/DUNLAP,1.0,2010-01-07,6,2010,01/07,7,3,01/07,...,0,0,0,0,0,0,0,0,0,0
7,2010-01-08,CAPITOL HILL,1.0,2010-01-08,7,2010,01/08,8,4,01/08,...,0,0,0,0,0,0,0,0,0,0
8,2010-01-09,CENTRAL AREA/SQUIRE PARK,0.0,2010-01-09,8,2010,01/09,9,5,01/09,...,0,0,0,0,0,0,0,0,0,0
9,2010-01-10,CHINATOWN/INTERNATIONAL DISTRICT,0.0,2010-01-10,9,2010,01/10,10,6,01/10,...,0,0,0,0,0,0,0,0,0,0


In [10]:
weather = retriever.get_weather_data()

In [11]:
seahawks_schedule = retriever.get_seahawks_schedule()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  df['date'][i] = (f"{df.iloc[i]['Date']}, {df.iloc[i]['year']}")
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  df['seahawks_game'][i] = 'home_Regular'
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  df['seahawks_game'][i] = 'away_Regular'
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  df['seahawks_game'][i] = 'home_Playoffs'
A value is trying to be set on a copy of a slice fr

In [12]:
huskies_schedule = retriever.get_huskies_schedule()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  df['huskies_game'][i] = 'away'
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  df['huskies_game'][i] = 'home'


In [13]:
sounders_schedule = retriever.get_sounders_schedule()

In [14]:
sports = MakeDummies()

In [15]:
sports.fit(seahawks_schedule)

<src.featurizers.MakeDummies at 0x1c1bae6f98>

In [16]:
seahawks = sports.transform()

In [17]:
sports.fit(huskies_schedule)

<src.featurizers.MakeDummies at 0x1c1bae6f98>

In [18]:
huskies = sports.transform()

In [19]:
sports.fit(sounders_schedule)

<src.featurizers.MakeDummies at 0x1c1bae6f98>

In [20]:
sounders = sports.transform()

In [21]:
us_holiday_dict = SeattleHolidays.CustomHolidays()

In [22]:
us_holiday_dict._populate()

In [23]:
holidayier = HolidayDummies()

In [24]:
holidayier.fit(us_holiday_dict)

<src.featurizers.HolidayDummies at 0x1c1bae6c88>

In [25]:
us_holidays = holidayier.transform()

In [26]:
jewish_holiday_dict = SeattleHolidays.JewishHolidays()

In [27]:
jewish_holiday_dict._populate()

In [28]:
holidayier.fit(jewish_holiday_dict)

<src.featurizers.HolidayDummies at 0x1c1bae6c88>

In [29]:
jewish_holidays = holidayier.transform()

In [30]:
islamic_holiday_dict = SeattleHolidays.IslamicHolidays()

In [31]:
islamic_holiday_dict._populate()





















































































































































































In [32]:
holidayier.fit(islamic_holiday_dict)

<src.featurizers.HolidayDummies at 0x1c1bae6c88>

In [33]:
islamic_holidays = holidayier.transform()

In [37]:
event_dummies = EventDummies()

In [38]:
event_dummies.fit()

<src.featurizers.EventDummies at 0x110d1b7f0>

In [39]:
events = event_dummies.transform()

In [40]:
joiner = JoinDataFrames(weather, us_holidays, islamic_holidays, jewish_holidays,
                        events, seahawks, huskies, sounders)

In [41]:
joiner.fit(calls_w_features)

<src.featurizers.JoinDataFrames at 0x110cf1be0>

In [42]:
calls_neighborhood = joiner.transform()

# Baseline Modle

In [93]:
b_targets = calls_neighborhood.pivot_table(values='num_calls',index='date', columns='neighborhood')

In [94]:
b_features = calls_neighborhood['day_seq'].drop_duplicates()

In [95]:
X_train_b, X_test_b, y_train_b, y_test_b =  train_test_split(b_features, b_targets,
                                                                 test_size=0.2, random_state=157)

X_train_b = np.array(X_train_b).reshape(-1, 1)
X_test_b = np.array(X_test_b).reshape(-1, 1)

In [96]:
neighborhood_model_b = LinearRegression()

In [97]:
neighborhood_model_b.fit(X_train_b, y_train_b)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)

In [98]:
neighborhood_b_train_predictions = neighborhood_model_b.predict(X_train_b)
neighborhood_b_train_mse = mean_squared_error(y_train_b, neighborhood_b_train_predictions)

neighborhood_b_test_predictions = neighborhood_model_b.predict(X_test_b)
neighborhood_b_test_mse = mean_squared_error(y_test_b, neighborhood_b_test_predictions)

In [99]:
neighborhood_b_train_mse, neighborhood_b_test_mse

(0.5616549646134867, 0.5672616233277288)

In [100]:
neighborhood_model_b.score(X_train_b, y_train_b)

0.004964196467862284

In [50]:
calls_neighborhood.columns[5:43]

Index(['year', 'month', 'day', 'day_of_week', 'month_day', 'month_weekday',
       'spec_day', 'day_1', 'day_2', 'day_3', 'day_4', 'day_5', 'day_6',
       'day_7', 'day_8', 'day_9', 'day_10', 'day_11', 'day_12', 'day_13',
       'day_14', 'day_15', 'day_16', 'day_17', 'day_18', 'day_19', 'day_20',
       'day_21', 'day_22', 'day_23', 'day_24', 'day_25', 'day_26', 'day_27',
       'day_28', 'day_29', 'day_30', 'day_31'],
      dtype='object')

# Random Forest Simplified


In [139]:
targets = calls_neighborhood.pivot_table(values='num_calls',index='date', columns='neighborhood')

features_cols = list(calls_neighborhood.columns[12:127])
features_cols.insert(0, 'day_seq')

features = calls_neighborhood[features_cols].drop_duplicates()

In [140]:
features_cols

['day_seq',
 'day_1',
 'day_2',
 'day_3',
 'day_4',
 'day_5',
 'day_6',
 'day_7',
 'day_8',
 'day_9',
 'day_10',
 'day_11',
 'day_12',
 'day_13',
 'day_14',
 'day_15',
 'day_16',
 'day_17',
 'day_18',
 'day_19',
 'day_20',
 'day_21',
 'day_22',
 'day_23',
 'day_24',
 'day_25',
 'day_26',
 'day_27',
 'day_28',
 'day_29',
 'day_30',
 'day_31',
 'month_weekday_Apr_Fri',
 'month_weekday_Apr_Mon',
 'month_weekday_Apr_Sat',
 'month_weekday_Apr_Sun',
 'month_weekday_Apr_Thu',
 'month_weekday_Apr_Tue',
 'month_weekday_Apr_Wed',
 'month_weekday_Aug_Fri',
 'month_weekday_Aug_Mon',
 'month_weekday_Aug_Sat',
 'month_weekday_Aug_Sun',
 'month_weekday_Aug_Thu',
 'month_weekday_Aug_Tue',
 'month_weekday_Aug_Wed',
 'month_weekday_Dec_Fri',
 'month_weekday_Dec_Mon',
 'month_weekday_Dec_Sat',
 'month_weekday_Dec_Sun',
 'month_weekday_Dec_Thu',
 'month_weekday_Dec_Tue',
 'month_weekday_Dec_Wed',
 'month_weekday_Feb_Fri',
 'month_weekday_Feb_Mon',
 'month_weekday_Feb_Sat',
 'month_weekday_Feb_Sun',
 'mont

In [141]:
X_train, X_test, y_train, y_test =  train_test_split(features, targets,
                                                     test_size=0.2, random_state=157)


In [142]:
rf = RandomForestRegressor(n_estimators=5000, max_features='auto', random_state=157, n_jobs=-1)

In [143]:
rf.fit(X_train, y_train)
# ~3 minutes for 10k
# ~10 minutes for 20k
                                     
# Test Prediction
train_pred = rf.predict(X_train)
test_pred = rf.predict(X_test)

In [144]:
mean_squared_error(y_train, train_pred), mean_squared_error(y_test, test_pred)

(0.08176362588048067, 0.6058852219474286)

In [145]:
rf.score(X_train, y_train), rf.score(X_test, y_test)

(0.8551464149637261, -0.06680640382574962)

# Random Forest


In [194]:
targets = calls_neighborhood.pivot_table(values='num_calls',index='date', columns='neighborhood')

features = calls_neighborhood.drop(columns=['neighborhood', 'date', 'num_calls', 'dt_time','year', 
                                                    'month', 'day','day_of_week', 'month_day','month_weekday',
                                                    'spec_day']).drop_duplicates()

In [195]:
X_train, X_test, y_train, y_test =  train_test_split(features, targets,
                                                     test_size=0.2, random_state=157)


In [43]:
rf = RandomForestRegressor(n_estimators=20000, max_features='auto', random_state=157, n_jobs=-1)

In [44]:
rf.fit(X_train, y_train)
# ~3 minutes for 10k
# ~10 minutes for 20k
                                     
# Test Prediction
train_pred = rf.predict(X_train)
test_pred = rf.predict(X_test)

In [45]:
mean_squared_error(y_train, train_pred), mean_squared_error(y_test, test_pred)

(0.0789010307306835, 0.5844703742254185)

In [46]:
rf.score(X_train, y_train), rf.score(X_test, y_test)

(0.8602178286331945, -0.029100422792966277)

In [49]:
import os.path

file_path = "rf1.pkl"
n_bytes = 2**31
max_bytes = 2**31 - 1
data = bytearray(n_bytes)

## write
bytes_out = pickle.dumps(rf)
with open(file_path, 'wb') as f_out:
    for idx in range(0, len(bytes_out), max_bytes):
        f_out.write(bytes_out[idx:idx+max_bytes])

In [None]:
rf.score

In [203]:
rf_city = RandomForestRegressor(n_estimators=10000, max_features='auto', random_state=157, n_jobs=-1)

In [204]:
 rf_city.fit(X_train, y_train.sum(axis=1))

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=10000, n_jobs=-1,
           oob_score=False, random_state=157, verbose=0, warm_start=False)

In [205]:
train_pred_city = rf_city.predict(X_train)
test_pred_city = rf_city.predict(X_test)

mean_squared_error(y_train.sum(axis=1), train_pred_city), mean_squared_error(y_test.sum(axis=1), test_pred_city)

(5.040598607883411, 36.90957163688576)

In [206]:
y_train.sum(axis=1).mean(), np.sqrt(mean_squared_error(y_test.sum(axis=1), test_pred_city))

(30.102503912363066, 6.075324817397483)

In [207]:
rf_city.score(X_train, y_train.sum(axis=1)), rf_city.score(X_test, y_test.sum(axis=1))

(0.8795021306097589, 0.06695108307653952)

In [208]:
np.mean(np.array(y_train)), np.sqrt(mean_squared_error(y_test, test_pred))

(0.5102119307180181, 0.778386293524898)

In [263]:
neighborhood_dist_train = pd.DataFrame(np.array(y_train.T) / np.array(y_train.sum(axis=1))).T
neighborhood_dist_test = pd.DataFrame(np.array(y_test.T) / np.array(y_test.sum(axis=1))).T

In [249]:
rf_dist = RandomForestRegressor(n_estimators=5000, max_features='auto', random_state=157, n_jobs=-1)

In [265]:
rf_dist.fit(X_train, neighborhood_dist_train)

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=5000, n_jobs=-1,
           oob_score=False, random_state=157, verbose=0, warm_start=False)

In [271]:
train_pred_dist = rf_dist.predict(X_train)
test_pred_dist = rf_dist.predict(X_test)

mean_squared_error(neighborhood_dist_train, train_pred_dist), mean_squared_error(neighborhood_dist_test, test_pred_dist)

(8.904109288427514e-05, 0.0006596607952942548)

In [276]:
np.sqrt(0.00065966)

0.025683847063864867

In [275]:
rf_dist.score(X_train, neighborhood_dist_train), rf_dist.score(X_test, neighborhood_dist_test)

(0.8587596759517816, -0.04066013616426118)

In [285]:
train_pred_comb = train_pred_city * neighborhood_dist_train.T
test_pred_comb = test_pred_city * neighborhood_dist_test.T


In [290]:

mean_squared_error(y_train, train_pred_comb.T), mean_squared_error(y_test, test_pred_comb.T)

(0.005543797252282832, 0.040403060819437286)

In [291]:
rf.score(X_train, train_pred_comb), rf.score(X_test, test_pred_comb)

ValueError: Number of features of the model must match the input. Model n_features is 116 and input n_features is 277 