Code for Voting Ensemble

In [4]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn import metrics
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import VotingClassifier
from sklearn import model_selection
import tensorflow as tf
from keras.models import Sequential
from keras.layers.core import Dense, Activation

In [1]:
# Load Data
data = pd.read_csv('/Users/sauce/Desktop/DraftKings/ready_data/DraftKingsCleaned.csv', header = None)

# rename columns
data = data.rename(columns={0: "season", 
                        1:'game_date',
                        2: 'player',
                        3: 'team',
                        4: 'opponent',
                        5: 'venue',
                        6: 'minutes',
                        7: 'usage_rate',
                        8: 'rest',
                        9: 'avg_threes',
                        10: 'avg_reb',
                        11: 'avg_ast',
                        12: 'avg_stl',
                        13: 'avg_blk',
                        14: 'avg_tov',
                        15: 'avg_pts',
                        16: 'avg_points_vs_opp',
                        17: 'team_pace',
                        18: 'team_ast',
                        19: 'team_tov',
                        20: 'team_reb_rate',
                        21: 'team_offeff',
                        22: 'team_defeff',
                        23: 'opp_pace',
                        24: 'opp_ast',
                        25: 'opp_tov',
                        26: 'opp_reb_rate',
                        27: 'opp_offeff',
                        28: 'opp_defeff',
                        29: 'opp_pos_avg',
                        30: 'salary',
                        31: 'fantasy_points',
                       })

# Avg 10 data
data["game_date"] = pd.to_datetime(data.game_date)
data.set_index('game_date', inplace=True)
data.sort_index(inplace=True)
df_rolling = data.groupby(['player']).rolling(10).mean().rename(columns={'season':'season1', 'player':'player1'}).reset_index()
data = data.reset_index()
df_rolling = df_rolling.drop(columns=['player'])
df_rolling = df_rolling.rename(columns = {'season1': 'season', 'player1': 'player'})
data = pd.merge(data, df_rolling, on=['player', 'season', 'game_date'], left_index= True , suffixes=['', '_AVG10'])

# Avg 3 data
data["game_date"] = pd.to_datetime(data.game_date)
data.set_index('game_date', inplace=True)
data.sort_index(inplace=True)
df_rolling = data.groupby(['player']).rolling(3).mean().rename(columns={'season':'season1', 'player':'player1'}).reset_index()
data = data.reset_index()
df_rolling = df_rolling.drop(columns=['player'])
df_rolling = df_rolling.rename(columns = {'season1': 'season', 'player1': 'player'})
df = pd.merge(data, df_rolling, on=['player', 'season', 'game_date'], left_index= True , suffixes=['', '_AVG3'])

# Drop useless features
df = df.drop(columns=['team_AVG10','opponent_AVG10','venue_AVG10','rest_AVG10',
                      'avg_threes_AVG10','avg_reb_AVG10','avg_ast_AVG10','avg_stl_AVG10',
                      'avg_blk_AVG10','avg_tov_AVG10','avg_pts_AVG10','avg_points_vs_opp_AVG10',
                      'team_pace_AVG10','team_ast_AVG10','team_tov_AVG10','team_reb_rate_AVG10',
                      'team_defeff_AVG10','opp_pace_AVG10','opp_ast_AVG10','opp_tov_AVG10',
                      'opp_reb_rate_AVG10','opp_offeff_AVG10','salary_AVG10',
                      'team_AVG3', 'opponent_AVG3', 'venue_AVG3','rest_AVG3',
                      'avg_threes_AVG3', 'avg_reb_AVG3','avg_ast_AVG3', 'avg_stl_AVG3', 
                      'avg_blk_AVG3', 'avg_tov_AVG3','avg_pts_AVG3', 
                      'avg_points_vs_opp_AVG3', 'team_pace_AVG3','team_ast_AVG3', 
                      'team_tov_AVG3', 'team_reb_rate_AVG3','team_defeff_AVG3', 
                      'opp_pace_AVG3','opp_ast_AVG3', 'opp_tov_AVG3', 'opp_reb_rate_AVG3',
                      'opp_offeff_AVG3', 'opp_pos_avg_AVG3','salary_AVG3',
                      'team_AVG10_AVG3','opponent_AVG10_AVG3', 'venue_AVG10_AVG3', 
                      'minutes_AVG10_AVG3','usage_rate_AVG10_AVG3', 'rest_AVG10_AVG3',
                      'avg_threes_AVG10_AVG3', 'avg_reb_AVG10_AVG3',
                      'avg_ast_AVG10_AVG3', 'avg_stl_AVG10_AVG3', 'avg_blk_AVG10_AVG3',
                      'avg_tov_AVG10_AVG3', 'avg_pts_AVG10_AVG3',
                      'avg_points_vs_opp_AVG10_AVG3', 'team_pace_AVG10_AVG3',
                      'team_ast_AVG10_AVG3', 'team_tov_AVG10_AVG3',
                      'team_reb_rate_AVG10_AVG3', 'team_offeff_AVG10_AVG3',
                      'team_defeff_AVG10_AVG3', 'opp_pace_AVG10_AVG3',
                      'opp_ast_AVG10_AVG3', 'opp_tov_AVG10_AVG3',
                      'opp_reb_rate_AVG10_AVG3', 'opp_offeff_AVG10_AVG3',
                      'opp_defeff_AVG10_AVG3', 'opp_pos_avg_AVG10_AVG3',
                      'salary_AVG10_AVG3', 'fantasy_points_AVG10_AVG3'])

# Fill NAs
df['fantasy_points_AVG3'] = df['fantasy_points_AVG3'].fillna(df.groupby(['player', 'season'])['fantasy_points'].transform('mean'))
df['opp_defeff_AVG3'] = df['opp_defeff_AVG3'].fillna(df.groupby(['opponent', 'season'])['opp_defeff'].transform('mean'))
df['team_offeff_AVG3'] = df['team_offeff_AVG3'].fillna(df.groupby(['team', 'season'])['team_offeff'].transform('mean'))
df['usage_rate_AVG3'] = df['usage_rate_AVG3'].fillna(df.groupby(['player', 'season'])['usage_rate'].transform('mean'))
df['fantasy_points_AVG10'] = df['fantasy_points_AVG10'].fillna(df.groupby(['player', 'season'])['fantasy_points'].transform('mean'))
df['minutes_AVG3'] = df['minutes_AVG3'].fillna(df.groupby(['player', 'season'])['minutes'].transform('mean'))
df['opp_pos_avg_AVG10'] = df['opp_pos_avg_AVG10'].fillna(df.groupby(['opponent', 'season'])['opp_pos_avg'].transform('mean'))
df['opp_defeff_AVG10'] = df['opp_defeff_AVG10'].fillna(df.groupby(['opponent', 'season'])['opp_defeff'].transform('mean'))
df['team_offeff_AVG10'] = df['team_offeff_AVG10'].fillna(df.groupby(['team', 'season'])['team_offeff'].transform('mean'))
df['usage_rate_AVG10'] = df['usage_rate_AVG10'].fillna(df.groupby(['player', 'season'])['usage_rate'].transform('mean'))
df['minutes_AVG10'] = df['minutes_AVG10'].fillna(df.groupby(['player', 'season'])['minutes'].transform('mean'))

# Drop non predictive columns 
df2 = df.drop(columns=['game_date', 'season', 'player', 'team', 'opponent',
                     'minutes', 'usage_rate', 'salary'])
# Encode Dummies
df2 = pd.get_dummies(df2, columns=['venue', 'rest'], drop_first=True)

# Grab Target Variable and remove it from data.
y = df2['fantasy_points']
X = df2.drop(columns = ['fantasy_points'])

# Split data into train and test sets
from sklearn.cross_validation import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 0)

# Scale data
from sklearn.preprocessing import StandardScaler
sc_X = StandardScaler()
X_train = sc_X.fit_transform(X_train)
X_test = sc_X.transform(X_test)



### Multiple Linear Regression

In [2]:
import statsmodels.formula.api as sm
X = np.append(arr = np.ones((45576, 1)).astype(int), values = X, axis = 1)

X_opt = X[:, [1,2,3,4,5,7,8,
             13,14,19,20,
             21,22,26,27,28,30,
             31,32,33,36]]
regressor_OLS = sm.OLS(endog = y, exog = X_opt).fit()

from sklearn.cross_validation import train_test_split
X_train2, X_test2, y_train2, y_test2 = train_test_split(X_opt, y, test_size = 0.2, random_state = 0)

from sklearn.linear_model import LinearRegression
Mregressor2 = LinearRegression(normalize=False)
Mregressor2.fit(X_train2, y_train2)

lin_pred = Mregressor2.predict(X_test2)

### ANN Regression

In [25]:
from keras import optimizers
from keras.layers import Dropout

optimizer=optimizers.SGD(lr=1e-4)


model_drop = Sequential()
model_drop.add(Dense(41, input_dim=41, kernel_initializer='normal', activation='relu'))
model_drop.add(Dropout(0.2))
model_drop.add(Dense(20, kernel_initializer='normal', activation='relu')) 
model_drop.add(Dense(1, kernel_initializer='normal'))
model_drop.compile(loss='mean_squared_error', optimizer = optimizer)

mod_drop = model_drop.fit(X_train, y_train,
                         batch_size = 25,
                         epochs = 50,
                         verbose = 2,
                         validation_data=(X_test, y_test))

ANN_pred = model_drop.predict(X_test)

Train on 36460 samples, validate on 9116 samples
Epoch 1/50
 - 3s - loss: 255.5100 - val_loss: 46.5855
Epoch 2/50
 - 2s - loss: 46.0071 - val_loss: 36.3568
Epoch 3/50
 - 2s - loss: 41.4589 - val_loss: 35.2787
Epoch 4/50
 - 2s - loss: 40.4326 - val_loss: 34.5826
Epoch 5/50
 - 2s - loss: 39.7488 - val_loss: 34.6276
Epoch 6/50
 - 2s - loss: 39.4493 - val_loss: 34.5340
Epoch 7/50
 - 2s - loss: 38.8817 - val_loss: 34.6160
Epoch 8/50
 - 2s - loss: 38.9252 - val_loss: 34.2049
Epoch 9/50
 - 3s - loss: 38.7294 - val_loss: 34.3314
Epoch 10/50
 - 2s - loss: 38.8584 - val_loss: 34.2864
Epoch 11/50
 - 2s - loss: 38.6206 - val_loss: 34.3625
Epoch 12/50
 - 2s - loss: 38.4764 - val_loss: 34.2131
Epoch 13/50
 - 2s - loss: 38.6460 - val_loss: 34.2649
Epoch 14/50
 - 2s - loss: 38.5716 - val_loss: 34.3056
Epoch 15/50
 - 2s - loss: 38.2862 - val_loss: 34.4448
Epoch 16/50
 - 2s - loss: 38.1811 - val_loss: 34.2984
Epoch 17/50
 - 2s - loss: 38.0248 - val_loss: 34.2335
Epoch 18/50
 - 2s - loss: 38.0551 - val_l

In [6]:
from sklearn.ensemble import RandomForestRegressor
regressor = RandomForestRegressor(n_estimators = 500, min_samples_leaf = 3,  random_state = 0, n_jobs = -1, min_samples_split = 2) 
regressor.fit(X_train, y_train)
rf_pred = regressor.predict(X_test)

In [31]:
#unnest as nested list
ANN_pred_list = ANN_pred.tolist()
from itertools import chain
ann_pred = list(chain(*ANN_pred_list))

In [36]:
# Create Dataframe of predictions
d = {'lin_rg': lin_pred, 'ANN_pred': ann_pred, 'rf_pred': rf_pred}
df = pd.DataFrame(data=d)
df.head()

Unnamed: 0,ANN_pred,lin_rg,rf_pred
0,22.109913,22.690372,23.735803
1,4.425423,1.838132,3.989539
2,28.296963,29.203776,29.457364
3,36.759995,36.787782,39.593445
4,11.014505,11.532207,9.862923


In [37]:
# Average of all predictions
df['mean'] = df.mean(axis=1)
df.head()

Unnamed: 0,ANN_pred,lin_rg,rf_pred,mean
0,22.109913,22.690372,23.735803,22.845363
1,4.425423,1.838132,3.989539,3.417698
2,28.296963,29.203776,29.457364,28.986034
3,36.759995,36.787782,39.593445,37.71374
4,11.014505,11.532207,9.862923,10.803212


In [40]:
# Create Array of mean predictions
mean = df['mean'].values

In [41]:
mean

array([22.84536259,  3.4176982 , 28.98603435, ..., 23.99145243,
        3.66802872, 27.06220759])

In [42]:
from sklearn import metrics
print('Mean Squared Error:', metrics.mean_squared_error(y_test, mean))
print('R2 Score:', metrics.r2_score(y_test, mean))

Mean Squared Error: 34.15454641040592
R2 Score: 0.8262492399786975


### Without Random Forest

In [43]:
d2 = {'lin_rg': lin_pred, 'ANN_pred': ann_pred}
df2 = pd.DataFrame(data=d2)
df2.head()

df2['mean'] = df2.mean(axis=1)
mean2 = df['mean'].values

print('Mean Squared Error:', metrics.mean_squared_error(y_test, mean2))
print('R2 Score:', metrics.r2_score(y_test, mean2))

Mean Squared Error: 34.15454641040592
R2 Score: 0.8262492399786975
