In [90]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn import metrics
from sklearn.model_selection import cross_val_score

In [115]:
# Load data
# The CSV is read without a header row, so pandas labels the columns with
# integers; labels 0-32 are renamed positionally from COLUMN_NAMES below.
# NOTE(review): the path is absolute and machine-specific — consider a
# configurable data directory.
COLUMN_NAMES = [
    # game / player identity
    'season', 'game_date', 'player', 'position', 'team', 'opponent', 'venue',
    # playing time and schedule
    'minutes', 'usage_rate', 'rest',
    # player averages
    'avg_threes', 'avg_reb', 'avg_ast', 'avg_stl', 'avg_blk', 'avg_tov',
    'avg_pts', 'avg_points_vs_opp',
    # own-team statistics
    'team_pace', 'team_ast', 'team_tov', 'team_reb_rate', 'team_offeff',
    'team_defeff',
    # opponent statistics
    'opp_pace', 'opp_ast', 'opp_tov', 'opp_reb_rate', 'opp_offeff',
    'opp_defeff', 'opp_pos_avg',
    # salary and regression target
    'salary', 'fantasy_points',
]

data = pd.read_csv(
    '/Users/sauce/Desktop/DraftKings/ready_data/DraftKingsCleaned.csv',
    header=None,
).rename(columns=dict(enumerate(COLUMN_NAMES)))

# Rolling per-player features (10-game and 3-game means)
#
# Only the rolling columns the model keeps are computed, and they are appended
# in the same order as before (AVG10 block first, then AVG3), so positional
# column indices used further down stay valid.
#
# This replaces a groupby/rolling/merge round-trip that
#   * passed both `on=` and `left_index=True` to pd.merge, which current
#     pandas rejects with a MergeError,
#   * took rolling means over string columns (team, venue, ...), which only
#     works on old pandas versions that passed such columns through, and
#   * built rolling columns for every field only to drop most of them again.
#
# NOTE(review): each rolling mean includes the current game, so e.g.
# fantasy_points_AVG3 contains the target value of its own row. If these are
# meant to be pre-game features, shift by one game within each player —
# confirm intent before changing, as it alters every downstream result.

def add_rolling_means(games, window, columns, suffix):
    """Return a copy of `games` with per-player rolling means appended.

    Parameters
    ----------
    games : DataFrame
        Must contain a 'player' column, be in chronological order and have a
        unique index.
    window : int
        Number of consecutive games averaged.
    columns : list of str
        Numeric columns to average; results are appended in this order.
    suffix : str
        Appended to each source column name to form the new column name.

    Returns
    -------
    DataFrame
        Same rows as `games`. A player's first ``window - 1`` games get NaN
        because the rolling window is not yet full.
    """
    out = games.copy()
    by_player = games.groupby('player')
    for col in columns:
        out[col + suffix] = by_player[col].transform(
            lambda s: s.rolling(window).mean()
        )
    return out


AVG10_COLUMNS = ['minutes', 'usage_rate', 'team_offeff', 'opp_defeff',
                 'opp_pos_avg', 'fantasy_points']
AVG3_COLUMNS = ['minutes', 'usage_rate', 'team_offeff', 'opp_defeff',
                'fantasy_points']

# Chronological order is what makes "last N games" meaningful; the stable sort
# keeps same-day rows in file order. `data` itself is left unmodified.
games_by_date = (
    data.assign(game_date=pd.to_datetime(data['game_date']))
        .sort_values('game_date', kind='mergesort')
        .reset_index(drop=True)
)

df = add_rolling_means(games_by_date, 10, AVG10_COLUMNS, '_AVG10')
df = add_rolling_means(df, 3, AVG3_COLUMNS, '_AVG3')

# Fill NAs
# Each missing rolling value is replaced by the mean of its source column
# within the listed group (per season, by player, team or opponent).
# NOTE(review): these group means are taken over all rows of the season, so
# the fallback can include games later than the row being filled.
NA_FILLS = [
    # (column to fill,        group keys,             source column)
    ('fantasy_points_AVG3',  ['player', 'season'],   'fantasy_points'),
    ('opp_defeff_AVG3',      ['opponent', 'season'], 'opp_defeff'),
    ('team_offeff_AVG3',     ['team', 'season'],     'team_offeff'),
    ('usage_rate_AVG3',      ['player', 'season'],   'usage_rate'),
    ('fantasy_points_AVG10', ['player', 'season'],   'fantasy_points'),
    ('minutes_AVG3',         ['player', 'season'],   'minutes'),
    ('opp_pos_avg_AVG10',    ['opponent', 'season'], 'opp_pos_avg'),
    ('opp_defeff_AVG10',     ['opponent', 'season'], 'opp_defeff'),
    ('team_offeff_AVG10',    ['team', 'season'],     'team_offeff'),
    ('usage_rate_AVG10',     ['player', 'season'],   'usage_rate'),
    ('minutes_AVG10',        ['player', 'season'],   'minutes'),
]

for fill_col, group_keys, source_col in NA_FILLS:
    group_mean = df.groupby(group_keys)[source_col].transform('mean')
    df[fill_col] = df[fill_col].fillna(group_mean)

# Drop non predictive columns, then one-hot encode the two categoricals.
# drop_first=True removes one dummy level per categorical.
NON_PREDICTIVE = ['game_date', 'minutes', 'usage_rate', 'position', 'season',
                  'player', 'team', 'opponent', 'salary']
df2 = pd.get_dummies(
    df.drop(columns=NON_PREDICTIVE),
    columns=['venue', 'rest'],
    drop_first=True,
)

# Separate the regression target from the feature matrix.
TARGET = 'fantasy_points'
y = df2[TARGET]
X = df2.drop(columns=[TARGET])

# Split data into train and test sets (80/20, fixed seed for reproducibility).
# `sklearn.cross_validation` was removed in scikit-learn 0.20; the function
# lives in `sklearn.model_selection`, which this notebook already imports from.
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

# Scale data
# from sklearn.preprocessing import StandardScaler
# sc_X = StandardScaler()
# X_train = sc_X.fit_transform(X_train)
# X_test = sc_X.transform(X_test)

In [92]:
# Inspect the columns of the encoded model frame (the target is still included here).
df2.columns.values

array(['avg_threes', 'avg_reb', 'avg_ast', 'avg_stl', 'avg_blk',
       'avg_tov', 'avg_pts', 'avg_points_vs_opp', 'team_pace', 'team_ast',
       'team_tov', 'team_reb_rate', 'team_offeff', 'team_defeff',
       'opp_pace', 'opp_ast', 'opp_tov', 'opp_reb_rate', 'opp_offeff',
       'opp_defeff', 'opp_pos_avg', 'fantasy_points', 'minutes_AVG10',
       'usage_rate_AVG10', 'team_offeff_AVG10', 'opp_defeff_AVG10',
       'opp_pos_avg_AVG10', 'fantasy_points_AVG10', 'minutes_AVG3',
       'usage_rate_AVG3', 'team_offeff_AVG3', 'opp_defeff_AVG3',
       'fantasy_points_AVG3', 'venue_R', 'rest_2', 'rest_3+', 'rest_3IN4',
       'rest_3IN4-B2B', 'rest_4IN5', 'rest_4IN5-B2B', 'rest_5IN5-B2B2B',
       'rest_B2B'], dtype=object)

In [93]:
# Fit MLR Model
from sklearn.linear_model import LinearRegression
# The `normalize` argument was deprecated in scikit-learn 1.0 and removed in
# 1.2, so passing it raises a TypeError on current releases. An unregularised
# least-squares fit is equivariant to column scaling, so dropping it is not
# expected to change predictions.
# NOTE(review): the OLS summaries later in this notebook report a very large
# condition number, so the design is (near-)collinear — re-check the test
# metrics after re-running.
Mregressor = LinearRegression()
Mregressor.fit(X_train, y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=True)

In [94]:
# Make Predictions
# Score the held-out test rows with the linear model fitted on the training split.
y_pred = Mregressor.predict(X_test)

In [95]:
# Evaluate the full model on the held-out split: mean squared error and R2.
from sklearn import metrics

test_mse = metrics.mean_squared_error(y_test, y_pred)
test_r2 = metrics.r2_score(y_test, y_pred)
print('Mean Squared Error:', test_mse)
print('R2 Score:', test_r2)

Mean Squared Error: 34.303077046694625
R2 Score: 0.8309063763918545


In [96]:
import pickle

# Persist the fitted full model. The context manager closes the file handle
# (and flushes it to disk) even if pickling fails; the previous bare open()
# left the handle to be closed whenever the garbage collector got to it.
filename = 'linreg_model.sav'
with open(filename, 'wb') as model_file:
    pickle.dump(Mregressor, model_file)

### Backwards Elimination

In [97]:
# Feature-matrix dimensions (rows, features) before the constant column is prepended.
X.shape

(47738, 41)

In [98]:
# Feature names in column order. Once the constant column is prepended in the
# next cell, feature k in this listing (0-based) sits at position k + 1 of X.
X.columns.values

array(['avg_threes', 'avg_reb', 'avg_ast', 'avg_stl', 'avg_blk',
       'avg_tov', 'avg_pts', 'avg_points_vs_opp', 'team_pace', 'team_ast',
       'team_tov', 'team_reb_rate', 'team_offeff', 'team_defeff',
       'opp_pace', 'opp_ast', 'opp_tov', 'opp_reb_rate', 'opp_offeff',
       'opp_defeff', 'opp_pos_avg', 'minutes_AVG10', 'usage_rate_AVG10',
       'team_offeff_AVG10', 'opp_defeff_AVG10', 'opp_pos_avg_AVG10',
       'fantasy_points_AVG10', 'minutes_AVG3', 'usage_rate_AVG3',
       'team_offeff_AVG3', 'opp_defeff_AVG3', 'fantasy_points_AVG3',
       'venue_R', 'rest_2', 'rest_3+', 'rest_3IN4', 'rest_3IN4-B2B',
       'rest_4IN5', 'rest_4IN5-B2B', 'rest_5IN5-B2B2B', 'rest_B2B'],
      dtype=object)

In [99]:
# Use statsmodels.api for the array-based OLS class: newer statsmodels releases
# no longer expose `OLS` from statsmodels.formula.api (only the lowercase
# formula helpers such as `ols`).
import statsmodels.api as sm

# Prepend a column of ones (the intercept term) as column 0 of the design matrix.
# The isinstance guard makes the cell safe to re-run: once X is already the
# NumPy design matrix, a second run no longer prepends another constant column.
# Casting to float keeps the result numeric even when the dummy columns are
# booleans, as produced by newer pandas.
sop = len(X)
if isinstance(X, pd.DataFrame):
    X = np.append(arr=np.ones((sop, 1)), values=np.asarray(X, dtype=float), axis=1)

In [100]:
# Starting point for backward elimination: every column of X, i.e. the
# constant at position 0 followed by the 41 features (positions 0..41).
X_opt = X[:, list(range(42))]
regressor_OLS = sm.OLS(y, X_opt).fit()
regressor_OLS.summary()

0,1,2,3
Dep. Variable:,fantasy_points,R-squared:,0.829
Model:,OLS,Adj. R-squared:,0.829
Method:,Least Squares,F-statistic:,6610.0
Date:,"Thu, 09 Aug 2018",Prob (F-statistic):,0.0
Time:,17:21:57,Log-Likelihood:,-152310.0
No. Observations:,47738,AIC:,304700.0
Df Residuals:,47702,BIC:,305000.0
Df Model:,35,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,4.2004,2.354,1.785,0.074,-0.413,8.814
x1,-0.1517,0.056,-2.686,0.007,-0.262,-0.041
x2,-0.5140,0.023,-22.533,0.000,-0.559,-0.469
x3,-0.5816,0.037,-15.902,0.000,-0.653,-0.510
x4,-0.6036,0.103,-5.882,0.000,-0.805,-0.402
x5,-0.7456,0.101,-7.416,0.000,-0.943,-0.549
x6,0.0600,0.101,0.594,0.553,-0.138,0.258
x7,-0.3696,0.015,-24.113,0.000,-0.400,-0.340
x8,0.7250,0.005,146.636,0.000,0.715,0.735

0,1,2,3
Omnibus:,1043.063,Durbin-Watson:,2.012
Prob(Omnibus):,0.0,Jarque-Bera (JB):,2254.918
Skew:,0.081,Prob(JB):,0.0
Kurtosis:,4.052,Cond. No.,2.32e+16


Between the full model above and the reduced model below, I performed backward elimination: at each step I removed the variable with the highest p-value and refit the model, repeating until all remaining variables had p-values below 0.05.

In [101]:
# Reduced candidate model. Feature names follow the X.columns listing printed
# earlier, shifted by one for the prepended constant.
# NOTE(review): position 0 (the constant) is left out, so statsmodels reports
# the uncentered R-squared; it is not comparable with the R-squared of the
# full model that included the constant — confirm dropping the intercept is
# intended.
X_opt = X[:, [
    1, 2, 3, 4, 5,    # avg_threes, avg_reb, avg_ast, avg_stl, avg_blk
    7, 8,             # avg_pts, avg_points_vs_opp
    13, 14,           # team_offeff, team_defeff
    19, 20, 21,       # opp_offeff, opp_defeff, opp_pos_avg
    22, 26, 27,       # minutes_AVG10, opp_pos_avg_AVG10, fantasy_points_AVG10
    28, 30, 31, 32,   # minutes_AVG3, team_offeff_AVG3, opp_defeff_AVG3, fantasy_points_AVG3
    33, 36,           # venue_R, rest_3IN4
]]
regressor_OLS = sm.OLS(y, X_opt).fit()
regressor_OLS.summary()

0,1,2,3
Dep. Variable:,fantasy_points,R-squared:,0.946
Model:,OLS,Adj. R-squared:,0.946
Method:,Least Squares,F-statistic:,43640.0
Date:,"Thu, 09 Aug 2018",Prob (F-statistic):,0.0
Time:,17:21:57,Log-Likelihood:,-152310.0
No. Observations:,47738,AIC:,304700.0
Df Residuals:,47719,BIC:,304800.0
Df Model:,19,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
x1,-0.1408,0.055,-2.552,0.011,-0.249,-0.033
x2,-0.5003,0.021,-23.663,0.000,-0.542,-0.459
x3,-0.5637,0.027,-21.155,0.000,-0.616,-0.511
x4,-0.5688,0.101,-5.614,0.000,-0.767,-0.370
x5,-0.7133,0.098,-7.278,0.000,-0.905,-0.521
x6,-0.3797,0.013,-30.213,0.000,-0.404,-0.355
x7,0.7249,0.005,146.676,0.000,0.715,0.735
x8,-0.2704,0.052,-5.224,0.000,-0.372,-0.169
x9,-0.0522,0.030,-1.714,0.086,-0.112,0.007

0,1,2,3
Omnibus:,1041.573,Durbin-Watson:,2.012
Prob(Omnibus):,0.0,Jarque-Bera (JB):,2251.278
Skew:,0.081,Prob(JB):,0.0
Kurtosis:,4.052,Cond. No.,1.26e+18


In [102]:
# Candidate without position 36 (rest_3IN4). Feature names follow the
# X.columns listing printed earlier, shifted by one for the prepended constant.
X_opt = X[:, [
    1, 2, 3, 4, 5,    # avg_threes, avg_reb, avg_ast, avg_stl, avg_blk
    7, 8,             # avg_pts, avg_points_vs_opp
    13, 14,           # team_offeff, team_defeff
    19, 20, 21,       # opp_offeff, opp_defeff, opp_pos_avg
    22, 26, 27,       # minutes_AVG10, opp_pos_avg_AVG10, fantasy_points_AVG10
    28, 30, 31, 32,   # minutes_AVG3, team_offeff_AVG3, opp_defeff_AVG3, fantasy_points_AVG3
    33,               # venue_R
]]
regressor_OLS = sm.OLS(y, X_opt).fit()
regressor_OLS.summary()

0,1,2,3
Dep. Variable:,fantasy_points,R-squared:,0.946
Model:,OLS,Adj. R-squared:,0.946
Method:,Least Squares,F-statistic:,46060.0
Date:,"Thu, 09 Aug 2018",Prob (F-statistic):,0.0
Time:,17:21:57,Log-Likelihood:,-152310.0
No. Observations:,47738,AIC:,304700.0
Df Residuals:,47720,BIC:,304800.0
Df Model:,18,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
x1,-0.1411,0.055,-2.559,0.011,-0.249,-0.033
x2,-0.5003,0.021,-23.660,0.000,-0.542,-0.459
x3,-0.5635,0.027,-21.150,0.000,-0.616,-0.511
x4,-0.5693,0.101,-5.619,0.000,-0.768,-0.371
x5,-0.7134,0.098,-7.279,0.000,-0.905,-0.521
x6,-0.3796,0.013,-30.208,0.000,-0.404,-0.355
x7,0.7249,0.005,146.673,0.000,0.715,0.735
x8,-0.2702,0.052,-5.221,0.000,-0.372,-0.169
x9,-0.0523,0.030,-1.719,0.086,-0.112,0.007

0,1,2,3
Omnibus:,1041.897,Durbin-Watson:,2.011
Prob(Omnibus):,0.0,Jarque-Bera (JB):,2252.315
Skew:,0.081,Prob(JB):,0.0
Kurtosis:,4.052,Cond. No.,1.43e+18


In [103]:
# Candidate without position 14 (team_defeff) but with position 36
# (rest_3IN4). Feature names follow the X.columns listing printed earlier,
# shifted by one for the prepended constant.
X_opt = X[:, [
    1, 2, 3, 4, 5,    # avg_threes, avg_reb, avg_ast, avg_stl, avg_blk
    7, 8,             # avg_pts, avg_points_vs_opp
    13,               # team_offeff
    19, 20, 21,       # opp_offeff, opp_defeff, opp_pos_avg
    22, 26, 27,       # minutes_AVG10, opp_pos_avg_AVG10, fantasy_points_AVG10
    28, 30, 31, 32,   # minutes_AVG3, team_offeff_AVG3, opp_defeff_AVG3, fantasy_points_AVG3
    33, 36,           # venue_R, rest_3IN4
]]
regressor_OLS = sm.OLS(y, X_opt).fit()
regressor_OLS.summary()

0,1,2,3
Dep. Variable:,fantasy_points,R-squared:,0.946
Model:,OLS,Adj. R-squared:,0.946
Method:,Least Squares,F-statistic:,43640.0
Date:,"Thu, 09 Aug 2018",Prob (F-statistic):,0.0
Time:,17:21:58,Log-Likelihood:,-152310.0
No. Observations:,47738,AIC:,304700.0
Df Residuals:,47719,BIC:,304800.0
Df Model:,19,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
x1,-0.1408,0.055,-2.552,0.011,-0.249,-0.033
x2,-0.5003,0.021,-23.663,0.000,-0.542,-0.459
x3,-0.5637,0.027,-21.155,0.000,-0.616,-0.511
x4,-0.5688,0.101,-5.614,0.000,-0.767,-0.370
x5,-0.7133,0.098,-7.278,0.000,-0.905,-0.521
x6,-0.3797,0.013,-30.213,0.000,-0.404,-0.355
x7,0.7249,0.005,146.676,0.000,0.715,0.735
x8,-0.2704,0.052,-5.224,0.000,-0.372,-0.169
x9,-0.2704,0.052,-5.224,0.000,-0.372,-0.169

0,1,2,3
Omnibus:,1041.573,Durbin-Watson:,2.012
Prob(Omnibus):,0.0,Jarque-Bera (JB):,2251.278
Skew:,0.081,Prob(JB):,0.0
Kurtosis:,4.052,Cond. No.,5980000000000000.0


In [104]:
# Candidate without position 36 (rest_3IN4). Feature names follow the
# X.columns listing printed earlier, shifted by one for the prepended constant.
# NOTE(review): this column set is identical to an earlier candidate cell;
# the cell looks redundant.
X_opt = X[:, [
    1, 2, 3, 4, 5,    # avg_threes, avg_reb, avg_ast, avg_stl, avg_blk
    7, 8,             # avg_pts, avg_points_vs_opp
    13, 14,           # team_offeff, team_defeff
    19, 20, 21,       # opp_offeff, opp_defeff, opp_pos_avg
    22, 26, 27,       # minutes_AVG10, opp_pos_avg_AVG10, fantasy_points_AVG10
    28, 30, 31, 32,   # minutes_AVG3, team_offeff_AVG3, opp_defeff_AVG3, fantasy_points_AVG3
    33,               # venue_R
]]
regressor_OLS = sm.OLS(y, X_opt).fit()
regressor_OLS.summary()

0,1,2,3
Dep. Variable:,fantasy_points,R-squared:,0.946
Model:,OLS,Adj. R-squared:,0.946
Method:,Least Squares,F-statistic:,46060.0
Date:,"Thu, 09 Aug 2018",Prob (F-statistic):,0.0
Time:,17:21:58,Log-Likelihood:,-152310.0
No. Observations:,47738,AIC:,304700.0
Df Residuals:,47720,BIC:,304800.0
Df Model:,18,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
x1,-0.1411,0.055,-2.559,0.011,-0.249,-0.033
x2,-0.5003,0.021,-23.660,0.000,-0.542,-0.459
x3,-0.5635,0.027,-21.150,0.000,-0.616,-0.511
x4,-0.5693,0.101,-5.619,0.000,-0.768,-0.371
x5,-0.7134,0.098,-7.279,0.000,-0.905,-0.521
x6,-0.3796,0.013,-30.208,0.000,-0.404,-0.355
x7,0.7249,0.005,146.673,0.000,0.715,0.735
x8,-0.2702,0.052,-5.221,0.000,-0.372,-0.169
x9,-0.0523,0.030,-1.719,0.086,-0.112,0.007

0,1,2,3
Omnibus:,1041.897,Durbin-Watson:,2.011
Prob(Omnibus):,0.0,Jarque-Bera (JB):,2252.315
Skew:,0.081,Prob(JB):,0.0
Kurtosis:,4.052,Cond. No.,1.43e+18


In [105]:
# Candidate without position 21 (opp_pos_avg), keeping positions 14 and 36.
# Feature names follow the X.columns listing printed earlier, shifted by one
# for the prepended constant.
X_opt = X[:, [
    1, 2, 3, 4, 5,    # avg_threes, avg_reb, avg_ast, avg_stl, avg_blk
    7, 8,             # avg_pts, avg_points_vs_opp
    13, 14,           # team_offeff, team_defeff
    19, 20,           # opp_offeff, opp_defeff
    22, 26, 27,       # minutes_AVG10, opp_pos_avg_AVG10, fantasy_points_AVG10
    28, 30, 31, 32,   # minutes_AVG3, team_offeff_AVG3, opp_defeff_AVG3, fantasy_points_AVG3
    33, 36,           # venue_R, rest_3IN4
]]
regressor_OLS = sm.OLS(y, X_opt).fit()
regressor_OLS.summary()

0,1,2,3
Dep. Variable:,fantasy_points,R-squared:,0.946
Model:,OLS,Adj. R-squared:,0.946
Method:,Least Squares,F-statistic:,46050.0
Date:,"Thu, 09 Aug 2018",Prob (F-statistic):,0.0
Time:,17:21:58,Log-Likelihood:,-152320.0
No. Observations:,47738,AIC:,304700.0
Df Residuals:,47720,BIC:,304800.0
Df Model:,18,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
x1,-0.1450,0.055,-2.630,0.009,-0.253,-0.037
x2,-0.5013,0.021,-23.707,0.000,-0.543,-0.460
x3,-0.5613,0.027,-21.075,0.000,-0.614,-0.509
x4,-0.5730,0.101,-5.656,0.000,-0.772,-0.374
x5,-0.7091,0.098,-7.235,0.000,-0.901,-0.517
x6,-0.3807,0.013,-30.296,0.000,-0.405,-0.356
x7,0.7265,0.005,147.844,0.000,0.717,0.736
x8,-0.2721,0.052,-5.257,0.000,-0.374,-0.171
x9,-0.0526,0.030,-1.727,0.084,-0.112,0.007

0,1,2,3
Omnibus:,1044.151,Durbin-Watson:,2.012
Prob(Omnibus):,0.0,Jarque-Bera (JB):,2258.398
Skew:,0.081,Prob(JB):,0.0
Kurtosis:,4.053,Cond. No.,1.08e+18


In [106]:
# Candidate without position 36 (rest_3IN4). Feature names follow the
# X.columns listing printed earlier, shifted by one for the prepended constant.
# NOTE(review): this column set is identical to an earlier candidate cell;
# the cell looks redundant.
X_opt = X[:, [
    1, 2, 3, 4, 5,    # avg_threes, avg_reb, avg_ast, avg_stl, avg_blk
    7, 8,             # avg_pts, avg_points_vs_opp
    13, 14,           # team_offeff, team_defeff
    19, 20, 21,       # opp_offeff, opp_defeff, opp_pos_avg
    22, 26, 27,       # minutes_AVG10, opp_pos_avg_AVG10, fantasy_points_AVG10
    28, 30, 31, 32,   # minutes_AVG3, team_offeff_AVG3, opp_defeff_AVG3, fantasy_points_AVG3
    33,               # venue_R
]]
regressor_OLS = sm.OLS(y, X_opt).fit()
regressor_OLS.summary()

0,1,2,3
Dep. Variable:,fantasy_points,R-squared:,0.946
Model:,OLS,Adj. R-squared:,0.946
Method:,Least Squares,F-statistic:,46060.0
Date:,"Thu, 09 Aug 2018",Prob (F-statistic):,0.0
Time:,17:21:58,Log-Likelihood:,-152310.0
No. Observations:,47738,AIC:,304700.0
Df Residuals:,47720,BIC:,304800.0
Df Model:,18,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
x1,-0.1411,0.055,-2.559,0.011,-0.249,-0.033
x2,-0.5003,0.021,-23.660,0.000,-0.542,-0.459
x3,-0.5635,0.027,-21.150,0.000,-0.616,-0.511
x4,-0.5693,0.101,-5.619,0.000,-0.768,-0.371
x5,-0.7134,0.098,-7.279,0.000,-0.905,-0.521
x6,-0.3796,0.013,-30.208,0.000,-0.404,-0.355
x7,0.7249,0.005,146.673,0.000,0.715,0.735
x8,-0.2702,0.052,-5.221,0.000,-0.372,-0.169
x9,-0.0523,0.030,-1.719,0.086,-0.112,0.007

0,1,2,3
Omnibus:,1041.897,Durbin-Watson:,2.011
Prob(Omnibus):,0.0,Jarque-Bera (JB):,2252.315
Skew:,0.081,Prob(JB):,0.0
Kurtosis:,4.052,Cond. No.,1.43e+18


In [107]:
# Candidate without positions 14 (team_defeff) and 36 (rest_3IN4). Feature
# names follow the X.columns listing printed earlier, shifted by one for the
# prepended constant.
X_opt = X[:, [
    1, 2, 3, 4, 5,    # avg_threes, avg_reb, avg_ast, avg_stl, avg_blk
    7, 8,             # avg_pts, avg_points_vs_opp
    13,               # team_offeff
    19, 20, 21,       # opp_offeff, opp_defeff, opp_pos_avg
    22, 26, 27,       # minutes_AVG10, opp_pos_avg_AVG10, fantasy_points_AVG10
    28, 30, 31, 32,   # minutes_AVG3, team_offeff_AVG3, opp_defeff_AVG3, fantasy_points_AVG3
    33,               # venue_R
]]
regressor_OLS = sm.OLS(y, X_opt).fit()
regressor_OLS.summary()

0,1,2,3
Dep. Variable:,fantasy_points,R-squared:,0.946
Model:,OLS,Adj. R-squared:,0.946
Method:,Least Squares,F-statistic:,46060.0
Date:,"Thu, 09 Aug 2018",Prob (F-statistic):,0.0
Time:,17:21:58,Log-Likelihood:,-152310.0
No. Observations:,47738,AIC:,304700.0
Df Residuals:,47720,BIC:,304800.0
Df Model:,18,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
x1,-0.1411,0.055,-2.559,0.011,-0.249,-0.033
x2,-0.5003,0.021,-23.660,0.000,-0.542,-0.459
x3,-0.5635,0.027,-21.150,0.000,-0.616,-0.511
x4,-0.5693,0.101,-5.619,0.000,-0.768,-0.371
x5,-0.7134,0.098,-7.279,0.000,-0.905,-0.521
x6,-0.3796,0.013,-30.208,0.000,-0.404,-0.355
x7,0.7249,0.005,146.673,0.000,0.715,0.735
x8,-0.2702,0.052,-5.221,0.000,-0.372,-0.169
x9,-0.2702,0.052,-5.221,0.000,-0.372,-0.169

0,1,2,3
Omnibus:,1041.897,Durbin-Watson:,2.011
Prob(Omnibus):,0.0,Jarque-Bera (JB):,2252.315
Skew:,0.081,Prob(JB):,0.0
Kurtosis:,4.052,Cond. No.,6030000000000000.0


In [108]:
# Candidate additionally without position 20 (opp_defeff). Feature names
# follow the X.columns listing printed earlier, shifted by one for the
# prepended constant.
X_opt = X[:, [
    1, 2, 3, 4, 5,    # avg_threes, avg_reb, avg_ast, avg_stl, avg_blk
    7, 8,             # avg_pts, avg_points_vs_opp
    13,               # team_offeff
    19, 21,           # opp_offeff, opp_pos_avg
    22, 26, 27,       # minutes_AVG10, opp_pos_avg_AVG10, fantasy_points_AVG10
    28, 30, 31, 32,   # minutes_AVG3, team_offeff_AVG3, opp_defeff_AVG3, fantasy_points_AVG3
    33,               # venue_R
]]
regressor_OLS = sm.OLS(y, X_opt).fit()
regressor_OLS.summary()

0,1,2,3
Dep. Variable:,fantasy_points,R-squared:,0.946
Model:,OLS,Adj. R-squared:,0.946
Method:,Least Squares,F-statistic:,48770.0
Date:,"Thu, 09 Aug 2018",Prob (F-statistic):,0.0
Time:,17:21:58,Log-Likelihood:,-152320.0
No. Observations:,47738,AIC:,304700.0
Df Residuals:,47721,BIC:,304800.0
Df Model:,17,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
x1,-0.1409,0.055,-2.554,0.011,-0.249,-0.033
x2,-0.5008,0.021,-23.685,0.000,-0.542,-0.459
x3,-0.5638,0.027,-21.159,0.000,-0.616,-0.512
x4,-0.5690,0.101,-5.616,0.000,-0.768,-0.370
x5,-0.7118,0.098,-7.263,0.000,-0.904,-0.520
x6,-0.3799,0.013,-30.231,0.000,-0.405,-0.355
x7,0.7249,0.005,146.676,0.000,0.715,0.735
x8,-0.2610,0.051,-5.070,0.000,-0.362,-0.160
x9,-0.2610,0.051,-5.070,0.000,-0.362,-0.160

0,1,2,3
Omnibus:,1043.679,Durbin-Watson:,2.011
Prob(Omnibus):,0.0,Jarque-Bera (JB):,2256.916
Skew:,0.081,Prob(JB):,0.0
Kurtosis:,4.053,Cond. No.,5390000000000000.0


In [109]:
# Final candidate: additionally without position 22 (minutes_AVG10). This
# X_opt is the matrix used for the second train/test split below. Feature
# names follow the X.columns listing printed earlier, shifted by one for the
# prepended constant.
X_opt = X[:, [
    1, 2, 3, 4, 5,    # avg_threes, avg_reb, avg_ast, avg_stl, avg_blk
    7, 8,             # avg_pts, avg_points_vs_opp
    13,               # team_offeff
    19, 21,           # opp_offeff, opp_pos_avg
    26, 27,           # opp_pos_avg_AVG10, fantasy_points_AVG10
    28, 30, 31, 32,   # minutes_AVG3, team_offeff_AVG3, opp_defeff_AVG3, fantasy_points_AVG3
    33,               # venue_R
]]
regressor_OLS = sm.OLS(y, X_opt).fit()
regressor_OLS.summary()

0,1,2,3
Dep. Variable:,fantasy_points,R-squared:,0.946
Model:,OLS,Adj. R-squared:,0.946
Method:,Least Squares,F-statistic:,51810.0
Date:,"Thu, 09 Aug 2018",Prob (F-statistic):,0.0
Time:,17:21:58,Log-Likelihood:,-152320.0
No. Observations:,47738,AIC:,304700.0
Df Residuals:,47722,BIC:,304800.0
Df Model:,16,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
x1,-0.1278,0.055,-2.337,0.019,-0.235,-0.021
x2,-0.5015,0.021,-23.722,0.000,-0.543,-0.460
x3,-0.5691,0.026,-21.500,0.000,-0.621,-0.517
x4,-0.5414,0.100,-5.407,0.000,-0.738,-0.345
x5,-0.7165,0.098,-7.314,0.000,-0.909,-0.524
x6,-0.3809,0.013,-30.343,0.000,-0.406,-0.356
x7,0.7248,0.005,146.664,0.000,0.715,0.735
x8,-0.2604,0.051,-5.058,0.000,-0.361,-0.159
x9,-0.2604,0.051,-5.058,0.000,-0.361,-0.159

0,1,2,3
Omnibus:,1043.414,Durbin-Watson:,2.011
Prob(Omnibus):,0.0,Jarque-Bera (JB):,2255.112
Skew:,0.082,Prob(JB):,0.0
Kurtosis:,4.052,Cond. No.,5360000000000000.0


In [110]:
# Split the reduced feature matrix with the same test size and seed as the
# full-model split.
# `sklearn.cross_validation` was removed in scikit-learn 0.20; the function
# lives in `sklearn.model_selection`, which this notebook already imports from.
from sklearn.model_selection import train_test_split
X_train2, X_test2, y_train2, y_test2 = train_test_split(X_opt, y, test_size=0.2, random_state=0)

In [111]:
from sklearn.linear_model import LinearRegression
# Fit the linear model on the reduced feature set.
# The `normalize` argument was deprecated in scikit-learn 1.0 and removed in
# 1.2, so passing it raises a TypeError on current releases. An unregularised
# least-squares fit is equivariant to column scaling, so dropping it is not
# expected to change predictions.
Mregressor2 = LinearRegression()
Mregressor2.fit(X_train2, y_train2)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=True)

In [112]:
# Score the held-out rows of the reduced feature matrix with the reduced model.
y_pred2 = Mregressor2.predict(X_test2)

In [113]:
# Evaluate the reduced model on its held-out split: mean squared error and R2.
from sklearn import metrics

test_mse2 = metrics.mean_squared_error(y_test2, y_pred2)
test_r2_2 = metrics.r2_score(y_test2, y_pred2)
print('Mean Squared Error:', test_mse2)
print('R2 Score:', test_r2_2)

Mean Squared Error: 34.246132908705356
R2 Score: 0.8311870768847793


In [114]:
import pickle

# Persist the fitted reduced model. The context manager closes the file handle
# (and flushes it to disk) even if pickling fails; the previous bare open()
# left the handle to be closed whenever the garbage collector got to it.
filename = 'linreg_model13.sav'
with open(filename, 'wb') as model_file:
    pickle.dump(Mregressor2, model_file)