In [47]:
from sklearn.ensemble import GradientBoostingRegressor, RandomForestRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split, cross_val_score
import pandas as pd
import numpy as np
from sklearn.metrics import classification_report, confusion_matrix
import matplotlib.pyplot as plt
from sklearn.svm import SVR
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import Ridge

%matplotlib inline

In [48]:
#Bring in the data

nba = pd.read_csv('./data/nba_analysis_data.csv')
nba.dropna(inplace = True)

In [49]:
nba.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 7242 entries, 5 to 8415
Data columns (total 28 columns):
cover                    7242 non-null int64
line_cv                  7242 non-null float64
eff_ratio                7242 non-null float64
eff_ratio2               7242 non-null float64
away_rest                7242 non-null int64
main_referee             7242 non-null int64
home_rest                7242 non-null int64
ref_2                    7242 non-null int64
ref_3                    7242 non-null int64
spread                   7242 non-null float64
home_win_margin          7242 non-null float64
mov_5_fta                7242 non-null float64
mov_5_away_fta           7242 non-null float64
free_throw_ratio         7242 non-null float64
mov_5_home_score         7242 non-null float64
mov_5_away_score         7242 non-null float64
score_ratio              7242 non-null float64
rebound_ratio            7242 non-null float64
mov_5_tot                7242 non-null float64
mov_5_away_t

In [50]:
features = ['eff_ratio', 'mov_5_fta', 'mov_5_away_fta',  
            'eff_ratio2', 'away_rest', 'main_referee',
            'home_rest','ref_2', 'ref_3', 
            'free_throw_ratio', 'spread',
            'score_ratio', 'mov_5_home_score', 'mov_5_away_score',
            'mov_5_away_off_eff', 'mov_5_away_def_eff', 
            'mov_5_away_assists', 'mov_5_home_win_margin',
            'mov_5_win', 'mov_5_away_win_margin']
X = nba[features]
y = nba['home_win_margin']

In [51]:
#splitting data for a train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y , 
                                                    random_state = 2)

In [52]:
#scaling data to use in various other methods
ss = StandardScaler()
X_train_scaled = ss.fit_transform(X_train)
X_test_scaled = ss.transform(X_test)

  return self.partial_fit(X, y)
  return self.fit(X, **fit_params).transform(X)
  after removing the cwd from sys.path.


In [53]:
rf = RandomForestRegressor()
rf_params = {'n_estimators'      : [300],
             'max_depth'         : [ 35 ],
             'min_samples_split' : [5 ]
            }

In [54]:
gs = GridSearchCV(rf, param_grid= rf_params, cv = 3)

In [55]:
gs.fit(X_train_scaled,y_train)

GridSearchCV(cv=3, error_score='raise-deprecating',
       estimator=RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators='warn', n_jobs=None,
           oob_score=False, random_state=None, verbose=0, warm_start=False),
       fit_params=None, iid='warn', n_jobs=None,
       param_grid={'n_estimators': [300], 'max_depth': [35], 'min_samples_split': [5]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [56]:
gs.best_score_

0.13012043223628653

In [57]:
gs.best_params_

{'max_depth': 35, 'min_samples_split': 5, 'n_estimators': 300}

In [58]:
gs.score(X_test_scaled, y_test)

0.17311863489239

In [59]:
y_hat_rf = gs.predict(X_test_scaled)

In [68]:
y_hat_rf

array([98.05795551, 95.99893122, 92.28909921, ..., 98.10281626,
       99.19296525, 93.27024074])

In [60]:
svm = SVR()


In [61]:
svm.fit(X_train,y_train)



SVR(C=1.0, cache_size=200, coef0=0.0, degree=3, epsilon=0.1,
  gamma='auto_deprecated', kernel='rbf', max_iter=-1, shrinking=True,
  tol=0.001, verbose=False)

In [62]:
svm.score(X_train, y_train)

0.11941914993323932

In [63]:
svm.score(X_test, y_test)

-0.0015238423804044832

In [64]:
X_test['predict'] =  (y_hat_rf+ X_test.spread).map(lambda x: 0 if x <0 else 1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [65]:
X_test['actual'] = (y_test + X_test.spread).map(lambda x: 0 if x< 0 else 1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [66]:
print(classification_report(X_test.actual, X_test.predict))

              precision    recall  f1-score   support

           1       1.00      1.00      1.00      1811

   micro avg       1.00      1.00      1.00      1811
   macro avg       1.00      1.00      1.00      1811
weighted avg       1.00      1.00      1.00      1811



In [67]:
confusion_matrix(X_test.actual, X_test.predict)

array([[1811]])