In [95]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

# Load the four-factor model data set.
# NOTE(review): absolute, machine-specific path -- consider a configurable
# data directory so the notebook runs on other machines.
nba_ = pd.read_csv("/Users/ncapofari/Documents/CUNY_SPS/Capstone/%s"
                   % "nba_scraper/data_sets/4factor_model_data.csv")

# Keep only the series whose seed difference is exactly 1.
# Other subsets that can be swapped in here (one at a time):
#   nba_[nba_['Seed_Diff'] > 0]
#   nba_[nba_['Seed_Diff'] > 1]
# Only a single assignment is kept so the active filter is unambiguous.
nba = nba_[nba_['Seed_Diff'] == 1]

# 80/20 train/test split with a fixed seed for reproducibility.
train_set, test_set = train_test_split(nba,
                                       test_size=0.2,
                                       random_state=23)

In [96]:
# Model inputs

# Base file name later passed to scores() for the CSV files it writes.
f = "four_fact_SVC_is_one"

# Alternative single-column feature sets, kept for reference:
#nba_train = train_set["Seed_Diff"].copy().values.reshape(-1, 1)
#nba_train = train_set["Pyt_Win_Pct_Diff"].copy().values.reshape(-1, 1)

# Feature matrix: the training rows without the two *_Diff columns, the
# target column and the id column.
nba_train = train_set.drop(
    ["Seed_Diff", "Pyt_Win_Pct_Diff", "Won_Series", "id"],
    axis=1,
)

# Boolean target: True where Won_Series equals 1.
nba_labels = train_set["Won_Series"].copy() == 1

# Independent copy of the held-out rows.
nba_test = test_set.copy()

In [97]:
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import f_classif

# Preprocessing: standardize every feature, then keep the 5 features that
# score highest under f_classif against the labels.
# `k` is passed by keyword because scikit-learn >= 1.0 makes it keyword-only;
# the positional form `SelectKBest(f_classif, 5)` raises TypeError there.
# NOTE(review): this pipeline is fitted on the whole training set, and the
# transformed matrix is what the later cells cross-validate on, so feature
# selection has already seen every CV fold. Consider putting the classifier
# inside the pipeline so selection is refitted per fold.
pipeline = Pipeline([
    ('std_scaler', StandardScaler()),
    ('select', SelectKBest(f_classif, k=5))
])

nba_prepared = pipeline.fit_transform(nba_train, nba_labels)
print(nba_prepared.shape)

# Fitted selector step; its support mask marks the retained columns.
features = pipeline.named_steps['select']

# Names of the selected features (displayed as the cell output).
nba_train.columns[features.get_support()]

(118, 5)


Index(['TOV..Misc', 'ORB..Misc', 'DRB..Misc', 'Opp.eFG..Misc_._Opponent',
       'Opp.TOV..Misc_._Opponent'],
      dtype='object')

In [98]:
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV

# Hyper-parameter grid searched for the RBF-kernel SVC.
# NOTE(review): in the recorded output the best estimator uses the smallest
# C and the smallest gamma of this grid -- it may be worth extending the
# grid toward lower values; confirm on a re-run.
param_grid = {
    'C': [1e3, 5e3, 1e4, 5e4, 1e5],
    'gamma': [0.0001, 0.0005, 0.001, 0.005, 0.01, 0.1],
}

# Base estimator; class_weight='balanced' reweights the two classes.
scv_clf = SVC(kernel='rbf', class_weight='balanced', verbose=False)

# 5-fold grid search over param_grid, fitted on the prepared features.
svc_clf = GridSearchCV(estimator=scv_clf, param_grid=param_grid, cv=5)
svc_clf.fit(nba_prepared, nba_labels)

print("Best estimator found by grid search:")
print(svc_clf.best_estimator_)

Best estimator found by grid search:
SVC(C=1000.0, cache_size=200, class_weight='balanced', coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma=0.0001, kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)


In [99]:
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_val_predict
from sklearn.metrics import confusion_matrix, precision_score
from sklearn.metrics import roc_curve, roc_auc_score

def scores(m, fs, tgt, fname, frame=None):
    """Report and save 5-fold cross-validated performance of ``m``.

    Prints the confusion matrix, precision and ROC AUC of the out-of-fold
    predictions, writes the ROC points to ``scores/<fname>.csv`` and writes
    the predictions joined column-wise to ``frame`` to
    ``results/<fname>.csv``.

    Parameters
    ----------
    m : estimator
        Estimator passed to ``cross_val_predict``.
    fs : array-like
        Feature matrix.
    tgt : array-like
        True labels; used both for fitting and for every metric.
    fname : str
        Base name (without extension) of the two CSV files.
    frame : pandas.DataFrame, optional
        Rows to store next to the predictions. Defaults to the
        notebook-level ``train_set`` so existing calls keep working.
    """
    import os

    if frame is None:
        frame = train_set

    y_train_pred = cross_val_predict(m, fs, tgt, cv=5)

    # Metrics are computed against the `tgt` argument rather than a
    # notebook global, so the function is correct for any target passed in.
    print(confusion_matrix(tgt, y_train_pred))
    print("precision score:", precision_score(tgt, y_train_pred))
    # NOTE(review): the ROC curve/AUC are computed from hard class
    # predictions, not continuous scores, so the curve has very few points.
    # Consider cross_val_predict(..., method='decision_function').
    fpr, tpr, thresholds = roc_curve(tgt, y_train_pred)
    print("roc_auc:", roc_auc_score(tgt, y_train_pred))

    scores_path = "scores/"+fname+".csv"
    results_path = "results/"+fname+".csv"
    # Create the output folders up front so the run does not fail at the
    # very end, after the cross-validation has already been paid for.
    for path in (scores_path, results_path):
        os.makedirs(os.path.dirname(path), exist_ok=True)

    temp = pd.DataFrame({"fpr":fpr, "tpr":tpr, "thresholds":thresholds})
    temp.to_csv(scores_path)

    preds = pd.DataFrame({"Prediction":y_train_pred})
    # Both indexes are reset so predictions and rows align by position.
    x = pd.concat([preds.reset_index(drop=True),
                   frame.reset_index(drop=True)], axis=1)
    x.to_csv(results_path)

In [100]:
# Evaluate the grid-searched SVC on the prepared training features and
# write the CSV outputs under the base name stored in `f`.
scores(m=svc_clf, fs=nba_prepared, tgt=nba_labels, fname=f)

[[41 24]
 [24 29]]
precision score: 0.5471698113207547
roc_auc: 0.5889695210449928
