In [77]:
import _pickle as pickle
import pandas as pd
import numpy as np
from itertools import chain
import time

# Models
from sklearn import linear_model
from sklearn.neural_network import MLPClassifier
from sklearn.neural_network import MLPRegressor
from sklearn.svm import SVC
from sklearn.svm import LinearSVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingRegressor

# Feature Selection
from sklearn.feature_selection import VarianceThreshold
from sklearn.feature_selection import SelectFromModel

# Helpers
from sklearn.pipeline import Pipeline
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import GridSearchCV
from sklearn import tree

from feature_extraction import *

In [2]:
from IPython.core.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>"))

In [3]:
%load_ext autoreload
%autoreload 2

In [4]:
print("Estimated Runtime: 2 minutes")
start = time.time()

game_list_2013 = load_game_list('Game_List_2013.p')
player_data_2013 = pd.DataFrame.from_csv("Player_Data_2013.csv")
team_data_2013 = pd.DataFrame.from_csv("Team_Data_2013.csv")

game_list_2014 = load_game_list('Game_List_2014.p')
player_data_2014 = pd.DataFrame.from_csv("Player_Data_2014.csv")
team_data_2014 = pd.DataFrame.from_csv("Team_Data_2014.csv")

game_list_2015 = load_game_list('Game_List_2015.p')
player_data_2015 = pd.DataFrame.from_csv("Player_Data_2015.csv")
team_data_2015 = pd.DataFrame.from_csv("Team_Data_2015.csv")

game_list_2016 = load_game_list('Game_List_2016.p')
player_data_2016 = pd.DataFrame.from_csv("Player_Data_2016.csv")
team_data_2016 = pd.DataFrame.from_csv("Team_Data_2016.csv")

#features, outputs = generate_game_features(game_list_2015, player_data_2015, team_data_2015, True)

# After getting the game lists, split into training and test sets

test_set_ratio_2013 = .4
test_set_ratio_2014 = .4
test_set_ratio_2015 = .4
test_set_ratio_2016 = .4

train_games_2013, test_games_2013 = train_test_split(game_list_2013, test_size = test_set_ratio_2013)
train_games_2014, test_games_2014 = train_test_split(game_list_2014, test_size = test_set_ratio_2014)
train_games_2015, test_games_2015 = train_test_split(game_list_2015, test_size = test_set_ratio_2015)
train_games_2016, test_games_2016 = train_test_split(game_list_2016, test_size = test_set_ratio_2016)

# Change this function in feature_extraction to modify which features are used
train_features_2013, train_outputs_2013 = generate_game_features(train_games_2013, player_data_2013, team_data_2013, True)
train_features_2014, train_outputs_2014 = generate_game_features(train_games_2014, player_data_2014, team_data_2014, True)
train_features_2015, train_outputs_2015 = generate_game_features(train_games_2015, player_data_2015, team_data_2015, True)
train_features_2016, train_outputs_2016 = generate_game_features(train_games_2016, player_data_2016, team_data_2016, True)

average_features_2013, test_set_outputs_2013 = generate_average_vectors(test_games_2013, player_data_2013, team_data_2013, True)
average_features_2014, test_set_outputs_2014 = generate_average_vectors(test_games_2014, player_data_2014, team_data_2014, True)
average_features_2015, test_set_outputs_2015 = generate_average_vectors(test_games_2015, player_data_2015, team_data_2015, True)
average_features_2016, test_set_outputs_2016 = generate_average_vectors(test_games_2016, player_data_2016, team_data_2016, True)

# Split each .4 test set into half validation half test
validation_features_2013, test_features_2013, validation_outputs_2013, test_outputs_2013 = train_test_split(average_features_2013, test_set_outputs_2013, test_size=.5)
validation_features_2014, test_features_2014, validation_outputs_2014, test_outputs_2014 = train_test_split(average_features_2014, test_set_outputs_2014, test_size=.5)
validation_features_2015, test_features_2015, validation_outputs_2015, test_outputs_2015 = train_test_split(average_features_2015, test_set_outputs_2015, test_size=.5)
validation_features_2016, test_features_2016, validation_outputs_2016, test_outputs_2016 = train_test_split(average_features_2016, test_set_outputs_2016, test_size=.5)


### All the training feature and ouptuts ###
all_train_features = list(chain(train_features_2013, train_features_2014, train_features_2015, train_features_2016))
all_train_outputs = list(chain(train_outputs_2013, train_outputs_2014, train_outputs_2015, train_outputs_2016))

all_validation_features = list(chain(validation_features_2013, validation_features_2014, validation_features_2015, validation_features_2016))
all_validation_outputs = list(chain(validation_outputs_2013, validation_outputs_2014, validation_outputs_2015, validation_outputs_2016))

all_test_features = list(chain(test_features_2013, test_features_2014, test_features_2015, test_features_2016))
all_test_outputs = list(chain(test_outputs_2013, test_outputs_2014, test_outputs_2015, test_outputs_2016))


end = time.time()
print("Time Elapsed:", end-start)

Estimated Runtime: 2 minutes
Time Elapsed: 102.50933599472046


In [61]:
# Regression Outputs (Actual Game Spreads)
train_features_2013_reg, train_outputs_2013_reg = generate_game_features(train_games_2013, player_data_2013, team_data_2013, False)
train_features_2014_reg, train_outputs_2014_reg = generate_game_features(train_games_2014, player_data_2014, team_data_2014, False)
train_features_2015_reg, train_outputs_2015_reg = generate_game_features(train_games_2015, player_data_2015, team_data_2015, False)
train_features_2016_reg, train_outputs_2016_reg = generate_game_features(train_games_2016, player_data_2016, team_data_2016, False)

average_features_2013_reg, test_set_outputs_2013_reg = generate_average_vectors(test_games_2013, player_data_2013, team_data_2013, False)
average_features_2014_reg, test_set_outputs_2014_reg = generate_average_vectors(test_games_2014, player_data_2014, team_data_2014, False)
average_features_2015_reg, test_set_outputs_2015_reg = generate_average_vectors(test_games_2015, player_data_2015, team_data_2015, False)
average_features_2016_reg, test_set_outputs_2016_reg = generate_average_vectors(test_games_2016, player_data_2016, team_data_2016, False)

# Split each .4 test set into half validation half test
validation_features_2013_reg, test_features_2013_reg, validation_outputs_2013_reg, test_outputs_2013_reg = train_test_split(average_features_2013_reg, test_set_outputs_2013_reg, test_size=.5)
validation_features_2014_reg, test_features_2014_reg, validation_outputs_2014_reg, test_outputs_2014_reg = train_test_split(average_features_2014_reg, test_set_outputs_2014_reg, test_size=.5)
validation_features_2015_reg, test_features_2015_reg, validation_outputs_2015_reg, test_outputs_2015_reg = train_test_split(average_features_2015_reg, test_set_outputs_2015_reg, test_size=.5)
validation_features_2016_reg, test_features_2016_reg, validation_outputs_2016_reg, test_outputs_2016_reg = train_test_split(average_features_2016_reg, test_set_outputs_2016_reg, test_size=.5)


### All the training feature and ouptuts ###
all_train_features_reg = list(chain(train_features_2013_reg, train_features_2014_reg, train_features_2015_reg, train_features_2016_reg))
all_train_outputs_reg = list(chain(train_outputs_2013_reg, train_outputs_2014_reg, train_outputs_2015_reg, train_outputs_2016_reg))

all_validation_features_reg = list(chain(validation_features_2013_reg, validation_features_2014_reg, validation_features_2015_reg, validation_features_2016_reg))
all_validation_outputs_reg = list(chain(validation_outputs_2013_reg, validation_outputs_2014_reg, validation_outputs_2015_reg, validation_outputs_2016_reg))

all_test_features_reg = list(chain(test_features_2013_reg, test_features_2014_reg, test_features_2015_reg, test_features_2016_reg))
all_test_outputs_reg = list(chain(test_outputs_2013_reg, test_outputs_2014_reg, test_outputs_2015_reg, test_outputs_2016_reg))


In [11]:
team_data_2016

Unnamed: 0,Team Name,Date,MP_x,FG,FGA,FG%,3P,3PA,3P%,FT,...,ORB%,DRB%,TRB%,AST%,STL%,BLK%,TOV%,USG%,ORtg,DRtg
13,New York Knicks,2016-10-25,240,32,87,0.368,9,27,0.333,15,...,24.5,72.5,45.2,53.1,6.0,10.2,15.8,100.0,88.1,117.1
13,Cleveland Cavaliers,2016-10-25,240,45,94,0.479,13,35,0.371,14,...,27.5,75.5,54.8,68.9,12.0,8.3,12.0,100.0,117.1,88.1
13,San Antonio Spurs,2016-10-25,240,47,98,0.480,12,24,0.500,23,...,43.8,81.0,61.1,53.2,13.2,5.8,10.6,100.0,131.3,101.8
13,Golden State Warriors,2016-10-25,240,40,85,0.471,7,33,0.212,13,...,19.0,56.3,38.9,60.0,11.2,8.1,14.7,100.0,101.8,131.3
13,Utah Jazz,2016-10-25,240,40,82,0.488,8,24,0.333,16,...,17.1,83.3,47.7,47.5,9.9,8.9,11.0,100.0,114.9,124.8
13,Portland Trail Blazers,2016-10-25,240,39,75,0.520,13,19,0.684,22,...,16.7,82.9,52.3,56.4,5.5,5.2,12.4,100.0,124.8,114.9
13,Brooklyn Nets,2016-10-26,240,43,97,0.443,15,44,0.341,16,...,30.0,70.7,48.4,51.2,7.8,5.3,13.1,100.0,113.6,118.4
13,Boston Celtics,2016-10-26,240,48,89,0.539,11,32,0.344,15,...,29.3,70.0,51.6,75.0,12.6,17.0,16.4,100.0,118.4,113.6
13,Dallas Mavericks,2016-10-26,265,45,104,0.433,18,48,0.375,13,...,18.5,83.0,48.5,57.8,7.0,10.8,11.8,100.0,105.8,113.7
13,Indiana Pacers,2016-10-26,265,47,93,0.505,10,19,0.526,26,...,17.0,81.5,51.5,63.8,9.6,10.7,12.9,100.0,113.7,105.8


In [12]:
player_data_2015

Unnamed: 0,Starters,Date,MP_x,FG,FGA,FG%,3P,3PA,3P%,FT,...,ORB%,DRB%,TRB%,AST%,STL%,BLK%,TOV%,USG%,ORtg,DRtg
0,Andre Drummond,2015-10-27,37:09,6.0,16.0,0.375,0.0,0.0,,6.0,...,18.5,33.1,24.8,13.3,1.4,4.7,8.9,23.6,109.0,92.0
1,Marcus Morris,2015-10-27,37:05,6.0,19.0,0.316,1.0,4.0,0.250,5.0,...,11.6,15.1,13.1,17.7,0.0,0.0,0.0,22.9,119.0,102.0
2,Kentavious Caldwell-Pope,2015-10-27,37:03,7.0,14.0,0.500,4.0,7.0,0.571,3.0,...,2.3,9.0,5.2,4.6,1.4,0.0,11.5,18.3,132.0,101.0
3,Ersan Ilyasova,2015-10-27,34:26,6.0,12.0,0.500,3.0,6.0,0.500,1.0,...,7.5,13.0,9.9,14.6,0.0,2.5,18.9,18.1,114.0,100.0
4,Reggie Jackson,2015-10-27,32:07,4.0,10.0,0.400,2.0,4.0,0.500,5.0,...,2.7,24.3,12.1,24.1,3.1,0.0,14.1,17.3,132.0,94.0
5,Stanley Johnson,2015-10-27,24:29,3.0,10.0,0.300,1.0,3.0,0.333,0.0,...,10.5,4.6,7.9,18.9,0.0,0.0,9.1,17.6,98.0,104.0
6,Steve Blake,2015-10-27,15:53,1.0,6.0,0.167,1.0,5.0,0.200,0.0,...,0.0,0.0,0.0,35.6,3.2,0.0,33.3,22.2,60.0,98.0
7,Jodie Meeks,2015-10-27,10:57,1.0,4.0,0.250,0.0,0.0,,0.0,...,7.8,10.2,8.9,0.0,0.0,0.0,20.0,17.9,56.0,103.0
8,Aron Baynes,2015-10-27,10:51,3.0,5.0,0.600,0.0,0.0,,0.0,...,7.9,41.2,22.3,0.0,0.0,0.0,16.7,21.7,102.0,97.0
9,Darrun Hilliard,2015-10-27,Did Not Play,,,,,,,,...,,,,,,,,,,


In [294]:
print(len(all_train_features), len(all_train_outputs))
print(len(all_validation_features), len(all_validation_outputs))
print(len(all_test_features), len(all_test_outputs))

print(len(all_train_features[0]), len(all_validation_features[0]), len(all_test_features[0]))

4724 4724
1580 1580
1580 1580
62 62 62


# Classification Pipelines

In [14]:
def classification_pipeline(feature_selection_model, clf_model):
    
    clf = Pipeline([
      ('feature_selection', feature_selection_model),
      ('classification', clf_model)
    ])
    
    clf.fit(all_train_features, all_train_outputs)

    print("Validation Confusion Matrix")
    print(confusion_matrix(clf.predict(all_validation_features), all_validation_outputs))
    print("Test Confusion Matrix")
    print(confusion_matrix(clf.predict(all_test_features), all_test_outputs))

    print("Number of Features Retained:", clf.named_steps['feature_selection'].get_support().tolist().count(True))
    print("Training Score:", clf.score(all_train_features, all_train_outputs))
    print("Validation Score:", clf.score(all_validation_features, all_validation_outputs))
    print("Test Score:", clf.score(all_test_features, all_test_outputs))

In [314]:
# Logistic Regression pipeline
feature_selection_model = SelectFromModel(LinearSVC(C=0.1, penalty="l1", dual=False))
clf_model = linear_model.LogisticRegression(penalty='l2')

classification_pipeline(feature_selection_model, clf_model)

Validation Confusion Matrix
[[537 282]
 [258 503]]
Test Confusion Matrix
[[508 254]
 [277 541]]
Number of Features Retained: 43
Training Score: 1.0
Validation Score: 0.658227848101
Test Score: 0.663924050633


In [50]:
# Neural Network pipeline
feature_selection_model = SelectFromModel(LinearSVC(C=0.1, penalty="l1", dual=False))
clf_model = MLPClassifier(solver='lbfgs', alpha=100.0, hidden_layer_sizes=(250,250), max_iter=200)


pipeline = Pipeline([
                      ('feature_selection', feature_selection_model),
                      ('classification', clf_model)
                    ])
parameters = {
    'classification__alpha': (.01, .001, .0001, .00001),
    'classification__hidden_layer_sizes': ((10,10), (50,50), (100,100), (250, 250), (500,500)),  # unigrams or bigrams
}

# grid_search = GridSearchCV(pipeline, parameters, n_jobs=1, verbose=1)
# grid_search.fit(all_train_features, all_train_outputs)


classification_pipeline(feature_selection_model, clf_model)



Validation Confusion Matrix
[[487 258]
 [227 483]]
Test Confusion Matrix
[[516 238]
 [225 476]]
Number of Features Retained: 41
Training Score: 0.991502067065
Validation Score: 0.666666666667
Test Score: 0.681786941581


In [34]:
grid_search.best_estimator_.get_params()

{'classification': MLPClassifier(activation='relu', alpha=0.01, batch_size='auto', beta_1=0.9,
        beta_2=0.999, early_stopping=False, epsilon=1e-08,
        hidden_layer_sizes=(250, 250), learning_rate='constant',
        learning_rate_init=0.001, max_iter=200, momentum=0.9,
        nesterovs_momentum=True, power_t=0.5, random_state=None,
        shuffle=True, solver='lbfgs', tol=0.0001, validation_fraction=0.1,
        verbose=False, warm_start=False),
 'classification__activation': 'relu',
 'classification__alpha': 0.01,
 'classification__batch_size': 'auto',
 'classification__beta_1': 0.9,
 'classification__beta_2': 0.999,
 'classification__early_stopping': False,
 'classification__epsilon': 1e-08,
 'classification__hidden_layer_sizes': (250, 250),
 'classification__learning_rate': 'constant',
 'classification__learning_rate_init': 0.001,
 'classification__max_iter': 200,
 'classification__momentum': 0.9,
 'classification__nesterovs_momentum': True,
 'classification__power_t': 

In [37]:
grid_search.score(all_train_features, all_train_outputs), grid_search.score(all_validation_features, all_validation_outputs), grid_search.score(all_test_features, all_test_outputs)

(1.0, 0.66460481099656354, 0.68041237113402064)

In [316]:
# SVM pipeline

feature_selection_model = SelectFromModel(LinearSVC(C=0.1, penalty="l1", dual=False))
clf_model = SVC()

classification_pipeline(feature_selection_model, clf_model)

Validation Confusion Matrix
[[545 295]
 [250 490]]
Test Confusion Matrix
[[516 280]
 [269 515]]
Number of Features Retained: 42
Training Score: 1.0
Validation Score: 0.655063291139
Test Score: 0.65253164557


In [317]:
# Random Forest classifier pipeline

feature_selection_model = SelectFromModel(LinearSVC(C=0.1, penalty="l1", dual=False))
clf_model = RandomForestClassifier(max_features='log2', min_samples_leaf=10)

classification_pipeline(feature_selection_model, clf_model)

Validation Confusion Matrix
[[503 276]
 [292 509]]
Test Confusion Matrix
[[481 255]
 [304 540]]
Number of Features Retained: 43
Training Score: 0.9866638442
Validation Score: 0.640506329114
Test Score: 0.646202531646


# Regression Pipelines

In [62]:
def regression_pipeline(feature_selection_model, reg_model):
    
    reg = Pipeline([
      ('feature_selection', feature_selection_model),
      ('regression', reg_model)
    ])
    
    reg.fit(all_train_features_reg, all_train_outputs_reg)
    

    print("Number of Features Retained:", reg.named_steps['feature_selection'].get_support().tolist().count(True))
    print("")
    print("Mean Squared Error")
    print("Training Score:", mean_squared_error(reg.predict(all_train_features_reg), all_train_outputs_reg))
    print("Validation Score:", mean_squared_error(reg.predict(all_validation_features_reg), all_validation_outputs_reg))
    print("Test Score:", mean_squared_error(reg.predict(all_test_features_reg), all_test_outputs_reg))
    print("")
    print("Mean Absolute Error")
    print("Training Score:", mean_absolute_error(reg.predict(all_train_features_reg), all_train_outputs_reg))
    print("Validation Score:", mean_absolute_error(reg.predict(all_validation_features_reg), all_validation_outputs_reg))
    print("Test Score:", mean_absolute_error(reg.predict(all_test_features_reg), all_test_outputs_reg))

In [66]:
# Ridge Regression

feature_selection_model = SelectFromModel(LinearSVC(C=0.01, penalty="l1", dual=False))
reg_model = linear_model.Ridge(alpha=10.0)

regression_pipeline(feature_selection_model, reg_model)

Number of Features Retained: 48

Mean Squared Error
Training Score: 0.00137416313753
Validation Score: 185.831856422
Test Score: 188.874968584

Mean Absolute Error
Training Score: 0.0253532220387
Validation Score: 10.8082449967
Test Score: 10.8940049846


In [64]:
# OLS
feature_selection_model = SelectFromModel(LinearSVC(C=0.1, penalty="l1", dual=False))
reg_model = linear_model.LinearRegression()

regression_pipeline(feature_selection_model, reg_model)

Number of Features Retained: 48

Mean Squared Error
Training Score: 6.64697234649e-28
Validation Score: 188.565463435
Test Score: 191.148178471

Mean Absolute Error
Training Score: 2.05342301627e-14
Validation Score: 10.8845580113
Test Score: 10.9587768506


In [71]:
# Neural Nets
feature_selection_model = SelectFromModel(LinearSVC(C=0.001, penalty="l1", dual=False))
reg_model = MLPRegressor(solver='lbfgs', alpha=100.0, hidden_layer_sizes=(20,20), max_iter=200)

regression_pipeline(feature_selection_model, reg_model)

Number of Features Retained: 22

Mean Squared Error
Training Score: 0.590054470658
Validation Score: 161.192928478
Test Score: 173.85333202

Mean Absolute Error
Training Score: 0.516871658922
Validation Score: 10.0041120722
Test Score: 10.3897779509


In [76]:
# Decision Tree Regression
feature_selection_model = SelectFromModel(LinearSVC(C=0.01, penalty="l1", dual=False))
reg_model = tree.DecisionTreeRegressor(min_samples_leaf = 50)

regression_pipeline(feature_selection_model, reg_model)

Number of Features Retained: 45

Mean Squared Error
Training Score: 8.55030490141
Validation Score: 173.850090868
Test Score: 186.987905314

Mean Absolute Error
Training Score: 2.04560617092
Validation Score: 10.3713693311
Test Score: 10.713025104


In [None]:
feature_selection_model = SelectFromModel(LinearSVC(C=0.01, penalty="l1", dual=False))
reg_model = GradientBoostingRegressor()

regression_pipeline(feature_selection_model, reg_model)


# Feature Selection

In [210]:
# Remove only the elements that are the same for all game instances
# Removes 2 100's for team totals, USG%
# selector = VarianceThreshold(threshold=0.0)
# test = all_train_features
# test = selector.fit_transform(test)

selector = VarianceThreshold(threshold=0.5)
test = all_train_features
test = selector.fit_transform(test)

len(test[0]), len(all_train_features[0])

(48, 64)

In [212]:
type(all_train_features)

list

In [274]:
# SVM L1 Based Feature Selection
all_train_features = np.array(all_train_features)
print(all_train_features.shape)

# C=1, 57
# C=.1, 44
# C=.01, 24
# Larger C = more values in resulting transform (less 0-weight)
lsvc = LinearSVC(C=.1, penalty="l1", dual=False).fit(all_train_features, all_train_outputs)
model = SelectFromModel(lsvc, prefit=True)

new_features = model.transform(all_train_features)

print(new_features.shape)

(4730, 64)
(4730, 43)


In [268]:
# Logistic Regression Based L1 Feature Selection
all_train_features = np.array(all_train_features)
print(all_train_features.shape)

# C=1, 64->21
# C=2, 61->20
# C=.1, 64->10
# Larger C = more values in resulting transform (less 0-weight), this is because C is INVERSE of regularization strength
lin_mod = linear_model.LogisticRegression(C=.1, penalty="l1", dual=False).fit(all_train_features, all_train_outputs)
model = SelectFromModel(lin_mod, prefit=True)

new_features = model.transform(all_train_features)

print(new_features.shape)

(4730, 64)
(4730, 10)


In [209]:
len(test[0]), len(all_train_features[0])

(62, 64)

In [207]:
test[0]

array([  38.   ,   88.   ,    0.432,    7.   ,   20.   ,    0.35 ,
         18.   ,   28.   ,    0.643,   20.   ,   35.   ,   55.   ,
         14.   ,    3.   ,    7.   ,    8.   ,   20.   ,  101.   ,
          0.503,    0.472,    0.227,    0.318,   39.2  ,   79.5  ,
         57.9  ,   36.8  ,    3.4  ,   10.9  ,    7.4  ,  115.5  ,
        109.7  ,   35.   ,   81.   ,    0.432,    6.   ,   17.   ,
          0.353,   20.   ,   27.   ,    0.741,    9.   ,   31.   ,
         40.   ,   18.   ,    5.   ,    4.   ,    7.   ,   20.   ,
         96.   ,    0.517,    0.469,    0.21 ,    0.333,   20.5  ,
         60.8  ,   42.1  ,   51.4  ,    5.7  ,    5.9  ,    7.   ,
        109.7  ,  115.5  ])

# Classification

In [157]:
### Support Vector Machines ###
svm = SVC()
svm.fit(train_features_2014, train_outputs_2014)
# svm.fit(train_features_2015, train_outputs_2015)


print("Predicted Losses - ", svm.predict(average_features_2014).tolist().count(0))
print("Predicted Wins - ", svm.predict(average_features_2014).tolist().count(1))

# Row value is actual, column value is prediction
print("2014 Confusion Matrix")
print(confusion_matrix(svm.predict(average_features_2014), test_set_outputs_2014))
print("2015 Confusion Matrix")
print(confusion_matrix(svm.predict(average_features_2015), test_set_outputs_2015))

score_2014 = svm.score(average_features_2014, test_set_outputs_2014)
score_2015 = svm.score(average_features_2015, test_set_outputs_2015)

print("2014 -", score_2014)
print("2015 -", score_2015)

Predicted Losses -  315
Predicted Wins -  301
2014 Confusion Matrix
[[211 104]
 [ 97 204]]
2015 Confusion Matrix
[[192 122]
 [110 180]]
2014 - 0.673701298701
2015 - 0.615894039735


In [158]:
### Neural Networks (Feedforward) ###

from sklearn.neural_network import MLPClassifier
clf = None
clf = MLPClassifier(solver='adam', alpha=1e-5, hidden_layer_sizes=(50,50), max_iter=200)
clf.fit(train_features_2014, train_outputs_2014)
#clf.fit(train_features_2015, train_outputs_2015)

print("Predicted Losses - ", clf.predict(average_features_2014).tolist().count(0))
print("Predicted Wins - ", clf.predict(average_features_2014).tolist().count(1))

# Row value is actual, column value is prediction
print("2014 Confusion Matrix")
print(confusion_matrix(clf.predict(average_features_2014), test_set_outputs_2014))
print("2015 Confusion Matrix")
print(confusion_matrix(clf.predict(average_features_2015), test_set_outputs_2015))

score_2014 = clf.score(average_features_2014, test_set_outputs_2014)
score_2015 = clf.score(average_features_2015, test_set_outputs_2015)

print("2014 -", score_2014)
print("2015 -", score_2015)

Predicted Losses -  142
Predicted Wins -  474
2014 Confusion Matrix
[[107  35]
 [201 273]]
2015 Confusion Matrix
[[ 83  32]
 [219 270]]
2014 - 0.616883116883
2015 - 0.584437086093


In [160]:
### Logistic Regression ###

logistic_regression_classifier = linear_model.LogisticRegression(penalty='l1')
logistic_regression_classifier.fit(train_features_2015, train_outputs_2015)

# print(logistic_regression_classifier.coef_)
# for i,v in enumerate(logistic_regression_classifier.coef_[0]):
#     if v!=0:
#         print(i)

print("Predicted Losses - ", logistic_regression_classifier.predict(average_features_2014).tolist().count(0))
print("Predicted Wins - ", logistic_regression_classifier.predict(average_features_2014).tolist().count(1))

# Row value is actual, column value is prediction
print("2014 Confusion Matrix")
print(confusion_matrix(logistic_regression_classifier.predict(average_features_2014), test_set_outputs_2014))
print("2015 Confusion Matrix")
print(confusion_matrix(logistic_regression_classifier.predict(average_features_2015), test_set_outputs_2015))

score_2014 = logistic_regression_classifier.score(average_features_2014, test_set_outputs_2014)
score_2015 = logistic_regression_classifier.score(average_features_2015, test_set_outputs_2015)

print("2014 -", score_2014)
print("2015 -", score_2015)


Predicted Losses -  308
Predicted Wins -  308
2014 Confusion Matrix
[[212  96]
 [ 96 212]]
2015 Confusion Matrix
[[190 112]
 [112 190]]
2014 - 0.688311688312
2015 - 0.629139072848


In [142]:
len(team_data_2015.dropna(axis=1).columns)

36

# Regression

In [11]:
train_features_regression, train_outputs_regression = generate_game_features(train_games, player_data_2015, team_data_2015, False)

In [12]:
average_features_regression, test_set_outputs_regression = generate_average_vectors(test_games, player_data_2015, team_data_2015, False)

Finished generating team averages


In [49]:
# Standard OLS
# linearRegressionModel = linear_model.LinearRegression()
# linearRegressionModel.fit(train_features_regression, train_outputs_regression)
# print(linearRegressionModel.score(average_features_regression, test_set_outputs_regression))

# Ridge Regression
ridge_regression_model = linear_model.Ridge(alpha=1.0)
ridge_regression_model.fit(train_features_regression, train_outputs_regression)

predicted_outputs = ridge_regression_model.predict(average_features_regression)


print(mean_squared_error(predicted_outputs))

1.47267603319e-14
13.0724855429
