In [145]:
import _pickle as pickle
import pandas as pd
import numpy as np

from sklearn import linear_model
from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import confusion_matrix
from sklearn.svm import SVC

from feature_extraction import *

In [51]:
from IPython.core.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>"))

In [2]:
%load_ext autoreload
%autoreload 2

In [156]:
game_list_2014 = load_game_list('Game_List_2014.p')
player_data_2014 = pd.DataFrame.from_csv("Player_Data_2014.csv")
team_data_2014 = pd.DataFrame.from_csv("Team_Data_2014.csv")

game_list_2015 = load_game_list('Game_List_2015.p')
player_data_2015 = pd.DataFrame.from_csv("Player_Data_2015.csv")
team_data_2015 = pd.DataFrame.from_csv("Team_Data_2015.csv")

#features, outputs = generate_game_features(game_list_2015, player_data_2015, team_data_2015, True)

# After getting the game lists, split into training and test sets

test_set_ratio_2014 = .25
test_set_ratio_2015 = .25

train_games_2014, test_games_2014 = train_test_split(game_list_2014, test_size = test_set_ratio_2014)
train_games_2015, test_games_2015 = train_test_split(game_list_2015, test_size = test_set_ratio_2015)

# Change this function in feature_extraction to modify which features are used
train_features_2014, train_outputs_2014 = generate_game_features(train_games_2014, player_data_2014, team_data_2014, True)
train_features_2015, train_outputs_2015 = generate_game_features(train_games_2015, player_data_2015, team_data_2015, True)

average_features_2014, test_set_outputs_2014 = generate_average_vectors(test_games_2014, player_data_2014, team_data_2014, True)
average_features_2015, test_set_outputs_2015 = generate_average_vectors(test_games_2015, player_data_2015, team_data_2015, True)

Finished generating team averages
Finished generating team averages


# Classification

In [157]:
### Support Vector Machines ###
svm = SVC()
svm.fit(train_features_2014, train_outputs_2014)
# svm.fit(train_features_2015, train_outputs_2015)


print("Predicted Losses - ", svm.predict(average_features_2014).tolist().count(0))
print("Predicted Wins - ", svm.predict(average_features_2014).tolist().count(1))

# Row value is actual, column value is prediction
print("2014 Confusion Matrix")
print(confusion_matrix(svm.predict(average_features_2014), test_set_outputs_2014))
print("2015 Confusion Matrix")
print(confusion_matrix(svm.predict(average_features_2015), test_set_outputs_2015))

score_2014 = svm.score(average_features_2014, test_set_outputs_2014)
score_2015 = svm.score(average_features_2015, test_set_outputs_2015)

print("2014 -", score_2014)
print("2015 -", score_2015)

Predicted Losses -  315
Predicted Wins -  301
2014 Confusion Matrix
[[211 104]
 [ 97 204]]
2015 Confusion Matrix
[[192 122]
 [110 180]]
2014 - 0.673701298701
2015 - 0.615894039735


In [158]:
### Neural Networks (Feedforward) ###

from sklearn.neural_network import MLPClassifier
clf = None
clf = MLPClassifier(solver='adam', alpha=1e-5, hidden_layer_sizes=(50,50), max_iter=200)
clf.fit(train_features_2014, train_outputs_2014)
#clf.fit(train_features_2015, train_outputs_2015)

print("Predicted Losses - ", clf.predict(average_features_2014).tolist().count(0))
print("Predicted Wins - ", clf.predict(average_features_2014).tolist().count(1))

# Row value is actual, column value is prediction
print("2014 Confusion Matrix")
print(confusion_matrix(clf.predict(average_features_2014), test_set_outputs_2014))
print("2015 Confusion Matrix")
print(confusion_matrix(clf.predict(average_features_2015), test_set_outputs_2015))

score_2014 = clf.score(average_features_2014, test_set_outputs_2014)
score_2015 = clf.score(average_features_2015, test_set_outputs_2015)

print("2014 -", score_2014)
print("2015 -", score_2015)

Predicted Losses -  142
Predicted Wins -  474
2014 Confusion Matrix
[[107  35]
 [201 273]]
2015 Confusion Matrix
[[ 83  32]
 [219 270]]
2014 - 0.616883116883
2015 - 0.584437086093


In [160]:
### Logistic Regression ###

logistic_regression_classifier = linear_model.LogisticRegression(penalty='l1')
logistic_regression_classifier.fit(train_features_2015, train_outputs_2015)

# print(logistic_regression_classifier.coef_)
# for i,v in enumerate(logistic_regression_classifier.coef_[0]):
#     if v!=0:
#         print(i)

print("Predicted Losses - ", logistic_regression_classifier.predict(average_features_2014).tolist().count(0))
print("Predicted Wins - ", logistic_regression_classifier.predict(average_features_2014).tolist().count(1))

# Row value is actual, column value is prediction
print("2014 Confusion Matrix")
print(confusion_matrix(logistic_regression_classifier.predict(average_features_2014), test_set_outputs_2014))
print("2015 Confusion Matrix")
print(confusion_matrix(logistic_regression_classifier.predict(average_features_2015), test_set_outputs_2015))

score_2014 = logistic_regression_classifier.score(average_features_2014, test_set_outputs_2014)
score_2015 = logistic_regression_classifier.score(average_features_2015, test_set_outputs_2015)

print("2014 -", score_2014)
print("2015 -", score_2015)


Predicted Losses -  308
Predicted Wins -  308
2014 Confusion Matrix
[[212  96]
 [ 96 212]]
2015 Confusion Matrix
[[190 112]
 [112 190]]
2014 - 0.688311688312
2015 - 0.629139072848


In [142]:
len(team_data_2015.dropna(axis=1).columns)

36

# Regression

In [11]:
train_features_regression, train_outputs_regression = generate_game_features(train_games, player_data_2015, team_data_2015, False)

In [12]:
average_features_regression, test_set_outputs_regression = generate_average_vectors(test_games, player_data_2015, team_data_2015, False)

Finished generating team averages


In [49]:
# Standard OLS
# linearRegressionModel = linear_model.LinearRegression()
# linearRegressionModel.fit(train_features_regression, train_outputs_regression)
# print(linearRegressionModel.score(average_features_regression, test_set_outputs_regression))

# Ridge Regression
ridge_regression_model = linear_model.Ridge(alpha=1.0)
ridge_regression_model.fit(train_features_regression, train_outputs_regression)

predicted_outputs = ridge_regression_model.predict(average_features_regression)


difference = np.array(predicted_outputs) - np.array(test_set_outputs_regression)
print(np.average(difference))
print(np.std(difference))
# print(ridge_regression_model.score(average_features_regression, test_set_outputs_regression))

# Lasso Regression
# lasso_regression_model = linear_model.Lasso(alpha=1.0, max_iter=5000)
# lasso_regression_model.fit(train_features_regression, train_outputs_regression)
# print(lasso_regression_model.coef_)

# # Visualizing L1 to 0
# for i,v in enumerate(lasso_regression_model.coef_):
#     if v!=0:
#         print (i)
        
# used_data = team_data_2015.dropna(axis=1)

# print(lasso_regression_model.score(average_features_regression, test_set_outputs_regression))

1.47267603319e-14
13.0724855429
