In [2]:
import numpy as np
import pandas as pd
import datetime as dt
import time
from sklearn.model_selection import train_test_split
from __future__ import division

In [3]:
# Read in data
filename = 'Algorithms_Data.csv'
data = pd.read_csv(filename)
data = data[-15000:]
targets = data['Win']
data = data[['fran_elo', 'opp_elo', 'RollAvg_A_5_pts', 'RollAvg_B_5_pts', 'RollAvg_A_5_opp_pts', 'RollAvg_B_5_opp_pts', 'Days_Since_Last']]

In [4]:
def runAlgorithm(model, trials=10):
    t1 = []
    t2 = []
    preds =[]
    for t in range(trials):
        X_train, X_test, y_train, y_test = train_test_split(data, targets, test_size=.2)
        t1.append(time.time())

        model.fit(X_train, y_train)
        preds.append(model.predict(X_test) == y_test)
        t2.append(time.time())
        
    return np.mean(preds),np.mean(np.array(t2)-np.array(t1))


# Baseline
Guess that the home team wins every game.  If a method doesn't do better than this, it isn't learning much.

In [39]:
print (np.zeros(300) == Y_test).mean()


0.573333333333


# SVM

In [19]:
from sklearn import svm
from sklearn import preprocessing

# Normalize data
X = preprocessing.scale(X_train)

s = time.time()
SVM_model = svm.SVC(C=10**-5, kernel='poly', coef0=0, gamma=10**-2).fit(X, Y_train)
preds = SVM_model.predict(X_test)
prob = np.sum(preds == Y_test)/len(Y_test)
svm_time = time.time() - s
print prob
print 'time: ', svm_time
#print preds

0.616666666667
time:  3.7990000248


# Logistic Regression

In [35]:
from sklearn import linear_model as lm

lam = 10**2

model = lm.LogisticRegression(C=np.abs(1/lam))
runAlgorithm(model,trials = 100)

(0.75880666666666663, 0.02807000160217285)

# Ridge Regression 

In [None]:
from sklearn.linear_model import Ridge as ri

skRidge = ri()
skRidge.fit(X_train, Y_train)
preds = skRidge.predict(X_test) > .5
print (preds == Y_test).mean()
print preds

# Gradient Boost 

In [7]:
from sklearn.ensemble import GradientBoostingClassifier as GBC

#start = time.time()
gbc = GBC(max_leaf_nodes=500, min_weight_fraction_leaf=0.001, min_samples_split=100, learning_rate=.4, max_features="auto")#.fit(X_train,Y_train)
"""preds = gbc.predict(X_test)
gradient_boost_time = time.time()-start
print (preds == Y_test).mean()
print 'time: ' + str(gradient_boost_time)"""
runAlgorithm(gbc, trials=100)

(0.74130666666666667, 0.77775991201400752)

# MLP

In [29]:
from sklearn.neural_network import MLPClassifier as mlp

start = time.time()
gbm = mlp(solver="lbfgs", activation='tanh', tol=1e-4, alpha=1e-5).fit(X_train,Y_train)
print (gbm.predict(X_test) == Y_test).mean()
mlp_time = time.time()-start
print 'time: ' + str(mlp_time)

0.723333333333
time: 8.35599994659


# Random Forests

In [None]:
from sklearn.ensemble import RandomForestClassifier as RFC

rfc = RFC(min_samples_split=3, n_estimators =100).fit(X_train,Y_train)
print (rfc.predict(X_test) == Y_test).mean()

# Fine tuned parameters, this is the best I could get

# Decision Tree

In [None]:
from sklearn.tree import DecisionTreeClassifier as DTC

dtc = DTC(criterion='entropy').fit(X_train,Y_train)
preds = dtc.predict(X_test)
print (preds == Y_test).mean()
print preds

# XGBoost

In [None]:
import xgboost as xgb
dtrain = xgb.DMatrix(X_train, Y_train)
dtest = xgb.DMatrix(X_test)

bst = xgb.train({}, dtrain)
preds = bst.predict(dtest) > .5
print (preds == Y_test).mean()
print preds


# Algorithms we chose not to use

### Nearest neighbor

Due to the fact that we are not performing any kind of cluster analysis, we have decided the NN-classifying and NN-regression algorithms are not useful. 

### Gaussian discriminant analysis

Similar to the issues with nearest neighbor, since we are not doing any kind of classification, GDA will not be useful with our data set or our project.

### Mixture models with latent variables (train with EM)

Because there is not an unknown distribution in our dataset, mixture models along with EM will not work with our data set.


### Kalman Filters

Our data is not a true time series because we are not interested in how teams' overall performances changes over time. Games are decided strictly by winners and losers, so there is not any measurable error or noise. Because of this we have determined that it will not be useful in our methods. 

### AR, MA, ARMA, ARIMA time series models

Because these models are trying to describe certain time-varying processes of a time series, they will not be helpful in trying to predict wins and losses of specific games. 