In [1]:
import numpy as np
import pandas as pd
import datetime as dt
import time
from sklearn.model_selection import train_test_split
from __future__ import division

In [2]:
# Read in data
filename = 'Algorithms_Data.csv'
data = pd.read_csv(filename)
data = data[-15000:]
targets = data['Win']
data = data[['fran_elo', 'opp_elo', 'RollAvg_A_5_pts', 'RollAvg_B_5_pts', 'RollAvg_A_5_opp_pts', 'RollAvg_B_5_opp_pts', 'Days_Since_Last']]

In [3]:
def runAlgorithm(model, trials=10,):
    t1 = []
    t2 = []
    preds =[]
    for t in range(trials):
        X_train, X_test, y_train, y_test = train_test_split(data, targets, test_size=.2)
        t1.append(time.time())

        model.fit(X_train, y_train)
        preds.append((model.predict(X_test)>.5) == y_test)
        t2.append(time.time())
        
    return np.mean(preds),np.mean(np.array(t2)-np.array(t1))


# Baseline
Guess that the home team wins every game.  If a method doesn't do better than this, it isn't learning much.

In [9]:
t1 = []
t2 = []
preds = []
for t in range(10):
    X_train, X_test, y_train, y_test = train_test_split(data, targets, test_size=.2)
    t1.append(time.time())

    preds.append((np.zeros(len(y_test)) == y_test).mean())
    t2.append(time.time())

baseline_ = np.mean(preds), np.mean(np.array(t2)-np.array(t1))
baseline_

(0.59779999999999989, 0.00042645931243896485)

# SVM

In [10]:
from sklearn import svm
from sklearn import preprocessing

# Normalize data
X = preprocessing.scale(X_train)

accuracy = []
times = []
for i in xrange(11):
    X_train, X_test, y_train, y_test = train_test_split(data, targets, test_size=.2)
    X = preprocessing.scale(X_train)
    s = time.time()
    SVM_model = svm.SVC(C=10**-5, kernel='poly', coef0=0, gamma=10**-2).fit(X, y_train)
    preds = SVM_model.predict(X_test)
    prob = np.sum(preds == y_test)/len(y_test)
    times.append(time.time() - s)
    accuracy.append(prob)

svm_r = (np.mean(accuracy), np.mean(times))
print svm_r

(0.66039393939393942, 3.9025016047737817)


# Logistic Regression

In [11]:
from sklearn import linear_model as lm

lam = 10**2

model = lm.LogisticRegression(C=np.abs(1/lam))
log_reg = runAlgorithm(model, trials=100)
print log_reg

(0.75888666666666671, 0.046206696033477782)


# Ridge Regression 

In [12]:
from sklearn.linear_model import Ridge as ri

skRidge = ri()
rid_reg = runAlgorithm(model,trials = 10)
print rid_reg

(0.75860000000000005, 0.086951708793640142)


# Gradient Boost 

In [13]:
from sklearn.ensemble import GradientBoostingClassifier as GBC

#start = time.time()
gbc = GBC(max_leaf_nodes=500, min_weight_fraction_leaf=0.001, min_samples_split=100, learning_rate=.4, max_features="auto")
grad = runAlgorithm(gbc, trials=100)
print grad

(0.74092666666666662, 0.76354182243347168)


# MLP

In [14]:
from sklearn.neural_network import MLPClassifier as mlp

start = time.time()
gbm = mlp(solver="lbfgs", activation='tanh', tol=1e-4, alpha=1e-5)#.fit(X_train,Y_train)
mlp_r = runAlgorithm(gbm, trials=10)
print mlp_r

(0.74706666666666666, 5.2733932018280028)


# Random Forests

In [15]:
from sklearn.ensemble import RandomForestClassifier as RFC

rfc = RFC(min_samples_split=3, n_estimators =100)
forest = runAlgorithm(rfc, trials=10)
print forest

(0.7399, 1.6552239894866942)


# Decision Tree

In [16]:
from sklearn.tree import DecisionTreeClassifier as DTC

dtc = DTC(criterion='entropy')
tree = runAlgorithm(dtc, trials=10)
print tree

(0.66620000000000001, 0.11191978454589843)


# XGBoost

In [7]:
import xgboost as xgb
accuracy = []
times = []
for i in xrange(11):
    X_train, X_test, y_train, y_test = train_test_split(data, targets, test_size=.2)
    start = time.time()
    dtrain = xgb.DMatrix(X_train, y_train)
    dtest = xgb.DMatrix(X_test)

    bst = xgb.train({}, dtrain)
    preds = bst.predict(dtest) > .5
    accuracy.append((preds == y_test).mean())
    times.append(time.time() -start)
    #print preds

xgb_r = (np.mean(accuracy), np.mean(times))
print xgb_r

(0.74051515151515157, 0.063956282355568619)


# Algorithms we chose not to use

### Nearest neighbor

Due to the fact that we are not performing any kind of cluster analysis, we have decided the NN-classifying and NN-regression algorithms are not useful. 

### Gaussian discriminant analysis

Similar to the issues with nearest neighbor, since we are not doing any kind of classification, GDA will not be useful with our data set or our project.

### Mixture models with latent variables (train with EM)

Because there is not an unknown distribution in our dataset, mixture models along with EM will not work with our data set.


### Kalman Filters

Our data is not a true time series because we are not interested in how teams' overall performances changes over time. Games are decided strictly by winners and losers, so there is not any measurable error or noise. Because of this we have determined that it will not be useful in our methods. 

### AR, MA, ARMA, ARIMA time series models

Because these models are trying to describe certain time-varying processes of a time series, they will not be helpful in trying to predict wins and losses of specific games. 

# Model Comparison

In [17]:
comps = pd.DataFrame(columns=['Algorithm', 'Accuracy', 'Time'])
comps['Algorithm'] = ['Baseline', 'SVM', 'Logistic Regression', 'Ridge Regression', 
                      'Gradient Boost', 'XGBoost',
                      'MLP', 'Decision Tree', 'Random Forest']

results = [baseline_, svm_r, log_reg, rid_reg, grad, xgb_r, mlp_r, tree, forest]
comps['Accuracy'] = [m[0] for m in results]
comps['Time'] = [m[1] for m in results]
comps

Unnamed: 0,Algorithm,Accuracy,Time
0,Baseline,0.5978,0.000426
1,SVM,0.660394,3.902502
2,Logistic Regression,0.758887,0.046207
3,Ridge Regression,0.7586,0.086952
4,Gradient Boost,0.740927,0.763542
5,XGBoost,0.740515,0.063956
6,MLP,0.747067,5.273393
7,Decision Tree,0.6662,0.11192
8,Random Forest,0.7399,1.655224
