In [75]:
import numpy as np
import pandas as pd
import datetime as dt
import time
from sklearn.model_selection import train_test_split
from __future__ import division

In [78]:
# Read in data
filename = 'Algorithms_Data.csv'
data = pd.read_csv(filename)
data = data[-15000:]
targets = data['Win']
data = data[['A_elo', 'B_elo', 'RollAvg_A_5_pts', 'RollAvg_B_5_pts', 'RollAvg_A_5_opp_pts', 'RollAvg_B_5_opp_pts', 'Days_Since_Last']]

In [79]:
data.head()

Unnamed: 0,A_elo,B_elo,RollAvg_A_5_pts,RollAvg_B_5_pts,RollAvg_A_5_opp_pts,RollAvg_B_5_opp_pts,Days_Since_Last
32530,1489.768214,1684.042297,91.8,95.2,96.2,86.8,1.0
32531,1569.508971,1558.067109,83.8,93.2,77.4,84.4,1.0
32532,1687.615767,1528.192875,107.2,98.2,94.2,92.2,1.0
32533,1509.910406,1448.525491,101.2,100.0,106.8,97.6,1.0
32534,1456.563819,1571.171564,97.4,93.6,91.2,83.6,3.0


The above table is a sample of the data we will be using in order to predict whether the home team will win an NBA basketball game. Each row represents one match up between two basketball teams. Team A is the home team, team B is the away team.  

The ELO rating system is a method for calculating the relative skill levels of players in two-player games such as chess. We have adapted this system in order to measure skill levels of different NBA teams based on their recent performance.  We computed the ELO ratings for each team over the history of our dataset, and we use this as a feature.

A_elo: The ELO ranking for the home team (A) at the time this game was played.

B_elo: The ELO ranking for the away team (B) at the time this game was played.

RollAvg_A_5_pts: For the home team, the average points this specific team has scored in their last 5 games.

RollAvg_B_5_pts: For the away team, the average points this specific team has scored in their last 5 games.

RollAvg_A_5_opp_pts: For the home team, the average points this specific team has allowed in their last 5 games.

RollAvg_B_5_opp_pts: For away team, the average points this specific team has allowed in their last 5 games.

Days_Since_Last: Number of days since the home team last played a game.

For more information on ELO, see our Machine Learning page

In [57]:
def runAlgorithm(model, trials=10,):
    """ Run a given machine learning algorithm on globally defined data.
        Splits data randomly into test and training sets.
        Returns the average time and accuracy.
    
    :param model: A sklearn model initialized with parameters.
    :param trials: Number of trials to run.  Defaults to 10.
    """
    
    t1 = []
    t2 = []
    preds =[]
    for t in range(trials):
        X_train, X_test, y_train, y_test = train_test_split(data, targets, test_size=.2)
        t1.append(time.time())

        model.fit(X_train, y_train)
        preds.append((model.predict(X_test)>.5) == y_test)
        t2.append(time.time())
        
    return np.mean(preds),np.mean(np.array(t2)-np.array(t1))


# Application of ML algorithms

## Baseline
Guess that the home team wins every game.  If a method doesn't do better than this, it isn't learning much.

In [37]:
t1 = []
t2 = []
preds = []
for t in range(10):
    X_train, X_test, y_train, y_test = train_test_split(data, targets, test_size=.2)
    t1.append(time.time())

    preds.append((np.zeros(len(y_test)) == y_test).mean())
    t2.append(time.time())

baseline_ = np.mean(preds), np.mean(np.array(t2)-np.array(t1))
print 'Accuracy: {}\nTime: {}'.format(*baseline_)

Accuracy: 0.593766666667
Time: 0.000300002098083


## SVM

In [38]:
from sklearn import svm
from sklearn import preprocessing

# Normalize data
X = preprocessing.scale(X_train)

accuracy = []
times = []
for i in xrange(11):
    X_train, X_test, y_train, y_test = train_test_split(data, targets, test_size=.2)
    X = preprocessing.scale(X_train)
    s = time.time()
    SVM_model = svm.SVC(C=10**-5, kernel='poly', coef0=0, gamma=10**-2).fit(X, y_train)
    preds = SVM_model.predict(X_test)
    prob = np.sum(preds == y_test)/len(y_test)
    times.append(time.time() - s)
    accuracy.append(prob)

svm_r = (np.mean(accuracy), np.mean(times))
print 'Accuracy: {}\nTime: {}'.format(*svm_r)

Accuracy: 0.652727272727
Time: 3.0811818513


## Logistic Regression

In [39]:
from sklearn import linear_model as lm

lam = 10**2

model = lm.LogisticRegression(C=np.abs(1/lam))
log_reg = runAlgorithm(model, trials=100)
print 'Accuracy: {}\nTime: {}'.format(*log_reg)

Accuracy: 0.759096666667
Time: 0.0327799868584


## Ridge Regression 

In [None]:
from sklearn.linear_model import Ridge as ri

skRidge = ri()
rid_reg = runAlgorithm(model,trials = 10)
print 'Accuracy: {}\nTime: {}'.format(*rid_reg)

## Linear Discriminant Analysis

In [45]:
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis

model = LinearDiscriminantAnalysis(solver='svd')
lda_ = runAlgorithm(model,100)
print 'Accuracy: {}\nTime: {}'.format(*lda_)

Accuracy: 0.758602666667
Time: 0.00719099807739


## Quadratic Discriminant Analysis

In [49]:
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis

model = QuadraticDiscriminantAnalysis()
qda_= runAlgorithm(model, 100)
print 'Accuracy: {}\nTime: {}'.format(*qda_)

Accuracy: 0.75611
Time: 0.00387000083923


## Gradient Boost 

In [41]:
from sklearn.ensemble import GradientBoostingClassifier as GBC

#start = time.time()
gbc = GBC(max_leaf_nodes=500, min_weight_fraction_leaf=0.001, min_samples_split=100, learning_rate=.4, max_features="auto")
grad = runAlgorithm(gbc, trials=100)
print 'Accuracy: {}\nTime: {}'.format(*grad)

Accuracy: 0.741476666667
Time: 0.720160024166


# MLP

In [42]:
from sklearn.neural_network import MLPClassifier as mlp

start = time.time()
gbm = mlp(solver="lbfgs", activation='tanh', tol=1e-4, alpha=1e-5)#.fit(X_train,Y_train)
mlp_r = runAlgorithm(gbm, trials=10)
print 'Accuracy: {}\nTime: {}'.format(*mlp_r)

Accuracy: 0.732533333333
Time: 7.18240003586


# Random Forests

In [43]:
from sklearn.ensemble import RandomForestClassifier as RFC

rfc = RFC(min_samples_split=3, n_estimators =100)
forest = runAlgorithm(rfc, trials=10)
print 'Accuracy: {}\nTime: {}'.format(*forest)

Accuracy: 0.7428
Time: 1.59470002651


# Decision Tree

In [44]:
from sklearn.tree import DecisionTreeClassifier as DTC

dtc = DTC(criterion='entropy')
tree = runAlgorithm(dtc, trials=100)
print 'Accuracy: {}\nTime: {}'.format(*tree)

Accuracy: 0.664233333333
Time: 0.0975999832153


# XGBoost

In [7]:
import xgboost as xgb
accuracy = []
times = []
for i in xrange(10):
    X_train, X_test, y_train, y_test = train_test_split(data, targets, test_size=.2)
    start = time.time()
    dtrain = xgb.DMatrix(X_train, y_train)
    dtest = xgb.DMatrix(X_test)

    param = {'objective':'multi:softmax', 'num_class':5,'normalize_type':'forest','rate_drop':1, 'lambda':50, 'alpha':10}
    bst = xgb.train(param, dtrain)
    preds = bst.predict(dtest) > .5
    accuracy.append((preds == y_test).mean())
    times.append(time.time() -start)
    #print preds

xgb_r = (np.mean(accuracy), np.mean(times))
print 'Accuracy: {}\nTime: {}'.format(*xgb_r)

(0.74051515151515157, 0.063956282355568619)


# Model Comparison

In [73]:
comps = pd.DataFrame(columns=['Algorithm', 'Accuracy', 'Time'])
comps['Algorithm'] = ['Baseline', 'SVM', 'Logistic Regression', 'Ridge Regression', 
                      'Gradient Boost', 'XGBoost',
                      'MLP', 'Decision Tree', 'Random Forest', 'LDA', 'QDA']

results = [baseline_, svm_r, log_reg, rid_reg, grad, xgb_r, mlp_r, tree, forest, lda_, qda_]
comps['Accuracy'] = [m[0] for m in results]
comps['Time'] = [m[1] for m in results]
comps.sort_values('Accuracy', ascending=False).reset_index(drop=True)

Unnamed: 0,Algorithm,Accuracy,Time
0,Ridge Regression,0.761733,0.0315
1,Logistic Regression,0.759097,0.03278
2,LDA,0.758603,0.007191
3,QDA,0.75611,0.00387
4,Random Forest,0.7428,1.5947
5,Gradient Boost,0.741477,0.72016
6,MLP,0.732533,7.1824
7,Decision Tree,0.664233,0.0976
8,SVM,0.652727,3.081182
9,Baseline,0.593767,0.0003


In [71]:
comps.sort_values('Time').reset_index(drop=True)

Unnamed: 0,Algorithm,Accuracy,Time
0,XGBoost,0.0,0.0
1,Baseline,0.593767,0.0003
2,QDA,0.75611,0.00387
3,LDA,0.758603,0.007191
4,Ridge Regression,0.761733,0.0315
5,Logistic Regression,0.759097,0.03278
6,Decision Tree,0.664233,0.0976
7,Gradient Boost,0.741477,0.72016
8,Random Forest,0.7428,1.5947
9,SVM,0.652727,3.081182


### Evaluate accuracy and effectiveness of the methods you used and compare the relative strengths and weaknesses of each method for this particular project.  

The methods that we focused on were regression based methods, tree based methods, neural nets, and SVMs. 

#### Regression based
Methods used: Logistic Regression, Ridge Regression

Both methods worked extremely well, with accuracies around 76% and running in only ~.03 seconds. 

#### Tree based
Methods used: Decision Tree, Random Forest, Gradient Boost, XGBoost

Random Forests ran about as accurately as some of our better performers at 74%, but is a much slower method running in about a second and a half. 

As expected, Decision Trees ran less accurately than Random Forests at 67% but runs much faster at ~.1 seconds. Decision Trees are one of the fasted methods we tested.

XGBoost was among the best of the tree methods, both fast and respectably accurate.  However we could not quite get it to surpass the regression methods in accuracy, and those take only half the time.

Gradient Boost performed just under XGBoost both in time and accuracy.


#### Gaussian Discriminant Analysis
Methods used: Linear Discriminant Analysis, Quadratic Discriminant Analysis

Despite their simplicity and the lack of parameters to tune, these two methods both achieve accuracy above 75% and run quickly. They are not quite as accurate as the regression methods, but they are about an order of magnitude faster. The fact that this works so well suggests that the wins and losses are somewhat linearly separable.


#### Neural nets
Methods used: MLP

Our Multi Layer Perception (MLP) was as accurate as many of our top performers at ~75%, but was by far our slowest algorithm taking nearly 7.5 seconds to run. As such even though it is accurate, we would not choose to continue working with it because it is so much slower than other algorithms that are equally accurate.

#### SVM
Our Support Vector Machine ran the worst accuracy out of any of our other methods at 65%. It also was our second slowest methods running at nearly 3 seconds. Because it was outperformed by almost all of our other tests in both categories, we will not continue working with this method.

# Algorithms we chose not to use

### Nearest neighbor

Due to the fact that we are not performing any kind of cluster analysis, we have decided the NN-classifying and NN-regression algorithms are not useful. 

### Mixture models with latent variables (train with EM)

Because there is not an unknown distribution in our dataset, mixture models along with EM will not work with our data set.


### Kalman Filters

Our data is not a true time series because we are not interested in how teams' overall performances changes over time. Games are decided strictly by winners and losers, so there is not any measurable error or noise. Because of this we have determined that it will not be useful in our methods. 

### AR, MA, ARMA, ARIMA time series models

Because these models are trying to describe certain time-varying processes of a time series, they will not be helpful in trying to predict wins and losses of specific games. 