In [1]:
import matplotlib.pyplot as plt
from graphviz import Source
from sklearn import svm, metrics, grid_search, tree
from sklearn.tree import export_graphviz
from sklearn.neighbors import KNeighborsClassifier
from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression
import seaborn as sns
from sklearn.datasets import load_iris
from pandas import DataFrame
import numpy as np
from sklearn.ensemble import ExtraTreesClassifier



In [2]:
# import normalized deltas for training and testing data
# for the sake of reproducibility, the training data contains the 2015 and 2016 seasons, and testing contains the 2017 season
# No random selection of training and testing data is used here

training = DataFrame.from_csv("/Users/Jojo/week-one-cfb-predictions/data/raw_data_normalized_t.csv")
testing = DataFrame.from_csv("/Users/Jojo/week-one-cfb-predictions/data/raw_data_normalized_test.csv")
training = training.dropna(axis=1, how='all')
testing = testing.dropna(axis=1, how='all')
training = training.dropna(axis=0, how='any')
testing = testing.dropna(axis=0, how='any')
trainLabel = training['Label']
testLabel = testing['Label']

In [3]:
# Now drop the unwanted data, which includes team names and labels
# Games will be classified as a win or loss for the 'Home' team, indicated by a 1 or 0, respectively

training = training.drop('away_team', 1).drop('home_team', 1).drop("Label", 1)
testing = testing.drop('away_team', 1).drop('home_team', 1).drop("Label", 1)

In [4]:
# Although this is overwritten by the model, this classifier shows how the importance of each feature is weighted in the context of feature elimination

model = ExtraTreesClassifier(n_estimators=100)
model.fit(training.as_matrix(), trainLabel.as_matrix())
#print(sorted(model.feature_importances_))

# To view the calculated importances of the features based on accuracy, uncomment the print statement above. 
# Note that some of the imporances are 0.0, meaning that they are of no use to the model

ExtraTreesClassifier(bootstrap=False, class_weight=None, criterion='gini',
           max_depth=None, max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=1,
           oob_score=False, random_state=None, verbose=0, warm_start=False)

In [12]:
# Now we reassign the model and apply the RFE package with 17 features

model = ExtraTreesClassifier(n_estimators=100)
rfe = RFE(model, 17)
fit = rfe.fit(training.as_matrix(), trainLabel.as_matrix())

In [6]:
# We have our desired features, so now need to rebuild the training and testing sets
# This means that we have to iterate through the rankings from the previous step to create new DataFrames

dummy = DataFrame()
dtest = DataFrame()

for i in range(len(fit.ranking_)):
    if fit.ranking_[i] == 1:
        #print(list(training.columns)[i], fit.support_[i])
        dummy[list(training.columns)[i]] = training[list(training.columns)[i]]
        dtest[list(testing.columns)[i]] = testing[list(testing.columns)[i]]

In [7]:
model.fit( dummy.as_matrix(), trainLabel.as_matrix())

print("Confusion Matrix:\n", metrics.confusion_matrix( np.array(testLabel), model.predict( dtest.as_matrix() )))
print("Accuracy:", metrics.accuracy_score( np.array(testLabel), model.predict( dtest.as_matrix() )))
print("F1 Score:", metrics.f1_score( np.array(testLabel), model.predict( dtest.as_matrix() )))
print("Precision:", metrics.precision_score( np.array(testLabel), model.predict( dtest.as_matrix() )))
print("Recall:", metrics.recall_score( np.array(testLabel), model.predict( dtest.as_matrix() )))

# These are the metrics for the ExtraTrees classifier, as modeled above.

Confusion Matrix:
 [[11 12]
 [ 2 49]]
Accuracy: 0.810810810811
F1 Score: 0.875
Precision: 0.803278688525
Recall: 0.960784313725


In [11]:
# Note that, by the nature of the ExtraTrees Classifier, the results of each test will vary from the others
# Now, we test this same dataset with an rbf-based SVM classifier to view the results.
# grid_seachCV is used for C and gamma value selection.

model = machine = grid_search.GridSearchCV( svm.SVC( kernel='rbf', degree=3 ), cv=2, param_grid = {"C": [ 4, 2, 1, 0.1, 0.01, 0.001], "gamma": np.logspace(-2, 2, 5)})
model.fit( dummy.as_matrix(), trainLabel.as_matrix())
    
#print("Prediction:", model.predict( dtest.as_matrix() ).round())
#print("Real Value:", np.array(testLabel))
    
print("Confusion Matrix:\n", metrics.confusion_matrix( np.array(testLabel), model.predict( dtest.as_matrix() ).round()))
print("Accuracy:", metrics.accuracy_score( np.array(testLabel), model.predict( dtest.as_matrix() ).round()))
print("F1 Score:", metrics.f1_score( np.array(testLabel), model.predict( dtest.as_matrix() ).round()))
print("Precision:", metrics.precision_score( np.array(testLabel), model.predict( dtest.as_matrix() ).round()))
print("Recall:", metrics.recall_score( np.array(testLabel), model.predict( dtest.as_matrix() ).round()))

# Note: we need to round the prediction results here.

Confusion Matrix:
 [[10 13]
 [ 2 49]]
Accuracy: 0.797297297297
F1 Score: 0.867256637168
Precision: 0.790322580645
Recall: 0.960784313725


In [13]:
# Since the features are carried over from the ExtraTrees Classifier, there is still random selection affecting these values
# As a result, running this code through a loop allows us to find which features RFE tends to favor
# To test this, run this with k=100 iterations

freqs = dict()

for k in range(100):
    model = ExtraTreesClassifier()
    rfe = RFE(model, 17)
    fit = rfe.fit(training.as_matrix(), trainLabel.as_matrix())
    
    dummy = DataFrame()
    dtest = DataFrame()
    
    for i in range(len(fit.ranking_)):
        if fit.ranking_[i] == 1:
            #print(list(training.columns)[i], fit.support_[i])
            dummy[list(training.columns)[i]] = training[list(training.columns)[i]]
            dtest[list(testing.columns)[i]] = testing[list(testing.columns)[i]]
            try:
                freqs[list(training.columns)[i]] += 1
            except KeyError:
                freqs[list(training.columns)[i]] = 1


import operator
for i in sorted(freqs, key=operator.itemgetter(1), reverse=True):
    print (i, freqs[i])


d_ Opponent_Fumble_Recovery_Percentage 8
d_ 2nd_Half_Points/Game 62
d_ Opponent_2nd_Half_Points/Game 9
d_ Yards_per_Game 43
d_ Fourth_Downs_per_Game 12
d_ Punt_Attempts_per_Game 25
d_ 1st_Quarter_Points/Game 15
d_ Offensive_Points_per_Game_(Estimated) 76
d_ Yards_per_Point 31
d_ Points_per_Play 69
d_ 1st_Half_Points/Game 32
d_ Opponent_Average_Scoring_Margin 46
d_ Opponent_Penalty_Yards_per_Game 7
d_ Yards_per_Play 13
d_RecPoints 63
d_OffPassSP 66
d_OffSDSP 30
d_ Opponent_Offensive_Touchdowns_per_Game 6
d_ Third_Down_Conversions_per_Game 7
d_ 4th_Quarter_Time_of_Possession_Share_% 22
d_ Opponent_Red_Zone_Scoring_Percentage_(TDs_and_FGs) 26
d_ Offensive_Touchdowns_per_Game 19
d_ 3rd_Quarter_Points/Game 37
d_ Average_Scoring_Margin 48
d_ Punts_per_Play 48
d_RecRank 51
d_OffSP 40
d_OffRushSP 16
d_ Fourth_Down_Conversions_per_Game 21
d_ Opp_Yards_per_Point 17
d_ Opponent_Third_Down_Conversion_Percentage 47
d_ Yards_per_Completion 50
d_ Opponent_Punts_per_Offensive_Score 9
d_ Punts_per_Offe