In [1]:
%matplotlib inline
import matplotlib.pyplot as plt
from sklearn import metrics
import numpy as np
import pandas as pd
import os
import random
import csv
from IPython.core.display import HTML
from sklearn.cross_validation import KFold
from sklearn.metrics import accuracy_score
from sklearn.cross_validation import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.grid_search import GridSearchCV
#load the two custom stylesheets and inject them into the notebook page;
#the files are read inside a with-block so both handles are closed right away
#instead of being left open until garbage collection
with open('style-table.css') as table_css_file, open('style-notebook.css') as notebook_css_file:
    css = table_css_file.read() + notebook_css_file.read()
#kept as the last expression of the cell so the notebook renders the <style> tag
HTML('<style>{}</style>'.format(css))



In [2]:
def generate_file_path(year, file_prefix=None):
    '''Generate the path to the csv file holding one year of NFL play by play data.

    year        -- season to load; converted with str(), so an int or a str works
    file_prefix -- optional text placed in front of the year (folder plus file name
                   stem). When omitted, the original hard-coded location is used,
                   so existing calls keep returning exactly the same path.

    Returns file_prefix + year + '.csv' as a string.
    '''
    if file_prefix is None:
        #default location of the pbp-<year>.csv files on the original author's machine
        file_prefix = 'C:\\Users\\abird\\Documents\\Springboard\\Capstone Project\\NFL\\pbp-'
    return file_prefix + str(year) + '.csv'



def cv_score(clf, x, y, score_func=accuracy_score):
    '''Return the mean of score_func over a 5-fold cross validation of clf.

    clf        -- estimator exposing fit(x, y) and predict(x); it is refit on every
                  fold, so after the call it holds the fit from the last fold
    x, y       -- arrays indexable by the integer index arrays produced by KFold
                  (y must expose .size)
    score_func -- metric called as score_func(y_true, y_pred)

    Returns the average fold score as a float.
    '''
    nfold = 5
    result = 0
    #KFold comes from the legacy sklearn.cross_validation module: KFold(n, n_folds)
    #is iterated directly and yields (train_indices, test_indices) pairs
    for train, test in KFold(y.size, nfold): # split data into train/test groups, 5 times
        clf.fit(x[train], y[train]) # fit
        #scikit-learn metrics take (y_true, y_pred); the order does not matter for
        #accuracy but it does for asymmetric metrics such as precision or recall
        result += score_func(y[test], clf.predict(x[test])) # evaluate on held-out data
    #float() keeps the average from being truncated if the scores are integers
    return result / float(nfold) # average



def load_data(filePath):
    '''Load a play by play csv file into a data frame and return it.

    filePath -- path of the csv file to read

    The first csv column is read as the index and then moved back into a regular
    column named GameId, so the file's first column must be called GameId
    (reset_index raises a KeyError otherwise). The returned frame has a default
    integer index.
    '''
    #pd.DataFrame.from_csv is deprecated and removed in newer pandas; read_csv with
    #index_col=0 and parse_dates=True is its documented equivalent
    df = pd.read_csv(filePath, index_col=0, parse_dates=True)
    #unindex GameId
    df = df.reset_index(level='GameId')
    return df



def generate_teams_lst(df):
    '''Return the distinct values of the OffenseTeam column as a sorted list.

    Missing entries are ignored and every remaining value appears exactly once.
    '''
    offense = df['OffenseTeam']
    #keep only the rows that actually name a team, then de-duplicate
    unique_teams = offense[offense.notnull()].drop_duplicates()
    return list(unique_teams.sort_values())


def preprocess_data(df):
    '''Apply the preprocessing steps to a raw play by play data frame.

    Steps, in order:
      1. sort the plays by GameId, Quarter (ascending) and Minute, Second (descending)
      2. keep only RUSH and PASS plays and only the columns used for prediction
      3. drop every row that has a NaN in any of those columns
      4. turn PlayType into a categorical and add PlayType_cat with its integer code
      5. one-hot encode OffenseTeam, DefenseTeam, YardLineDirection and Formation

    The input frame is not modified; a new data frame is returned.
    '''
    #fixed category order so the label encoding is always PASS = 0, RUSH = 1, even
    #when the frame passed in (e.g. a single team's plays) contains only one play type
    play_types = ['PASS', 'RUSH']

    #subset of the data that we believe will be important for our prediction problem
    feature_cols = ['OffenseTeam', 'DefenseTeam', 'Down', 'ToGo', 'YardLine',
                    'YardLineDirection', 'Quarter', 'Minute', 'Second', 'Formation', 'PlayType']

    #features that are categorical and therefore get dummy variables
    categorical_cols = ['OffenseTeam', 'DefenseTeam', 'YardLineDirection', 'Formation']

    #sort the data frame so all games are in order
    df = df.sort_values(by = ['GameId', 'Quarter', 'Minute', 'Second'], ascending = [True, True, False, False])

    #we only want plays that are RUSH or PASS for purposes of prediction
    df_final = df.loc[df.PlayType.isin(play_types), feature_cols]

    #get rid of rows that have NaN's in any columns; the explicit copy makes the
    #column assignments below act on an independent frame, not a view of the input
    df_final = df_final.dropna().copy()

    #create a label encoding column in the dataframe for PlayType since it is the variable we are trying to predict
    df_final['PlayType'] = pd.Categorical(df_final['PlayType'], categories = play_types)
    df_final['PlayType_cat'] = df_final['PlayType'].cat.codes

    #create dummy variables for each feature that is categorical
    df_final = pd.get_dummies(df_final, columns = categorical_cols)

    return df_final



def subset_team(df, team):
    '''Return the rows of a play by play data frame whose OffenseTeam equals the
    given team abbreviation (a str); the original row index is preserved.'''
    #boolean mask that is True for the plays run by the requested team
    is_team_on_offense = df['OffenseTeam'] == team
    return df[is_team_on_offense]


def round_array(array):
    '''Apply the built-in round() to every entry of the given array and return
    the rounded values as a list, in the original order.'''
    return [round(entry) for entry in array]

In [3]:
#Fit one regularized logistic regression per (year, team) that predicts whether a
#play is a RUSH or a PASS, collect evaluation metrics for each model, and write all
#results to a single csv file.

#seasons for which a play by play csv is loaded (one file per year)
years_of_data_lst = [2014, 2015, 2016] #years of available data

#candidate values of LogisticRegression's C tried by the grid search below
Cs = [0.001, 0.1, 1, 10, 100] #regularization constants

#column names of the results table; the order must match the tuple built for each
#team at the bottom of the inner loop
headers = [
    'year',
    'team',
    'acc_score_train_set',
    'acc_score_test_set',
    'cross_val_score',
    'best_reg_par',
    'acc_score_best_reg',
    'precision',
    'recall',
    'f1_score',
    'auc'
]

model_results_lst = [] #list that stores each team's model results for each year of data

#store model results in csv file for further analysis
csv_file_path = 'C:\\Users\\abird\\Documents\\Springboard\\Capstone Project\\NFL\\team_logistic_reg_results.csv'

#start loop to generate team model for each year and gather model evaluation metrics
for year in years_of_data_lst:
    filePath = generate_file_path(year)
    df = load_data(filePath)
    teams_lst = generate_teams_lst(df)


    #start building a regularized logistic regression model for each team
    for team in teams_lst:
        #subset of data for each specific team (plays where the team is on offense)
        subset_df = subset_team(df, team)

        #preprocess subset_df (filter to RUSH/PASS, drop NaN rows, encode label, add dummies)
        df_final = preprocess_data(subset_df)

        #let's create a train/test set: features are every column except the two label
        #columns, the target is the integer label code; no test_size is given, so the
        #library's default split proportion is used, and random_state fixes the shuffle
        x_train, x_test, y_train, y_test = train_test_split(df_final.drop(labels = ['PlayType', 'PlayType_cat'], axis = 1).values, 
                                                  df_final.PlayType_cat.values, random_state = 69)

        #let's fit a logistic regression model to predict PlayType (run or pass)
        #using the estimator's default settings as a baseline
        clf = LogisticRegression()
        clf.fit(x_train, y_train)
        acc_score_train_set = accuracy_score(clf.predict(x_train), y_train)
        acc_score_test_set = accuracy_score(clf.predict(x_test), y_test)

        #5-Fold Cross Validation on training set
        #NOTE: cv_score calls clf.fit on every fold, so after this line clf holds the
        #fit from the last fold rather than the fit on the full training set
        cross_val_score = cv_score(clf, x_train, y_train)

        #Grid Search for the best regularization constant for logistic regression,
        #selected by accuracy with 5-fold cross validation on the training set
        param_grid = {'C' : Cs }
        model = GridSearchCV(LogisticRegression(), param_grid, scoring = 'accuracy', cv = 5)
        model.fit(x_train, y_train)

        #best C found, and test-set accuracy of the grid search's best estimator
        best_reg_par = model.best_params_['C']
        acc_score_best_reg = accuracy_score(model.predict(x_test), y_test)

        #AUC score for the ROC Curve for each team based model
        #column 1 of predict_proba is the probability of the class labelled 1 (RUSH when
        #both play types occur in the subset, since category codes follow alphabetical order)
        preds = model.predict_proba(x_test)[:,1]
        fpr, tpr, _ = metrics.roc_curve(y_test, preds)

        #turn the probabilities into hard 0/1 predictions by rounding (roughly a 0.5
        #threshold); y_test already holds integer codes, so rounding leaves its values unchanged
        y_test_rounded = round_array(y_test)
        preds_rounded = round_array(preds)

        #precision (true positives / [true positives + false positives])
        precision = metrics.precision_score(y_test_rounded, preds_rounded)

        #recall (true positives / [true positives + false negatives])
        recall = metrics.recall_score(y_test_rounded, preds_rounded)

        #F1 score is the harmonic mean of precision and recall; 2 * tp / ( 2 * tp + fp + fn)
        f1_score = metrics.f1_score(y_test_rounded, preds_rounded)

        #AUC (area under the curve)
        auc = metrics.auc(fpr, tpr)

        #one result row per (year, team); field order matches headers
        tup = (
        year,
        team,
        acc_score_train_set,
        acc_score_test_set,
        cross_val_score,
        best_reg_par,
        acc_score_best_reg,
        precision,
        recall,
        f1_score,
        auc
        )
        model_results_lst.append(tup)

#build the results table once from the collected rows, order it by year then team,
#and save it (to_csv is called with its defaults, so the row index is written too)
model_results_df = pd.DataFrame(model_results_lst, columns = headers)
model_results_df = model_results_df.sort_values(by = ['year', 'team'], ascending = [True, True])
model_results_df.to_csv(csv_file_path)