## NFL lines machine learning project


Source data for historical spreads and game scores:
http://www.repole.com/sun4cast/data.html

In [1]:
import os

# The root directory of the nfl code is read from the MLNFL_ROOT environment
# variable; both directory strings are built with a trailing path separator.
codeDir = os.environ['MLNFL_ROOT'] + os.path.sep
dataRoot = codeDir + "data" + os.path.sep

# work from the code directory
# (presumably so the project modules imported in later cells resolve -- confirm)
os.chdir(codeDir)

for _dirPath in (codeDir, dataRoot):
    print(_dirPath)

/Users/amit/repos/mlnfl/nfl/
/Users/amit/repos/mlnfl/nfl/data/


In [2]:
# warnings control
import warnings
# choose default, ignore, always
warnings.filterwarnings('default')


In [3]:
# import necessary modules
%matplotlib inline

from __future__ import division
from __future__ import print_function
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import madden

from sklearn import linear_model
from sklearn import svm
from sklearn import ensemble

# record which pandas release is in use alongside the saved outputs
pandasVersion = pd.__version__
print("pandas version ", pandasVersion)

pandas version  0.15.1


In [4]:
# lookup csv files and the directory that holds them
from referencedata import ReferenceData

lookupFiles = {
    'teams': {'file': 'nflTeams.csv'},
    'seasons': {'file': 'seasons.csv'},
}

# the lookup directory lives directly under dataRoot and, like the other
# directory strings, carries a trailing path separator
lookupDir = dataRoot + 'lookup' + os.path.sep

print("lookupFiles = %s" % lookupFiles)
print("lookupDir = %s" % lookupDir)

lookupFiles = {'seasons': {'file': 'seasons.csv'}, 'teams': {'file': 'nflTeams.csv'}}
lookupDir = /Users/amit/repos/mlnfl/nfl/data/lookup/


In [5]:
# build the reference-data object from the lookup directory, then preview
# the first rows of its teams table
reference_data = ReferenceData(lookupDir)
teamsPreview = reference_data.teams_df.head()
teamsPreview

Unnamed: 0_level_0,city,mascot,league,division,year
team,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Baltimore Ravens,,Ravens,afc,north,
New England Patriots,,Patriots,afc,east,
Tennessee Titans,,Titans,afc,south,
Atlanta Falcons,,Falcons,nfc,south,
Tampa Bay Buccaneers,,Buccaneers,nfc,south,


In [25]:
###  multi-season training
reload(madden)

# testYear and trainYears need to be arrays (lists), even for a single season
testYear = [2015]

# train on the nTrainYears seasons immediately before the test season
nTrainYears = 3
trainYears = range(testYear[0] - nTrainYears, testYear[0])

# seasons whose games make up the training data set
# NOTE(review): an earlier comment said this "includes one extra year for prev
# yr record", but only trainYears is used here -- confirm whether the season
# before the first training year should be loaded as well.
seasons = np.array(trainYears)
print ("training seasons >> ", seasons)

training seasons >>  [2012 2013 2014]


In [26]:
reload(madden)

path_to_lines = dataRoot + "lines/"


def _loadSeasonGames(lines_path, wanted_seasons, ref_data):
    """Run the read / record / process / filter pipeline for a set of seasons.

    lines_path     -- directory passed to madden.readGamesAll
    wanted_seasons -- array of season years to read and to keep
    ref_data       -- reference data object handed to the madden helpers

    Returns a (games, teams) tuple: the processed games restricted to
    wanted_seasons, and the team season records built from the raw games.
    """
    # 1 - read all the games
    games = madden.readGamesAll(lines_path, wanted_seasons)
    # 2 - compile season record for all teams
    teams = madden.seasonRecord(games, ref_data)
    # 3 - apply season records and compute other fields for all games
    games = madden.processGames(games, teams, ref_data)
    # 4 - keep only the requested seasons
    games = games[games.season.isin(wanted_seasons)]
    return games, teams


# get training data
dfAllGames, dfAllTeams = _loadSeasonGames(path_to_lines, seasons, reference_data)

# use different test set
seasonTest = np.array(testYear) # should be only one year
print ("results for >> ", seasonTest)
dfGamesTest, dfTeamsTest = _loadSeasonGames(path_to_lines, seasonTest, reference_data)


results for >>  [2015]


In [27]:
# independent variables handed to the classifier
features = [
    # current year records of both teams
    'favoredRecord', 'underdogRecord',
    # prev year records, helps early in season when only few games played
    'prevFavoredRecord', 'prevUnderdogRecord',
    # week in season, should make a good/bad record later in season more important
    'gameWeek',
    # absolute value of spread since favored team already determined
    'absLine',
    # T/F, usually more competitive rivalry games, i.e. bad teams still win home division games.
    'divisionGame',
    # T/F, important since output of classifier is "did the favored team win?"
    'favoredHomeGame',
]

# classifier under test: polynomial-kernel SVC with probability output and a
# fixed seed (alternative tried: linear_model.LogisticRegression(C=1e5))
random_state = 11
svcOptions = {'kernel': 'poly', 'probability': True, 'random_state': random_state}
classifier = svm.SVC(**svcOptions)

# run the classifier on the training games; the returned object is what the
# later prediction cell passes to madden.predictGames
mlClassifier = madden.runScikitClassifier(dfAllGames, features, classifier)

training data accuracy =  0.71484375


In [28]:
# predict a single week of the current season
reload(madden)

# week of the season to predict
iweek = 1

# test set is the current year
testYear = [2015]
seasonTest = np.array(testYear) # should be only one year
print ("results for >> ", seasonTest)

# read, build team records, process, then keep only the test season
dfGamesTest = madden.readGamesAll(path_to_lines, seasonTest)
dfTeamsTest = madden.seasonRecord(dfGamesTest, reference_data)
dfGamesTest = madden.processGames(dfGamesTest, dfTeamsTest, reference_data)
inTestSeason = dfGamesTest.season.isin(seasonTest)
dfGamesTest = dfGamesTest[inTestSeason]

# only this week's games are predicted
isThisWeek = dfGamesTest.gameWeek == iweek
dfTest = dfGamesTest[isThisWeek]

# apply the trained classifier to the selected games
dfPredict = madden.predictGames(dfTest, mlClassifier, features)
# apply ranking logic and determine scoring outcomes for league
dfAll = madden.rankGames(dfPredict, reference_data, seasonTest[0])



results for >>  [2015]


In [29]:
# weekly ranking output
#
# ranking method choices:
#   0. pick based on spread
#   1. always pick favored team, rank by probability of win
#   2. pick winner based on abs(probability - .5), rank by probability
#   3. pick winner based on abs(probability - .5), rank by abs(probability - .5)

dispCols = [
    'season', 'gameWeek', 'Visitor', 'visitorRecord', 'Home Team', 'homeRecord',
    'Line', 'prevFavoredRecord', 'prevUnderdogRecord', 'predict_proba',
    'lineGuess', 'probaGuess', 'probaAbsGuess', 'predictTeam',
]

# predicted team: the favorite when predict_proba is above .5, else the underdog
favoritePicked = (dfAll['predict_proba'] - .5) > 0
dfAll['predictTeam'] = np.where(favoritePicked, dfAll['favorite'], dfAll['underdog'])

# show the picks ordered by the chosen guess column, highest first
guessCol = 'probaGuess'
predictCols = ['gameWeek', 'predictTeam', 'predict_proba', guessCol,
               'favorite', 'lineGuess', 'Line']
weeklyPicks = dfAll[predictCols]
weeklyPicks.sort(guessCol, ascending=False)

Unnamed: 0,gameWeek,predictTeam,predict_proba,probaGuess,favorite,lineGuess,Line
8699,1,Dallas Cowboys,0.700154,16,Dallas Cowboys,15,5.5
8698,1,Denver Broncos,0.69747,15,Denver Broncos,14,4.5
8686,1,New England Patriots,0.68918,14,New England Patriots,7,3.0
8694,1,Arizona Cardinals,0.683601,13,Arizona Cardinals,3,2.5
8696,1,Tampa Bay Buccaneers,0.681596,12,Tampa Bay Buccaneers,9,3.0
8689,1,New York Jets,0.68046,11,New York Jets,8,3.0
8695,1,San Diego Chargers,0.678731,10,San Diego Chargers,4,2.5
8688,1,Houston Texans,0.668569,9,Houston Texans,1,1.5
8687,1,Green Bay Packers,0.666369,8,Green Bay Packers,16,-6.5
8693,1,Seattle Seahawks,0.652597,7,Seattle Seahawks,13,-3.5


In [17]:
# same pick table, restricted to the rows whose gameWeek is 1
week_filter = dfAll.gameWeek == 1
weekOneGames = dfAll[week_filter]
weekOneGames[predictCols].sort(guessCol, ascending=False)

Unnamed: 0,gameWeek,predictTeam,predict_proba,probaGuess,favorite,lineGuess,Line
8687,1,Green Bay Packers,0.729452,16,Green Bay Packers,16,-6.5
8699,1,Dallas Cowboys,0.684925,15,Dallas Cowboys,15,5.5
8692,1,Carolina Panthers,0.679254,14,Carolina Panthers,10,-3.5
8691,1,Miami Dolphins,0.678809,13,Miami Dolphins,12,-3.5
8697,1,Cincinnati Bengals,0.677719,12,Cincinnati Bengals,11,-3.5
8696,1,Tampa Bay Buccaneers,0.67507,11,Tampa Bay Buccaneers,9,3.0
8700,1,Philadelphia Eagles,0.674416,10,Philadelphia Eagles,5,-3.0
8690,1,Indianapolis Colts,0.672998,9,Indianapolis Colts,6,-3.0
8701,1,Minnesota Vikings,0.671835,8,Minnesota Vikings,2,-2.0
8698,1,Denver Broncos,0.668967,7,Denver Broncos,14,4.5


In [18]:
# inspect the predict_proba column for every row of dfAll
dfAll['predict_proba']

8688    0.656605
8701    0.671835
8694    0.661199
8695    0.653584
8700    0.674416
8690    0.672998
8686    0.653974
8689    0.666068
8696    0.675070
8692    0.679254
8697    0.677719
8691    0.678809
8693    0.660486
8698    0.668967
8699    0.684925
8687    0.729452
Name: predict_proba, dtype: float64

In [19]:
# Explicitly switch automatic post-mortem debugging off. A bare `pdb` only
# toggles the setting, so on a fresh kernel it would turn the debugger ON.
get_ipython().run_line_magic('pdb', 'off')

Automatic pdb calling has been turned OFF


In [20]:
# weekly ranking output for the spread method
#
# ranking method choices:
#   0. pick based on spread
#   1. always pick favored team, rank by probability of win
#   2. pick winner based on abs(probability - .5), rank by probability
#   3. pick winner based on abs(probability - .5), rank by abs(probability - .5)

# columns to show
predictCols = [
    'favorite', 'lineGuess', 'absLine', 'Line',
    'favoredHomeGame', 'divisionGame', 'favoredRecord',
]

# sort keys in priority order; every key is sorted descending
sortCols = ['absLine', 'favoredHomeGame', 'divisionGame', 'favoredRecord', 'favorite']

spreadView = dfAll[predictCols]
dfSpread = spreadView.sort(sortCols, ascending=False)
# optional csv dump: print(dfSpread.to_csv(sys.stdout,sep=',', index=False))
dfSpread

Unnamed: 0,favorite,lineGuess,absLine,Line,favoredHomeGame,divisionGame,favoredRecord
8687,Green Bay Packers,16,6.5,-6.5,0,1,0
8699,Dallas Cowboys,15,5.5,5.5,1,1,0
8698,Denver Broncos,14,4.5,4.5,1,0,0
8693,Seattle Seahawks,13,3.5,-3.5,0,1,0
8691,Miami Dolphins,12,3.5,-3.5,0,0,0
8697,Cincinnati Bengals,11,3.5,-3.5,0,0,0
8692,Carolina Panthers,10,3.5,-3.5,0,0,0
8696,Tampa Bay Buccaneers,9,3.0,3.0,1,0,0
8689,New York Jets,8,3.0,3.0,1,0,0
8686,New England Patriots,7,3.0,3.0,1,0,0


In [33]:
# loop over multiple years

import runMadden
import walsh
reload(madden)
reload(runMadden)

# first training season and number of seasons per training window, as passed
# to runMadden.runSeasonLoop
trainStart = 2008
trainLen = 3

# Seed the SVC the same way as the single-season run (random_state is set in
# the classifier cell above). With probability=True the probability estimates
# come from a randomized internal cross-validation, so an unseeded model gives
# different predict_proba values on every run.
mlClassifier = svm.SVC(kernel='poly', probability=True, random_state=random_state)
dfSVM = runMadden.runSeasonLoop(trainStart, trainLen, mlClassifier, path_to_lines, reference_data)

# same loop with logistic regression for comparison
mlClassifier = linear_model.LogisticRegression(C=1e5)
dfLog = runMadden.runSeasonLoop(trainStart, trainLen, mlClassifier, path_to_lines, reference_data)


2011 [2008, 2009, 2010]
training data accuracy =  0.70703125
1670 <type 'int'>
2012 [2009, 2010, 2011]
training data accuracy =  0.716145833333
1632 <type 'int'>
2013 [2010, 2011, 2012]
training data accuracy =  0.71484375
1653 <type 'int'>
2014 [2011, 2012, 2013]
training data accuracy =  0.716145833333
1664 <type 'int'>
2011 [2008, 2009, 2010]
training data accuracy =  0.697916666667
1670 <type 'int'>
2012 [2009, 2010, 2011]
training data accuracy =  0.712239583333
1632 <type 'int'>
2013 [2010, 2011, 2012]
training data accuracy =  0.690104166667
1653 <type 'int'>
2014 [2011, 2012, 2013]
training data accuracy =  0.6953125
1664 <type 'int'>


In [34]:
# per-season results returned by runMadden.runSeasonLoop for the SVC classifier
dfSVM

Unnamed: 0_level_0,lineScore,probaScore1,probaScore2,probaScore3,trainYears,classifierType,classifier
season,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2011,34,22,31,40,[2008 2009 2010],<class 'sklearn.svm.classes.SVC'>,"SVC(C=1.0, cache_size=200, class_weight=None, ..."
2012,-14,22,42,38,[2009 2010 2011],<class 'sklearn.svm.classes.SVC'>,"SVC(C=1.0, cache_size=200, class_weight=None, ..."
2013,2,8,-11,-32,[2010 2011 2012],<class 'sklearn.svm.classes.SVC'>,"SVC(C=1.0, cache_size=200, class_weight=None, ..."
2014,5,86,107,120,[2011 2012 2013],<class 'sklearn.svm.classes.SVC'>,"SVC(C=1.0, cache_size=200, class_weight=None, ..."


In [35]:
# per-season results returned by runMadden.runSeasonLoop for the
# LogisticRegression classifier
dfLog

Unnamed: 0_level_0,lineScore,probaScore1,probaScore2,probaScore3,trainYears,classifierType,classifier
season,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2011,34,49,42,33,[2008 2009 2010],<class 'sklearn.linear_model.logistic.Logistic...,"LogisticRegression(C=100000.0, class_weight=No..."
2012,-14,-2,20,2,[2009 2010 2011],<class 'sklearn.linear_model.logistic.Logistic...,"LogisticRegression(C=100000.0, class_weight=No..."
2013,2,28,40,35,[2010 2011 2012],<class 'sklearn.linear_model.logistic.Logistic...,"LogisticRegression(C=100000.0, class_weight=No..."
2014,5,5,-35,-47,[2011 2012 2013],<class 'sklearn.linear_model.logistic.Logistic...,"LogisticRegression(C=100000.0, class_weight=No..."
