## NFL lines machine learning project


Source data for historical spreads and game scores:
http://www.repole.com/sun4cast/data.html

In [1]:
import os

# the root directory for the nfl code is read from the MLNFL_ROOT environment variable
if 'MLNFL_ROOT' not in os.environ:
    raise KeyError("MLNFL_ROOT environment variable is not set; "
                   "point it at the root directory of the nfl code")

# os.path.join(path, "") appends exactly one trailing separator, so a value of
# MLNFL_ROOT that already ends in a separator no longer yields a doubled one.
# Both paths keep their trailing separator because later cells build file
# locations by plain string concatenation.
codeDir = os.path.join(os.environ['MLNFL_ROOT'], "")
dataRoot = os.path.join(codeDir, "data", "")

# work from the code root for the rest of the notebook
os.chdir(codeDir)

print(codeDir)
print(dataRoot)

/Users/amit/repos/mlnfl/nfl/
/Users/amit/repos/mlnfl/nfl/data/


In [2]:
# warnings control
import warnings
# choose default, ignore, always
warnings.filterwarnings('default')


In [3]:
# import necessary modules
%matplotlib inline

from __future__ import division
from __future__ import print_function
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import madden

from sklearn import linear_model
from sklearn import svm
from sklearn import ensemble

# record the pandas version this notebook was run with
# (cells below call DataFrame.sort, whose availability depends on the pandas release)
print ("pandas version ",pd.__version__)

pandas version  0.15.1


In [4]:
# where the lookup (reference) csv files live
from referencedata import ReferenceData

# one entry per lookup table, each naming the csv file that holds it
lookupFiles = {
    'teams': {'file': 'nflTeams.csv'},
    'seasons': {'file': 'seasons.csv'},
}

# dataRoot already ends with a path separator, so plain concatenation is enough
lookupDir = dataRoot + 'lookup' + os.path.sep

print("lookupFiles = {0}".format(lookupFiles))
print("lookupDir = {0}".format(lookupDir))

lookupFiles = {'seasons': {'file': 'seasons.csv'}, 'teams': {'file': 'nflTeams.csv'}}
lookupDir = /Users/amit/repos/mlnfl/nfl/data/lookup/


In [5]:
# import reference data
# ReferenceData is built from the lookup directory; presumably it reads the csv
# files named in lookupFiles — TODO confirm against referencedata.py
reference_data = ReferenceData(lookupDir)
# preview the first rows of the teams table
reference_data.teams_df.head()

Unnamed: 0_level_0,city,mascot,league,division,year
team,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Baltimore Ravens,,Ravens,afc,north,
New England Patriots,,Patriots,afc,east,
Tennessee Titans,,Titans,afc,south,
Atlanta Falcons,,Falcons,nfc,south,
Tampa Bay Buccaneers,,Buccaneers,nfc,south,


In [61]:
###  multi-season training
reload(madden)

# number of seasons immediately before the test season that are used for training
trainLen = 3

# testYear and trainYears need to be sequences (list / range), not scalars
testYear = [2014]
trainYears = range(testYear[0] - trainLen, testYear[0])

# training data set
# NOTE(review): the previous comment said this set "includes one extra year for
# prev yr record", but `seasons` holds exactly the training years; presumably
# madden.readGamesAll loads the extra season itself — confirm.
seasons = np.array(trainYears)
print ("training seasons >> ", seasons)

training seasons >>  [2011 2012 2013]


In [56]:
# NOTE(review): this cell was executed (In [56]) before the training-setup cell (In [61]).
# Run top to bottom it replaces testYear = [2014], so the evaluation cell that follows
# would report 2015 instead of the recorded "results for >>  [2014]" — confirm intent.
testYear = [2015]

In [62]:
reload(madden)


def _loadProcessedGames(path_to_lines, seasons, reference_data):
    """Read, enrich and filter the games for the given seasons.

    Runs the same four steps for any set of seasons:
      1. read all the games with madden.readGamesAll
      2. compile the season record for all teams with madden.seasonRecord
      3. apply the season records and compute the other per-game fields
         with madden.processGames
      4. keep only the rows whose `season` is one of `seasons`
         (drops the extra year of data)

    Parameters
    ----------
    path_to_lines : str
        Directory holding the lines files, passed through to madden.readGamesAll.
    seasons : array-like of int
        Seasons to load and to keep in the result.
    reference_data : ReferenceData
        Lookup data passed through to the madden helpers.

    Returns
    -------
    (dfGames, dfTeams) : tuple
        The processed games restricted to `seasons`, and the team season
        records computed in step 2.
    """
    dfGames = madden.readGamesAll(path_to_lines, seasons)
    dfTeams = madden.seasonRecord(dfGames, reference_data)
    dfGames = madden.processGames(dfGames, dfTeams, reference_data)
    dfGames = dfGames[dfGames.season.isin(seasons)]
    return dfGames, dfTeams


# directory with the lines files; the trailing "" keeps the trailing separator
# the original "lines/" string had, without hard-coding "/"
path_to_lines = os.path.join(dataRoot, "lines", "")

# get training data
dfAllGames, dfAllTeams = _loadProcessedGames(path_to_lines, seasons, reference_data)

# use different test set
seasonTest = np.array(testYear) # should be only one year
print ("results for >> ", seasonTest)
dfGamesTest, dfTeamsTest = _loadProcessedGames(path_to_lines, seasonTest, reference_data)


results for >>  [2014]


In [63]:
# define independent variables (features) for the classifier
features = ['favoredRecord','underdogRecord',  # current year records of both teams
            'prevFavoredRecord','prevUnderdogRecord', # prev year records, helps early in season when only few games played
            'gameWeek',  # week in season, should make a good/bad record later in season more important
            'absLine',  # absolute value of spread since favored team already determined
            'divisionGame', # T/F, usually more competitive rivalry games, i.e. bad teams still win home division games.
            'favoredHomeGame', # T/F, important since output of classifier is "did the favored team win?"
            ]

# run the classifier (fixed seed so repeated runs use the same random state)
random_state = 11
classifier = svm.SVC(kernel='poly',probability=True, random_state=random_state)
# alternative classifier: linear_model.LogisticRegression(C=1e5)

mlClassifier = madden.runScikitClassifier(dfAllGames,features,classifier)

# apply the trained classifier to the test set
# (this call must run here: dfPredict is consumed by rankGames just below, and
# leaving it commented out made the cell depend on a stale dfPredict from
# another cell, or fail with a NameError on a fresh kernel)
dfPredict = madden.predictGames(dfGamesTest,mlClassifier,features)

# apply ranking logic and determine scoring outcomes for league
dfAll = madden.rankGames(dfPredict,reference_data,seasonTest[0])

In [46]:
# diagnostics ... weekly outcomes
# Select the score columns with a list: indexing a groupby with a bare tuple of
# column names is deprecated and later removed in newer pandas releases, while
# the list form works on old and new versions alike.
scoreCols = ['lineScore','probaScore1','probaScore2','probaScore3']
g = dfAll.groupby('gameWeek')[scoreCols].sum()
print(g)

# column totals across all weeks
g.sum()

          lineScore  probaScore1  probaScore2  probaScore3
gameWeek                                                  
1                99           89           89           89
2               122          123          123          123
3                96           92           92           92
4               107          105          105          105
5                93           93           93           93
6               111          102          102          102
7                82           86           82           82
8                96           96           96           96
9                77           78           81           81
10               74           78           79           84
11              107          109          113          116
12              124          128          131          132
13              103          110          121          121
14              114          111          114          118
15               86           70           69           

lineScore      1704
probaScore1    1686
probaScore2    1699
probaScore3    1716
dtype: float64

In [59]:
# predict one week of current season
reload(madden)
iweek = 1

# switch the test set to the current year
testYear = [2015]
seasonTest = np.array(testYear) # should be only one year
print ("results for >> ", seasonTest)

# read games -> team season records -> derived per-game fields
dfGamesTest = madden.readGamesAll(path_to_lines, seasonTest)
dfTeamsTest = madden.seasonRecord(dfGamesTest, reference_data)
dfGamesTest = madden.processGames(dfGamesTest, dfTeamsTest, reference_data)

# keep only rows belonging to the test season
inTestSeason = dfGamesTest['season'].isin(seasonTest)
dfGamesTest = dfGamesTest[inTestSeason]

# restrict the prediction to the games of the chosen week
dfTest = dfGamesTest[dfGamesTest['gameWeek'] == iweek]

# score those games with the previously trained classifier
dfPredict = madden.predictGames(dfTest, mlClassifier, features)
# apply ranking logic and determine scoring outcomes for league
dfAll = madden.rankGames(dfPredict, reference_data, seasonTest[0])



results for >>  [2015]


In [60]:
# display weekly ranking output
#
# ranking method choices:
#   0. pick based on spread
#   1. always pick favored team, rank by probability of win
#   2. pick winner based on abs(probability - .5), rank by probability
#   3. pick winner based on abs(probability - .5), rank by abs(probability - .5)

dispCols = [
    'season', 'gameWeek',
    'Visitor', 'visitorRecord',
    'Home Team', 'homeRecord',
    'Line', 'prevFavoredRecord', 'prevUnderdogRecord', 'predict_proba',
    'lineGuess', 'probaGuess', 'probaAbsGuess', 'predictTeam',
]

# the predicted winner is the favorite when its win probability exceeds one half,
# otherwise the underdog
favoredToWin = dfAll['predict_proba'] > .5
dfAll['predictTeam'] = np.where(favoredToWin, dfAll['favorite'], dfAll['underdog'])

# column that drives the ordering of the displayed picks
guessCol = 'probaGuess'
predictCols = ['gameWeek', 'predictTeam', 'predict_proba', guessCol,
               'favorite', 'lineGuess', 'Line']

# highest guess value first
dfAll.loc[:, predictCols].sort(columns=guessCol, ascending=False)

Unnamed: 0,gameWeek,predictTeam,predict_proba,probaGuess,favorite,lineGuess,Line
8686,1,New England Patriots,0.740155,16,New England Patriots,16,7.0
8699,1,Dallas Cowboys,0.719797,15,Dallas Cowboys,14,6.0
8698,1,Denver Broncos,0.691672,14,Denver Broncos,13,4.5
8695,1,San Diego Chargers,0.668912,13,San Diego Chargers,8,3.0
8689,1,New York Jets,0.666255,12,New York Jets,7,3.0
8696,1,Tampa Bay Buccaneers,0.66524,11,Tampa Bay Buccaneers,9,3.0
8687,1,Green Bay Packers,0.661482,10,Green Bay Packers,15,-6.5
8694,1,Arizona Cardinals,0.660412,9,Arizona Cardinals,4,2.5
8693,1,Seattle Seahawks,0.645085,8,Seattle Seahawks,12,-4.0
8688,1,Houston Texans,0.635793,7,Houston Texans,1,1.0


In [54]:
# show the picks for a single week, highest guess value first
# (relies on predictCols and guessCol defined by the weekly ranking display cell)
week_filter = dfAll['gameWeek'] == 3
favoredToWin = dfAll['predict_proba'] > .5
dfAll['predictTeam'] = np.where(favoredToWin, dfAll['favorite'], dfAll['underdog'])
dfAll.loc[week_filter, predictCols].sort(columns=guessCol, ascending=False)

Unnamed: 0,gameWeek,predictTeam,predict_proba,probaGuess,favorite,lineGuess,Line
8472,3,New England Patriots,0.871801,16,New England Patriots,16,15.0
8473,3,San Francisco 49ers,0.744192,15,San Francisco 49ers,6,-2.5
8476,3,Carolina Panthers,0.721799,14,Carolina Panthers,8,3.0
8474,3,Seattle Seahawks,0.716605,13,Seattle Seahawks,10,5.0
8465,3,Philadelphia Eagles,0.714089,12,Philadelphia Eagles,13,6.5
8468,3,Cincinnati Bengals,0.701794,11,Cincinnati Bengals,11,6.5
8462,3,Atlanta Falcons,0.701773,10,Atlanta Falcons,12,6.5
8463,3,Buffalo Bills,0.6859,9,Buffalo Bills,5,2.0
8475,3,Miami Dolphins,0.682691,8,Miami Dolphins,9,4.0
8477,3,New York Jets,0.679013,7,New York Jets,7,2.5


In [12]:
# display weekly ranking output for spread method
#
# ranking method choices:
#   0. pick based on spread
#   1. always pick favored team, rank by probability of win
#   2. pick winner based on abs(probability - .5), rank by probability
#   3. pick winner based on abs(probability - .5), rank by abs(probability - .5)

# sort keys, most significant first; every key is sorted descending
sortCols = ['absLine', 'favoredHomeGame', 'divisionGame', 'favoredRecord', 'favorite']

# columns shown for the spread-based picks
predictCols = ['favorite', 'lineGuess', 'absLine', 'Line',
               'favoredHomeGame', 'divisionGame', 'favoredRecord']

dfSpread = dfAll.loc[:, predictCols].sort(columns=sortCols, ascending=False)
# to dump as csv instead: dfSpread.to_csv(sys.stdout, sep=',', index=False)  (needs `import sys`)
dfSpread

Unnamed: 0,favorite,lineGuess,absLine,Line,favoredHomeGame,divisionGame,favoredRecord
8686,New England Patriots,16,7.0,7.0,1,0,0
8687,Green Bay Packers,15,6.5,-6.5,0,1,0
8699,Dallas Cowboys,14,6.0,6.0,1,1,0
8698,Denver Broncos,13,4.5,4.5,1,0,0
8693,Seattle Seahawks,12,4.0,-4.0,0,1,0
8691,Miami Dolphins,11,3.5,-3.5,0,0,0
8697,Cincinnati Bengals,10,3.5,-3.5,0,0,0
8696,Tampa Bay Buccaneers,9,3.0,3.0,1,0,0
8695,San Diego Chargers,8,3.0,3.0,1,0,0
8689,New York Jets,7,3.0,3.0,1,0,0


In [30]:
# loop over multiple years

import runMadden
import walsh
reload(madden)
reload(runMadden)


# first training season and number of consecutive training seasons per run
trainStart = 2010
trainLen = 3

# Cell-local names are used for the seed and the estimators so this cell does
# not rebind `random_state` / `mlClassifier`, which the single-season cells
# above set and read (mlClassifier there is the trained model).
# NOTE(review): with a seed of None the SVC probability estimates are presumably
# not repeatable from run to run — confirm whether a fixed seed is wanted here.
loopRandomState = None
svmClassifier = svm.SVC(kernel='poly',probability=True, random_state=loopRandomState)
dfSVM = runMadden.runSeasonLoop(trainStart, trainLen, svmClassifier, path_to_lines, reference_data)

logClassifier = linear_model.LogisticRegression(C=1e5)
dfLog = runMadden.runSeasonLoop(trainStart, trainLen, logClassifier, path_to_lines, reference_data)
#dfLoop = dfSVM


2013 [2010, 2011, 2012]
1653 <type 'int'>
2014 [2011, 2012, 2013]
1664 <type 'int'>
2013 [2010, 2011, 2012]
1653 <type 'int'>
2014 [2011, 2012, 2013]
1664 <type 'int'>


In [31]:
# results returned by runMadden.runSeasonLoop for the SVC classifier
dfSVM

Unnamed: 0_level_0,lineScore,probaScore1,probaScore2,probaScore3,trainYears,classifierType,classifier
season,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2013,2,8,2,-17,[2010 2011 2012],<class 'sklearn.svm.classes.SVC'>,"SVC(C=1.0, cache_size=200, class_weight=None, ..."
2014,5,85,99,113,[2011 2012 2013],<class 'sklearn.svm.classes.SVC'>,"SVC(C=1.0, cache_size=200, class_weight=None, ..."


In [32]:
# results returned by runMadden.runSeasonLoop for the LogisticRegression classifier
dfLog

Unnamed: 0_level_0,lineScore,probaScore1,probaScore2,probaScore3,trainYears,classifierType,classifier
season,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2013,2,28,40,35,[2010 2011 2012],<class 'sklearn.linear_model.logistic.Logistic...,"LogisticRegression(C=100000.0, class_weight=No..."
2014,5,5,-35,-47,[2011 2012 2013],<class 'sklearn.linear_model.logistic.Logistic...,"LogisticRegression(C=100000.0, class_weight=No..."
