# nfl lines machine learning project


Source data for historical spreads and game scores:
http://www.repole.com/sun4cast/data.html

### setup

In [1]:
import os

# Root directory of the nfl code, taken from the $MLNFL_ROOT environment variable.
# Joining with an empty last component yields exactly one trailing path
# separator, even when $MLNFL_ROOT already ends with one.
codeDir = os.path.join(os.environ['MLNFL_ROOT'], "")
dataRoot = os.path.join(codeDir, "data", "")

# make the code directory the current working directory
os.chdir(codeDir)

print(codeDir)
print(dataRoot)

/Users/amit/repos/mlnfl/nfl/
/Users/amit/repos/mlnfl/nfl/data/


In [3]:
# warnings control
import warnings
# choose default, ignore, always
warnings.filterwarnings('ignore')


In [4]:
# import necessary modules
%matplotlib inline

from __future__ import division
from __future__ import print_function
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import madden

from sklearn import linear_model
from sklearn import svm
from sklearn import ensemble

# report the pandas version in use
print("pandas version ", pd.__version__)

pandas version  0.18.0


In [5]:
# Lookup (reference) tables: their file names and the directory holding them.
from referencedata import ReferenceData

lookupFiles = {
    'teams': {'file': 'nflTeams.csv'},
    'seasons': {'file': 'seasons.csv'},
}

# dataRoot already ends with a path separator; the empty last component
# keeps a trailing separator on lookupDir as well.
lookupDir = os.path.join(dataRoot, 'lookup', '')

print ("lookupFiles = %s" % (lookupFiles,))
print ("lookupDir = %s" % (lookupDir,))

lookupFiles = {'seasons': {'file': 'seasons.csv'}, 'teams': {'file': 'nflTeams.csv'}}
lookupDir = /Users/amit/repos/mlnfl/nfl/data/lookup/


In [6]:
# Build the reference-data object from the lookup directory, then preview
# the first five rows of its teams table.
reference_data = ReferenceData(lookupDir)
reference_data.teams_df.head(n=5)

Unnamed: 0_level_0,city,mascot,league,division,year
team,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Baltimore Ravens,,Ravens,afc,north,
New England Patriots,,Patriots,afc,east,
Tennessee Titans,,Titans,afc,south,
Atlanta Falcons,,Falcons,nfc,south,
Tampa Bay Buccaneers,,Buccaneers,nfc,south,


### define test and training sets

In [7]:
###  multi-season training
# Season to predict, kept as a one-element list (it is indexed with [0] below).
testYear = [2016]

# Number of seasons immediately preceding the test season to train on.
numTrainYears = 3
trainYears = range(testYear[0] - numTrainYears, testYear[0])

# training data set
# NOTE(review): an earlier comment here said this "includes one extra year for
# prev yr record", but the range above holds exactly numTrainYears seasons --
# confirm whether the prior season is loaded elsewhere.
seasons = np.array(trainYears)
print ("training seasons >> ", seasons)
print ("test seasons >> ", testYear)

training seasons >>  [2013 2014 2015]
test seasons >>  [2016]


### load and process historical data for training and test sets

In [8]:
# Re-import madden so that edits to the module are picked up without
# restarting the kernel.  reload() is a builtin only on Python 2; on
# Python 3.4+ it lives in importlib, so import it from there when available
# and otherwise fall back to the builtin.
try:
    from importlib import reload
except ImportError:
    pass
reload(madden)


def loadProcessedGames(path_to_lines, seasons, reference_data):
    """Read and process the games for the given seasons.

    The same four steps are applied to the training and the test seasons:
      1 - read all the games
      2 - compile season record for all teams
      3 - apply season records and compute other fields for all games
      4 - keep only the rows whose `season` is in `seasons`

    Parameters
    ----------
    path_to_lines : str
        Directory handed to madden.readGamesAll.
    seasons : array-like
        Seasons to read and to keep in the returned games.
    reference_data : object
        Passed through to madden.seasonRecord and madden.processGames.

    Returns
    -------
    tuple
        (games, teams): the processed games restricted to `seasons`, and
        the value returned by madden.seasonRecord.
    """
    games = madden.readGamesAll(path_to_lines, seasons)
    teams = madden.seasonRecord(games, reference_data)
    games = madden.processGames(games, teams, reference_data)
    games = games[games.season.isin(seasons)]
    return games, teams


path_to_lines = dataRoot + "lines/"

# get training data
dfAllGames, dfAllTeams = loadProcessedGames(path_to_lines, seasons, reference_data)

# use different test set
seasonTest = np.array(testYear) # should be only one year
print ("getting results for >> ", seasonTest)
dfGamesTest, dfTeamsTest = loadProcessedGames(path_to_lines, seasonTest, reference_data)


results for >>  [2016]


### what does the input data look like?


In [47]:
# features: first rows of the model input, restricted to the feature columns
dfAllGames.loc[:, madden.FEATURE_COLUMNS].head()

Unnamed: 0,favoredRecord,underdogRecord,prevFavoredRecord,prevUnderdogRecord,gameWeek,absLine,divisionGame,favoredHomeGame
8174,0.0,0.0,0.8125,0.625,1,7.5,0,1
8175,0.0,0.0,0.75,0.375,1,10.5,1,0
8176,0.0,0.0,0.5,0.375,1,6.0,0,1
8177,0.0,0.0,0.4375,0.8125,1,3.5,1,1
8178,0.0,0.0,0.4375,0.375,1,6.0,0,0


In [48]:
# classifier target: first rows of the historical favoredWin column
dfAllGames['favoredWin'].head()

8174    1
8175    1
8176    0
8177    1
8178    0
Name: favoredWin, dtype: int64

### setup logistic regression classifier

In [9]:
# define independent variables for logistic regression
features = ['favoredRecord','underdogRecord',  # current year records of both teams
            'prevFavoredRecord','prevUnderdogRecord', # prev year records, helps early in season when only few games played
            'gameWeek',  # week in season, should make a good/bad record later in season more important
            'absLine',  # absolute value of spread since favored team already determined
            'divisionGame', # T/F, usually more competitive rivalry games, i.e. bad teams still win home division games.
            'favoredHomeGame', # T/F, important since output of classifier is "did the favored team win?"
            ]

# run the classifier
# Fixed seed, now passed to the active classifier as well, so that any
# solver that shuffles the data gives repeatable fits.
random_state = 11
#classifier = svm.SVC(kernel='poly',probability=True, random_state=random_state)
# C is the inverse of the regularization strength, so C=1e5 regularizes very weakly.
classifier = linear_model.LogisticRegression(C=1e5, random_state=random_state)
mlClassifier = madden.runScikitClassifier(dfAllGames,features,classifier)


In [51]:
# predict one week of current season
iweek = 9

seasonTest = np.array(testYear) # should be only one year
print ("results for >> ", seasonTest)

# keep only the games of the selected week
isSelectedWeek = dfGamesTest['gameWeek'] == iweek
dfTest = dfGamesTest[isSelectedWeek]

# apply the fitted classifier to the selected week's games
dfPredict = madden.predictGames(dfTest, mlClassifier, features)

# apply ranking logic and determine scoring outcomes for league
predictSeason = seasonTest[0]
dfAll = madden.rankGames(dfPredict, reference_data, predictSeason)



results for >>  [2016]


### what does the prediction output data look like?

In [58]:
# Show the week's predictions, highest predict_proba first.
# sort_values replaces DataFrame.sort, which was deprecated in pandas 0.17
# and removed in pandas 0.20.
dispCols = ['favorite', 'underdog', 'predict_proba']
dfPredict[dispCols].sort_values('predict_proba', ascending=False)

Unnamed: 0,favorite,underdog,predict_proba
9065,Kansas City Chiefs,Jacksonville Jaguars,0.815774
9063,Dallas Cowboys,Cleveland Browns,0.800223
9064,Minnesota Vikings,Detroit Lions,0.729041
9071,Green Bay Packers,Indianapolis Colts,0.722061
9074,Seattle Seahawks,Buffalo Bills,0.685294
9062,Atlanta Falcons,Tampa Bay Buccaneers,0.615777
9070,New Orleans Saints,San Francisco 49ers,0.59486
9067,New York Giants,Philadelphia Eagles,0.568567
9066,Miami Dolphins,New York Jets,0.56585
9072,San Diego Chargers,Tennessee Titans,0.565649


### check to see what spread method would predict

In [37]:
# display weekly ranking output for spread method

predictCols = ['favorite','lineGuess', 'absLine','Line', 'favoredHomeGame', 'divisionGame', 'favoredRecord']
# sort keys in priority order; every key is sorted descending
sortCols = ['absLine','favoredHomeGame', 'divisionGame', 'favoredRecord', 'favorite']
# sort_values replaces DataFrame.sort (deprecated in pandas 0.17, removed in 0.20)
dfSpread = dfAll[predictCols].sort_values(sortCols, ascending=False)

dfSpread

Unnamed: 0,favorite,lineGuess,absLine,Line,favoredHomeGame,divisionGame,favoredRecord
9065,Kansas City Chiefs,16.0,7.5,7.5,1,0,0.625
9063,Dallas Cowboys,15.0,7.5,-7.5,0,0,0.75
9074,Seattle Seahawks,14.0,7.0,7.0,1,0,0.5
9071,Green Bay Packers,13.0,7.0,7.0,1,0,0.5
9064,Minnesota Vikings,12.0,6.0,6.0,1,1,0.625
9072,San Diego Chargers,11.0,5.0,5.0,1,0,0.375
9066,Miami Dolphins,10.0,3.5,3.5,1,1,0.375
9062,Atlanta Falcons,9.0,3.5,-3.5,0,1,0.625
9070,New Orleans Saints,8.0,3.5,-3.5,0,0,0.375
9069,Carolina Panthers,7.0,3.0,-3.0,0,0,0.25


### model predictions with probability

In [38]:
# display weekly ranking output

# ranking methods choices
# 0. pick based on spread
# 1. always pick favored team, rank by probability of win
# 2. pick winner based on abs(probability - .5), rank by probability
# 3. pick winner based on abs(probability - .5), rank by abs(probability - .5)

# full set of columns of interest; not used by the display below
dispCols = ['season','gameWeek','Visitor','visitorRecord','Home Team','homeRecord',
            'Line','prevFavoredRecord','prevUnderdogRecord','predict_proba',
            'lineGuess','probaGuess', 'probaAbsGuess', 'predictTeam']


# pick the favorite when predict_proba is above .5, otherwise the underdog
dfAll['predictTeam'] = np.where((dfAll['predict_proba'] - .5) > 0 , dfAll['favorite'], dfAll['underdog'])
guessCol = 'probaGuess'
predictCols = ['gameWeek','predictTeam', 'predict_proba', guessCol, 'favorite','lineGuess', 'Line']
# sort_values replaces DataFrame.sort (deprecated in pandas 0.17, removed in 0.20)
dfAll[predictCols].sort_values(guessCol, ascending=False)

Unnamed: 0,gameWeek,predictTeam,predict_proba,probaGuess,favorite,lineGuess,Line
9065,9,Kansas City Chiefs,0.815774,16.0,Kansas City Chiefs,16.0,7.5
9063,9,Dallas Cowboys,0.800223,15.0,Dallas Cowboys,15.0,-7.5
9064,9,Minnesota Vikings,0.729041,14.0,Minnesota Vikings,12.0,6.0
9071,9,Green Bay Packers,0.722061,13.0,Green Bay Packers,13.0,7.0
9074,9,Seattle Seahawks,0.685294,12.0,Seattle Seahawks,14.0,7.0
9062,9,Atlanta Falcons,0.615777,11.0,Atlanta Falcons,9.0,-3.5
9070,9,New Orleans Saints,0.59486,10.0,New Orleans Saints,8.0,-3.5
9067,9,New York Giants,0.568567,9.0,New York Giants,6.0,2.5
9066,9,Miami Dolphins,0.56585,8.0,Miami Dolphins,10.0,3.5
9072,9,San Diego Chargers,0.565649,7.0,San Diego Chargers,11.0,5.0


In [39]:
# actual outcomes
# NOTE(review): predictWin is 0 for every row in the saved output -- confirm
# whether results for this week were available when the cell was run.
dispCols = ['week', 'favorite', 'underdog', 'absLine', 'predict_proba', 'predictWin']
# sort_values replaces DataFrame.sort (deprecated in pandas 0.17, removed in 0.20)
dfPredict[dispCols].sort_values('predict_proba', ascending=False)

Unnamed: 0,week,favorite,underdog,absLine,predict_proba,predictWin
9065,9.0,Kansas City Chiefs,Jacksonville Jaguars,7.5,0.815774,0
9063,9.0,Dallas Cowboys,Cleveland Browns,7.5,0.800223,0
9064,9.0,Minnesota Vikings,Detroit Lions,6.0,0.729041,0
9071,9.0,Green Bay Packers,Indianapolis Colts,7.0,0.722061,0
9074,9.0,Seattle Seahawks,Buffalo Bills,7.0,0.685294,0
9062,9.0,Atlanta Falcons,Tampa Bay Buccaneers,3.5,0.615777,0
9070,9.0,New Orleans Saints,San Francisco 49ers,3.5,0.59486,0
9067,9.0,New York Giants,Philadelphia Eagles,2.5,0.568567,0
9066,9.0,Miami Dolphins,New York Jets,3.5,0.56585,0
9072,9.0,San Diego Chargers,Tennessee Titans,5.0,0.565649,0
