In [184]:
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import mean_squared_error

In [185]:
nfl_games = pd.read_csv('../data/nfl_games.csv')
scores = nfl_games[['score1', 'score2']]
nfl_games

Unnamed: 0,date,season,neutral,playoff,team1,team2,elo1,elo2,elo_prob1,score1,score2,result1
0,1920-09-26,1920,0,0,RII,STP,1503.947000,1300.000000,0.824651,48,0,1.0
1,1920-10-03,1920,0,0,AKR,WHE,1503.420000,1300.000000,0.824212,43,0,1.0
2,1920-10-03,1920,0,0,RCH,ABU,1503.420000,1300.000000,0.824212,10,0,1.0
3,1920-10-03,1920,0,0,DAY,COL,1493.002000,1504.908000,0.575819,14,0,1.0
4,1920-10-03,1920,0,0,RII,MUN,1516.108000,1478.004000,0.644171,45,0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...
16269,2019-01-13,2018,0,1,NE,LAC,1640.171960,1647.624483,0.582068,41,28,1.0
16270,2019-01-13,2018,0,1,NO,PHI,1669.105861,1633.114673,0.641378,20,14,1.0
16271,2019-01-20,2018,0,1,NO,LAR,1682.450194,1648.424105,0.638772,23,26,0.0
16272,2019-01-20,2018,0,1,KC,NE,1675.286412,1661.668566,0.611248,31,37,0.0


# Exploring The Data

We do a little bit of data exploration by taking the averages of regular season and playoff scores and notice a small difference in the home teams score. Then just for fun we look at the total number of teams that have existed in the NFL. Interestingly enough many teams played only 1 or 2 official games. I considered cutting these teams but figured that getting comparative elo scores would make them useful data points. This turned out to not work out too well because in later tests we find elo to not correlate to score very well on its own. 

In [186]:
print(nfl_games[['score1', 'score2']].mean())
print(nfl_games[nfl_games['playoff'] == 0][['score1', 'score2']].mean())
print(nfl_games[nfl_games['playoff'] == 1][['score1', 'score2']].mean())

score1    21.544058
score2    18.578161
dtype: float64
score1    21.441877
score2    18.583079
dtype: float64
score1    24.379859
score2    18.441696
dtype: float64


In [187]:
print('Home Teams:', len(nfl_games['team1'].unique()))
print('Away Teams:', len(nfl_games['team2'].unique()))
print('All  Teams:', len(nfl_games['team1'].append(nfl_games['team2']).unique()))

Home Teams: 101
Away Teams: 108
All  Teams: 123


# Making new features
The features added to the original data set are simple. First splitting the dates up just to deal with the numbers individually. I also calculated the difference in elo between teams. The other added features were the season average of a team in its home and away games and the previous season's average of the team's home and away games. I added the season and previous season averages after doing some regressions and found that the previous season's average had almost no bearing on the predictive strength of the model, but current season averages did. Another interesting feature to add would be the cumulative season average for each team as each game was played, but I don't have the time and pandas-fu to put that together. 

In [188]:
# Split Month Day Year
dates_split = nfl_games['date'].apply(lambda x: x.split('-'))
year  = dates_split.apply(lambda x: x[0])
month = dates_split.apply(lambda x: x[1])
day   = dates_split.apply(lambda x: x[2])
dates_df = pd.DataFrame({'year':year, 'month':month, 'day':day})
nfl_games = dates_df.join(nfl_games.drop('date', axis=1))

In [189]:
# elo difference between home and away team
elo_diff = nfl_games['elo1'] - nfl_games['elo2']
nfl_games['elo_diff'] = elo_diff

In [190]:
seasons = nfl_games['season'].unique()
teams = nfl_games['team1'].append(nfl_games['team2']).unique()

rows1 = []
rows2 = []
for s in seasons:
    row1 = []
    row2 = []
    for t in teams:
        row1.append(nfl_games[(nfl_games['season'] == s) & (nfl_games['team1'] == t)]['score1'].mean())
        row2.append(nfl_games[(nfl_games['season'] == s) & (nfl_games['team2'] == t)]['score2'].mean())
    rows1.append(row1)
    rows2.append(row2)
    
team1_avg = pd.DataFrame(rows1, index=seasons, columns=teams)
team1_avg = team1_avg.fillna(0)

team2_avg = pd.DataFrame(rows2, index=seasons, columns=teams)
team2_avg = team2_avg.fillna(0)

In [191]:
team1_ps_avg = []
team2_ps_avg = []

for s,t in nfl_games[nfl_games['season'] != 1920][['season', 'team1']].values:        
    team1_ps_avg.append(team1_avg[t][s-1])

for s,t in nfl_games[nfl_games['season'] != 1920][['season', 'team2']].values:        
    team2_ps_avg.append(team2_avg[t][s-1])
    
first_season = [0] * (len(nfl_games) - len(team1_ps_avg))

team1_ps_avg = first_season + team1_ps_avg
team2_ps_avg = first_season + team2_ps_avg

team1_ps_avg = pd.Series(team1_ps_avg, name='team1_ps_avg')
team2_ps_avg = pd.Series(team2_ps_avg, name='team2_ps_avg')

nfl_games['team1_ps_avg'] = team1_ps_avg
nfl_games['team2_ps_avg'] = team2_ps_avg

team1_c_avg = []
team2_c_avg = []

for s,t in nfl_games[['season', 'team1']].values:        
    team1_c_avg.append(team1_avg[t][s])

for s,t in nfl_games[['season', 'team2']].values:        
    team2_c_avg.append(team2_avg[t][s])

team1_c_avg = pd.Series(team1_c_avg, name='team1_avg')
team2_c_avg = pd.Series(team2_c_avg, name='team2_avg')

nfl_games['team1_avg'] = team1_c_avg
nfl_games['team2_avg'] = team2_c_avg

# Regressions

To start the regressions I wanted to see whether or not one hot encoding the team name variable made any difference. I found that one hot encodings had a marginally better mean squared error, but took longer to compute, so for all other regressions I used the label encodings. As for the model, I started with the base Random Forest Regressor from scikit-learn. 

In [192]:
# Label Encode Teams
le = LabelEncoder()

le.fit(nfl_games['team1'].append(nfl_games['team2']))

team1_le = pd.DataFrame(le.transform(nfl_games['team1']), columns=['team1_cat'])
team2_le = pd.DataFrame(le.transform(nfl_games['team2']), columns=['team2_cat'])

nfl_games_le = nfl_games.join(team1_le).join(team2_le).drop('team1', axis=1).drop('team2', axis=1)

In [193]:
# One Hot Encoding of teams
nfl_one_hot_1 = pd.get_dummies(nfl_games, columns=['team1'], prefix=['Team_1_is'] )
nfl_games_oe = pd.get_dummies(nfl_one_hot_1, columns=['team2'], prefix=['Team_2_is'])

In [194]:
def fit_data_random_forest(data):
    features = pd.DataFrame(data.drop(['score1', 'score2'], axis=1))
    labels = data[['score1', 'score2']]
    
    rf = RandomForestRegressor()
    rf.fit(features.values, labels.values)
    return rf

def predict_data(model, data):
    return model.predict(pd.DataFrame(data.drop(['score1', 'score2'], axis=1)))

In [195]:
# Try with label encodings
training_le = nfl_games_le.sample(frac=0.90, random_state=1)
testing_le = nfl_games_le.drop(training_le.index)

rf_le = fit_data_random_forest(training_le)
pred_le = predict_data(rf_le, testing_le)
mean_squared_error(testing_le[['score1','score2']], pred_le)

62.99543534111866

In [196]:
# Try with one hot encodings
training_oe = nfl_games_oe.sample(frac=0.90, random_state=1)
testing_oe = nfl_games_oe.drop(training_oe.index)

rf_oe = fit_data_random_forest(training_oe)
pred_oe = predict_data(rf_oe, testing_oe)
mean_squared_error(testing_oe[['score1','score2']], pred_oe)

63.562776551936075

In [197]:
nfl_games = nfl_games_le

# Testing different subsets of data

To see which variables made the biggest impact on deciding the scores, I took a variety of features and ran them through regressions on their own. I compared the mean squared error to the mean squared error of simply taking the average over all seasons. The surprising value that jumps out is that elo and elo related features were the worst when taken on their own. 

In [198]:
training = nfl_games.sample(frac=0.90, random_state=1)
testing = nfl_games.drop(training.index)

In [199]:
# pure averages
avg = training[['score1','score2']].mean()
avg_df = pd.DataFrame({'score1':avg[0], 'score2':avg[1]}, index=[0])
avgs = pd.concat([avg_df]*(len(testing)), ignore_index=True)
print(mean_squared_error(testing[['score1','score2']], avgs))

121.12152427236573


In [200]:
# elos only
feats = ['elo1', 'elo2', 'score1', 'score2']
rf_elo = fit_data_random_forest(training[feats])
pred_elo = predict_data(rf_elo, testing[feats])
print(mean_squared_error(testing[['score1','score2']], pred_elo))

130.00449103750458


In [201]:
# season only
feats = ['season', 'score1', 'score2']
rf_elo = fit_data_random_forest(training[feats])
pred_elo = predict_data(rf_elo, testing[feats])
print(mean_squared_error(testing[['score1','score2']], pred_elo))

109.63884290741646


In [202]:
# teams only
feats = ['team1_cat', 'team2_cat', 'score1', 'score2']
rf_elo = fit_data_random_forest(training[feats])
pred_elo = predict_data(rf_elo, testing[feats])
print(mean_squared_error(testing[['score1','score2']], pred_elo))

119.89509037526537


In [203]:
# date only
feats = ['month', 'day', 'year', 'score1', 'score2']
rf_elo = fit_data_random_forest(training[feats])
pred_elo = predict_data(rf_elo, testing[feats])
print(mean_squared_error(testing[['score1','score2']], pred_elo))

125.328981832238


In [204]:
# date but season only
feats = ['month', 'day', 'season', 'score1', 'score2']
rf_elo = fit_data_random_forest(training[feats])
pred_elo = predict_data(rf_elo, testing[feats])
print(mean_squared_error(testing[['score1','score2']], pred_elo))

125.6995814938548


In [205]:
# playoffs
feats = ['playoff', 'score1', 'score2']
rf_elos = fit_data_random_forest(training[feats])
pred_elos = predict_data(rf_elos, testing[feats])
print(mean_squared_error(testing[['score1','score2']], pred_elos))

120.90832509910177


In [206]:
# neutral
feats = ['neutral', 'score1', 'score2']
rf_elos = fit_data_random_forest(training[feats])
pred_elos = predict_data(rf_elos, testing[feats])
print(mean_squared_error(testing[['score1','score2']], pred_elos))

121.02809200328338


In [207]:
feats = ['elo_diff', 'score1', 'score2']
rf_elo_diff = fit_data_random_forest(training[feats])
pred_elo_diff = predict_data(rf_elo_diff, testing[feats])
print(mean_squared_error(testing[['score1','score2']], pred_elo_diff))

162.978346242261


In [208]:
feats = ['elo1', 'elo2', 'elo_diff', 'elo_prob1', 'score1', 'score2']
rf_elos = fit_data_random_forest(training[feats])
pred_elos = predict_data(rf_elos, testing[feats])
print(mean_squared_error(testing[['score1','score2']], pred_elos))

131.54608452790413


# Testing Gradient Boosting

To test another model, I tried the gradient boosting regressor from scikit-learn. This model performed marginally better than the random forest, but takes longer to run. 

In [209]:
def fit_data_gb(data):
    features = pd.DataFrame(data.drop(['score1', 'score2'], axis=1))
    labels1 = data[['score1']]
    labels2 = data[['score2']]
    
    gb1 = GradientBoostingRegressor()
    gb2 = GradientBoostingRegressor()
    gb1.fit(features.values, labels1.values)
    gb2.fit(features.values, labels2.values)
    return (gb1, gb2)

def predict_data_gb(model, data):
    df = pd.DataFrame(data.drop(['score1', 'score2'], axis=1))
    return (model[0].predict(df.values), model[1].predict(df.values))

In [210]:
training = nfl_games_le.sample(frac=0.90, random_state=1)
testing = nfl_games_le.drop(training.index)

gb = fit_data_gb(training)
pred_gb = predict_data_gb(gb, testing)

  return f(**kwargs)
  return f(**kwargs)


In [211]:
print(mean_squared_error(testing[['score1','score2']], list(zip(pred_gb[0], pred_gb[1]))))

60.14200950971905


# Final Thoughts

Overall, I was surprised by how little ELO helped determine scores. Looking at what ELO actually is, it makes more sense. Two teams with the same ELO could have wildly different ways of winning. It would be interesting to add data regarding rosters and coaching staff to the models. I think that this type of data would go a long way towards determining scores. Also of note is that my models are not very good for future predictions because of the season average score for each team. This data can't exist until a full season has been played. 