In [178]:
import pandas as pd
import numpy as np
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import Imputer, OneHotEncoder
from sklearn.pipeline import Pipeline, make_pipeline, FeatureUnion

# Load the three raw CSV tables (teams, stadiums, game scores) straight from the
# machine_learning_club GitHub repository; running this cell needs network access.
nfl_teams = pd.read_csv('https://raw.githubusercontent.com/mauzeyj/machine_learning_club/master/Data/nfl_teams.csv')
# The stadium file is decoded as Latin-1 ('latin' is a Python codec alias) instead of
# the default UTF-8 — presumably it contains non-UTF-8 bytes; confirm.
nfl_stadiums = pd.read_csv('https://raw.githubusercontent.com/mauzeyj/machine_learning_club/master/Data/nfl_stadiums.csv', encoding='latin')
spreadspoke_scores = pd.read_csv('https://raw.githubusercontent.com/mauzeyj/machine_learning_club/master/Data/spreadspoke_scores.csv')

## Data Munging, Cleaning, and Feature Engineering

Now I'm going to select only the features that are most likely to contribute to performance and are easy to handle.

In [161]:
# Keep only the stadium attributes used below, rename 'stadium_name' to 'stadium'
# (the key the later merge joins on), and store ELEVATION as text.
# Built as a single chain so nothing is modified in place on a sliced frame.
nfl_stadiums = (
    nfl_stadiums[[
        'stadium_name', 'stadium_type', 'stadium_weather_type',
        'stadium_capacity', 'stadium_surface', 'ELEVATION',
    ]]
    .rename(index=str, columns={'stadium_name': 'stadium'})  # index=str turns row labels into strings
    .assign(ELEVATION=lambda stadiums: stadiums['ELEVATION'].astype(str))
)

In [162]:
# Preview the first rows of the trimmed stadium table.
nfl_stadiums.head()

Unnamed: 0,stadium,stadium_type,stadium_weather_type,stadium_capacity,stadium_surface,ELEVATION
0,Alamo Dome,indoor,dome,72000.0,FieldTurf,
1,Alltel Stadium,,,,,
2,Alumni Stadium,outdoor,cold,,Grass,
3,Anaheim Stadium,outdoor,warm,,,
4,Arrowhead Stadium,outdoor,cold,76416.0,Grass,264.9


The columns 'team_favorite_id', 'spread_favorite', and 'over_under_line' are themselves predictions and do not affect the outcome of a game, so we drop them. The field 'weather_detail' is populated for only a few games, so it is dropped as well, along with 'schedule_date'.

In [163]:
# Remove the prediction columns, the sparsely filled weather text, and the game date
# from the scores table (in place, so the trimmed table is what later cells see).
spreadspoke_scores.drop(
    [
        'team_favorite_id', 'spread_favorite', 'over_under_line',
        'weather_detail', 'schedule_date',
    ],
    axis=1,
    inplace=True,
)
# Attach stadium attributes to every game, then keep only rows that have both scores;
# this also discards stadium-only rows produced by the outer join.
df = (
    nfl_stadiums
    .merge(spreadspoke_scores, on='stadium', how='outer')
    .dropna(subset=['score_home', 'score_away'])
)
# ELEVATION was stored as text earlier in the notebook; cast it back to float.
df['ELEVATION'] = df['ELEVATION'].astype(float)

In [164]:
# Preview the merged stadium + game table.
df.head()

Unnamed: 0,stadium,stadium_type,stadium_weather_type,stadium_capacity,stadium_surface,ELEVATION,schedule_season,schedule_week,schedule_playoff,team_home,score_home,score_away,team_away,stadium_neutral,weather_temperature,weather_wind_mph,weather_humidity
0,Alamo Dome,indoor,dome,72000.0,FieldTurf,,2005.0,4,False,New Orleans Saints,19.0,7.0,Buffalo Bills,True,72.0,0.0,
1,Alamo Dome,indoor,dome,72000.0,FieldTurf,,2005.0,6,False,New Orleans Saints,31.0,34.0,Atlanta Falcons,True,72.0,0.0,
2,Alamo Dome,indoor,dome,72000.0,FieldTurf,,2005.0,16,False,New Orleans Saints,12.0,13.0,Detroit Lions,True,72.0,0.0,
3,Alltel Stadium,,,,,,2004.0,Superbowl,True,Philadelphia Eagles,21.0,24.0,New England Patriots,True,,,
4,Alumni Stadium,outdoor,cold,,Grass,,1967.0,7,False,Boston Patriots,41.0,10.0,Miami Dolphins,False,54.0,11.0,75.0


### Now to add some more helpful features

In [165]:
def game_winner(row):
    """Return the name of the team with the higher score in one game row.

    Parameters
    ----------
    row : mapping-like (e.g. a DataFrame row)
        Must provide 'score_home', 'score_away', 'team_home' and 'team_away'.

    Returns
    -------
    The 'team_home' or 'team_away' value of the higher-scoring side, or the
    string 'None' when neither score is greater (a tie, or scores that do not
    compare, such as NaN).
    """
    home_points = row['score_home']
    away_points = row['score_away']
    if home_points > away_points:
        return row['team_home']
    if home_points < away_points:
        return row['team_away']
    # Neither comparison held: tie or non-comparable scores.
    return 'None'
    
def winner_loc(row):
    """Classify a game's winner as the home side, the away side, or neither.

    Parameters
    ----------
    row : mapping-like (e.g. a DataFrame row)
        Must provide 'winner', 'team_home' and 'team_away'.

    Returns
    -------
    'home' if the winner equals the home team, 'away' if it equals the away
    team, otherwise the string 'None'. The home team is checked first.
    """
    victor = row['winner']
    if victor == row['team_home']:
        return 'home'
    if victor == row['team_away']:
        return 'away'
    return 'None'

def dumb_function(val):
    """Map a playoff-round label to its integer week code.

    Labels are matched ignoring case and surrounding whitespace, so spellings
    such as 'Superbowl', 'SuperBowl' and 'superbowl' all resolve to the same
    code. A label that is not recognised would otherwise pass through as a
    string and break a later numeric cast.

    Parameters
    ----------
    val : any
        A schedule-week value: a playoff label, a week number (as a number or
        as text), or a missing value.

    Returns
    -------
    The integer code for a recognised playoff label; otherwise ``val``
    unchanged (including non-string values).
    """
    # NOTE(review): these codes are not in chronological round order
    # (Wildcard=23 > Conference=22 > Division=20) — confirm that is intended
    # before treating the resulting column as ordinal.
    playoff_week_codes = {
        'superbowl': 25,
        'wildcard': 23,
        'conference': 22,
        'division': 20,
    }
    if isinstance(val, str):
        return playoff_week_codes.get(val.strip().lower(), val)
    return val

def other_dumb_function(val):
    """Remove every comma from ``val`` when it is a plain ``str``.

    Any value whose type is not exactly ``str`` (numbers, None, NaN, ...) is
    returned unchanged.
    """
    if type(val) != str:
        return val
    return val.replace(',', '')
    
# Absolute difference between the two final scores of each game.
df['spread'] = abs(df['score_home'] - df['score_away'])
# Explicitly encode the winner, then whether that winner was the home or away side.
# Order matters: winner_loc reads the 'winner' column created on the line before it.
df['winner'] = df.apply(game_winner, axis=1)
df['winner_loc'] = df.apply(winner_loc, axis=1)

# Columns that arrive as text / mixed types and must end up numeric.
category_to_num = ['stadium_capacity', 'schedule_week', 'weather_humidity']
df['stadium_capacity'] = df['stadium_capacity'].apply(other_dumb_function)  # strip commas from capacity strings
df['schedule_week'] = df['schedule_week'].apply(dumb_function)  # playoff labels -> integer codes
df['weather_humidity'] = df['weather_humidity'].apply(lambda x: float(str(x).split("%")[0]))  # keep the text before any '%'
for f in category_to_num:
    # Builtin `float` instead of `np.float`: the alias was deprecated in NumPy 1.20 and
    # removed in 1.24 (AttributeError). It was the builtin all along, so the dtype is unchanged.
    df[f] = df[f].astype(float)

#### Deal with the null values

In [200]:
# One-hot encode the remaining text/categorical columns (get_dummies' default column selection).
df = pd.get_dummies(df)
cols = df.columns

# use the imputer to replace null numerical data
# NOTE(review): `Imputer` was removed from sklearn.preprocessing in scikit-learn 0.22; on newer
# versions the replacement is `sklearn.impute.SimpleImputer(strategy='median')` — confirm the
# scikit-learn version this notebook targets and update the import cell to match.
i = Imputer(strategy='median')
data = i.fit_transform(df)
# fit_transform returns a plain array, so rebuild a DataFrame from the saved column labels.
# No index is passed, so new_data gets a fresh default integer index.
new_data = pd.DataFrame(data, columns=cols)

And now we have the processed data!

In [202]:
# Preview the fully numeric, imputed feature table.
new_data.head()

Unnamed: 0,stadium_capacity,ELEVATION,schedule_season,schedule_week,score_home,score_away,weather_temperature,weather_wind_mph,weather_humidity,spread,...,winner_Seattle Seahawks,winner_St. Louis Cardinals,winner_St. Louis Rams,winner_Tampa Bay Buccaneers,winner_Tennessee Oilers,winner_Tennessee Titans,winner_Washington Redskins,winner_loc_None,winner_loc_away,winner_loc_home
0,72000.0,145.4,2005.0,4.0,19.0,7.0,72.0,0.0,69.0,12.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
1,72000.0,145.4,2005.0,6.0,31.0,34.0,72.0,0.0,69.0,3.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
2,72000.0,145.4,2005.0,16.0,12.0,13.0,72.0,0.0,69.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
3,71250.0,145.4,2004.0,100.0,21.0,24.0,61.0,8.0,69.0,3.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
4,71250.0,145.4,1967.0,7.0,41.0,10.0,54.0,11.0,75.0,31.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


## Machine Learning Models

### Prepare the data for the models