In [81]:
import pandas as pd
import numpy as np
from sklearn_pandas import DataFrameMapper
from sklearn.preprocessing import Imputer, LabelEncoder, OneHotEncoder, StandardScaler
from sklearn.pipeline import Pipeline, FeatureUnion

nfl_teams = pd.read_csv('https://raw.githubusercontent.com/mauzeyj/machine_learning_club/master/Data/nfl_teams.csv')
nfl_stadiums = pd.read_csv('https://raw.githubusercontent.com/mauzeyj/machine_learning_club/master/Data/nfl_stadiums.csv', encoding='latin')
spreadspoke_scores = pd.read_csv('https://raw.githubusercontent.com/mauzeyj/machine_learning_club/master/Data/spreadspoke_scores.csv')

## Data Munging, Cleaning, and feature engineering

Now I'm going to select only the features that are most likely to contribute to performance and are easy to handle.

In [15]:
nfl_stadiums = nfl_stadiums[['stadium_name', 'stadium_type', 'stadium_weather_type', 'stadium_capacity', 'stadium_surface', 'ELEVATION']]
nfl_stadiums.rename(index=str, columns={'stadium_name':'stadium'}, inplace=True)
nfl_stadiums['ELEVATION'] = nfl_stadiums['ELEVATION'].astype(str)

The rows 'team_favorite_id', 'spread_favorite', and 'over_under_line' are predictions themselves, and will not affect the outcome of a game - so we drop them. The field 'weather_detail', on the other hand, is only occupied by very few so it will also be dropped.

In [16]:
spreadspoke_scores.drop(['team_favorite_id', 'spread_favorite', 'over_under_line', 'weather_detail', 'schedule_date'], axis=1, inplace=True)
df = pd.merge(nfl_stadiums, spreadspoke_scores, on='stadium', how='outer')
df = df.dropna(subset=['score_home', 'score_away'])
df['ELEVATION'] = df['ELEVATION'].astype(float)

### Now to add some more helpful features

In [17]:
def game_winner(row):
    if row['score_home'] > row['score_away']:
        return row['team_home']
    elif row['score_home'] < row['score_away']:
        return row['team_away']
    else:
        return 'None'
    
def winner_loc(row):
    if row['winner'] == row['team_home']:
        return 'home'
    elif row['winner'] == row['team_away']:
        return 'away'
    else:
        return 'None'

def dumb_function(val):
    if val == 'Superbowl':
        return 25
    elif val == 'SuperBowl':
        return 25
    elif val == 'Wildcard':
        return 23
    elif val == 'WildCard':
        return 23
    elif val == 'Conference':
        return 22
    elif val == 'Division':
        return 20
    else:
        return val

def other_dumb_function(val):
    if type(val) == str:
        return val.replace(',', '')
    else:
        return val
    
df['spread'] = abs(df['score_home'] - df['score_away']) # score spreads
df['winner'] = df.apply(game_winner, axis=1) # explicity encode the winner
df['winner_loc'] = df.apply(winner_loc, axis=1) # encode the winner as home or away
category_to_num = ['stadium_capacity', 'schedule_week', 'weather_humidity'] # convert these weird columns to numerical
df['stadium_capacity'] = df['stadium_capacity'].apply(other_dumb_function) # convert stadium_capacity to float
df['schedule_week']= df['schedule_week'].apply(dumb_function) # change Superbowl value to be large int
df['weather_humidity'] = df['weather_humidity'].apply(lambda x: float(str(x).split("%")[0])) # change humidity to all be num
for f in category_to_num:
    df[f] = df[f].astype(np.float)

In [18]:
print(df.head())
print(df.columns)

          stadium stadium_type stadium_weather_type  stadium_capacity  \
0      Alamo Dome       indoor                 dome           72000.0   
1      Alamo Dome       indoor                 dome           72000.0   
2      Alamo Dome       indoor                 dome           72000.0   
3  Alltel Stadium          NaN                  NaN               NaN   
4  Alumni Stadium      outdoor                 cold               NaN   

  stadium_surface  ELEVATION  schedule_season  schedule_week schedule_playoff  \
0       FieldTurf        NaN           2005.0            4.0            False   
1       FieldTurf        NaN           2005.0            6.0            False   
2       FieldTurf        NaN           2005.0           16.0            False   
3             NaN        NaN           2004.0           25.0             True   
4           Grass        NaN           1967.0            7.0            False   

             team_home  score_home  score_away             team_away  \
0 

In [29]:
df["stadium_type"].value_counts().index

Index(['outdoor', 'indoor', 'retractable'], dtype='object')

In [107]:
df.columns

Index(['stadium', 'stadium_type', 'stadium_weather_type', 'stadium_capacity',
       'stadium_surface', 'ELEVATION', 'schedule_season', 'schedule_week',
       'schedule_playoff', 'team_home', 'score_home', 'score_away',
       'team_away', 'stadium_neutral', 'weather_temperature',
       'weather_wind_mph', 'weather_humidity', 'spread', 'winner',
       'winner_loc'],
      dtype='object')

#### Deal with the null values

In [112]:
def one_hot_labels(df, cols):
    labels = []
    for c in cols:
        vals = df[c].value_counts().index
        for v in vals:
            labels.append(str(c) + '_' + str(v))
    return labels

['stadium_Giants Stadium',
 'stadium_Lambeau Field',
 'stadium_Qualcomm Stadium',
 'stadium_Arrowhead Stadium',
 'stadium_Soldier Field',
 'stadium_Candlestick Park',
 'stadium_Ralph Wilson Stadium',
 'stadium_Louisiana Superdome',
 'stadium_Oakland Coliseum',
 'stadium_Texas Stadium',
 'stadium_Mile High Stadium',
 'stadium_Veterans Stadium',
 'stadium_Three Rivers Stadium',
 'stadium_Sun Life Stadium',
 'stadium_RFK Memorial Stadium',
 'stadium_Foxboro Stadium',
 'stadium_Cleveland Municipal Stadium',
 'stadium_Cinergy Field',
 'stadium_Los Angeles Memorial Coliseum',
 'stadium_Hubert H. Humphrey Metrodome',
 'stadium_Houston Astrodome',
 'stadium_Georgia Dome',
 'stadium_Pontiac Silverdome',
 'stadium_University of Phoenix Stadium',
 'stadium_RCA Dome',
 'stadium_Atlanta-Fulton County Stadium',
 'stadium_Bank of America Stadium',
 'stadium_Seattle Kingdome',
 'stadium_EverBank Field',
 'stadium_FedEx Field',
 'stadium_M&T Bank Stadium',
 'stadium_Raymond James Stadium',
 "stadium_Ho

In [73]:
from sklearn.base import TransformerMixin, BaseEstimator

class DataFrameSelector(BaseEstimator, TransformerMixin):
    def __init__(self, attribute_names):
        self.attribute_names = attribute_names
        
    def fit(self, X, y=None):
        return self
    
    def transform(self, X):
        return X[self.attribute_names].values

In [127]:
class GetDummiesFrame(TransformerMixin):
    def __init__(self, cols):
        self.cols = cols
        
    def fit(self, X, y=None):
        return self
    
    def transform(self, X):
        return pd.get_dummies(pd.DataFrame(X, columns=self.cols)) #.values

In [125]:
from sklearn_pandas import CategoricalImputer

numerical_fields = ['stadium_capacity', 'ELEVATION', 'schedule_season', 'schedule_week', 'score_home', 'score_away', 
                    'weather_temperature', 'weather_wind_mph', 'weather_humidity']
categorical_fields = ['stadium', 'stadium_type', 'stadium_weather_type', 'stadium_surface', 'team_home',
                     'team_away', 'stadium_neutral']

num_pipeline = Pipeline([
    ("select", DataFrameSelector(numerical_fields)),
    ("impute", Imputer(strategy='mean')),
    ("scale", StandardScaler())
])

cat_pipeline = Pipeline([
    ("select", DataFrameSelector(categorical_fields)),
    ("impute", CategoricalImputer(strategy='most_frequent')),
    ("encode", GetDummiesFrame(categorical_fields))
])

full_pipeline = FeatureUnion(transformer_list = [
    ("num_pipe", num_pipeline),
    ("cat_pipe", cat_pipeline)
])

cat_pipeline.fit_transform(df)
cat_pipeline.get_params()
#d = full_pipeline.fit_transform(df)

TypeError: get_params() got an unexpected keyword argument 'deep'

In [21]:
clean_df.head()

AttributeError: 'DataFrameMapper' object has no attribute 'head'

In [5]:
df = pd.get_dummies(df)
cols = df.columns

# use the imputer to replace null numerical data
i = Imputer(strategy='median')
data = i.fit_transform(df)
new_data = pd.DataFrame(data, columns=cols)

And now we have the processed data!

In [6]:
new_data.head()

Unnamed: 0,stadium_capacity,ELEVATION,schedule_season,schedule_week,score_home,score_away,weather_temperature,weather_wind_mph,weather_humidity,spread,...,winner_Seattle Seahawks,winner_St. Louis Cardinals,winner_St. Louis Rams,winner_Tampa Bay Buccaneers,winner_Tennessee Oilers,winner_Tennessee Titans,winner_Washington Redskins,winner_loc_None,winner_loc_away,winner_loc_home
0,72000.0,145.4,2005.0,4.0,19.0,7.0,72.0,0.0,69.0,12.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
1,72000.0,145.4,2005.0,6.0,31.0,34.0,72.0,0.0,69.0,3.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
2,72000.0,145.4,2005.0,16.0,12.0,13.0,72.0,0.0,69.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
3,71250.0,145.4,2004.0,25.0,21.0,24.0,61.0,8.0,69.0,3.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
4,71250.0,145.4,1967.0,7.0,41.0,10.0,54.0,11.0,75.0,31.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


## Machine Learning Models

### Prepare the data for the models

In [8]:
for col in new_data.columns:
    print(col)

stadium_capacity
ELEVATION
schedule_season
schedule_week
score_home
score_away
weather_temperature
weather_wind_mph
weather_humidity
spread
stadium_AT&T Stadium
stadium_Alamo Dome
stadium_Alltel Stadium
stadium_Alumni Stadium
stadium_Anaheim Stadium
stadium_Arrowhead Stadium
stadium_Atlanta-Fulton County Stadium
stadium_Balboa Stadium
stadium_Bank of America Stadium
stadium_Busch Memorial Stadium
stadium_Candlestick Park
stadium_CenturyLink Field
stadium_Cinergy Field
stadium_Cleveland Municipal Stadium
stadium_Cotton Bowl
stadium_Cowboys Stadium
stadium_Dolphin Stadium
stadium_Edward Jones Dome
stadium_Estadio Azteca
stadium_EverBank Field
stadium_FedEx Field
stadium_Fenway Park
stadium_FirstEnergy Stadium
stadium_Ford Field
stadium_Foxboro Stadium
stadium_Franklin Field
stadium_Georgia Dome
stadium_Giants Stadium
stadium_Gillette Stadium
stadium_Hard Rock Stadium
stadium_Harvard Stadium
stadium_Heinz Field
stadium_Houlihan's Stadium
stadium_Houston Astrodome
stadium_Hubert H. Humphre