In [1]:
import pandas as pd
import numpy as np
import math
import matplotlib.pyplot as plt
import random
import seaborn as sns
import math
import re
%matplotlib inline
pd.set_option('display.max_columns', 100)

In [2]:
# Load the raw game table; later cells read its score, spread, over/under and stadium columns.
df = pd.read_csv('spreadspoke_scores.csv')

In [3]:
# Drop all rows without an Over/Under value.
# The explicit copy makes `df` an independent frame, so the column assignments
# below do not trigger SettingWithCopyWarning on the filtered result.

df = df.dropna(subset=['over_under_line']).copy()

# Flip the sign of the favorite's spread.
df['spread_favorite'] = df['spread_favorite'] * -1

# The flip turns a spread of 0 into -0.0; rewrite those entries as a plain 0.
# NaN spreads compare unequal to 0 and are left untouched.
df['spread_favorite'] = np.where(df['spread_favorite'] == 0, 0, df['spread_favorite'])

In [4]:
# Add a 'total' column: home score plus away score, stored as an integer.

df = df.assign(
    total=lambda games: (games['score_home'] + games['score_away']).astype('int')
)

In [5]:
# Discard rows whose Over/Under line is a single blank (' '), then cast the column to float.

df = df.loc[df['over_under_line'].ne(' ')].astype({'over_under_line': 'float'})

In [6]:
# Target variable 'over_under_result': 'Over' when the combined score is strictly
# greater than the Over/Under line, 'Under' otherwise.
# NOTE(review): a total exactly equal to the line (a push) is labelled 'Under' — confirm this is intended.

df['over_under_result'] = np.where(
    df['total'].gt(df['over_under_line']),
    'Over',
    'Under',
)

In [7]:
# Copy 'stadium' into a 'stadium_name' column so the key matches the stadium dataset.
# (The original 'stadium' column is dropped later, after the merge.)

df = df.assign(stadium_name=df['stadium'])


In [8]:
# Stadium name changes: each key is rewritten to its value in 'stadium_name'.

STADIUM_RENAMES = {
    'Joe Robbie Stadium': 'Hard Rock Stadium',
    'Pro Player Stadium': 'Hard Rock Stadium',
    'Dolphin Stadium': 'Hard Rock Stadium',
    'Tampa Stadium': 'Raymond James Stadium',
    'Alltel Stadium': 'TIAA Bank Field',
    'Jack Murphy Stadium': 'Qualcomm Stadium',
}

for old_name, new_name in STADIUM_RENAMES.items():
    df['stadium_name'] = np.where(df['stadium_name'] == old_name, new_name, df['stadium_name'])

In [9]:
# Import stadium data; the printed shape is a quick check of how many rows and columns were read.

stadium_df = pd.read_csv('nfl_stadiums.csv')
print(stadium_df.shape)

(106, 15)


In [10]:
# Merge stadium data onto the betting DataFrame.
# Left join on 'stadium_name': every game row is kept, and stadium columns are
# NaN for games whose stadium has no match in stadium_df.

df = pd.merge(df, stadium_df, how='left', on='stadium_name')

In [11]:
# Drop unneeded columns

UNNEEDED_COLUMNS = ['stadium', 'stadium_close', 'LATITUDE', 'LONGITUDE', 'NAME']
df = df.drop(UNNEEDED_COLUMNS, axis=1)

In [12]:
# Manually add stadium locations, drop rows that still have no location,
# then drop most stadiums outside the U.S.A. by row label.
# NOTE(review): the row labels below refer to the index as it stands at this point;
# re-running this cell raises KeyError because the labels are already gone.

MANUAL_LOCATIONS = {
    'FedEx Field': 'Landover, MD',
    'TIAA Bank Field': 'Jacksonville, FL',
}
for stadium, location in MANUAL_LOCATIONS.items():
    df['stadium_location'] = np.where(df['stadium_name'] == stadium, location, df['stadium_location'])

ROWS_TO_DROP = [6168, 9219, 9484, 10021, 7118, 7366, 7576, 7828, 8203, 8437, 9158, 9424, 9435]
df = df.dropna(subset=['stadium_location']).drop(index=ROWS_TO_DROP)

In [13]:
# Keep only games from seasons after 1978 (1979 onward).
# The explicit copy makes `df` an independent frame, so the column assignments
# in the following cells do not trigger SettingWithCopyWarning.

df = df[df['schedule_season'] > 1978].copy()

In [14]:
# Stadium opening years set by hand, then the column is cast to int.

STADIUM_OPEN_YEARS = {
    'FedEx Field': 1997,
    'TIAA Bank Field': 1994,
    'Rogers Centre': 1989,
    'Rose Bowl': 1921,
    'Tulane Stadium': 1926,
    'Alamo Dome': 1993,
    'Rice Stadium': 1950,
    'Stanford Stadium': 1921,
}
for stadium, year_opened in STADIUM_OPEN_YEARS.items():
    df['stadium_open'] = np.where(df['stadium_name'] == stadium, year_opened, df['stadium_open'])

# The cast fails if any row still has a missing opening year.
df = df.astype({'stadium_open': 'int'})

In [15]:
# Fix stadium type: these stadiums are all marked 'outdoor'.

OUTDOOR_STADIUMS = ['FedEx Field', 'TIAA Bank Field', 'Stanford Stadium']
df['stadium_type'] = np.where(
    df['stadium_name'].isin(OUTDOOR_STADIUMS),
    'outdoor',
    df['stadium_type'],
)

In [16]:
# Get zip codes: the last five characters of the stadium address.

df['zipcode'] = df['stadium_address'].str.slice(start=-5)

# Zip codes entered by hand (the source notebook lists these as stadiums without an address).
# The last entry is a stadium outside the U.S.; 11111 is the value the notebook assigns to it.
MANUAL_ZIPCODES = {
    'Rose Bowl': 91103,
    'Stanford Stadium': 94305,
    'FedEx Field': 20785,
    'TIAA Bank Field': 32202,
    'Mercedes-Benz Stadium': 30313,
    'SoFi Stadium': 90301,
    'Allegiant Stadium': 89118,
    'Wembley Stadium': 11111,
}
for stadium, zip_value in MANUAL_ZIPCODES.items():
    df['zipcode'] = np.where(df['stadium_name'] == stadium, zip_value, df['zipcode'])

# Zip codes to integers.
# NOTE(review): the integer cast drops leading zeros (the value counts shown later
# include 4-digit codes such as 7073 and 2035) — confirm that is acceptable downstream.
df['zipcode'] = df['zipcode'].astype(int)

In [17]:
# Fix stadium surface in three passes:
#   1. rewrite two surface labels to the labels used elsewhere in the column,
#   2. set the surface by hand for specific stadiums,
#   3. fill any surface that is still missing with 'Grass'.

# Pass 1: matched on the current surface value.
SURFACE_RELABELS = {
    'Hellas Matrix Turf': 'FieldTurf',
    'Grass, Turf (1971-1974)': 'Grass',
}
for old_surface, new_surface in SURFACE_RELABELS.items():
    df['stadium_surface'] = np.where(df['stadium_surface'] == old_surface, new_surface, df['stadium_surface'])

# Pass 2: matched on the stadium name; overrides whatever surface the row had.
SURFACE_BY_STADIUM = {
    'Giants Stadium': 'FieldTurf',
    'Candlestick Park': 'Grass',
    'Sun Life Stadium': 'Grass',
    'Texas Stadium': 'FieldTurf',
    'Hubert H. Humphrey Metrodome': 'FieldTurf',
    'RCA Dome': 'FieldTurf',
    'Veterans Stadium': 'FieldTurf',
    'FedEx Field': 'Grass',
    'Foxboro Stadium': 'FieldTurf',
    'Pontiac Silverdome': 'FieldTurf',
    'Mile High Stadium': 'Grass',
    'Three Rivers Stadium': 'FieldTurf',
    'Edward Jones Dome': 'FieldTurf',
    'Cinergy Field': 'FieldTurf',
    'Seattle Kingdome': 'FieldTurf',
    "Houlihan's Stadium": 'Grass',
    'Houston Astrodome': 'FieldTurf',
    'RFK Memorial Stadium': 'Grass',
    'Cleveland Municipal Stadium': 'Grass',
    'Anaheim Stadium': 'Grass',
    'Atlanta-Fulton County Stadium': 'Grass',
    'Busch Memorial Stadium': 'FieldTurf',
    'Orange Bowl': 'Grass',
    'Memorial Stadium (Baltimore)': 'Grass',
    'Sun Devil Stadium': 'Grass',
    'Mall of America Field': 'FieldTurf',
    'Metropolitan Stadium': 'Grass',
    'Wembley Stadium': 'Grass',
    'Husky Stadium': 'FieldTurf',
}
for stadium, surface in SURFACE_BY_STADIUM.items():
    df['stadium_surface'] = np.where(df['stadium_name'] == stadium, surface, df['stadium_surface'])

# Pass 3: fill on the Series itself. The previous form assigned a one-column
# DataFrame (df[['stadium_surface']]) to a single column.
df['stadium_surface'] = df['stadium_surface'].fillna('Grass')




In [18]:
# Number of games per zip code.
df['zipcode'].value_counts()

7073     669
2035     353
15212    347
19148    346
80204    346
54304    342
64129    337
70112    336
45202    332
33607    331
14127    330
60605    329
92108    309
44114    307
46225    303
77054    299
55415    296
94124    294
33056    277
75062    243
30313    237
94621    224
85305    216
28202    203
32202    201
20785    191
21230    187
48342    181
37213    176
63101    168
98104    161
98134    159
48226    150
20003    142
90037    131
92806    123
30312     98
76011     97
63102     70
33125     60
95054     55
21218     52
85287     50
55425     24
11111     22
90746     22
98195     19
55455     18
61820     10
38104      8
37203      8
29634      8
90301      6
70803      4
91103      4
89118      3
78203      3
94305      1
Name: zipcode, dtype: int64

In [19]:
# Fix stadium capacity: capacities set by hand, keyed by stadium name.
# The values are written as plain integers; the column is cast to int in a later cell.

STADIUM_CAPACITIES = {
    'Giants Stadium': 80242,
    'Candlestick Park': 69732,
    'Sun Life Stadium': 64767,
    'Texas Stadium': 65675,
    'Hubert H. Humphrey Metrodome': 64121,
    'RCA Dome': 60567,
    'Veterans Stadium': 65352,
    'FedEx Field': 82000,
    'Foxboro Stadium': 60292,
    'Pontiac Silverdome': 80311,
    'Mile High Stadium': 75000,
    'Three Rivers Stadium': 59000,
    'Edward Jones Dome': 67277,
    'Cinergy Field': 59754,
    'Seattle Kingdome': 66000,
    "Houlihan's Stadium": 50000,
    'Houston Astrodome': 65000,
    'RFK Memorial Stadium': 45596,
    'Cleveland Municipal Stadium': 81000,
    'Anaheim Stadium': 69008,
    'Atlanta-Fulton County Stadium': 60606,
    'Busch Memorial Stadium': 60000,
    'Orange Bowl': 75000,
    'Memorial Stadium (Baltimore)': 50000,
    'Sun Devil Stadium': 53599,
    'Mall of America Field': 64121,
    'Metropolitan Stadium': 41200,
    'Wembley Stadium': 86000,
    'Husky Stadium': 70000,
    'TCF Bank Stadium': 50805,
    'TIAA Bank Field': 67814,
    "Memorial Stadium (Champaign)": 60670,
    "Memorial Stadium (Clemson)": 74000,
    'Liberty Bowl Memorial Stadium': 58325,
    'Vanderbilt Stadium': 40550,
    'Rose Bowl': 92542,
    "Tiger Stadium (LSU)": 100000,
    'Tulane Stadium': 70000,
    'Stanford Stadium': 50000,
    'Rice Stadium': 47000,
}

for stadium, capacity in STADIUM_CAPACITIES.items():
    df['stadium_capacity'] = np.where(df['stadium_name'] == stadium, capacity, df['stadium_capacity'])


In [20]:
# Missing values per column.
df.isna().sum()

schedule_date                      0
schedule_season                    0
schedule_week                      0
schedule_playoff                   0
team_home                          0
score_home                         0
score_away                         0
team_away                          0
team_favorite_id                   0
spread_favorite                    0
over_under_line                    0
stadium_neutral                    0
weather_temperature              534
weather_wind_mph                 534
weather_humidity                3941
weather_detail                  7713
total                              0
over_under_result                  0
stadium_name                       0
stadium_location                   0
stadium_open                       0
stadium_type                       0
stadium_address                  243
stadium_weather_station_code     243
stadium_weather_type             209
stadium_capacity                   0
stadium_surface                    0
S

In [21]:
# Stadiums that still have no capacity; an empty result means every row is filled.
df.loc[df['stadium_capacity'].isna(), 'stadium_name'].value_counts()

Series([], Name: stadium_name, dtype: int64)

In [22]:
# Column dtypes before the type fixes that follow.
df.dtypes

schedule_date                    object
schedule_season                   int64
schedule_week                    object
schedule_playoff                   bool
team_home                        object
score_home                      float64
score_away                      float64
team_away                        object
team_favorite_id                 object
spread_favorite                 float64
over_under_line                 float64
stadium_neutral                    bool
weather_temperature             float64
weather_wind_mph                float64
weather_humidity                 object
weather_detail                   object
total                             int64
over_under_result                object
stadium_name                     object
stadium_location                 object
stadium_open                      int64
stadium_type                     object
stadium_address                  object
stadium_weather_station_code     object
stadium_weather_type             object


In [23]:
# Display the frame; the rendered output is truncated to the first and last rows.
df

Unnamed: 0,schedule_date,schedule_season,schedule_week,schedule_playoff,team_home,score_home,score_away,team_away,team_favorite_id,spread_favorite,over_under_line,stadium_neutral,weather_temperature,weather_wind_mph,weather_humidity,weather_detail,total,over_under_result,stadium_name,stadium_location,stadium_open,stadium_type,stadium_address,stadium_weather_station_code,stadium_weather_type,stadium_capacity,stadium_surface,STATION,ELEVATION,zipcode
12,9/1/1979,1979,1,False,Tampa Bay Buccaneers,31.0,16.0,Detroit Lions,TB,3.0,30.0,False,79.0,9.0,87,,47,Over,Houlihan's Stadium,"Tampa, FL",1969,outdoor,"4201 North Dale Mabry Highway, Tampa, Florida ...",33607,warm,50000,Grass,USW00012842,5.8,33607
13,9/2/1979,1979,1,False,Buffalo Bills,7.0,9.0,Miami Dolphins,MIA,5.0,39.0,False,74.0,15.0,74,,16,Under,Ralph Wilson Stadium,"Orchard Park, NY",1973,outdoor,"1 Bills Dr, Orchard Park, NY 14127",14127,cold,73967,FieldTurf,US1NYER0093,178.0,14127
14,9/2/1979,1979,1,False,Chicago Bears,6.0,3.0,Green Bay Packers,CHI,3.0,31.0,False,78.0,11.0,68,,9,Under,Soldier Field,"Chicago, IL",1926,outdoor,"1410 Museum Campus Dr, Chicago, IL 60605",60605,cold,61500,Grass,USC00111550,177.7,60605
15,9/2/1979,1979,1,False,Denver Broncos,10.0,0.0,Cincinnati Bengals,DEN,3.0,31.5,False,69.0,6.0,38,,10,Under,Mile High Stadium,"Denver, CO",1960,outdoor,"1701 Bryant St, Denver, CO 80204",80204,cold,75000,Grass,USW00023062,1611.2,80204
16,9/2/1979,1979,1,False,Kansas City Chiefs,14.0,0.0,Baltimore Colts,KC,1.0,37.0,False,76.0,8.0,71,,14,Under,Arrowhead Stadium,"Kansas City, MO",1972,outdoor,"1 Arrowhead Dr, Kansas City, MO 64129",64129,cold,76416,Grass,US1MOJC0028,264.9,64129
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10241,11/1/2020,2020,8,False,Kansas City Chiefs,35.0,9.0,New York Jets,KC,19.5,49.0,False,,,,,44,Under,Arrowhead Stadium,"Kansas City, MO",1972,outdoor,"1 Arrowhead Dr, Kansas City, MO 64129",64129,cold,76416,Grass,US1MOJC0028,264.9,64129
10242,11/1/2020,2020,8,False,Miami Dolphins,28.0,17.0,Los Angeles Rams,LAR,3.5,45.5,False,,,,,45,Under,Hard Rock Stadium,"Miami Gardens, FL",1987,outdoor,"347 Don Shula Dr, Miami Gardens, FL 33056",33056,warm,65326,Grass,USW00012839,8.8,33056
10243,11/1/2020,2020,8,False,Philadelphia Eagles,23.0,9.0,Dallas Cowboys,PHI,11.5,42.5,False,,,,,32,Under,Lincoln Financial Field,"Philadelphia, PA",2003,outdoor,"1020 Pattison Ave, Philadelphia, PA 19148",19148,cold,69176,Grass,USW00013739,3.0,19148
10244,11/1/2020,2020,8,False,Seattle Seahawks,37.0,27.0,San Francisco 49ers,SEA,3.0,53.5,False,,,,,64,Over,CenturyLink Field,"Seattle, WA",2002,outdoor,"800 Occidental Ave S, Seattle, WA 98134",98134,moderate,67000,FieldTurf,US1WAKG0038,93.0,98134


### Fixing Data Types

In [24]:
# Convert the 'schedule_date' strings to datetime, then print the dtypes to confirm the change.
# NOTE(review): the `datetime` module imported here is not referenced by any cell shown — confirm before removing.
import datetime

df = df.assign(schedule_date=pd.to_datetime(df['schedule_date']))

print(df.dtypes)

schedule_date                   datetime64[ns]
schedule_season                          int64
schedule_week                           object
schedule_playoff                          bool
team_home                               object
score_home                             float64
score_away                             float64
team_away                               object
team_favorite_id                        object
spread_favorite                        float64
over_under_line                        float64
stadium_neutral                           bool
weather_temperature                    float64
weather_wind_mph                       float64
weather_humidity                        object
weather_detail                          object
total                                    int64
over_under_result                       object
stadium_name                            object
stadium_location                        object
stadium_open                             int64
stadium_type 

In [25]:
# Capacity to int: remove every ',' from the string values, then cast the column.

df = df.assign(
    stadium_capacity=df['stadium_capacity'].replace(',', '', regex=True)
).astype({'stadium_capacity': 'int'})

In [26]:
# Schedule week as an integer.
# 1-17 = regular season, > 17 = playoffs:
#   18 = Wildcard
#   19 = Divisional Round
#   20 = Conference Championship
#   21 = Superbowl
# Both capitalisations that appear in the data ('Wildcard'/'WildCard', 'Superbowl'/'SuperBowl') are mapped.

PLAYOFF_WEEK_NUMBERS = {
    'Division': 19,
    "Wildcard": 18,
    "WildCard": 18,
    'Conference': 20,
    'Superbowl': 21,
    'SuperBowl': 21,
}
for round_label, week_number in PLAYOFF_WEEK_NUMBERS.items():
    df['schedule_week'] = np.where(df['schedule_week'] == round_label, week_number, df['schedule_week'])

# Remaining values are numeric strings; the cast fails on any label not mapped above.
df = df.astype({'schedule_week': 'int'})

In [27]:
# Column dtypes after the type fixes ('schedule_week' is now an integer column).
df.dtypes

schedule_date                   datetime64[ns]
schedule_season                          int64
schedule_week                            int64
schedule_playoff                          bool
team_home                               object
score_home                             float64
score_away                             float64
team_away                               object
team_favorite_id                        object
spread_favorite                        float64
over_under_line                        float64
stadium_neutral                           bool
weather_temperature                    float64
weather_wind_mph                       float64
weather_humidity                        object
weather_detail                          object
total                                    int64
over_under_result                       object
stadium_name                            object
stadium_location                        object
stadium_open                             int64
stadium_type 

In [30]:
# Regular-season (False) versus playoff (True) game counts.
df['schedule_playoff'].value_counts()

False    9845
True      373
Name: schedule_playoff, dtype: int64

In [29]:
# Write the cleaned frame to CSV with a header row and without the index column.
df.to_csv('final_df.csv', index = False, header=True)