## Data Cleaning

In [6]:
##ADD YOUR IMPORTS HERE...
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from datetime import datetime as dt
import os
import seaborn as sns
import re
import folium
import dash
from dash import dcc, html
from dash.dependencies import Input, Output
import plotly.graph_objs as go
from jupyter_dash import JupyterDash

In [7]:
spreadspoke_df = pd.read_csv('NFL_sportsbetting_data/spreadspoke_scores.csv')
teams_df = pd.read_csv('NFL_sportsbetting_data/nfl_teams.csv')
stadiums_df = pd.read_csv('NFL_sportsbetting_data/nfl_stadiums.csv', encoding='ISO-8859-1')

In [8]:
#need to make sure that data has non null values for the betting data
spreadspoke_df = spreadspoke_df.dropna(subset=['team_favorite_id','spread_favorite','over_under_line'])

#converting the date column to datetimes
spreadspoke_df['schedule_date'] = pd.to_datetime(spreadspoke_df['schedule_date'], format='%m/%d/%Y')

#dropping games before 2002
spreadspoke_df = spreadspoke_df[spreadspoke_df['schedule_season'] >= 2002]

#spreadspoke_df.head()

In [9]:
#Walker
#Read in nfl_teams.csv as a usable dataframe
teams_df = pd.read_csv('NFL_sportsbetting_data/nfl_teams.csv')
#Clean up teams_df to remove useless information
teams_df.drop(['team_name_short', 'team_id_pfr', 'team_conference_pre2002', 'team_division_pre2002'], axis=1, inplace=True)
teams_df

Unnamed: 0,team_name,team_id,team_conference,team_division
0,Arizona Cardinals,ARI,NFC,NFC West
1,Atlanta Falcons,ATL,NFC,NFC South
2,Baltimore Colts,IND,AFC,
3,Baltimore Ravens,BAL,AFC,AFC North
4,Boston Patriots,NE,AFC,
5,Buffalo Bills,BUF,AFC,AFC East
6,Carolina Panthers,CAR,NFC,NFC South
7,Chicago Bears,CHI,NFC,NFC North
8,Cincinnati Bengals,CIN,AFC,AFC North
9,Cleveland Browns,CLE,AFC,AFC North


In [10]:
#Create a dictionary mapping team names to IDs
team_id_mapping = dict(zip(teams_df['team_name'], teams_df['team_id']))

#Add columns with team ID tags
spreadspoke_df['team_home_id'] = spreadspoke_df['team_home'].replace(team_id_mapping)
spreadspoke_df['team_away_id'] = spreadspoke_df['team_away'].replace(team_id_mapping)
spreadspoke_df = spreadspoke_df[spreadspoke_df['team_favorite_id'] != 'PICK']
spreadspoke_df.head(5)

Unnamed: 0,schedule_date,schedule_season,schedule_week,schedule_playoff,team_home,score_home,score_away,team_away,team_favorite_id,spread_favorite,over_under_line,stadium,stadium_neutral,weather_temperature,weather_wind_mph,weather_humidity,weather_detail,team_home_id,team_away_id
7872,2002-09-05,2002,1,False,New York Giants,13.0,16.0,San Francisco 49ers,SF,-4.0,39,Giants Stadium,False,75.0,12.0,58.0,,NYG,SF
7873,2002-09-08,2002,1,False,Buffalo Bills,31.0,37.0,New York Jets,NYJ,-3.0,43,Ralph Wilson Stadium,False,75.0,7.0,50.0,,BUF,NYJ
7875,2002-09-08,2002,1,False,Chicago Bears,27.0,23.0,Minnesota Vikings,CHI,-4.5,41,Memorial Stadium (Champaign),False,76.0,5.0,75.0,,CHI,MIN
7876,2002-09-08,2002,1,False,Cincinnati Bengals,6.0,34.0,San Diego Chargers,CIN,-3.0,37,Paul Brown Stadium,False,81.0,5.0,50.0,,CIN,LAC
7877,2002-09-08,2002,1,False,Cleveland Browns,39.0,40.0,Kansas City Chiefs,CLE,-2.0,36,FirstEnergy Stadium,False,78.0,7.0,54.0,,CLE,KC


In [11]:
# Step 1: Determine the winning team
spreadspoke_df['winning_team'] = spreadspoke_df.apply(lambda row: row['team_home_id'] if row['score_home'] > row['score_away'] else row['team_away_id'], axis=1)

# Step 2: Create the new column
spreadspoke_df['favorite_won'] = spreadspoke_df['team_favorite_id'] == spreadspoke_df['winning_team']

# Prefer to have 'Yes'/'No' instead of True/False:
spreadspoke_df['favorite_won'] = (spreadspoke_df['team_favorite_id'] == spreadspoke_df['winning_team']).map({True: 'Yes', False: 'No'})

spreadspoke_df.head(10)

Unnamed: 0,schedule_date,schedule_season,schedule_week,schedule_playoff,team_home,score_home,score_away,team_away,team_favorite_id,spread_favorite,...,stadium,stadium_neutral,weather_temperature,weather_wind_mph,weather_humidity,weather_detail,team_home_id,team_away_id,winning_team,favorite_won
7872,2002-09-05,2002,1,False,New York Giants,13.0,16.0,San Francisco 49ers,SF,-4.0,...,Giants Stadium,False,75.0,12.0,58.0,,NYG,SF,SF,Yes
7873,2002-09-08,2002,1,False,Buffalo Bills,31.0,37.0,New York Jets,NYJ,-3.0,...,Ralph Wilson Stadium,False,75.0,7.0,50.0,,BUF,NYJ,NYJ,Yes
7875,2002-09-08,2002,1,False,Chicago Bears,27.0,23.0,Minnesota Vikings,CHI,-4.5,...,Memorial Stadium (Champaign),False,76.0,5.0,75.0,,CHI,MIN,CHI,Yes
7876,2002-09-08,2002,1,False,Cincinnati Bengals,6.0,34.0,San Diego Chargers,CIN,-3.0,...,Paul Brown Stadium,False,81.0,5.0,50.0,,CIN,LAC,LAC,No
7877,2002-09-08,2002,1,False,Cleveland Browns,39.0,40.0,Kansas City Chiefs,CLE,-2.0,...,FirstEnergy Stadium,False,78.0,7.0,54.0,,CLE,KC,KC,No
7878,2002-09-08,2002,1,False,Denver Broncos,23.0,16.0,St. Louis Rams,LAR,-3.0,...,Sports Authority Field at Mile High,False,73.0,13.0,45.0,,DEN,LAR,DEN,No
7879,2002-09-08,2002,1,False,Green Bay Packers,37.0,34.0,Atlanta Falcons,GB,-7.0,...,Lambeau Field,False,72.0,6.0,78.0,,GB,ATL,GB,Yes
7880,2002-09-08,2002,1,False,Houston Texans,19.0,10.0,Dallas Cowboys,DAL,-8.5,...,Reliant Stadium,False,72.0,0.0,,indoor,HOU,DAL,HOU,No
7881,2002-09-08,2002,1,False,Jacksonville Jaguars,25.0,28.0,Indianapolis Colts,IND,-3.5,...,EverBank Field,False,82.0,14.0,77.0,,JAX,IND,IND,Yes
7882,2002-09-08,2002,1,False,Miami Dolphins,49.0,21.0,Detroit Lions,MIA,-9.5,...,Sun Life Stadium,False,83.0,9.0,80.0,,MIA,DET,MIA,Yes


In [12]:
#Convert the values in the three required columns into floats to be compared to one another
spreadspoke_df['score_home'] = spreadspoke_df['score_home'].astype(float)
spreadspoke_df['score_away'] = spreadspoke_df['score_away'].astype(float)
spreadspoke_df['over_under_line'] = spreadspoke_df['over_under_line'].astype(float)

spreadspoke_df['game_score_sum'] = spreadspoke_df['score_home'] + spreadspoke_df['score_away']

spreadspoke_df['over_under_delta'] = spreadspoke_df['game_score_sum'] - spreadspoke_df['over_under_line']

spreadspoke_df['over_under_result'] = spreadspoke_df.apply(lambda row: 
    'Over' if row['game_score_sum'] > row['over_under_line'] 
    else ('Under' if row['game_score_sum'] < row['over_under_line'] 
          else 'Push'), axis=1)
spreadspoke_df.head()

Unnamed: 0,schedule_date,schedule_season,schedule_week,schedule_playoff,team_home,score_home,score_away,team_away,team_favorite_id,spread_favorite,...,weather_wind_mph,weather_humidity,weather_detail,team_home_id,team_away_id,winning_team,favorite_won,game_score_sum,over_under_delta,over_under_result
7872,2002-09-05,2002,1,False,New York Giants,13.0,16.0,San Francisco 49ers,SF,-4.0,...,12.0,58.0,,NYG,SF,SF,Yes,29.0,-10.0,Under
7873,2002-09-08,2002,1,False,Buffalo Bills,31.0,37.0,New York Jets,NYJ,-3.0,...,7.0,50.0,,BUF,NYJ,NYJ,Yes,68.0,25.0,Over
7875,2002-09-08,2002,1,False,Chicago Bears,27.0,23.0,Minnesota Vikings,CHI,-4.5,...,5.0,75.0,,CHI,MIN,CHI,Yes,50.0,9.0,Over
7876,2002-09-08,2002,1,False,Cincinnati Bengals,6.0,34.0,San Diego Chargers,CIN,-3.0,...,5.0,50.0,,CIN,LAC,LAC,No,40.0,3.0,Over
7877,2002-09-08,2002,1,False,Cleveland Browns,39.0,40.0,Kansas City Chiefs,CLE,-2.0,...,7.0,54.0,,CLE,KC,KC,No,79.0,43.0,Over


In [13]:
#Define the new order of columns
new_order = ["schedule_date","team_home","score_home","score_away","team_away","team_favorite_id","favorite_won",
             "spread_favorite","over_under_line","over_under_result","over_under_delta","game_score_sum","schedule_season","schedule_playoff","schedule_week","stadium","stadium_neutral",
             "weather_temperature","weather_wind_mph","weather_humidity", "team_home_id", "team_away_id"]

#Rearrange the DataFrame
spreadspoke_df = spreadspoke_df[new_order]
spreadspoke_df = spreadspoke_df.set_index('schedule_date')

pd.set_option('display.max_columns', None)

spreadspoke_df.head()

Unnamed: 0_level_0,team_home,score_home,score_away,team_away,team_favorite_id,favorite_won,spread_favorite,over_under_line,over_under_result,over_under_delta,game_score_sum,schedule_season,schedule_playoff,schedule_week,stadium,stadium_neutral,weather_temperature,weather_wind_mph,weather_humidity,team_home_id,team_away_id
schedule_date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2002-09-05,New York Giants,13.0,16.0,San Francisco 49ers,SF,Yes,-4.0,39.0,Under,-10.0,29.0,2002,False,1,Giants Stadium,False,75.0,12.0,58.0,NYG,SF
2002-09-08,Buffalo Bills,31.0,37.0,New York Jets,NYJ,Yes,-3.0,43.0,Over,25.0,68.0,2002,False,1,Ralph Wilson Stadium,False,75.0,7.0,50.0,BUF,NYJ
2002-09-08,Chicago Bears,27.0,23.0,Minnesota Vikings,CHI,Yes,-4.5,41.0,Over,9.0,50.0,2002,False,1,Memorial Stadium (Champaign),False,76.0,5.0,75.0,CHI,MIN
2002-09-08,Cincinnati Bengals,6.0,34.0,San Diego Chargers,CIN,No,-3.0,37.0,Over,3.0,40.0,2002,False,1,Paul Brown Stadium,False,81.0,5.0,50.0,CIN,LAC
2002-09-08,Cleveland Browns,39.0,40.0,Kansas City Chiefs,CLE,No,-2.0,36.0,Over,43.0,79.0,2002,False,1,FirstEnergy Stadium,False,78.0,7.0,54.0,CLE,KC


In [14]:
#Adding Stadium Info

In [15]:
# # Values to replace
# values_to_replace = ['Washington Redskins', 'Washington Football Team']

# # Replace specific values with 'New Team Name'
# spreadspoke_df['team_home'] = spreadspoke_df['team_home'].replace(values_to_replace, 'Washington Commanders')  
# spreadspoke_df['team_away'] = spreadspoke_df['team_away'].replace(values_to_replace, 'Washington Commanders')  

# #spreadspoke_df.head(20) Commanders correctly replaced

In [16]:
nfl_teams = [
    "Arizona Cardinals",  "Atlanta Falcons",  "Baltimore Ravens",  "Buffalo Bills",  "Carolina Panthers",  "Chicago Bears",
    "Cincinnati Bengals", "Cleveland Browns", "Dallas Cowboys", "Denver Broncos", "Detroit Lions", "Green Bay Packers",
    "Houston Texans", "Indianapolis Colts",  "Jacksonville Jaguars",  "Kansas City Chiefs",  "Las Vegas Raiders",
    "Los Angeles Chargers",  "Los Angeles Rams",  "Miami Dolphins",  "Minnesota Vikings",  "New England Patriots",
    "New Orleans Saints",  "New York Giants",  "New York Jets",  "Philadelphia Eagles",  "Pittsburgh Steelers",
    "San Francisco 49ers",  "Seattle Seahawks",  "Tampa Bay Buccaneers",  "Tennessee Titans",  "Washington Commanders"
]

# Create regex pattern to match any of the NFL team names
pattern = r'\b(' + '|'.join(nfl_teams) + r')\b'

# Filter the DataFrame to only keep rows that contain an NFL team name
filtered_df = spreadspoke_df[spreadspoke_df['team_home'].str.contains(pattern, case=False, regex=True)]
spreadspoke_df2 = filtered_df[filtered_df['team_away'].str.contains(pattern, case=False, regex=True)]

# Display the filtered DataFrame
spreadspoke_df2.head()

#we don't lose washington commanders as the home team here
len(list(spreadspoke_df2['team_home'].unique())) 

  filtered_df = spreadspoke_df[spreadspoke_df['team_home'].str.contains(pattern, case=False, regex=True)]
  spreadspoke_df2 = filtered_df[filtered_df['team_away'].str.contains(pattern, case=False, regex=True)]


32

In [17]:
values_to_replace = ['Washington Redskins', 'Washington Football Team']

# Replace washington redskins and washington football team with washington commanders
spreadspoke_df2.loc[spreadspoke_df2['team_home'].isin(values_to_replace), 'team_home'] = 'Washington Commanders'
spreadspoke_df2.loc[spreadspoke_df2['team_away'].isin(values_to_replace), 'team_away'] = 'Washington Commanders'

#prove that it was replaced correctly
# spreadspoke_df2.query("team_away == 'Washington Commanders'")
# spreadspoke_df2.query("team_home == 'Washington Commanders'")

In [18]:
#Matching teams to stadiums:

#list of current stadiums that are open (including stadiums that have had multiple names since 2002)
stadiums = [
    "Acrisure Stadium", "Allegiant Stadium", "GEHA Field at Arrowhead Stadium", "AT&T Stadium", "Bank of America Stadium",
    "Caesars Superdome", "Empower Field at Mile High", "FedEx Field","FirstEnergy Stadium", "Ford Field", "Gillette Stadium",
    "Hard Rock Stadium", "Highmark Stadium", "Lambeau Field", "Levi's Stadium", "Lincoln Financial Field","Lumen Field",
    "M&T Bank Stadium", "Mercedes-Benz Stadium", "MetLife Stadium", "NRG Stadium", "Nissan Stadium", "Paycor Stadium",
    "Raymond James Stadium", "SoFi Stadium", "Soldier Field", "State Farm Stadium", "TIAA Bank Field", "U.S. Bank Stadium",
    "Lucas Oil Stadium",
    "Cowboys Stadium","Sports Authority Field at Mile High","Mile High Stadium","University of Phoenix Stadium",
    "Reliant Stadium", "EverBank Field", "Arrowhead Stadium", "LP Stadium", "Louisiana Superdome",
    "Mercedes-Benz Superdome","Bills Stadium", "New Era Field","Ralph Wilson Stadium", "Heinz Field"]

pattern = r'\b(' + '|'.join(stadiums) + r')\b'
stadiums_filtered_df = stadiums_df[stadiums_df['stadium_name'].str.contains(pattern, case=False, regex=True)]
stadiums_filtered_df = stadiums_filtered_df.drop(columns=['stadium_close', 'stadium_address', 'stadium_weather_station_zipcode', 'stadium_weather_station',	'stadium_weather_station_name', 'stadium_azimuthangle', 'stadium_elevation']).reset_index().drop(columns=['index'])
#stadiums_filtered_df

  stadiums_filtered_df = stadiums_df[stadiums_df['stadium_name'].str.contains(pattern, case=False, regex=True)]


In [19]:
#add a column to the spreadspoke_df to match the hometown to the home team
stadium_locations = {'Pittsburgh Steelers':'Pittsburgh, PA', 'Las Vegas Raiders':'Paradise, NV', 'Dallas Cowboys':'Arlington, TX',
                     'Carolina Panthers':'Charlotte, NC', 'New Orleans Saints':'New Orleans, LA', 'Denver Broncos':'Denver, CO',
                     'Washington Commanders':'Landover, MD', 'Cleveland Browns':'Cleveland, OH', 'Detroit Lions':'Detroit, MI',
                     'Kansas City Chiefs':'Kansas City, MO', 'New England Patriots':'Foxborough, MA', 
                     'Miami Dolphins':'Miami Gardens, FL', 'Buffalo Bills':'Orchard Park, NY', 'Green Bay Packers':'Green Bay, WI',
                     'San Francisco 49ers':'Santa Clara, CA', 'Philadelphia Eagles':'Philadelphia, PA', 
                     'Indianapolis Colts':'Indianapolis, IN', 'Seattle Seahawks':'Seattle, WA', 'Baltimore Ravens':'Baltimore, MD',
                     'Atlanta Falcons':'Atlanta, GA', 'New York Giants':'East Rutherford, NJ', 'New York Jets':'East Rutherford, NJ',
                     'Tennessee Titans':'Nashville, TN', 'Houston Texans':'Houston, TX', 'Cincinnati Bengals':'Cincinnati, OH',
                     'Tampa Bay Buccaneers':'Tampa, FL', 'Los Angeles Chargers':'Inglewood, CA', 'Los Angeles Rams':'Inglewood, CA',
                     'Chicago Bears':'Chicago, IL', 'Arizona Cardinals':'Glendale, AZ', 'Jacksonville Jaguars':'Jacksonville, FL',
                     'Minnesota Vikings':'Minneapolis, MN'}

spreadspoke_df2['stadium_location'] = spreadspoke_df2['team_home'].map(stadium_locations) #this is where you lose the commanders
spreadspoke_df2.reset_index()
spreadspoke_df2.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  spreadspoke_df2['stadium_location'] = spreadspoke_df2['team_home'].map(stadium_locations) #this is where you lose the commanders


Unnamed: 0_level_0,team_home,score_home,score_away,team_away,team_favorite_id,favorite_won,spread_favorite,over_under_line,over_under_result,over_under_delta,game_score_sum,schedule_season,schedule_playoff,schedule_week,stadium,stadium_neutral,weather_temperature,weather_wind_mph,weather_humidity,team_home_id,team_away_id,stadium_location
schedule_date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
2002-09-05,New York Giants,13.0,16.0,San Francisco 49ers,SF,Yes,-4.0,39.0,Under,-10.0,29.0,2002,False,1,Giants Stadium,False,75.0,12.0,58.0,NYG,SF,"East Rutherford, NJ"
2002-09-08,Buffalo Bills,31.0,37.0,New York Jets,NYJ,Yes,-3.0,43.0,Over,25.0,68.0,2002,False,1,Ralph Wilson Stadium,False,75.0,7.0,50.0,BUF,NYJ,"Orchard Park, NY"
2002-09-08,Chicago Bears,27.0,23.0,Minnesota Vikings,CHI,Yes,-4.5,41.0,Over,9.0,50.0,2002,False,1,Memorial Stadium (Champaign),False,76.0,5.0,75.0,CHI,MIN,"Chicago, IL"
2002-09-08,Cleveland Browns,39.0,40.0,Kansas City Chiefs,CLE,No,-2.0,36.0,Over,43.0,79.0,2002,False,1,FirstEnergy Stadium,False,78.0,7.0,54.0,CLE,KC,"Cleveland, OH"
2002-09-08,Green Bay Packers,37.0,34.0,Atlanta Falcons,GB,Yes,-7.0,42.5,Over,28.5,71.0,2002,False,1,Lambeau Field,False,72.0,6.0,78.0,GB,ATL,"Green Bay, WI"


In [71]:
#merge the stadium dataframe to the spreadspoke dataframe to get the stadium information in the same df as the game information
spreadspoke_df3 = pd.merge(spreadspoke_df2, stadiums_filtered_df, on='stadium_location', how='outer')

#create a mask to filter rows for games played in stadiums that no longer exist
mask = spreadspoke_df3['schedule_season'] >= spreadspoke_df3['stadium_open']
spreadspoke_df3 = spreadspoke_df3[mask]

#need to see all of the columns
pd.set_option('display.max_columns', None)

#check for the Los Angeles Chargers
#check for the Los Angeles Rams
#check for the Las Vegas Raiders

# spreadspoke_df3.query("team_home == 'Los Angeles Chargers'").head(5)
# spreadspoke_df3.query("team_home == 'Los Angeles Rams'").head(5)
# spreadspoke_df3.query("team_home == 'Las Vegas Raiders'").head(5)

In [83]:
#check to make sure we didn't lose any stadiums during the merge
unique_stadiums = spreadspoke_df3['stadium_name'].unique()

#the number of stadiums stays the same (we didn't lose any)
print(len(stadiums)) #44
print(len(unique_stadiums)) #44

44
44


In [85]:
final_df = spreadspoke_df3.reset_index().drop(columns=['index'])
final_df
#final_df contains a combination of the stadium data and the game data

Unnamed: 0,team_home,score_home,score_away,team_away,team_favorite_id,favorite_won,spread_favorite,over_under_line,over_under_result,over_under_delta,game_score_sum,schedule_season,schedule_playoff,schedule_week,stadium,stadium_neutral,weather_temperature,weather_wind_mph,weather_humidity,team_home_id,team_away_id,stadium_location,stadium_name,stadium_open,stadium_type,stadium_weather_type,stadium_capacity,stadium_surface,stadium_latitude,stadium_longitude
0,New York Giants,31.0,18.0,Carolina Panthers,NYG,Yes,-6.0,41.0,Over,8.0,49.0,2010,False,1,MetLife Stadium,False,65.0,1.0,67.0,NYG,CAR,"East Rutherford, NJ",MetLife Stadium,2010.0,outdoor,cold,82500,FieldTurf,40.813528,-74.074361
1,New York Jets,9.0,10.0,Baltimore Ravens,NYJ,No,-1.0,36.5,Under,-17.5,19.0,2010,False,1,MetLife Stadium,False,73.0,1.0,100.0,NYJ,BAL,"East Rutherford, NJ",MetLife Stadium,2010.0,outdoor,cold,82500,FieldTurf,40.813528,-74.074361
2,New York Jets,28.0,14.0,New England Patriots,NE,No,-3.0,39.5,Over,2.5,42.0,2010,False,2,MetLife Stadium,False,82.0,6.0,36.0,NYJ,NE,"East Rutherford, NJ",MetLife Stadium,2010.0,outdoor,cold,82500,FieldTurf,40.813528,-74.074361
3,New York Giants,10.0,29.0,Tennessee Titans,NYG,No,-3.0,43.5,Under,-4.5,39.0,2010,False,3,MetLife Stadium,False,70.0,3.0,48.0,NYG,TEN,"East Rutherford, NJ",MetLife Stadium,2010.0,outdoor,cold,82500,FieldTurf,40.813528,-74.074361
4,New York Giants,17.0,3.0,Chicago Bears,NYG,Yes,-3.5,44.0,Under,-24.0,20.0,2010,False,4,MetLife Stadium,False,59.0,1.0,52.0,NYG,CHI,"East Rutherford, NJ",MetLife Stadium,2010.0,outdoor,cold,82500,FieldTurf,40.813528,-74.074361
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6496,Washington Commanders,31.0,38.0,Philadelphia Eagles,PHI,Yes,-7.0,43.0,Over,26.0,69.0,2023,False,8,FedEx Field,False,,,,WAS,PHI,"Landover, MD",FedEx Field,1997.0,outdoor,moderate,79000,Grass,38.907778,-76.864444
6497,Washington Commanders,19.0,31.0,New York Giants,WAS,No,-8.5,39.0,Over,11.0,50.0,2023,False,11,FedEx Field,False,,,,WAS,NYG,"Landover, MD",FedEx Field,1997.0,outdoor,moderate,79000,Grass,38.907778,-76.864444
6498,Washington Commanders,15.0,45.0,Miami Dolphins,MIA,Yes,-9.0,49.5,Over,10.5,60.0,2023,False,13,FedEx Field,False,,,,WAS,MIA,"Landover, MD",FedEx Field,1997.0,outdoor,moderate,79000,Grass,38.907778,-76.864444
6499,Washington Commanders,10.0,27.0,San Francisco 49ers,SF,Yes,-14.0,48.5,Under,-11.5,37.0,2023,False,17,FedEx Field,False,,,,WAS,SF,"Landover, MD",FedEx Field,1997.0,outdoor,moderate,79000,Grass,38.907778,-76.864444


In [87]:
def spread_cover(row):
    # Determine the non-favorite team
    if row['team_favorite_id'] == row['team_home']:
        row['team_not_favorite_id'] = row['team_away_id']
    else:
        row['team_not_favorite_id'] = row['team_home_id']

    # Determine if the favorite won or not
    if row['favorite_won'] == 'No':
        row['spread_cover'] = row['team_not_favorite_id']
        row['favorite_covered'] = False  # Favorite did not cover
    else:
        if row['team_favorite_id'] == row['team_home']:
            if row['score_home'] + row['spread_favorite'] > row['score_away']:
                row['spread_cover'] = row['team_home']
                row['favorite_covered'] = True  # Favorite covered
            else:
                row['spread_cover'] = row['team_away']
                row['favorite_covered'] = False  # Favorite did not cover
        else:
            if row['score_away'] + row['spread_favorite'] > row['score_home']:
                row['spread_cover'] = row['team_away_id']
                row['favorite_covered'] = True  # Favorite covered
            else:
                row['spread_cover'] = row['team_home_id']
                row['favorite_covered'] = False  # Favorite did not cover

    return row

# Apply the function to each row in the DataFrame
final_df = final_df.apply(spread_cover, axis=1)
final_df

Unnamed: 0,team_home,score_home,score_away,team_away,team_favorite_id,favorite_won,spread_favorite,over_under_line,over_under_result,over_under_delta,game_score_sum,schedule_season,schedule_playoff,schedule_week,stadium,stadium_neutral,weather_temperature,weather_wind_mph,weather_humidity,team_home_id,team_away_id,stadium_location,stadium_name,stadium_open,stadium_type,stadium_weather_type,stadium_capacity,stadium_surface,stadium_latitude,stadium_longitude,team_not_favorite_id,spread_cover,favorite_covered
0,New York Giants,31.0,18.0,Carolina Panthers,NYG,Yes,-6.0,41.0,Over,8.0,49.0,2010,False,1,MetLife Stadium,False,65.0,1.0,67.0,NYG,CAR,"East Rutherford, NJ",MetLife Stadium,2010.0,outdoor,cold,82500,FieldTurf,40.813528,-74.074361,NYG,NYG,False
1,New York Jets,9.0,10.0,Baltimore Ravens,NYJ,No,-1.0,36.5,Under,-17.5,19.0,2010,False,1,MetLife Stadium,False,73.0,1.0,100.0,NYJ,BAL,"East Rutherford, NJ",MetLife Stadium,2010.0,outdoor,cold,82500,FieldTurf,40.813528,-74.074361,NYJ,NYJ,False
2,New York Jets,28.0,14.0,New England Patriots,NE,No,-3.0,39.5,Over,2.5,42.0,2010,False,2,MetLife Stadium,False,82.0,6.0,36.0,NYJ,NE,"East Rutherford, NJ",MetLife Stadium,2010.0,outdoor,cold,82500,FieldTurf,40.813528,-74.074361,NYJ,NYJ,False
3,New York Giants,10.0,29.0,Tennessee Titans,NYG,No,-3.0,43.5,Under,-4.5,39.0,2010,False,3,MetLife Stadium,False,70.0,3.0,48.0,NYG,TEN,"East Rutherford, NJ",MetLife Stadium,2010.0,outdoor,cold,82500,FieldTurf,40.813528,-74.074361,NYG,NYG,False
4,New York Giants,17.0,3.0,Chicago Bears,NYG,Yes,-3.5,44.0,Under,-24.0,20.0,2010,False,4,MetLife Stadium,False,59.0,1.0,52.0,NYG,CHI,"East Rutherford, NJ",MetLife Stadium,2010.0,outdoor,cold,82500,FieldTurf,40.813528,-74.074361,NYG,NYG,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6496,Washington Commanders,31.0,38.0,Philadelphia Eagles,PHI,Yes,-7.0,43.0,Over,26.0,69.0,2023,False,8,FedEx Field,False,,,,WAS,PHI,"Landover, MD",FedEx Field,1997.0,outdoor,moderate,79000,Grass,38.907778,-76.864444,WAS,WAS,False
6497,Washington Commanders,19.0,31.0,New York Giants,WAS,No,-8.5,39.0,Over,11.0,50.0,2023,False,11,FedEx Field,False,,,,WAS,NYG,"Landover, MD",FedEx Field,1997.0,outdoor,moderate,79000,Grass,38.907778,-76.864444,WAS,WAS,False
6498,Washington Commanders,15.0,45.0,Miami Dolphins,MIA,Yes,-9.0,49.5,Over,10.5,60.0,2023,False,13,FedEx Field,False,,,,WAS,MIA,"Landover, MD",FedEx Field,1997.0,outdoor,moderate,79000,Grass,38.907778,-76.864444,WAS,MIA,True
6499,Washington Commanders,10.0,27.0,San Francisco 49ers,SF,Yes,-14.0,48.5,Under,-11.5,37.0,2023,False,17,FedEx Field,False,,,,WAS,SF,"Landover, MD",FedEx Field,1997.0,outdoor,moderate,79000,Grass,38.907778,-76.864444,WAS,SF,True


In [91]:
new_order = [
    'team_home', 'score_home' , 'score_away', 'team_away', 'team_home_id', 'team_away_id', 'team_favorite_id', 'team_not_favorite_id',
    'spread_favorite', 'spread_cover', 'favorite_covered',
    'over_under_line', 'over_under_result', 'over_under_delta', 'game_score_sum',
    'schedule_season', 'schedule_week', 'schedule_playoff',
    'stadium', 'stadium_name', 'stadium_location', 'stadium_open', 'stadium_type', 'stadium_weather_type',
    'stadium_capacity', 'stadium_surface', 'stadium_latitude', 'stadium_longitude',
    'weather_temperature', 'weather_wind_mph', 'weather_humidity', 'stadium_neutral'
]

# Reorder the DataFrame columns
final_df = final_df[new_order]

# Print the reordered DataFrame
final_df.head()

Unnamed: 0,team_home,score_home,score_away,team_away,team_home_id,team_away_id,team_favorite_id,team_not_favorite_id,spread_favorite,spread_cover,favorite_covered,over_under_line,over_under_result,over_under_delta,game_score_sum,schedule_season,schedule_week,schedule_playoff,stadium,stadium_name,stadium_location,stadium_open,stadium_type,stadium_weather_type,stadium_capacity,stadium_surface,stadium_latitude,stadium_longitude,weather_temperature,weather_wind_mph,weather_humidity,stadium_neutral
0,New York Giants,31.0,18.0,Carolina Panthers,NYG,CAR,NYG,NYG,-6.0,NYG,False,41.0,Over,8.0,49.0,2010,1,False,MetLife Stadium,MetLife Stadium,"East Rutherford, NJ",2010.0,outdoor,cold,82500,FieldTurf,40.813528,-74.074361,65.0,1.0,67.0,False
1,New York Jets,9.0,10.0,Baltimore Ravens,NYJ,BAL,NYJ,NYJ,-1.0,NYJ,False,36.5,Under,-17.5,19.0,2010,1,False,MetLife Stadium,MetLife Stadium,"East Rutherford, NJ",2010.0,outdoor,cold,82500,FieldTurf,40.813528,-74.074361,73.0,1.0,100.0,False
2,New York Jets,28.0,14.0,New England Patriots,NYJ,NE,NE,NYJ,-3.0,NYJ,False,39.5,Over,2.5,42.0,2010,2,False,MetLife Stadium,MetLife Stadium,"East Rutherford, NJ",2010.0,outdoor,cold,82500,FieldTurf,40.813528,-74.074361,82.0,6.0,36.0,False
3,New York Giants,10.0,29.0,Tennessee Titans,NYG,TEN,NYG,NYG,-3.0,NYG,False,43.5,Under,-4.5,39.0,2010,3,False,MetLife Stadium,MetLife Stadium,"East Rutherford, NJ",2010.0,outdoor,cold,82500,FieldTurf,40.813528,-74.074361,70.0,3.0,48.0,False
4,New York Giants,17.0,3.0,Chicago Bears,NYG,CHI,NYG,NYG,-3.5,NYG,False,44.0,Under,-24.0,20.0,2010,4,False,MetLife Stadium,MetLife Stadium,"East Rutherford, NJ",2010.0,outdoor,cold,82500,FieldTurf,40.813528,-74.074361,59.0,1.0,52.0,False


## Visulaizations

In [24]:
#Walker is doing a bar chart of one team vs all the others comparing the over vs the under

In [25]:
#Other Ideas 

# How does weather affect the spread? the over/under? (see walker's graphs) (maybe visualize them differently or fitler them for more specific results?)
# Indoor vs Outdoor games
# Home team when underdog? Home team when favored? (checking to see if teams have a significant home field advantage)
# 

In [31]:
# Home team when underdog? Home team when favored? (checking to see if teams have a significant home field advantage)

