# NFL winning Predication and Full Data Analysis


## **Contents in this Project**

1. Data Loading and Summary Checking
2. Data Cleaning
3. Feature Extraction
4. EDA and Data Visualisation
5. Best teams  Clusters since 2008 based on Performance
6. NFL Match Winning Prediction 🏆


In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.svm import SVC 
import seaborn as sns

import warnings
warnings.filterwarnings("ignore")
pd.set_option('display.max_columns',None)


## DataSet reading and understanding

In [2]:
Teams_Df = pd.read_csv('./Datasets/nfl_team_stats_2002-2022.csv')
Teams_Df.head()

Unnamed: 0,season,week,date,away,home,score_away,score_home,first_downs_away,first_downs_home,third_down_comp_away,third_down_att_away,third_down_comp_home,third_down_att_home,fourth_down_comp_away,fourth_down_att_away,fourth_down_comp_home,fourth_down_att_home,plays_away,plays_home,drives_away,drives_home,yards_away,yards_home,pass_comp_away,pass_att_away,pass_yards_away,pass_comp_home,pass_att_home,pass_yards_home,sacks_num_away,sacks_yards_away,sacks_num_home,sacks_yards_home,rush_att_away,rush_yards_away,rush_att_home,rush_yards_home,pen_num_away,pen_yards_away,pen_num_home,pen_yards_home,redzone_comp_away,redzone_att_away,redzone_comp_home,redzone_att_home,fumbles_away,fumbles_home,interceptions_away,interceptions_home,def_st_td_away,def_st_td_home,possession_away,possession_home
0,2002,1,9/5/2002,49ers,Giants,16,13,13,21,4,12,9,16,0,0,0,1,51,70,13,15,279,361,16,26,166,28,45,318,0,0,3,24,25,113,22,43,5,29,10,80,0,8,0,6,0,0,1,3,0,0,27.53,32.47
1,2002,1,9/8/2002,Jets,Bills,37,31,18,26,2,8,7,17,0,0,2,2,47,75,17,18,266,384,24,30,193,26,39,242,3,17,4,29,14,73,32,142,10,90,10,82,0,9,0,8,1,1,0,2,2,0,21.1,39.13
2,2002,1,9/8/2002,Vikings,Bears,23,27,19,20,5,13,7,13,0,0,0,0,62,60,17,17,368,368,16,28,228,20,33,288,1,6,1,9,33,140,26,80,8,52,4,33,0,7,0,6,1,1,2,1,0,0,31.52,28.48
3,2002,1,9/8/2002,Chargers,Bengals,34,6,27,13,6,10,4,11,0,0,0,0,65,48,11,15,401,203,15,19,160,18,31,167,1,0,4,31,45,241,13,36,4,39,9,57,0,7,0,5,0,0,0,1,0,0,37.8,22.2
4,2002,1,9/8/2002,Chiefs,Browns,40,39,24,24,5,11,4,11,0,0,0,0,59,61,20,17,470,411,20,29,276,28,40,352,0,0,1,7,30,194,20,59,9,87,4,38,0,10,0,10,0,1,1,0,0,0,30.27,29.73


In [3]:
Teams_Df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5644 entries, 0 to 5643
Data columns (total 53 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   season                 5644 non-null   int64  
 1   week                   5644 non-null   object 
 2   date                   5644 non-null   object 
 3   away                   5644 non-null   object 
 4   home                   5644 non-null   object 
 5   score_away             5644 non-null   int64  
 6   score_home             5644 non-null   int64  
 7   first_downs_away       5644 non-null   int64  
 8   first_downs_home       5644 non-null   int64  
 9   third_down_comp_away   5644 non-null   int64  
 10  third_down_att_away    5644 non-null   int64  
 11  third_down_comp_home   5644 non-null   int64  
 12  third_down_att_home    5644 non-null   int64  
 13  fourth_down_comp_away  5644 non-null   int64  
 14  fourth_down_att_away   5644 non-null   int64  
 15  four

In [4]:
games_Df = pd.read_csv('./Datasets/Historical_Game_Stats.csv')
games_Df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14073 entries, 0 to 14072
Data columns (total 17 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   schedule_date        14073 non-null  object 
 1   schedule_season      14073 non-null  int64  
 2   schedule_week        14073 non-null  object 
 3   schedule_playoff     14073 non-null  bool   
 4   team_home            14073 non-null  object 
 5   score_home           13801 non-null  float64
 6   score_away           13801 non-null  float64
 7   team_away            14073 non-null  object 
 8   team_favorite_id     11322 non-null  object 
 9   spread_favorite      11322 non-null  float64
 10  over_under_line      11312 non-null  object 
 11  stadium              14073 non-null  object 
 12  stadium_neutral      14073 non-null  bool   
 13  weather_temperature  12525 non-null  float64
 14  weather_wind_mph     12509 non-null  float64
 15  weather_humidity     8476 non-null  

In [5]:
#games_Df['schedule_date'] = pd.to_datetime(games_Df['schedule_date'],format='mixed')
#games_Df_2002 = games_Df[games_Df['schedule_date'] >= '2002/01/01']

games_Df = games_Df.dropna(subset=['schedule_date'])
games_Df['schedule_date'] = pd.to_datetime(games_Df['schedule_date'],format='mixed')
games_Df_2002 = games_Df[games_Df['schedule_date'] >= '2002/01/01']
games_Df_2002.head()
games_Df_2002.shape # 6227
games_Df_2002.info()


<class 'pandas.core.frame.DataFrame'>
Index: 6227 entries, 7846 to 14072
Data columns (total 17 columns):
 #   Column               Non-Null Count  Dtype         
---  ------               --------------  -----         
 0   schedule_date        6227 non-null   datetime64[ns]
 1   schedule_season      6227 non-null   int64         
 2   schedule_week        6227 non-null   object        
 3   schedule_playoff     6227 non-null   bool          
 4   team_home            6227 non-null   object        
 5   score_home           5955 non-null   float64       
 6   score_away           5955 non-null   float64       
 7   team_away            6227 non-null   object        
 8   team_favorite_id     5955 non-null   object        
 9   spread_favorite      5955 non-null   float64       
 10  over_under_line      5955 non-null   object        
 11  stadium              6227 non-null   object        
 12  stadium_neutral      6227 non-null   bool          
 13  weather_temperature  5087 non-null

In [6]:
games_Df_2002 = games_Df_2002.drop(columns=['weather_humidity','weather_detail','over_under_line','spread_favorite','score_away','score_home'])
games_Df_2002.columns

Index(['schedule_date', 'schedule_season', 'schedule_week', 'schedule_playoff',
       'team_home', 'team_away', 'team_favorite_id', 'stadium',
       'stadium_neutral', 'weather_temperature', 'weather_wind_mph'],
      dtype='object')

In [7]:
stadium_Df = pd.read_csv('./Datasets/nfl_stadiums.csv')
stadium_Df.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 120 entries, 0 to 119
Data columns (total 16 columns):
 #   Column                           Non-Null Count  Dtype  
---  ------                           --------------  -----  
 0   stadium_name                     120 non-null    object 
 1   stadium_location                 119 non-null    object 
 2   stadium_open                     90 non-null     float64
 3   stadium_close                    41 non-null     float64
 4   stadium_type                     109 non-null    object 
 5   stadium_address                  102 non-null    object 
 6   stadium_weather_station_zipcode  101 non-null    object 
 7   stadium_weather_type             117 non-null    object 
 8   stadium_capacity                 54 non-null     object 
 9   stadium_surface                  68 non-null     object 
 10  stadium_weather_station          62 non-null     object 
 11  stadium_weather_station_name     63 non-null     object 
 12  stadium_latitude      

In [8]:
stadium_Df = stadium_Df.drop(columns=['stadium_location','stadium_open','stadium_close','stadium_address','stadium_weather_station_zipcode',
                                      'stadium_capacity','stadium_weather_station','stadium_weather_station_name'])

In [9]:
stadium_Df['stadium_name'].unique()

array(['Acrisure Stadium', 'Alamo Dome', 'Allegiant Stadium',
       'Allianz Arena', 'Alltel Stadium', 'Alumni Stadium',
       'Anaheim Stadium', 'Arrowhead Stadium', 'AT&T Stadium',
       'Atlanta-Fulton County Stadium', 'Balboa Stadium',
       'Bank of America Stadium', 'Bills Stadium',
       'Busch Memorial Stadium', 'Caesars Superdome', 'Candlestick Park',
       'CenturyLink Field', 'Cinergy Field',
       'Cleveland Municipal Stadium', 'Cotton Bowl', 'Cowboys Stadium',
       'Dignity Health Sports Park', 'Dolphin Stadium',
       'Edward Jones Dome', 'Empower Field at Mile High',
       'Estadio Azteca', 'EverBank Field', 'FedEx Field', 'Fenway Park',
       'FirstEnergy Stadium', 'Ford Field', 'Foxboro Stadium',
       'Frankfurt Stadium', 'Franklin Field',
       'GEHA Field at Arrowhead Stadium', 'Georgia Dome',
       'Giants Stadium', 'Gillette Stadium', 'Hard Rock Stadium',
       'Harvard Stadium', 'Heinz Field', 'Highmark Stadium',
       "Houlihan's Stadium", 'Hous

# Data Clean up 

In [10]:
Teams_Df['win_away'] = Teams_Df['score_away'] > Teams_Df['score_home']
Teams_Df['win_home'] = Teams_Df['score_home'] > Teams_Df['score_away']


In [11]:
# Extract columns with "Away" and "Home" in their names
away_columns = [col for col in Teams_Df.columns if '_away' in col]
home_columns = [col for col in Teams_Df.columns if '_home' in col]

# Add common columns to both datasets
common_columns = ['season', 'week', 'date','away','home']

df_away = Teams_Df[common_columns + away_columns]
df_home = Teams_Df[common_columns + home_columns]


# Rename the columns to remove 'away' and 'home'
df_away.columns = common_columns + [col.replace('_away', '') for col in away_columns]
df_home.columns = common_columns + [col.replace('_home', '') for col in home_columns]

# Add a column to indicate team type
df_away['team_type'] = 'away'
df_home['team_type'] = 'home'

# Rename the 'away' and 'home' columns in each DataFrame to 'team' and 'opponent' accordingly
df_away = df_away.rename(columns={'away': 'team', 'home': 'opponent'})
df_home = df_home.rename(columns={'home': 'team', 'away': 'opponent'})

df_away['date'] = pd.to_datetime(df_away['date'],format='mixed')
df_home['date'] = pd.to_datetime(df_home['date'],format='mixed')
# Concatenate the datasets
#df_teams_All = pd.concat([df_away, df_home], ignore_index=True)

#df_teams_All.head()




In [12]:
#get all the look ups 
team_names_replacement_df = pd.read_excel('./Datasets/Team_Name_Lookup.xlsx')

team_names_replacement_df.head()

#replace the old team names with the new ones 
for index,row in team_names_replacement_df.iterrows(): 
   games_Df_2002['team_home'] = games_Df_2002['team_home'].str.replace(row['Historical Data'], row['Detailed Data'])
   games_Df_2002['team_away'] = games_Df_2002['team_away'].str.replace(row['Historical Data'], row['Detailed Data'])

print(" -------------------- ")
print(games_Df_2002['team_home'].unique())
print(" -------------------- ")
print(games_Df_2002['team_away'].unique())
print(" -------------------- ")


 -------------------- 
['Panthers' 'Bears' 'Lions' 'Colts' 'Dolphins' 'Saints' 'Giants' 'Raiders'
 'Steelers' 'Seahawks' 'Rams' 'Buccaneers' 'Titans' 'Commanders' 'Ravens'
 'Eagles' 'Packers' 'Patriots' 'Bills' 'Bengals' 'Browns' 'Broncos'
 'Texans' 'Jaguars' 'Falcons' 'Cowboys' 'Chiefs' 'Vikings' 'Jets'
 'Chargers' '49ers' 'Cardinals']
 -------------------- 
['Patriots' 'Jaguars' 'Cowboys' 'Broncos' 'Bills' '49ers' 'Packers' 'Jets'
 'Browns' 'Chiefs' 'Falcons' 'Eagles' 'Bengals' 'Cardinals' 'Vikings'
 'Buccaneers' 'Ravens' 'Raiders' 'Chargers' 'Rams' 'Colts' 'Lions'
 'Seahawks' 'Saints' 'Steelers' 'Bears' 'Titans' 'Dolphins' 'Texans'
 'Giants' 'Panthers' 'Commanders']
 -------------------- 


In [13]:

#merge the weather info with the away and home dataset and concate
merged_away_df = pd.merge(df_away, games_Df_2002, left_on=['date', 'team'], right_on=['schedule_date','team_away'], how='inner')

merged_home_df = pd.merge(df_home, games_Df_2002, left_on=['date', 'team'], right_on=['schedule_date','team_home'], how='inner')

df_teams_All = pd.concat([merged_away_df, merged_home_df], ignore_index=True)

df_teams_All.head()


Unnamed: 0,season,week,date,team,opponent,score,first_downs,third_down_comp,third_down_att,fourth_down_comp,fourth_down_att,plays,drives,yards,pass_comp,pass_att,pass_yards,sacks_num,sacks_yards,rush_att,rush_yards,pen_num,pen_yards,redzone_comp,redzone_att,fumbles,interceptions,def_st_td,possession,win,team_type,schedule_date,schedule_season,schedule_week,schedule_playoff,team_home,team_away,team_favorite_id,stadium,stadium_neutral,weather_temperature,weather_wind_mph
0,2002,1,2002-09-05,49ers,Giants,16,13,4,12,0,0,51,13,279,16,26,166,0,0,25,113,5,29,0,8,0,1,0,27.53,True,away,2002-09-05,2002,1,False,Giants,49ers,SF,Giants Stadium,False,75.0,12.0
1,2002,1,2002-09-08,Jets,Bills,37,18,2,8,0,0,47,17,266,24,30,193,3,17,14,73,10,90,0,9,1,0,2,21.1,True,away,2002-09-08,2002,1,False,Bills,Jets,NYJ,Ralph Wilson Stadium,False,75.0,7.0
2,2002,1,2002-09-08,Vikings,Bears,23,19,5,13,0,0,62,17,368,16,28,228,1,6,33,140,8,52,0,7,1,2,0,31.52,False,away,2002-09-08,2002,1,False,Bears,Vikings,CHI,Memorial Stadium (Champaign),False,76.0,5.0
3,2002,1,2002-09-08,Chargers,Bengals,34,27,6,10,0,0,65,11,401,15,19,160,1,0,45,241,4,39,0,7,0,0,0,37.8,True,away,2002-09-08,2002,1,False,Bengals,Chargers,CIN,Paul Brown Stadium,False,81.0,5.0
4,2002,1,2002-09-08,Chiefs,Browns,40,24,5,11,0,0,59,20,470,20,29,276,0,0,30,194,9,87,0,10,0,1,0,30.27,True,away,2002-09-08,2002,1,False,Browns,Chiefs,CLE,FirstEnergy Stadium,False,78.0,7.0


In [14]:
# merge the df_teams with the standium dataset

final_teams_df = pd.merge(df_teams_All, stadium_Df, left_on=['stadium'], right_on=['stadium_name'], how='inner')
final_teams_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11282 entries, 0 to 11281
Data columns (total 50 columns):
 #   Column                Non-Null Count  Dtype         
---  ------                --------------  -----         
 0   season                11282 non-null  int64         
 1   week                  11282 non-null  object        
 2   date                  11282 non-null  datetime64[ns]
 3   team                  11282 non-null  object        
 4   opponent              11282 non-null  object        
 5   score                 11282 non-null  int64         
 6   first_downs           11282 non-null  int64         
 7   third_down_comp       11282 non-null  int64         
 8   third_down_att        11282 non-null  int64         
 9   fourth_down_comp      11282 non-null  int64         
 10  fourth_down_att       11282 non-null  int64         
 11  plays                 11282 non-null  int64         
 12  drives                11282 non-null  int64         
 13  yards           

In [15]:
#drop unwatnted columns after reviwing dataset info
final_teams_df = final_teams_df.drop(columns=['stadium_azimuthangle','stadium_elevation'])

In [16]:
# replace the null stadium surface type rows with valid values 

final_teams_df.loc[(final_teams_df['stadium_surface'].isnull()) & (final_teams_df['stadium_type'] == 'indoor'),'stadium_surface'] = 'FieldTurf' 

final_teams_df.loc[(final_teams_df['stadium_surface'].isnull()) & (final_teams_df['stadium_type'] == 'outdoor'),'stadium_surface'] = 'Grass' 

final_teams_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11282 entries, 0 to 11281
Data columns (total 48 columns):
 #   Column                Non-Null Count  Dtype         
---  ------                --------------  -----         
 0   season                11282 non-null  int64         
 1   week                  11282 non-null  object        
 2   date                  11282 non-null  datetime64[ns]
 3   team                  11282 non-null  object        
 4   opponent              11282 non-null  object        
 5   score                 11282 non-null  int64         
 6   first_downs           11282 non-null  int64         
 7   third_down_comp       11282 non-null  int64         
 8   third_down_att        11282 non-null  int64         
 9   fourth_down_comp      11282 non-null  int64         
 10  fourth_down_att       11282 non-null  int64         
 11  plays                 11282 non-null  int64         
 12  drives                11282 non-null  int64         
 13  yards           

In [17]:
final_teams_clean =  final_teams_df.dropna()
final_teams_clean.head()

Unnamed: 0,season,week,date,team,opponent,score,first_downs,third_down_comp,third_down_att,fourth_down_comp,fourth_down_att,plays,drives,yards,pass_comp,pass_att,pass_yards,sacks_num,sacks_yards,rush_att,rush_yards,pen_num,pen_yards,redzone_comp,redzone_att,fumbles,interceptions,def_st_td,possession,win,team_type,schedule_date,schedule_season,schedule_week,schedule_playoff,team_home,team_away,team_favorite_id,stadium,stadium_neutral,weather_temperature,weather_wind_mph,stadium_name,stadium_type,stadium_weather_type,stadium_surface,stadium_latitude,stadium_longitude
0,2002,1,2002-09-05,49ers,Giants,16,13,4,12,0,0,51,13,279,16,26,166,0,0,25,113,5,29,0,8,0,1,0,27.53,True,away,2002-09-05,2002,1,False,Giants,49ers,SF,Giants Stadium,False,75.0,12.0,Giants Stadium,outdoor,cold,Grass,40.812222,-74.076944
1,2002,2,2002-09-15,Patriots,Jets,44,24,10,16,2,2,75,14,432,25,35,269,0,0,40,163,3,36,0,7,1,1,4,38.07,True,away,2002-09-15,2002,2,False,Jets,Patriots,NYJ,Giants Stadium,False,75.0,13.0,Giants Stadium,outdoor,cold,Grass,40.812222,-74.076944
2,2002,3,2002-09-22,Seahawks,Giants,6,7,4,12,0,0,45,14,145,16,25,105,2,13,18,40,6,37,0,3,0,1,0,23.17,False,away,2002-09-22,2002,3,False,Giants,Seahawks,NYG,Giants Stadium,False,76.0,12.0,Giants Stadium,outdoor,cold,Grass,40.812222,-74.076944
3,2002,5,2002-10-06,Chiefs,Jets,29,24,5,10,0,0,62,13,504,23,33,289,1,7,28,215,7,65,0,6,0,1,0,30.98,True,away,2002-10-06,2002,5,False,Jets,Chiefs,KC,Giants Stadium,False,63.0,13.0,Giants Stadium,outdoor,cold,Grass,40.812222,-74.076944
4,2002,6,2002-10-13,Falcons,Giants,17,15,8,13,0,0,51,10,313,19,25,246,2,11,24,67,3,20,0,4,0,0,0,29.97,True,away,2002-10-13,2002,6,False,Giants,Falcons,NYG,Giants Stadium,False,59.0,10.0,Giants Stadium,outdoor,cold,Grass,40.812222,-74.076944


In [18]:
final_teams_clean['win'].value_counts()

win
False    4641
True     4619
Name: count, dtype: int64

## Data Modeling

In [19]:
#final_teams_clean.loc[(final_teams_clean['week'].isnull()) & (final_teams_df['stadium_type'] == 'outdoor'),'stadium_surface'] = 'Grass' 
final_teams_clean['week'].replace({'Wildcard': 18, 'Division': 19, 'Conference': 20, 'Superbowl': 21}) 

0         1
1         2
2         3
3         5
4         6
         ..
11243    18
11252    19
11253    20
11262    19
11263    20
Name: week, Length: 9260, dtype: object

In [36]:
#Clean up and remove duplicate columns for dummies processing
drop_final = final_teams_clean.drop(['team_home', 'team_away', 'schedule_week', 'date', 'schedule_date'], axis=1)
drop_final.head()

Unnamed: 0,season,week,team,opponent,score,first_downs,third_down_comp,third_down_att,fourth_down_comp,fourth_down_att,plays,drives,yards,pass_comp,pass_att,pass_yards,sacks_num,sacks_yards,rush_att,rush_yards,pen_num,pen_yards,redzone_comp,redzone_att,fumbles,interceptions,def_st_td,possession,win,team_type,schedule_season,schedule_playoff,team_favorite_id,stadium,stadium_neutral,weather_temperature,weather_wind_mph,stadium_name,stadium_type,stadium_weather_type,stadium_surface,stadium_latitude,stadium_longitude
0,2002,1,49ers,Giants,16,13,4,12,0,0,51,13,279,16,26,166,0,0,25,113,5,29,0,8,0,1,0,27.53,True,away,2002,False,SF,Giants Stadium,False,75.0,12.0,Giants Stadium,outdoor,cold,Grass,40.812222,-74.076944
1,2002,2,Patriots,Jets,44,24,10,16,2,2,75,14,432,25,35,269,0,0,40,163,3,36,0,7,1,1,4,38.07,True,away,2002,False,NYJ,Giants Stadium,False,75.0,13.0,Giants Stadium,outdoor,cold,Grass,40.812222,-74.076944
2,2002,3,Seahawks,Giants,6,7,4,12,0,0,45,14,145,16,25,105,2,13,18,40,6,37,0,3,0,1,0,23.17,False,away,2002,False,NYG,Giants Stadium,False,76.0,12.0,Giants Stadium,outdoor,cold,Grass,40.812222,-74.076944
3,2002,5,Chiefs,Jets,29,24,5,10,0,0,62,13,504,23,33,289,1,7,28,215,7,65,0,6,0,1,0,30.98,True,away,2002,False,KC,Giants Stadium,False,63.0,13.0,Giants Stadium,outdoor,cold,Grass,40.812222,-74.076944
4,2002,6,Falcons,Giants,17,15,8,13,0,0,51,10,313,19,25,246,2,11,24,67,3,20,0,4,0,0,0,29.97,True,away,2002,False,NYG,Giants Stadium,False,59.0,10.0,Giants Stadium,outdoor,cold,Grass,40.812222,-74.076944


In [21]:
# Encode using pd.get_dummies()
final_teams_dummies = pd.get_dummies(drop_final)
final_teams_dummies.head()

Unnamed: 0,season,score,first_downs,third_down_comp,third_down_att,fourth_down_comp,fourth_down_att,plays,drives,yards,pass_comp,pass_att,pass_yards,sacks_num,sacks_yards,rush_att,rush_yards,pen_num,pen_yards,redzone_comp,redzone_att,fumbles,interceptions,def_st_td,possession,win,schedule_season,schedule_playoff,stadium_neutral,weather_temperature,weather_wind_mph,stadium_latitude,stadium_longitude,week_1,week_10,week_11,week_12,week_13,week_14,week_15,week_16,week_17,week_18,week_2,week_3,week_4,week_5,week_6,week_7,week_8,week_9,week_Conference,week_Division,week_Superbowl,week_Wildcard,team_49ers,team_Bears,team_Bengals,team_Bills,team_Broncos,team_Browns,team_Buccaneers,team_Cardinals,team_Chargers,team_Chiefs,team_Colts,team_Commanders,team_Cowboys,team_Dolphins,team_Eagles,team_Falcons,team_Giants,team_Jaguars,team_Jets,team_Lions,team_Packers,team_Panthers,team_Patriots,team_Raiders,team_Rams,team_Ravens,team_Saints,team_Seahawks,team_Steelers,team_Texans,team_Titans,team_Vikings,opponent_49ers,opponent_Bears,opponent_Bengals,opponent_Bills,opponent_Broncos,opponent_Browns,opponent_Buccaneers,opponent_Cardinals,opponent_Chargers,opponent_Chiefs,opponent_Colts,opponent_Commanders,opponent_Cowboys,opponent_Dolphins,opponent_Eagles,opponent_Falcons,opponent_Giants,opponent_Jaguars,opponent_Jets,opponent_Lions,opponent_Packers,opponent_Panthers,opponent_Patriots,opponent_Raiders,opponent_Rams,opponent_Ravens,opponent_Saints,opponent_Seahawks,opponent_Steelers,opponent_Texans,opponent_Titans,opponent_Vikings,team_type_away,team_type_home,team_favorite_id_ARI,team_favorite_id_ATL,team_favorite_id_BAL,team_favorite_id_BUF,team_favorite_id_CAR,team_favorite_id_CHI,team_favorite_id_CIN,team_favorite_id_CLE,team_favorite_id_DAL,team_favorite_id_DEN,team_favorite_id_DET,team_favorite_id_GB,team_favorite_id_HOU,team_favorite_id_IND,team_favorite_id_JAX,team_favorite_id_KC,team_favorite_id_LAC,team_favorite_id_LAR,team_favorite_id_LVR,team_favorite_id_MIA,team_favorite_id_MIN,team_favorite_id_NE,team_favorite_id_NO,team_favorite_id_NYG,team_favorite_id_NYJ,team_favorite_id_PHI,team_favorite_id_PICK,team_favorite_id_PIT,team_favorite_id_SEA,team_favorite_id_SF,team_favorite_id_TB,team_favorite_id_TEN,team_favorite_id_WAS,stadium_AT&T Stadium,stadium_Alamo Dome,stadium_Allegiant Stadium,stadium_Arrowhead Stadium,stadium_Bank of America Stadium,stadium_Candlestick Park,stadium_CenturyLink Field,stadium_Cowboys Stadium,stadium_Edward Jones Dome,stadium_Empower Field at Mile High,stadium_Estadio Azteca,stadium_EverBank Field,stadium_FedEx Field,stadium_FirstEnergy Stadium,stadium_Ford Field,stadium_GEHA Field at Arrowhead Stadium,stadium_Georgia Dome,stadium_Giants Stadium,stadium_Gillette Stadium,stadium_Hard Rock Stadium,stadium_Heinz Field,stadium_Highmark Stadium,stadium_LP Stadium,stadium_Lambeau Field,stadium_Levi's Stadium,stadium_Lincoln Financial Field,stadium_Los Angeles Memorial Coliseum,stadium_Lucas Oil Stadium,stadium_Lumen Field,stadium_M&T Bank Stadium,stadium_Memorial Stadium (Champaign),stadium_Mercedes-Benz Stadium,stadium_Mercedes-Benz Superdome,stadium_MetLife Stadium,stadium_NRG Stadium,stadium_New Era Field,stadium_Nissan Stadium,stadium_Oakland Coliseum,stadium_Paul Brown Stadium,stadium_Paycor Stadium,stadium_Qualcomm Stadium,stadium_RCA Dome,stadium_Ralph Wilson Stadium,stadium_Raymond James Stadium,stadium_Reliant Stadium,stadium_SoFi Stadium,stadium_Soldier Field,stadium_Sports Authority Field at Mile High,stadium_State Farm Stadium,stadium_StubHub Center,stadium_Sun Devil Stadium,stadium_Sun Life Stadium,stadium_TCF Bank Stadium,stadium_TIAA Bank Field,stadium_Texas Stadium,stadium_Tiger Stadium (LSU),stadium_Twickenham Stadium,stadium_U.S. Bank Stadium,stadium_University of Phoenix Stadium,stadium_Veterans Stadium,stadium_Wembley Stadium,stadium_name_AT&T Stadium,stadium_name_Alamo Dome,stadium_name_Allegiant Stadium,stadium_name_Arrowhead Stadium,stadium_name_Bank of America Stadium,stadium_name_Candlestick Park,stadium_name_CenturyLink Field,stadium_name_Cowboys Stadium,stadium_name_Edward Jones Dome,stadium_name_Empower Field at Mile High,stadium_name_Estadio Azteca,stadium_name_EverBank Field,stadium_name_FedEx Field,stadium_name_FirstEnergy Stadium,stadium_name_Ford Field,stadium_name_GEHA Field at Arrowhead Stadium,stadium_name_Georgia Dome,stadium_name_Giants Stadium,stadium_name_Gillette Stadium,stadium_name_Hard Rock Stadium,stadium_name_Heinz Field,stadium_name_Highmark Stadium,stadium_name_LP Stadium,stadium_name_Lambeau Field,stadium_name_Levi's Stadium,stadium_name_Lincoln Financial Field,stadium_name_Los Angeles Memorial Coliseum,stadium_name_Lucas Oil Stadium,stadium_name_Lumen Field,stadium_name_M&T Bank Stadium,stadium_name_Memorial Stadium (Champaign),stadium_name_Mercedes-Benz Stadium,stadium_name_Mercedes-Benz Superdome,stadium_name_MetLife Stadium,stadium_name_NRG Stadium,stadium_name_New Era Field,stadium_name_Nissan Stadium,stadium_name_Oakland Coliseum,stadium_name_Paul Brown Stadium,stadium_name_Paycor Stadium,stadium_name_Qualcomm Stadium,stadium_name_RCA Dome,stadium_name_Ralph Wilson Stadium,stadium_name_Raymond James Stadium,stadium_name_Reliant Stadium,stadium_name_SoFi Stadium,stadium_name_Soldier Field,stadium_name_Sports Authority Field at Mile High,stadium_name_State Farm Stadium,stadium_name_StubHub Center,stadium_name_Sun Devil Stadium,stadium_name_Sun Life Stadium,stadium_name_TCF Bank Stadium,stadium_name_TIAA Bank Field,stadium_name_Texas Stadium,stadium_name_Tiger Stadium (LSU),stadium_name_Twickenham Stadium,stadium_name_U.S. Bank Stadium,stadium_name_University of Phoenix Stadium,stadium_name_Veterans Stadium,stadium_name_Wembley Stadium,stadium_type_indoor,stadium_type_outdoor,stadium_type_retractable,stadium_weather_type_cold,stadium_weather_type_indoor,stadium_weather_type_moderate,stadium_weather_type_warm,stadium_surface_FieldTurf,stadium_surface_Grass,stadium_surface_Hellas Matrix Turf
0,2002,16,13,4,12,0,0,51,13,279,16,26,166,0,0,25,113,5,29,0,8,0,1,0,27.53,True,2002,False,False,75.0,12.0,40.812222,-74.076944,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,True,False,False,False,False,True,False
1,2002,44,24,10,16,2,2,75,14,432,25,35,269,0,0,40,163,3,36,0,7,1,1,4,38.07,True,2002,False,False,75.0,13.0,40.812222,-74.076944,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,True,False,False,False,False,True,False
2,2002,6,7,4,12,0,0,45,14,145,16,25,105,2,13,18,40,6,37,0,3,0,1,0,23.17,False,2002,False,False,76.0,12.0,40.812222,-74.076944,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,True,False,False,False,False,True,False
3,2002,29,24,5,10,0,0,62,13,504,23,33,289,1,7,28,215,7,65,0,6,0,1,0,30.98,True,2002,False,False,63.0,13.0,40.812222,-74.076944,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,True,False,False,False,False,True,False
4,2002,17,15,8,13,0,0,51,10,313,19,25,246,2,11,24,67,3,20,0,4,0,0,0,29.97,True,2002,False,False,59.0,10.0,40.812222,-74.076944,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,True,False,False,False,False,True,False


In [22]:
final_teams_dummies.dtypes

season                                int64
score                                 int64
first_downs                           int64
third_down_comp                       int64
third_down_att                        int64
                                      ...  
stadium_weather_type_moderate          bool
stadium_weather_type_warm              bool
stadium_surface_FieldTurf              bool
stadium_surface_Grass                  bool
stadium_surface_Hellas Matrix Turf     bool
Length: 286, dtype: object

In [34]:
# Split features and target
X = final_teams_dummies.drop(['win', 'score'], axis=1)
y = final_teams_dummies['win']

final_teams_dummies.columns.to_list()

['season',
 'score',
 'first_downs',
 'third_down_comp',
 'third_down_att',
 'fourth_down_comp',
 'fourth_down_att',
 'plays',
 'drives',
 'yards',
 'pass_comp',
 'pass_att',
 'pass_yards',
 'sacks_num',
 'sacks_yards',
 'rush_att',
 'rush_yards',
 'pen_num',
 'pen_yards',
 'redzone_comp',
 'redzone_att',
 'fumbles',
 'interceptions',
 'def_st_td',
 'possession',
 'win',
 'schedule_season',
 'schedule_playoff',
 'stadium_neutral',
 'weather_temperature',
 'weather_wind_mph',
 'stadium_latitude',
 'stadium_longitude',
 'week_1',
 'week_10',
 'week_11',
 'week_12',
 'week_13',
 'week_14',
 'week_15',
 'week_16',
 'week_17',
 'week_18',
 'week_2',
 'week_3',
 'week_4',
 'week_5',
 'week_6',
 'week_7',
 'week_8',
 'week_9',
 'week_Conference',
 'week_Division',
 'week_Superbowl',
 'week_Wildcard',
 'team_49ers',
 'team_Bears',
 'team_Bengals',
 'team_Bills',
 'team_Broncos',
 'team_Browns',
 'team_Buccaneers',
 'team_Cardinals',
 'team_Chargers',
 'team_Chiefs',
 'team_Colts',
 'team_Comma

In [24]:
# Split into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=1)


In [25]:
# Train the Random Forest model w/ n_estimators = 500
RF_Model = RandomForestClassifier(random_state=1, n_estimators=500)

# Fit the model
RF_Model.fit(X_train, y_train)


# Score the model
print(f'Training Score: {RF_Model.score(X_train, y_train)}')
print(f'Testing Score: {RF_Model.score(X_test, y_test)}')

Training Score: 1.0
Testing Score: 0.8095032397408207


In [26]:
# Train the Random Forest modelw/ n_estimators = 100
RF_Model = RandomForestClassifier(random_state=1, n_estimators=100)

# Fit the model
RF_Model.fit(X_train, y_train)


# Score the model
print(f'Training Score: {RF_Model.score(X_train, y_train)}')
print(f'Testing Score: {RF_Model.score(X_test, y_test)}')

Training Score: 1.0
Testing Score: 0.811231101511879


In [27]:
# Get the feature importance array
feature_importances = RF_Model.feature_importances_

In [28]:
importances_sorted = sorted(zip(feature_importances, X.columns), reverse=True)
importances_sorted[:20]

[(0.10756367812738386, 'rush_att'),
 (0.055239070592370945, 'possession'),
 (0.048519658448996354, 'pass_att'),
 (0.04661891227088974, 'rush_yards'),
 (0.04389010543997839, 'interceptions'),
 (0.037075956208523095, 'yards'),
 (0.036825754577031015, 'fourth_down_att'),
 (0.03320863751500432, 'sacks_yards'),
 (0.026828931688820376, 'sacks_num'),
 (0.02426647680701449, 'pass_yards'),
 (0.022895499022040692, 'first_downs'),
 (0.022511751853577126, 'redzone_att'),
 (0.02220179614609698, 'plays'),
 (0.02130019985438171, 'pass_comp'),
 (0.019782437394923198, 'def_st_td'),
 (0.019635453228432534, 'redzone_comp'),
 (0.019441472897932306, 'third_down_comp'),
 (0.018201679063221773, 'pen_yards'),
 (0.014943428894527443, 'drives'),
 (0.014867265570528225, 'weather_temperature')]

In [None]:
# Create the support vector machine classifier model
svm_model = SVC(kernel='poly')

# Fit the model to the training data
svm_model.fit(X_train, y_train)

# Validate the model by checking the model accuracy with model.score
print('Train Accuracy: %.3f' % svm_model.score(X_train, y_train))
print('Test Accuracy: %.3f' % svm_model.score(X_test, y_test))

In [None]:
# Create the support vector machine classifier model
svm_model = SVC(kernel='rbf')

# Fit the model to the training data
svm_model.fit(X_train, y_train)

# Validate the model by checking the model accuracy with model.score
print('Train Accuracy: %.3f' % svm_model.score(X_train, y_train))
print('Test Accuracy: %.3f' % svm_model.score(X_test, y_test))

In [None]:
# Create the support vector machine classifier model
svm_model = SVC(kernel='sigmoid')

# Fit the model to the training data
svm_model.fit(X_train, y_train)

# Validate the model by checking the model accuracy with model.score
print('Train Accuracy: %.3f' % svm_model.score(X_train, y_train))
print('Test Accuracy: %.3f' % svm_model.score(X_test, y_test))

## Final prediction Model with one-hotencolder and pipeline 

In [66]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

trf = ColumnTransformer([
    ('trf', OneHotEncoder(sparse=False,drop='first'),['team','week','opponent','team_favorite_id','stadium_name','stadium','stadium_type','stadium_weather_type','stadium_surface','team_type'])
],
remainder = 'passthrough')



In [67]:
from sklearn.model_selection import train_test_split

X2 = drop_final.drop('win', axis=1)
y2 = drop_final['win']
X2.shape, y2.shape

((9260, 42), (9260,))

In [68]:
X2_train, X2_test, y2_train, y2_test = train_test_split(X2, y2, test_size=0.01, random_state=42)

In [69]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline

In [70]:
pipe = Pipeline(steps=[
    ('step1',trf),
    ('step2',RandomForestClassifier())
])
pipe.fit(X2_train, y2_train)
y2_pred = pipe.predict(X2_test)

In [71]:
pipe.predict_proba(X2_test)

array([[0.63, 0.37],
       [0.67, 0.33],
       [0.07, 0.93],
       [0.24, 0.76],
       [0.67, 0.33],
       [0.75, 0.25],
       [0.62, 0.38],
       [0.56, 0.44],
       [0.93, 0.07],
       [0.48, 0.52],
       [0.5 , 0.5 ],
       [0.29, 0.71],
       [0.25, 0.75],
       [0.62, 0.38],
       [0.22, 0.78],
       [0.1 , 0.9 ],
       [0.16, 0.84],
       [0.34, 0.66],
       [0.15, 0.85],
       [0.93, 0.07],
       [0.98, 0.02],
       [0.66, 0.34],
       [0.64, 0.36],
       [0.84, 0.16],
       [0.18, 0.82],
       [0.99, 0.01],
       [0.98, 0.02],
       [0.79, 0.21],
       [0.82, 0.18],
       [0.37, 0.63],
       [0.4 , 0.6 ],
       [0.46, 0.54],
       [0.6 , 0.4 ],
       [0.23, 0.77],
       [0.7 , 0.3 ],
       [0.25, 0.75],
       [0.28, 0.72],
       [0.49, 0.51],
       [0.94, 0.06],
       [0.77, 0.23],
       [0.8 , 0.2 ],
       [0.35, 0.65],
       [0.28, 0.72],
       [0.22, 0.78],
       [0.88, 0.12],
       [0.76, 0.24],
       [0.64, 0.36],
       [0.57,

In [79]:
#Set up predic_proba

values = [['2023', '1', 'Chiefs', 'Lions', 'GEHA Field at Arrowhead Stadium',44,24,
           10,16,2,2,75,14,432,25,35,269,0,0,40,164,3,36,0,7,1,1,4,38.07,'away',2002,False,'NYJ','Giants Stadium',False,75.0,13.0,'Giants Stadium','outdoor','cold','Grass',40.81222,-74.0769 ]]
columns_2 = ['season', 'week', 'team', 'opponent', 'score', 'first_downs',
       'third_down_comp', 'third_down_att', 'fourth_down_comp',
       'fourth_down_att', 'plays', 'drives', 'yards', 'pass_comp', 'pass_att',
       'pass_yards', 'sacks_num', 'sacks_yards', 'rush_att', 'rush_yards',
       'pen_num', 'pen_yards', 'redzone_comp', 'redzone_att', 'fumbles',
       'interceptions', 'def_st_td', 'possession', 'team_type',
       'schedule_season', 'schedule_playoff', 'team_favorite_id', 'stadium',
       'stadium_neutral', 'weather_temperature', 'weather_wind_mph',
       'stadium_name', 'stadium_type', 'stadium_weather_type',
       'stadium_surface', 'stadium_latitude', 'stadium_longitude']

Predict2023 = pd.DataFrame(values, columns=columns_2)



In [80]:
pipe.predict_proba(Predict2023)

array([[0.14, 0.86]])