In [1]:
import datetime as dt
import pandas as pd

In [2]:
# identifier for each team on pro-football-reference
teams = ['crd', # cardinals
         'atl', # falcons
         'rav', # ravens
         'buf', # bills
         'car', # panthers
         'chi', # bears
         'cin', # bengals
         'cle', # browns
         'dal', # cowboys
         'den', # denver
         'det', # lions
         'gnb', # packers
         'htx', # texans
         'clt', # colts
         'jax', # jaguars
         'kan', # chiefs
         'sdg', # chargers
         'ram', # rams
         'mia', # dolphins
         'min', # vikings
         'nwe', # patriots
         'nor', # saints
         'nyg', # giants
         'nyj', # jets
         'rai', # raiders
         'phi', # eagles
         'pit', # steelers
         'sfo', # 49ers
         'sea', # seahawks
         'tam', # buccs
         'oti', # titans
         'was' # football team
        ]
len(teams)

32

In [3]:
# year range starts in 2002 because the houston texans didn't exist in a prior form, Causing it to kick an error later in the program.
current_season = dt.date.today().year
year_range = range(2002, current_season)
year_dict = {}

In [4]:
%%time
# general data ingestion for every year, every team

for year in year_range:
    temp_dict={}
    for team in teams:
        url = f'https://www.pro-football-reference.com/teams/{team}/{year}.htm'
        data = pd.read_html(url, header=1)
        temp_dict[team] = data[1]
        temp_dict[team]['year'] = year
        temp_dict[team]['team'] = team
    print(f'{year} complete')
    print('-------------')
    year_dict[year] = temp_dict

2002 complete
-------------
2003 complete
-------------
2004 complete
-------------
2005 complete
-------------
2006 complete
-------------
2007 complete
-------------
2008 complete
-------------
2009 complete
-------------
2010 complete
-------------
2011 complete
-------------
2012 complete
-------------
2013 complete
-------------
2014 complete
-------------
2015 complete
-------------
2016 complete
-------------
2017 complete
-------------
2018 complete
-------------
2019 complete
-------------
Wall time: 4min 27s


In [5]:
# testing that I can access the dataframe
year_dict[2002]['rav']

Unnamed: 0,Week,Day,Date,Unnamed: 3,Unnamed: 4,Unnamed: 5,OT,Rec,Unnamed: 8,Opp,...,1stD.1,TotYd.1,PassY.1,RushY.1,TO.1,Offense,Defense,Sp. Tms,year,team
0,1,Sun,September 8,1:04PM ET,boxscore,L,,0-1,@,Carolina Panthers,...,15.0,265.0,120.0,145.0,,-6.68,4.38,-4.2,2002,rav
1,2,Sun,September 15,1:02PM ET,boxscore,L,,0-2,,Tampa Bay Buccaneers,...,17.0,279.0,205.0,74.0,,-25.3,0.76,-4.5,2002,rav
2,3,,,,,,,,,Bye Week,...,,,,,,,,,2002,rav
3,4,Mon,September 30,9:08PM ET,boxscore,W,,1-2,,Denver Broncos,...,30.0,403.0,306.0,97.0,3.0,-7.77,1.13,17.33,2002,rav
4,5,Sun,October 6,8:36PM ET,boxscore,W,,2-2,@,Cleveland Browns,...,21.0,433.0,371.0,62.0,5.0,7.54,3.26,-2.8,2002,rav
5,6,Sun,October 13,12:00PM ET,boxscore,L,,2-3,@,Indianapolis Colts,...,19.0,317.0,257.0,60.0,1.0,-5.8,2.91,1.83,2002,rav
6,7,Sun,October 20,1:02PM ET,boxscore,W,,3-3,,Jacksonville Jaguars,...,22.0,397.0,224.0,173.0,3.0,-7.13,9.84,2.43,2002,rav
7,8,Sun,October 27,1:02PM ET,boxscore,L,,3-4,,Pittsburgh Steelers,...,16.0,283.0,179.0,104.0,1.0,-3.51,-2.2,-10.09,2002,rav
8,9,Sun,November 3,1:05PM ET,boxscore,L,,3-5,@,Atlanta Falcons,...,14.0,241.0,115.0,126.0,1.0,-5.43,7.57,-4.5,2002,rav
9,10,Sun,November 10,1:02PM ET,boxscore,W,,4-5,,Cincinnati Bengals,...,20.0,372.0,249.0,123.0,4.0,7.67,6.17,-1.94,2002,rav


In [6]:
# column cleaning dictionary
column_clean_dict =  {
     'Day': 'Day_Week',
     'Unnamed: 3':'Kickoff_Time',
     'Unnamed: 5': 'W/L',
     'Unnamed: 8' : 'Home/Away',
     'Tm' : 'Team_Score',
     'Opp.1' : 'Oppo_score',
     '1stD' : '1D_gained',
     'TotYd' : 'Team_Total_Yards',
     'PassY' : 'Team_Pass_Yards',
     'RushY' : 'Team_Rush_Yards',
     'TO' : 'team_turnover',
     '1stD.1' : '1D_allowed',
     'TotYd.1' : 'oppo_total_yards',
     'PassY.1' : 'oppo_pass_yards',
     'RushY.1' : 'oppo_rush_yards',
     'TO.1' : 'oppo_turnover',
     'Offense' : 'offense_expected_points',
     'Defense' : 'defense_expected_points',
     'Sp. Tms' : 'sp_expected_points'
    }

# dictionary for mapping month into form for datetime transformation
month_dict = {
    'September' : 9,
    'October' : 10,
    'November' : 11,
    'December' : 12,
    'January' : 1,
    'February': 2
}

In [7]:

# cleans and standardizes each dataframe, adds datetime column
for year in year_dict:
    for team in year_dict[year]:
        iter_df = year_dict[year][team] #for my sanity.
        
        iter_df.rename(column_clean_dict, inplace=True, axis=1) # renames columns
        iter_df.drop('Unnamed: 4', axis=1, inplace=True) # drops an unused column
        
        iter_df.drop(iter_df[iter_df['Week'] == 'Playoffs'].index, inplace=True, axis=0) # drops blank row separating playoffs from regular season
        iter_df.drop(iter_df[iter_df['Opp'] == 'Bye Week'].index, inplace=True, axis=0) # drops bye weeks, will replace in future.
        iter_df.drop(iter_df[iter_df['Date']== 'Playoffs'].index, inplace=True, axis=0) # some rows have playoff marker in date
        
        iter_df['Week'].fillna(value=0, inplace=True)
        iter_df['Week'] = iter_df['Week'].astype(str)
        iter_df['Week'].replace({'Wild Card': 18, 'Division': 19, 'Conf. Champ.': 20, 'SuperBowl': 21}, inplace=True) # replaces str in 'Week' with playoff jgames like extension of regular season
        iter_df['Week'] = iter_df['Week'].astype(int) # turns back to int
        
        
        iter_df['OT'].fillna(value= 0, inplace=True) # replaces NaNs in the overtime column with 0
        iter_df['OT'] = iter_df['OT'].astype(str)
        iter_df['OT'].replace({'OT': 1}, inplace=True) # replaces 'OT' with 1 to signal overtime happened
        
        iter_df['W/L'].replace({'W' : 1, 'L': 0, 'T': 2}, inplace=True) # sets wins, losses, ties to numbers
        
        iter_df['Home/Away'].fillna(value='HOME', inplace=True) # fills in 'HOME' for home game
        iter_df['Home/Away'].replace({'@': 'AWAY', 'N':'Neutral'}, inplace=True) # fills in 'AWAY' for away game
        
        
        
        iter_df['team_turnover'].fillna(value=0, inplace=True) # fills 0s in for NaNs
        iter_df['oppo_turnover'].fillna(value=0, inplace=True) # fills 0s in for Nans
        
        iter_df['Kickoff_Time'].fillna(value='0', inplace=True)
        kick_time = []
        kick_split = []
        for i in iter_df['Kickoff_Time']:
            kick_time.append(i)
        for i in kick_time:
            kick_split.append(i.split()[0])
        iter_df['Kickoff_Time'] = kick_split
            
        
        month_list = []
        month_split = []
        for i in iter_df['Date']:
            month_list.append(i)
        for i in month_list:
            month_split.append(i.split()[0])
        iter_df['Month'] = month_split
        iter_df['Month'].replace(month_dict, inplace=True)

        day_split = []
        for i in month_list:
            day_split.append(i.split()[1])
        iter_df['Day'] = day_split
        iter_df['Day'] = iter_df['Day'].astype(int)
        
        year_list = []
        month_list = []
        day_list = []
        datetime_list = []

        for i in iter_df['year']:
            year_list.append(i)
        for i in iter_df['Month']:
            month_list.append(i)
        for i in iter_df['Day']:
            day_list.append(i)

        counter = 0
        for i in iter_df['year']:
            datetime_list.append(dt.datetime(year_list[counter], month_list[counter], day_list[counter]))
            counter += 1
        iter_df['dt_date'] = datetime_list
        
year_dict[2006]['chi'].dtypes

  res_values = method(rvalues)


Week                                int32
Day_Week                           object
Date                               object
Kickoff_Time                       object
W/L                                 int64
OT                                 object
Rec                                object
Home/Away                          object
Opp                                object
Team_Score                        float64
Oppo_score                        float64
1D_gained                         float64
Team_Total_Yards                  float64
Team_Pass_Yards                   float64
Team_Rush_Yards                   float64
team_turnover                     float64
1D_allowed                        float64
oppo_total_yards                  float64
oppo_pass_yards                   float64
oppo_rush_yards                   float64
oppo_turnover                     float64
offense_expected_points           float64
defense_expected_points           float64
sp_expected_points                

In [8]:
year_dict[2006]['chi']

Unnamed: 0,Week,Day_Week,Date,Kickoff_Time,W/L,OT,Rec,Home/Away,Opp,Team_Score,...,oppo_rush_yards,oppo_turnover,offense_expected_points,defense_expected_points,sp_expected_points,year,team,Month,Day,dt_date
0,1,Sun,September 10,4:15PM,1,0,1-0,AWAY,Green Bay Packers,26.0,...,103.0,3.0,0.02,12.18,11.2,2006,chi,9,10,2006-09-10
1,2,Sun,September 17,1:03PM,1,0,2-0,HOME,Detroit Lions,34.0,...,46.0,3.0,12.53,10.87,4.21,2006,chi,9,17,2006-09-17
2,3,Sun,September 24,1:05PM,1,0,3-0,AWAY,Minnesota Vikings,19.0,...,97.0,2.0,-8.81,11.64,1.04,2006,chi,9,24,2006-09-24
3,4,Sun,October 1,8:21PM,1,0,4-0,HOME,Seattle Seahawks,37.0,...,77.0,2.0,12.91,20.11,0.75,2006,chi,10,1,2006-10-01
4,5,Sun,October 8,1:03PM,1,0,5-0,HOME,Buffalo Bills,40.0,...,58.0,5.0,9.21,24.13,-0.82,2006,chi,10,8,2006-10-08
5,6,Mon,October 16,8:42PM,1,0,6-0,AWAY,Arizona Cardinals,24.0,...,66.0,2.0,-35.67,26.0,10.37,2006,chi,10,16,2006-10-16
7,8,Sun,October 29,1:03PM,1,0,7-0,HOME,San Francisco 49ers,41.0,...,127.0,5.0,14.41,12.76,7.68,2006,chi,10,29,2006-10-29
8,9,Sun,November 5,1:02PM,0,0,7-1,HOME,Miami Dolphins,13.0,...,161.0,2.0,-24.19,3.26,1.87,2006,chi,11,5,2006-11-05
9,10,Sun,November 12,8:21PM,1,0,8-1,AWAY,New York Giants,38.0,...,150.0,3.0,-4.12,13.39,5.95,2006,chi,11,12,2006-11-12
10,11,Sun,November 19,1:04PM,1,0,9-1,AWAY,New York Jets,10.0,...,108.0,2.0,-3.45,17.43,-0.7,2006,chi,11,19,2006-11-19


In [9]:
# exports each team's data as an individual sheet in case I want to come back and work with those on a focused level
for team in teams:
    full_team = pd.DataFrame()
    for year in year_dict:
        full_team = full_team.append(year_dict[year][team])
    full_team.to_excel(r'C:\Users\fitzp\Desktop\GitR\git_repos\personal_projects\Experimental Projects\nfl_model\data\{}.xlsx'.format(team), sheet_name = team)

In [10]:
# combines all data into a single frame
full_data = pd.DataFrame()
for year in year_dict:
    for team in teams:
        full_data = full_data.append(year_dict[year][team])
full_data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 9612 entries, 0 to 16
Data columns (total 29 columns):
 #   Column                   Non-Null Count  Dtype         
---  ------                   --------------  -----         
 0   Week                     9612 non-null   int32         
 1   Day_Week                 9612 non-null   object        
 2   Date                     9612 non-null   object        
 3   Kickoff_Time             9612 non-null   object        
 4   W/L                      9612 non-null   int64         
 5   OT                       9612 non-null   object        
 6   Rec                      9612 non-null   object        
 7   Home/Away                9612 non-null   object        
 8   Opp                      9612 non-null   object        
 9   Team_Score               9612 non-null   float64       
 10  Oppo_score               9612 non-null   float64       
 11  1D_gained                9612 non-null   float64       
 12  Team_Total_Yards         9612 non-nu

In [11]:
full_data.to_excel('full_data_set.xlsx', sheet_name = 'Full Data')

In [12]:
full_data[full_data['oppo_pass_yards'].isnull()]

Unnamed: 0,Week,Day_Week,Date,Kickoff_Time,W/L,OT,Rec,Home/Away,Opp,Team_Score,...,oppo_rush_yards,oppo_turnover,offense_expected_points,defense_expected_points,sp_expected_points,year,team,Month,Day,dt_date
16,17,Sun,December 28,4:15PM,1,0,4-12,HOME,Oakland Raiders,21.0,...,141.0,1.0,1.64,25.91,-16.75,2003,sdg,12,28,2003-12-28
16,17,Sun,January 3,8:31PM,1,0,9-7,HOME,Cincinnati Bengals,37.0,...,72.0,3.0,10.82,33.61,-6.11,2009,nyj,1,3,2009-01-03


In [13]:
full_data[full_data['Team_Pass_Yards'].isnull()]

Unnamed: 0,Week,Day_Week,Date,Kickoff_Time,W/L,OT,Rec,Home/Away,Opp,Team_Score,...,oppo_rush_yards,oppo_turnover,offense_expected_points,defense_expected_points,sp_expected_points,year,team,Month,Day,dt_date
16,17,Sun,December 28,4:15PM,0,0,4-12,AWAY,San Diego Chargers,14.0,...,263.0,0.0,-25.91,-1.64,16.75,2003,rai,12,28,2003-12-28
16,17,Sun,January 3,8:31PM,0,0,10-6,AWAY,New York Jets,0.0,...,257.0,0.0,-33.61,-10.82,6.11,2009,cin,1,3,2009-01-03
