In [1]:
import datetime as dt
import pandas as pd

In [2]:
# Franchise identifiers as they appear in pro-football-reference.com team URLs.
# The list order is preserved from the original and drives iteration order elsewhere.
teams = [
    'crd',  # Cardinals
    'atl',  # Falcons
    'rav',  # Ravens
    'buf',  # Bills
    'car',  # Panthers
    'chi',  # Bears
    'cin',  # Bengals
    'cle',  # Browns
    'dal',  # Cowboys
    'den',  # Broncos
    'det',  # Lions
    'gnb',  # Packers
    'htx',  # Texans
    'clt',  # Colts
    'jax',  # Jaguars
    'kan',  # Chiefs
    'sdg',  # Chargers
    'ram',  # Rams
    'mia',  # Dolphins
    'min',  # Vikings
    'nwe',  # Patriots
    'nor',  # Saints
    'nyg',  # Giants
    'nyj',  # Jets
    'rai',  # Raiders
    'phi',  # Eagles
    'pit',  # Steelers
    'sfo',  # 49ers
    'sea',  # Seahawks
    'tam',  # Buccaneers
    'oti',  # Titans
    'was',  # Washington
]
len(teams)

32

In [3]:
# Container for the scraped data, filled in as {year: {team: DataFrame}}.
year_dict = dict()

# Seasons to scrape. The range starts in 2002 because the Houston Texans did not
# exist in any prior form, so requesting earlier seasons raises an error later
# in the notebook. range() excludes its stop value, so the current calendar
# year itself is not requested.
current_season = dt.date.today().year
year_range = range(2002, current_season)

In [None]:
%%time
# general data ingestion for every year, every team

for year in year_range:
    temp_dict={}
    for team in teams:
        url = f'https://www.pro-football-reference.com/teams/{team}/{year}.htm'
        data = pd.read_html(url, header=1)
        temp_dict[team] = data[1]
        temp_dict[team]['year'] = year
        temp_dict[team]['team'] = team
    print(f'{year} complete')
    print('-------------')
    year_dict[year] = temp_dict

2002 complete
-------------
2003 complete
-------------
2004 complete
-------------
2005 complete
-------------
2006 complete
-------------
2007 complete
-------------
2008 complete
-------------
2009 complete
-------------
2010 complete
-------------
2011 complete
-------------
2012 complete
-------------
2013 complete
-------------
2014 complete
-------------
2015 complete
-------------
2016 complete
-------------


In [None]:
# Sanity check: confirm a frame can be looked up by year and team identifier
# by displaying the one stored under year 2002 / team 'chi'.
year_dict[2002]['chi']

In [None]:
# column cleaning dictionary
column_clean_dict =  {'Unnamed: 3':'Kickoff_time',
     'Unnamed: 5': 'W/L',
     'Unnamed: 8' : 'Home/Away',
     'Tm' : 'Team_Score',
     'Opp.1' : 'Oppo_score',
     '1stD' : '1D_gained',
     'TotYd' : 'Team_Total_Yards',
     'PassY' : 'Team_Pass_Yards',
     'RushY' : 'Team_Rush_Yards',
     'TO' : 'team_turnover',
     '1stD.1' : '1D_allowed',
     'TotYd.1' : 'oppo_total_yards',
     'PassY.1' : 'oppo_pass_yards',
     'RushY.1' : 'oppo_rush_yards',
     'TO.1' : 'oppo_turnover',
     'Offense' : 'offense_expected_points',
     'Defense' : 'defense_expected_points',
     'Sp. Tms' : 'sp_expected_points'
    }

In [None]:

# Cleans and standardizes every scraped frame stored in year_dict.
#
# Fixes relative to the previous version of this cell:
#   * 'OT' was cast to str before fillna, turning NaN into the string 'nan',
#     so the fill never happened. NaN is now filled with 0 first.
#   * dropna() ran before the turnover columns were filled, so every game with
#     a blank turnover cell was dropped and the later fillna calls were no-ops.
#     The fills now happen before dropna().
#   * Chained `df[col].method(inplace=True)` calls are replaced by explicit
#     assignment, which does not depend on the column being a view.
#   * The cell can be re-run: the helper returns a new frame and tolerates
#     input that has already been cleaned.

# Numeric week assigned to each playoff round, continuing after the regular season.
# NOTE(review): seasons from 2021 onward have a regular-season week 18, which
# collides with 'Wild Card' -> 18. Confirm how that should be handled.
_PLAYOFF_WEEK_NUMBERS = {'Wild Card': 18, 'Division': 19, 'Conf. Champ.': 20, 'SuperBowl': 21}


def _clean_team_season(raw_df):
    """Return a cleaned copy of one team-season frame.

    Parameters
    ----------
    raw_df : pandas.DataFrame
        Frame as stored in year_dict[year][team]; must contain the 'Week',
        'Opp', 'OT', 'Unnamed: 8' (or 'Home/Away'), 'TO' and 'TO.1' (or the
        renamed turnover) columns.

    Returns
    -------
    pandas.DataFrame
        New frame with renamed columns, separator and bye rows removed,
        placeholder values filled, remaining incomplete rows dropped and
        'Week' converted to int. The input frame is not modified.
    """
    cleaned = raw_df.rename(columns=column_clean_dict)
    # errors='ignore' keeps the helper usable on an already-cleaned frame.
    cleaned = cleaned.drop(columns='Unnamed: 4', errors='ignore')

    # Remove the blank row separating playoffs from the regular season, and bye weeks.
    is_game_row = (cleaned['Week'] != 'Playoffs') & (cleaned['Opp'] != 'Bye Week')
    cleaned = cleaned.loc[is_game_row].copy()

    # Overtime flag: blank -> 0, 'OT' -> 1.
    cleaned['OT'] = cleaned['OT'].fillna(0).replace({'OT': 1})
    # Venue: blank -> 'HOME', '@' -> 'AWAY'.
    cleaned['Home/Away'] = cleaned['Home/Away'].fillna('HOME').replace({'@': 'AWAY'})
    # A blank turnover cell is treated as zero turnovers. This must happen
    # before dropna(), otherwise those games are discarded.
    cleaned['team_turnover'] = cleaned['team_turnover'].fillna(0)
    cleaned['oppo_turnover'] = cleaned['oppo_turnover'].fillna(0)

    # Anything still missing a value after the fills above is dropped.
    cleaned = cleaned.dropna()

    # Playoff round names become week numbers; to_numeric copes with the mix of
    # digit strings and the ints introduced by the replacement.
    week_labels = cleaned['Week'].astype(str).replace(_PLAYOFF_WEEK_NUMBERS)
    cleaned['Week'] = pd.to_numeric(week_labels).astype(int)
    return cleaned


for year in year_dict:
    for team in year_dict[year]:
        # Only values are replaced, so iterating the dict while assigning is safe.
        year_dict[year][team] = _clean_team_season(year_dict[year][team])

year_dict[2006]['chi'].dtypes

In [None]:
# Display the frame stored under year 2006 / team 'chi' for visual inspection.
year_dict[2006]['chi']

In [None]:
# Exports each team's data as an individual workbook in case I want to come back
# and work with a single team on a focused level.
#
# DataFrame.append was removed in pandas 2.0 and copied the whole frame on every
# call; all seasons are now collected and concatenated once. Row indexes are
# preserved, as before.
#
# NOTE: the output location is a machine-specific absolute path and must be
# changed to run this notebook anywhere else.
for team in teams:
    season_frames = [year_dict[year][team] for year in year_dict]
    # pd.concat raises on an empty list; fall back to an empty frame as before.
    full_team = pd.concat(season_frames) if season_frames else pd.DataFrame()
    full_team.to_excel(r'C:\Users\fitzp\Desktop\GitR\git_repos\personal_projects\Experimental Projects\nfl_model\data\{}.xlsx'.format(team), sheet_name=team)

In [None]:
# Combines every team-season frame into a single frame, ordered by year and
# then by the order of `teams`.
#
# DataFrame.append was removed in pandas 2.0 and copied the accumulated frame on
# every call; the frames are now gathered in a list and concatenated once.
# Row indexes are preserved, as before.
all_frames = [year_dict[year][team] for year in year_dict for team in teams]
# pd.concat raises on an empty list; fall back to an empty frame as before.
full_data = pd.concat(all_frames) if all_frames else pd.DataFrame()
full_data.info()

In [None]:
# Write the combined frame to 'full_data_set.xlsx'; the path is relative, so the
# file lands in the notebook's current working directory.
full_data.to_excel('full_data_set.xlsx')