In [None]:
import pandas as pd
import requests
import time
import numpy as np
# Display settings: a limit of None lifts pandas' truncation, so frames render
# every column and every row.
pd.options.display.max_columns = None
pd.options.display.max_rows = None

## QB

### Pass Completions

In [None]:
# Request the per-game pass-completions page with the `split` query parameter left empty.
url = 'https://www.teamrankings.com/nfl/player-stat/passing-plays-completed?split=&rate=per-game'
# The timeout stops this cell from hanging indefinitely if the site never answers.
resp = requests.get(url, timeout=30)
# Sanity check: HTTP status code plus the first 50 characters of the body.
resp.status_code, resp.text[:50]

In [None]:
# Persist the raw response bytes. The context manager closes the handle even
# when the write raises, which the manual open()/close() pair did not guarantee.
# NOTE: the file carries an .xls extension but is read back with pd.read_html
# in the next cell, i.e. it is treated as an HTML page.
with open('../nfl_data/qb_completions.xls', 'wb') as output:
    output.write(resp.content)

In [None]:
# pd.read_html returns a list of DataFrames (one per table found); the first
# one is used as the completions table.
completion_tables = pd.read_html('../nfl_data/qb_completions.xls')
qb_completions = pd.DataFrame(completion_tables[0])
qb_completions

In [None]:
# Keep only rows whose position is quarterback; .copy() detaches the result
# from the unfiltered frame.
is_qb = qb_completions['Pos'] == 'QB'
qb_completions = qb_completions[is_qb].copy()

### Pass Completions @ Home

In [None]:
# Same statistic, requested with split=home.
url = 'https://www.teamrankings.com/nfl/player-stat/passing-plays-completed?split=home&rate=per-game'
# The timeout stops this cell from hanging indefinitely if the site never answers.
resp = requests.get(url, timeout=30)
# NOTE(review): this cell writes under './nfl_data/' while the overall and
# last-2-weeks downloads write under '../nfl_data/' — confirm which directory
# is intended.
with open('./nfl_data/qb_completions_home.xls', 'wb') as output:
    output.write(resp.content)

In [None]:
# Parse the saved home-split page, take its first table, and keep only QBs.
home_tables = pd.read_html('./nfl_data/qb_completions_home.xls')
qb_completions_home = pd.DataFrame(home_tables[0])
qb_completions_home = qb_completions_home[qb_completions_home['Pos'] == 'QB'].copy()
qb_completions_home

### Pass Completions last 2 wks

In [None]:
# Same statistic, requested with split=last_2_weeks.
url = 'https://www.teamrankings.com/nfl/player-stat/passing-plays-completed?split=last_2_weeks&rate=per-game'
# The timeout stops this cell from hanging indefinitely if the site never answers.
resp = requests.get(url, timeout=30)
# The context manager guarantees the handle is closed even if the write raises.
with open('../nfl_data/qb_completions_last2wks.xls', 'wb') as output:
    output.write(resp.content)

In [None]:
# Read back the last-2-weeks page saved by the previous cell. The earlier
# version re-read the *home* file here, so this frame silently duplicated the
# home-split numbers.
qb_completions_last2wks = pd.read_html('../nfl_data/qb_completions_last2wks.xls')
qb_completions_last2wks = pd.DataFrame(qb_completions_last2wks[0])  # first table on the page
# Keep only the quarterback rows.
qb_completions_last2wks = qb_completions_last2wks.loc[(qb_completions_last2wks['Pos'] == 'QB')].copy()
qb_completions_last2wks

### Scrape 8 QB categories, each with 5 sub-categories.  All on a per-game basis.

- Categories: Pass Completions, Pass Attempts, Passing Yards, Passing TD, INT, Longest Pass, QB Rating, 2-Pt Conversions

- Sub-categories: Home games, Division Games, L2 Weeks, L4 weeks, Vs Top 10

In [None]:
# Stat slugs used as the last path component of the player-stat URL.
stats = ['passing-plays-completed', 'passing-plays-attempted', 'passing-gross-yards', 'passing-touchdowns', 
        'passing-plays-intercepted', 'passing-longest-yards', 'qb-rating-nfl', 'passing-2pt-conversions-succeeded']

# Values passed as the `split` query parameter.
sub_cats = ['home', 'division', 'last_2_weeks','last_4_weeks', 'top_10_nfl']

# Paths of every saved page, in download order; later cells iterate over this list.
file_list = []

for stat in stats:
    for cat in sub_cats:
        url = f'https://www.teamrankings.com/nfl/player-stat/{stat}?split={cat}&rate=per-game'
        # Build the destination once so the write, the bookkeeping and the
        # read-back below cannot drift apart.
        file_path = f'../nfl_data/{stat}_{cat}.xls'
        # The timeout keeps one stalled request from blocking the whole scrape.
        resp = requests.get(url, timeout=30)
        # The context manager closes the handle even if the write raises.
        with open(file_path, 'wb') as output:
            output.write(resp.content)
        file_list.append(file_path)
        # Parse the page straight away and report the table shape as a sanity check.
        data = pd.read_html(file_path)
        df = pd.DataFrame(data[0]) 
        print((stat, cat), df.shape)

### Loop through all files and merge into one df

In [None]:
# Seed the merged frame with the first downloaded file
# ('../nfl_data/passing-plays-completed_home.xls') so that the remaining files
# have something to merge onto.
first_file = file_list[0]
df_qb = pd.read_html(first_file)
df_qb = pd.DataFrame(df_qb[0])
df_qb = df_qb.loc[df_qb['Pos'] == 'QB'].copy()  # filter out non-QBs
df_qb = df_qb.drop(columns='Rank')
# The value column is named after the file: directory and extension stripped.
# Splitting on separators replaces the old [12:-4] slice, which only worked
# while the directory prefix was exactly 12 characters long.
col_name = first_file.rsplit('/', 1)[-1].rsplit('.', 1)[0]
df_qb = df_qb.rename(columns={'Value': col_name})
df_qb.head(2)

In [None]:
# Fold every remaining saved table into df_qb, adding one value column per file.
for file in file_list[1:]:
    data = pd.read_html(file)
    df = pd.DataFrame(data[0])
    # Keep quarterbacks only and discard the per-file ranking column.
    df = df.loc[df['Pos'] == 'QB'].drop(columns='Rank')
    # Column name = file name without directory and extension. Splitting on
    # separators replaces the old [12:-4] slice, which depended on the exact
    # length of the directory prefix.
    col_name = file.rsplit('/', 1)[-1].rsplit('.', 1)[0]
    df = df.rename(columns={'Value': col_name})
    # Outer join so a player missing from one split is kept (with NaN there).
    df_qb = pd.merge(df_qb, df, on = ['Player', 'Team', 'Pos'], how = 'outer')
    print(f'The shape of the merged df is {df_qb.shape}')

In [None]:
df_qb.shape

In [None]:
df_qb.head(3)

In [None]:
# Player-name -> team lookup built from the merged frame; if a player name
# occurs more than once, the last row wins.
player_team = df_qb.set_index('Player')['Team'].to_dict()
player_team

### Convert all numerical columns to int or float

In [None]:
df_qb.dtypes

In [None]:
# Value columns that must be cast to float before the merge with the
# football-reference table. One row per stat, covering all five splits.
# 'passing-plays-attempted_home' was absent from the original list even though
# every other split of that stat was present; it is included here.
# NOTE(review): the 'passing-longest-yards_*' and 'qb-rating-nfl_*' columns are
# not listed — confirm they already arrive as numeric.
numcols_to_change = [
    'passing-plays-completed_home', 'passing-plays-completed_division', 'passing-plays-completed_last_2_weeks',
    'passing-plays-completed_last_4_weeks', 'passing-plays-completed_top_10_nfl',
    'passing-plays-attempted_home', 'passing-plays-attempted_division', 'passing-plays-attempted_last_2_weeks',
    'passing-plays-attempted_last_4_weeks', 'passing-plays-attempted_top_10_nfl',
    'passing-gross-yards_home', 'passing-gross-yards_division', 'passing-gross-yards_last_2_weeks',
    'passing-gross-yards_last_4_weeks', 'passing-gross-yards_top_10_nfl',
    'passing-touchdowns_home', 'passing-touchdowns_division', 'passing-touchdowns_last_2_weeks',
    'passing-touchdowns_last_4_weeks', 'passing-touchdowns_top_10_nfl',
    'passing-plays-intercepted_home', 'passing-plays-intercepted_division', 'passing-plays-intercepted_last_2_weeks',
    'passing-plays-intercepted_last_4_weeks', 'passing-plays-intercepted_top_10_nfl',
    'passing-2pt-conversions-succeeded_home', 'passing-2pt-conversions-succeeded_division',
    'passing-2pt-conversions-succeeded_last_2_weeks', 'passing-2pt-conversions-succeeded_last_4_weeks',
    'passing-2pt-conversions-succeeded_top_10_nfl',
]

In [None]:
def drop_rows(position):
    """Drop, in place, every row of ``position`` that contains a "--" cell.

    Parameters
    ----------
    position : pandas.DataFrame
        Frame to clean; it is mutated directly.

    Returns
    -------
    None

    Notes
    -----
    The earlier version looked the rows up in the global ``df_qb`` instead of
    the frame it was given, wrapped the work in a loop that returned on its
    first pass, and called ``.any(1)`` positionally, which pandas 2.x rejects.
    """
    # Index labels of the rows where at least one cell equals the placeholder.
    placeholder_rows = position.index[position.eq("--").any(axis=1)]
    position.drop(index=placeholder_rows, inplace=True)
# Remove the rows holding a "--" placeholder from the merged QB frame (df_qb is mutated in place).
drop_rows(df_qb)

In [None]:
# Cast each listed column to float, collecting the ones that still need cleaning.
numcols_to_change2 = []
for col in numcols_to_change:
    try:
        df_qb[col] = df_qb[col].astype(float)
    # Catch only what a missing column or an unparsable value can raise; the
    # previous bare `except:` also swallowed KeyboardInterrupt and real bugs.
    except (KeyError, TypeError, ValueError):
        numcols_to_change2.append(col)
        print(f'need to clean column: {col}')
    else:
        print('success!')

In [None]:
# Replace the remaining missing values (e.g. gaps left by the outer merges) with 0.
df_qb = df_qb.fillna(0)

In [None]:
# Remove suffixes by keeping at most the first two whitespace-separated tokens
# of each name. Slicing the split list (instead of indexing [0] and [1]) means
# a single-token name is kept as-is rather than raising IndexError.
df_qb['Player'] = df_qb['Player'].map(lambda name: ' '.join(name.split()[:2]))

### bring in football reference aggregate data

In [None]:
# Load the football reference passing table: first table in the saved page,
# with missing cells replaced by 0.
fr_tables = pd.read_html('./nfl_data/2021_passing_stats.xls')
fr_qb = pd.DataFrame(fr_tables[0]).fillna(0)
# Strip the '*' / '+' markers attached to player names.
fr_qb['Player'] = fr_qb['Player'].map(
    lambda name: name.lstrip('*').rstrip('*').rstrip('+').rstrip('*')
)
# Discard the columns that are not carried into the merged data set.
fr_qb = fr_qb.drop(columns=['Rk', 'Age', 'QBrec', '4QC', 'GWD', 'G', 'GS'])
fr_qb.head(15)

In [None]:
fr_qb.shape

In [None]:
def drop_rows(df):
    """Drop, in place, the rows of ``df`` that repeat the column headers.

    A row is removed when any of its cells equals the name of the column it
    sits in.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to clean; it is mutated directly.

    Returns
    -------
    None

    Notes
    -----
    This redefines the ``drop_rows`` helper from the earlier "--" cleanup cell
    and replaces it from this cell onward. The previous body returned inside
    its loop, so only the first column was ever inspected.
    """
    repeated_header = pd.Series(False, index=df.index)
    # Positional access keeps this safe even if two columns share a label.
    for position, header in enumerate(df.columns):
        repeated_header |= df.iloc[:, position] == header
    df.drop(index=df.index[repeated_header], inplace=True)
# Remove the repeated header rows before casting; their string cells would
# make the numeric conversions below fail.
drop_rows(fr_qb)

# Columns cast to int and to float respectively. The duplicate 'Rate' entry
# from the original float list has been removed.
int_cols = ['Cmp', 'Att', 'Yds', 'TD', 'Int', '1D', 'Lng', 'Sk', 'Yds.1']
float_cols = ['Cmp%', 'TD%', 'Int%', 'Y/A', 'AY/A', 'Y/C', 'Y/G', 'Rate', 'QBR', 'NY/A', 'ANY/A', 'Sk%']

# One astype call with a column -> dtype mapping replaces the two per-column loops.
fr_qb = fr_qb.astype({**dict.fromkeys(int_cols, int), **dict.fromkeys(float_cols, float)})

In [None]:
fr_qb.shape, df_qb.shape

In [None]:
# Remove suffixes by keeping at most the first two whitespace-separated tokens
# of each name. Slicing the split list (instead of indexing [0] and [1]) means
# a single-token name is kept as-is rather than raising IndexError.
fr_qb['Player'] = fr_qb['Player'].map(lambda name: ' '.join(name.split()[:2]))

### merge teamrankings and football reference data

In [None]:
# Inner join on player name: only players present in both sources are kept.
# Both frames have a 'Pos' column, so the result carries 'Pos_x' (from fr_qb)
# and 'Pos_y' (from df_qb).
df_qb_all = fr_qb.merge(df_qb, how='inner', on='Player')
df_qb_all.shape

In [None]:
df_qb_all.head(15)

In [None]:
# Append "_Tot" to every column whose name is shorter than 18 characters,
# skipping the identifier columns. The per-split columns are named
# '<stat>_<split>' and are all at least 18 characters long, so they are left alone.
id_like = ('Player', 'Tm', 'Pos', 'Team')
x = {}
for col in df_qb_all.columns:
    if col in id_like or len(col) >= 18:
        continue
    x[col] = f'{col}_Tot'
df_qb_all = df_qb_all.rename(columns=x)

In [None]:
# Append "_Avg" to the long-named per-split columns. The threshold is
# inclusive (>= 18) so it complements the "< 18" rule used for the "_Tot"
# suffix: 'qb-rating-nfl_home' is exactly 18 characters long and was left
# without any suffix by the previous "> 18" test.
x = {i : f'{i}_Avg' for i in df_qb_all.columns if len(i) >= 18}
df_qb_all = df_qb_all.rename(columns = x)

In [None]:
# Keep a single position column: drop the copy that came from fr_qb and give
# the remaining one its plain name back.
df_qb_all = df_qb_all.drop(columns='Pos_x_Tot')
# DataFrame.rename returns a new frame; the result was previously discarded,
# so the column stayed 'Pos_y_Tot'.
df_qb_all = df_qb_all.rename(columns={'Pos_y_Tot': 'Pos'})
df_qb_all.head()

In [None]:
# Write the combined table for the modeling step, without the row index.
df_qb_all.to_csv('./modeling_data/passing.csv', index=False)