In [1]:
import pandas as pd
import requests
import time
import numpy as np
# Lift pandas' display truncation so wide/long frames are shown in full.
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

### Scrape RB categories. All on a per-game basis.

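- Categories: rushing plays, net rushing yards, yards per attempt, rushing touchdowns, longest rush (yards), successful rushing 2-pt conversions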

- Sub-categories: Home games, Division games, Last 2 weeks, Last 4 weeks, Vs Top 10

In [2]:
# Stat pages to download from teamrankings.com (URL slugs).
stats = ['rushing-plays', 'rushing-net-yards', 'rushing-yards-per-attempt', 'rushing-touchdowns',
        'rushing-longest-yards', 'rushing-2pt-conversions-succeeded']

# Values passed as the `split` query parameter for each stat page.
sub_cats = ['home', 'division', 'last_2_weeks','last_4_weeks', 'top_10_nfl']

# Paths of every page saved below, in download order; later cells iterate over this list.
file_list = []

for stat in stats:
    for cat in sub_cats:
        url = f'https://www.teamrankings.com/nfl/player-stat/{stat}?split={cat}&rate=per-game'
        path = f'./nfl_data/{stat}_{cat}.xls'

        # A timeout keeps one stalled request from hanging the whole cell.
        resp = requests.get(url, timeout=30)
        # Fail loudly on 4xx/5xx instead of saving an error page as if it were data.
        resp.raise_for_status()

        # The response body is saved verbatim; `with` guarantees the handle is closed
        # even if the write fails.
        with open(path, 'wb') as output:
            output.write(resp.content)
        file_list.append(path)

        # Sanity check: parse the first table of the saved page and report its shape.
        df = pd.read_html(path)[0]
        print((stat, cat), df.shape)

('rushing-plays', 'home') (100, 5)
('rushing-plays', 'division') (100, 5)
('rushing-plays', 'last_2_weeks') (100, 5)
('rushing-plays', 'last_4_weeks') (100, 5)
('rushing-plays', 'top_10_nfl') (100, 5)
('rushing-net-yards', 'home') (100, 5)
('rushing-net-yards', 'division') (100, 5)
('rushing-net-yards', 'last_2_weeks') (100, 5)
('rushing-net-yards', 'last_4_weeks') (100, 5)
('rushing-net-yards', 'top_10_nfl') (100, 5)
('rushing-yards-per-attempt', 'home') (67, 5)
('rushing-yards-per-attempt', 'division') (88, 5)
('rushing-yards-per-attempt', 'last_2_weeks') (2, 5)
('rushing-yards-per-attempt', 'last_4_weeks') (11, 5)
('rushing-yards-per-attempt', 'top_10_nfl') (100, 5)
('rushing-touchdowns', 'home') (100, 5)
('rushing-touchdowns', 'division') (100, 5)
('rushing-touchdowns', 'last_2_weeks') (100, 5)
('rushing-touchdowns', 'last_4_weeks') (100, 5)
('rushing-touchdowns', 'top_10_nfl') (100, 5)
('rushing-longest-yards', 'home') (100, 5)
('rushing-longest-yards', 'division') (100, 5)
('rush

### Loop through all files and merge into one df

In [3]:
# Seed the merged frame with the first downloaded file so there is something to merge onto.
# The path comes from file_list so the data read and the column name always refer to the same file.
first_file = file_list[0]

# Column name = file name without its directory or extension (e.g. 'rushing-plays_home').
col_name = first_file.rsplit('/', 1)[-1].rsplit('.', 1)[0]

df_rb = (
    pd.read_html(first_file)[0]
    .drop(columns='Rank')
    .rename(columns={'Value': col_name})
)
#df_rb = df_rb.loc[(df_rb['Pos'] == 'RB')].copy() # filter out non-rbs if wanting to model for ONLY RBs (non-Flex)

In [4]:
df_rb.shape

(100, 4)

In [5]:
# Outer-merge every remaining stat file onto df_rb, one new value column per file.
for file in file_list[1:]:
    df = pd.read_html(file)[0].drop(columns='Rank')
    #df = df.loc[(df['Pos'] == 'RB')].copy() # filter out non-rbs if wanting to model for ONLY RBs (non-Flex)

    # Column name = file name without its directory or extension.
    col_name = file.rsplit('/', 1)[-1].rsplit('.', 1)[0]
    df = df.rename(columns={'Value': col_name})

    # Still merge on Player only (row matching is unchanged), but carry this file's
    # Team/Pos along under a temporary suffix instead of discarding them.
    df_rb = pd.merge(df_rb, df, on='Player', how='outer', suffixes=('', '_new'))

    # Players first seen in this file have no Team/Pos yet; fill those gaps from the
    # new file's values, then remove the temporary columns.
    for key in ('Team', 'Pos'):
        df_rb[key] = df_rb[key].combine_first(df_rb.pop(f'{key}_new'))

    print(f'The shape of the merged df is {df_rb.shape}')

The shape of the merged df is (114, 5)
The shape of the merged df is (209, 6)
The shape of the merged df is (272, 7)
The shape of the merged df is (285, 8)
The shape of the merged df is (288, 9)
The shape of the merged df is (291, 10)
The shape of the merged df is (292, 11)
The shape of the merged df is (297, 12)
The shape of the merged df is (301, 13)
The shape of the merged df is (301, 14)
The shape of the merged df is (301, 15)
The shape of the merged df is (301, 16)
The shape of the merged df is (301, 17)
The shape of the merged df is (307, 18)
The shape of the merged df is (318, 19)
The shape of the merged df is (331, 20)
The shape of the merged df is (331, 21)
The shape of the merged df is (339, 22)
The shape of the merged df is (350, 23)
The shape of the merged df is (357, 24)
The shape of the merged df is (365, 25)
The shape of the merged df is (365, 26)
The shape of the merged df is (365, 27)
The shape of the merged df is (372, 28)
The shape of the merged df is (452, 29)
The s

In [6]:
df_rb.shape

(590, 33)

In [7]:
# Try to cast every column to float; columns that cannot be cast are collected in
# numcols_to_change2 and reported so they can be cleaned.
numcols_to_change = df_rb.columns
numcols_to_change2 = []
for col in numcols_to_change:
    try:
        df_rb[col] = df_rb[col].astype(float)
    except (ValueError, TypeError):
        # Only conversion failures are expected here; anything else (including
        # KeyboardInterrupt) should propagate rather than be silently swallowed.
        numcols_to_change2.append(col)
        print(f'need to clean column: {col}')
    else:
        print('success!')

need to clean column: Player
need to clean column: Team
need to clean column: Pos
success!
success!
need to clean column: rushing-plays_last_2_weeks
need to clean column: rushing-plays_last_4_weeks
success!
success!
success!
need to clean column: rushing-net-yards_last_2_weeks
need to clean column: rushing-net-yards_last_4_weeks
success!
success!
success!
success!
success!
success!
success!
success!
need to clean column: rushing-touchdowns_last_2_weeks
need to clean column: rushing-touchdowns_last_4_weeks
need to clean column: rushing-touchdowns_top_10_nfl
success!
success!
success!
success!
success!
need to clean column: rushing-2pt-conversions-succeeded_home
need to clean column: rushing-2pt-conversions-succeeded_division
need to clean column: rushing-2pt-conversions-succeeded_last_2_weeks
need to clean column: rushing-2pt-conversions-succeeded_last_4_weeks
need to clean column: rushing-2pt-conversions-succeeded_top_10_nfl


In [8]:
def drop_rows(position):
    """Drop, in place, every row of `position` that holds the placeholder "--".

    A row is removed when any of its cells equals the string "--".

    Parameters
    ----------
    position : pandas.DataFrame
        Frame to clean. It is modified in place.

    Returns
    -------
    None
    """
    # Build the mask from the frame that was passed in (not from a notebook global),
    # and name the axis explicitly: pandas 2.x rejects a positional axis for .any().
    has_placeholder = position.eq("--").any(axis=1)
    position.drop(labels=position.index[has_placeholder.to_numpy()], axis=0, inplace=True)
# Remove, in place, the df_rb rows that contain the "--" placeholder.
drop_rows(df_rb)

In [9]:
df_rb.shape

(546, 33)

In [10]:
# Replace every remaining missing value, in every column, with 0 (in place).
df_rb.fillna(value=0, inplace=True)

In [11]:
df_rb.shape

(546, 33)

In [12]:
# Remove suffixes by keeping only the first two whitespace-separated tokens of each name.
# Slicing (rather than indexing [1]) leaves single-token names intact instead of raising IndexError.
df_rb['Player'] = df_rb['Player'].map(lambda name: ' '.join(name.split()[:2]))

In [13]:
df_rb.shape

(546, 33)

### Bring in football reference aggregate data

In [14]:
# Bring in football reference data: first table of the saved page, missing values
# set to 0, and the outer level of the two-level column header dropped.
fr_rb = (
    pd.read_html('./nfl_data/2021_rushing_stats.xls')[0]
    .fillna(0)
    .droplevel(0, axis=1)
)

# Strip the '*' / '+' marker characters around player names. Stripping the trailing
# set in one call also handles interleaved markers such as '+*+'.
fr_rb['Player'] = fr_rb['Player'].map(lambda name: name.lstrip('*').rstrip('*+'))

fr_rb = fr_rb.drop(columns=['Age', 'Rk'])
fr_rb.head(15)

Unnamed: 0,Player,Tm,Pos,G,GS,Att,Yds,TD,1D,Lng,Y/A,Y/G,Fmb
0,Jonathan Taylor,IND,RB,17,17,332,1811,18,107,83,5.5,106.5,4
1,Najee Harris,PIT,RB,17,17,307,1200,7,62,37,3.9,70.6,0
2,Joe Mixon,CIN,RB,16,16,292,1205,13,60,32,4.1,75.3,2
3,Antonio Gibson,WAS,RB,16,14,258,1037,7,65,27,4.0,64.8,6
4,Dalvin Cook,MIN,RB,13,13,249,1159,6,57,66,4.7,89.2,3
5,Alvin Kamara,NOR,rb,13,10,240,898,4,42,30,3.7,69.1,0
6,Ezekiel Elliott,DAL,RB,17,17,237,1002,10,55,47,4.2,58.9,1
7,Nick Chubb,CLE,RB,14,14,228,1259,8,61,70,5.5,89.9,2
8,David Montgomery,CHI,RB,13,13,225,849,7,55,41,3.8,65.3,1
9,Derrick Henry,TEN,rb,8,8,219,937,10,49,76,4.3,117.1,1


In [15]:
def drop_rows(df):
    """Drop, in place, rows that repeat the column header inside the table body.

    A row is removed when, in any column, its cell equals that column's own name.
    Note: this reuses the name of the earlier `drop_rows` defined in this notebook
    and therefore replaces it once this cell runs.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to clean. It is modified in place.

    Returns
    -------
    None
    """
    # Accumulate the mask over every column; returning from inside the loop would
    # inspect only the first column.
    is_header_row = np.zeros(len(df), dtype=bool)
    for header in df.columns:
        is_header_row |= (df[header] == header).to_numpy()
    df.drop(labels=df.index[is_header_row], axis=0, inplace=True)
# Clean fr_rb with drop_rows, then cast every column from the fourth onward to float.
drop_rows(fr_rb)

for stat_col in fr_rb.columns[3:]:
    fr_rb[stat_col] = fr_rb[stat_col].astype(float)

In [16]:
fr_rb.shape

(371, 13)

In [17]:
# Remove suffixes by keeping only the first two whitespace-separated tokens of each name.
# Slicing (rather than indexing [1]) leaves single-token names intact instead of raising IndexError.
fr_rb['Player'] = fr_rb['Player'].map(lambda name: ' '.join(name.split()[:2]))

### merge teamrankings and football reference data

In [18]:
# Outer-join the two sources on the player name so rows present in only one are retained.
df_rb_all = fr_rb.merge(df_rb, how='outer', on='Player')
df_rb_all.shape

(691, 45)

In [19]:
# Columns that are never given the '_Tot' suffix.
id_cols = ['Player', 'Tm', 'Pos_x', 'Pos_y', 'Team']

# Pass 1: names shorter than 18 characters (other than the id columns) get '_Tot'.
tot_names = {
    name: f'{name}_Tot'
    for name in df_rb_all.columns
    if len(name) < 18 and name not in id_cols
}
df_rb_all = (
    df_rb_all
    .rename(columns=tot_names)
    .rename(columns={'Pos_y': 'Pos'})  # keep the right-hand frame's position column
    .drop(columns='Pos_x')
)

# Pass 2: names of 18 characters or more (evaluated after pass 1) get '_Avg'.
avg_names = {name: f'{name}_Avg' for name in df_rb_all.columns if len(name) >= 18}
df_rb_all = (
    df_rb_all
    .rename(columns=avg_names)
    .rename(columns={'Tm': 'Tm_Abr'})
)

In [20]:
df_rb_all.shape

(691, 44)

In [21]:
# Write the merged frame to disk without the row index.
df_rb_all.to_csv('./modeling_data/rushing.csv', index=False)