In [14]:
import requests
from bs4 import BeautifulSoup as bs
import pandas as pd
import numpy as np
from time import sleep
import re
import lxml.html as lx
from lxml import html
pd.options.mode.chained_assignment = None  # default='warn'

In [15]:
# Read one team code per line and trim surrounding spaces from each entry.
with open('pfr_nfl_teams.txt') as team_file:
    teams = [line.strip(' ') for line in team_file.read().splitlines()]
teams

['crd',
 'atl',
 'rav',
 'buf',
 'car',
 'chi',
 'cin',
 'cle',
 'dal',
 'den',
 'det',
 'gnb',
 'clt',
 'jax',
 'kan',
 'rai',
 'sdg',
 'ram',
 'mia',
 'min',
 'nwe',
 'nor',
 'nyg',
 'nyj',
 'phi',
 'pit',
 'sfo',
 'sea',
 'tam',
 'oti',
 'was']

In [16]:
def scrape_rosters(yearstart,yearend, teams):
    """Build one DataFrame of starters from locally saved roster pages.

    For every season in ``range(yearstart, yearend)`` and every code in
    ``teams``, this reads ``rosters/<team><year>.html``, parses the table
    whose HTML id is ``starters`` and tags its rows with ``Year`` and ``Tm``.

    Parameters
    ----------
    yearstart : int
        First season to load (inclusive).
    yearend : int
        Season to stop before (exclusive, because it is passed to ``range``).
    teams : list of str
        Team codes as they appear in the saved file names.

    Returns
    -------
    pandas.DataFrame
        The per-team, per-season tables concatenated in load order. Each
        table keeps its own row labels, so the resulting index is not unique.

    Raises
    ------
    FileNotFoundError
        If an expected roster page is missing on disk.
    ValueError
        If a page contains no ``starters`` table, or if there is nothing to
        concatenate (empty year range or empty ``teams``).
    """
    # Local import so the function does not depend on an import cell edit.
    from io import StringIO

    frames = []
    for year in range(yearstart, yearend):
        for team in teams:
            path = "{}/{}{}.html".format('rosters', team, year)
            with open(path) as f:
                page = f.read()

            soup = bs(page, 'html.parser')
            table = soup.find('table', id='starters')
            if table is None:
                # Fail with the offending file name instead of letting
                # read_html choke on the string "None".
                raise ValueError(
                    "No table with id 'starters' found in {}".format(path)
                )

            # read_html expects a path, URL or file-like object; passing raw
            # HTML text directly is deprecated in recent pandas releases.
            table_stats = pd.read_html(StringIO(str(table)))[0]
            table_stats['Year'] = year
            table_stats['Tm'] = team
            frames.append(table_stats)

    final_df = pd.concat(frames)
    return final_df

In [17]:
def clean_rosters(df,col):
    """Reduce a scraped roster table to its QB/RB/WR/TE/FB rows.

    Only these offensive positions are kept because defensive players are
    not analysed individually.

    Parameters
    ----------
    df : pandas.DataFrame
        Roster table containing ``col`` plus the ``Player``, ``Pos``,
        ``Year`` and ``Tm`` columns.
    col : str
        Name of the column holding the position label used for filtering.

    Returns
    -------
    pandas.DataFrame
        Matching rows with exactly the columns ``Player``, ``Pos``, ``Year``
        and ``Tm`` (the output columns are fixed regardless of ``col``).
        Row labels are carried over unchanged.
    """
    # Rows without a position label (e.g. section headers) go first.
    labelled = df.loc[df[col].notnull()]
    # Then keep only the positions of interest.
    wanted = labelled.loc[labelled[col].isin(['QB', 'RB', 'WR', 'TE', 'FB'])]
    return wanted.loc[:, ['Player', 'Pos', 'Year', 'Tm']]

In [18]:
# The end year is exclusive, so this loads the 2000 through 2021 seasons.
nfl_rosters = scrape_rosters(yearstart=2000, yearend=2022, teams=teams)

In [19]:
# Inspect the raw scrape: it still holds section-header rows and defensive starters.
nfl_rosters

Unnamed: 0,Pos,Player,Age,Yrs,GS,Summary of Player Stats,Drafted (tm/rnd/yr),Year,Tm
0,,Offensive Starters,,,,,,2000,crd
1,QB,Jake Plummer,26.0,3,14.0,"270 for 475, 2,946 yards, 13 td, 21 int, & 37 ...",,2000,crd
2,RB,Michael Pittman,25.0,2,12.0,"184 rushes for 719 yards, 4 td, & 73 catches f...",,2000,crd
3,FB,Joel Makovicka,25.0,1,10.0,"3 rushes for 8 yards, 0 td, & 6 catches for 18...",,2000,crd
4,WR,David Boston,22.0,1,16.0,"71 catches for 1,156 yards, 7 td, & 3 rushes f...",,2000,crd
...,...,...,...,...,...,...,...,...,...
19,RLB,Jonathan Bostic,30.0,8,4.0,,,2021,was
20,LCB,Kendall Fuller,26.0,5,16.0,"1.0 sacks, 1 interception, 0 fumbles recovered",,2021,was
21,RCB,William Jackson III,29.0,4,12.0,"0.0 sacks, 2 interceptions, 0 fumbles recovered",,2021,was
22,SS,Landon Collins,27.0,6,13.0,"3.0 sacks, 2 interceptions, 2 fumbles recovered",,2021,was


## Scrape rosters separately for the Houston Texans because they became a team in 2002

In [25]:
# Texans pages only exist from 2002; the end year is exclusive, so this stops at 2021.
htx_df = scrape_rosters(yearstart=2002, yearend=2022, teams=['htx'])

In [26]:
# Inspect the raw Texans scrape before filtering.
htx_df

Unnamed: 0,Pos,Player,Age,Yrs,GS,Summary of Player Stats,Drafted (tm/rnd/yr),Year,Tm
0,,Offensive Starters,,,,,,2002,htx
1,QB,David Carr,23.0,Rook,16.0,"233 for 444, 2,592 yards, 9 td, 15 int, & 59 r...",,2002,htx
2,RB,Jonathan Wells,23.0,Rook,11.0,"197 rushes for 529 yards, 3 td, & 9 catches fo...",,2002,htx
3,FB,Jarrod Baxter,23.0,Rook,10.0,"7 rushes for 14 yards, 0 td, & 5 catches for 3...",,2002,htx
4,WR,Corey Bradford,27.0,4,16.0,"45 catches for 697 yards, 6 td, & 2 rushes for...",,2002,htx
...,...,...,...,...,...,...,...,...,...
19,RLB,Zach Cunningham,27.0,4,7.0,,,2021,htx
20,LCB,Desmond King,27.0,4,12.0,"0.0 sacks, 3 interceptions, 0 fumbles recovered",,2021,htx
21,RCB,Terrance Mitchell,29.0,7,13.0,"0.0 sacks, 1 interception, 0 fumbles recovered",,2021,htx
22,SS,Justin Reid,24.0,3,13.0,"0.0 sacks, 2 interceptions, 0 fumbles recovered",,2021,htx


In [27]:
# Keep only the QB/RB/WR/TE/FB rows and the Player, Pos, Year, Tm columns.
htx_df = clean_rosters(df=htx_df, col='Pos')
htx_df

Unnamed: 0,Player,Pos,Year,Tm
1,David Carr,QB,2002,htx
2,Jonathan Wells,RB,2002,htx
3,Jarrod Baxter,FB,2002,htx
4,Corey Bradford,WR,2002,htx
5,Jabar Gaffney,WR,2002,htx
...,...,...,...,...
2,Mark Ingram,RB,2021,htx
3,Chris Conley,WR,2021,htx
4,Brandin Cooks,WR,2021,htx
5,Antony Auclair,TE,2021,htx


In [28]:
# Apply the same position filter and column selection to the league-wide table.
nfl_rosters = clean_rosters(df=nfl_rosters, col='Pos')
nfl_rosters

Unnamed: 0,Player,Pos,Year,Tm
1,Jake Plummer,QB,2000,crd
2,Michael Pittman,RB,2000,crd
3,Joel Makovicka,FB,2000,crd
4,David Boston,WR,2000,crd
5,Frank Sanders,WR,2000,crd
...,...,...,...,...
2,Mark Ingram,RB,2021,htx
3,Chris Conley,WR,2021,htx
4,Brandin Cooks,WR,2021,htx
5,Antony Auclair,TE,2021,htx


In [29]:
# Append the Texans rows to the league-wide table.
# Any Texans rows already present are dropped first, so re-running this cell
# does not append them a second time. Both the raw 'htx' code and the 'HOU'
# code it is later renamed to are matched, case-insensitively.
already_texans = nfl_rosters['Tm'].str.lower().isin(['htx', 'hou'])
nfl_rosters = pd.concat([nfl_rosters[~already_texans], htx_df])
nfl_rosters

Unnamed: 0,Player,Pos,Year,Tm
1,Jake Plummer,QB,2000,crd
2,Michael Pittman,RB,2000,crd
3,Joel Makovicka,FB,2000,crd
4,David Boston,WR,2000,crd
5,Frank Sanders,WR,2000,crd
...,...,...,...,...
2,Mark Ingram,RB,2021,htx
3,Chris Conley,WR,2021,htx
4,Brandin Cooks,WR,2021,htx
5,Antony Auclair,TE,2021,htx


In [30]:
# Strip commas, apostrophes, periods and the listed name suffixes, then drop
# every remaining character that is not an ASCII letter, digit or space.
# NOTE(review): removing 'Jr'/'Sr' leaves the space in front of it behind, and
# 'III '/'IV ' only match when followed by a space, so a suffix at the very
# end of a name is kept. Confirm this is what downstream joins expect.
pat = r'(\,|\'|\.|Jr|Sr|III |IV )'
nfl_rosters['Player'] = (
    nfl_rosters['Player']
    .replace(pat, '', regex=True)
    .replace('[^a-zA-Z0-9 ]', '', regex=True)
)

# Upper-case the team codes, then swap the codes below for their replacements.
# No replacement value is itself a key, so one pass gives the same result as
# applying the substitutions one after another.
team_code_map = {
    'SDG': 'LAC',
    'CRD': 'ARI',
    'HTX': 'HOU',
    'RAI': 'LVR',
    'RAM': 'LAR',
    'RAV': 'BAL',
    'CLT': 'IND',
    'OTI': 'TEN',
}
nfl_rosters['Tm'] = nfl_rosters['Tm'].str.upper().replace(team_code_map)
nfl_rosters

Unnamed: 0,Player,Pos,Year,Tm
1,Jake Plummer,QB,2000,ARI
2,Michael Pittman,RB,2000,ARI
3,Joel Makovicka,FB,2000,ARI
4,David Boston,WR,2000,ARI
5,Frank Sanders,WR,2000,ARI
...,...,...,...,...
2,Mark Ingram,RB,2021,HOU
3,Chris Conley,WR,2021,HOU
4,Brandin Cooks,WR,2021,HOU
5,Antony Auclair,TE,2021,HOU


In [13]:
# index=True is the default, spelled out here because the index is the
# non-unique per-table row label and ends up as the first CSV column.
nfl_rosters.to_csv('NFLRosters_since2000.csv', index=True)