In [14]:
import nfl_data_py as nfl
import pandas as pd
import requests
from bs4 import BeautifulSoup
import time

import warnings
warnings.filterwarnings('ignore')

Collect the Rookie Data for Current Rookies

In [10]:
# Load the combine data for the 2023 season only via nfl_data_py.
rookies2023 = nfl.import_combine_data([2023])
# Print (rows, columns) as a quick sanity check on what was loaded.
print(rookies2023.shape)
# Last expression of the cell: preview the first rows.
rookies2023.head()

(319, 18)


Unnamed: 0,season,draft_year,draft_team,draft_round,draft_ovr,pfr_id,cfb_id,player_name,pos,school,ht,wt,forty,bench,vertical,broad_jump,cone,shuttle
7680,2023,2023.0,New York Jets,5.0,143.0,AbanIs00,israel-abanikanda-1,Israel Abanikanda,RB,Pittsburgh,5-10,216.0,,,,,,
7681,2023,2023.0,Jacksonville Jaguars,5.0,136.0,AbduYa00,yasir-abdullah-1,Yasir Abdullah,LB,Louisville,6-1,237.0,4.47,,36.5,129.0,,
7682,2023,2023.0,Miami Dolphins,3.0,84.0,AchaDe00,devon-achane-1,Devon Achane,RB,Texas A&M,5-9,188.0,4.32,,33.0,,,
7683,2023,2023.0,Minnesota Vikings,1.0,23.0,AddiJo00,jordan-addison-1,Jordan Addison,WR,USC,5-11,173.0,4.49,,34.0,122.0,,
7684,2023,2023.0,Indianapolis Colts,4.0,110.0,AdebAd00,adetomiwa-adebawore-1,Adetomiwa Adebawore,DE,Northwestern,6-2,282.0,4.49,27.0,37.5,125.0,,


Collect the Rookie Data for Rookies from 2000-2022

In [11]:
# Load the combine data for seasons 2000 through 2022 (range end is exclusive).
rookies00to22 = nfl.import_combine_data(range(2000,2023))
# Print (rows, columns) as a quick sanity check on what was loaded.
print(rookies00to22.shape)
# Last expression of the cell: preview the first rows.
rookies00to22.head()

(7680, 18)


Unnamed: 0,season,draft_year,draft_team,draft_round,draft_ovr,pfr_id,cfb_id,player_name,pos,school,ht,wt,forty,bench,vertical,broad_jump,cone,shuttle
0,2000,2000.0,New York Jets,1.0,13.0,AbraJo00,,John Abraham,OLB,South Carolina,6-4,252.0,4.55,,,,,
1,2000,2000.0,Seattle Seahawks,1.0,19.0,AlexSh00,shaun-alexander-1,Shaun Alexander,RB,Alabama,6-0,218.0,4.58,,,,,
2,2000,2000.0,Kansas City Chiefs,6.0,188.0,AlfoDa20,,Darnell Alford,OT,Boston Col.,6-4,334.0,5.56,23.0,25.0,94.0,8.48,4.98
3,2000,,,,,,,Kyle Allamon,TE,Texas Tech,6-2,253.0,4.97,,29.0,104.0,7.29,4.49
4,2000,2000.0,Carolina Panthers,1.0,23.0,AndeRa21,,Rashard Anderson,CB,Jackson State,6-2,206.0,4.55,,34.0,123.0,7.18,4.15


Function to convert heights to inches

In [12]:
def parse_ht(ht):
    """Convert a height written as "feet-inches" (e.g. ``"6-1"``) to total inches.

    Parameters
    ----------
    ht : str or missing value
        Height string in the form ``"<feet>-<inches>"``. Anything that is not a
        non-blank string (``None``, float ``NaN``, ``""``) is treated as missing.

    Returns
    -------
    float
        ``12 * feet + inches``, or ``NaN`` when ``ht`` is missing.

    Raises
    ------
    ValueError, IndexError
        If ``ht`` is a non-blank string that is not in "feet-inches" form.
    """
    # Missing values show up as None or float NaN. NaN is truthy, so a caller-side
    # `if x` guard does not filter it out; handle it here instead of crashing on
    # `.split`.
    if not isinstance(ht, str) or not ht.strip():
        return float("nan")

    # format: 6-1
    parts = ht.split("-")
    feet = float(parts[0])
    inches = float(parts[1])
    return (12 * feet) + inches

Apply height function

In [13]:
# Convert only non-empty string heights. NaN (which is truthy, so `if x` would not
# catch it) and values that are already numeric pass through unchanged, which also
# makes this cell safe to re-run after the column has been converted.
rookies00to22["ht"] = rookies00to22["ht"].apply(
    lambda x: parse_ht(x) if isinstance(x, str) and x else x
)
rookies2023["ht"] = rookies2023["ht"].apply(
    lambda x: parse_ht(x) if isinstance(x, str) and x else x
)
rookies00to22.head()

Unnamed: 0,season,draft_year,draft_team,draft_round,draft_ovr,pfr_id,cfb_id,player_name,pos,school,ht,wt,forty,bench,vertical,broad_jump,cone,shuttle
0,2000,2000.0,New York Jets,1.0,13.0,AbraJo00,,John Abraham,OLB,South Carolina,76.0,252.0,4.55,,,,,
1,2000,2000.0,Seattle Seahawks,1.0,19.0,AlexSh00,shaun-alexander-1,Shaun Alexander,RB,Alabama,72.0,218.0,4.58,,,,,
2,2000,2000.0,Kansas City Chiefs,6.0,188.0,AlfoDa20,,Darnell Alford,OT,Boston Col.,76.0,334.0,5.56,23.0,25.0,94.0,8.48,4.98
3,2000,,,,,,,Kyle Allamon,TE,Texas Tech,74.0,253.0,4.97,,29.0,104.0,7.29,4.49
4,2000,2000.0,Carolina Panthers,1.0,23.0,AndeRa21,,Rashard Anderson,CB,Jackson State,74.0,206.0,4.55,,34.0,123.0,7.18,4.15


Scrape Pro Football Reference for all fantasy data from 2000-2022

https://stmorse.github.io/journal/pfr-scrape-python.html

In [None]:
url = 'https://www.pro-football-reference.com'
# Upper bound on the number of table rows processed per season (the row loop
# stops once its counter reaches this value).
maxp = 515

# One single-column frame per player-season; concatenated once after the loops.
frames = []
# (season, player name, error) for every player page that could not be scraped,
# so failures are visible instead of being silently dropped.
failed = []

for year in range(2000, 2023):
    # grab fantasy players
    r = requests.get(url + '/years/' + str(year) + '/fantasy.htm', timeout=30)
    # Stop with a clear HTTP error (e.g. when rate-limited) rather than failing
    # later on a page that contains no table.
    r.raise_for_status()
    soup = BeautifulSoup(r.content, 'html.parser')
    parsed_table = soup.find_all('table')[0]

    # first 2 rows are col headers
    for i, row in enumerate(parsed_table.find_all('tr')[2:]):
        if i % 10 == 0:
            print(i, end=' ')
        if i >= maxp:
            print('\nComplete.')
            break

        # Rows without a linked player cell or a fantasy position cannot be
        # scraped (presumably repeated header rows - verify). Skip them *before*
        # sleeping so they do not each cost a 5-second pause.
        dat = row.find('td', attrs={'data-stat': 'player'})
        pos_cell = row.find('td', attrs={'data-stat': 'fantasy_pos'})
        if dat is None or dat.a is None or pos_cell is None:
            continue
        href = dat.a.get('href')
        if not href:
            continue

        name = dat.a.get_text()
        pfr_id = dat.get('data-append-csv')
        pos = pos_cell.get_text()
        # Drop the last 4 characters of the link (the ".htm" suffix - assumed)
        # and point at the player's per-season fantasy page.
        stub = href[:-4] + '/fantasy/' + str(year)

        # Sports Reference has a rate limit of 20 requests per minute (I rounded up to 5 seconds per request) so this took a while to run
        time.sleep(5)
        try:
            # grab this player's stats
            tdf = pd.read_html(url + stub)[0]

            # get rid of MultiIndex, just keep last row
            tdf.columns = tdf.columns.get_level_values(-1)

            # Keep only the second-to-last column (drop all intermediate stats)
            # and drop the final "Total" row. Copy so the assignments below write
            # to an independent frame rather than a slice.
            tdf = tdf.iloc[:-1, [-2]].copy()

            # add other info
            tdf['Name'] = name
            tdf['PFR_ID'] = pfr_id
            tdf['Position'] = pos
            tdf['Season'] = year

            frames.append(tdf)
        except Exception as exc:
            # Best effort: one bad page must not abort the whole scrape. Catching
            # Exception (not a bare except) keeps the loop interruptible with
            # Ctrl-C / kernel interrupt.
            failed.append((year, name, repr(exc)))

if failed:
    print('\n' + str(len(failed)) + ' player pages could not be scraped; inspect `failed` for details.')

if not frames:
    raise ValueError('No fantasy data was scraped; nothing to write to fantasy00to22.csv.')

df = pd.concat(frames)
df.to_csv('fantasy00to22.csv')
# Last expression of the cell so the preview is actually displayed.
df.head()

Prep CFB IDs for scraping Sports Reference

In [17]:
# Build the lookup of college-football IDs for seasons 2000-2023: keep just the
# ID, position and player name, and discard players that have no cfb_id.
cfb_ids = (
    nfl.import_combine_data(range(2000, 2024))
    .loc[:, ["cfb_id", "pos", "player_name"]]
    .dropna(subset=["cfb_id"])
)
print(cfb_ids.shape)
cfb_ids.head()

(6524, 3)


Unnamed: 0,cfb_id,pos,player_name
1,shaun-alexander-1,RB,Shaun Alexander
6,lavar-arrington-1,OLB,LaVar Arrington
10,john-baker-3,P,John Baker
18,anthony-becht-1,TE,Anthony Becht
27,tom-brady-1,QB,Tom Brady


Filter out relevant positions

In [18]:
# Split the ID table into one frame per position of interest (QB, RB, WR, TE),
# selecting rows whose "pos" value matches exactly.
cfb_qbs = cfb_ids.loc[cfb_ids["pos"].eq("QB")]
cfb_rbs = cfb_ids.loc[cfb_ids["pos"].eq("RB")]
cfb_wrs = cfb_ids.loc[cfb_ids["pos"].eq("WR")]
cfb_tes = cfb_ids.loc[cfb_ids["pos"].eq("TE")]

Scrape Sports Reference for college data. This has to be done one position at a time because the stat tables differ by position; the cell below is the example for TE.

In [None]:
url = 'https://www.sports-reference.com/cfb/players/'

# One game-log frame per player; concatenated once after the loop.
frames = []
# (cfb_id, player name, error) for every player page that could not be scraped,
# so failures are visible instead of being silently dropped.
failed = []

for index, row in cfb_tes.iterrows():
    # Lightweight progress indicator keyed on the frame's index label.
    if index % 10 == 0: print(index, end=' ')

    # Pause 5 seconds before every request to throttle the scrape.
    time.sleep(5)
    cfb_id, pos, name = row['cfb_id'], row['pos'], row['player_name']
    try:
        tdf = pd.read_html(url + cfb_id + '/gamelog/')[0]
        # Flatten a possible MultiIndex header down to its last level.
        tdf.columns = tdf.columns.get_level_values(-1)
        # Select the year, school and receiving columns by position. These
        # positions are specific to this (TE) table layout - verify before
        # reusing for another position. Copy so the assignments below write to
        # an independent frame rather than a slice.
        tdf = tdf.iloc[:, [1, 3, 7, 8, 9, 10]].copy()
        tdf.columns = ['Year', 'School', 'Rec', 'Rec_Yds', 'Rec_Avg', 'Rec_TD']
        # Rows whose Year is not numeric become NaN here and are dropped.
        tdf['Year'] = pd.to_numeric(tdf['Year'], errors='coerce')
        tdf = tdf.dropna(subset=['Year']).copy()
        # NOTE(review): this column is named 'pfr_id' but holds the cfb_id. The
        # name is kept as-is because consumers of the CSV may rely on it.
        tdf['pfr_id'] = cfb_id
        tdf['Position'] = pos
        tdf['Name'] = name

        frames.append(tdf)
    except Exception as exc:
        # Best effort: one bad page must not abort the whole scrape. Catching
        # Exception (not a bare except) keeps the loop interruptible with
        # Ctrl-C / kernel interrupt.
        failed.append((cfb_id, name, repr(exc)))

if failed:
    print('\n' + str(len(failed)) + ' player pages could not be scraped; inspect `failed` for details.')

if not frames:
    raise ValueError('No college data was scraped; nothing to write to collegeTE_stats.csv.')

df = pd.concat(frames)
df.to_csv('collegeTE_stats.csv')
# Last expression of the cell so the preview is actually displayed.
df.head()