# make_dataset.ipynb: 
# Scrape Tables from pro-football-reference.com

In [7]:
import datetime
import re
import time
from io import StringIO

import pandas as pd
import requests
from bs4 import BeautifulSoup

In [8]:
#====================================================================================================================
# retrieves the tables and player ids from pro football reference via requests and beautiful soup
#
# url: url to retrieve data from
# sel_type: type of selector specifying location of table in html (ex. "id", "class")
# sel_name: name of the selector
#====================================================================================================================

def getdf(url, sel_type, sel_name):
    """Download a page and return one of its HTML tables plus the player IDs in it.

    Args:
        url: Address of the page to download.
        sel_type: HTML attribute used to locate the table (e.g. "id", "class").
        sel_name: Value that attribute must have on the wanted table.

    Returns:
        A ``(df, player_ids)`` tuple. ``df`` is the first matching table as
        parsed by ``pandas.read_html``. ``player_ids`` is the list of strings
        found between ``"/players/<one char>/`` and ``.htm"`` in that same
        table's markup, in document order.

    Raises:
        requests.HTTPError: The server answered with an error status
            (for example 404, or 429 when the site is throttling the client).
        requests.Timeout: The server did not respond within the timeout.
        ValueError: No table with the requested attribute exists on the page.
    """
    headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/109.0.0.0 Safari/537.36"}
    # Without a timeout requests waits indefinitely on a stalled connection.
    response = requests.get(url, headers=headers, timeout=30)
    # Fail here with the real HTTP error instead of a misleading parse error later.
    response.raise_for_status()
    soup = BeautifulSoup(response.text, "html.parser")

    # Use exactly one table for both the frame and the IDs so the two stay aligned.
    table = soup.find("table", {sel_type: sel_name})
    if table is None:
        raise ValueError(f"no <table> with {sel_type}={sel_name!r} found at {url}")
    table_html = str(table)

    # pandas deprecated passing literal HTML strings (2.1+); wrap in a file-like object.
    df = pd.read_html(StringIO(table_html))[0]

    # get player IDs from player individual page urls
    player_ids = re.findall(r'(?<=\"/players/./)(.*?)(?=\.htm")', table_html, re.IGNORECASE)

    return df, player_ids

#====================================================================================================================

# Try the scraper on a single season (2022) and preview the first rows.
stats, ids = getdf(
    url="https://www.pro-football-reference.com/years/2022/fantasy.htm",
    sel_type="id",
    sel_name="fantasy",
)

stats.head()

Unnamed: 0_level_0,Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,Games,Games,Passing,Passing,Passing,...,Scoring,Scoring,Scoring,Fantasy,Fantasy,Fantasy,Fantasy,Fantasy,Fantasy,Fantasy
Unnamed: 0_level_1,Rk,Player,Tm,FantPos,Age,G,GS,Cmp,Att,Yds,...,TD,2PM,2PP,FantPt,PPR,DKPt,FDPt,VBD,PosRank,OvRank
0,1,Patrick Mahomes*+,KAN,QB,27,17,17,435,648,5250,...,4,1.0,2.0,416,417.4,435.4,428.9,136,1,1
1,2,Josh Jacobs*+,LVR,RB,24,17,17,0,0,0,...,12,,,275,328.3,335.3,301.8,127,1,2
2,3,Christian McCaffrey*,2TM,RB,26,17,16,1,1,34,...,13,,,271,356.4,362.4,313.9,123,2,3
3,4,Derrick Henry *,TEN,RB,28,16,16,2,2,4,...,13,,,270,302.8,311.8,286.3,122,3,4
4,5,Justin Jefferson*+,MIN,WR,23,17,17,2,2,34,...,9,1.0,,241,368.7,371.7,304.7,119,1,5


In [9]:
# Flat column names assigned to each scraped table in place of its
# two-level header (gets rid of the first header row).
columns = [
    # player / identity
    'Rank', 'Player', 'Team', 'Pos', 'Age',
    # games
    'G', 'GS',
    # passing
    'passCmp', 'passAtt', 'passYds', 'passTD', 'passInt',
    # rushing
    'rushAtt', 'rushYds', 'rushYPA', 'rushTD',
    # receiving
    'Tgt', 'Rec', 'recYds', 'recYPR', 'recTD',
    # fumbles
    'Fmb', 'FL',
    # scoring
    'TD', '2PM', '2PP',
    # fantasy
    'FantPt', 'PPR', 'DKPt', 'FDPt', 'VBD', 'PosRank', 'OvRank',
]

In [10]:
#====================================================================================================================
# create csvs from the tables retrieved from pro football reference since 2002 (first year with 32 teams in the NFL)
#====================================================================================================================
# One request per season, from 2002 up to (not including) the current calendar year.
# Each season is written to its own CSV and also accumulated into sts_ALL.csv.
first_season = 2002
for yr in range(first_season, datetime.date.today().year):
    stats, ids = getdf(f"https://www.pro-football-reference.com/years/{yr}/fantasy.htm", "id", "fantasy")
    stats.columns = columns
    # Remove intermittent header rows in the middle of the data. Copy so the
    # column assignments below modify an independent frame rather than a
    # slice of the original (avoids pandas' SettingWithCopyWarning).
    stats = stats[stats['Rank'] != 'Rk'].copy()
    stats['ID'] = ids       # add IDs column; pandas raises if the lengths differ
    stats['Year'] = yr      # scalar is broadcast to every row

    # Start the combined file afresh on the first season so that re-running
    # this cell does not append a second copy of every season to it.
    is_first = yr == first_season
    stats.to_csv("../data/base/sts_ALL.csv", mode='w' if is_first else 'a', header=is_first)
    stats.to_csv(f"../data/base/sts_{yr}.csv")

    # NOTE(review): Sports Reference throttles/blocks clients that request
    # pages too quickly; pause between seasons to stay under the limit.
    # The exact allowed rate should be confirmed against their bot policy.
    time.sleep(4)
#====================================================================================================================

In [None]:
#====================================================================================================================
# create dynasty league csvs with adjusted point totals for PPR changes (0.5 RB, 1 WR, 1.5 TE)
#====================================================================================================================


#====================================================================================================================