In [1]:
import time
from io import StringIO
from pathlib import Path

import pandas as pd
import requests
from bs4 import BeautifulSoup as BS

In [14]:
def get_rec_data(year):
    """Scrape one season of college receiving stats and save it as a CSV.

    Downloads the Sports Reference CFB receiving page for ``year``, parses the
    table with id ``receiving``, replaces its header with flat column names,
    drops the header rows repeated inside the table body, trims everything
    from the first ``*`` onward in each player name, and writes the result to
    ``receiving/{year}.csv`` (the directory is created if missing).

    Parameters
    ----------
    year : int
        Season to download; interpolated into the URL and the file name.

    Returns
    -------
    str
        The literal ``"Data successfully saved"``.

    Raises
    ------
    ValueError
        If the page does not return a successful HTTP status, the table is
        not present in the response, or the table does not have the expected
        number of columns.
    """
    # NOTE(review): this is a global display option, kept for backward
    # compatibility; it would fit better in a notebook config cell.
    pd.set_option('display.max_columns', None)
    URL = f'https://www.sports-reference.com/cfb/years/{year}-receiving.html'
    # a timeout keeps a stalled connection from hanging the kernel indefinitely
    res = requests.get(URL, timeout=30)
    if not res.ok:
        # ValueError (not HTTPError) so callers catching ValueError keep working
        raise ValueError(f'{URL} returned HTTP {res.status_code}')
    soup = BS(res.content, 'html.parser')
    table = soup.find(class_="sortable stats_table", id='receiving')
    if table is None:
        raise ValueError(f"no 'receiving' table found at {URL}")
    # read_html expects a file-like object; literal HTML strings are deprecated
    df = pd.read_html(StringIO(str(table)))[0]
    df.columns = ['Rk', 'Player', 'School', 'Conf', 'G', 'Rec', 'RecYds', 'RecAvg', 'RecTD',
                  'RushAtt', 'RushYds', 'RushAvg', 'RushTD', 'Plays', 'TotalYds',
                  'YardsPerPlay', 'TotalTDs']
    # drop the header rows repeated in the table body; .copy() so the column
    # assignment below works on an independent frame (no SettingWithCopyWarning)
    df = df.loc[df['Conf'] != 'Conf'].copy()
    # keep only the text before the first '*' in each player name
    df['Player'] = df['Player'].str.split('*').str[0].str.strip()
    out_dir = Path('receiving')
    out_dir.mkdir(parents=True, exist_ok=True)
    df.to_csv(out_dir / f'{year}.csv', index=False)
    return "Data successfully saved"

In [15]:
# Scrape the 2022 receiving table and write it to receiving/2022.csv
get_rec_data(2022)

'Data successfully saved'

In [38]:
# Scrape receiving data for seasons 1993-2021 (2022 is fetched by its own cell).
# Sports Reference blocks clients that send too many requests per minute, so
# pause between requests instead of firing them back-to-back.
for x in range(1993, 2022):
    get_rec_data(x)
    time.sleep(4)  # stays under the published ~20 requests/minute limit

In [12]:
def get_rush_data(year):
    """Scrape one season of college rushing stats and save it as a CSV.

    Downloads the Sports Reference CFB rushing page for ``year``, parses the
    table with id ``rushing``, replaces its header with flat column names,
    drops the header rows repeated inside the table body, trims everything
    from the first ``*`` onward in each player name, and writes the result to
    ``rushing/{year}.csv`` (the directory is created if missing).

    Parameters
    ----------
    year : int
        Season to download; interpolated into the URL and the file name.

    Returns
    -------
    str
        The literal ``"Data successfully saved"``.

    Raises
    ------
    ValueError
        If the page does not return a successful HTTP status, the table is
        not present in the response, or the table does not have the expected
        number of columns.
    """
    # NOTE(review): this is a global display option, kept for backward
    # compatibility; it would fit better in a notebook config cell.
    pd.set_option('display.max_columns', None)
    URL = f'https://www.sports-reference.com/cfb/years/{year}-rushing.html'
    # a timeout keeps a stalled connection from hanging the kernel indefinitely
    res = requests.get(URL, timeout=30)
    if not res.ok:
        # ValueError (not HTTPError) so callers catching ValueError keep working
        raise ValueError(f'{URL} returned HTTP {res.status_code}')
    soup = BS(res.content, 'html.parser')
    table = soup.find(class_="sortable stats_table", id='rushing')
    if table is None:
        raise ValueError(f"no 'rushing' table found at {URL}")
    # read_html expects a file-like object; literal HTML strings are deprecated
    df = pd.read_html(StringIO(str(table)))[0]
    df.columns = ['Rk', 'Player', 'School', 'Conf', 'G', 'RushAtt', 'RushYds', 'RushAvg',
                  'RushTD', 'Rec', 'RecYds', 'RecAvg', 'RecTD', 'Plays', 'TotalYds',
                  'YardsPerPlay', 'TotalTDs']
    # drop the header rows repeated in the table body; .copy() so the column
    # assignment below works on an independent frame (no SettingWithCopyWarning)
    df = df.loc[df['Conf'] != 'Conf'].copy()
    # keep only the text before the first '*' in each player name
    df['Player'] = df['Player'].str.split('*').str[0].str.strip()
    out_dir = Path('rushing')
    out_dir.mkdir(parents=True, exist_ok=True)
    df.to_csv(out_dir / f'{year}.csv', index=False)
    return "Data successfully saved"

In [13]:
# Scrape the 2022 rushing table and write it to rushing/2022.csv
get_rush_data(2022)

'Data successfully saved'

In [13]:
# Scrape rushing data for seasons 1993-2021 (2022 is fetched by its own cell).
# Sports Reference blocks clients that send too many requests per minute, so
# pause between requests instead of firing them back-to-back.
for x in range(1993, 2022):
    get_rush_data(x)
    time.sleep(4)  # stays under the published ~20 requests/minute limit

In [10]:
def get_passing_data(year):
    """Scrape one season of college passing stats and save it as a CSV.

    Downloads the Sports Reference CFB passing page for ``year``, parses the
    table with id ``passing``, replaces its header with flat column names,
    drops the header rows repeated inside the table body, trims everything
    from the first ``*`` onward in each player name, and writes the result to
    ``passing/{year}.csv`` (the directory is created if missing).

    Parameters
    ----------
    year : int
        Season to download; interpolated into the URL and the file name.

    Returns
    -------
    str
        The literal ``"Data successfully saved"``.

    Raises
    ------
    ValueError
        If the page does not return a successful HTTP status, the table is
        not present in the response, or the table does not have the expected
        number of columns.
    """
    # NOTE(review): this is a global display option, kept for backward
    # compatibility; it would fit better in a notebook config cell.
    pd.set_option('display.max_columns', None)
    URL = f'https://www.sports-reference.com/cfb/years/{year}-passing.html'
    # a timeout keeps a stalled connection from hanging the kernel indefinitely
    res = requests.get(URL, timeout=30)
    if not res.ok:
        # ValueError (not HTTPError) so callers catching ValueError keep working
        raise ValueError(f'{URL} returned HTTP {res.status_code}')
    soup = BS(res.content, 'html.parser')
    table = soup.find(class_="sortable stats_table", id='passing')
    if table is None:
        raise ValueError(f"no 'passing' table found at {URL}")
    # read_html expects a file-like object; literal HTML strings are deprecated
    df = pd.read_html(StringIO(str(table)))[0]
    df.columns = ['Rk', 'Player', 'School', 'Conf', 'G', 'Completions', 'Attempts', 'CompletionPCT', 'PassYds', 'YardsPerAttempt',
                  'AdjustedYardsPerAttempt', 'PassingTDs', 'Int', 'EfficiencyRtg', 'RushAtt', 'RushYds', 'RushAvg', 'RushTD']
    # drop the header rows repeated in the table body; .copy() so the column
    # assignment below works on an independent frame (no SettingWithCopyWarning)
    df = df.loc[df['Conf'] != 'Conf'].copy()
    # keep only the text before the first '*' in each player name
    df['Player'] = df['Player'].str.split('*').str[0].str.strip()
    out_dir = Path('passing')
    out_dir.mkdir(parents=True, exist_ok=True)
    df.to_csv(out_dir / f'{year}.csv', index=False)
    return "Data successfully saved"

In [11]:
# Scrape the 2022 passing table and write it to passing/2022.csv
get_passing_data(2022)

'Data successfully saved'

In [10]:
# Scrape passing data for seasons 1993-2021 (2022 is fetched by its own cell).
# Sports Reference blocks clients that send too many requests per minute, so
# pause between requests instead of firing them back-to-back.
for x in range(1993, 2022):
    get_passing_data(x)
    time.sleep(4)  # stays under the published ~20 requests/minute limit

In [31]:
# Scrape draft classes 1993-2021; a failing year is reported and the loop moves on.
# NOTE: because of a scrolling mistake this cell sits above the cell that
# defines get_draft_data, so that cell must be run first. On a fresh kernel
# every year fails with NameError, which the message below makes visible.
for x in range(1993, 2022):
    try:
        get_draft_data(x)
    except ValueError:
        # presumably a missing table or an unexpected column count for that year
        print(x)
    except Exception as exc:
        # `except Exception` rather than a bare except, so a kernel interrupt
        # (KeyboardInterrupt) still stops the loop; include the cause in the output
        print(x, 'something else', repr(exc))
    # pause between requests; Sports Reference sites block rapid-fire clients
    time.sleep(4)

1993


In [8]:
def get_draft_data(year):
    """Scrape one NFL draft class and save the identifying columns as a CSV.

    Downloads the Pro Football Reference draft page for ``year``, parses the
    table with id ``drafts``, replaces its header with flat column names,
    drops the header rows repeated inside the table body, keeps only
    Round/Pick/Team/Player/Position/Age/School, trims everything from the
    first ``*`` onward in each player name, and writes the result to
    ``draft_data/{year}.csv`` (the directory is created if missing).
    Prints ``success`` when the file has been written.

    Parameters
    ----------
    year : int
        Draft year; interpolated into the URL and the file name.

    Returns
    -------
    None

    Raises
    ------
    ValueError
        If the page does not return a successful HTTP status, the table is
        not present in the response, or the table does not have the expected
        number of columns.
    """
    # NOTE(review): this is a global display option, kept for backward
    # compatibility; it would fit better in a notebook config cell.
    pd.set_option('display.max_columns', None)
    URL = f'https://www.pro-football-reference.com/years/{year}/draft.htm'
    # a timeout keeps a stalled connection from hanging the kernel indefinitely
    res = requests.get(URL, timeout=30)
    if not res.ok:
        # ValueError (not HTTPError) so callers catching ValueError keep working
        raise ValueError(f'{URL} returned HTTP {res.status_code}')
    soup = BS(res.content, 'html.parser')
    table = soup.find(class_="sortable stats_table", id='drafts')
    if table is None:
        raise ValueError(f"no 'drafts' table found at {URL}")
    # read_html expects a file-like object; literal HTML strings are deprecated
    df = pd.read_html(StringIO(str(table)))[0]
    # the second interception column is named 'DefInt' so labels stay unique
    # (the original list used 'Int' twice); it is not part of the saved output
    df.columns = ['Round', 'Pick', 'Team', 'Player', 'Position', 'Age', 'To', 'AP1', 'PB', 'St', 'wAV', 'DrAV', 'G', 'Cmp', 'PassAtt',
                  'PassYds', 'PassTD', 'Int', 'RushAtt', 'RushYds', 'RushTD', 'Rec', 'RecYds', 'RecTD', 'Solo', 'DefInt', 'Sk', 'School', 'Unnamed: 28_level_1']
    # drop the header rows repeated in the table body and keep only the
    # identifying columns; .copy() gives an independent frame to modify
    df1 = df.loc[
        df['Player'] != 'Player',
        ['Round', 'Pick', 'Team', 'Player', 'Position', 'Age', 'School'],
    ].copy()
    # keep only the text before the first '*' in each player name
    df1['Player'] = df1['Player'].str.split('*').str[0].str.strip()
    out_dir = Path('draft_data')
    out_dir.mkdir(parents=True, exist_ok=True)
    df1.to_csv(out_dir / f'{year}.csv', index=False)
    print('success')

In [9]:
# Scrape the 2022 draft class and write it to draft_data/2022.csv
get_draft_data(2022)

success


In [4]:
def get_combine_data(year):
    """Download one year of combine results from nflcombineresults.com as a CSV.

    Fetches the expanded combine page for ``year``, parses the first element
    with class ``sortable`` as a table, and writes it unmodified to
    ``combine_results/{year}.csv`` (the directory is created if missing).
    Prints ``success`` when the file has been written.

    NOTE(review): a later cell defines another ``get_combine_data`` (Pro
    Football Reference source, writing to ``combine_data/``) that replaces
    this one once it is run.

    Parameters
    ----------
    year : int
        Combine year; interpolated into the URL query and the file name.

    Returns
    -------
    None

    Raises
    ------
    ValueError
        If the page does not return a successful HTTP status or no matching
        table is present in the response.
    """
    # NOTE(review): this is a global display option, kept for backward
    # compatibility; it would fit better in a notebook config cell.
    pd.set_option('display.max_columns', None)
    URL = f'https://nflcombineresults.com/nflcombinedata_expanded.php?year={year}&pos=&college='
    # a timeout keeps a stalled connection from hanging the kernel indefinitely
    res = requests.get(URL, timeout=30)
    if not res.ok:
        # ValueError (not HTTPError) so callers catching ValueError keep working
        raise ValueError(f'{URL} returned HTTP {res.status_code}')
    soup = BS(res.content, 'html.parser')
    table = soup.find(class_="sortable")
    if table is None:
        raise ValueError(f"no 'sortable' table found at {URL}")
    # read_html expects a file-like object; literal HTML strings are deprecated
    df = pd.read_html(StringIO(str(table)))[0]
    out_dir = Path('combine_results')
    out_dir.mkdir(parents=True, exist_ok=True)
    df.to_csv(out_dir / f'{year}.csv', index=False)
    print('success')

In [5]:
# Save combine results for 1993-2022; a failing year is reported and the loop moves on.
for x in range(1993, 2023):
    try:
        get_combine_data(x)
    except ValueError:
        # presumably no table was found for that year
        print(x)
    except Exception as exc:
        # `except Exception` rather than a bare except, so a kernel interrupt
        # (KeyboardInterrupt) still stops the loop; include the cause in the output
        print(x, 'something else', repr(exc))

success
success
success
success
success
success
success
success
success
success
success
success
success
success
success
success
success
success
success
success
success
success
success
success
success
success
success
success
success
success


In [6]:
def get_combine_data(year):
    """Scrape one year of combine data for drafted players and save it as a CSV.

    Downloads the Pro Football Reference combine page for ``year``, parses
    the table with id ``combine``, replaces its header with flat column
    names, drops the header rows repeated inside the table body, keeps only
    rows whose ``Drafted`` value is not null, trims everything from the first
    ``*`` onward in each player name, and writes the result to
    ``combine_data/{year}.csv`` (the directory is created if missing).
    Prints ``success`` when the file has been written.

    NOTE(review): this redefines ``get_combine_data`` from an earlier cell
    (nflcombineresults.com source, writing to ``combine_results/``).

    Parameters
    ----------
    year : int
        Combine year; interpolated into the URL and the file name.

    Returns
    -------
    None

    Raises
    ------
    ValueError
        If the page does not return a successful HTTP status, the table is
        not present in the response, or the table does not have the expected
        number of columns.
    """
    # NOTE(review): this is a global display option, kept for backward
    # compatibility; it would fit better in a notebook config cell.
    pd.set_option('display.max_columns', None)
    URL = f'https://www.pro-football-reference.com/draft/{year}-combine.htm'
    # a timeout keeps a stalled connection from hanging the kernel indefinitely
    res = requests.get(URL, timeout=30)
    if not res.ok:
        # ValueError (not HTTPError) so callers catching ValueError keep working
        raise ValueError(f'{URL} returned HTTP {res.status_code}')
    soup = BS(res.content, 'html.parser')
    table = soup.find(class_="sortable stats_table", id='combine')
    if table is None:
        raise ValueError(f"no 'combine' table found at {URL}")
    # read_html expects a file-like object; literal HTML strings are deprecated
    df = pd.read_html(StringIO(str(table)))[0]
    df.columns = ['Player', 'Position', 'School', 'College', 'Height', 'Weight', '40 Time',
                  'Vertical Jump', 'Bench', 'Broad Jump', '3 Cone', 'Shuttle', 'Drafted']
    # drop the header rows repeated in the table body
    df = df.loc[df['Player'] != 'Player']
    # keep only players with a 'Drafted' entry (this filters rows, not columns)
    df1 = df.loc[df['Drafted'].notnull()].copy()
    # keep only the text before the first '*' in each player name
    df1['Player'] = df1['Player'].str.split('*').str[0].str.strip()
    out_dir = Path('combine_data')
    out_dir.mkdir(parents=True, exist_ok=True)
    df1.to_csv(out_dir / f'{year}.csv', index=False)
    print('success')

In [7]:
# Save combine data for 2000-2022.
# Sports Reference sites block clients that send too many requests per
# minute, so pause between requests instead of firing them back-to-back.
for x in range(2000, 2023):
    get_combine_data(x)
    time.sleep(4)  # stays under the published ~20 requests/minute limit

success
success
success
success
success
success
success
success
success
success
success
success
success
success
success
success
success
success
success
success
success
success
success
