In [1]:
import requests                   
from bs4 import BeautifulSoup     
import pandas as pd               
import time                       
import numpy as np                
import re         

In [2]:
# 2000-2020 - e.g 2000 nfl draft is based on 1999 college season
years_nfl = ['2000','2001','2002','2003','2004','2005','2006','2007','2008','2009','2010','2011','2012','2013','2014','2015','2016','2017','2018','2019', '2020']

# 1999-2019 - e.g 1999 season ends in 2000 and players get drafted in 2000
years_ncaa = ['1999','2000','2001','2002','2003','2004','2005','2006','2007','2008','2009','2010','2011','2012','2013','2014','2015','2016','2017','2018','2019']

# list of tables for player stats
player_tables = ['passing','rushing','receiving','kicking','punting']

# list of delays to put between get requests
delays = [5, 6, 7, 8, 9, 10, 14, 17, 19, 23, 27, 29, 31, 33, 35, 42, 45]

# set request headers based on website
headers = {
    'accept': '*/*',
    'accept-encoding': 'gzip, deflate, br',
    'accept-language': 'en-US,en;q=0.9',
    'referer': 'http://www.google.com/',
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/85.0.4183.121 Safari/537.36'
}

In [3]:
# get combine data
def pullCombineData(years):
    # a list to store dataframes -- 1 df per year
    dfs = []
    # a list to store any errors that may come up while scraping
    error_list = []
    
    # iterate over years
    for year in years:
        # use try/except block to catch and inspect any urls that cause an error
        try:
            # set url
            url = 'https://www.pro-football-reference.com/draft/' + year + '-combine.htm'
            
            # put random delays between get requests
            delay = np.random.choice(delays)
            time.sleep(delay)
            # make get request with headers
            html = requests.get(url, headers=headers)

            # create the BeautifulSoup object
            soup = BeautifulSoup(html.content, "lxml")
            # get combine table
            table = soup.find('table', {'id': 'combine'})
            # get column headers
            column_headers = [th.getText() for th in soup.findAll('tr', limit=2)[0].findAll('th')]
            
            # read table as dataframe
            df = pd.read_html(str(table))[0]
            # remove duplicate header rows
            df.drop_duplicates(keep=False, inplace=True) 

            # get player ids and ncaa links, if not exist put N/A
            player_ids = []
            ncaa_links = []
            for tbody in table.find_all('tbody'):
                for tr in tbody.find_all('tr'):
                    for th in tr.find_all('th'):
                        if th.text not in column_headers:
                            try:
                                player_ids.append(th['data-append-csv'])
                            except KeyError:
                                player_ids.append('N/A')
                    for td in tr.find_all('td'):
                            if td['data-stat'] == "college":
                                if td.find_all('a'):
                                    for a in td.find_all('a'):
                                        ncaa_links.append(a['href'])
                                else:
                                    ncaa_links.append('N/A')
            
            # insert year, player id, ncaa link to table
            df.insert(0, "NCAA_Link", ncaa_links)
            df.insert(0, "Player_ID", player_ids)
            df.insert(0, "Year", year)
            
            # append df to dfs list
            dfs.append(df)

        except Exception as e:
            # store the url and the error it causes in a list
            error = [url, e] 
            # append it to the list of errors
            error_list.append(error)
    
    # print errors
    print(error_list)
    # merge into one DataFrame
    combines_df = pd.concat(dfs, ignore_index=True)
    combines_df.to_csv('nfl_combine_2000_2020.csv', index=False)

In [4]:
pullCombineData(years_nfl)


[]


In [5]:
# get draft data
def pullDraftData(years):
    # a list to store dataframes -- 1 df per year
    dfs = []
    # a list to store any errors that may come up while scraping
    error_list = []
    
    # iterate over years
    for year in years:
        # use try/except block to catch and inspect any urls that cause an error
        try:
            # set url
            url = 'https://www.pro-football-reference.com/years/' + year + '/draft.htm'
            
            # put random delays between get requests
            delay = np.random.choice(delays)
            time.sleep(delay)
            # make get request with header
            html = requests.get(url, headers=headers)

            # create the BeautifulSoup object
            soup = BeautifulSoup(html.content, "lxml")
            # get draft table
            table = soup.find('table', {'id': 'drafts'})
            
            # read table as dataframe
            df = pd.read_html(str(table))[0]
            # remove duplicate header rows
            df.drop_duplicates(keep=False, inplace=True)
            # remove multiheaders (first header row)
            df.columns = df.columns.droplevel(0)

            player_ids = []
            ncaa_links = []
            for tbody in table.find_all('tbody'):
                for tr in tbody.find_all('tr'):
                    for td in tr.find_all('td'):
                        if td['data-stat'] == "college_link":
                            if td.find_all('a'):
                                for a in td.find_all('a'):
                                    ncaa_links.append(a['href'])
                            else:
                                ncaa_links.append('N/A')
                        elif td['data-stat'] == "player":
                            try:
                                player_ids.append(td['data-append-csv'])
                            except KeyError:
                                player_ids.append('N/A')
                            
            # insert year, player id, ncaa link to table
            df.insert(0, "NCAA_Link", ncaa_links)
            df.insert(0, "Player_ID", player_ids)
            df.insert(0, "Year", year)
            
            # append df to dfs list
            dfs.append(df)

        except Exception as e:
            # store the url and the error it causes in a list
            error = [url, e] 
            # append it to the list of errors
            error_list.append(error)
    
    # print errors
    print(error_list)
    # concatenate dfs into a single df
    draft_df = pd.concat(dfs, ignore_index=True)
    draft_df.to_csv('nfl_draft_2000_2020.csv', index=False)

In [6]:
pullDraftData(years_nfl)


[]


In [7]:
# get ncaa team offense data
def pullTeamOffData(years):    
    # a list to store dataframes -- 1 df per year
    dfs = []
    # a list to store any errors that may come up while scraping
    error_list = []
    
    # iterate over years
    for year in years:
        # use try/except block to catch and inspect any urls that cause an error
        try:
            # set url
            url = 'https://www.sports-reference.com/cfb/years/' + year + '-team-offense.html'
            
            # put random delays between get requests
            delay = np.random.choice(delays)
            time.sleep(delay)
            # make get request with header
            html = requests.get(url, headers=headers)

            # create the BeautifulSoup object
            soup = BeautifulSoup(html.content, "lxml")
            
            # get team offense table
            table = soup.find('table', {'id': 'offense'})
            
            # read table as dataframe
            df = pd.read_html(str(table))[0]
            # remove duplicate header rows
            df.drop_duplicates(keep=False, inplace=True)
            # remove multilevel headers (first header row)
            df.columns = df.columns.droplevel(0)
            
            # add year
            df.insert(0, "Year", year)
            
            # append df to dfs list
            dfs.append(df)
        
        except Exception as e:
            # store the url and the error it causes in a list
            error = [url, e] 
            # append it to the list of errors
            error_list.append(error)
    
    # print errors
    print(error_list)
    # concatenate dataframes into a single dataframe
    team_offense_df = pd.concat(dfs, ignore_index=True)
    # save dataframe to csv
    team_offense_df.to_csv('ncaaf_team_offense_1999_2019.csv', index=False)

In [8]:
pullTeamOffData(years_ncaa)


[['https://www.sports-reference.com/cfb/years/1999-team-offense.html', ValueError('No tables found')]]


In [9]:
# get ncaa team defense data
def pullTeamDefData(years):    
    # a list to store dataframes -- 1 df per year
    dfs = []
    # a list to store any errors that may come up while scraping
    error_list = []
    
    # iterate over years
    for year in years:
        # use try/except block to catch and inspect any urls that cause an error
        try:
            # set url
            url = 'https://www.sports-reference.com/cfb/years/' + year + '-team-defense.html'
            # put random delays between get requests
            delay = np.random.choice(delays)
            time.sleep(delay)
            # get html
            html = requests.get(url, headers=headers)

            # create the BeautifulSoup object
            soup = BeautifulSoup(html.content, "lxml")
            
            # get team defense table
            table = soup.find('table', {'id': 'defense'})
            df = pd.read_html(str(table))[0]
            # remove duplicate header rows
            df.drop_duplicates(keep=False, inplace=True)
            # remove multilevel headers
            df.columns = df.columns.droplevel(0)
            
            # add year
            df.insert(0, "Year", year)
            
            # append dataframe to dataframes list
            dfs.append(df)
        
        except Exception as e:
            # store the url and the error it causes in a list
            error = [url, e] 
            # append it to the list of errors
            error_list.append(error)
    
    # print errors
    print(error_list)
    # concatenate dataframes into a single dataframe
    team_defense_df = pd.concat(dfs, ignore_index=True)
    # save dataframe to csv
    team_defense_df.to_csv('ncaaf_team_defense_1999_2019.csv', index=False)

In [10]:
pullTeamDefData(years_ncaa)

[['https://www.sports-reference.com/cfb/years/1999-team-defense.html', ValueError('No tables found')]]


In [11]:
# get NCAAF ratings
def pullTeamRatings(years):    
    # a list to store dataframes -- 1 df per year
    dfs = []
    # a list to store any errors that may come up while scraping
    error_list = []
    
    # iterate over years
    for year in years:
        # use try/except block to catch and inspect any urls that cause an error
        try:
            # set url
            url = 'https://www.sports-reference.com/cfb/years/' + year + '-ratings.html'
            
            # put random delays between get requests
            delay = np.random.choice(delays)
            time.sleep(delay)
            # make get request with header
            html = requests.get(url, headers=headers)

            # create the BeautifulSoup object
            soup = BeautifulSoup(html.content, "lxml")
            
            # get ratings table
            table = soup.find('table', {'id': 'ratings'})
            
            # read table as dataframe
            df = pd.read_html(str(table))[0]
            # remove duplicate header rows
            df.drop_duplicates(keep=False, inplace=True)
            # remove multilevel headers (first header row)
            df.columns = df.columns.droplevel(0)
            
            # add year
            df.insert(0, "Year", year)
            
            # append df to dfs list
            dfs.append(df)
        
        except Exception as e:
            # store the url and the error it causes in a list
            error = [url, e] 
            # append it to the list of errors
            error_list.append(error)
    
    # print errors
    print(error_list)
    # concatenate dataframes into a single dataframe
    team_ratings_df = pd.concat(dfs, ignore_index=True)
    # save dataframe to csv
    team_ratings_df.to_csv('ncaaf_team_ratings_1999_2019.csv', index=False)

In [12]:
pullTeamRatings(years_ncaa)

[]


In [13]:
# get nfl team stats
def pullNFLTeamStats(years):    
    # a list to store dataframes -- 1 df per year
    dfs = []
    # a list to store any errors that may come up while scraping
    error_list = []
    
    # iterate over years
    for year in years:
        # use try/except block to catch and inspect any urls that cause an error
        try:
            # set url
            url = 'https://www.pro-football-reference.com/years/' + year + '/'
            # get html
            delay = np.random.choice(delays)
            time.sleep(delay)
            html = requests.get(url, headers=headers)

            # create the BeautifulSoup object
            soup = BeautifulSoup(html.content, "lxml")
            
            # get NFL team stats
            table = soup.find_all('table', {'id': ['AFC', 'NFC']})
            # read table as dataframe
            tables = pd.read_html(str(table))
            
            # concatenate tables into single dataframe
            df = pd.concat(tables, ignore_index=True)
            
            # remove division rows
            df = df[~df['Tm'].isin(divisions)]
            
            # add year
            df.insert(0, "Year", year)
            
            # append dataframe to dataframes list
            dfs.append(df)
        
        except Exception as e:
            # store the url and the error it causes in a list
            error = [url, e] 
            # append it to the list of errors
            error_list.append(error)
    
    # print errors
    print(error_list)
    # concatenate dataframes into a single dataframe
    nfl_team_stats_df = pd.concat(dfs, ignore_index=True)
    
    # save dataframe to csv
    nfl_team_stats_df.to_csv('nfl_team_stats_1999_2019.csv', index=False)

In [14]:
pullNFLTeamStats(years_ncaa)

[['https://www.pro-football-reference.com/years/1999/', NameError("name 'divisions' is not defined")], ['https://www.pro-football-reference.com/years/2000/', NameError("name 'divisions' is not defined")], ['https://www.pro-football-reference.com/years/2001/', NameError("name 'divisions' is not defined")], ['https://www.pro-football-reference.com/years/2002/', NameError("name 'divisions' is not defined")], ['https://www.pro-football-reference.com/years/2003/', NameError("name 'divisions' is not defined")], ['https://www.pro-football-reference.com/years/2004/', NameError("name 'divisions' is not defined")], ['https://www.pro-football-reference.com/years/2005/', NameError("name 'divisions' is not defined")], ['https://www.pro-football-reference.com/years/2006/', NameError("name 'divisions' is not defined")], ['https://www.pro-football-reference.com/years/2007/', NameError("name 'divisions' is not defined")], ['https://www.pro-football-reference.com/years/2008/', NameError("name 'divisions

ValueError: No objects to concatenate

In [None]:
# get ncaa player data
def pullPlayerData(years, tableID):
    # a list to store dataframes -- 1 df per year
    dfs = []
    # a list to store any errors that may come up while scraping
    error_list = []
    # list of tables with multilevel headers
    multi_list = ['passing','rushing','receiving']
    
    # iterate over years
    for year in years:
        # use try/except block to catch and inspect any urls that cause an error
        try:
            # set url
            url = 'https://www.sports-reference.com/cfb/years/' + year + '-' + tableID + '.html'
            
            # put random delays between get requests
            delay = np.random.choice(delays)
            time.sleep(delay)
            # make get request with header
            html = requests.get(url, headers=headers)

            # create the BeautifulSoup object
            soup = BeautifulSoup(html.content, "lxml")
            # get table
            table = soup.find('table', {'id': tableID})
            
            # read table as dataframe
            df = pd.read_html(str(table))[0]
            # remove duplicate header rows
            df.drop_duplicates(keep=False, inplace=True)
            # remove multiheaders (first header row)
            if tableID in multi_list:
                df.columns = df.columns.droplevel(0)

            player_ncaa_ids = []
            ncaa_links = []
            for tbody in table.find_all('tbody'):
                for tr in tbody.find_all('tr'):
                    for td in tr.find_all('td'):
                        if td['data-stat'] == "player":
                            try:
                                player_ncaa_ids.append(td['data-append-csv'])
                            except KeyError:
                                player_ncaa_ids.append('N/A')
                            
                            if td.find_all('a'):
                                for a in td.find_all('a'):
                                    ncaa_links.append(a['href'])
                            else:
                                ncaa_links.append('N/A')
                            
            # insert year, player id, ncaa link to table
            df.insert(0, "NCAA_Link", ncaa_links)
            df.insert(0, "Player_NCAA_ID", player_ncaa_ids)
            df.insert(0, "Year", year)
            
            # append df to dfs list
            dfs.append(df)

        except Exception as e:
            # store the url and the error it causes in a list
            error = [url, e] 
            # append it to the list of errors
            error_list.append(error)
    
    # print errors
    print(error_list)
    # concatenate dataframes into a single dataframe
    out_df = pd.concat(dfs, ignore_index=True)
    out_df.to_csv('ncaa_player_' + tableID + '_stats_1999_2019.csv', index=False)


In [None]:
for table in player_tables:
    pullPlayerData(years, table)

In [None]:
# pull All-Americans
def pullAllAmericans(years):

    base_url = 'https://www.sports-reference.com'
    lst = []
    
    # a list to store any errors that may come up while scraping
    error_list = []
    
    # iterate over years
    for year in years:
        # use try/except block to catch and inspect any urls that cause an error
        try:
            # set url
            url = 'https://www.sports-reference.com/cfb/years/' + year + '.html'
            
            # put random delays between get requests 
            delay = np.random.choice(delays)
            time.sleep(delay)
            
            # get html
            res = requests.get(url, headers = headers)
            
            # Work around comments
            comm = re.compile("<!--|-->")
            soup = BeautifulSoup(comm.sub("", res.text), 'lxml')
            for row in soup.find_all('div', id = 'div_all_americans'):
                for p in row.find_all('p'):
                    for a in p.find_all('a',limit=1):
                        line = [year, p.text.split(',')[0].replace('*',''), p.text.split(',')[1].replace(' ',''), p.text.split(',')[2].lstrip(), base_url + a['href']]
                    
                    lst.append(line)
        
        except Exception as e:
            # store the url and the error it causes in a list
            error = [url, e] 
            # append it to the list of errors
            error_list.append(error)
    
    # print errors
    print(error_list)
    # add list to dataframe
    df=pd.DataFrame(lst,columns=['Year','Player','Pos','School','NCAA_Link'])
    # save as csv
    df.to_csv('ncaa_all_americans__1999_2019.csv', index=False)

In [None]:
pullAllAmericans(years_ncaa)

In [None]:
def PullNFLCombineResults(years):
    # a list to store dataframes -- 1 df per year
    dfs = []
    # a list to store any errors that may come up while scraping
    error_list = []
    
    # iterate over years
    for year in years:
        # use try/except block to catch and inspect any urls that cause an error
        try:
            # set url
            url = 'https://nflcombineresults.com/nflcombinedata_expanded.php?year=' + year + ' &pos=&college='
                        
            # put random delays between get requests
            delay = np.random.choice(delays)
            time.sleep(delay)
            # make get request with header
            html = requests.get(url, headers=headers)

            # create the BeautifulSoup object
            soup = BeautifulSoup(html.content, "lxml")
            
            # get ratings table
            table = soup.find('table', {'class': 'sortable'})
            
            # read table as dataframe
            df = pd.read_html(str(table))[0]
            
            # append df to dfs list
            dfs.append(df)
            
        except Exception as e:
            # store the url and the error it causes in a list
            error = [url, e] 
            # append it to the list of errors
            error_list.append(error)
    
    # print errors
    print(error_list)
    # concatenate dataframes into a single dataframe
    combineresults_df = pd.concat(dfs, ignore_index=True)
    # save dataframe to csv
    combineresults_df.to_csv('nflcombineresults_2000_2020.csv', index=False)