# Scrape NFL Game Data From PFR
Scrape NFL game data for one or more years from https://www.pro-football-reference.com, with a sleep time of between 3.5 and 5.5 seconds to avoid overwhelming the server.

In [None]:
import os
import time
import random
import requests
from bs4 import BeautifulSoup, Comment
import pandas as pd


## weather func

In [2]:
def parse_weather(weather_text):
    """
    Parse the weather information to extract temperature, humidity, and wind speed.

    Each value is located by the keyword next to it (``<n> degrees``,
    ``humidity <n>%``, ``<n> mph``) rather than by its position in the text,
    so extra or missing fragments (for example "no wind" or a trailing
    "wind chill 17") do not raise.

    Parameters:
    weather_text (str): Text containing weather information, e.g.
        "72 degrees, relative humidity 55%, wind 8 mph". May be None or empty.

    Returns:
    tuple: A tuple containing temperature (float), humidity_pct (float), and wind_speed (float).
        Any value that cannot be found in the text is None.
    """
    # Function-scope import: the notebook's import cell does not import ``re``.
    import re

    temperature = None
    humidity_pct = None
    wind_speed = None

    # Nothing to parse: keep every field as None instead of raising.
    if not weather_text:
        return temperature, humidity_pct, wind_speed

    number = r'(-?\d+(?:\.\d+)?)'

    temperature_match = re.search(number + r'\s*degrees', weather_text)
    if temperature_match:
        temperature = float(temperature_match.group(1))

    humidity_match = re.search(r'humidity\s+' + number, weather_text)
    if humidity_match:
        humidity_pct = float(humidity_match.group(1))

    # Only a number directly followed by "mph" counts as a wind speed.
    wind_match = re.search(number + r'\s*mph', weather_text)
    if wind_match:
        wind_speed = float(wind_match.group(1))

    return temperature, humidity_pct, wind_speed

In [3]:
def date_func(url):
    """
    Build the event date from fixed character positions near the end of the URL.

    The year, month and day are read from the 16th-13th, 12th-11th and
    10th-9th characters counted from the end of ``url``.

    Parameters:
    url (str): The URL containing date information.

    Returns:
    str: Formatted event date in 'YYYY-MM-DD' format.
    """
    year, month, day = url[-16:-12], url[-12:-10], url[-10:-8]
    return '-'.join((year, month, day))

## Game starters

In [4]:
def _parse_starters_table(comment, event_date, home):
    """Return one [date, team, player_id, name, position, home] list per starter row in *comment*."""
    table = BeautifulSoup(comment, 'html.parser').find('table')
    if table is None:
        return []

    # The team name is whatever precedes " Starters Table" in the caption.
    caption = table.find('caption')
    team_name = caption.get_text().split(' Starters Table')[0] if caption is not None else None

    starters = []
    for row in table.find_all('tr')[1:]:  # Skip header row
        player_tag = row.find('th', {'data-stat': 'player'})
        if player_tag is None:
            continue
        # Prefer the link text; fall back to the cell text if there is no link.
        link = player_tag.find('a')
        name_tag = link if link is not None else player_tag
        pos_tag = row.find('td', {'data-stat': 'pos'})
        starters.append([
            event_date,
            team_name,
            player_tag.get('data-append-csv'),
            name_tag.get_text(),
            pos_tag.get_text() if pos_tag is not None else None,
            home,
        ])
    return starters


def game_starters(comments,event_date):
    """
    Extract starting players' data for each team from comments.

    A comment containing 'id="home_starters"' is parsed as the home team
    (Home = 1); otherwise a comment containing 'id="vis_starters"' is parsed
    as the visiting team (Home = 0). Other comments are ignored. Rows without
    a player cell are skipped; a missing caption, player id or position cell
    yields None for that field instead of raising.

    Parameters:
    comments (list): List of comments containing HTML tables of starters.
    event_date (str): Date of the event in 'YYYY-MM-DD' format.

    Returns:
    DataFrame: A DataFrame with columns Date, Team, Player_id, Starter, Position and Home.
    """
    table_data = []

    for comment in comments:
        if 'id="home_starters"' in comment:
            table_data.extend(_parse_starters_table(comment, event_date, 1))
        elif 'id="vis_starters"' in comment:
            table_data.extend(_parse_starters_table(comment, event_date, 0))

    # Convert the list to a DataFrame
    return pd.DataFrame(table_data, columns=["Date","Team", "Player_id", "Starter", "Position", "Home"])

## Snap counts

In [5]:

def _parse_snap_counts_table(comment, event_date):
    """Return one [date, team, player_id, name, pos, off/def/st counts and pcts] list per player row."""
    n_cells = 7  # pos, off_num, off_pct, def_num, def_pct, st_num, st_pct

    table = BeautifulSoup(comment, 'html.parser').find('table')
    if table is None:
        return []

    # The team name is whatever precedes " Snap Counts Table" in the caption.
    caption = table.find('caption')
    team_name = caption.get_text().split(' Snap Counts Table')[0] if caption is not None else None

    snap_rows = []
    for row in table.find_all('tr')[2:]:  # Skip the two header rows
        cells = row.find_all('td')
        player_tag = row.find('th', {'data-stat': 'player'})
        # Skip rows without a player cell or without enough columns, so a row
        # never inherits the previous row's player.
        if player_tag is None or len(cells) < n_cells:
            continue
        link = player_tag.find('a')
        name_tag = link if link is not None else player_tag
        snap_rows.append([
            event_date,
            team_name,
            player_tag.get('data-append-csv'),
            name_tag.get_text(),
            *(cell.text.strip() for cell in cells[:n_cells]),
        ])
    return snap_rows


def snap_counts(comments,event_date):
    """
    Extract snap count data for each player from comments.

    Every comment containing 'id="home_snap_counts"' or 'id="vis_snap_counts"'
    is parsed once, and the first table in it is read. Rows without a player
    cell or with fewer than seven data cells are skipped.

    Parameters:
    comments (list): List of comments containing HTML tables of snap counts.
    event_date (str): Date of the event in 'YYYY-MM-DD' format.

    Returns:
    DataFrame: A DataFrame containing snap counts for players including offensive, defensive, and special teams data.
    """
    markers = ('id="home_snap_counts"', 'id="vis_snap_counts"')
    snap_data = []

    for comment in comments:
        if any(marker in comment for marker in markers):
            snap_data.extend(_parse_snap_counts_table(comment, event_date))

    # Convert the list to a DataFrame
    return pd.DataFrame(snap_data, columns=["date","team", "player_id", "player", "position", "off_num","off_pct","def_num","def_pct","st_num","st_pct"])



## Passing, Rushing, & Receiving

In [6]:
def player_offense(soup,event_date):
    """
    Extract offensive player stats from the HTML soup.

    Reads the rows in the body of the table whose id is 'player_offense'.
    The first 21 data cells of a row are taken positionally, in the order of
    the stat columns below. Rows without a player cell or with fewer than 21
    data cells are skipped, so a malformed row never repeats values from the
    previous row. If the table or its body is missing, an empty DataFrame is
    returned.

    Parameters:
    soup (BeautifulSoup): Parsed HTML content of the page.
    event_date (str): Date of the event in 'YYYY-MM-DD' format.

    Returns:
    DataFrame: A DataFrame containing offensive player statistics such as passing, rushing, and receiving.
    """
    stat_columns = ['team', 'pass_cmp', 'pass_att', 'pass_yds', 'pass_td', 'pass_int', 'pass_sacked', 'pass_sacked_yds',
                    'pass_long', 'pass_rating', 'rush_att', 'rush_yds', 'rush_td', 'rush_long', 'rec_tgt', 'rec_rec',
                    'rec_yds', 'rec_td', 'rec_long', 'fmb', 'fmb_lost']
    columns = ['date', 'player_id', 'player_name'] + stat_columns
    n_cells = len(stat_columns)

    # Find the table by ID; tolerate a page that does not contain it.
    table = soup.find('table', id='player_offense')
    body = table.find('tbody') if table is not None else None
    rows = body.find_all('tr') if body is not None else []

    offense_data = []
    for row in rows:
        cells = row.find_all('td')
        player_tag = row.find('th', {'data-stat': 'player'})
        if player_tag is None or len(cells) < n_cells:
            continue
        # Prefer the link text; fall back to the cell text if there is no link.
        link = player_tag.find('a')
        name_tag = link if link is not None else player_tag
        offense_data.append([
            event_date,
            player_tag.get('data-append-csv'),
            name_tag.get_text(),
            *(cell.text.strip() for cell in cells[:n_cells]),
        ])

    return pd.DataFrame(offense_data, columns=columns)

    

## Defense

In [7]:
def player_defense(comments,event_date):
    """
    Extract defensive player stats from comments.

    Every comment containing 'id="player_defense"' is parsed and its first
    table is read. The first 16 data cells of a row are taken positionally,
    in the order of the stat columns below. Rows without a player cell or
    with fewer than 16 data cells are skipped, so a row never inherits the
    previous row's player.

    Parameters:
    comments (list): List of comments containing HTML tables of player defense stats.
    event_date (str): Date of the event in 'YYYY-MM-DD' format.

    Returns:
    DataFrame: A DataFrame containing defensive player statistics such as tackles, sacks, interceptions.
    """
    stat_columns = ['team', 'def_int', 'def_int_yds', 'def_int_td', 'def_int_long', 'pass_defended', 'sacks',
                    'tackles_combined', 'tackles_solo', 'tackles_assists', 'tackles_loss', 'qb_hits',
                    'fumbles_rec', 'fumbles_rec_yds', 'fumbles_rec_td', 'fumbles_forced']
    columns = ['date', 'player_id', 'player'] + stat_columns
    n_cells = len(stat_columns)

    defense_data = []
    for comment in comments:
        if 'id="player_defense"' not in comment:
            continue
        table = BeautifulSoup(comment, 'html.parser').find('table')
        if table is None:
            continue
        for row in table.find_all('tr')[2:]:  # Skip the two header rows
            cells = row.find_all('td')
            player_tag = row.find('th', {'data-stat': 'player'})
            if player_tag is None or len(cells) < n_cells:
                continue
            # Prefer the link text; fall back to the cell text if there is no link.
            link = player_tag.find('a')
            name_tag = link if link is not None else player_tag
            defense_data.append([
                event_date,
                player_tag.get('data-append-csv'),
                name_tag.get_text(),
                *(cell.text.strip() for cell in cells[:n_cells]),
            ])

    return pd.DataFrame(defense_data, columns=columns)

## Returns

In [8]:
def player_returns(comments,event_date):
    """
    Extract return player stats from comments.

    Every comment containing 'id="returns"' is parsed and its first table is
    read. The first 11 data cells of a row are taken positionally, in the
    order of the stat columns below. Rows without a player cell or with fewer
    than 11 data cells are skipped, so a row never inherits the previous
    row's player.

    Parameters:
    comments (list): List of comments containing HTML tables of player return stats.
    event_date (str): Date of the event in 'YYYY-MM-DD' format.

    Returns:
    DataFrame: A DataFrame containing player return statistics such as kick and punt returns.
    """
    stat_columns = ['team', 'kick_ret', 'kick_ret_yds', 'kick_ret_yds_per_ret', 'kick_ret_td', 'kick_ret_long',
                    'punt_ret', 'punt_ret_yds', 'punt_ret_yds_per_ret', 'punt_ret_td', 'punt_ret_long']
    columns = ['date', 'player_id', 'player_name'] + stat_columns
    n_cells = len(stat_columns)

    returns_data = []
    for comment in comments:
        if 'id="returns"' not in comment:
            continue
        table = BeautifulSoup(comment, 'html.parser').find('table')
        if table is None:
            continue
        for row in table.find_all('tr')[2:]:  # Skip the two header rows
            cells = row.find_all('td')
            player_tag = row.find('th', {'data-stat': 'player'})
            if player_tag is None or len(cells) < n_cells:
                continue
            # Prefer the link text; fall back to the cell text if there is no link.
            link = player_tag.find('a')
            name_tag = link if link is not None else player_tag
            returns_data.append([
                event_date,
                player_tag.get('data-append-csv'),
                name_tag.get_text(),
                *(cell.text.strip() for cell in cells[:n_cells]),
            ])

    return pd.DataFrame(returns_data, columns=columns)

## Kicking and Punting

In [9]:
def player_kicking(comments,event_date):
    """
    Extract kicking player stats from comments.

    Every comment containing 'id="kicking"' is parsed and its first table is
    read. The first 9 data cells of a row are taken positionally, in the
    order of the stat columns below. Rows without a player cell or with fewer
    than 9 data cells are skipped, so a row never inherits the previous row's
    player.

    Parameters:
    comments (list): List of comments containing HTML tables of player kicking stats.
    event_date (str): Date of the event in 'YYYY-MM-DD' format.

    Returns:
    DataFrame: A DataFrame containing player kicking statistics such as field goals, punts, and extra points.
    """
    stat_columns = ['team', 'scoring_xpm', 'scoring_xpa', 'scoring_fgm', 'scoring_fga',
                    'punt', 'punt_yds', 'punt_yds_per_punt', 'punt_long']
    columns = ['date', 'player_id', 'player_name'] + stat_columns
    n_cells = len(stat_columns)

    kicking_data = []
    for comment in comments:
        if 'id="kicking"' not in comment:
            continue
        table = BeautifulSoup(comment, 'html.parser').find('table')
        if table is None:
            continue
        for row in table.find_all('tr')[2:]:  # Skip the two header rows
            cells = row.find_all('td')
            player_tag = row.find('th', {'data-stat': 'player'})
            if player_tag is None or len(cells) < n_cells:
                continue
            # Prefer the link text; fall back to the cell text if there is no link.
            link = player_tag.find('a')
            name_tag = link if link is not None else player_tag
            kicking_data.append([
                event_date,
                player_tag.get('data-append-csv'),
                name_tag.get_text(),
                *(cell.text.strip() for cell in cells[:n_cells]),
            ])

    return pd.DataFrame(kicking_data, columns=columns)
                    


## Advanced Passing

In [10]:
def player_advanced_passing(comments,event_date):
    """
    Extract advanced passing stats for players from comments.

    Every comment containing 'id="passing_advanced"' is parsed and its first
    table is read. The first 25 data cells of a row are taken positionally,
    in the order of the stat columns below. Rows without a player cell or
    with fewer than 25 data cells are skipped, so a row never inherits the
    previous row's player.

    Parameters:
    comments (list): List of comments containing HTML tables of advanced passing stats.
    event_date (str): Date of the event in 'YYYY-MM-DD' format.

    Returns:
    DataFrame: A DataFrame containing advanced passing statistics such as air yards, yards after catch.
    """
    stat_columns = ['team', 'pass_cmp', 'pass_att', 'pass_yds', 'pass_first_down', 'pass_first_down_pct',
                    'pass_target_yds', 'pass_tgt_yds_per_att', 'pass_air_yds', 'pass_air_yds_per_cmp',
                    'pass_air_yds_per_att', 'pass_yac', 'pass_yac_per_cmp', 'pass_drops', 'pass_drop_pct',
                    'pass_poor_throws', 'pass_poor_throw_pct', 'pass_sacked', 'pass_blitzed', 'pass_hurried',
                    'pass_hits', 'pass_pressured', 'pass_pressured_pct', 'rush_scrambles',
                    'rush_scrambles_yds_per_att']
    columns = ['date', 'player_id', 'player_name'] + stat_columns
    n_cells = len(stat_columns)

    advanced_passing_data = []
    for comment in comments:
        if 'id="passing_advanced"' not in comment:
            continue
        table = BeautifulSoup(comment, 'html.parser').find('table')
        if table is None:
            continue
        for row in table.find_all('tr')[1:]:  # Skip header row
            cells = row.find_all('td')
            player_tag = row.find('th', {'data-stat': 'player'})
            if player_tag is None or len(cells) < n_cells:
                continue
            # Prefer the link text; fall back to the cell text if there is no link.
            link = player_tag.find('a')
            name_tag = link if link is not None else player_tag
            advanced_passing_data.append([
                event_date,
                player_tag.get('data-append-csv'),
                name_tag.get_text(),
                *(cell.text.strip() for cell in cells[:n_cells]),
            ])

    # Create a DataFrame from the extracted data
    return pd.DataFrame(advanced_passing_data, columns=columns)
                                        

## Advanced Rushing

In [11]:
def player_advanced_rushing(comments,event_date):
    """
    Extract advanced rushing stats for players from comments.

    Every comment containing 'id="rushing_advanced"' is parsed and its first
    table is read. The first 11 data cells of a row are taken positionally,
    in the order of the stat columns below. Rows without a player cell or
    with fewer than 11 data cells are skipped, so a row never inherits the
    previous row's player.

    Parameters:
    comments (list): List of comments containing HTML tables of advanced rushing stats.
    event_date (str): Date of the event in 'YYYY-MM-DD' format.

    Returns:
    DataFrame: A DataFrame containing advanced rushing statistics like yards before contact and yards after contact.
    """
    stat_columns = ['team', 'rush_att', 'rush_yds', 'rush_td', 'rush_first_down', 'rush_yds_before_contact',
                    'rush_yds_bc_per_rush', 'rush_yac', 'rush_yac_per_rush', 'rush_broken_tackles',
                    'rush_broken_tackles_per_rush']
    columns = ['date', 'player_id', 'player_name'] + stat_columns
    n_cells = len(stat_columns)

    advanced_rushing_data = []
    for comment in comments:
        if 'id="rushing_advanced"' not in comment:
            continue
        table = BeautifulSoup(comment, 'html.parser').find('table')
        if table is None:
            continue
        for row in table.find_all('tr')[1:]:  # Skip header row
            cells = row.find_all('td')
            player_tag = row.find('th', {'data-stat': 'player'})
            if player_tag is None or len(cells) < n_cells:
                continue
            # Prefer the link text; fall back to the cell text if there is no link.
            link = player_tag.find('a')
            name_tag = link if link is not None else player_tag
            advanced_rushing_data.append([
                event_date,
                player_tag.get('data-append-csv'),
                name_tag.get_text(),
                *(cell.text.strip() for cell in cells[:n_cells]),
            ])

    # Create a DataFrame from the extracted data
    return pd.DataFrame(advanced_rushing_data, columns=columns)

## Advanced Receiving

In [12]:
def player_advanced_receiving(comments,event_date):
    """
    Extract advanced receiving stats for players from comments.

    Every comment containing 'id="receiving_advanced"' is parsed and its
    first table is read. The first 17 data cells of a row are taken
    positionally, in the order of the stat columns below. Rows without a
    player cell or with fewer than 17 data cells are skipped, so a row never
    inherits the previous row's player.

    Parameters:
    comments (list): List of comments containing HTML tables of advanced receiving stats.
    event_date (str): Date of the event in 'YYYY-MM-DD' format.

    Returns:
    DataFrame: A DataFrame containing advanced receiving statistics such as air yards, yards after catch, and drop rates.
    """
    stat_columns = ['team', 'targets', 'rec', 'rec_yds', 'rec_td', 'rec_first_down', 'rec_air_yds',
                    'rec_air_yds_per_rec', 'rec_yac', 'rec_yac_per_rec', 'rec_adot', 'rec_broken_tackles',
                    'rec_broken_tackles_per_rec', 'rec_drops', 'rec_drop_pct', 'rec_target_int',
                    'rec_pass_rating']
    columns = ['date', 'player_id', 'player_name'] + stat_columns
    n_cells = len(stat_columns)

    advanced_receiving_data = []
    for comment in comments:
        if 'id="receiving_advanced"' not in comment:
            continue
        table = BeautifulSoup(comment, 'html.parser').find('table')
        if table is None:
            continue
        for row in table.find_all('tr')[1:]:  # Skip header row
            cells = row.find_all('td')
            player_tag = row.find('th', {'data-stat': 'player'})
            if player_tag is None or len(cells) < n_cells:
                continue
            # Prefer the link text; fall back to the cell text if there is no link.
            link = player_tag.find('a')
            name_tag = link if link is not None else player_tag
            advanced_receiving_data.append([
                event_date,
                player_tag.get('data-append-csv'),
                name_tag.get_text(),
                *(cell.text.strip() for cell in cells[:n_cells]),
            ])

    # Create a DataFrame from the extracted data
    return pd.DataFrame(advanced_receiving_data, columns=columns)

## Advanced Defense

In [13]:
def player_advanced_defense(comments,event_date):
    """
    Extract advanced defensive stats for players from comments.

    Only comments containing the 'defense_advanced' table are parsed. Rows that
    have no player header cell, or fewer stat cells than expected (e.g. repeated
    header rows inside the table body), are skipped.

    Parameters:
    comments (list): List of comments containing HTML tables of advanced defensive stats.
    event_date (str): Date of the event in 'YYYY-MM-DD' format.

    Returns:
    DataFrame: A DataFrame containing advanced defensive statistics such as pressures, missed tackles, and blitzes.
               All stat values are kept as the stripped cell text (strings).
    """
    # Stat columns, listed in the same order as the <td> cells of each table row
    stat_columns = [
        'team', 'def_int', 'def_targets', 'def_cmp', 'def_cmp_perc', 'def_cmp_yds', 'def_yds_per_cmp',
        'def_yds_per_target', 'def_cmp_td', 'def_pass_rating', 'def_tgt_yds_per_att', 'def_air_yds',
        'def_yac', 'blitzes', 'qb_hurry', 'qb_knockdown', 'sacks', 'pressures', 'tackles_combined',
        'tackles_missed', 'tackles_missed_pct'
    ]
    columns = ['date', 'player_id', 'player_name'] + stat_columns

    # Initialize a list to store table data
    advanced_defense_data = []

    # Iterate over each comment and parse the one that contains 'defense_advanced'
    for comment in comments:
        if 'id="defense_advanced"' not in comment:
            continue

        table = BeautifulSoup(comment, 'html.parser').find('table')
        if not table:
            continue

        for row in table.find_all('tr')[1:]:  # Skip header row
            cells = row.find_all('td')
            if len(cells) < len(stat_columns):
                # Skip rows that have fewer columns (short rows)
                continue

            player_tag = row.find('th', {'data-stat': 'player'})
            if player_tag is None:
                # Without a player cell the row cannot be attributed to anyone;
                # skipping avoids reusing the previous row's player by accident.
                continue

            player_id = player_tag.get('data-append-csv')
            player_link = player_tag.find('a')
            player_name = player_link.get_text() if player_link else player_tag.get_text(strip=True)

            stats = [cell.text.strip() for cell in cells[:len(stat_columns)]]
            advanced_defense_data.append([event_date, player_id, player_name, *stats])

    # Create a DataFrame from the extracted data
    return pd.DataFrame(advanced_defense_data, columns=columns)

## team stats

In [14]:
def _split_dashed_stat(text):
    """
    Split a dash-joined stat string into a list of ints, preserving negative values.

    A doubled dash marks a negative component ("25--3-0" is 25, -3, 0), so only
    dashes that directly follow a digit are treated as separators.
    """
    import re  # local import: `re` is not among the notebook's top-level imports

    return [int(piece) for piece in re.split(r'(?<=\d)-', text.strip())]


def _safe_ratio(made, attempted):
    """Return made/attempted rounded to 3 decimals, or 0 when attempted is 0."""
    try:
        return round((made / attempted), 3)
    except ZeroDivisionError:
        return 0


def _record_team_stat(team_data, stat_name, raw_value):
    """Parse one stat cell of the team stats table and store its component(s) in team_data."""
    # Two-part stats that map straight onto a pair of integer columns
    simple_pairs = {
        "Sacked-Yards": ('times_sacked', 'yds_sacked_for'),
        "Fumbles-Lost": ('fumbles', 'fumbles_lost'),
        "Penalties-Yards": ('penalties', 'penalty_yds'),
    }
    # Conversion stats: made-attempted plus a derived percentage column
    conversions = {
        "Third Down Conv.": 'third_down',
        "Fourth Down Conv.": 'fourth_down',
    }

    if stat_name == "Rush-Yds-TDs":
        parts = _split_dashed_stat(raw_value)
        team_data['rush_att'] = parts[0]
        team_data['rush_yds'] = parts[1]
        team_data['rush_tds'] = parts[2]

    elif stat_name == "Cmp-Att-Yd-TD-INT":
        parts = _split_dashed_stat(raw_value)
        team_data['pass_cmp'] = parts[0]
        team_data['pass_att'] = parts[1]
        team_data["pass_cmp_pct"] = _safe_ratio(parts[0], parts[1])
        team_data['pass_yds'] = parts[2]
        team_data['pass_tds'] = parts[3]
        team_data['pass_int'] = parts[4]

    elif stat_name in simple_pairs:
        first_key, second_key = simple_pairs[stat_name]
        parts = _split_dashed_stat(raw_value)
        team_data[first_key] = parts[0]
        team_data[second_key] = parts[1]

    elif stat_name in conversions:
        prefix = conversions[stat_name]
        parts = _split_dashed_stat(raw_value)
        team_data[f'{prefix}_conv'] = parts[0]
        team_data[f'{prefix}_att'] = parts[1]
        team_data[f'{prefix}_conv_pct'] = _safe_ratio(parts[0], parts[1])

    elif stat_name == "Time of Possession":
        # "MM:SS" -> total seconds
        clock = raw_value.split(':')
        team_data['time_of_possession'] = (int(clock[0])*60) + int(clock[1])

    else:
        # Other stats like Net Pass Yards, Total Yards, Turnovers, etc. are kept as text
        team_data[stat_name.replace(' ', '_').lower()] = raw_value


def team_stats(comments,event_date):
    """
    Extract team statistics from comments.

    Compound cells ("Rush-Yds-TDs", "Cmp-Att-Yd-TD-INT", "Sacked-Yards", ...) are split
    into separate integer columns, completion and 3rd/4th-down conversion rates are
    added as derived columns (0 when there were no attempts), and time of possession
    is converted to seconds. Any other stat is stored as its raw text under a
    lower-cased, underscore-separated column name.

    Parameters:
    comments (list): List of comments containing HTML tables of team statistics.
    event_date (str): Date of the event in 'YYYY-MM-DD' format.

    Returns:
    DataFrame: A DataFrame containing team statistics such as rushing yards, passing yards, and turnovers.
               Two rows per parsed table: visitor first, then home. Empty if no table was found.
    """
    # Initialize a list to store table data
    data = []
    for comment in comments:
        if 'id="team_stats"' not in comment:
            continue

        table = BeautifulSoup(comment, 'html.parser').find('table')
        if table is None:
            continue

        # Extract the team names from the header: [label, visitor, home]
        thead = table.find('thead')
        headers = thead.find_all('th') if thead else []
        if len(headers) < 3:
            continue
        visitor_team = headers[1].text.strip()
        home_team = headers[2].text.strip()

        # Initialize dictionaries to hold the visitor and home team data
        visitor_data = {'event_date':event_date,'team': visitor_team}
        home_data = {'event_date':event_date,'team': home_team}

        # Loop through the rows to extract each stat
        for row in table.find_all('tr')[1:]:  # Skipping the first row as it's the column headers
            label = row.find('th')
            cells = row.find_all('td')
            if label is None or len(cells) < 2:
                # Not a "stat | visitor | home" row
                continue

            stat_name = label.text.strip()
            _record_team_stat(visitor_data, stat_name, cells[0].text.strip())
            _record_team_stat(home_data, stat_name, cells[1].text.strip())

        # Add the extracted data to the main data list
        data.append(visitor_data)
        data.append(home_data)

    # Convert the extracted data into a Pandas DataFrame
    return pd.DataFrame(data)

## games info

In [15]:
def _toss_outcome(toss_text):
    """
    Split a coin-toss cell into (team, decision).

    A cell mentioning 'deferred' (e.g. "Patriots (deferred)") yields the team name with
    that marker removed and the decision 'deferred'; any other cell is returned
    unchanged with the decision 'accepted'.
    """
    if 'deferred' in toss_text:
        team = toss_text.replace('(deferred)', '').replace('deferred', '').strip()
        return team, 'deferred'
    return toss_text, 'accepted'


def games_info(boxscore_link):
    """
    Fetch game-related information and extract statistics.

    Parameters:
    boxscore_link (str): URL to the game's boxscore page.

    Returns:
    tuple: A tuple containing game information dictionary, team stats DataFrame, starters DataFrame, 
           snap counts DataFrame, and DataFrames for various player statistics (offense, defense, returns, kicking, etc.).
           Order: game_info, team stats, starters, snap counts, offense, defense, returns, kicking,
           advanced passing, advanced rushing, advanced receiving, advanced defense.

    Raises:
    requests.exceptions.RequestException: If the page cannot be fetched (including a timeout).
    IndexError: If the page has no 'scorebox' section.
    """
    # A timeout keeps one stalled connection from hanging a multi-hour scrape forever
    page = requests.get(boxscore_link, timeout=30)
    soup = BeautifulSoup(page.content, 'html.parser')
    # Most tables on the page are embedded inside HTML comments
    comments = soup.find_all(string = lambda text: isinstance(text, Comment))

    # Pair each team (scorebox order) with the coach link found in the 'datapoint' blocks
    coaches = [link.text for block in soup.find_all('div', class_='datapoint')
               if (link := block.find('a')) is not None]
    team_tag = soup.find_all('div', class_='scorebox')
    teams = [t.text for tm in team_tag[0].find_all('strong') if (t := tm.find('a')) is not None]
    coach_team = dict(zip(teams, coaches))

    event_date=date_func(boxscore_link)

    #individual game data
    i_team_stats = team_stats(comments,event_date)
    i_starters = game_starters(comments,event_date)
    i_snap_counts = snap_counts(comments,event_date)
    i_player_offense = player_offense(soup,event_date)
    i_player_defense = player_defense(comments,event_date)
    i_player_returns = player_returns(comments,event_date)
    i_player_kicking = player_kicking(comments,event_date)
    i_player_advance_passing = player_advanced_passing(comments,event_date)
    i_player_advance_rushing = player_advanced_rushing(comments,event_date)
    i_player_advance_receiving = player_advanced_receiving(comments,event_date)
    i_player_advance_defense = player_advanced_defense(comments,event_date)

    game_info = {
        'won_toss': None,
        'won_toss_decision': None,
        'won_toss_overtime': None,
        'won_toss_overtime_decision': None,
        'attendance': None,
        'duration': None,
        'roof_type': None,
        'surface_type': None,
        'temperature': None,
        'humidity_pct': None,
        'wind_speed': None,
        'team_spread': None,
        'over_under': None,
        'coaches':coach_team
    }

    # Extract relevant data from the comments
    for comment in comments:
        if 'id="game_info"' not in comment:
            continue

        table = BeautifulSoup(comment, 'html.parser').find('table')
        if not table:
            continue

        for row in table.find_all('tr'):
            header_cell = row.find('th')
            value_cell = row.find('td')
            if header_cell is None or value_cell is None:
                # Nothing to map without both a label and a value
                continue
            header = header_cell.text.strip()
            value = value_cell.text.strip()

            # Map the data from the table to the appropriate field
            if header == "Won Toss":
                game_info['won_toss'], game_info['won_toss_decision'] = _toss_outcome(value)
            elif header == "Won OT Toss":
                # Parsed from this row's own text, not from the opening-toss row
                game_info['won_toss_overtime'], game_info['won_toss_overtime_decision'] = _toss_outcome(value)
            elif header == "Roof":
                game_info['roof_type'] = value
            elif header == "Surface":
                game_info['surface_type'] = value
            elif header == "Duration":
                # "H:MM" -> total minutes
                duration_parts = value.split(":")
                game_info['duration'] = int(duration_parts[0]) * 60 + int(duration_parts[1])
            elif header == "Attendance":
                game_info['attendance'] = int(value.replace(",", ""))
            elif header == "Weather":
                game_info['temperature'], game_info['humidity_pct'], game_info['wind_speed'] = parse_weather(value)
            elif header == "Vegas Line":
                game_info['team_spread'] = value  # Extract the team spread
            elif header == "Over/Under":
                game_info['over_under'] = value  # Extract the over/under total

    return (game_info, i_team_stats, i_starters, i_snap_counts, i_player_offense, i_player_defense,
            i_player_returns, i_player_kicking, i_player_advance_passing, i_player_advance_rushing,
            i_player_advance_receiving, i_player_advance_defense)
    

## game data

In [16]:
def scrape_game_data(url,year):
    """
    Scrape game data for a specific year from the given URL.

    Every table row with at least 13 data cells (other than the 'Playoffs' divider)
    is treated as a game. Rows without a boxscore link are skipped, since there is
    no game page to scrape for them. Each boxscore page is fetched via games_info,
    followed by a random 3.5-5.5 second pause.

    Parameters:
    url (str): URL of the page containing game information for the specified year.
    year (int): The year for which game data is being scraped.

    Returns:
    tuple: A tuple containing the games DataFrame and a list of DataFrames for team stats, 
           starters, snap counts, and various player statistics.
           Order: games, team stats, starters, snap counts, offense, defense, returns, kicking,
           advanced passing, advanced rushing, advanced receiving, advanced defense.

    Raises:
    requests.exceptions.RequestException: If the season page cannot be fetched (including a timeout).
    """
    # Fetch the page content
    season = year
    page = requests.get(url, timeout=30)
    soup = BeautifulSoup(page.content, "html.parser")

    # Find all the rows in the table
    rows = soup.find_all('tr')

    # Data to be extracted
    games_data = []

    # One list per DataFrame returned by games_info after its info dict, in the same order:
    # team stats, starters, snap counts, offense, defense, returns, kicking,
    # advanced passing, advanced rushing, advanced receiving, advanced defense
    per_game_frames = [[] for _ in range(11)]

    for row in rows:
        # Extract each field (skip rows without enough columns)
        columns = row.find_all('td')
        if len(columns) < 13 or columns[1].text.strip() == 'Playoffs':
            continue

        link_tag = columns[6].find('a')
        if link_tag is None:
            # No boxscore page to scrape for this row
            continue
        boxscore_link = f"https://www.pro-football-reference.com{link_tag['href']}"  # Complete the URL

        week_tag = row.find('th')
        week = week_tag.text.strip() if week_tag else ''

        # Extract required data
        week_day = columns[0].text.strip()
        event_date = columns[1].text.strip()
        game_time = columns[2].text.strip()
        team_a = columns[3].text.strip()
        game_location = columns[4].text.strip()
        team_b = columns[5].text.strip()

        # Logic to determine game location: the home team's name minus its last word
        if game_location == '@':
            location = " ".join(team_b.split()[:-1])
        elif game_location == 'N':
            # NOTE: spelling kept as-is so previously generated CSVs stay comparable
            location = "Niether"
        else:
            location = " ".join(team_a.split()[:-1])

        team_a_score = columns[7].text.strip()
        team_b_score = columns[8].text.strip()
        team_a_yards = columns[9].text.strip()
        team_a_turnover = columns[10].text.strip()
        team_b_yards = columns[11].text.strip()
        team_b_turnover = columns[12].text.strip()

        i_gi, *game_frames = games_info(boxscore_link)
        for bucket, frame in zip(per_game_frames, game_frames):
            bucket.append(frame)

        # Append to list of games (schedule fields first, then the boxscore info fields)
        games_data.append({
            'season':season,
            'week': week,
            'week_day': week_day,
            'event_date': event_date,
            'game_time': game_time,
            'team_a': team_a,
            'team_b': team_b,
            'location': location,
            'team_a_score': team_a_score,
            'team_b_score': team_b_score,
            'team_a_yards': team_a_yards,
            'team_a_turnover': team_a_turnover,
            'team_b_yards': team_b_yards,
            'team_b_turnover': team_b_turnover,
            'boxscore_link': boxscore_link,
            **i_gi
        })

        # Sleep for a random time between 3.5 to 5.5 seconds to avoid overwhelming the server
        time.sleep(random.uniform(3.5, 5.5))

    # Convert the data into a DataFrame
    df_games = pd.DataFrame(games_data)

    return (df_games, *per_game_frames)

## scrape and combine func

In [17]:
def scrape_and_combine_data(years, folder_path):
    """
    Scrape data for the specified years and save combined data for each type as CSV files.

    All years are scraped first (with a random 3.5-5.5 second pause after each one);
    the CSV files are written only after the last year has finished. A data type for
    which nothing was collected is reported and skipped instead of being written.

    Parameters:
    years (list of int): List of years to scrape data for.
    folder_path (str): Directory path to save the combined CSV files. Created if missing.

    Returns:
    None
    """ 
    # Output filename for each data type, keyed in the order scrape_game_data returns them
    dataframe_filenames = {
        'y_g': "games_info_all.csv",
        'y_ts': "team_stats_all.csv",
        'y_st': "starters_all.csv",
        'y_sn': "snap_counts_all.csv",
        'y_po': "player_offense_all.csv",
        'y_pd': "player_defense_all.csv",
        'y_pr': "player_returns_all.csv",
        'y_pk': "player_kicking_all.csv",
        'y_pap': "player_ad_pass_all.csv",
        'y_paru': "player_ad_rush_all.csv",
        'y_parc': "player_ad_recv_all.csv",
        'y_pad': "player_ad_dfns_all.csv"
    }

    # Lists of DataFrames collected for each type
    combined_dataframes = {df_name: [] for df_name in dataframe_filenames}
    per_game_keys = [df_name for df_name in dataframe_filenames if df_name != 'y_g']

    # Loop over each year and scrape the data
    for year in years:
        url = f"https://www.pro-football-reference.com/years/{year}/games.htm"

        # Scrape the data for the given year
        y_g, *per_game_lists = scrape_game_data(url, year)

        # The games table is one DataFrame per year; everything else is a list of per-game DataFrames
        combined_dataframes['y_g'].append(y_g)
        for df_name, frames in zip(per_game_keys, per_game_lists):
            combined_dataframes[df_name].extend(frames)

        print(f"Scraped data for year {year}")

        # Sleep for a random time between 3.5 to 5.5 seconds to avoid overwhelming the server
        time.sleep(random.uniform(3.5, 5.5))

    # Combine and save each type of DataFrame to a single file
    os.makedirs(folder_path, exist_ok=True)

    for df_name, filename in dataframe_filenames.items():
        frames = combined_dataframes[df_name]
        if not frames:
            # pd.concat raises ValueError on an empty list
            print(f"No data collected for {filename}; skipping")
            continue

        # Concatenate all the DataFrames in the list for the given type
        combined_df = pd.concat(frames, ignore_index=True)

        # Create the full file path
        csv_filename = os.path.join(folder_path, filename)

        # Save the combined DataFrame to a single CSV file
        combined_df.to_csv(csv_filename, index=False)
        print(f"Saved combined data to {csv_filename}")


## scrape data

In [18]:
# Driver cell: choose the output directory and (optionally) launch the scrape.
# Specify the folder where you want to save the CSV files
folder_path = "../data"  # Relative to the current working directory; update this to your desired directory

# Ensure the folder exists before anything is written to it
if not os.path.exists(folder_path):
    os.makedirs(folder_path)

# The scrape itself is left commented out so that running the notebook does not
# trigger a long run of live requests by accident. Uncomment the lines below to run it.
# # Define the years you want to scrape data for (range end is exclusive: 2022 and 2023)
# years = list(range(2022, 2024))

# # Call the function to scrape and save the data
# scrape_and_combine_data(years, folder_path)