In [3]:
# %pip install cloudscraper

import pandas as pd
from bs4 import BeautifulSoup, Comment
import re
import cloudscraper
import time
import random
import csv

SEASON_LIST=['2016', '2017', '2018', '2019', '2020', '2021', '2022', '2023', '2024', '2025']
SEASON_URL = 'https://www.pro-football-reference.com/years/{}/games.htm'
GAME_URL = 'https://www.pro-football-reference.com/boxscores/{}.htm' # add game_id to the end of it
GAME_URL_LIST = []

In [5]:
# Create a scraper that bypasses Cloudflare
scraper = cloudscraper.create_scraper(
    browser={
        'browser': 'chrome',
        'platform': 'windows',
        'mobile': False
    }
)

game_id_list = []

for year in SEASON_LIST:
    url = SEASON_URL.format(year)
    # print(f"Processing {year}...")
    
    # Add random delay
    time.sleep(random.uniform(2, 4))
    
    try:
        response = scraper.get(url)
        # print(f"Status code: {response.status_code}")
        
        if response.status_code == 200:
            soup = BeautifulSoup(response.text, 'html.parser')
            # print(f"Successfully fetched {url}")
            
            # Find the table with id="games"
            games_table = soup.find('table', {'id': 'games'})
            
            if games_table:
                # print(f"Found games table for {year}")

                # Method 1: Exact text match
                boxscore_links = games_table.find_all('a', string='boxscore')
                
                print(f"Found {len(boxscore_links)} boxscore links for {year}")
                
                # Extract href attributes
                for link in boxscore_links:
                    href = link.get('href')
                    if href:
                        # print(f"Found href: {href}")

                        # 1. Get the last part after last slash
                        game_id = href.split('/')[-1]
                        
                        # 2. Remove file extension if present
                        if '.' in game_id:
                            game_id = game_id.split('.')[0]
                            
                        
                        game_id_list.append(game_id)
                        # print(f"  â†’ Extracted game ID: {game_id}")
        else:
            print(f"Failed with status: {response.status_code}")
            
    except Exception as e:
        print(f"Error: {e}")

# print(len(game_id_list))
game_id_list

Found 267 boxscore links for 2016
Found 267 boxscore links for 2017
Found 267 boxscore links for 2018
Found 267 boxscore links for 2019
Found 269 boxscore links for 2020
Found 285 boxscore links for 2021
Found 284 boxscore links for 2022
Found 285 boxscore links for 2023
Found 285 boxscore links for 2024
Found 256 boxscore links for 2025


['201609080den',
 '201609110rav',
 '201609110jax',
 '201609110atl',
 '201609110htx',
 '201609110nyj',
 '201609110phi',
 '201609110kan',
 '201609110oti',
 '201609110nor',
 '201609110sea',
 '201609110clt',
 '201609110dal',
 '201609110crd',
 '201609120was',
 '201609120sfo',
 '201609150buf',
 '201609180pit',
 '201609180cle',
 '201609180det',
 '201609180htx',
 '201609180nwe',
 '201609180car',
 '201609180was',
 '201609180nyg',
 '201609180crd',
 '201609180ram',
 '201609180rai',
 '201609180den',
 '201609180sdg',
 '201609180min',
 '201609190chi',
 '201609220nwe',
 '201609250buf',
 '201609250car',
 '201609250cin',
 '201609250mia',
 '201609250gnb',
 '201609250jax',
 '201609250nyg',
 '201609250oti',
 '201609250tam',
 '201609250sea',
 '201609250clt',
 '201609250phi',
 '201609250kan',
 '201609250dal',
 '201609260nor',
 '201609290cin',
 '201610020jax',
 '201610020chi',
 '201610020nwe',
 '201610020htx',
 '201610020rav',
 '201610020nyj',
 '201610020atl',
 '201610020was',
 '201610020tam',
 '201610020crd

In [62]:
# get the qb stuff
def get_data(game_id_list):
   for game_id in game_id_list:

      failed_ids = []

      url = GAME_URL.format(game_id)
      print("Processing: " + url)

      # Add random delay
      time.sleep(random.uniform(2, 4))

      try:
         response = scraper.get(url)
         print(f"Status code: {response.status_code}")
         
         if response.status_code == 200:
            soup = BeautifulSoup(response.text, 'html.parser')
            print(f"Successfully fetched {url}")            
                  
            comments = soup.find_all(string=lambda text: isinstance(text, Comment)) # find it in the comments

            ##### get general info

            scorebox_meta_div = soup.find('div', {'class': 'scorebox_meta'})
            if scorebox_meta_div:
               # print(f"Found scorebox_meta div for {game_id}")
               
               # Get all text content with line breaks
               full_text = scorebox_meta_div.get_text(separator='\n')
               
               # Initialize variables
               date = None
               
               # Extract date - first line is usually the date
               lines = [line.strip() for line in full_text.split('\n') if line.strip()]
               
               date = lines[0]
               # print(date)

            ##### get Passing, Rushing, & Receiving table
            player_offense_table = soup.find('table', {'id': 'player_offense'})
            if player_offense_table:
               # print('found')

               player_offense_tbody = player_offense_table.find('tbody')
               if player_offense_tbody:
                  # print(player_offense_tbody)
                  rows = player_offense_tbody.find_all('tr')

                  with open('player_offense_data.csv', 'a', newline='', encoding='utf-8') as player_offense_f:
                     player_offense_writer = csv.writer(player_offense_f)

                     for row in rows:
                        if not row.has_attr("class"):
                           player_name = row.find('th', {'data-stat': 'player'}).text

                           pass_cmp_cell = row.find('td', {'data-stat': 'pass_cmp'})
                           pass_cmp = pass_cmp_cell.get_text(strip=True) if pass_cmp_cell else 0
                           pass_att_cell = row.find('td', {'data-stat': 'pass_att'})
                           pass_att = pass_att_cell.get_text(strip=True) if pass_att_cell else 0
                           pass_yds_cell = row.find('td', {'data-stat': 'pass_yds'})
                           pass_yds = pass_yds_cell.get_text(strip=True) if pass_yds_cell else 0
                           pass_td_cell = row.find('td', {'data-stat': 'pass_td'})
                           pass_td = pass_td_cell.get_text(strip=True) if pass_td_cell else 0
                           pass_int_cell = row.find('td', {'data-stat': 'pass_int'})
                           pass_int = pass_int_cell.get_text(strip=True) if pass_int_cell else 0
                           pass_sacked_cell = row.find('td', {'data-stat': 'pass_sacked'})
                           pass_sacked = pass_sacked_cell.get_text(strip=True) if pass_sacked_cell else 0
                           pass_sacked_yds_cell = row.find('td', {'data-stat': 'pass_sacked_yds'})
                           pass_sacked_yds = pass_sacked_yds_cell.get_text(strip=True) if pass_sacked_yds_cell else 0
                           pass_long_cell = row.find('td', {'data-stat': 'pass_long'})
                           pass_long = pass_long_cell.get_text(strip=True) if pass_long_cell else 0
                           pass_rating_cell = row.find('td', {'data-stat': 'pass_rating'})
                           pass_rating = pass_rating_cell.get_text(strip=True) if pass_rating_cell else 0

                           rush_att_cell = row.find('td', {'data-stat': 'rush_att'})
                           rush_att = rush_att_cell.get_text(strip=True) if rush_att_cell else 0
                           rush_yds_cell = row.find('td', {'data-stat': 'rush_yds'})
                           rush_yds = rush_yds_cell.get_text(strip=True) if rush_yds_cell else 0
                           rush_td_cell = row.find('td', {'data-stat': 'rush_td'})
                           rush_td = rush_td_cell.get_text(strip=True) if rush_td_cell else 0
                           rush_long_cell = row.find('td', {'data-stat': 'rush_long'})
                           rush_long = rush_long_cell.get_text(strip=True) if rush_long_cell else 0

                           targets_cell = row.find('td', {'data-stat': 'targets'})
                           targets = targets_cell.get_text(strip=True) if targets_cell else 0
                           rec_cell = row.find('td', {'data-stat': 'rec'})
                           rec = rec_cell.get_text(strip=True) if rec_cell else 0
                           rec_yds_cell = row.find('td', {'data-stat': 'rec_yds'})
                           rec_yds = rec_yds_cell.get_text(strip=True) if rec_yds_cell else 0
                           rec_td_cell = row.find('td', {'data-stat': 'rec_td'})
                           rec_td = rec_td_cell.get_text(strip=True) if rec_td_cell else 0
                           rec_long_cell = row.find('td', {'data-stat': 'rec_long'})
                           rec_long = rec_long_cell.get_text(strip=True) if rec_long_cell else 0

                           fumbles_cell = row.find('td', {'data-stat': 'fumbles'})
                           fumbles = fumbles_cell.get_text(strip=True) if fumbles_cell else 0
                           fumbles_lost_cell = row.find('td', {'data-stat': 'fumbles_lost'})
                           fumbles_lost = fumbles_lost_cell.get_text(strip=True) if fumbles_lost_cell else 0

                           # print(rec_cell)

                           player_offense_writer.writerow([
                              game_id,
                              player_name,
                              date,
                              pass_cmp, pass_att, pass_yds, pass_td, pass_int, pass_sacked, pass_sacked_yds, pass_long, pass_rating,
                              rush_att, rush_yds, rush_td, rush_long,
                              targets, rec, rec_yds, rec_td, rec_long,
                              fumbles, fumbles_lost
                           ])

            ##### get Defense table
            for comment in comments:
               if 'player_defense' in comment:
                  # Parse the comment content as HTML
                  comment_soup = BeautifulSoup(comment, 'html.parser')
                  player_defense_table = comment_soup.find('table', {'id': 'player_defense'})
                  if player_defense_table:
                     # print("Found table in comments")
                     player_defense_tbody = player_defense_table.find('tbody')

                     rows = player_defense_tbody.find_all('tr')

                     with open('player_defense_data.csv', 'a', newline='', encoding='utf-8') as player_defense_f:
                        player_defense_writer = csv.writer(player_defense_f)

                        for row in rows:
                           # print(row)

                           if not row.has_attr("class"):
                              player_name = row.find('th', {'data-stat': 'player'}).text

                              def_int_cell = row.find('td', {'data-stat': 'def_int'})
                              def_int = def_int_cell.get_text(strip=True) if def_int_cell else 0
                              def_int_yds_cell = row.find('td', {'data-stat': 'def_int_yds'})
                              def_int_yds = def_int_yds_cell.get_text(strip=True) if def_int_yds_cell else 0
                              def_int_td_cell = row.find('td', {'data-stat': 'def_int_td'})
                              def_int_td = def_int_td_cell.get_text(strip=True) if def_int_td_cell else 0
                              def_int_long_cell = row.find('td', {'data-stat': 'def_int_long'})
                              def_int_long = def_int_long_cell.get_text(strip=True) if def_int_long_cell else 0
                              pass_defended_cell = row.find('td', {'data-stat': 'pass_defended'})
                              pass_defended = pass_defended_cell.get_text(strip=True) if pass_defended_cell else 0

                              sacks_cell = row.find('td', {'data-stat': 'sacks'})
                              sacks = sacks_cell.get_text(strip=True) if sacks_cell else 0
                              
                              tackles_combined_cell = row.find('td', {'data-stat': 'tackles_combined'})
                              tackles_combined = tackles_combined_cell.get_text(strip=True) if tackles_combined_cell else 0
                              tackles_solo_cell = row.find('td', {'data-stat': 'tackles_solo'})
                              tackles_solo = tackles_solo_cell.get_text(strip=True) if tackles_solo_cell else 0
                              tackles_assists_cell = row.find('td', {'data-stat': 'tackles_assists'})
                              tackles_assists = tackles_assists_cell.get_text(strip=True) if tackles_assists_cell else 0
                              tackles_loss_cell = row.find('td', {'data-stat': 'tackles_loss'})
                              tackles_loss = tackles_loss_cell.get_text(strip=True) if tackles_loss_cell else 0
                              qb_hits_cell = row.find('td', {'data-stat': 'qb_hits'})
                              qb_hits = qb_hits_cell.get_text(strip=True) if qb_hits_cell else 0

                              fumbles_rec_cell = row.find('td', {'data-stat': 'fumbles_rec'})
                              fumbles_rec = fumbles_rec_cell.get_text(strip=True) if fumbles_rec_cell else 0
                              fumbles_rec_yds_cell = row.find('td', {'data-stat': 'fumbles_rec_yds'})
                              fumbles_rec_yds = fumbles_rec_yds_cell.get_text(strip=True) if fumbles_rec_yds_cell else 0
                              fumbles_rec_td_cell = row.find('td', {'data-stat': 'fumbles_rec_td'})
                              fumbles_rec_td = fumbles_rec_td_cell.get_text(strip=True) if fumbles_rec_td_cell else 0
                              fumbles_forced_cell = row.find('td', {'data-stat': 'fumbles_forced'})
                              fumbles_forced = fumbles_forced_cell.get_text(strip=True) if fumbles_forced_cell else 0

                              player_defense_writer.writerow([
                                 game_id,
                                 player_name,
                                 date,
                                 def_int, def_int_yds, def_int_td, def_int_long, pass_defended,
                                 sacks,
                                 tackles_combined, tackles_solo, tackles_assists, tackles_loss, qb_hits,
                                 fumbles_rec, fumbles_rec_yds, fumbles_rec_td, fumbles_forced
                              ])

            ##### get Kick/Punt Returns table
            for comment in comments:
               if 'returns' in comment:
                  comment_soup = BeautifulSoup(comment, 'html.parser')
                  returns_table = comment_soup.find('table', {'id': 'returns'})
                  if player_defense_table:
                     # print("Found table in comments")
                     returns_tbody = returns_table.find('tbody')

                     rows = returns_tbody.find_all('tr')

                     with open('returns_data.csv', 'a', newline='', encoding='utf-8') as returns_f:
                        returns_writer = csv.writer(returns_f)

                        for row in rows:
                           # print(row)

                           if not row.has_attr("class"):
                              player_name = row.find('th', {'data-stat': 'player'}).text

                              kick_ret_cell = row.find('td', {'data-stat': 'kick_ret'})
                              kick_ret = kick_ret_cell.get_text(strip=True) if kick_ret_cell else 0
                              kick_ret_yds_cell = row.find('td', {'data-stat': 'kick_ret_yds'})
                              kick_ret_yds = kick_ret_yds_cell.get_text(strip=True) if kick_ret_yds_cell else 0
                              kick_ret_yds_per_ret_cell = row.find('td', {'data-stat': 'kick_ret_yds_per_ret'})
                              kick_ret_yds_per_ret = kick_ret_yds_per_ret_cell.get_text(strip=True) if kick_ret_yds_per_ret_cell else 0
                              kick_ret_td_cell = row.find('td', {'data-stat': 'kick_ret_td'})
                              kick_ret_td = kick_ret_td_cell.get_text(strip=True) if kick_ret_td_cell else 0
                              kick_ret_long_cell = row.find('td', {'data-stat': 'kick_ret_long'})
                              kick_ret_long = kick_ret_long_cell.get_text(strip=True) if kick_ret_long_cell else 0

                              punt_ret_cell = row.find('td', {'data-stat': 'punt_ret'})
                              punt_ret = punt_ret_cell.get_text(strip=True) if punt_ret_cell else 0
                              punt_ret_yds_cell = row.find('td', {'data-stat': 'punt_ret_yds'})
                              punt_ret_yds = punt_ret_yds_cell.get_text(strip=True) if punt_ret_yds_cell else 0
                              punt_ret_yds_per_ret_cell = row.find('td', {'data-stat': 'punt_ret_yds_per_ret'})
                              punt_ret_yds_per_ret = punt_ret_yds_per_ret_cell.get_text(strip=True) if punt_ret_yds_per_ret_cell else 0
                              punt_ret_td_cell = row.find('td', {'data-stat': 'punt_ret_td'})
                              punt_ret_td = punt_ret_td_cell.get_text(strip=True) if punt_ret_td_cell else 0
                              punt_ret_long_cell = row.find('td', {'data-stat': 'punt_ret_long'})
                              punt_ret_long = punt_ret_long_cell.get_text(strip=True) if punt_ret_long_cell else 0

                              returns_writer.writerow([
                                 game_id,
                                 player_name,
                                 date,
                                 kick_ret, kick_ret_yds, kick_ret_yds_per_ret, kick_ret_td, kick_ret_long,
                                 punt_ret, punt_ret_yds, punt_ret_yds_per_ret, punt_ret_td, punt_ret_long
                              ])

            ##### get Kicking & Punting table
            for comment in comments:
               if 'kicking' in comment:
                  comment_soup = BeautifulSoup(comment, 'html.parser')
                  kicking_table = comment_soup.find('table', {'id': 'kicking'})
                  if kicking_table:
                     # print("Found table in comments")
                     kicking_tbody = kicking_table.find('tbody')

                     rows = kicking_tbody.find_all('tr')

                     with open('kicking_data.csv', 'a', newline='', encoding='utf-8') as kicking_f:
                        kicking_writer = csv.writer(kicking_f)

                        for row in rows:
                           # print(row)

                           if not row.has_attr("class"):
                              player_name = row.find('th', {'data-stat': 'player'}).text

                              xpm_cell = row.find('td', {'data-stat': 'xpm'})
                              xpm = xpm_cell.get_text(strip=True) if xpm_cell else 0
                              xpa_cell = row.find('td', {'data-stat': 'xpa'})
                              xpa = xpa_cell.get_text(strip=True) if xpa_cell else 0

                              fgm_cell = row.find('td', {'data-stat': 'fgm'})
                              fgm = fgm_cell.get_text(strip=True) if fgm_cell else 0
                              fga_cell = row.find('td', {'data-stat': 'fga'})
                              fga = fga_cell.get_text(strip=True) if fga_cell else 0

                              punt_cell = row.find('td', {'data-stat': 'punt'})
                              punt = punt_cell.get_text(strip=True) if punt_cell else 0
                              punt_yds_cell = row.find('td', {'data-stat': 'punt_yds'})
                              punt_yds = punt_yds_cell.get_text(strip=True) if punt_yds_cell else 0
                              punt_yds_per_punt_cell = row.find('td', {'data-stat': 'punt_yds_per_punt'})
                              punt_yds_per_punt = punt_yds_per_punt_cell.get_text(strip=True) if punt_yds_per_punt_cell else 0
                              punt_long_cell = row.find('td', {'data-stat': 'punt_long'})
                              punt_long = punt_long_cell.get_text(strip=True) if punt_long_cell else 0

                              kicking_writer.writerow([
                                 game_id,
                                 player_name,
                                 date,
                                 xpm, xpa,
                                 fgm, fga,
                                 punt, punt_yds, punt_yds_per_punt, punt_long
                              ])

            ##### get Advanced Passing table
            for comment in comments:
               if 'passing_advanced' in comment:
                  comment_soup = BeautifulSoup(comment, 'html.parser')
                  passing_advanced_table = comment_soup.find('table', {'id': 'passing_advanced'})
                  if passing_advanced_table:
                     # print("Found table in com/mentsasdfasdf")
                     passing_advanced_tbody = passing_advanced_table.find('tbody')

                     rows = passing_advanced_tbody.find_all('tr')

                     with open('passing_advanced_data.csv', 'a', newline='', encoding='utf-8') as passing_advanced_f:
                        passing_advanced_writer = csv.writer(passing_advanced_f)

                        for row in rows:
                           # print(row)

                           if not row.has_attr("class"):
                              player_name = row.find('th', {'data-stat': 'player'}).text

                              pass_cmp_cell = row.find('td', {'data-stat': 'pass_cmp'})
                              pass_cmp = pass_cmp_cell.get_text(strip=True) if pass_cmp_cell else 0
                              pass_att_cell = row.find('td', {'data-stat': 'pass_att'})
                              pass_att = pass_att_cell.get_text(strip=True) if pass_att_cell else 0
                              pass_yds_cell = row.find('td', {'data-stat': 'pass_yds'})
                              pass_yds = pass_yds_cell.get_text(strip=True) if pass_yds_cell else 0

                              pass_first_down_cell = row.find('td', {'data-stat': 'pass_first_down'})
                              pass_first_down = pass_first_down_cell.get_text(strip=True) if pass_first_down_cell else 0
                              pass_first_down_pct_cell = row.find('td', {'data-stat': 'pass_first_down_pct'})
                              pass_first_down_pct = pass_first_down_pct_cell.get_text(strip=True) if pass_first_down_pct_cell else 0

                              pass_target_yds_cell = row.find('td', {'data-stat': 'pass_target_yds'})
                              pass_target_yds = pass_target_yds_cell.get_text(strip=True) if pass_target_yds_cell else 0
                              pass_tgt_yds_per_att_cell = row.find('td', {'data-stat': 'pass_tgt_yds_per_att'})
                              pass_tgt_yds_per_att = pass_tgt_yds_per_att_cell.get_text(strip=True) if pass_tgt_yds_per_att_cell else 0

                              pass_air_yds_cell = row.find('td', {'data-stat': 'pass_air_yds'})
                              pass_air_yds = pass_air_yds_cell.get_text(strip=True) if pass_air_yds_cell else 0
                              pass_air_yds_per_cmp_cell = row.find('td', {'data-stat': 'pass_air_yds_per_cmp'})
                              pass_air_yds_per_cmp = pass_air_yds_per_cmp_cell.get_text(strip=True) if pass_air_yds_per_cmp_cell else 0
                              pass_air_yds_per_att_cell = row.find('td', {'data-stat': 'pass_air_yds_per_att'})
                              pass_air_yds_per_att = pass_air_yds_per_att_cell.get_text(strip=True) if pass_air_yds_per_att_cell else 0

                              pass_yac_cell = row.find('td', {'data-stat': 'pass_yac'})
                              pass_yac = pass_yac_cell.get_text(strip=True) if pass_yac_cell else 0
                              pass_yac_per_cmp_cell = row.find('td', {'data-stat': 'pass_yac_per_cmp'})
                              pass_yac_per_cmp = pass_yac_per_cmp_cell.get_text(strip=True) if pass_yac_per_cmp_cell else 0

                              pass_drops_cell = row.find('td', {'data-stat': 'pass_drops'})
                              pass_drops = pass_drops_cell.get_text(strip=True) if pass_drops_cell else 0
                              pass_drop_pct_cell = row.find('td', {'data-stat': 'pass_drop_pct'})
                              pass_drop_pct = pass_drop_pct_cell.get_text(strip=True) if pass_drop_pct_cell else 0

                              pass_poor_throws_cell = row.find('td', {'data-stat': 'pass_poor_throws'})
                              pass_poor_throws = pass_poor_throws_cell.get_text(strip=True) if pass_poor_throws_cell else 0
                              pass_poor_throw_pct_cell = row.find('td', {'data-stat': 'pass_poor_throw_pct'})
                              pass_poor_throw_pct = pass_poor_throw_pct_cell.get_text(strip=True) if pass_poor_throw_pct_cell else 0

                              pass_sacked_cell = row.find('td', {'data-stat': 'pass_sacked'})
                              pass_sacked = pass_sacked_cell.get_text(strip=True) if pass_sacked_cell else 0
                              pass_blitzed_cell = row.find('td', {'data-stat': 'pass_blitzed'})
                              pass_blitzed = pass_blitzed_cell.get_text(strip=True) if pass_blitzed_cell else 0
                              pass_hurried_cell = row.find('td', {'data-stat': 'pass_hurried'})
                              pass_hurried = pass_hurried_cell.get_text(strip=True) if pass_hurried_cell else 0
                              pass_hits_cell = row.find('td', {'data-stat': 'pass_hits'})
                              pass_hits = pass_hits_cell.get_text(strip=True) if pass_hits_cell else 0
                              pass_pressured_cell = row.find('td', {'data-stat': 'pass_pressured'})
                              pass_pressured = pass_pressured_cell.get_text(strip=True) if pass_pressured_cell else 0
                              pass_pressured_pct_cell = row.find('td', {'data-stat': 'pass_pressured_pct'})
                              pass_pressured_pct = pass_pressured_pct_cell.get_text(strip=True) if pass_pressured_pct_cell else 0

                              rush_scrambles_cell = row.find('td', {'data-stat': 'rush_scrambles'})
                              rush_scrambles = rush_scrambles_cell.get_text(strip=True) if rush_scrambles_cell else 0
                              rush_scrambles_yds_per_att_cell = row.find('td', {'data-stat': 'rush_scrambles_yds_per_att'})
                              rush_scrambles_yds_per_att = rush_scrambles_yds_per_att_cell.get_text(strip=True) if rush_scrambles_yds_per_att_cell else 0

                              passing_advanced_writer.writerow([
                                 game_id,
                                 player_name,
                                 date,
                                 pass_cmp, pass_att, pass_yds,
                                 pass_first_down, pass_first_down_pct,
                                 pass_target_yds, pass_tgt_yds_per_att,
                                 pass_air_yds, pass_air_yds_per_cmp, pass_air_yds_per_att,
                                 pass_yac, pass_yac_per_cmp,
                                 pass_drops, pass_drop_pct,
                                 pass_poor_throws, pass_poor_throw_pct,
                                 pass_sacked, pass_blitzed, pass_hurried, pass_hits, pass_pressured, pass_pressured_pct,
                                 rush_scrambles, rush_scrambles_yds_per_att
                              ])

            ##### get Advanced Rushing table
            for comment in comments:
               if 'rushing_advanced' in comment:
                  comment_soup = BeautifulSoup(comment, 'html.parser')
                  rushing_advanced_table = comment_soup.find('table', {'id': 'rushing_advanced'})
                  if rushing_advanced_table:
                     # print("Found table in com/mentsasdfasdf")
                     rushing_advanced_tbody = rushing_advanced_table.find('tbody')

                     rows = rushing_advanced_tbody.find_all('tr')

                     with open('rushing_advanced_data.csv', 'a', newline='', encoding='utf-8') as rushing_advanced_f:
                        rushing_advanced_writer = csv.writer(rushing_advanced_f)

                        for row in rows:
                           # print(row)

                           if not row.has_attr("class"):
                              player_name = row.find('th', {'data-stat': 'player'}).text

                              rush_att_cell = row.find('td', {'data-stat': 'rush_att'})
                              rush_att = rush_att_cell.get_text(strip=True) if rush_att_cell else 0
                              rush_yds_cell = row.find('td', {'data-stat': 'rush_yds'})
                              rush_yds = rush_yds_cell.get_text(strip=True) if rush_yds_cell else 0
                              rush_td_cell = row.find('td', {'data-stat': 'rush_td'})
                              rush_td = rush_td_cell.get_text(strip=True) if rush_td_cell else 0

                              rush_first_down_cell = row.find('td', {'data-stat': 'rush_first_down'})
                              rush_first_down = rush_first_down_cell.get_text(strip=True) if rush_first_down_cell else 0

                              rush_yds_before_contact_cell = row.find('td', {'data-stat': 'rush_yds_before_contact'})
                              rush_yds_before_contact = rush_yds_before_contact_cell.get_text(strip=True) if rush_yds_before_contact_cell else 0
                              rush_yds_bc_per_rush_cell = row.find('td', {'data-stat': 'rush_yds_bc_per_rush'})
                              rush_yds_bc_per_rush = rush_yds_bc_per_rush_cell.get_text(strip=True) if rush_yds_bc_per_rush_cell else 0

                              rush_yac_cell = row.find('td', {'data-stat': 'rush_yac'})
                              rush_yac = rush_yac_cell.get_text(strip=True) if rush_yac_cell else 0
                              rush_yac_per_rush_cell = row.find('td', {'data-stat': 'rush_yac_per_rush'})
                              rush_yac_per_rush = rush_yac_per_rush_cell.get_text(strip=True) if rush_yac_per_rush_cell else 0

                              rush_broken_tackles_cell = row.find('td', {'data-stat': 'rush_broken_tackles'})
                              rush_broken_tackles = rush_broken_tackles_cell.get_text(strip=True) if rush_broken_tackles_cell else 0
                              rush_broken_tackles_per_rush_cell = row.find('td', {'data-stat': 'rush_broken_tackles_per_rush'})
                              rush_broken_tackles_per_rush = rush_broken_tackles_per_rush_cell.get_text(strip=True) if rush_broken_tackles_per_rush_cell else 0

                              rushing_advanced_writer.writerow([
                                 game_id,
                                 player_name,
                                 date,
                                 rush_att, rush_yds, rush_td,
                                 rush_first_down,
                                 rush_yds_before_contact, rush_yds_bc_per_rush,
                                 rush_yac, rush_yac_per_rush,
                                 rush_broken_tackles, rush_broken_tackles_per_rush
                              ])

            ##### get Advanced Receiving table
            for comment in comments:
               if 'receiving_advanced' in comment: 
                  comment_soup = BeautifulSoup(comment, 'html.parser')
                  receiving_advanced_table = comment_soup.find('table', {'id': 'receiving_advanced'})  
                  if receiving_advanced_table:
                        # print("Found receiving table in comments")
                        receiving_advanced_tbody = receiving_advanced_table.find('tbody')

                        rows = receiving_advanced_tbody.find_all('tr')

                        with open('receiving_advanced_data.csv', 'a', newline='', encoding='utf-8') as receiving_advanced_f:  
                           receiving_advanced_writer = csv.writer(receiving_advanced_f)  

                           for row in rows:
                              # print(row)

                              if not row.has_attr("class"):
                                    player_name = row.find('th', {'data-stat': 'player'}).text

                                    targets_cell = row.find('td', {'data-stat': 'targets'})
                                    targets = targets_cell.get_text(strip=True) if targets_cell else 0                                 
                                    rec_cell = row.find('td', {'data-stat': 'rec'})
                                    rec = rec_cell.get_text(strip=True) if rec_cell else 0                                   
                                    rec_yds_cell = row.find('td', {'data-stat': 'rec_yds'})
                                    rec_yds = rec_yds_cell.get_text(strip=True) if rec_yds_cell else 0                                  
                                    rec_td_cell = row.find('td', {'data-stat': 'rec_td'})
                                    rec_td = rec_td_cell.get_text(strip=True) if rec_td_cell else 0

                                    rec_first_down_cell = row.find('td', {'data-stat': 'rec_first_down'})
                                    rec_first_down = rec_first_down_cell.get_text(strip=True) if rec_first_down_cell else 0

                                    rec_air_yds_cell = row.find('td', {'data-stat': 'rec_air_yds'})
                                    rec_air_yds = rec_air_yds_cell.get_text(strip=True) if rec_air_yds_cell else 0
                                    rec_air_yds_per_rec_cell = row.find('td', {'data-stat': 'rec_air_yds_per_rec'})
                                    rec_air_yds_per_rec = rec_air_yds_per_rec_cell.get_text(strip=True) if rec_air_yds_per_rec_cell else 0

                                    rec_yac_cell = row.find('td', {'data-stat': 'rec_yac'})
                                    rec_yac = rec_yac_cell.get_text(strip=True) if rec_yac_cell else 0
                                    rec_yac_per_rec_cell = row.find('td', {'data-stat': 'rec_yac_per_rec'})
                                    rec_yac_per_rec = rec_yac_per_rec_cell.get_text(strip=True) if rec_yac_per_rec_cell else 0

                                    rec_adot_cell = row.find('td', {'data-stat': 'rec_adot'})
                                    rec_adot = rec_adot_cell.get_text(strip=True) if rec_adot_cell else 0

                                    rec_broken_tackles_cell = row.find('td', {'data-stat': 'rec_broken_tackles'})
                                    rec_broken_tackles = rec_broken_tackles_cell.get_text(strip=True) if rec_broken_tackles_cell else 0
                                    rec_broken_tackles_per_rec_cell = row.find('td', {'data-stat': 'rec_broken_tackles_per_rec'})
                                    rec_broken_tackles_per_rec = rec_broken_tackles_per_rec_cell.get_text(strip=True) if rec_broken_tackles_per_rec_cell else 0

                                    rec_drops_cell = row.find('td', {'data-stat': 'rec_drops'})
                                    rec_drops = rec_drops_cell.get_text(strip=True) if rec_drops_cell else 0
                                    rec_drop_pct_cell = row.find('td', {'data-stat': 'rec_drop_pct'})
                                    rec_drop_pct = rec_drop_pct_cell.get_text(strip=True) if rec_drop_pct_cell else 0

                                    rec_target_int_cell = row.find('td', {'data-stat': 'rec_target_int'})
                                    rec_target_int = rec_target_int_cell.get_text(strip=True) if rec_target_int_cell else 0

                                    rec_pass_rating_cell = row.find('td', {'data-stat': 'rec_pass_rating'})
                                    rec_pass_rating = rec_pass_rating_cell.get_text(strip=True) if rec_pass_rating_cell else 0

                                    receiving_advanced_writer.writerow([
                                       game_id,
                                       player_name,
                                       date,
                                       targets, rec, rec_yds, rec_td,
                                       rec_first_down,
                                       rec_air_yds, rec_air_yds_per_rec,
                                       rec_yac, rec_yac_per_rec,
                                       rec_adot,
                                       rec_broken_tackles, rec_broken_tackles_per_rec,
                                       rec_drops, rec_drop_pct,
                                       rec_target_int,
                                       rec_pass_rating
                                    ])

            ##### get Advanced Defense table
            for comment in comments:
               if 'defense_advanced' in comment:
                  comment_soup = BeautifulSoup(comment, 'html.parser')
                  defense_advanced_table = comment_soup.find('table', {'id': 'defense_advanced'})
                  if defense_advanced_table:
                        # print("Found defensive table in comments")
                        defense_advanced_tbody = defense_advanced_table.find('tbody')

                        rows = defense_advanced_tbody.find_all('tr')

                        with open('defense_advanced_data.csv', 'a', newline='', encoding='utf-8') as defense_advanced_f:
                           defense_advanced_writer = csv.writer(defense_advanced_f)

                           for row in rows:
                              # print(row)

                              if not row.has_attr("class"):
                                    player_name = row.find('th', {'data-stat': 'player'}).text

                                    def_int_cell = row.find('td', {'data-stat': 'def_int'})
                                    def_int = def_int_cell.get_text(strip=True) if def_int_cell else 0

                                    def_targets_cell = row.find('td', {'data-stat': 'def_targets'})
                                    def_targets = def_targets_cell.get_text(strip=True) if def_targets_cell else 0
                                    def_cmp_cell = row.find('td', {'data-stat': 'def_cmp'})
                                    def_cmp = def_cmp_cell.get_text(strip=True) if def_cmp_cell else 0
                                    def_cmp_perc_cell = row.find('td', {'data-stat': 'def_cmp_perc'})
                                    def_cmp_perc = def_cmp_perc_cell.get_text(strip=True) if def_cmp_perc_cell else 0
                                    def_cmp_yds_cell = row.find('td', {'data-stat': 'def_cmp_yds'})
                                    def_cmp_yds = def_cmp_yds_cell.get_text(strip=True) if def_cmp_yds_cell else 0
                                    def_yds_per_cmp_cell = row.find('td', {'data-stat': 'def_yds_per_cmp'})
                                    def_yds_per_cmp = def_yds_per_cmp_cell.get_text(strip=True) if def_yds_per_cmp_cell else 0
                                    def_yds_per_target_cell = row.find('td', {'data-stat': 'def_yds_per_target'})
                                    def_yds_per_target = def_yds_per_target_cell.get_text(strip=True) if def_yds_per_target_cell else 0
                                    def_cmp_td_cell = row.find('td', {'data-stat': 'def_cmp_td'})
                                    def_cmp_td = def_cmp_td_cell.get_text(strip=True) if def_cmp_td_cell else 0

                                    def_pass_rating_cell = row.find('td', {'data-stat': 'def_pass_rating'})
                                    def_pass_rating = def_pass_rating_cell.get_text(strip=True) if def_pass_rating_cell else 0

                                    def_tgt_yds_per_att_cell = row.find('td', {'data-stat': 'def_tgt_yds_per_att'})
                                    def_tgt_yds_per_att = def_tgt_yds_per_att_cell.get_text(strip=True) if def_tgt_yds_per_att_cell else 0
                                    def_air_yds_cell = row.find('td', {'data-stat': 'def_air_yds'})
                                    def_air_yds = def_air_yds_cell.get_text(strip=True) if def_air_yds_cell else 0
                                    def_yac_cell = row.find('td', {'data-stat': 'def_yac'})
                                    def_yac = def_yac_cell.get_text(strip=True) if def_yac_cell else 0

                                    blitzes_cell = row.find('td', {'data-stat': 'blitzes'})
                                    blitzes = blitzes_cell.get_text(strip=True) if blitzes_cell else 0
                                    qb_hurry_cell = row.find('td', {'data-stat': 'qb_hurry'})
                                    qb_hurry = qb_hurry_cell.get_text(strip=True) if qb_hurry_cell else 0
                                    qb_knockdown_cell = row.find('td', {'data-stat': 'qb_knockdown'})
                                    qb_knockdown = qb_knockdown_cell.get_text(strip=True) if qb_knockdown_cell else 0
                                    sacks_cell = row.find('td', {'data-stat': 'sacks'})
                                    sacks = sacks_cell.get_text(strip=True) if sacks_cell else 0
                                    pressures_cell = row.find('td', {'data-stat': 'pressures'})
                                    pressures = pressures_cell.get_text(strip=True) if pressures_cell else 0

                                    tackles_combined_cell = row.find('td', {'data-stat': 'tackles_combined'})
                                    tackles_combined = tackles_combined_cell.get_text(strip=True) if tackles_combined_cell else 0
                                    tackles_missed_cell = row.find('td', {'data-stat': 'tackles_missed'})
                                    tackles_missed = tackles_missed_cell.get_text(strip=True) if tackles_missed_cell else 0
                                    tackles_missed_pct_cell = row.find('td', {'data-stat': 'tackles_missed_pct'})
                                    tackles_missed_pct = tackles_missed_pct_cell.get_text(strip=True) if tackles_missed_pct_cell else 0

                                    defense_advanced_writer.writerow([
                                       game_id,
                                       player_name,
                                       date,
                                       def_int,
                                       def_targets, def_cmp, def_cmp_perc, def_cmp_yds, 
                                       def_yds_per_cmp, def_yds_per_target, def_cmp_td,
                                       def_pass_rating,
                                       def_tgt_yds_per_att, def_air_yds, def_yac,
                                       blitzes, qb_hurry, qb_knockdown, sacks, pressures,
                                       tackles_combined, tackles_missed, tackles_missed_pct
                                    ])

            ##### get Starters
            for comment in comments:
               if 'home_starters' in comment:
                  comment_soup = BeautifulSoup(comment, 'html.parser')
                  home_starters_table = comment_soup.find('table', {'id': 'home_starters'})
                  if home_starters_table:
                     # print("Found table in comments")
                     home_starters_tbody = home_starters_table.find('tbody')

                     rows = home_starters_tbody.find_all('tr')

                     with open('starters_data.csv', 'a', newline='', encoding='utf-8') as starters_f:
                        starters_writer = csv.writer(starters_f)

                        for row in rows:
                           # print(row)

                           if not row.has_attr("class"):
                              player_name = row.find('th', {'data-stat': 'player'}).text

                              starters_writer.writerow([
                                 game_id,
                                 player_name,
                                 date,
                                 True
                              ])

               if 'vis_starters' in comment:
                  comment_soup = BeautifulSoup(comment, 'html.parser')
                  vis_starters_table = comment_soup.find('table', {'id': 'vis_starters'})
                  if home_starters_table:
                     # print("Found table in comments")
                     vis_starters_tbody = vis_starters_table.find('tbody')

                     rows = vis_starters_tbody.find_all('tr')

                     with open('starters_data.csv', 'a', newline='', encoding='utf-8') as starters_f:
                        starters_writer = csv.writer(starters_f)

                        for row in rows:
                           # print(row)

                           if not row.has_attr("class"):
                              player_name = row.find('th', {'data-stat': 'player'}).text

                              starters_writer.writerow([
                                 game_id,
                                 player_name,
                                 date,
                                 True
                              ])
            
         elif response.status_code == 429:
            # time.sleep(random.uniform(2, 4))
            print("429 error at " + game_id)
            game_id_429 = game_id
            break

         else:
            print(f"Failed with status: {response.status_code}")
            failed_ids.append(game_id)
               
      except Exception as e:
         print(f"Error: {e}")
         failed_ids.append(game_id)

      # with open('player_snap_data.csv', 'a', newline='', encoding='utf-8') as f:
      #       writer = csv.writer(f)

   
      # break

get_data(game_id_list)
# get_data(['202509040phi'])
   

Processing: https://www.pro-football-reference.com/boxscores/201609080den.htm
Status code: 200
Successfully fetched https://www.pro-football-reference.com/boxscores/201609080den.htm
Processing: https://www.pro-football-reference.com/boxscores/201609110rav.htm
Status code: 200
Successfully fetched https://www.pro-football-reference.com/boxscores/201609110rav.htm
Processing: https://www.pro-football-reference.com/boxscores/201609110jax.htm
Status code: 200
Successfully fetched https://www.pro-football-reference.com/boxscores/201609110jax.htm
Processing: https://www.pro-football-reference.com/boxscores/201609110atl.htm
Status code: 200
Successfully fetched https://www.pro-football-reference.com/boxscores/201609110atl.htm
Processing: https://www.pro-football-reference.com/boxscores/201609110htx.htm
Status code: 200
Successfully fetched https://www.pro-football-reference.com/boxscores/201609110htx.htm
Processing: https://www.pro-football-reference.com/boxscores/201609110nyj.htm
Status code:

In [75]:
# combine the csvs

# all snap data
player_snap_df = pd.read_csv('player_snap_data.csv')

player_offense_df = pd.read_csv('player_offense_data.csv')
player_defense_df = pd.read_csv('player_defense_data.csv')
kicking_df = pd.read_csv('kicking_data.csv')
returns_df = pd.read_csv('returns_data.csv')
passing_advanced_df = pd.read_csv('passing_advanced_data.csv')
rushing_advanced_df = pd.read_csv('rushing_advanced_data.csv')
receiving_advanced_df = pd.read_csv('receiving_advanced_data.csv')
defense_advanced_df = pd.read_csv('defense_advanced_data.csv')


In [77]:
# defense_advanced_df

from functools import reduce

# Create list of all DataFrames
all_dfs = [
    player_snap_df,
    player_offense_df,
    player_defense_df,
    kicking_df,
    returns_df,
    passing_advanced_df,
    rushing_advanced_df,
    receiving_advanced_df,
    defense_advanced_df
]

# Define merge function
def merge_dfs(left, right):
    return pd.merge(left, right, on=['game_id', 'player_name', 'date'], how='left')

# Reduce all DataFrames
combined_df = reduce(merge_dfs, all_dfs)

combined_df.to_csv('combined_data.csv', index=False)