In [52]:
# %pip install cloudscraper

import requests
import pandas as pd
from bs4 import BeautifulSoup, Comment
import re
import cloudscraper
import time
import random

# Season years (as strings) that get substituted into SEASON_URL.
SEASON_LIST = [str(season) for season in range(2020, 2026)]

# Season schedule page; the `{}` placeholder takes a season year.
SEASON_URL = 'https://www.pro-football-reference.com/years/{}/games.htm'

# Single-game boxscore page; the `{}` placeholder takes a game id.
GAME_URL = 'https://www.pro-football-reference.com/boxscores/{}.htm'

# Starts empty; nothing in the cells shown here appends to it.
GAME_URL_LIST = []



In [53]:
# cloudscraper session configured with a desktop Chrome-on-Windows browser
# profile (the original note says this is what gets past Cloudflare).
scraper = cloudscraper.create_scraper(
    browser=dict(browser='chrome', platform='windows', mobile=False)
)

# One id per "boxscore" link, accumulated across every season in SEASON_LIST.
game_id_list = []

for year in SEASON_LIST:
    url = SEASON_URL.format(year)

    # Random 2-4 second pause before each season request.
    time.sleep(random.uniform(2, 4))

    try:
        response = scraper.get(url)

        # Anything other than 200 is reported and the season is skipped.
        if response.status_code != 200:
            print(f"Failed with status: {response.status_code}")
            continue

        soup = BeautifulSoup(response.text, 'html.parser')

        # The schedule lives in the table with id="games"; no table, nothing to do.
        games_table = soup.find('table', {'id': 'games'})
        if not games_table:
            continue

        # Only anchors whose text is exactly "boxscore" are kept.
        boxscore_links = games_table.find_all('a', string='boxscore')
        print(f"Found {len(boxscore_links)} boxscore links for {year}")

        for link in boxscore_links:
            href = link.get('href')
            if not href:
                continue

            # Game id = last path segment, cut at the first dot (drops ".htm").
            game_id = href.rsplit('/', 1)[-1].split('.')[0]
            game_id_list.append(game_id)

    except Exception as e:
        print(f"Error: {e}")

# Display the collected ids as the cell output.
game_id_list

Found 269 boxscore links for 2020
Found 285 boxscore links for 2021
Found 284 boxscore links for 2022
Found 285 boxscore links for 2023
Found 285 boxscore links for 2024
Found 243 boxscore links for 2025


['202009100kan',
 '202009130atl',
 '202009130buf',
 '202009130car',
 '202009130det',
 '202009130rav',
 '202009130jax',
 '202009130min',
 '202009130nwe',
 '202009130was',
 '202009130cin',
 '202009130sfo',
 '202009130nor',
 '202009130ram',
 '202009140nyg',
 '202009140den',
 '202009170cle',
 '202009200dal',
 '202009200mia',
 '202009200tam',
 '202009200chi',
 '202009200clt',
 '202009200pit',
 '202009200gnb',
 '202009200oti',
 '202009200nyj',
 '202009200phi',
 '202009200crd',
 '202009200htx',
 '202009200sdg',
 '202009200sea',
 '202009210rai',
 '202009240jax',
 '202009270atl',
 '202009270buf',
 '202009270phi',
 '202009270cle',
 '202009270pit',
 '202009270min',
 '202009270nwe',
 '202009270nyg',
 '202009270sdg',
 '202009270clt',
 '202009270crd',
 '202009270sea',
 '202009270den',
 '202009270nor',
 '202009280rav',
 '202010010nyj',
 '202010040car',
 '202010040chi',
 '202010040cin',
 '202010040dal',
 '202010040det',
 '202010040htx',
 '202010040mia',
 '202010040was',
 '202010040tam',
 '202010040ram

In [80]:
# Scrape results. Key: (game_id, scorebox_content); value:
# (environment_content, home_player_snap_counts, away_player_snap_counts).
game_content = {}


def _text_lines(tag):
    """Return the stripped, non-empty text lines of a BeautifulSoup tag."""
    raw_text = tag.get_text(separator='\n')
    return [line.strip() for line in raw_text.split('\n') if line.strip()]


def _line_at(lines, index):
    """Return lines[index], or None when the list is too short."""
    return lines[index] if len(lines) > index else None


def _parse_scorebox(soup):
    """Return (home_team, away_team, date, stadium); any missing piece is None."""
    date = stadium = away_team = home_team = None

    scorebox_meta_div = soup.find('div', {'class': 'scorebox_meta'})
    if scorebox_meta_div:
        meta_lines = _text_lines(scorebox_meta_div)
        # Positional lookups: the date is read from the first text line and
        # the stadium from the sixth.
        date = _line_at(meta_lines, 0)
        stadium = _line_at(meta_lines, 5)

    away_scorebox_div = soup.find('div', {'id': 'sb_team_0'})
    if away_scorebox_div:
        away_team = _line_at(_text_lines(away_scorebox_div), 2)

    home_scorebox_div = soup.find('div', {'id': 'sb_team_1'})
    if home_scorebox_div:
        home_team = _line_at(_text_lines(home_scorebox_div), 2)

    return (home_team, away_team, date, stadium)


def _parse_weather(weather):
    """Return (temp, humidity, wind) strings parsed from a weather description.

    Each component is matched by its label instead of by position or fixed
    width, so multi-digit wind speeds and temperatures are kept whole and a
    missing component yields None instead of shifting the others.
    NOTE(review): assumes text shaped like
    "56 degrees, relative humidity 94%, wind 7 mph" -- confirm on live pages.
    """
    if not weather:
        return (None, None, None)

    temp_match = re.search(r'(-?\d+)\s*degrees', weather)
    humidity_match = re.search(r'humidity\s+(\d+%)', weather)
    # Requires "<number> mph" right after "wind", so other "wind ..." phrases
    # without a speed are not mistaken for the wind speed.
    wind_match = re.search(r'wind\s+(\d+\s*mph)', weather)

    temp = temp_match.group(1) if temp_match else None
    humidity = humidity_match.group(1) if humidity_match else None
    wind = wind_match.group(1) if wind_match else None
    return (temp, humidity, wind)


def _parse_environment(soup):
    """Return (roof, surface, temp, humidity, wind); any missing piece is None."""
    game_info_div = soup.find('div', {'id': 'all_game_info'})
    if not game_info_div:
        return (None, None, None, None, None)

    # Serialize once; the regexes run on the raw markup of the div.
    game_info_html = str(game_info_div)

    def cell_after(label):
        match = re.search(r'<th[^>]*>' + label + r'</th><td[^>]*>(.*?)</td>', game_info_html)
        return match.group(1) if match else None

    roof = cell_after('Roof')
    surface = cell_after('Surface')
    temp, humidity, wind = _parse_weather(cell_after('Weather'))
    return (roof, surface, temp, humidity, wind)


def _extract_player_snaps(row):
    """Return {player_name: total_snaps} for one table row, or None if it has no player cell."""
    player_cell = row.find('th', {'data-stat': 'player'})
    if player_cell is None:
        return None

    player_link = player_cell.find('a')
    player_name = player_link.text if player_link else player_cell.text.strip()

    total_snaps = 0
    for stat in ('offense', 'defense', 'special_teams'):
        cell = row.find('td', {'data-stat': stat})
        text = cell.text.strip() if cell is not None else ''
        # A missing or blank cell counts as zero snaps; non-numeric text still
        # raises ValueError so the game is reported as failed.
        if text:
            total_snaps += int(text)

    return {player_name: total_snaps}


def _parse_snap_counts(comments, table_id):
    """Return the per-player snap dicts from the commented-out table `table_id` ([] if absent)."""
    for comment in comments:
        if table_id not in comment:
            continue

        # The table markup sits inside an HTML comment, so parse the comment text.
        table = BeautifulSoup(comment, 'html.parser').find('table', id=table_id)
        if table is None:
            continue

        tbody = table.find('tbody')
        if tbody is None:
            return []

        player_snap_counts = []
        for row in tbody.find_all('tr'):
            player_snap_data = _extract_player_snaps(row)
            if player_snap_data is not None:
                player_snap_counts.append(player_snap_data)
        return player_snap_counts

    return []


def get_data(game_id_list):
    """Scrape each boxscore page and store its parsed content in `game_content`.

    For every id the page at GAME_URL.format(game_id) is fetched with the
    module-level `scraper` after a random 1-2 second pause. A successfully
    parsed game is stored as
        game_content[(game_id, (home_team, away_team, date, stadium))] =
            ((roof, surface, temp, humidity, wind),
             home_player_snap_counts, away_player_snap_counts)
    Page elements that cannot be found are stored as None (or an empty list
    for snap counts). A game whose fetch or parse fails is never stored.

    Args:
        game_id_list: iterable of game id strings.

    Returns:
        (failed_ids, rate_limited_id): `failed_ids` lists the ids that got a
        non-200/non-429 status or raised while being processed;
        `rate_limited_id` is the id at which a 429 stopped the run (None if
        the run finished). To resume, call again with
        game_id_list[game_id_list.index(rate_limited_id):].
    """
    failed_ids = []  # accumulated over the whole run, not reset per game
    rate_limited_id = None

    for game_id in game_id_list:
        url = GAME_URL.format(game_id)
        print("Processing: " + url)

        # Add random delay
        time.sleep(random.uniform(1, 2))

        try:
            response = scraper.get(url)
            print(f"Status code: {response.status_code}")

            if response.status_code == 429:
                # Rate limited: stop here and report where to resume.
                print("429 error at " + game_id)
                rate_limited_id = game_id
                break

            if response.status_code != 200:
                print(f"Failed with status: {response.status_code}")
                failed_ids.append(game_id)
                continue

            soup = BeautifulSoup(response.text, 'html.parser')
            print(f"Successfully fetched {url}")

            scorebox_content = _parse_scorebox(soup)
            environment_content = _parse_environment(soup)

            # Snap-count tables are embedded in HTML comments.
            comments = soup.find_all(string=lambda text: isinstance(text, Comment))
            home_player_snap_counts = _parse_snap_counts(comments, 'home_snap_counts')
            away_player_snap_counts = _parse_snap_counts(comments, 'vis_snap_counts')

        except Exception as e:
            print(f"Error: {e}")
            failed_ids.append(game_id)
            # Skip the store below so stale values from an earlier game are
            # never written under this game's id.
            continue

        game_content[(game_id, scorebox_content)] = (
            environment_content,
            home_player_snap_counts,
            away_player_snap_counts,
        )

    return failed_ids, rate_limited_id

   

In [81]:
# Scrape every collected game id; get_data writes its results into game_content.
get_data(game_id_list)

# Manual resume after a 429 (rate limit): comment out the get_data call above,
# set game_id_429 to the id printed in the "429 error at ..." message, and
# uncomment the lines below so scraping restarts from that game.
# game_id_429 = "202009100kan"
# game_content_start_ind = game_id_list.index(game_id_429)
# updated_game_id_list = game_id_list[game_content_start_ind:]

# get_data(updated_game_id_list)

# Print everything stored so far; an empty dict means no game was stored.
print(game_content)


Processing: https://www.pro-football-reference.com/boxscores/202009100kan.htm
Status code: 429
429 error at 202009100kan
{}
