In [None]:
# %pip install cloudscraper

import requests
import pandas as pd
from bs4 import BeautifulSoup
import re
import cloudscraper
import time
import random

# Seasons to scrape; each entry is substituted into SEASON_URL.
SEASON_LIST = ['2020', '2021', '2022', '2023', '2024', '2025']

# Season schedule page; format with a season year from SEASON_LIST.
SEASON_URL = 'https://www.pro-football-reference.com/years/{}/games.htm'

# Box score page; format with a bare game id such as '202009100kan'.
# The '/boxscores/' segment has to live in the template because the scraping
# cell strips both the directory and the '.htm' extension from each href.
GAME_URL = 'https://www.pro-football-reference.com/boxscores/{}.htm'

GAME_URL_LIST = []



In [37]:
# Build a session that identifies itself as desktop Chrome on Windows so the
# requests get past Cloudflare's bot check.
scraper = cloudscraper.create_scraper(
    browser={
        'browser': 'chrome',
        'platform': 'windows',
        'mobile': False
    }
)

# Abort a stalled request instead of letting the cell hang until interrupted.
REQUEST_TIMEOUT_SECONDS = 30

# Bare game ids (e.g. '202009100kan') collected across every season.
game_id_list = []

for year in SEASON_LIST:
    url = SEASON_URL.format(year)

    # Random pause before each request to avoid hammering the site.
    time.sleep(random.uniform(2, 4))

    try:
        response = scraper.get(url, timeout=REQUEST_TIMEOUT_SECONDS)

        if response.status_code != 200:
            print(f"Failed with status: {response.status_code} for {year} ({url})")
            continue

        soup = BeautifulSoup(response.text, 'html.parser')

        # The season schedule lives in the table with id="games".
        games_table = soup.find('table', {'id': 'games'})
        if games_table is None:
            # Report the miss; otherwise a season silently contributes no ids.
            print(f"No games table found for {year} ({url})")
            continue

        # Only anchors whose text is exactly 'boxscore' point at game pages.
        boxscore_links = games_table.find_all('a', string='boxscore')
        print(f"Found {len(boxscore_links)} boxscore links for {year}")

        for link in boxscore_links:
            href = link.get('href')
            if not href:
                continue
            # '/boxscores/202009100kan.htm' -> '202009100kan':
            # keep the last path segment, then drop everything from the first dot.
            game_id = href.split('/')[-1].split('.')[0]
            game_id_list.append(game_id)

    except Exception as e:
        # Best effort: one broken season must not stop the remaining ones,
        # but say which season failed so it can be retried.
        print(f"Error while processing {year} ({url}): {e}")

print(len(game_id_list))
# Show a short sample rather than dumping every id into the cell output.
game_id_list[:5]

Found 269 boxscore links for 2020
Found href: /boxscores/202009100kan.htm
Found href: /boxscores/202009130atl.htm
Found href: /boxscores/202009130buf.htm
Found href: /boxscores/202009130car.htm
Found href: /boxscores/202009130det.htm
Found href: /boxscores/202009130rav.htm
Found href: /boxscores/202009130jax.htm
Found href: /boxscores/202009130min.htm
Found href: /boxscores/202009130nwe.htm
Found href: /boxscores/202009130was.htm
Found href: /boxscores/202009130cin.htm
Found href: /boxscores/202009130sfo.htm
Found href: /boxscores/202009130nor.htm
Found href: /boxscores/202009130ram.htm
Found href: /boxscores/202009140nyg.htm
Found href: /boxscores/202009140den.htm
Found href: /boxscores/202009170cle.htm
Found href: /boxscores/202009200dal.htm
Found href: /boxscores/202009200mia.htm
Found href: /boxscores/202009200tam.htm
Found href: /boxscores/202009200chi.htm
Found href: /boxscores/202009200clt.htm
Found href: /boxscores/202009200pit.htm
Found href: /boxscores/202009200gnb.htm
Found 

KeyboardInterrupt: 

In [34]:
# How many games to scrape in this run. 1 keeps the original single-game smoke
# test; set to None to walk the whole game_id_list (slow: 2-4 s pause per game).
MAX_GAMES = 1

# Abort a stalled request instead of letting the cell hang.
PAGE_TIMEOUT_SECONDS = 30


def _parse_scorebox_meta(lines):
    """Return (date, stadium) from the non-empty text lines of the scorebox_meta div.

    The date is taken to be the first line. The stadium is the text that follows
    the 'Stadium' label, skipping the bare ':' separator line that get_text()
    produces between the label and the value. Either element is None when it
    cannot be located.
    """
    date = lines[0] if lines else None
    stadium = None
    for idx, text in enumerate(lines):
        if not text.startswith('Stadium'):
            continue
        # Value on the same line as the label ("Stadium: X") ...
        remainder = text[len('Stadium'):].lstrip(':').strip()
        if remainder:
            stadium = remainder
        else:
            # ... or on one of the following lines ("Stadium", ":", "X").
            for candidate in lines[idx + 1:]:
                value = candidate.lstrip(':').strip()
                if value:
                    stadium = value
                    break
        break
    return date, stadium


def _info_cell(label, html):
    """Return the raw <td> content that follows the <th> named `label`, or None."""
    match = re.search(rf'<th[^>]*>{label}</th><td[^>]*>(.*?)</td>', html)
    return match.group(1) if match else None


def _parse_weather(weather):
    """Split a weather description into (temp, humidity, wind) strings.

    The text is presumably of the form
    '56 degrees, relative humidity 95%, wind 7 mph' (TODO confirm against more
    games). Each field is located by pattern rather than by position or fixed
    width, so a missing field, a one- or three-digit temperature, or a
    two-digit wind speed no longer breaks or truncates the result. Fields that
    are absent come back as None; a missing/empty description yields all None.
    """
    if not weather:
        return (None, None, None)

    temp_match = re.search(r'(-?\d+)\s*degrees', weather)
    humidity_match = re.search(r'humidity\s+(\d+%)', weather)
    # Requires digits right after 'wind', so 'wind chill 30' is not mistaken for speed.
    wind_match = re.search(r'wind\s+(\d+\s*mph)', weather)

    temp = temp_match.group(1) if temp_match else None
    humidity = humidity_match.group(1) if humidity_match else None
    if wind_match:
        wind = wind_match.group(1)
    elif 'no wind' in weather.lower():
        # assumes calm games are written as 'no wind' -- TODO confirm
        wind = '0 mph'
    else:
        wind = None
    return (temp, humidity, wind)


# game id -> ((date, stadium), (roof, surface, temp, humidity, wind))
game_content = {}

# Slicing with None yields the full list, so MAX_GAMES = None means "everything".
for game_id in game_id_list[:MAX_GAMES]:
    url = GAME_URL.format(game_id)
    print("Processing: " + url)

    # Random pause before each request to avoid hammering the site.
    time.sleep(random.uniform(2, 4))

    try:
        response = scraper.get(url, timeout=PAGE_TIMEOUT_SECONDS)
        print(f"Status code: {response.status_code}")

        if response.status_code != 200:
            print(f"Failed with status: {response.status_code}")
            continue

        soup = BeautifulSoup(response.text, 'html.parser')
        print(f"Successfully fetched {url}")

        ##### get general data #####
        scorebox_meta_div = soup.find('div', {'class': 'scorebox_meta'})
        if scorebox_meta_div:
            print(f"Found scorebox_meta div for {game_id}")
            # One entry per text node, blank lines dropped.
            full_text = scorebox_meta_div.get_text(separator='\n')
            lines = [line.strip() for line in full_text.split('\n') if line.strip()]
            scorebox_content = _parse_scorebox_meta(lines)
        else:
            print(f"No scorebox_meta div for {game_id}")
            scorebox_content = (None, None)

        ##### get environment data #####
        # The rows are matched with regexes against the div's serialized HTML
        # rather than through the parse tree; when the div is missing, str(None)
        # simply matches nothing and every field becomes None.
        game_info_html = str(soup.find('div', {'id': 'all_game_info'}))

        roof = _info_cell('Roof', game_info_html)
        surface = _info_cell('Surface', game_info_html)
        weather = _info_cell('Weather', game_info_html)

        temp, humidity, wind = _parse_weather(weather)
        environment_content = (roof, surface, temp, humidity, wind)

        # Store only after this game parsed successfully, so a failed game can
        # never inherit the previous game's values (or raise NameError on the
        # first iteration).
        game_content[game_id] = (scorebox_content, environment_content)

    except Exception as e:
        # Best effort: report and move on to the next game.
        print(f"Error for {game_id}: {e}")


print(game_content)
   

Processing: https://www.pro-football-reference.com/boxscores/202009100kan.htm
Status code: 200
Successfully fetched https://www.pro-football-reference.com/boxscores/202009100kan.htm
Found scorebox_meta div for 202009100kan
<class 'bs4.element.Tag'>
{'202009100kan': (('Thursday Sep 10, 2020', 'Arrowhead Stadium'), ('outdoors', 'grass', '56', '95%', '7 mph'))}


In [16]:
from selenium import webdriver
from selenium.webdriver.chrome.service import Service

# Start a Chrome session driven by an explicitly named chromedriver binary.
# NOTE(review): nothing in this cell closes the browser, and a later cell
# rebinds `driver` to a new session -- confirm this one is shut down manually.
chromedriver_path = "chromedriver.exe"
service = Service(executable_path=chromedriver_path)
driver = webdriver.Chrome(service=service)


ImportError: cannot import name 'Sentinel' from 'typing_extensions' (c:\Users\bchm5\AppData\Local\Programs\Python\Python313\Lib\site-packages\typing_extensions.py)

In [13]:

# `Options` is not imported by any earlier cell, so bring it in here; without
# it this cell raises NameError on a fresh kernel.
from selenium.webdriver.chrome.options import Options

# Single source for the spoofed user agent (used for both the launch flag and
# the CDP override below).
USER_AGENT = (
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
    "(KHTML, like Gecko) Chrome/121.0.0.0 Safari/537.36"
)

# Configure Chrome options
chrome_options = Options()
chrome_options.add_argument("--headless")  # Run in background
chrome_options.add_argument("--no-sandbox")
chrome_options.add_argument("--disable-dev-shm-usage")
chrome_options.add_argument("--disable-blink-features=AutomationControlled")
chrome_options.add_experimental_option("excludeSwitches", ["enable-automation"])
chrome_options.add_experimental_option('useAutomationExtension', False)

# Add stealth settings
chrome_options.add_argument(f"user-agent={USER_AGENT}")

driver = webdriver.Chrome(options=chrome_options)

# Bare game ids (e.g. '202009100kan') collected across every season.
game_id_list = []

try:
    # Execute CDP commands to hide automation
    driver.execute_cdp_cmd('Network.setUserAgentOverride', {
        "userAgent": USER_AGENT
    })

    for year in SEASON_LIST:
        url = SEASON_URL.format(year)
        print(f"Processing {year}...")

        driver.get(url)
        time.sleep(2)  # Wait for page to load

        # Get page source and parse with BeautifulSoup
        soup = BeautifulSoup(driver.page_source, 'html.parser')

        # Same extraction as the cloudscraper cell: the schedule is the table
        # with id="games", and each game is an anchor whose text is 'boxscore'.
        games_table = soup.find('table', {'id': 'games'})
        if games_table is None:
            print(f"No games table found for {year} ({url})")
            continue

        boxscore_links = games_table.find_all('a', string='boxscore')
        print(f"Found {len(boxscore_links)} boxscore links for {year}")

        for link in boxscore_links:
            href = link.get('href')
            if href:
                # '/boxscores/202009100kan.htm' -> '202009100kan'
                game_id_list.append(href.split('/')[-1].split('.')[0])
finally:
    # Always shut the browser down, even if a page load or parse step raised,
    # so no headless Chrome process is left behind.
    driver.quit()

Collecting selenium
  Downloading selenium-4.39.0-py3-none-any.whl.metadata (7.5 kB)
Collecting trio<1.0,>=0.31.0 (from selenium)
  Downloading trio-0.32.0-py3-none-any.whl.metadata (8.5 kB)
Collecting trio-websocket<1.0,>=0.12.2 (from selenium)
  Using cached trio_websocket-0.12.2-py3-none-any.whl.metadata (5.1 kB)
Collecting certifi>=2025.10.5 (from selenium)
  Downloading certifi-2025.11.12-py3-none-any.whl.metadata (2.5 kB)
Collecting typing_extensions<5.0,>=4.15.0 (from selenium)
  Using cached typing_extensions-4.15.0-py3-none-any.whl.metadata (3.3 kB)
Collecting websocket-client<2.0,>=1.8.0 (from selenium)
  Downloading websocket_client-1.9.0-py3-none-any.whl.metadata (8.3 kB)
Collecting outcome (from trio<1.0,>=0.31.0->selenium)
  Using cached outcome-1.3.0.post0-py2.py3-none-any.whl.metadata (2.6 kB)
Collecting cffi>=1.14 (from trio<1.0,>=0.31.0->selenium)
  Downloading cffi-2.0.0-cp313-cp313-win_amd64.whl.metadata (2.6 kB)
Collecting wsproto>=0.14 (from trio-websocket<1.0,>=0


[notice] A new release of pip is available: 25.2 -> 25.3
[notice] To update, run: python.exe -m pip install --upgrade pip


ImportError: cannot import name 'Sentinel' from 'typing_extensions' (c:\Users\bchm5\AppData\Local\Programs\Python\Python313\Lib\site-packages\typing_extensions.py)