In [200]:
import os
import re
import requests
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
import time

In [249]:
# Seasons covered by the scrape: 2016 through 2024 inclusive
SEASONS = [year for year in range(2016, 2025)]

In [233]:
# Directories used to store scraped data (created up front so that the
# later open(..., "w") calls do not fail on a fresh checkout)
DATA_DIR = "data"
SCHEDULES = os.path.join(DATA_DIR, "schedules")
SCORES = os.path.join(DATA_DIR, "scores")

for _directory in (SCHEDULES, SCORES):
    # exist_ok=True keeps this cell safe to re-run
    os.makedirs(_directory, exist_ok=True)

In [234]:
# Schedule page URL pattern; the "{}" placeholder is filled in with the
# season year via str.format
url_start = (
    "https://www.pro-football-reference.com/years/{}/games.htm"
)

In [235]:
# Scraping schedule tables in time frame
def scrape_season(season):
    """Download one season's schedule table and save it as an HTML file.

    The page at ``url_start.format(season)`` is fetched, the element with
    id ``div_games`` is extracted, and its prettified HTML is written to
    ``<SCHEDULES>/<season>_schedule.html``. A status line is printed in
    every case, and the function always pauses 5 seconds before returning
    so that consecutive calls are spaced out.

    Args:
        season: Season year substituted into the schedule URL template.

    Raises:
        requests.exceptions.RequestException: If the request cannot be
            completed (connection error, timeout, ...).
    """
    url = url_start.format(season)
    # A timeout prevents the notebook from hanging forever on a stalled connection
    response = requests.get(url, timeout=30)

    if not response.ok:
        # Report HTTP failures (e.g. rate limiting) as such instead of
        # misreporting them as a missing table
        print(f"Request for season {season} failed with HTTP status {response.status_code}.")
    else:
        # Parse the HTML with BeautifulSoup
        soup = BeautifulSoup(response.text, 'html.parser')

        # Find the specific section containing the schedule (table)
        schedule = soup.find(id="div_games")

        if schedule is not None:
            # Make sure the target directory exists before writing
            os.makedirs(SCHEDULES, exist_ok=True)
            save_path = os.path.join(SCHEDULES, f"{season}_schedule.html")
            with open(save_path, "w", encoding="utf-8") as f:
                f.write(schedule.prettify())
            print(f"Schedule table for season {season} saved to {save_path}")
        else:
            print(f"Schedule table for season {season} not found.")

    # Pause between requests
    time.sleep(5)

In [250]:
# Fetch and store the schedule table for the 2024 season
scrape_season(season=2024)

Schedule table for season 2024 saved to data\schedules\2024_schedule.html


In [252]:
# Box score hrefs collected from the saved 2024 schedule table
links = []

# Read the schedule HTML that was written to disk earlier
file_path = os.path.join(SCHEDULES, f"{2024}_schedule.html")
with open(file_path, "r", encoding="utf-8") as f:
    page = f.read()

soup = BeautifulSoup(page, 'html.parser')

# Table cells that hold the box score link:
# <td class="center" data-stat="boxscore_word">
td_elements = soup.find_all('td', class_='center', attrs={'data-stat': 'boxscore_word'})

# Take the first <a> of each cell (None when the cell has no link) and
# keep the href of every anchor that was found
anchors = (cell.find('a') for cell in td_elements)
links.extend(anchor.get('href') for anchor in anchors if anchor)

In [253]:
# Sanity check: number of box score links collected from the schedule table
len(links)

272

In [254]:
def get_selenium_driver():
    """Create and return a headless Chrome WebDriver.

    Headless mode is requested through the ``--headless`` command-line
    argument. The ``Options.headless`` attribute setter was deprecated and
    later removed in Selenium 4.x; on releases without it, assigning
    ``options.headless = True`` only sets an unused attribute and a visible
    browser window is opened instead.

    Returns:
        A ``webdriver.Chrome`` instance. The caller is responsible for
        calling ``quit()`` on it.
    """
    options = Options()
    options.add_argument("--headless")  # Run without opening a browser window
    return webdriver.Chrome(options=options)

In [255]:
# URL template for a box score page; "{}" receives a link taken from the
# schedule table.
# NOTE(review): this name was referenced but never defined in the notebook
# (presumably it lived in a since-deleted cell). The value assumes the
# scraped hrefs are site-relative paths such as "/boxscores/<game_id>.htm"
# - confirm against the contents of `links`.
box_score_url = "https://www.pro-football-reference.com{}"

# ids of the page sections that are stripped before the box score is saved
_UNWANTED_SECTION_IDS = (
    "all_other_scores",
    "all_home_starters",
    "all_home_snap_counts",
    "all_home_drives",
    "all_vis_starters",
    "all_vis_snap_counts",
    "all_vis_drives",
    "all_pbp",
    "all_expected_points",
    "all_officials",
    "all_game_info",
    "all_scoring",
    "all_player_offense",
    "all_player_defense",
)


def scrape_box_scores(link):
    """Render one box score page with Selenium and save its trimmed HTML.

    The page at ``box_score_url.format(link)`` is loaded in a headless
    browser, the sections listed in ``_UNWANTED_SECTION_IDS`` are removed
    when present, and the prettified element with id ``content`` is written
    to ``<SCORES>/<game_id>_box_score.html``. ``game_id`` is the part of the
    URL between ``boxscores/`` and the first following dot. The function
    pauses 3 seconds before returning so that consecutive calls are spaced
    out.

    Args:
        link: Box score href as collected from the schedule table.
    """
    url = box_score_url.format(link)

    # Use Selenium to open the page; the browser is always shut down, even
    # when loading the page raises
    driver = get_selenium_driver()
    try:
        driver.get(url)
        # Wait for a few seconds to allow content to load
        time.sleep(3)
        page_source = driver.page_source
    finally:
        driver.quit()

    # Parse the page with BeautifulSoup after it has fully loaded
    soup = BeautifulSoup(page_source, 'html.parser')

    # Remove unwanted sections; a section missing from a given page is
    # skipped instead of raising AttributeError on None
    for section_id in _UNWANTED_SECTION_IDS:
        section = soup.find(id=section_id)
        if section is not None:
            section.decompose()

    # Find the content section and derive the file name from the URL
    box_score = soup.find(id="content")
    match = re.search(r'boxscores/([^.]+)', url)

    if box_score is None:
        print("Box score not found")
    elif match is None:
        # Without a game id there is no sensible file name to write to
        print(f"Could not derive a game id from {url}")
    else:
        # Make sure the target directory exists before writing
        os.makedirs(SCORES, exist_ok=True)
        save_path = os.path.join(SCORES, f"{match.group(1)}_box_score.html")
        with open(save_path, "w", encoding="utf-8") as f:
            f.write(box_score.prettify())

    # Add delay between requests if scraping multiple pages
    time.sleep(3)

In [None]:
# Download and save the box score page behind every collected link
for box_score_href in links:
    scrape_box_scores(box_score_href)