In [1]:
import os
import pandas as pd
import numpy as np
from bs4 import BeautifulSoup
import json
from tqdm import tqdm
try:
    from curl_cffi import requests as crequests
except:
    ! pip install curl_cffi
    from curl_cffi import requests as crequests

#### Functions

In [2]:
class ScrapeAndPrep:
    """Scrape one season's game list and reshape it into a home/away table.

    The methods are meant to be called in order:
    ``scrape_html()`` -> ``create_table()`` -> ``prep_table()``.

    Attributes set along the way:
        int_year: season year interpolated into the page URL.
        soup: parsed page, set by ``scrape_html``.
        df: game table, set by ``create_table`` and replaced by ``prep_table``.
    """

    # columns of the winner/loser oriented table built by create_table
    _LIST_COLS_RAW = [
        'week',
        'date',
        'winner',
        'location',
        'loser',
        'winning_pts',
        'losing_pts',
    ]
    # columns of the home/away oriented table built by prep_table
    _LIST_COLS_PREPPED = [
        'week',
        'home_team',
        'home_pts',
        'away_team',
        'away_pts',
        'home_win',
        'away_win',
        'tie',
    ]

    # initialize class
    def __init__(self, int_year=2025):
        """Store the season year used to build the games URL."""
        self.int_year = int_year

    # scrape
    def scrape_html(self):
        """Download the season's games page and store the parsed HTML in ``self.soup``.

        Relies on the module-level ``BROWSER_HEADERS`` and ``crequests`` names,
        which must be defined before this method is called.

        Raises:
            Whatever ``r.raise_for_status()`` raises for a non-success response.
        """
        # url
        str_url = f'https://www.pro-football-reference.com/years/{self.int_year}/games.htm'
        # get request
        r = crequests.get(
            str_url,
            headers=BROWSER_HEADERS,
            impersonate="chrome",
            timeout=60,
        )
        r.raise_for_status()
        # decode
        # NOTE(review): confirm the curl_cffi response object exposes
        # `apparent_encoding`; this branch only runs when no encoding is set
        if not r.encoding:
            r.encoding = r.apparent_encoding
        # get soup
        soup = BeautifulSoup(r.text, 'html.parser')
        # save to object
        self.soup = soup

    # prep
    def create_table(self):
        """Parse the games table in ``self.soup`` into ``self.df``.

        One row is produced per table row whose week and both scores parse as
        integers. The resulting columns are ``week``, ``date``, ``winner``,
        ``location`` ('vs' when the location cell is blank, otherwise the cell
        text), ``loser``, ``winning_pts`` and ``losing_pts``. When no row
        parses, ``self.df`` is an empty frame that still has these columns.

        Raises:
            ValueError: if the page contains no 'sortable stats_table' table.
        """
        # get table
        table = self.soup.find('table', class_='sortable stats_table')
        if table is None:
            raise ValueError(
                "no <table class='sortable stats_table'> found in the scraped page"
            )
        # get body (fall back to the table itself when the markup has no <tbody>)
        tbody = table.find('tbody')
        if tbody is None:
            tbody = table
        # get rows
        rows = tbody.find_all("tr")
        # make df
        list_dict_row = []
        for row in tqdm(rows):
            try:
                # get headers and cols
                tds = row.find_all(['th', 'td'])
                # get week
                int_week = int(tds[0].get_text(strip=True))
                # get date
                str_date = tds[2].get_text(strip=True)
                # winning team
                str_winner = tds[4].get_text(strip=True)
                # location
                str_location = tds[5].get_text(strip=True)
                if str_location == '':
                    str_location = 'vs'
                # losing team
                str_loser = tds[6].get_text(strip=True)
                # points scored by winning team
                int_pts_winner = int(tds[8].get_text(strip=True))
                # points scored by losing team
                int_pts_loser = int(tds[9].get_text(strip=True))
            except (ValueError, IndexError):
                # skip rows that are not a scored regular game: a non-integer
                # week/score cell (ValueError) or too few cells (IndexError);
                # presumably repeated header rows, playoff rows and unplayed
                # games -- confirm against the page
                continue
            # append
            list_dict_row.append(
                {
                    'week': int_week,
                    'date': str_date,
                    'winner': str_winner,
                    'location': str_location,
                    'loser': str_loser,
                    'winning_pts': int_pts_winner,
                    'losing_pts': int_pts_loser,
                }
            )
        # df (explicit columns keep the schema even when no row was parsed)
        df = pd.DataFrame(list_dict_row, columns=self._LIST_COLS_RAW)
        # save to object
        self.df = df

    # prep table
    def prep_table(self):
        """Convert the winner/loser table in ``self.df`` to a home/away table.

        The winner is treated as the home team when ``location == 'vs'`` and as
        the away team otherwise. ``self.df`` is replaced by a frame with the
        columns ``week``, ``home_team``, ``home_pts``, ``away_team``,
        ``away_pts``, ``home_win``, ``away_win`` and ``tie`` (the last three
        are 0/1 flags).

        Calling the method again on an already converted ``self.df`` is a
        no-op, so re-running the notebook cell is safe.

        Raises:
            KeyError: if ``self.df`` lacks the winner/loser input columns and
                is not an already converted table.
        """
        # copy
        df = self.df.copy()
        # validate input columns
        list_cols_input = ['week', 'winner', 'location', 'loser', 'winning_pts', 'losing_pts']
        list_cols_missing = [col for col in list_cols_input if col not in df.columns]
        if list_cols_missing:
            if all(col in df.columns for col in self._LIST_COLS_PREPPED):
                # already in home/away form (method was run before): nothing to do
                return
            raise KeyError(f'prep_table: missing input columns {list_cols_missing}')
        # winner hosted the game when the location marker is 'vs'
        arr_winner_hosted = (df['location'] == 'vs').to_numpy()
        # get home team
        df['home_team'] = np.where(arr_winner_hosted, df['winner'], df['loser'])
        # get away team
        df['away_team'] = np.where(arr_winner_hosted, df['loser'], df['winner'])
        # home points
        df['home_pts'] = np.where(
            df['winner'] == df['home_team'], df['winning_pts'], df['losing_pts']
        )
        # away points
        df['away_pts'] = np.where(
            df['winner'] == df['away_team'], df['winning_pts'], df['losing_pts']
        )
        # home win
        df['home_win'] = (df['home_pts'] > df['away_pts']).astype('int64')
        # away win
        df['away_win'] = (df['home_pts'] < df['away_pts']).astype('int64')
        # tie
        df['tie'] = (df['home_pts'] == df['away_pts']).astype('int64')
        # subset
        df = df[self._LIST_COLS_PREPPED].copy()
        # save to object
        self.df = df

#### Constants

In [3]:
# directory that receives the files written by this notebook
str_dirname_output = './output'

# browser-like HTTP headers passed to the scrape request
BROWSER_HEADERS = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/123.0.0.0 Safari/537.36"
    ),
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
    "Accept-Language": "en-US,en;q=0.9",
    "Referer": "https://www.pro-football-reference.com/",
    "Upgrade-Insecure-Requests": "1",
}

#### Create output directory

In [4]:
# create the output directory (and any missing parent directories);
# exist_ok=True makes this a no-op when the directory is already there,
# while still raising if the path exists but is not a directory
os.makedirs(str_dirname_output, exist_ok=True)

#### Initialize class

In [5]:
# instantiate with the default season (int_year=2025)
cls_scrape_and_prep = ScrapeAndPrep()

#### Scrape html

In [6]:
# download the season's games page; the parsed HTML is kept on the instance as .soup
cls_scrape_and_prep.scrape_html()

#### Create table

In [7]:
# parse the scraped games table into a winner/loser oriented DataFrame (stored as .df)
cls_scrape_and_prep.create_table()
# show the parsed table
cls_scrape_and_prep.df

100%|██████████| 303/303 [00:00<00:00, 16910.04it/s]


Unnamed: 0,week,date,winner,location,loser,winning_pts,losing_pts
0,1,2025-09-04,Philadelphia Eagles,vs,Dallas Cowboys,24,20
1,1,2025-09-05,Los Angeles Chargers,vs,Kansas City Chiefs,27,21
2,1,2025-09-07,Tampa Bay Buccaneers,@,Atlanta Falcons,23,20
3,1,2025-09-07,Jacksonville Jaguars,vs,Carolina Panthers,26,10
4,1,2025-09-07,Cincinnati Bengals,@,Cleveland Browns,17,16
...,...,...,...,...,...,...,...
267,18,2026-01-04,Denver Broncos,vs,Los Angeles Chargers,19,3
268,18,2026-01-04,Las Vegas Raiders,vs,Kansas City Chiefs,14,12
269,18,2026-01-04,New England Patriots,vs,Miami Dolphins,38,10
270,18,2026-01-04,Washington Commanders,@,Philadelphia Eagles,24,17


#### Prep table

In [8]:
# convert the winner/loser table to home/away form; this replaces .df on the instance
cls_scrape_and_prep.prep_table()
# show the home/away table
cls_scrape_and_prep.df

Unnamed: 0,week,home_team,home_pts,away_team,away_pts,home_win,away_win,tie
0,1,Philadelphia Eagles,24,Dallas Cowboys,20,1,0,0
1,1,Los Angeles Chargers,27,Kansas City Chiefs,21,1,0,0
2,1,Atlanta Falcons,20,Tampa Bay Buccaneers,23,0,1,0
3,1,Jacksonville Jaguars,26,Carolina Panthers,10,1,0,0
4,1,Cleveland Browns,16,Cincinnati Bengals,17,0,1,0
...,...,...,...,...,...,...,...,...
267,18,Denver Broncos,19,Los Angeles Chargers,3,1,0,0
268,18,Las Vegas Raiders,14,Kansas City Chiefs,12,1,0,0
269,18,New England Patriots,38,Miami Dolphins,10,1,0,0
270,18,Philadelphia Eagles,17,Washington Commanders,24,0,1,0


#### Save

In [9]:
# target file inside the output directory
str_filename = 'df.csv'
str_local_path = f'{str_dirname_output}/{str_filename}'
# write the prepared table as CSV, leaving out the DataFrame index
cls_scrape_and_prep.df.to_csv(str_local_path, index=False)