In [1]:
import os
import pandas as pd
import numpy as np
from bs4 import BeautifulSoup
import json
from tqdm import tqdm
import time
try:
    from curl_cffi import requests as crequests
except:
    ! pip install curl_cffi
    from curl_cffi import requests as crequests

#### Class definition

In [2]:
class ScrapeAndPrep:
    """Scrape one season of game results from basketball-reference.com.

    The season is selected by ``int_year``, which is interpolated into the
    ``NBA_{int_year}_games-{month}.html`` schedule url. After
    ``scrape_site()`` has run, the results are available as ``self.df``
    with one row per game and the columns listed in ``LIST_STR_COL``.
    """
    # months fetched when scrape_site() is called without an explicit list
    TUPLE_STR_MONTH_DEFAULT = ('october', 'november', 'december', 'january')
    # column order of the resulting frame (also applied when no game was parsed)
    LIST_STR_COL = [
        'game', 'date', 'home_team', 'home_pts',
        'away_team', 'away_pts', 'home_win', 'away_win',
    ]
    # seconds to wait after each month page
    INT_SLEEP_SECONDS = 2

    # initialize class
    def __init__(self, int_year=2026):
        """Store the season year used to build the schedule urls."""
        self.int_year = int_year

    # parse one table row
    @staticmethod
    def _parse_cells(list_str_cell, int_game):
        """Turn the stripped cell texts of one table row into a game record.

        Cell positions used: 0 date, 2 away team, 3 away points,
        4 home team, 5 home points.

        Args:
            list_str_cell: cell texts of the row, in document order.
            int_game: running game number to store in the record.

        Returns:
            dict with the keys of ``LIST_STR_COL``, or ``None`` when the row
            has fewer than six cells or its point cells are not integers
            (presumably header rows or games without a final score yet).
        """
        if len(list_str_cell) < 6:
            return None
        try:
            int_pts_home = int(list_str_cell[5])
            int_pts_away = int(list_str_cell[3])
        except ValueError:
            # no numeric score in this row -> nothing to record
            return None
        # home win (anything that is not a home win counts as an away win)
        int_home_win = 1 if int_pts_home > int_pts_away else 0
        return {
            'game': int_game,
            'date': list_str_cell[0],
            'home_team': list_str_cell[4],
            'home_pts': int_pts_home,
            'away_team': list_str_cell[2],
            'away_pts': int_pts_away,
            'home_win': int_home_win,
            'away_win': 1 - int_home_win,
        }

    # scrape
    def scrape_site(self, list_str_month=None):
        """Download the monthly schedule pages and store the games in ``self.df``.

        Args:
            list_str_month: lower-case month names to fetch, in order.
                Defaults to ``TUPLE_STR_MONTH_DEFAULT``.

        Raises:
            ValueError: if a page contains no ``stats_table`` table.
            Whatever ``raise_for_status()`` raises for an unsuccessful response.
        """
        if list_str_month is None:
            list_str_month = self.TUPLE_STR_MONTH_DEFAULT
        list_dict_row = []
        int_game = 1
        for str_month in tqdm(list_str_month):
            print(str_month)
            # url
            str_url = f'https://www.basketball-reference.com/leagues/NBA_{self.int_year}_games-{str_month}.html'
            # get request
            r = crequests.get(
                str_url,
                headers=BROWSER_HEADERS,
                impersonate="chrome",
                timeout=60,
            )
            r.raise_for_status()
            # decode
            if not r.encoding:
                r.encoding = r.apparent_encoding
            # get soup
            soup = BeautifulSoup(r.text, 'html.parser')

            # table
            table = soup.find('table', class_='stats_table')
            if table is None:
                # fail with the url instead of an AttributeError on None
                raise ValueError(f'No stats_table found at {str_url}')
            # body (fall back to the table itself when there is no <tbody>)
            tbody = table.find('tbody')
            if tbody is None:
                tbody = table
            # rows
            rows = tbody.find_all("tr")

            # loop
            for row in tqdm(rows):
                # cell texts
                list_str_cell = [
                    td.get_text(strip=True) for td in row.find_all(['th', 'td'])
                ]
                # row
                dict_row = self._parse_cells(list_str_cell, int_game)
                if dict_row is None:
                    # skip rows without a parsable score; game numbers stay contiguous
                    continue
                # append
                list_dict_row.append(dict_row)
                int_game += 1
            # sleep
            time.sleep(self.INT_SLEEP_SECONDS)
        # create df (explicit columns keep the schema stable even with zero rows)
        df = pd.DataFrame(list_dict_row, columns=self.LIST_STR_COL)

        # save to object
        self.df = df

#### Constants

In [3]:
# directory (relative to the notebook) that receives the saved csv
str_dirname_output = './output'

# request headers sent with every page fetch; the User-Agent is a desktop Chrome on Windows
BROWSER_HEADERS = dict([
    ("User-Agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/123.0.0.0 Safari/537.36"),
    ("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8"),
    ("Accept-Language", "en-US,en;q=0.9"),
    ("Referer", "https://www.basketball-reference.com/"),
    ("Upgrade-Insecure-Requests", "1"),
])

#### Create output directory

In [4]:
# create the output directory (including missing parents); a no-op when the
# directory already exists, but still raises if the path is an existing file
os.makedirs(str_dirname_output, exist_ok=True)

#### Initialize class

In [5]:
# scraper instance for the season whose schedule urls contain "NBA_2026"
cls_scrape_and_prep = ScrapeAndPrep(int_year=2026)

#### Scrape html

In [6]:
# fetch the schedule pages over the network; the result is stored on the
# instance as `.df` (the call itself returns nothing)
cls_scrape_and_prep.scrape_site()
# show the scraped games (bare last expression -> rich DataFrame display)
cls_scrape_and_prep.df

  0%|          | 0/4 [00:00<?, ?it/s]

october



100%|██████████| 80/80 [00:00<00:00, 18366.87it/s]
 25%|██▌       | 1/4 [00:02<00:06,  2.14s/it]

november



100%|██████████| 219/219 [00:00<00:00, 19971.57it/s]
 50%|█████     | 2/4 [00:04<00:04,  2.18s/it]

december



100%|██████████| 198/198 [00:00<00:00, 19749.64it/s]
 75%|███████▌  | 3/4 [00:06<00:02,  2.22s/it]

january



100%|██████████| 233/233 [00:00<00:00, 19864.08it/s]
100%|██████████| 4/4 [00:08<00:00,  2.21s/it]


Unnamed: 0,game,date,home_team,home_pts,away_team,away_pts,home_win,away_win
0,1,"Tue, Oct 21, 2025",Oklahoma City Thunder,125,Houston Rockets,124,1,0
1,2,"Tue, Oct 21, 2025",Los Angeles Lakers,109,Golden State Warriors,119,0,1
2,3,"Wed, Oct 22, 2025",Charlotte Hornets,136,Brooklyn Nets,117,1,0
3,4,"Wed, Oct 22, 2025",New York Knicks,119,Cleveland Cavaliers,111,1,0
4,5,"Wed, Oct 22, 2025",Orlando Magic,125,Miami Heat,121,1,0
...,...,...,...,...,...,...,...,...
725,726,"Sat, Jan 31, 2026",Indiana Pacers,129,Atlanta Hawks,124,1,0
726,727,"Sat, Jan 31, 2026",Philadelphia 76ers,124,New Orleans Pelicans,114,1,0
727,728,"Sat, Jan 31, 2026",Memphis Grizzlies,114,Minnesota Timberwolves,131,0,1
728,729,"Sat, Jan 31, 2026",Miami Heat,118,Chicago Bulls,125,0,1


#### Save

In [7]:
# destination file inside the output directory; os.path.join uses the
# platform's separator instead of a hard-coded '/'
str_filename = 'df.csv'
str_local_path = os.path.join(str_dirname_output, str_filename)
# write the scraped games; index=False keeps the positional index out of the csv
cls_scrape_and_prep.df.to_csv(
    str_local_path,
    index=False,
)