In [5]:
import requests

In [6]:
standings_url = "https://www.basketball-reference.com/leagues/NBA_2024_standings.html"

In [474]:
data = requests.get(standings_url)

from datetime import timedelta

# See how much time I have left to make another request
if data.status_code == 429:
    print("Too many requests, try again after", str(timedelta(seconds = int(data.headers["Retry-After"]))), "hours")

In [6]:
from bs4 import BeautifulSoup

In [444]:
soup = BeautifulSoup(data.text)

In [386]:
standings_tables = soup.find_all('table')[:2]
# [0] -> Eastern Conference
# [1] -> Western Conference

In [388]:
links = [standings_tables[0].find_all('a'), standings_tables[1].find_all('a')]

In [389]:
links = [l for conference in links for l in conference]

In [390]:
links = [l.get("href") for l in links]

In [391]:
team_urls = [f"https://basketball-reference.com{l[:-5]}_games.html" for l in links]

In [394]:
team_url = team_urls[0]

In [395]:
data = requests.get(team_url)

In [411]:
import pandas as pd

games = pd.read_html(data.text, match="Regular Season Table")

In [13]:
from datetime import datetime

def convert_to_short_date(date_string):
    try:
        date_object = datetime.strptime(date_string, "%a, %b %d, %Y")
        short_date_string = date_object.strftime("%Y-%m-%d")
        return short_date_string
    except ValueError:
        return date_string

In [15]:
columns_to_drop = ['Unnamed: 3', 'Unnamed: 4', 'Unnamed: 5', 'Unnamed: 7', 'Unnamed: 8', 'Notes']

In [434]:
for game in games:
    if 'Date' in game.columns:
        game['Date'] = game['Date'].apply(convert_to_short_date)
    # Find indexes where the "Date" column's value is "Date"
    indexes_to_drop = game[game["Date"] == "Date"].index
    # Drop these indexes
    game.drop(indexes_to_drop, inplace=True)
    game.reset_index(drop=True, inplace=True)

In [435]:
games[0]

Unnamed: 0,G,Date,Start (ET),Unnamed: 3,Unnamed: 4,Unnamed: 5,Opponent,Unnamed: 7,Unnamed: 8,Tm,Opp,W,L,Streak,Notes
0,1,2023-10-25,7:00p,,Box Score,@,New York Knicks,W,,108,104,1,0,W 1,
1,2,2023-10-27,7:30p,,Box Score,,Miami Heat,W,,119,111,2,0,W 2,
2,3,2023-10-30,7:00p,,Box Score,@,Washington Wizards,W,,126,107,3,0,W 3,
3,4,2023-11-01,7:30p,,Box Score,,Indiana Pacers,W,,155,104,4,0,W 4,
4,5,2023-11-04,8:00p,,Box Score,@,Brooklyn Nets,W,,124,114,5,0,W 5,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
77,78,2024-04-07,6:00p,,,,Portland Trail Blazers,,,,,,,,
78,79,2024-04-09,7:30p,,,@,Milwaukee Bucks,,,,,,,,
79,80,2024-04-11,7:30p,,,,New York Knicks,,,,,,,,
80,81,2024-04-12,7:30p,,,,Charlotte Hornets,,,,,,,,


In [418]:
soup = BeautifulSoup(data.text)

In [419]:
links = soup.find_all('a')

In [420]:
links  = [l.get("href") for l in links]

In [421]:
links = [l for l in links if l and '/gamelog/' in l]

In [422]:
links

['/teams/BOS/2024/gamelog/',
 '/teams/BOS/2024/gamelog/',
 '/teams/BOS/2024/gamelog/']

In [423]:
data = requests.get(f"https://basketball-reference.com{links[-1]}")

In [424]:
stats = pd.read_html(data.text, match="Regular Season Table")[0]

In [425]:
mask = stats.columns.get_level_values(0) != 'Opponent'
stats = stats.loc[:, mask]
# Remove all opponent data

In [426]:
stats.columns = stats.columns.droplevel()
# Remove hierarchy of columns
stats = stats.drop("Unnamed: 24_level_1", axis=1)

In [464]:
stats.head()

Unnamed: 0,Rk,G,Date,Unnamed: 3_level_1,Opp,W/L,Tm,Opp.1,FG,FGA,FG%,3P,3PA,3P%,FT,FTA,FT%,ORB,TRB,AST,STL,BLK,TOV,PF
0,1,1,2023-10-25,,DET,W,103,102,37,92,0.402,8,22,0.364,21,26,0.808,16,48,22,11,3,7,18
1,2,2,2023-10-27,@,BOS,L,111,119,38,89,0.427,16,33,0.485,19,23,0.826,11,45,24,8,6,12,16
2,3,3,2023-10-28,@,MIN,L,90,106,35,87,0.402,12,38,0.316,8,14,0.571,9,42,24,6,1,14,12
3,4,4,2023-10-30,@,MIL,L,114,122,41,86,0.477,13,34,0.382,19,20,0.95,7,40,27,7,2,14,20
4,5,5,2023-11-01,,BRK,L,105,109,39,91,0.429,13,38,0.342,14,18,0.778,11,46,22,10,5,16,20


In [429]:
team_data = games[0].merge(stats[["Date", "W/L", "FG", "FGA", "3P", "3PA", "3P%", "FT", "FTA", "FT%", "ORB", "TRB", "AST", "STL", "BLK", "TOV", "PF"]], on="Date")

In [436]:
team_data = team_data.drop(columns_to_drop, axis=1)

In [437]:
team_data.head()

Unnamed: 0,G,Date,Start (ET),Opponent,Tm,Opp,W,L,Streak,W/L,FG,FGA,3P,3PA,3P%,FT,FTA,FT%,ORB,TRB,AST,STL,BLK,TOV,PF
0,1,2023-10-25,7:00p,New York Knicks,108,104,1,0,W 1,W,37,77,12,39,0.308,22,26,0.846,7,46,18,6,11,13,22
1,2,2023-10-27,7:30p,Miami Heat,119,111,2,0,W 2,W,45,95,16,39,0.41,13,19,0.684,16,55,20,7,6,15,19
2,3,2023-10-30,7:00p,Washington Wizards,126,107,3,0,W 3,W,51,102,19,53,0.358,5,7,0.714,15,51,31,11,6,17,21
3,4,2023-11-01,7:30p,Indiana Pacers,155,104,4,0,W 4,W,54,95,20,35,0.571,27,28,0.964,11,57,27,5,2,11,19
4,5,2023-11-04,8:00p,Brooklyn Nets,124,114,5,0,W 5,W,43,90,15,45,0.333,23,27,0.852,10,50,22,4,6,11,17


In [467]:
team_data.shape

(54, 33)

In [12]:
abbrev_to_name = {
    'ATL': 'Atlanta Hawks',
    'BOS': 'Boston Celtics',
    'BRK': 'Brooklyn Nets',
    'CHO': 'Charlotte Hornets',
    'CHI': 'Chicago Bulls',
    'CLE': 'Cleveland Cavaliers',
    'DAL': 'Dallas Mavericks',
    'DEN': 'Denver Nuggets',
    'DET': 'Detroit Pistons',
    'GSW': 'Golden State Warriors',
    'HOU': 'Houston Rockets',
    'IND': 'Indiana Pacers',
    'LAC': 'Los Angeles Clippers',
    'LAL': 'Los Angeles Lakers',
    'MEM': 'Memphis Grizzlies',
    'MIA': 'Miami Heat',
    'MIL': 'Milwaukee Bucks',
    'MIN': 'Minnesota Timberwolves',
    'NOP': 'New Orleans Pelicans',
    'NYK': 'New York Knicks',
    'OKC': 'Oklahoma City Thunder',
    'ORL': 'Orlando Magic',
    'PHI': 'Philadelphia 76ers',
    'PHO': 'Phoenix Suns',
    'POR': 'Portland Trail Blazers',
    'SAC': 'Sacramento Kings',
    'SAS': 'San Antonio Spurs',
    'TOR': 'Toronto Raptors',
    'UTA': 'Utah Jazz',
    'WAS': 'Washington Wizards'
}

In [3]:
# Now we start the real stuff
years = list(range(2024, 2022, -1))

In [8]:
years

[2024, 2023]

In [21]:
# MAKE SURE TO RESET THE FOLLOWING TWO VARIABLES BEFORE RUNNING SCRAPING SCRIPT
all_games = []

In [22]:
standings_url = "https://www.basketball-reference.com/leagues/NBA_2024_standings.html"

In [23]:
import requests
from bs4 import BeautifulSoup
import time
import pandas as pd
from datetime import timedelta

for year in years:
    data = requests.get(standings_url)
    if data.status_code == 429:
        print("Oops 429 error try again after", str(timedelta(seconds = int(data.headers["Retry-After"]))), "hours")
        # quit()
        time.sleep(int(data.headers["Retry-After"]) + 10)
            
    soup = BeautifulSoup(data.text)
    standings_tables = soup.select('table.stats_table')[:2]
    # [0] -> Eastern Conference
    # [1] -> Western Conference
    
    links = [standings_tables[0].find_all('a'), standings_tables[1].find_all('a')]
    links = [l for conference in links for l in conference]
    links = [l.get("href") for l in links]
    team_urls = [f"https://basketball-reference.com{l[:-5]}_games.html" for l in links]
    
    previous_season = soup.select("a.prev")[0].get("href")
    standings_url = f"https://basketball-reference.com{previous_season}"
    
    for team_url in team_urls:
        team_name = abbrev_to_name[team_url.split("/")[-2]]
        
        data = requests.get(team_url)
        if data.status_code == 429:
            print("Oops 429 error try again after", str(timedelta(seconds = int(data.headers["Retry-After"]))), "hours")
            # quit()
            time.sleep(int(data.headers["Retry-After"]) + 10)
        
        games = pd.read_html(data.text, match="Regular Season Table")
        for team_games in games:
            if 'Date' in team_games.columns:
                team_games['Date'] = team_games['Date'].apply(convert_to_short_date)
            
            team_games = team_games.drop(columns_to_drop, axis=1)
            # Find indexes where the "Date" column's value is "Date"
            indexes_to_drop = team_games[team_games["Date"] == "Date"].index
            # Drop these indexes
            team_games.drop(indexes_to_drop, inplace=True)
            team_games.reset_index(drop=True, inplace=True)
                
        soup = BeautifulSoup(data.text)
        links = soup.find_all('a')
        links  = [l.get("href") for l in links]
        links = [l for l in links if l and '/gamelog/' in l]
        
        time.sleep(4)
        data = requests.get(f"https://basketball-reference.com{links[-1]}")
        if data.status_code == 429:
            print("Oops 429 error try again after", str(timedelta(seconds = int(data.headers["Retry-After"]))), "hours")
            # quit()
            time.sleep(int(data.headers["Retry-After"]) + 10)
        
        stats = pd.read_html(data.text, match="Regular Season Table")[0]
        # Remove all opponent data
        mask = stats.columns.get_level_values(0) != 'Opponent'
        stats = stats.loc[:, mask]
        # Remove hierarchy of columns
        stats.columns = stats.columns.droplevel()
        # Remove unnecessary unnamed columns
        stats = stats.drop("Unnamed: 24_level_1", axis=1)
        
        try: 
            team_data = games[0].merge(stats[["Date", "W/L", "FG", "FGA", "3P", "3PA", "3P%", "FT", "FTA", "FT%", "ORB", "TRB", "AST", "STL", "BLK", "TOV", "PF"]], on="Date")
        except ValueError:
            continue
            
        team_data["Season"] = year
        team_data["Team"] = team_name
        all_games.append(team_data)
        time.sleep(4)

In [24]:
game_df = pd.concat(all_games)
pd.set_option('display.max_columns', None)

In [25]:
game_df

Unnamed: 0,G,Date,Start (ET),Unnamed: 3,Unnamed: 4,Unnamed: 5,Opponent,Unnamed: 7,Unnamed: 8,Tm,Opp,W,L,Streak,Notes,W/L,FG,FGA,3P,3PA,3P%,FT,FTA,FT%,ORB,TRB,AST,STL,BLK,TOV,PF,Season,Team
0,1,2023-10-25,7:00p,,Box Score,@,New York Knicks,W,,108,104,1,0,W 1,,W,37,77,12,39,.308,22,26,.846,7,46,18,6,11,13,22,2024,Boston Celtics
1,2,2023-10-27,7:30p,,Box Score,,Miami Heat,W,,119,111,2,0,W 2,,W,45,95,16,39,.410,13,19,.684,16,55,20,7,6,15,19,2024,Boston Celtics
2,3,2023-10-30,7:00p,,Box Score,@,Washington Wizards,W,,126,107,3,0,W 3,,W,51,102,19,53,.358,5,7,.714,15,51,31,11,6,17,21,2024,Boston Celtics
3,4,2023-11-01,7:30p,,Box Score,,Indiana Pacers,W,,155,104,4,0,W 4,,W,54,95,20,35,.571,27,28,.964,11,57,27,5,2,11,19,2024,Boston Celtics
4,5,2023-11-04,8:00p,,Box Score,@,Brooklyn Nets,W,,124,114,5,0,W 5,,W,43,90,15,45,.333,23,27,.852,10,50,22,4,6,11,17,2024,Boston Celtics
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
93,78,2023-04-02,6:00p,,Box Score,@,Sacramento Kings,W,OT,142,134,20,58,W 1,,W,52,98,13,34,.382,25,29,.862,14,52,31,8,3,17,25,2023,San Antonio Spurs
94,79,2023-04-04,10:00p,,Box Score,@,Phoenix Suns,L,,94,115,20,59,L 1,,L,35,90,13,40,.325,11,14,.786,9,38,25,7,5,12,16,2023,San Antonio Spurs
95,80,2023-04-06,8:00p,,Box Score,,Portland Trail Blazers,W,,129,127,21,59,W 1,,W,47,106,10,35,.286,25,28,.893,18,47,29,9,4,14,12,2023,San Antonio Spurs
96,81,2023-04-08,4:00p,,Box Score,,Minnesota Timberwolves,L,,131,151,21,60,L 1,,L,50,95,13,29,.448,18,24,.750,10,45,34,5,4,13,16,2023,San Antonio Spurs


In [26]:
# Format df
columns_to_drop = ['Unnamed: 3', 'Unnamed: 4', 'Unnamed: 7', 'Unnamed: 8', 'Notes']

# Remove rubbish columns
game_df = game_df.drop(columns_to_drop, axis=1)
# Remove rubbish rows (placeholders from website)
mask = game_df["Date"] == "Date"
game_df = game_df[~mask].reset_index(drop=True)

# Fix venue column
game_df = game_df.rename(columns={'Unnamed: 5': 'Venue'})

import numpy as np

is_away = game_df["Venue"] == "@"

game_df["Venue"] = np.where(is_away, "Away", "Home")

In [27]:
game_df

Unnamed: 0,G,Date,Start (ET),Venue,Opponent,Tm,Opp,W,L,Streak,W/L,FG,FGA,3P,3PA,3P%,FT,FTA,FT%,ORB,TRB,AST,STL,BLK,TOV,PF,Season,Team
0,1,2023-10-25,7:00p,Away,New York Knicks,108,104,1,0,W 1,W,37,77,12,39,.308,22,26,.846,7,46,18,6,11,13,22,2024,Boston Celtics
1,2,2023-10-27,7:30p,Home,Miami Heat,119,111,2,0,W 2,W,45,95,16,39,.410,13,19,.684,16,55,20,7,6,15,19,2024,Boston Celtics
2,3,2023-10-30,7:00p,Away,Washington Wizards,126,107,3,0,W 3,W,51,102,19,53,.358,5,7,.714,15,51,31,11,6,17,21,2024,Boston Celtics
3,4,2023-11-01,7:30p,Home,Indiana Pacers,155,104,4,0,W 4,W,54,95,20,35,.571,27,28,.964,11,57,27,5,2,11,19,2024,Boston Celtics
4,5,2023-11-04,8:00p,Away,Brooklyn Nets,124,114,5,0,W 5,W,43,90,15,45,.333,23,27,.852,10,50,22,4,6,11,17,2024,Boston Celtics
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3823,78,2023-04-02,6:00p,Away,Sacramento Kings,142,134,20,58,W 1,W,52,98,13,34,.382,25,29,.862,14,52,31,8,3,17,25,2023,San Antonio Spurs
3824,79,2023-04-04,10:00p,Away,Phoenix Suns,94,115,20,59,L 1,L,35,90,13,40,.325,11,14,.786,9,38,25,7,5,12,16,2023,San Antonio Spurs
3825,80,2023-04-06,8:00p,Home,Portland Trail Blazers,129,127,21,59,W 1,W,47,106,10,35,.286,25,28,.893,18,47,29,9,4,14,12,2023,San Antonio Spurs
3826,81,2023-04-08,4:00p,Home,Minnesota Timberwolves,131,151,21,60,L 1,L,50,95,13,29,.448,18,24,.750,10,45,34,5,4,13,16,2023,San Antonio Spurs


In [599]:
game_df.to_csv("2023-2024 data V2.csv")