# Web scraping NBA transaction data from [basketball-reference.com](https://www.basketball-reference.com/)

In [139]:
# import bs4
from bs4 import BeautifulSoup
import requests
import csv
import re
import os
from time import sleep
import numpy as np
import pandas as pd
# import ast

### HTML files

In [447]:
# URL template for one year's transactions page; "{}" is filled with a value from `years`.
url_blank = "https://www.basketball-reference.com/leagues/NBA_{}_transactions.html"

# Years to process: 1991 through 2024 inclusive.
years = [*range(1991, 2025)]

In [196]:
# save HTML files for each year

# make sure the target folder exists before the first write
os.makedirs("./Data/transaction_htmls", exist_ok=True)

for year in years:
    url = url_blank.format(year)
    response = requests.get(url, timeout=30)
    # Stop on an HTTP error status instead of saving an error page to disk,
    # where it would only surface later as a parsing failure.
    response.raise_for_status()

    with open("./Data/transaction_htmls/transactions_{}.html".format(year), "w+") as f:
        f.write(response.text)

    # pause between requests (same randomized delay used for the player-page scrape)
    sleep(np.random.uniform(3.0, 4.0))

### Waived players data

In [448]:
# extract waived data from HTMLs and save as a CSV file
# (one row per transaction entry whose text contains "waived")

player_href = re.compile("/players/")

with open("./Data/waived_data.csv", "w+", newline="") as f:
    writer = csv.writer(f)
    writer.writerow(["season_start", "date", "player_name", "bbref_id", "team_abv"])

    for year in years:
        html_path = "./Data/transaction_htmls/transactions_{}.html".format(year)
        with open(html_path) as h:
            soup = BeautifulSoup(h.read(), "html.parser")

        page_index = soup.find("ul", attrs={"class":"page_index"})

        for entry in page_index.find_all("p"):
            if "waived" not in entry.get_text():
                continue

            date_text = entry.parent.find("span").get_text()
            # first player link in the entry supplies both the name and the bbref id
            player_link = entry.find("a", attrs={"href": player_href})
            player_name = player_link.get_text()
            bbref_id = os.path.basename(player_link["href"]).removesuffix(".html")
            team_abv = entry.find("a", attrs={"data-attr-from": True})["data-attr-from"]

            # the page for `year` is recorded under season_start = year - 1
            writer.writerow([year - 1, date_text, player_name, bbref_id, team_abv])

### Traded players data

In [449]:
# helper function for extracting each traded player (excluding draft picks) for a given entry and writing to the CSV file

def write_trades(entry, season_start, csv_writer):
    """Write one CSV row for each player link inside a trade entry.

    The entry's ``<a>`` tags are visited in document order. A tag carrying a
    ``data-attr-from`` or ``data-attr-to`` attribute updates the "current team";
    a tag whose ``href`` contains ``/players/`` produces a row tagged with the
    most recently seen team. A player link is skipped when the text node right
    after it contains "was later selected".

    Args:
        entry: parsed element for one transaction; must provide ``find_all`` and
            a ``parent`` whose first ``span`` holds the date text.
        season_start: value written to the first column of every row.
        csv_writer: object with a ``writerow`` method (e.g. ``csv.writer``).

    Each row is ``[season_start, date text, link text, bbref id, team]``, where
    the bbref id is the basename of the link's ``href`` without ``.html``. The
    team is ``None`` when a player link appears before any team link.
    """
    # Start with no team so a player link that comes first cannot raise
    # UnboundLocalError.
    current_team = None

    for tag in entry.find_all("a"):
        if "data-attr-from" in tag.attrs:
            current_team = tag["data-attr-from"]
        elif "data-attr-to" in tag.attrs:
            current_team = tag["data-attr-to"]
        elif ("href" in tag.attrs) and ("/players/" in tag["href"]):
            # next_sibling is None when the link is the last node of the entry,
            # and may be another element rather than text; only a text node can
            # carry the "was later selected" marker.
            trailing = tag.next_sibling
            if isinstance(trailing, str) and "was later selected" in trailing:
                continue

            csv_writer.writerow([season_start,
                                 entry.parent.find("span").get_text(),
                                 tag.get_text(),
                                 os.path.basename(tag["href"]).removesuffix(".html"),
                                 current_team])

In [450]:
# extract traded data from HTMLs and save as a CSV file
# (every entry whose text contains "traded" is handed to write_trades)

with open("./Data/traded_data.csv", "w+", newline="") as f:
    writer = csv.writer(f)
    writer.writerow(["season_start", "date", "player_name", "bbref_id", "team_abv"])

    for year in years:
        html_path = "./Data/transaction_htmls/transactions_{}.html".format(year)
        with open(html_path) as h:
            soup = BeautifulSoup(h.read(), "html.parser")

        page_index = soup.find("ul", attrs={"class":"page_index"})

        trade_entries = [p for p in page_index.find_all("p") if "traded" in p.get_text()]
        for entry in trade_entries:
            # the page for `year` is recorded under season_start = year - 1
            write_trades(entry, year - 1, writer)

### Check for coaches and executives

In [451]:
# check for players labelled as coaches or executives
# (prints the year and the matching links for every waived/traded entry that has any)

staff_link_patterns = (re.compile("/coaches/"), re.compile("/executives/"))

for year in years:
    with open("./Data/transaction_htmls/transactions_{}.html".format(year)) as f:
        soup = BeautifulSoup(f.read(), "html.parser")

    entries = soup.find("ul", attrs={"class":"page_index"}).find_all("p")

    # waived entries are reported first, then traded entries
    for keyword, label in (("waived", "(waived)"), ("traded", "(traded)")):
        for entry in entries:
            if keyword not in entry.get_text():
                continue
            for pattern in staff_link_patterns:
                staff_links = entry.find_all("a", attrs={"href": pattern})
                if len(staff_links) > 0:
                    print(year, staff_links, label)


1992 [<a href="/coaches/dunlemi01c.html">Mike Dunleavy</a>] (traded)
1996 [<a href="/coaches/rileypa01c.html">Pat Riley</a>] (traded)
1997 [<a href="/executives/wallach99x.html">Chris Wallace</a>] (traded)
2014 [<a href="/coaches/kiddja01c.html">Jason Kidd</a>] (traded)
2018 [<a href="/executives/perrysc99x.html">Scott Perry</a>] (traded)


In [452]:
# manually enter relevant data

# Rows appended to traded_data.csv; the two commented-out rows are not written.
manual_rows = [
    [1991, "May 11, 1992", "Mike Dunleavy", "dunlemi01", "LAL"],
    [1995, "September 1, 1995", "Pat Riley", "rileypa01", "NYK"],
    # [1996, "May 27, 1997", "Chris Wallace", "wallach99", "BOS"],
    [2013, "June 30, 2014", "Jason Kidd", "kiddja01", "BRK"],
    # [2017, "July 14, 2017", "Scott Perry", "perrysc99", "SAC"],
]

with open("./Data/traded_data.csv", "a", newline="") as f:
    csv.writer(f).writerows(manual_rows)

### Concatenating waived and traded data

In [453]:
# concatenate data using DataFrames

# Each file gets 0/1 indicator columns so its rows stay identifiable after stacking.
waived_df = pd.read_csv("./Data/waived_data.csv").assign(waived=1, traded=0)
traded_df = pd.read_csv("./Data/traded_data.csv").assign(waived=0, traded=1)

transaction_df = pd.concat([waived_df, traded_df], ignore_index=True)

In [454]:
# decoding player names with special characters

def _repair_mojibake(name):
    """Re-encode ``name`` as Latin-1 and decode it as UTF-8, if that round trip works.

    Values that are not strings (e.g. missing names) and strings that cannot make
    the round trip are returned unchanged. That makes this cell safe to re-run:
    a name that has already been repaired no longer raises
    UnicodeEncodeError/UnicodeDecodeError.
    """
    if not isinstance(name, str):
        return name
    try:
        return name.encode("latin-1").decode("utf-8")
    except UnicodeError:
        return name

transaction_df["player_name"] = transaction_df["player_name"].map(_repair_mojibake)

In [455]:
# preview 10 randomly sampled rows (no random_state is passed, so the rows differ between runs)
transaction_df.sample(10)

Unnamed: 0,season_start,date,player_name,bbref_id,team_abv,waived,traded
2355,2014,"January 7, 2015",Jeff Adrien,adrieje01,MIN,1,0
4743,2023,"October 21, 2023",Kennedy Chandler,chandke01,BRK,1,0
3332,2018,"November 4, 2018",Tyson Chandler,chandty01,PHO,1,0
7947,2019,"July 7, 2019",Marcos Louzada Silva,louzama01,ATL,0,1
2656,2016,"October 5, 2016",Jabari Brown,brownja01,MIL,1,0
8176,2020,"March 25, 2021",Chandler Hutchison,hutchch01,CHI,0,1
1743,2010,"February 24, 2011",Derrick Brown,brownde04,CHA,1,0
7284,2014,"August 6, 2014",Quincy Acy,acyqu01,SAC,0,1
5639,2000,"January 2, 2001",Anthony Johnson,johnsan02,ATL,0,1
8391,2021,"June 24, 2022",Wendell Moore Jr.,moorewe01,HOU,0,1


In [456]:
# save transaction data as a CSV file
# (index=False leaves the DataFrame's row index out of the file)

transaction_df.to_csv("./Data/transaction_data.csv", index=False)

### Adding player IDs

In [446]:
%pip install nba_api
from nba_api.stats.static import players

Note: you may need to restart the kernel to use updated packages.


In [458]:
# load transaction data, add player_id column

# player_id starts out as None on every row
transaction_df = pd.read_csv("Data/transaction_data.csv").assign(player_id=None)
transaction_df

Unnamed: 0,season_start,date,player_name,bbref_id,team_abv,waived,traded,player_id
0,1990,"July 5, 1990",Anthony Bowie,bowiean01,HOU,1,0,
1,1990,"July 9, 1990",Randy Allen,allenra01,SAC,1,0,
2,1990,"July 9, 1990",Michael Jackson,jacksmi02,SAC,1,0,
3,1990,"July 24, 1990",Melvin Turpin,turpime01,WSB,1,0,
4,1990,"August 10, 1990",Scott Haffner,haffnsc01,MIA,1,0,
...,...,...,...,...,...,...,...,...
8672,2023,"June 26, 2024",Ryan Dunn,dunnry01,DEN,0,1,
8673,2023,"June 26, 2024",DaRon Holmes,holmeda01,PHO,0,1,
8674,1991,"May 11, 1992",Mike Dunleavy,dunlemi01,LAL,0,1,
8675,1995,"September 1, 1995",Pat Riley,rileypa01,NYK,0,1,


In [459]:
# load ID data from NBA_Player_IDs.csv
# (obtained from https://github.com/djblechn-su/nba-player-team-ids)
# The file is read with latin-1 encoding; its BBRefID and NBAID columns are the
# ones the next lookup step relies on.

nba_player_ids_df = pd.read_csv("Data/NBA_Player_IDs.csv", encoding="latin-1")
nba_player_ids_df

Unnamed: 0,BBRefName,BBRefLink,BBRefID,BBRefBirthDate,NBAName,NBALink,NBAID,NBABirthDate,ESPNName,ESPNLink,ESPNID,ESPNBirthDate,SpotracName,SpotracLink,SpotracID
0,A.J. Hammons,https://www.basketball-reference.com/players/h...,hammoaj01,8/27/1992,AJ Hammons,https://stats.nba.com/player/1627773/,1627773.0,8/27/1992,AJ Hammons,http://www.espn.com/nba/player/_/id/2991178/aj...,2991178.0,8/27/1992,A.J. Hammons,https://www.spotrac.com/redirect/player/20252/,20252.0
1,A.J. Price,https://www.basketball-reference.com/players/p...,priceaj01,10/7/1986,AJ Price,https://stats.nba.com/player/201985/,201985.0,10/7/1986,A.J. Price,http://www.espn.com/nba/player/_/id/4010/aj-price,4010.0,10/7/1986,A.J. Price,https://www.spotrac.com/redirect/player/6292/,6292.0
2,Aaron Brooks,https://www.basketball-reference.com/players/b...,brookaa01,1/14/1985,Aaron Brooks,https://stats.nba.com/player/201166/,201166.0,1/14/1985,Aaron Brooks,http://www.espn.com/nba/player/_/id/3192/aaron...,3192.0,1/14/1985,Aaron Brooks,https://www.spotrac.com/redirect/player/2390/,2390.0
3,Aaron Gordon,https://www.basketball-reference.com/players/g...,gordoaa01,9/16/1995,Aaron Gordon,https://stats.nba.com/player/203932/,203932.0,9/16/1995,Aaron Gordon,http://www.espn.com/nba/player/_/id/3064290/aa...,3064290.0,9/16/1995,Aaron Gordon,https://www.spotrac.com/redirect/player/15356/,15356.0
4,Aaron Gray,https://www.basketball-reference.com/players/g...,grayaa01,12/7/1984,Aaron Gray,https://stats.nba.com/player/201189/,201189.0,12/7/1984,Aaron Gray,http://www.espn.com/nba/player/_/id/3207/aaron...,3207.0,12/7/1984,Aaron Gray,https://www.spotrac.com/redirect/player/2244/,2244.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4608,,,,,,,,,,,,,Cam Reddish,https://www.spotrac.com/redirect/player/31567/,31567.0
4609,,,,,,,,,,,,,Charlie Brown Jr.,https://www.spotrac.com/redirect/player/31896/,31896.0
4610,,,,,,,,,,,,,Louis King,https://www.spotrac.com/redirect/player/32043/,32043.0
4611,,,,,,,,,,,,,Naz Reid,https://www.spotrac.com/redirect/player/32045/,32045.0


In [460]:
# first check NBA_Player_IDs.csv for player_id

for bbref_id in transaction_df["bbref_id"].unique():
    nba_ids = nba_player_ids_df.loc[nba_player_ids_df["BBRefID"] == bbref_id, "NBAID"]
    # only accept a lookup that hits exactly one row of the ID table
    if len(nba_ids) != 1:
        continue
    same_player = transaction_df["bbref_id"] == bbref_id
    transaction_df.loc[same_player, "player_id"] = int(nba_ids.item())

In [461]:
# display transaction_df to inspect the player_id column
transaction_df

Unnamed: 0,season_start,date,player_name,bbref_id,team_abv,waived,traded,player_id
0,1990,"July 5, 1990",Anthony Bowie,bowiean01,HOU,1,0,194
1,1990,"July 9, 1990",Randy Allen,allenra01,SAC,1,0,76025
2,1990,"July 9, 1990",Michael Jackson,jacksmi02,SAC,1,0,77104
3,1990,"July 24, 1990",Melvin Turpin,turpime01,WSB,1,0,78386
4,1990,"August 10, 1990",Scott Haffner,haffnsc01,MIA,1,0,76911
...,...,...,...,...,...,...,...,...
8672,2023,"June 26, 2024",Ryan Dunn,dunnry01,DEN,0,1,
8673,2023,"June 26, 2024",DaRon Holmes,holmeda01,PHO,0,1,
8674,1991,"May 11, 1992",Mike Dunleavy,dunlemi01,LAL,0,1,76616
8675,1995,"September 1, 1995",Pat Riley,rileypa01,NYK,0,1,77962


In [462]:
# check how many players are still missing player_id
# (number of distinct bbref_id values among rows whose player_id is null)

transaction_df.loc[transaction_df["player_id"].isnull(), "bbref_id"].nunique()

987

In [463]:
# next check NBA API for players with missing player_id

# rows that had no player_id when this cell started
missing_id = transaction_df["player_id"].isnull()

# Query once per distinct name instead of once per row; rows without a name are skipped.
for player_name in transaction_df.loc[missing_id, "player_name"].dropna().unique():
    # find_players_by_full_name interprets its argument as a regular expression,
    # so escape the name to have characters such as "." matched literally.
    matches = players.find_players_by_full_name(re.escape(player_name))
    # only accept an unambiguous result
    if len(matches) == 1:
        same_name = missing_id & (transaction_df["player_name"] == player_name)
        transaction_df.loc[same_name, "player_id"] = matches[0]["id"]

In [464]:
# check how many players are still missing player_id
# (number of distinct bbref_id values among rows whose player_id is null)

transaction_df.loc[transaction_df["player_id"].isnull(), "bbref_id"].nunique()

606

In [465]:
# scrape individual player webpages to find out experience/career length,
# drop rookies who have never played in the NBA

rookie_bbref_ids = []

for bbref_id in transaction_df[transaction_df["player_id"].isnull()]["bbref_id"].unique():
    url = f"https://www.basketball-reference.com/players/{bbref_id[0]}/{bbref_id}.html"
    try:
        response = requests.get(url, timeout=30)
        # treat HTTP error statuses as failures instead of parsing an error page
        response.raise_for_status()
    except requests.RequestException as err:
        # report the failure and keep going; this player's rows are left in place
        print("Error:", bbref_id, err)
        continue
    finally:
        # pause after every request, whether it succeeded or not
        sleep(np.random.uniform(3.0, 4.0))

    soup = BeautifulSoup(response.content, "html.parser")

    # Use the "Experience:" field when it is present with a text value;
    # otherwise fall back to "Career Length:".
    for label in ("Experience:", "Career Length:"):
        label_tag = soup.find("strong", string=re.compile(label))
        value = label_tag.next_sibling if label_tag is not None else None
        if not isinstance(value, str):
            continue
        if "Rookie" in value:
            rookie_bbref_ids.append(bbref_id)
        break

# remove every row belonging to a player flagged as "Rookie" above
transaction_df = transaction_df.drop(
    index=transaction_df.index[transaction_df["bbref_id"].isin(rookie_bbref_ids)]
)

In [466]:
# check how many players are still missing player_id
# (number of distinct bbref_id values among rows whose player_id is null)

transaction_df.loc[transaction_df["player_id"].isnull(), "bbref_id"].nunique()

29

In [468]:
# check which players still have missing player_id
# (distinct bbref_id values among rows whose player_id is null)

transaction_df.loc[transaction_df["player_id"].isnull(), "bbref_id"].unique()

array(['pasecan01', 'scotttr01', 'maneka01', 'hurtma01', 'smailal01',
       'vildolu01', 'samanlu01', 'wainris01', 'walkemj01', 'smartja01',
       'willibr03', 'willije02', 'johnsda08', 'louzama01', 'primojo01',
       'krejcvi01', 'mooreta02', 'harpero02', 'loftoke01', 'mensana01',
       'vanguje99c', 'vangust99c', 'tillmxa01', 'martike04', 'maledth01',
       'mellini01', 'sengual01', 'washity02', 'barrerj01'], dtype=object)

In [469]:
# manually enter missing player_id for each player above

# bbref_id -> player_id, entered by hand; two entries map to None
bbref_to_nba_id_dict = {
    'pasecan01': 1628394, 'scotttr01': 1630286, 'maneka01': 1630211, 'hurtma01': 1630562, 'smailal01': 1629346,
    'vildolu01': 1630492, 'samanlu01': 1629677, 'wainris01': 1630688, 'walkemj01': 1630640, 'smartja01': 1630606,
    'willibr03': 1630314, 'willije02': 1631466, 'johnsda08': 1630525, 'louzama01': 1629712, 'primojo01': 1630563,
    'krejcvi01': 1630249, 'mooreta02': 1631386, 'harpero02': 1631199, 'loftoke01': 1631254, 'mensana01': 1641877,
    'vanguje99c': None, 'vangust99c': None, 'tillmxa01': 1630214, 'martike04': 1630231, 'maledth01': 1630177,
    'mellini01': 1629740, 'sengual01': 1630578, 'washity02': 1631102, 'barrerj01': 1629628,
}

for bbref_id, nba_id in bbref_to_nba_id_dict.items():
    same_player = transaction_df["bbref_id"] == bbref_id
    transaction_df.loc[same_player, "player_id"] = nba_id

In [471]:
# check which players are still missing player_id
# (distinct bbref_id values among rows whose player_id is null)

transaction_df.loc[transaction_df["player_id"].isnull(), "bbref_id"].unique()

array(['vanguje99c', 'vangust99c'], dtype=object)

In [474]:
# drop rows corresponding to Jeff Van Gundy and Stan Van Gundy (coaches)

coach_bbref_ids = ["vanguje99c", "vangust99c"]
coach_rows = transaction_df.index[transaction_df["bbref_id"].isin(coach_bbref_ids)]
transaction_df.drop(index=coach_rows, inplace=True)

In [477]:
# display the current state of transaction_df
transaction_df

Unnamed: 0,season_start,date,player_name,bbref_id,team_abv,waived,traded,player_id
0,1990,"July 5, 1990",Anthony Bowie,bowiean01,HOU,1,0,194
1,1990,"July 9, 1990",Randy Allen,allenra01,SAC,1,0,76025
2,1990,"July 9, 1990",Michael Jackson,jacksmi02,SAC,1,0,77104
3,1990,"July 24, 1990",Melvin Turpin,turpime01,WSB,1,0,78386
4,1990,"August 10, 1990",Scott Haffner,haffnsc01,MIA,1,0,76911
...,...,...,...,...,...,...,...,...
8672,2023,"June 26, 2024",Ryan Dunn,dunnry01,DEN,0,1,1642346
8673,2023,"June 26, 2024",DaRon Holmes,holmeda01,PHO,0,1,1641747
8674,1991,"May 11, 1992",Mike Dunleavy,dunlemi01,LAL,0,1,76616
8675,1995,"September 1, 1995",Pat Riley,rileypa01,NYK,0,1,77962


In [479]:
# Switch the column headers to upper case; team_abv is spelled out as TEAM_ABBREVIATION.
upper_case_columns = {
    "season_start": "SEASON_START",
    "date": "DATE",
    "player_name": "PLAYER_NAME",
    "bbref_id": "BBREF_ID",
    "team_abv": "TEAM_ABBREVIATION",
    "waived": "WAIVED",
    "traded": "TRADED",
    "player_id": "PLAYER_ID",
}
transaction_df.rename(columns=upper_case_columns, inplace=True)

In [480]:
# display transaction_df to confirm the column names
transaction_df

Unnamed: 0,SEASON_START,DATE,PLAYER_NAME,BBREF_ID,TEAM_ABBREVIATION,WAIVED,TRADED,PLAYER_ID
0,1990,"July 5, 1990",Anthony Bowie,bowiean01,HOU,1,0,194
1,1990,"July 9, 1990",Randy Allen,allenra01,SAC,1,0,76025
2,1990,"July 9, 1990",Michael Jackson,jacksmi02,SAC,1,0,77104
3,1990,"July 24, 1990",Melvin Turpin,turpime01,WSB,1,0,78386
4,1990,"August 10, 1990",Scott Haffner,haffnsc01,MIA,1,0,76911
...,...,...,...,...,...,...,...,...
8672,2023,"June 26, 2024",Ryan Dunn,dunnry01,DEN,0,1,1642346
8673,2023,"June 26, 2024",DaRon Holmes,holmeda01,PHO,0,1,1641747
8674,1991,"May 11, 1992",Mike Dunleavy,dunlemi01,LAL,0,1,76616
8675,1995,"September 1, 1995",Pat Riley,rileypa01,NYK,0,1,77962


In [481]:
# rewrite transaction_data.csv with PLAYER_ID added
# (overwrites the file saved earlier; index=False leaves the row index out)

transaction_df.to_csv("./Data/transaction_data.csv", index=False)