# Web scraping NBA transaction data from [basketball-reference.com](https://www.basketball-reference.com/)

In [171]:
import bs4
from bs4 import BeautifulSoup
import requests
import csv
import re
import pandas as pd
import ast

### HTML files

In [114]:
# seasons to process: 1991 through 2024 inclusive
years = [*range(1991, 2025)]

# URL template for a transactions page; the `{}` placeholder is filled with a year
url_blank = "https://www.basketball-reference.com/leagues/NBA_{}_transactions.html"

In [196]:
# save HTML files for each year
# (to re-fetch a single season, loop over e.g. [2015] instead of years)
import time

for year in years:
    url = url_blank.format(year)
    # a timeout keeps one stalled connection from hanging the whole loop
    response = requests.get(url, timeout=30)
    # stop on 4xx/5xx (e.g. 429 rate limiting) instead of saving an error
    # page to disk as if it were transaction data
    response.raise_for_status()

    with open("./Data/transaction_htmls/transactions_{}.html".format(year), "w") as f:
        f.write(response.text)

    # pause between requests; Sports Reference reportedly blocks clients that
    # exceed about 20 requests per minute -- TODO confirm against their
    # current bot-traffic policy
    time.sleep(4)

### Waived players data

In [199]:
# build the waived-players CSV from the saved transaction pages
player_link = re.compile("/players/")

with open("./Data/waived_data.csv", "w+", newline="") as f:
    writer = csv.writer(f)
    writer.writerow(["season_start", "date", "player_name", "team_abv"])

    for year in years:
        html_path = "./Data/transaction_htmls/transactions_{}.html".format(year)
        with open(html_path) as h:
            soup = BeautifulSoup(h.read(), "html.parser")

        page_index = soup.find("ul", attrs={"class":"page_index"})

        for entry in page_index.find_all("p"):
            # only entries whose text mentions "waived" are kept
            if "waived" not in entry.get_text():
                continue

            # the date comes from the first <span> of the entry's parent element
            date_text = entry.parent.find("span").get_text()
            # first link to a player page inside the entry
            player_name = entry.find("a", attrs={"href": player_link}).get_text()
            # team code stored on the first link carrying data-attr-from
            team_abv = entry.find("a", attrs={"data-attr-from": True})["data-attr-from"]

            # season_start is recorded as the file's year minus one
            writer.writerow([year - 1, date_text, player_name, team_abv])

### Traded players data

In [173]:
# helper function for extracting each traded player (excluding draft picks) for a given entry and writing to the CSV file
def write_trades(entry, season_start, csv_writer):
    """Write one CSV row per traded player found in a transaction entry.

    The entry's links are visited in the order ``find_all("a")`` returns
    them. A link with a ``data-attr-from`` or ``data-attr-to`` attribute sets
    the current team; every later link whose ``href`` contains ``/players/``
    is written with that team. A player link is skipped when the text node
    right after it contains "was later selected".

    Args:
        entry: Element for one transaction (a BeautifulSoup tag in this
            notebook). Must provide ``find_all`` and a ``parent`` whose first
            ``span`` holds the date text.
        season_start: Value written to the first column of each row.
        csv_writer: Object with a ``writerow`` method, e.g. ``csv.writer``.

    Each row is ``[season_start, date, player name, team]``. The team is
    ``None`` (an empty CSV cell) if a player link appears before any team
    link.
    """
    # start with no team so a player link that precedes every team link is
    # still written instead of raising UnboundLocalError
    current_team = None

    for tag in entry.find_all("a"):
        if "data-attr-from" in tag.attrs:
            current_team = tag["data-attr-from"]
        elif "data-attr-to" in tag.attrs:
            current_team = tag["data-attr-to"]
        elif ("href" in tag.attrs) and ("/players/" in tag["href"]):
            following = tag.next_sibling
            # next_sibling is None when the link is the last node and may be
            # another element rather than text; only a text node can carry
            # the "was later selected" marker
            if isinstance(following, str) and "was later selected" in following:
                continue
            csv_writer.writerow([season_start, entry.parent.find("span").get_text(), tag.get_text(), current_team])

In [201]:
# build the traded-players CSV from the saved transaction pages
with open("./Data/traded_data.csv", "w+", newline="") as f:
    writer = csv.writer(f)
    writer.writerow(["season_start", "date", "player_name", "team_abv"])

    for year in years:
        html_path = "./Data/transaction_htmls/transactions_{}.html".format(year)
        with open(html_path) as h:
            soup = BeautifulSoup(h.read(), "html.parser")

        page_index = soup.find("ul", attrs={"class":"page_index"})

        # only entries whose text mentions "traded" are passed to write_trades
        trade_entries = [entry for entry in page_index.find_all("p") if "traded" in entry.get_text()]
        for entry in trade_entries:
            # season_start is recorded as the file's year minus one
            write_trades(entry, year - 1, writer)

### Check for players labelled as coaches or executives

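(The traded-data extraction above only writes rows for links to `/players/` pages, so anyone whose link points to a `/coaches/` or `/executives/` page is left out. The cell below lists those cases so they can be added to the traded data manually.)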

In [176]:
# href patterns for non-player pages, tried in this order for every entry
staff_patterns = [re.compile("/coaches/"), re.compile("/executives/")]

# keyword looked for in the entry text, paired with the label printed for it
transaction_kinds = [("waived", "(waived)"), ("traded", "(traded)")]

for year in years:
    with open("./Data/transaction_htmls/transactions_{}.html".format(year)) as f:
        soup = BeautifulSoup(f.read(), "html.parser")

    entries = soup.find("ul", attrs={"class":"page_index"}).find_all("p")

    # every waived entry is reported before any traded entry
    for keyword, label in transaction_kinds:
        for entry in entries:
            if keyword not in entry.get_text():
                continue
            for pattern in staff_patterns:
                staff_links = entry.find_all("a", attrs={"href": pattern})
                if len(staff_links) > 0:
                    print(year, staff_links, label)


1992 [<a href="/coaches/dunlemi01c.html">Mike Dunleavy</a>] (traded)
1996 [<a href="/coaches/rileypa01c.html">Pat Riley</a>] (traded)
1997 [<a href="/executives/wallach99x.html">Chris Wallace</a>] (traded)
2014 [<a href="/coaches/kiddja01c.html">Jason Kidd</a>] (traded)
2018 [<a href="/executives/perrysc99x.html">Scott Perry</a>] (traded)


In [181]:
# traded coaches/executives added by hand: season_start, date, name, team
manual_rows = [
    [1991, "May 11, 1992", "Mike Dunleavy", "LAL"],
    [1995, "September 1, 1995", "Pat Riley", "NYK"],
    [1996, "May 27, 1997", "Chris Wallace", "BOS"],
    [2013, "June 30, 2014", "Jason Kidd", "BRK"],
    [2017, "July 14, 2017", "Scott Perry", "SAC"],
]

# append mode: the rows are added after whatever the file already contains
with open("./Data/traded_data.csv", "a", newline="") as f:
    writer = csv.writer(f)
    writer.writerows(manual_rows)

### Concatenating waived and traded data

In [217]:
# load both CSVs, flag each row's transaction type, and stack them
# waived rows get waived=1 / traded=0
waived_df = pd.read_csv("./Data/waived_data.csv").assign(waived=1, traded=0)
# traded rows get waived=0 / traded=1
traded_df = pd.read_csv("./Data/traded_data.csv").assign(waived=0, traded=1)

# waived rows come first; the index is renumbered from 0
frames = [waived_df, traded_df]
transaction_df = pd.concat(frames, ignore_index=True)

In [None]:
# decoding player names with special characters
def _repair_mojibake(name):
    """Undo UTF-8 text that was decoded as Latin-1, for a single name.

    Returns ``name`` re-encoded as Latin-1 and decoded as UTF-8. Non-string
    values (e.g. NaN) and names for which that round trip fails are returned
    unchanged, so one already-correct name cannot abort the whole column.
    """
    if not isinstance(name, str):
        return name
    try:
        return name.encode("latin-1").decode("utf-8")
    except (UnicodeEncodeError, UnicodeDecodeError):
        # not representable in Latin-1, or the bytes are not valid UTF-8:
        # the name was not garbled this way, so leave it alone
        return name


transaction_df["player_name"] = transaction_df["player_name"].map(_repair_mojibake)

In [218]:
# preview ten randomly chosen rows of the combined data
transaction_df.sample(n=10)

Unnamed: 0,season_start,date,player_name,team_abv,waived,traded
7940,2019,"July 6, 2019",Anthony Davis,NOP,0,1
822,1999,"October 20, 1999",Maceo Baston,CHI,1,0
4429,2022,"October 17, 2022",Kemba Walker,DET,1,0
3339,2018,"December 2, 2018",Billy Preston,CLE,1,0
3746,2020,"December 18, 2020",Trevon Bluiett,UTA,1,0
8152,2020,"March 18, 2021",Torrey Craig,MIL,0,1
8237,2021,"August 6, 2021",Chandler Hutchison,WAS,0,1
1170,2003,"October 22, 2003",Jeryl Sasser,ORL,1,0
5063,1994,"February 14, 1995",Marcelo Nicola,HOU,0,1
5622,2000,"September 25, 2000",Popeye Jones,DEN,0,1


In [227]:
# write the combined transactions to disk, leaving out the DataFrame index
output_path = "./Data/transaction_data.csv"
transaction_df.to_csv(output_path, index=False)