# Web scraping NBA transaction data from [basketball-reference.com](https://www.basketball-reference.com/)

In [313]:
import bs4
from bs4 import BeautifulSoup
import requests
import re
import pandas as pd

### HTML files

In [314]:
# Years to scrape: 1991 through 2024 inclusive
years = [*range(1991, 2025)]

# URL template for one year's transactions page; the year is filled in with .format()
url_blank = "https://www.basketball-reference.com/leagues/NBA_{}_transactions.html"

In [None]:
# save HTML files for each year
import time

for year in years:
    url = url_blank.format(year)
    # A timeout keeps one stalled connection from hanging the whole run.
    response = requests.get(url, timeout=30)
    # Fail loudly on an error response (e.g. 429 when rate limited) so that an
    # error page is never saved in place of the real transactions page.
    response.raise_for_status()

    # Written with the default text encoding, the same way the later cells
    # open these files for reading.
    with open("./Data/transaction_htmls/transactions_{}.html".format(year), "w") as f:
        f.write(response.text)

    # Pause between requests so the run stays well below the site's
    # request-rate limit for automated traffic.
    time.sleep(4)

### Waived players data

In [187]:
# For every year: parse the saved transactions page, keep the entries whose
# text mentions "waived", and write their date / team / player to a CSV
player_href = re.compile("/players/")

for year in years:
    html_path = f"./Data/transaction_htmls/transactions_{year}.html"
    with open(html_path) as f:
        page = f.read()

    soup = BeautifulSoup(page, "html.parser")

    # entries are the <p> elements found under <ul class="page_index">
    page_index = soup.find("ul", attrs={"class": "page_index"})
    entries = page_index.find_all("p")
    waived_entries = [p for p in entries if "waived" in p.get_text()]

    # the date is read from the first <span> inside the entry's parent element
    dates = [p.parent.find("span").get_text() for p in waived_entries]

    # the team is the value of data-attr-from on the first link that has it
    team_links = [p.find("a", attrs={"data-attr-from": True}) for p in waived_entries]
    teams = [link["data-attr-from"] for link in team_links]

    # the player is the text of the first link pointing into /players/
    player_links = [p.find("a", attrs={"href": player_href}) for p in waived_entries]
    players = [link.get_text() for link in player_links]

    df = pd.DataFrame(dict(date=dates, team_from=teams, player=players))

    df.to_csv(f"./Data/waived/waived_{year}.csv")

In [213]:
# Read back each per-year waived CSV, stack them into one table with a fresh
# index, and save the combined result
waived_dfs = []
for year in years:
    waived_dfs.append(pd.read_csv(f"./Data/waived/waived_{year}.csv", index_col=0))

waived_df = pd.concat(waived_dfs, ignore_index=True)
waived_df.to_csv("./Data/waived_data.csv")

### Traded players data

In [428]:
# helper function for extracting the set of teams involved in a given trade
def involved_teams(entry):
    """Return the set of team values referenced by the links in a trade entry.

    Every ``<a>`` in ``entry`` whose href contains ``/teams/`` contributes its
    ``data-attr-from`` attribute, or its ``data-attr-to`` attribute when
    ``data-attr-from`` is absent.

    Args:
        entry: element exposing ``find_all`` (a parsed transaction entry).

    Returns:
        set: the distinct attribute values collected from the team links.

    Raises:
        KeyError: if a team link carries neither attribute.
    """
    result = set()
    for team in entry.find_all("a", attrs={"href": re.compile("/teams/")}):
        try:
            result.add(team["data-attr-from"])
        except KeyError:
            # Only a missing attribute triggers the fallback; any other
            # exception (including KeyboardInterrupt) propagates.
            result.add(team["data-attr-to"])
    return result

# helper function for extracting players involved in a given trade which are future draft picks
# (and therefore shouldn't be included in our CSV file)
def draft_picks(entry):
    """Return the text of the node just before each "was later selected" string.

    Searches ``entry`` for text nodes containing "was later selected" and, for
    each one, takes ``get_text()`` of its previous sibling. Results are
    returned as a list in the order the strings were found.
    """
    def mentions_later_selection(text):
        return "was later selected" in text

    return [
        marker.previous_sibling.get_text()
        for marker in entry.find_all(string=mentions_later_selection)
    ]

# helper function for extracting the set of players involved in a given trade
# (excluding future draft picks)
def involved_players(entry):
    """Return the set of player names linked in a trade entry, minus draft picks.

    Names are the text of every ``<a>`` whose href contains ``/players/``;
    any name also returned by ``draft_picks(entry)`` is removed.
    """
    player_links = entry.find_all("a", attrs={"href": re.compile("/players/")})
    names = {link.get_text() for link in player_links}
    return names.difference(draft_picks(entry))

In [431]:
# For every year: parse the saved transactions page, keep the entries whose
# text mentions "traded", and write their date / teams / players to a CSV
for year in years:
    html_path = f"./Data/transaction_htmls/transactions_{year}.html"
    with open(html_path) as f:
        page = f.read()

    soup = BeautifulSoup(page, "html.parser")

    # entries are the <p> elements found under <ul class="page_index">
    page_index = soup.find("ul", attrs={"class": "page_index"})
    entries = page_index.find_all("p")
    traded_entries = [p for p in entries if "traded" in p.get_text()]

    # the date is read from the first <span> inside the entry's parent element
    dates = [p.parent.find("span").get_text() for p in traded_entries]
    # teams and players are sets, one per trade entry
    teams = list(map(involved_teams, traded_entries))
    players = list(map(involved_players, traded_entries))

    df = pd.DataFrame(dict(date=dates, teams=teams, players=players))

    df.to_csv(f"./Data/traded/traded_{year}.csv")

### Check for players labelled as coaches or executives

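(The people printed below are linked under `/coaches/` or `/executives/` rather than `/players/`, so the extraction cells above, which only look for `/players/` links, do not pick them up. Manually edit the affected yearly CSV files to account for them.)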

In [312]:
# For every year, print each waived or traded entry that contains a link into
# /coaches/ or /executives/, tagged with the kind of transaction
staff_patterns = (re.compile("/coaches/"), re.compile("/executives/"))

for year in years:
    html_path = f"./Data/transaction_htmls/transactions_{year}.html"
    with open(html_path) as f:
        page = f.read()

    soup = BeautifulSoup(page, "html.parser")

    page_index = soup.find("ul", attrs={"class": "page_index"})
    entries = page_index.find_all("p")
    waived_entries = [p for p in entries if "waived" in p.get_text()]
    traded_entries = [p for p in entries if "traded" in p.get_text()]

    # waived entries are reported first, then traded ones; within an entry,
    # coach links are reported before executive links
    for label, group in (("(waived)", waived_entries), ("(traded)", traded_entries)):
        for entry in group:
            for pattern in staff_patterns:
                hits = entry.find_all("a", attrs={"href": pattern})
                if hits:
                    print(year, hits, label)


1992 [<a href="/coaches/dunlemi01c.html">Mike Dunleavy</a>] (traded)
1996 [<a href="/coaches/rileypa01c.html">Pat Riley</a>] (traded)
1997 [<a href="/executives/wallach99x.html">Chris Wallace</a>] (traded)
2014 [<a href="/coaches/kiddja01c.html">Jason Kidd</a>] (traded)
2018 [<a href="/executives/perrysc99x.html">Scott Perry</a>] (traded)
