# Web scraping NBA transaction data from [basketball-reference.com](https://www.basketball-reference.com/)

In [167]:
import bs4
from bs4 import BeautifulSoup
import requests
import re
import pandas as pd

### HTML files

In [175]:
# Years to scrape: 1991 through 2024 inclusive (the range end is exclusive).
years = [*range(1991, 2025)]

# URL template for one year's transactions page; "{}" is filled with the year.
url_blank = "https://www.basketball-reference.com/leagues/NBA_{}_transactions.html"

In [None]:
# Download the transactions page for each year and save the raw HTML to disk.
# The saved files are what the parsing cells further down read back in.
for year in years:
    url = url_blank.format(year)
    # A timeout keeps a stalled connection from hanging the loop indefinitely.
    response = requests.get(url, timeout=30)
    # Fail loudly on an HTTP error (e.g. 429 rate limiting) instead of saving
    # an error page that would later be parsed as if it were transaction data.
    response.raise_for_status()

    # Plain "w" is sufficient: the file is only written here, never read back.
    # The platform default encoding is kept so that it matches how the saved
    # files are opened when they are parsed.
    with open("./Data/transaction_htmls/transactions_{}.html".format(year), "w") as f:
        f.write(response.text)

### Waived players data

In [187]:
# Build one CSV of waived-player transactions per year from the saved HTML files.
for year in years:
    html_path = "./Data/transaction_htmls/transactions_{}.html".format(year)
    with open(html_path) as html_file:
        soup = BeautifulSoup(html_file.read(), "html.parser")

    # Every transaction is a <p> inside the <ul class="page_index"> element.
    transaction_list = soup.find("ul", attrs={"class":"page_index"})

    dates, teams, players = [], [], []
    for entry in transaction_list.find_all("p"):
        if "waived" not in entry.get_text():
            continue

        # The date text comes from the first <span> of the entry's parent element.
        dates.append(entry.parent.find("span").get_text())

        # The team is read from the "data-attr-from" attribute of the first
        # link in the entry that carries that attribute.
        team_link = entry.find("a", attrs={"data-attr-from":True})
        teams.append(team_link["data-attr-from"])

        # The player is the text of the first link whose href matches /players/.
        player_link = entry.find("a", attrs={"href":re.compile("/players/")})
        players.append(player_link.get_text())

    df = pd.DataFrame({"date":dates, "team_from":teams, "player":players})
    df.to_csv("./Data/waived/waived_{}.csv".format(year))

In [213]:
# Stitch the per-year waived CSVs into a single table and write it out as one CSV.
waived_dfs = []
for year in years:
    yearly_path = "./Data/waived/waived_{}.csv".format(year)
    # index_col=0: treat the first CSV column as the row index, not as data.
    waived_dfs.append(pd.read_csv(yearly_path, index_col=0))

# ignore_index=True renumbers the combined rows instead of repeating each
# year's own 0..n-1 index.
waived_df = pd.concat(waived_dfs, ignore_index=True)
waived_df.to_csv("./Data/waived_data.csv")

### Traded players data

In [286]:
# helper function for extracting the set of teams involved in a given trade
def involved_teams(entry):
    """Return the set of team identifiers referenced by the team links in ``entry``.

    A team link is any ``<a>`` whose ``href`` matches ``/teams/``. For each such
    link the ``data-attr-from`` attribute is used when present, otherwise
    ``data-attr-to``.

    Args:
        entry: a parsed element exposing ``find_all`` (e.g. a BeautifulSoup tag
            for one transaction).

    Returns:
        A set of the attribute values found; empty if there are no team links.

    Raises:
        KeyError: if a team link has neither ``data-attr-from`` nor
            ``data-attr-to``.
    """
    result = set()
    for team in entry.find_all("a", attrs={"href":re.compile("/teams/")}):
        try:
            result.add(team["data-attr-from"])
        except KeyError:
            # Only a missing attribute triggers the fallback; any other error
            # (or a keyboard interrupt) is allowed to propagate.
            result.add(team["data-attr-to"])
    return result

# helper function for extracting the set of players involved in a given trade
def involved_players(entry):
    """Return the set of link texts of every player link inside ``entry``.

    A player link is any ``<a>`` whose ``href`` matches ``/players/``.
    Duplicate names collapse into a single set member.
    """
    player_links = entry.find_all("a", attrs={"href":re.compile("/players/")})
    names = set()
    for link in player_links:
        names.add(link.get_text())
    return names

In [287]:
# Build one CSV of trade transactions per year from the saved HTML files.
# Each row also records whether the trade mentions draft picks (which we will manually remove).
for year in years:
    html_path = "./Data/transaction_htmls/transactions_{}.html".format(year)
    with open(html_path) as html_file:
        soup = BeautifulSoup(html_file.read(), "html.parser")

    # Every transaction is a <p> inside the <ul class="page_index"> element.
    transaction_list = soup.find("ul", attrs={"class":"page_index"})

    dates, teams, players, includes_draft_picks = [], [], [], []
    for entry in transaction_list.find_all("p"):
        entry_text = entry.get_text()
        if "traded" not in entry_text:
            continue

        # The date text comes from the first <span> of the entry's parent element.
        dates.append(entry.parent.find("span").get_text())
        # Teams and players are stored as sets, one set per trade.
        teams.append(involved_teams(entry))
        players.append(involved_players(entry))
        includes_draft_picks.append("draft pick" in entry_text)

    df = pd.DataFrame({"date":dates, "teams":teams, "players":players, "includes_draft_picks":includes_draft_picks})
    df.to_csv("./Data/traded/traded_{}.csv".format(year))