In [100]:
import os
from io import StringIO
import pandas as pd
from bs4 import BeautifulSoup

In [101]:
SCORE_DIR = "data/scores"
box_scores = os.listdir(SCORE_DIR)
box_scores = [os.path.join(SCORE_DIR, f) for f in box_scores if f.endswith(".html")]

In [102]:
def parse_html(box_score):
    with open(box_score, encoding="utf8") as f:
        html = f.read()

    soup = BeautifulSoup(html)
    [s.decompose() for s in soup.select("tr.over_header")]
    [s.decompose() for s in soup.select("tr.thead")]
    return soup

In [187]:
def read_stats(soup, team, stat):
    df = pd.read_html(StringIO(str(soup)), attrs={"id": f"box-{team}-game-{stat}"}, index_col=0)[0]
    mp_column = df['MP'].replace("Did Not Play", pd.NA)
    df = df.drop(columns=['MP']).apply(pd.to_numeric, errors='coerce')
    df['MP'] = mp_column
    cleaned_df = df.dropna(subset=['MP'])
    return cleaned_df

In [181]:
def read_teams(soup):
    line_score = pd.read_html(StringIO(str(soup)), attrs = {'id': 'line_score'})[0]
    teams = line_score["Unnamed: 0"]
    return teams

In [182]:
def combine_team_stats(soup, team):
    basic = read_stats(soup, team, "basic")
    advanced = read_stats(soup, team, "advanced")
    combined = pd.merge(basic, advanced, left_index=True, right_index=True, suffixes=('_basic', '_advanced'))
    return combined

In [190]:
def read_season_info(soup):
    nav = soup.select("#bottom_nav_container")[0]
    hrefs = [a["href"] for a in nav.find_all("a")]
    season = os.path.basename(hrefs[1]).split("_")[0]
    return season

In [326]:
def add_highest_scoring_teamate(df):
    highest_scoring_teammate = pd.Series(index=df.index)
    total, highest, second_highest = df['PTS'].nlargest(3)
    for index in df.index:
        if df['PTS'][index] == highest:
            highest_scoring_teammate[index] = second_highest
        else:
            highest_scoring_teammate[index] = highest
    return highest_scoring_teammate

In [334]:
games = []

for box_score in box_scores:
    soup = parse_html(box_score)
    teams = list(read_teams(soup))
    team1 = pd.DataFrame()
    for team in teams:
        basic = read_stats(soup, team, "basic")
        advanced = read_stats(soup, team, "advanced")
        advanced = advanced.drop(columns=["MP"])
        combined = pd.merge(basic, advanced, left_index=True, right_index=True)
        combined["home"] = 1
        if team1.empty:
            team1 = combined
            team1['team'] = team
            team1["home"] = 0
            team1_DRtg = team1["DRtg"]["Team Totals"]
            team1['highest_scoring_teammate'] = add_highest_scoring_teamate(team1)
        combined["opp_team_DRtg"] = team1_DRtg
        combined['team'] = team
        combined['highest_scoring_teammate'] = add_highest_scoring_teamate(combined)
        team1["opp_team_DRtg"] = combined["DRtg"]["Team Totals"]
        game = pd.concat([team1, combined])
        game = game.rename_axis('Player Name')
        game = game.drop(index='Team Totals', errors='ignore')
        game["season"] = read_season_info(soup)
        game["date"] = os.path.basename(box_score)[:8]
        game["date"] = pd.to_datetime(game["date"], format="%Y%m%d")
        game['player'] = 
        games.append(game)
        if len(games) % 100 == 0:
            print(f"{len(games)} / {len(box_scores)}")

100 / 1319
200 / 1319
300 / 1319
400 / 1319
500 / 1319
600 / 1319
700 / 1319
800 / 1319
900 / 1319
1000 / 1319
1100 / 1319
1200 / 1319
1300 / 1319
1400 / 1319
1500 / 1319
1600 / 1319
1700 / 1319
1800 / 1319
1900 / 1319
2000 / 1319
2100 / 1319
2200 / 1319
2300 / 1319
2400 / 1319
2500 / 1319
2600 / 1319


In [357]:
games_df = pd.concat(games, ignore_index=False)

In [362]:
indexs = games_df.index
games_df['player'] = indexs

In [368]:
games_df = games_df.reset_index()
games_df = games_df.drop('Unnamed: 16', axis=1)
games_df

Unnamed: 0,Player Name,FG,FGA,FG%,3P,3PA,3P%,FT,FTA,FT%,...,ORtg,DRtg,BPM,home,team,highest_scoring_teammate,opp_team_DRtg,season,date,player
0,D'Angelo Russell,4.0,12.0,0.333,2.0,5.0,0.400,1.0,2.0,0.500,...,91.0,125.0,-6.2,0,LAL,21.0,124.8,2024,2023-10-24,D'Angelo Russell
1,Anthony Davis,6.0,17.0,0.353,1.0,2.0,0.500,4.0,4.0,1.000,...,100.0,122.0,-2.1,0,LAL,21.0,124.8,2024,2023-10-24,Anthony Davis
2,Austin Reaves,4.0,11.0,0.364,1.0,2.0,0.500,5.0,7.0,0.714,...,113.0,121.0,1.5,0,LAL,21.0,124.8,2024,2023-10-24,Austin Reaves
3,Taurean Prince,6.0,8.0,0.750,4.0,6.0,0.667,2.0,2.0,1.000,...,175.0,128.0,11.3,0,LAL,21.0,124.8,2024,2023-10-24,Taurean Prince
4,LeBron James,10.0,16.0,0.625,1.0,4.0,0.250,0.0,1.0,0.000,...,141.0,122.0,11.7,0,LAL,18.0,124.8,2024,2023-10-24,LeBron James
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
57834,Kristaps Porziņģis,2.0,4.0,0.500,0.0,2.0,0.000,1.0,2.0,0.500,...,111.0,108.0,-7.9,1,BOS,31.0,122.1,2024,2024-06-17,Kristaps Porziņģis
57835,Luke Kornet,0.0,0.0,,0.0,0.0,,0.0,0.0,,...,231.0,110.0,-1.0,1,BOS,31.0,122.1,2024,2024-06-17,Luke Kornet
57836,Payton Pritchard,1.0,1.0,1.000,1.0,1.0,1.000,0.0,0.0,,...,300.0,110.0,78.6,1,BOS,31.0,122.1,2024,2024-06-17,Payton Pritchard
57837,Oshae Brissett,0.0,0.0,,0.0,0.0,,0.0,0.0,,...,0.0,110.0,-6.0,1,BOS,31.0,122.1,2024,2024-06-17,Oshae Brissett


In [374]:
[g.shape[1] for g in games if g.shape[1] != 41]

[]

In [375]:
games_df.to_csv("nba_games.csv")