In [100]:
import os
from io import StringIO
import pandas as pd
from bs4 import BeautifulSoup

In [101]:
# Directory that is scanned for saved box-score pages.
SCORE_DIR = "data/scores"

# os.listdir() yields names in arbitrary order; sorting makes box_scores[0]
# and the overall processing order reproducible from run to run.
box_scores = sorted(
    os.path.join(SCORE_DIR, name)
    for name in os.listdir(SCORE_DIR)
    if name.endswith(".html")
)

In [102]:
def parse_html(box_score, parser="html.parser"):
    """Load a saved box-score page and strip its extra table header rows.

    Parameters
    ----------
    box_score : str or path-like
        Path of the HTML file to read (decoded as UTF-8).
    parser : str, optional
        Parser name passed to BeautifulSoup. It is given explicitly so the
        parse tree does not depend on which optional parsers are installed;
        the default is the parser bundled with the standard library.

    Returns
    -------
    bs4.BeautifulSoup
        The parsed document with every ``tr.over_header`` and ``tr.thead``
        row removed.
    """
    with open(box_score, encoding="utf8") as f:
        html = f.read()

    soup = BeautifulSoup(html, parser)
    # Remove the header rows in place (presumably so that pd.read_html later
    # sees one header row per table). A plain loop is used because only the
    # side effect matters, not a list of results.
    for row in soup.select("tr.over_header, tr.thead"):
        row.decompose()
    return soup

In [124]:
def read_stats(soup, team, stat):
    """Read one team's stat table from a parsed box score.

    Parameters
    ----------
    soup : object
        Parsed page; it is converted with ``str()`` before being handed to
        ``pd.read_html``.
    team : str
        Team code used in the table id ``box-{team}-game-{stat}``.
    stat : str
        Table variant used in the same id (the notebook passes ``"basic"``
        and ``"advanced"``).

    Returns
    -------
    pandas.DataFrame
        Indexed by the table's first column. Every column except ``MP`` is
        coerced to numeric (unparseable cells become NaN); ``MP`` keeps its
        original values and is placed last. Rows whose ``MP`` cell holds no
        digit at all are dropped.
    """
    df = pd.read_html(StringIO(str(soup)), attrs={"id": f"box-{team}-game-{stat}"}, index_col=0)[0]

    minutes = df["MP"]
    # A real minutes value always contains a digit. Treating every digit-free
    # cell (missing values and any textual status, not only "Did Not Play") as
    # "no minutes" keeps all-NaN stat rows out of the result.
    has_minutes = minutes.astype(str).str.contains(r"\d", regex=True)

    stats = df.drop(columns=["MP"]).apply(pd.to_numeric, errors="coerce")
    stats["MP"] = minutes
    # Positional boolean mask: avoids any index alignment on repeated labels.
    return stats.loc[has_minutes.to_numpy()]

In [125]:
def read_teams(soup):
    """Return the ``"Unnamed: 0"`` column of the page's ``line_score`` table.

    ``soup`` is converted with ``str()`` and parsed by ``pd.read_html``; the
    first table whose id is ``line_score`` is used. The result is a pandas
    Series (the notebook treats its values as the team codes).
    """
    markup = StringIO(str(soup))
    matching_tables = pd.read_html(markup, attrs={"id": "line_score"})
    return matching_tables[0]["Unnamed: 0"]

In [126]:
def combine_team_stats(soup, team):
    """Join a team's basic and advanced stat tables on their shared index.

    Both tables are obtained through ``read_stats``. They are merged
    index-to-index with pandas' default (inner) join; a column name present
    in both tables is disambiguated with the ``_basic`` / ``_advanced``
    suffixes.
    """
    tables = {kind: read_stats(soup, team, kind) for kind in ("basic", "advanced")}
    return tables["basic"].merge(
        tables["advanced"],
        left_index=True,
        right_index=True,
        suffixes=("_basic", "_advanced"),
    )

In [127]:
# Work through one box score (the first path in box_scores) end to end.
box_score = box_scores[0]
soup = parse_html(box_score)
teams = list(read_teams(soup))
# Not used in this cell; kept so these names are still defined after it runs.
summaries = []
base_cols = None

# Collect one player-level frame per team. Before this change each iteration
# overwrote `combined` and `player_stats` stayed empty, so every team except
# the last was discarded.
team_frames = []
for team in teams:
    basic = read_stats(soup, team, "basic")
    advanced = read_stats(soup, team, "advanced")
    # MP already comes from the basic table; dropping it here avoids a
    # duplicated (suffixed) MP column after the merge.
    advanced = advanced.drop(columns=["MP"])
    combined = pd.merge(basic, advanced, left_index=True, right_index=True)
    # Keep player rows only; errors='ignore' tolerates a table without totals.
    combined = combined.drop(index='Team Totals', errors='ignore')
    team_frames.append(combined)

# One frame for the whole game, keyed by team on the outer index level.
# pd.concat raises on an empty list, hence the fallback. `combined` still
# refers to the last team processed.
player_stats = pd.concat(team_frames, keys=teams) if team_frames else pd.DataFrame()
    

In [128]:
# Show the MP column of `combined`, which after the loop above holds the
# last team that was processed.
combined["MP"]

Starters
Nikola Jokić                36:16
Kentavious Caldwell-Pope    36:15
Aaron Gordon                34:59
Jamal Murray                34:15
Michael Porter Jr.          30:08
Reggie Jackson              24:04
Christian Braun             19:20
Zeke Nnaji                  11:44
Peyton Watson               10:50
Jalen Pickett                0:43
Collin Gillespie             0:43
Braxton Key                  0:43
Name: MP, dtype: object

In [None]:
# Loop over every box score and, for each team, read the basic and advanced
# tables and stack them into `summary`.
# NOTE(review): this cell looks unfinished. Nothing is appended to `games` or
# `summaries` here, so no per-game result is collected; the rest of the loop
# body appears to live in the next cell.
games = []
base_cols = None
for box_score in box_scores:
    soup = parse_html(box_score)
    teams = list(read_teams(soup))
    summaries = []
    
    for team in teams:
        basic = read_stats(soup, team, "basic")
        advanced = read_stats(soup, team, "advanced")
        # pd.concat defaults to axis=0: the advanced rows are stacked below
        # the basic rows rather than joined side by side.
        summary = pd.concat([basic, advanced])
        
        if base_cols is None:
            # Computed once, from the first team processed: the unique index
            # labels of `summary`, minus any label containing "bpm".
            # NOTE(review): with read_stats as defined in this notebook the
            # index presumably holds row labels from the table's first column,
            # not stat names; confirm the "bpm" filter still does something.
            base_cols = list(summary.index.drop_duplicates(keep="first"))
            base_cols = [b for b in base_cols if "bpm" not in b]
        # Bare expression inside a loop body: evaluated and discarded, so
        # nothing is displayed.
        summary

In [None]:
# NOTE(review): this cell is not valid Python as written. The indentation
# jumps between levels (e.g. the indented `summaries.append` right after a
# column-0 statement), so it fails with an IndentationError. It reads like the
# remainder of the loop body from the previous cell, pasted at mixed depths.
# `game` and `read_season_info` are also not defined in any cell visible here.
# The first three lines repeat the base_cols initialisation of the previous cell.
if base_cols is None:
        base_cols = list(summary.index.drop_duplicates(keep="first"))
        base_cols = [b for b in base_cols if "bpm" not in b]

# Restrict `summary` to the labels chosen in base_cols.
summary = summary[base_cols]
        
        summaries.append(summary)
    # Place the collected summaries side by side, then transpose so each
    # collected summary becomes one row.
    summary = pd.concat(summaries, axis=1).T

    # Assigning a two-element list requires `summary` to have exactly two rows.
    # NOTE(review): presumably row 0 is the away team and row 1 the home team; confirm.
    summary["home"] = [0,1]

    # Same rows in reverse order; reset_index() without drop=True also turns
    # the old index into a column. Every column label then gets an "_opp" suffix.
    game_opp = game.iloc[::-1].reset_index()
    game_opp.columns += "_opp"

    full_game = pd.concat([game, game_opp], axis=1)
    full_game["season"] = read_season_info(soup)
    
    # The first 8 characters of the file name are parsed as a YYYYMMDD date.
    full_game["date"] = os.path.basename(box_score)[:8]
    full_game["date"] = pd.to_datetime(full_game["date"], format="%Y%m%d")
    
    # NOTE(review): no visible cell creates a "total" column; confirm where it comes from.
    full_game["won"] = full_game["total"] > full_game["total_opp"]
    games.append(full_game)
    
    # Progress message after every 100 collected games.
    if len(games) % 100 == 0:
        print(f"{len(games)} / {len(box_scores)}")