In [3]:
import os
from io import StringIO
import pandas as pd
from bs4 import BeautifulSoup

In [4]:
# Directory that holds the saved box-score HTML pages.
SCORE_DIR = "data/scores"

# os.listdir returns entries in arbitrary order; sorting makes every run walk
# the files (and therefore build the output rows) in the same order.
box_scores = [
    os.path.join(SCORE_DIR, name)
    for name in sorted(os.listdir(SCORE_DIR))
    if name.endswith(".html")
]

In [5]:
def parse_html(box_score):
    """Load a saved box-score page and strip its extra table-header rows.

    Parameters
    ----------
    box_score : str
        Path to an HTML file; it is read as UTF-8.

    Returns
    -------
    BeautifulSoup
        The parsed document with every ``tr.over_header`` and ``tr.thead``
        row removed.
    """
    with open(box_score, encoding="utf8") as handle:
        markup = handle.read()

    # NOTE(review): no parser is named here, so bs4 chooses one itself --
    # consider pinning it once the intended parser is confirmed.
    soup = BeautifulSoup(markup)
    for selector in ("tr.over_header", "tr.thead"):
        for row in soup.select(selector):
            row.decompose()
    return soup

In [6]:
def read_stats(soup, team, stat):
    """Extract one team's box-score table as a numeric DataFrame.

    Parameters
    ----------
    soup : object
        Parsed page; ``str(soup)`` must yield the HTML markup.
    team : str
        Team identifier used in the table id ``box-{team}-game-{stat}``.
    stat : str
        Table variant used in the table id (the notebook passes ``"basic"``
        and ``"advanced"``).

    Returns
    -------
    pandas.DataFrame
        Indexed by the table's first column. Every column except ``MP`` is
        coerced to numeric (unparseable cells become NaN); ``MP`` keeps its
        original text and is placed last. Rows without a minutes value are
        dropped.
    """
    table_id = f"box-{team}-game-{stat}"
    df = pd.read_html(StringIO(str(soup)), attrs={"id": table_id}, index_col=0)[0]

    # Players who did not appear carry a status text in MP ("Did Not Play",
    # "Did Not Dress", ...) instead of minutes. A real minutes value contains
    # at least one digit, so anything without one is treated as missing; this
    # covers every status text rather than only "Did Not Play".
    minutes = df["MP"]
    has_minutes = minutes.astype(str).str.contains(r"\d")
    mp_column = minutes.where(has_minutes)

    df = df.drop(columns=["MP"]).apply(pd.to_numeric, errors="coerce")
    df["MP"] = mp_column
    return df.dropna(subset=["MP"])

In [7]:
def read_teams(soup):
    """Return the team identifiers listed in the page's line-score table.

    Parameters
    ----------
    soup : object
        Parsed page; ``str(soup)`` must yield the HTML markup.

    Returns
    -------
    pandas.Series
        The ``"Unnamed: 0"`` column of the table whose id is ``line_score``.
    """
    markup = StringIO(str(soup))
    tables = pd.read_html(markup, attrs={"id": "line_score"})
    # Presumably the team column has no header text, which is why pandas
    # labels it "Unnamed: 0".
    return tables[0]["Unnamed: 0"]

In [8]:
def combine_team_stats(soup, team):
    """Join a team's basic and advanced box-score tables on their index.

    Parameters
    ----------
    soup : object
        Parsed page, passed through to ``read_stats``.
    team : str
        Team identifier, passed through to ``read_stats``.

    Returns
    -------
    pandas.DataFrame
        Inner join of the two tables; columns present in both are suffixed
        with ``_basic`` and ``_advanced`` respectively.
    """
    tables = {kind: read_stats(soup, team, kind) for kind in ("basic", "advanced")}
    return tables["basic"].merge(
        tables["advanced"],
        left_index=True,
        right_index=True,
        suffixes=("_basic", "_advanced"),
    )

In [9]:
def read_season_info(soup):
    """Read the season label from the page's bottom navigation links.

    Parameters
    ----------
    soup : BeautifulSoup
        Parsed page containing an element with id ``bottom_nav_container``.

    Returns
    -------
    str
        The part of the second link's file name that precedes the first
        underscore.
    """
    container = soup.select("#bottom_nav_container")[0]
    links = container.find_all("a")
    targets = [link["href"] for link in links]
    file_name = os.path.basename(targets[1])
    season, _, _ = file_name.partition("_")
    return season

In [10]:
def add_highest_scoring_teamate(df):
    """For each row, give the best PTS value posted by a different row.

    The three largest ``PTS`` values are unpacked as (first, ``highest``,
    ``second_highest``); the first is discarded. Callers in this notebook pass
    frames that still contain the "Team Totals" row, which is presumably the
    discarded maximum.

    Parameters
    ----------
    df : pandas.DataFrame
        Must have a ``PTS`` column with at least three non-missing values.

    Returns
    -------
    pandas.Series
        Float series on ``df.index``: ``second_highest`` for rows whose PTS
        equals ``highest``, and ``highest`` for every other row.

    Raises
    ------
    ValueError
        If ``PTS`` holds fewer than three non-missing values.
    """
    top_three = df["PTS"].nlargest(3)
    if len(top_three) < 3:
        raise ValueError(
            "need at least three non-missing PTS values, got %d" % len(top_three)
        )
    _, highest, second_highest = top_three

    # Build the result with an explicit float dtype: an empty
    # ``pd.Series(index=...)`` would otherwise default to object dtype.
    teammate_best = pd.Series(highest, index=df.index, dtype=float)
    # Rows that own the top score cannot be their own teammate, so they get
    # the runner-up value instead.
    return teammate_best.mask(df["PTS"] == highest, second_highest)

In [None]:
# Build exactly one DataFrame per box-score file and collect them in `games`.
#
# The per-team work is finished for BOTH teams before the game frame is
# assembled. Assembling inside the team loop would, on the first pass, stack
# the first team's frame on top of itself and append that bogus "game" too.
games = []

for box_score in box_scores:
    soup = parse_html(box_score)
    teams = list(read_teams(soup))
    if len(teams) != 2:
        raise ValueError(f"expected 2 teams in {box_score}, found {len(teams)}")

    team_frames = []
    # The first team listed gets home=0 and the second gets home=1.
    for home_flag, team in enumerate(teams):
        basic = read_stats(soup, team, "basic")
        advanced = read_stats(soup, team, "advanced")
        # MP already comes from the basic table; dropping it here keeps the
        # merge from producing suffixed duplicates.
        advanced = advanced.drop(columns=["MP"])
        combined = pd.merge(basic, advanced, left_index=True, right_index=True)
        combined["home"] = home_flag
        combined["team"] = team
        combined["highest_scoring_teammate"] = add_highest_scoring_teamate(combined)
        team_frames.append(combined)

    first_team, second_team = team_frames
    # Each side records the other side's team-level DRtg.
    first_team["opp_team_DRtg"] = second_team.loc["Team Totals", "DRtg"]
    second_team["opp_team_DRtg"] = first_team.loc["Team Totals", "DRtg"]

    game = pd.concat([first_team, second_team])
    game = game.rename_axis('Player Name')
    # Only player rows are kept; the totals row has served its purpose above.
    game = game.drop(index='Team Totals', errors='ignore')
    game["season"] = read_season_info(soup)
    # The first eight characters of the file name are parsed as YYYYMMDD.
    game["date"] = os.path.basename(box_score)[:8]
    game["date"] = pd.to_datetime(game["date"], format="%Y%m%d")
    games.append(game)
    if len(games) % 100 == 0:
        print(f"{len(games)} / {len(box_scores)}")

100 / 1319
200 / 1319


In [357]:
# Stack every frame in `games` into one table, keeping each frame's own index
# labels (no renumbering).
games_df = pd.concat(games, axis=0)

In [362]:
# Copy the index labels into a regular `player` column.
indexs = games_df.index
games_df = games_df.assign(player=indexs)

In [2]:
# Turn the index into an ordinary column and switch to a positional index.
games_df = games_df.reset_index()
# errors="ignore": the stray "Unnamed: 16" column is not guaranteed to exist
# (for example when this cell is run a second time), and its absence should
# not raise.
games_df = games_df.drop(columns="Unnamed: 16", errors="ignore")
games_df

NameError: name 'games_df' is not defined

In [374]:
# Sanity check: list the column count of every frame in `games` that does not
# have exactly 41 columns. An empty list means all frames agree.
[n_cols for n_cols in (g.shape[1] for g in games) if n_cols != 41]

[]

In [375]:
# Write the combined table to disk; the frame's index is written as the first
# column of the file.
games_df.to_csv(path_or_buf="nba_games.csv", index=True)