In [172]:
import os
import io
import pandas as pd
from bs4 import BeautifulSoup

In [173]:
%pip install html5lib

Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip available: 22.3.1 -> 23.2.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [174]:
# Directory (relative path) that is scanned below for saved box-score ".html" files.
SCORE_DIRECTORY = "data/scores/"

In [175]:
# Collect the path of every saved box-score page in SCORE_DIRECTORY.
# os.listdir() returns entries in arbitrary order, so the result is sorted to make
# box_scores[0] (used as the example game below) the same file on every run.
box_scores = sorted(
    os.path.join(SCORE_DIRECTORY, score_file)
    for score_file in os.listdir(SCORE_DIRECTORY)
    if score_file.endswith(".html")
)
len(box_scores)

5071

In [176]:
# The first collected file serves as the worked example for the cells below.
example_box_score = box_scores[0]

In [177]:
def parse_html(box_score):
    """Load a saved box-score page and return it as a cleaned BeautifulSoup tree.

    Parameters
    ----------
    box_score : str
        Path of the HTML file to read (decoded as UTF-8).

    Returns
    -------
    bs4.BeautifulSoup
        The parsed document with every ``tr.over_header`` and ``tr.thead``
        row removed.
    """
    with open(box_score, encoding="utf-8") as f:
        html = f.read()

    # Name the parser explicitly. BeautifulSoup(html) on its own emits
    # GuessedAtParserWarning and uses whichever parser happens to be installed,
    # so the resulting tree could differ between machines. "html.parser" ships
    # with Python and is therefore always available.
    soup = BeautifulSoup(html, "html.parser")

    # Remove the table header rows that are not needed.
    for selector in ("tr.over_header", "tr.thead"):
        for row in soup.select(selector):
            row.decompose()
    return soup

In [178]:
soup = parse_html(example_box_score)
# 'soup' holds the parsed HTML (a BeautifulSoup tree, not a plain string) of the example file 'box_scores[0]'

In [179]:
# Extract the line score (each team's final score) from a parsed page
def read_line_score(soup):
    """Return a two-column frame (``team``, ``total``) from the ``line_score`` table.

    The table with id ``line_score`` is read from the serialized ``soup``.
    Its first column is renamed to ``team`` and its last column to ``total``;
    every column in between is dropped.
    """
    markup = io.StringIO(str(soup))
    tables = pd.read_html(markup, attrs={"id": "line_score"})
    scores = tables[0]

    # Rename by position: only the outermost columns are kept, whatever their raw labels are.
    renamed = list(scores.columns)
    renamed[0], renamed[-1] = "team", "total"
    scores.columns = renamed

    return scores[["team", "total"]]

In [180]:
# Line score of the example game: one row per team with its final point total.
line_score_example = read_line_score(soup)
line_score_example

Unnamed: 0,team,total
0,NYK,88
1,CLE,117


In [181]:
# Read one team's game statistics table (basic or advanced stats)
def read_stats(soup, team, stat_type):
    """Return the table ``box-{team}-game-{stat_type}`` as a numeric DataFrame.

    The table is read from the serialized ``soup`` with its first column used
    as the index. Every column is then passed through ``pd.to_numeric`` with
    ``errors="coerce"``, so cells that cannot be parsed as numbers become NaN.
    """
    table_id = f"box-{team}-game-{stat_type}"
    tables = pd.read_html(io.StringIO(str(soup)), attrs={"id": table_id}, index_col=0)

    # The values in the table are still strings at this point, so convert each column.
    numeric = tables[0].apply(pd.to_numeric, errors="coerce")
    return numeric

In [182]:
# Read both stat tables for every team in the example game.
# Each iteration overwrites 'basic' and 'advanced', so after the loop they hold
# only the tables of the last team in 'teams'.
teams = list(line_score_example["team"])
for team in teams:
    basic = read_stats(soup, team, "basic")
    advanced = read_stats(soup, team, "advanced")

In [183]:
# Show the basic box score left over from the loop above (the last team in 'teams').
basic

Unnamed: 0_level_0,MP,FG,FGA,FG%,3P,3PA,3P%,FT,FTA,FT%,ORB,DRB,TRB,AST,STL,BLK,TOV,PF,PTS,+/-
Starters,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
LeBron James,,9,14,0.643,0,3,0.0,1,2,0.5,3,8,11,14,0,1,4,3,19,26.0
Kyrie Irving,,12,22,0.545,4,7,0.571,1,1,1.0,1,1,2,4,2,0,0,1,29,13.0
J.R. Smith,,3,13,0.231,2,8,0.25,0,0,,0,3,3,2,1,0,0,1,8,13.0
Kevin Love,,6,15,0.4,2,6,0.333,9,12,0.75,2,10,12,2,3,0,2,3,23,24.0
Tristan Thompson,,0,1,0.0,0,0,,0,0,,2,4,6,0,0,0,2,4,0,16.0
Richard Jefferson,,5,7,0.714,2,3,0.667,1,2,0.5,0,4,4,1,2,0,2,1,13,18.0
Mike Dunleavy,,2,3,0.667,0,1,0.0,0,0,,0,4,4,2,3,0,0,0,4,19.0
Channing Frye,,2,6,0.333,2,5,0.4,0,0,,2,2,4,0,1,1,1,4,6,7.0
Iman Shumpert,,1,3,0.333,0,1,0.0,0,0,,1,1,2,3,0,1,2,2,2,-5.0
DeAndre Liggins,,2,3,0.667,0,0,,0,0,,0,3,3,3,0,1,1,1,4,11.0


In [184]:
# Look up the season year of a given parsed page
def read_season_info(soup):
    """Return the season read from the second link of ``#bottom_nav_container``.

    The season is the part of that link's file name that precedes the first
    underscore (the whole file name if it contains no underscore).
    """
    bottom_nav = soup.select("#bottom_nav_container")[0]
    links = [anchor["href"] for anchor in bottom_nav.find_all('a')]
    file_name = os.path.basename(links[1])
    season, _, _ = file_name.partition("_")
    return season

In [185]:
# Season of the example game.
read_season_info(soup)

'2017'

In [186]:
# Build the dataset rows for a single game (worked example)

games = []
base_cols = None        # --> feature list; derived automatically from the first team summary below
# This cell handles one file only. To process every file, loop over 'box_scores'
# and use the loop variable wherever 'example_box_score' appears.
soup = parse_html(example_box_score)

line_score_example = read_line_score(soup)
teams = list(line_score_example["team"])

summaries = []          # --> will hold one stats summary per team
for team in teams:
    basic = read_stats(soup, team, "basic")
    advanced = read_stats(soup, team, "advanced")

    # The bottom row (iloc[-1, :]) of 'basic' and 'advanced' holds the team totals
    # (e.g. a team's total rebounds/assists). Only those totals are used as data.
    totals = pd.concat([basic.iloc[-1, :], advanced.iloc[-1, :]])
    totals.index = totals.index.str.lower()             # --> feature names (fg, 3p, ft, ast, ...)

    # From each team's 'basic' and 'advanced' tables, take the largest value among
    # the players (.iloc[:-1] --> every row except the last). The author's rationale:
    # the winner is likely the team posting the bigger stats (e.g. rebounds, assists).
    maxes = pd.concat([basic.iloc[:-1].max(), advanced.iloc[:-1].max()])
    maxes.index = maxes.index.str.lower() + "_max"      # --> feature names (fg_max, 3p_max, ft_max, ast_max, ...)

    # Combine 'totals' and 'maxes' into one 'summary' Series for this team
    summary = pd.concat([totals, maxes])

    # Take the feature list from the first team's summary and reuse it as the
    # standard (base columns) for the teams that follow
    if base_cols is None:
        base_cols = list(summary.index.drop_duplicates(keep="first"))
        base_cols = [b for b in base_cols if "bpm" not in b]    # --> dropped by the author as irrelevant to win/loss

    # NOTE(review): a label present in both tables (e.g. 'mp') is returned once per
    # occurrence by this lookup, so it stays duplicated even though 'base_cols' is unique.
    summary = summary[base_cols]

    # Keep this team's summary next to the other team's
    summaries.append(summary)
summary = pd.concat(summaries, axis=1).T

game = pd.concat([summary, line_score_example], axis=1)

# 0 is the away team, 1 is the home team
# (assumes the line score lists the away team first -- TODO confirm)
game["home"] = [0, 1]

# 'game' holds the two teams' stats in two rows; each row also needs the stats of
# the opposing team. Reversing the rows and resetting the index lines row 0 up
# with the other team's row in the column-wise concat below.
# NOTE(review): reset_index() without drop=True also keeps the old row labels as an
# 'index' column, which becomes 'index_opp' -- confirm nothing relies on it before dropping it.
game_opp = game.iloc[::-1].reset_index()
game_opp.columns += "_opp"

# Add the "season" metadata
full_game = pd.concat([game, game_opp], axis=1)
full_game["season"] = read_season_info(soup)

# Add the "date" metadata from the first 8 characters (YYYYMMDD) of the file name.
# The date comes from the file that was actually parsed, not from box_scores[0],
# so it stays correct once this code runs for files other than the first one.
full_game["date"] = os.path.basename(example_box_score)[:8]
full_game["date"] = pd.to_datetime(full_game["date"], format="%Y%m%d")

# Add the class label: whether the team in this row won
full_game["won"] = full_game["total"] > full_game["total_opp"]
games.append(full_game)

In [187]:
# Show the assembled rows for the example game (one row per team).
games_df = pd.DataFrame(games[0])
games_df

Unnamed: 0,mp,mp.1,fg,fga,fg%,3p,3pa,3p%,ft,fta,...,tov%_max_opp,usg%_max_opp,ortg_max_opp,drtg_max_opp,team_opp,total_opp,home_opp,season,date,won
0,240.0,240.0,32.0,87.0,0.368,9.0,27.0,0.333,15.0,20.0,...,66.7,36.8,200.0,101.0,CLE,117,1,2017,2016-10-25,False
1,240.0,240.0,45.0,94.0,0.479,13.0,35.0,0.371,14.0,19.0,...,50.0,36.8,153.0,125.0,NYK,88,0,2017,2016-10-25,True
