In [32]:
import os
import pandas as pd
from bs4 import BeautifulSoup
from io import StringIO
import re
import datetime
pd.set_option('display.max_columns', None)
pd.set_option('display.min_rows', None)

In [33]:
SCORE_DIR = '/Users/benjamincheng/Documents/GitHub/Sports-Betting/data/HTML/scores'

In [34]:
box_scores = os.listdir(SCORES_DIR)

In [35]:
box_scores = [os.path.join(SCORE_DIR, f) for f in box_scores if f.endswith(".html") and (f.startswith("202403")) or (f.startswith("202404"))] #or f.startswith("201710")or f.startswith("201711")or f.startswith("201712"))]

In [36]:
box_scores

['/Users/benjamincheng/Documents/GitHub/Sports-Betting/data/HTML/scores/202403250HOU.html',
 '/Users/benjamincheng/Documents/GitHub/Sports-Betting/data/HTML/scores/202403140MIL.html',
 '/Users/benjamincheng/Documents/GitHub/Sports-Betting/data/HTML/scores/202403220POR.html',
 '/Users/benjamincheng/Documents/GitHub/Sports-Betting/data/HTML/scores/202403170WAS.html',
 '/Users/benjamincheng/Documents/GitHub/Sports-Betting/data/HTML/scores/202403230POR.html',
 '/Users/benjamincheng/Documents/GitHub/Sports-Betting/data/HTML/scores/202403240MIN.html',
 '/Users/benjamincheng/Documents/GitHub/Sports-Betting/data/HTML/scores/202403180SAC.html',
 '/Users/benjamincheng/Documents/GitHub/Sports-Betting/data/HTML/scores/202403150UTA.html',
 '/Users/benjamincheng/Documents/GitHub/Sports-Betting/data/HTML/scores/202403050NYK.html',
 '/Users/benjamincheng/Documents/GitHub/Sports-Betting/data/HTML/scores/202403290WAS.html',
 '/Users/benjamincheng/Documents/GitHub/Sports-Betting/data/HTML/scores/20240302

In [37]:
def parse_html(box_scores):
    with open(box_scores) as f:
        html = f.read()
        
    soup = BeautifulSoup(html)
    [s.decompose() for s in soup.select("tr.over_header")]
    [s.decompose() for s in soup.select("tr.thead")]
    
    return soup

In [38]:
def read_line_score(soup):
    html = StringIO(str(soup))
    line_score = pd.read_html(html, attrs = {"id": "line_score"})[0]
    columns = list(line_score.columns)
    columns[0] = "Teams"
    columns[-1] = "Total"
    line_score.columns = columns 
    
    line_score = line_score[["Teams", "Total"]]
    return line_score

In [39]:
def four_factors(soup):
    html = StringIO(str(soup))
    factors = pd.read_html(html, attrs = {"id": "four_factors"})[0]
    columns = list(factors.columns)
    factors.columns = columns
    
    factors = factors[['Pace', 'eFG%', 'TOV%', 'ORB%', 'FT/FGA', 'ORtg']]
    return factors

In [40]:
def read_stats(soup, team, stat):
    html = StringIO(str(soup))
    stats = pd.read_html(html, attrs = {"id": f"box-{team}-game-{stat}"}, index_col = 0)[0]
    stats = stats.apply(pd.to_numeric, errors = "coerce")
    return stats

In [41]:
def read_season_info(soup):
    nav = soup.select("#bottom_nav_container")[0]
    hrefs = [a["href"] for a in nav.find_all("a")]
    season = os.path.basename(hrefs[1]).split("_")[0]
    return season

In [42]:
def read_record(soup):
    he = []
    html = StringIO(str(soup))
    scoreboxes = soup.find_all(class_='scorebox')
    score_pattern = re.compile(r'\d+-\d+')
    
    for scorebox in scoreboxes:
        # Find all div elements within each scorebox that match the score pattern
        score_divs = scorebox.find_all('div', string=score_pattern)
        # If any matching divs are found
        if score_divs:
            # Loop through each matching div
            for score_div in score_divs:
                # Do something with the div
                record = score_div.text.split("-")
                df_record = pd.DataFrame([record])
                he.append(df_record)
    df = pd.concat(he)
    columns = list(df.columns)
    columns[0] = "Wins"
    columns[1] = "Losses"
    df.columns = columns
    df = df.reset_index()
    del df["index"]
    return df

In [43]:
base_cols = None
games = []

for box_score in box_scores:
    soup = parse_html(box_score)
    line_score = read_line_score(soup)
    factors = four_factors(soup)
    try:
        records = read_record(soup)
    except:
        continue
    teams = list(line_score['Teams'])
    summaries = []
    for team in teams:
        basic = read_stats(soup, team, "basic")
        advanced = read_stats(soup, team, "advanced")
        advanced.drop("MP", axis=1, inplace=True)
        total = pd.concat([basic.iloc[-1,:], advanced.iloc[-1,:]])
        total.index = total.index.str.lower()

        maxes = pd.concat([basic.iloc[:-1].max(), advanced.iloc[:-1].max()])
        maxes.index = maxes.index.str.lower() + "maxes"

        summary = pd.concat([total, maxes])

        if base_cols is None:
            base_cols = list(summary.index.drop_duplicates(keep='first'))
            base_cols = [b for b in base_cols if "bpm" not in b]

        summary = summary[base_cols]
        summaries.append(summary)

    summary = pd.concat(summaries, axis =1).T

    game = pd.concat([summary, line_score,factors, records], axis = 1)
    game['home'] = [0,1]

    game_opp = game.iloc[::-1].reset_index()
    game_opp.columns += "_opp"

    full_game = pd.concat([game, game_opp], axis = 1)

    full_game["season"] = read_season_info(soup)
    full_game["date"] = os.path.basename(box_score)[:8]
    full_game["date"] = pd.to_datetime(full_game["date"], format = "%Y%m%d")
    full_game["won"] = full_game["Total"] > full_game["Total_opp"]

    games.append(full_game)

    if (len(games) % 100 == 0):
        print(f"{len(games)} / {len(box_scores)}")

100 / 236
200 / 236


In [44]:
green_eggs = pd.concat(games, ignore_index = True)

In [45]:
def setup(df, whole):
    date = check['date'].iloc[-1][:10]
    before = pd.to_datetime(date)
    #after = pd.Timestamp(datetime.date(2024,12,12))

     # Subtract a year from 'date' column values exceeding the threshold
    for index,row in df.iterrows():
        if row['date'] <= before:
            df.drop(labels=index, axis=0, inplace=True)
    
#     for index,row in df.iterrows():
#         if row['date'] > after:
#             df.drop(labels=index, axis=0, inplace=True)
            
    df = df.sort_values("date")
    df = df.reset_index(drop=True)
    
    del df['index_opp']
    del df["mpmaxes"]
    del df["mpmaxes_opp"]
    del df['+/-']
    del df["+/-_opp"]
    
    return df

In [46]:
folder_path = "/Users/benjamincheng/Documents/GitHub/Sports-Betting/data/raw_data/"
csv_path = "NBA_2018_2024.csv"
complete_path = folder_path + csv_path
whole = pd.read_csv(complete_path, index_col=0)
df = setup(green_eggs, whole)
complete = pd.concat([whole,df], axis=0)

In [48]:
complete['date']

0               2017-10-17
1               2017-10-17
2               2017-10-17
3               2017-10-17
4               2017-10-18
5               2017-10-18
6               2017-10-18
7               2017-10-18
8               2017-10-18
9               2017-10-18
10              2017-10-18
11              2017-10-18
12              2017-10-18
13              2017-10-18
14              2017-10-18
15              2017-10-18
16              2017-10-18
17              2017-10-18
18              2017-10-18
19              2017-10-18
20              2017-10-18
21              2017-10-18
22              2017-10-18
23              2017-10-18
24              2017-10-18
25              2017-10-18
26              2017-10-19
27              2017-10-19
28              2017-10-19
29              2017-10-19
              ...         
148    2024-03-31 00:00:00
149    2024-03-31 00:00:00
150    2024-03-31 00:00:00
151    2024-03-31 00:00:00
152    2024-03-31 00:00:00
153    2024-03-31 00:00:00
1

In [49]:
folder_path = "/Users/benjamincheng/Documents/GitHub/Sports-Betting/data/raw_data/"
csv_path = "NBA_2018_2024.csv"
complete_path = folder_path + csv_path
complete.to_csv(complete_path)