In [154]:
import time
from pathlib import Path

import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup

In [155]:
# Site root, including the trailing slash; used as the prefix when building
# absolute play-by-play URLs from the scraped relative box-score links.
BASE_URL = "https://www.basketball-reference.com/"

In [156]:
def scrape_boxscore_links(team_games_url):
    """Return the box-score hrefs found in the first table of a team schedule page.

    Every ``<td>`` of every row in the first ``<table>`` is inspected; the href
    of the first anchor in the cell is kept unless it contains ``"teams"`` or
    ``"boxscores/index"``. Links are returned in document order, exactly as
    they appear in the page (i.e. still relative if the page uses relative
    hrefs).

    Parameters
    ----------
    team_games_url : str
        URL of the team's season schedule page.

    Returns
    -------
    list of str
        The retained hrefs.

    Raises
    ------
    requests.HTTPError
        If the server answers with an error status.
    ValueError
        If the page contains no ``<table>`` element.
    """
    response = requests.get(team_games_url, timeout=30)
    # Fail loudly on 4xx/5xx instead of parsing an error page.
    response.raise_for_status()

    soup = BeautifulSoup(response.text, "html.parser")
    table = soup.find("table")
    if table is None:
        raise ValueError(f"No <table> found at {team_games_url}")

    links = []
    for row in table.find_all("tr"):
        for cell in row.find_all("td"):
            anchor = cell.find("a")
            # Cells without an anchor (or an anchor without href) are simply
            # not links; skip them explicitly rather than via a bare except.
            href = anchor.get("href") if anchor is not None else None
            if not href:
                continue
            if "teams" in href or "boxscores/index" in href:
                continue
            links.append(href)
    return links


# Scrape GSW box score links from a team game page
team_1_url = "https://www.basketball-reference.com/teams/GSW/2022_games.html"
team_1_boxscore_links = scrape_boxscore_links(team_1_url)

# Scrape BOS box score links from a team game page
team_2_url = "https://www.basketball-reference.com/teams/BOS/2022_games.html"
team_2_boxscore_links = scrape_boxscore_links(team_2_url)


In [169]:
# Turn each scraped box-score href into an absolute play-by-play URL by
# swapping its "/boxscores" segment for the site's "boxscores/pbp" prefix.
team1_pbp_links, team2_pbp_links = (
    [href.replace("/boxscores", f"{BASE_URL}boxscores/pbp") for href in hrefs]
    for hrefs in (team_1_boxscore_links, team_2_boxscore_links)
)


In [177]:
def _parse_score(text):
    """Return ``(team1, team2)`` as ints for an ``"A-B"`` score string, else ``None``."""
    left, sep, right = text.partition("-")
    if not sep:
        return None
    try:
        return int(left), int(right)
    except ValueError:
        return None


def _pbp_table_to_game_text(pbp):
    """Render one cleaned play-by-play table as text, or ``None`` if the game never ends.

    ``pbp`` must have the string columns ``Time``, ``Team1``, ``Score`` and
    ``Team2`` with missing cells spelled ``"nan"``. Each retained row becomes
    ``"{quarter} {clock} {score} {play}\\n"``. The game is finished by the first
    "End of ..." row in quarter >= 4 at clock ``0:00`` whose running score is
    not tied; that row is written as "End of Game" followed by
    ``<|endoftext|>`` and the text is returned immediately.
    """
    lines = []
    quarter = 1
    # Most recent valid "A-B" score seen on a play row; used for "End of ..."
    # rows, whose own Score cell holds the row label instead of a score.
    last_points = (0, 0)

    for _, record in pbp.iterrows():
        if record["Team1"] != "nan":
            play = record["Team1"]
        elif record["Team2"] != "nan":
            play = record["Team2"]
        else:
            # this should be end of quarters
            play = record["Score"]

        if play.startswith("Start of "):
            # A "Start of" row that precedes every emitted line announces the
            # opening period, so it must not advance the counter; advancing
            # unconditionally would label the first period 2 and reach the
            # quarter >= 4 end-of-game check one period early.
            if lines:
                quarter += 1
            # don't write this line to the text
            continue

        time_left = record["Time"][:-2]
        score = record["Score"]

        if score.startswith("End of"):
            # Only period-end rows may finish the game, and they are judged on
            # the score that was current when the period ended.
            team1_score, team2_score = last_points
            score = f"{team1_score}-{team2_score}"
            if quarter >= 4 and time_left == "0:00" and team1_score != team2_score:
                lines.append(f"{quarter} {time_left} {score} End of Game\n")
                # distilgpt2 end-of-text token closes the game
                lines.append("<|endoftext|>")
                return "".join(lines)
        elif score == play or score == "Score":
            # these are header rows
            # 4th Q == 4th Q for example
            continue
        else:
            points = _parse_score(score)
            if points is not None:
                last_points = points

        # Ordinary plays (including plays at 0:00 of a late period) and
        # non-final period ends, tied ones included, are all written.
        lines.append(f"{quarter} {time_left} {score} {play}\n")

    return None


def boxscore_pbp_to_distilgpt2(boxscore_urls, debug=False, seed=None, delay=0.1):
    """Given a list of urls to team boxscores, return training text and validation text.

    Each URL is read with ``pandas.read_html``; its first table is converted to
    one block of game text terminated by ``<|endoftext|>``, and the block is
    assigned to the training text with probability 0.8 or to the validation
    text with probability 0.2. Tables that never reach a decided final period
    contribute nothing.

    Parameters
    ----------
    boxscore_urls : iterable of str
        Play-by-play page URLs (anything ``pandas.read_html`` accepts).
    debug : bool, default False
        Print each URL before it is fetched.
    seed : int or None, default None
        When given, the train/validation draw uses a private generator seeded
        with this value so the split is reproducible. When ``None`` the draw
        uses NumPy's global random state, as before.
    delay : float, default 0.1
        Seconds to sleep after each URL.
        NOTE(review): confirm this pause satisfies the site's published
        request-rate limit; raise it if requests start being rejected.

    Returns
    -------
    tuple of (str, str)
        ``(training_text, validation_text)``; each game is followed by a
        newline.
    """
    chooser = np.random if seed is None else np.random.default_rng(seed)

    training_games = []
    validation_games = []

    for link in boxscore_urls:
        if debug:
            print(link)
        pbp = pd.read_html(link)[0]
        # convert nulls to str representation of NaN
        pbp = pbp.astype(str)
        pbp.columns = pbp.columns.droplevel()
        pbp.columns = ["Time", "Team1", "Team1points", "Score", "Team2points", "Team2"]
        # mid-game jump balls throw off the parser, it's easier to just remove them
        # NOTE(review): clock strings are later trimmed with [:-2], which
        # suggests they carry a two-character suffix; if so the exact match on
        # "12:00" never fires. Kept unchanged; confirm the intent.
        keep = (~pbp["Score"].str.startswith("Jump ball")) & (pbp["Time"] != "12:00")
        pbp = pbp.loc[keep].reset_index(drop=True)

        game_text = _pbp_table_to_game_text(pbp)
        if game_text is not None:
            # decide where to put the game, train or val
            split = chooser.choice(["train", "val"], p=[0.8, 0.2])
            bucket = training_games if split == "train" else validation_games
            bucket.append(game_text + "\n")

        # be nice to the API
        time.sleep(delay)

    return "".join(training_games), "".join(validation_games)


In [178]:
# Build the train/validation text for each team's games, team 1 first.
(
    (team1_training_text, team1_validation_text),
    (team2_training_text, team2_validation_text),
) = [
    boxscore_pbp_to_distilgpt2(pbp_links, debug=False)
    for pbp_links in (team1_pbp_links, team2_pbp_links)
]

https://www.basketball-reference.com/boxscores/pbp/202110190LAL.html
https://www.basketball-reference.com/boxscores/pbp/202110210GSW.html
https://www.basketball-reference.com/boxscores/pbp/202110240SAC.html
https://www.basketball-reference.com/boxscores/pbp/202110260OKC.html
https://www.basketball-reference.com/boxscores/pbp/202110280GSW.html
https://www.basketball-reference.com/boxscores/pbp/202110300GSW.html
https://www.basketball-reference.com/boxscores/pbp/202111030GSW.html
https://www.basketball-reference.com/boxscores/pbp/202111050GSW.html
https://www.basketball-reference.com/boxscores/pbp/202111070GSW.html
https://www.basketball-reference.com/boxscores/pbp/202111080GSW.html
https://www.basketball-reference.com/boxscores/pbp/202111100GSW.html
https://www.basketball-reference.com/boxscores/pbp/202111120GSW.html
https://www.basketball-reference.com/boxscores/pbp/202111140CHO.html
https://www.basketball-reference.com/boxscores/pbp/202111160BRK.html
https://www.basketball-reference.c

In [183]:
TRAINING_FILENAME = "data/BOS_GSW_pbp_2021-22_for_distilgpt2.train.txt"
VALIDATION_FILENAME = "data/BOS_GSW_pbp_2021-22_for_distilgpt2.validation.txt"

# Make sure the output directory exists so a fresh checkout does not fail
# with FileNotFoundError on the first write.
for output_filename in (TRAINING_FILENAME, VALIDATION_FILENAME):
    Path(output_filename).parent.mkdir(parents=True, exist_ok=True)

# Write UTF-8 explicitly: scraped play text may contain non-ASCII characters
# (e.g. accented names), which the platform default encoding may not be able
# to encode.
with open(TRAINING_FILENAME, "w", encoding="utf-8") as f:
    f.write(team1_training_text)
    f.write(team2_training_text)

with open(VALIDATION_FILENAME, "w", encoding="utf-8") as f:
    f.write(team1_validation_text)
    f.write(team2_validation_text)