In [None]:
import asyncio
import os
import time

from bs4 import BeautifulSoup
from playwright.async_api import async_playwright, TimeoutError as PlaywrightTimeout

In [None]:
# !pip install beautifulsoup4
# !pip install playwright
# !playwright install
# !playwright install-deps

In [None]:
# Mount Google Drive (Colab only) so paths under /content/drive are readable and writable.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Years to scrape: 2015 through 2023 inclusive (the upper bound of range() is exclusive).
SEASONS = [year for year in range(2015, 2024)]
SEASONS

[2015, 2016, 2017, 2018, 2019, 2020, 2021, 2022, 2023]

In [None]:
# Root folder on the mounted Drive where scraped HTML is stored.
DATA_DIR = "/content/drive/MyDrive/Colab Notebooks/Basketball Analytics Project/data"

# NOTE(review): STANDINGS_DIR and SCORES_DIR are used by later cells but were never
# assigned in this notebook, so a fresh kernel stopped with NameError. The sub-folder
# names below are an assumption -- confirm them against the actual Drive layout.
STANDINGS_DIR = os.path.join(DATA_DIR, "standings")
SCORES_DIR = os.path.join(DATA_DIR, "scores")

In [None]:
def is_valid_integer(string):
    """Report whether ``string`` can be converted with ``int()``.

    Args:
        string: Value to test, typically the last path segment of a URL.

    Returns:
        True if ``int(string)`` succeeds, False otherwise. Values that
        ``int()`` rejects with TypeError (e.g. ``None``) also yield False
        instead of propagating the exception.
    """
    try:
        int(string)
    except (TypeError, ValueError):
        return False
    return True


In [None]:
async def get_html(url, selector, sleep=5, retries=3):
  """Fetch a page with headless Firefox and return the inner HTML of one element.

  Args:
    url: Page to load.
    selector: CSS selector of the element whose inner HTML is returned.
    sleep: Base delay in seconds. Attempt ``i`` (1-based) waits ``sleep * i``
      seconds before loading the page.
    retries: Maximum number of attempts.

  Returns:
    The element's inner HTML as a string, or None if every attempt ended in a
    Playwright timeout (or ``retries`` is 0). Other exceptions propagate.
  """
  html = None
  for attempt in range(1, retries + 1):
    # Awaitable sleep: time.sleep() here would freeze the notebook's event loop.
    await asyncio.sleep(sleep * attempt)

    try:
      async with async_playwright() as p:
        browser = await p.firefox.launch()
        try:
          page = await browser.new_page()
          await page.goto(url)
          print(await page.title())
          html = await page.inner_html(selector)
        finally:
          # Release the browser even when the page load or lookup fails.
          await browser.close()
    except PlaywrightTimeout:
      print(f"Timeout error on {url}")
      continue
    break

  return html

In [None]:
async def scrape_season(season):
  """Download Stephen Curry game-log pages and store them under DATA_DIR.

  Loads the game-log page for ``season``, reads its season navigation
  (``#inner_nav .hoversmooth``) to collect every game-log link whose trailing
  year is in SEASONS, then saves the ``#content`` HTML of each linked page to
  ``DATA_DIR/<year>``. Pages already on disk are skipped.

  Args:
    season: Year used to build the entry-point URL (e.g. 2016).

  Returns:
    None. Results are written to disk.
  """
  # The URL previously hard-coded 2016 and ignored the `season` argument.
  url = f"https://www.basketball-reference.com/players/c/curryst01/gamelog/{season}"
  nav_html = await get_html(url, "#inner_nav .hoversmooth")
  if not nav_html:
    # get_html returns None once every retry has timed out; nothing to parse.
    print(f"Could not load season navigation from {url}")
    return

  # Name the parser explicitly so results do not depend on which parsers are installed.
  soup = BeautifulSoup(nav_html, "html.parser")
  visited = set()  # hrefs already queued, to avoid duplicate downloads
  games_pages = []

  for link in soup.find_all("a"):
    href = link.get("href")
    if not href or "gamelog" not in href or href in visited:
      continue
    year = href.split("/")[-1]
    # A separate name keeps the `season` argument from being overwritten.
    if is_valid_integer(year) and int(year) in SEASONS:
      games_pages.append(f"https://www.basketball-reference.com{href}")
      visited.add(href)

  os.makedirs(DATA_DIR, exist_ok=True)
  for page_url in games_pages:
    save_path = os.path.join(DATA_DIR, page_url.split("/")[-1])
    if os.path.exists(save_path):
      continue

    html = await get_html(page_url, "#content")
    if not html:
      continue
    # Explicit encoding: scraped HTML may contain non-ASCII player names.
    with open(save_path, "w", encoding="utf-8") as f:
      f.write(html)


In [None]:
# Run the scraper once per configured season, sequentially.
# Top-level `await` works because the notebook cell executes inside a running
# event loop; a plain Python script would need asyncio.run(...) instead.
for season in SEASONS:
  await scrape_season(season)

Stephen Curry 2015-16 Game Log | Basketball-Reference.com
Stephen Curry 2014-15 Game Log | Basketball-Reference.com
Stephen Curry 2015-16 Game Log | Basketball-Reference.com
Stephen Curry 2016-17 Game Log | Basketball-Reference.com
Stephen Curry 2017-18 Game Log | Basketball-Reference.com
Stephen Curry 2018-19 Game Log | Basketball-Reference.com
Stephen Curry 2019-20 Game Log | Basketball-Reference.com
Stephen Curry 2020-21 Game Log | Basketball-Reference.com
Stephen Curry 2021-22 Game Log | Basketball-Reference.com
Stephen Curry 2022-23 Game Log | Basketball-Reference.com
Stephen Curry 2015-16 Game Log | Basketball-Reference.com
Stephen Curry 2015-16 Game Log | Basketball-Reference.com
Stephen Curry 2015-16 Game Log | Basketball-Reference.com
Stephen Curry 2015-16 Game Log | Basketball-Reference.com
Stephen Curry 2015-16 Game Log | Basketball-Reference.com
Stephen Curry 2015-16 Game Log | Basketball-Reference.com
Stephen Curry 2015-16 Game Log | Basketball-Reference.com
Stephen Curry 

In [None]:
# Requires STANDINGS_DIR to be defined earlier in the notebook.
# Keep regular files only (os.listdir also returns sub-directories, which cannot
# be opened as HTML) and sort them so every run processes files in the same order.
standings_files = sorted(
  name for name in os.listdir(STANDINGS_DIR)
  if os.path.isfile(os.path.join(STANDINGS_DIR, name))
)

In [None]:
async def scrape_game(standings_file):
  """Download every box-score page linked from one saved HTML file.

  Reads ``standings_file``, collects each ``<a href>`` containing both
  "boxscore" and ".html", and saves the ``#content`` HTML of each linked page
  to ``SCORES_DIR/<file name from the URL>``. Pages already on disk are
  skipped; pages that cannot be fetched are skipped without raising.

  Args:
    standings_file: Path to a previously saved HTML file.

  Returns:
    None. Results are written to disk.
  """
  # NOTE(review): assumes the saved file is UTF-8 encoded -- confirm if the files
  # were produced on a machine with a different default encoding.
  with open(standings_file, "r", encoding="utf-8") as f:
    # Name the parser explicitly so results do not depend on which parsers are installed.
    soup = BeautifulSoup(f.read(), "html.parser")

  hrefs = (anchor.get("href") for anchor in soup.find_all("a"))
  box_scores = [
    f"https://www.basketball-reference.com{href}"
    for href in hrefs
    if href and "boxscore" in href and ".html" in href
  ]
  # Drop repeated links while keeping first-seen order.
  box_scores = list(dict.fromkeys(box_scores))

  os.makedirs(SCORES_DIR, exist_ok=True)
  for url in box_scores:
    save_path = os.path.join(SCORES_DIR, url.split("/")[-1])
    if os.path.exists(save_path):
      continue

    html = await get_html(url, "#content")
    if not html:
      continue
    with open(save_path, "w", encoding="utf-8") as f:
      f.write(html)

In [None]:
# Scrape box scores for each listed file, one file at a time.
# Top-level `await` relies on the notebook's running event loop.
for f in standings_files:
  filepath = os.path.join(STANDINGS_DIR, f)

  await scrape_game(filepath)

Cavaliers vs Bulls, October 27, 2015 | Basketball-Reference.com
Pistons vs Hawks, October 27, 2015 | Basketball-Reference.com
Pelicans vs Warriors, October 27, 2015 | Basketball-Reference.com
Tiemout error on https://www.basketball-reference.com/boxscores/201510280ORL.html
Wizards vs Magic, October 28, 2015 | Basketball-Reference.com
76ers vs Celtics, October 28, 2015 | Basketball-Reference.com
Bulls vs Nets, October 28, 2015 | Basketball-Reference.com
Jazz vs Pistons, October 28, 2015 | Basketball-Reference.com
Pacers vs Raptors, October 28, 2015 | Basketball-Reference.com
Hornets vs Heat, October 28, 2015 | Basketball-Reference.com
Knicks vs Bucks, October 28, 2015 | Basketball-Reference.com
Spurs vs Thunder, October 28, 2015 | Basketball-Reference.com
Cavaliers vs Grizzlies, October 28, 2015 | Basketball-Reference.com
Nuggets vs Rockets, October 28, 2015 | Basketball-Reference.com
Mavericks vs Suns, October 28, 2015 | Basketball-Reference.com
Pelicans vs Trail Blazers, October 28, 2

CancelledError: ignored

In [None]:
# Display the collected file names as the cell output.
standings_files