In [2]:
import asyncio
import os
import time

from bs4 import BeautifulSoup
from playwright.async_api import async_playwright, TimeoutError as PlaywrightTimeout

In [None]:
!pip install BeautifulSoup
!pip install playwright
!playwright install
!playwright install-deps

In [None]:
# Mount Google Drive at /content/drive; DATA_DIR lives under this mount point,
# so this cell must run before anything reads or writes scraped files.
from google.colab import drive
drive.mount('/content/drive')

In [5]:
# Season years to scrape. The upper bound of range() is exclusive, so this
# covers 2015 through 2023.
SEASONS = [*range(2015, 2024)]
SEASONS

[2015, 2016, 2017, 2018, 2019, 2020, 2021, 2022, 2023]

In [6]:
# Directory (on the mounted Google Drive) where the scraped HTML of each
# season page is saved, one file per season.
DATA_DIR = "/content/drive/MyDrive/Colab Notebooks/Basketball Analytics Project/data"

In [7]:
def is_valid_integer(string):
    """Return True if ``string`` can be converted with ``int()``, else False.

    Args:
        string: The value to check, typically the last path segment of a URL.

    Returns:
        bool: True when ``int(string)`` succeeds. False when it raises
        ``ValueError`` (e.g. ``"abc"`` or ``""``) or ``TypeError``
        (e.g. ``None``).
    """
    try:
        int(string)
    except (TypeError, ValueError):
        # TypeError covers non-string inputs such as None, so callers get a
        # plain False instead of an exception.
        return False
    return True


In [8]:
async def get_html(url, selector, sleep=5, retries=3):
  """Load ``url`` in Firefox via Playwright and return the inner HTML of ``selector``.

  Args:
    url: Address of the page to open.
    selector: Selector of the element whose inner HTML is returned.
    sleep: Base delay in seconds. Attempt ``i`` (1-based) waits ``sleep * i``
      seconds before it starts, so the first attempt is delayed as well.
    retries: Maximum number of attempts.

  Returns:
    The inner HTML string of the matched element, or ``None`` if every
    attempt timed out (or ``retries`` is 0).
  """
  html = None
  for attempt in range(1, retries + 1):
    # Growing delay between attempts. Awaiting asyncio.sleep (instead of
    # calling time.sleep) keeps the notebook's event loop responsive.
    await asyncio.sleep(sleep * attempt)

    try:
      async with async_playwright() as p:
        browser = await p.firefox.launch()
        try:
          page = await browser.new_page()
          await page.goto(url)
          print(await page.title())
          html = await page.inner_html(selector)
        finally:
          # Always release the browser, including when a step above times out.
          await browser.close()
    except PlaywrightTimeout:
      print(f"Timeout error on {url}")
      continue
    # Success: stop retrying.
    break

  return html

In [9]:
async def scrape_season(season):
  """Download the game-log page for ``season`` and save it under ``DATA_DIR``.

  The season navigation menu of a fixed game-log page is read to discover the
  link whose last path segment equals ``season``. The ``#content`` HTML of
  that page is then written to ``DATA_DIR/<season>``.

  Nothing is fetched when the output file already exists. Nothing is written
  when the navigation menu cannot be loaded, when no link for ``season`` is
  found, or when the season page cannot be loaded.

  Args:
    season: Season year to scrape (e.g. ``2016``).

  Returns:
    None.
  """
  target_season = int(season)
  save_path = os.path.join(DATA_DIR, str(target_season))
  if os.path.exists(save_path):
    # Already downloaded; skip all network requests for this season.
    return

  # Fixed entry page, used only to read the per-season navigation links.
  nav_url = 'https://www.basketball-reference.com/players/c/curryst01/gamelog/2016'
  nav_html = await get_html(nav_url, "#inner_nav .hoversmooth")
  if not nav_html:
    # get_html returns None when every attempt timed out.
    print(f"Could not load season navigation from {nav_url}")
    return

  # Name the parser explicitly so results do not depend on which optional
  # parser libraries happen to be installed.
  soup = BeautifulSoup(nav_html, "html.parser")

  season_url = None
  for link in soup.find_all("a"):
    href = link.get("href")
    if not href or "gamelog" not in href:
      continue
    last_segment = href.split('/')[-1]
    if is_valid_integer(last_segment) and int(last_segment) == target_season:
      season_url = f"https://www.basketball-reference.com{href}"
      break

  if season_url is None:
    print(f"No game log link found for season {target_season}")
    return

  html = await get_html(season_url, "#content")
  if not html:
    return

  os.makedirs(DATA_DIR, exist_ok=True)
  # Explicit encoding so the saved page does not depend on the platform default.
  with open(save_path, "w", encoding="utf-8") as f:
    f.write(html)


In [None]:
# Run the scraper once per season, awaiting each call before starting the
# next so requests are issued one after another rather than concurrently.
for season in SEASONS:
    await scrape_season(season)

In [12]:
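# Destructive cleanup helper: uncomment to delete every file under the data
# directory, which forces all pages to be downloaded again on the next run.
# !find '/content/drive/MyDrive/Colab Notebooks/Basketball Analytics Project/data' -type f -delete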