In [10]:
import os
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin
from urllib.parse import urlparse
import pandas as pd
from tqdm import tqdm
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.common.by import By
import time

In [11]:
chrome_options = Options()
chrome_options.add_argument("--headless")
chrome_options.add_argument("--no-sandbox")
chrome_options.add_argument("--disable-dev-shm-usage")
chrome_options.binary_location = "/usr/bin/google-chrome"

driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=chrome_options)

In [12]:
puzzle_img_dir = '../data/puzzles/puzzle_images'
solution_img_dir = '../data/puzzles/solution_images'
csv_output_path = '../data/puzzles/puzzles.csv'
os.makedirs(puzzle_img_dir, exist_ok=True)
os.makedirs(solution_img_dir, exist_ok=True)

In [13]:
def get_all_puzzle_links():
  base_url = "https://www.janestreet.com/puzzles/archive/"
  page = 1
  entries = []

  while True:
    page_url = base_url if page == 1 else f"{base_url}page{page}/index.html"
    res = requests.get(page_url)
    if res.status_code != 200:
      print(f"Stopped at page {page}")
      break

    soup = BeautifulSoup(res.text, "html.parser")
    puzzle_rows = soup.find_all("div", class_="row puzzle-row archive-list")

    if not puzzle_rows:
      print(f"No puzzles, stopped at page {page}")
      break

    for row in puzzle_rows:
      date_tag = row.find("span", class_="date")
      name_tag = row.find("span", class_="name")
      puzzle_link_tag = row.find("a", class_="puzzle-link")
      solution_link_tag = row.find("a", class_="solution-link")

      date = date_tag.text.strip().rstrip(":") if date_tag else ""
      name = name_tag.text.strip() if name_tag else ""
      puzzle_link = urljoin(base_url, puzzle_link_tag["href"]) if puzzle_link_tag else None
      solution_link = urljoin(base_url, solution_link_tag["href"]) if solution_link_tag else None

      if puzzle_link:
          entries.append({
              "date": date,
              "name": name,
              "puzzleLink": puzzle_link,
              "solutionLink": solution_link
          })
    page += 1

  return entries


In [14]:
def download_images(soup, folder, base_url):
    os.makedirs(folder, exist_ok=True)
    image_paths = []

    container = soup.find("div", class_="page-column row")
    if not container:
        return image_paths

    img_tags = container.find_all("img")

    for i, img in enumerate(img_tags):
        src = img.get("src")
        if not src:
            continue
        img_url = urljoin(base_url, src)
        ext = os.path.splitext(urlparse(img_url).path)[1]
        img_name = f"{i}{ext}"
        img_path = os.path.join(folder, img_name)
        if os.path.exists(img_path):
            image_paths.append(img_path)
            continue
        try:
            r = requests.get(img_url)
            with open(img_path, "wb") as f:
                f.write(r.content)
            image_paths.append(img_path)
        except:
            continue
    return image_paths

In [15]:
def extract_puzzle_body(link):
  base_url = "https://www.janestreet.com/"
  res = requests.get(link)
  soup = BeautifulSoup(res.text, "html.parser")

  puzzle_body = soup.find("div", class_="page-column row")
  p_tags = puzzle_body.find_all("p") if puzzle_body else []
  all_text = ""
  for p in p_tags:
    text = p.get_text(strip=True)
    if text:
      all_text += text + "\n"
  return all_text.strip(), soup, base_url

In [16]:
def get_solvers(url):
  driver.get(url)
  time.sleep(5)

  try:
    solver_element = driver.find_element(By.CSS_SELECTOR, 'p.correct-submissions.margin-top-20')
    br_tags = solver_element.find_elements(By.TAG_NAME, 'br')
    return len(br_tags)
  except Exception as e:
    print(f"Error retrieving numSolvers for {url}: {e}")
    return 0

In [17]:
def scrape_puzzles_to_csv():
  entries = get_all_puzzle_links()
  rows = []
  for i, entry in tqdm(enumerate(entries), total=len(entries)):
    row = {
        "id": i,
        "name": entry["name"],
        "date": entry["date"],
        "puzzleLink": entry["puzzleLink"],
        "puzzleText": "",
        "hasImage": False,
        "imagePaths": "",
        "hasSolution": bool(entry["solutionLink"]),
        "solutionLink": entry.get("solutionLink", ""),
        "solutionText": "",
        "solutionHasImages": False,
        "solutionImagePaths": "",
        "numSolvers": 0
    }

    try:
      puzzle_subdir = os.path.join(puzzle_img_dir, entry["name"])
      text, soup, base_url = extract_puzzle_body(entry["puzzleLink"])
      row["puzzleText"] = text

      puzzle_image_paths = download_images(soup, puzzle_subdir, base_url)
      row["hasImage"] = len(puzzle_image_paths) > 0
      row["imagePaths"] = ";".join(puzzle_image_paths)
    except Exception as e:
      print(f"[Puzzle {i}] Error: {e}")

    if entry.get("solutionLink"):
      row["numSolvers"] = get_solvers(entry["solutionLink"])
      try:
        solution_subdir = os.path.join(solution_img_dir, entry["name"])
        sol_text, sol_soup, sol_base_url = extract_puzzle_body(entry["solutionLink"])
        row["solutionText"] = sol_text

        solution_image_paths = download_images(sol_soup, solution_subdir, sol_base_url)
        row["solutionHasImages"] = len(solution_image_paths) > 0
        row["solutionImagePaths"] = ";".join(solution_image_paths)
      except Exception as e:
        print(f"[Solution {i}] Error: {e}")
    else:
      row["numSolvers"] = get_solvers(entry["puzzleLink"])

    rows.append(row)

  df = pd.DataFrame(rows)
  df.to_csv(csv_output_path, index=False)

  return df

In [18]:
df = scrape_puzzles_to_csv()

Stopped at page 15


 37%|███▋      | 50/136 [04:59<08:34,  5.99s/it]


KeyboardInterrupt: 

In [None]:
df.head()

Unnamed: 0,id,name,date,puzzleLink,puzzleText,hasImage,imagePaths,hasSolution,solutionLink,solutionText,solutionHasImages,solutionImagePaths,numSolvers
0,0,Number Cross 5,May 2025,https://www.janestreet.com/puzzles/current-puz...,(Update 5/5:We added a shaded square to the ce...,True,/content/drive/MyDrive/cse493g1/project/data/p...,False,,,False,,46
1,1,"Sum One, Somewhere",April 2025,https://www.janestreet.com/puzzles/sum-one-som...,"For a fixedp, independently label the nodes of...",True,/content/drive/MyDrive/cse493g1/project/data/p...,True,https://www.janestreet.com/puzzles/sum-one-som...,"For a fixedp, let f(p) be the probability that...",False,,1160
2,2,Hall of Mirrors 3,March 2025,https://www.janestreet.com/puzzles/hall-of-mir...,The perimeter of a 10-by-10 square field is su...,True,/content/drive/MyDrive/cse493g1/project/data/p...,True,https://www.janestreet.com/puzzles/hall-of-mir...,The unique placement of mirrors that satisfies...,True,/content/drive/MyDrive/cse493g1/project/data/p...,1394
3,3,Top Score (Give or Take),February 2025,https://www.janestreet.com/puzzles/top-score-g...,[The answer to this puzzle is a proper noun.],True,/content/drive/MyDrive/cse493g1/project/data/p...,True,https://www.janestreet.com/puzzles/top-score-g...,For our February puzzle we gave almost no dire...,False,,301
4,4,Somewhat Square Sudoku,January 2025,https://www.janestreet.com/puzzles/somewhat-sq...,Fill the empty cells in the grid above with di...,True,/content/drive/MyDrive/cse493g1/project/data/p...,True,https://www.janestreet.com/puzzles/somewhat-sq...,There are a few ways one could go about this m...,True,/content/drive/MyDrive/cse493g1/project/data/p...,505
