In [7]:
# cell 1 — set up Selenium (Chrome) and go to the ESPN page
# If needed in your env: !pip install selenium webdriver-manager bs4 pandas lxml

import time
from typing import List, Dict, Tuple
from dataclasses import dataclass
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.options import Options
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import pandas as pd

# Page opened by the driver in this cell; per its path, ESPN's fantasy
# basketball player projections listing.
ESPN_URL = "https://fantasy.espn.com/basketball/players/projections"

def make_driver(headless: bool = True) -> webdriver.Chrome:
    """
    Build and start a Chrome WebDriver with a fixed set of launch flags.

    headless: when truthy, "--headless=new" is passed to Chrome first.
    Returns the started webdriver.Chrome; the chromedriver binary path comes
    from ChromeDriverManager().install().
    """
    user_agent_flag = (
        "user-agent=Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) "
        "AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"
    )
    chrome_flags = ["--headless=new"] if headless else []
    chrome_flags += [
        "--no-sandbox",
        "--disable-dev-shm-usage",
        "--disable-gpu",
        "--window-size=1400,1000",
        user_agent_flag,
    ]

    options = Options()
    for flag in chrome_flags:
        options.add_argument(flag)

    return webdriver.Chrome(
        service=Service(ChromeDriverManager().install()),
        options=options,
    )

# Start a headless browser session and open the projections page.
driver = make_driver(headless=True)
driver.get(ESPN_URL)

# Wait (up to 20 s) for at least one player block to be present in the DOM;
# the confirmation below is only printed once the wait has succeeded.
player_block_locator = (By.CSS_SELECTOR, ".full-projection-table")
WebDriverWait(driver, 20).until(EC.presence_of_element_located(player_block_locator))
print("✅ Reached projections page and found player blocks.")


✅ Reached projections page and found player blocks.


In [8]:
# cell 2 — helpers to parse a single player block

from bs4 import BeautifulSoup

def text_or_none(node):
    """Return node.get_text(strip=True), or None when `node` is falsy (e.g. None)."""
    if not node:
        return None
    return node.get_text(strip=True)

def parse_stat_table(table_el) -> Tuple[List[str], List[List[str]]]:
    """
    Split a stat <table> element into header labels and body rows.

    table_el: element whose "outerHTML" attribute holds the table markup.
    Returns (headers, rows):
      headers: one label per <th> under <thead>, e.g.
               ['year','GP','MIN','FG%','FT%','3PM','REB','AST','A/TO','STL','BLK','TO','PTS'].
               The first <span> inside the <th> supplies the text when there
               is one; otherwise the <th> itself does.
      rows:    one list per <tr> under <tbody>, holding each <td>'s text.
    Either part is an empty list when its section is absent from the markup.
    """
    markup = BeautifulSoup(table_el.get_attribute("outerHTML"), "lxml")

    def header_label(th):
        inner = th.select_one("span")
        return (inner if inner else th).get_text(strip=True)

    head = markup.select_one("thead")
    headers = [header_label(th) for th in head.select("th")] if head else []

    body = markup.select_one("tbody")
    rows = (
        [[text_or_none(td) for td in tr.select("td")] for tr in body.select("tr")]
        if body
        else []
    )
    return headers, rows

def _first_text(root_el, selector: str):
    """Return the stripped text of the first element under `root_el` matching `selector`, or None."""
    # A single find_elements call both tests for existence and fetches the
    # element: it returns an empty list instead of raising when nothing matches.
    found = root_el.find_elements(By.CSS_SELECTOR, selector)
    return found[0].text.strip() if found else None


def parse_player_block(block_el) -> Dict:
    """
    Extract player identity, 2025 stats, 2026 projections, and outlook text.

    block_el: the element for one player block; all lookups are scoped to it.

    Returns a flat dict with the keys "rank", "player", "team", "pos" and
    "outlook_2026" (each None when its element is missing), followed by one
    "<tag>_<header>" entry per stat cell. <tag> is "y2025_stats" when the
    row's "year" label contains "STATISTICS", "y2026_proj" when it contains
    "PROJECTIONS", and otherwise the label lower-cased with spaces replaced
    by underscores. <header> is the column header lower-cased.
    """
    # Identity
    player_name = _first_text(block_el, ".player-name a, .player-name")
    team = _first_text(block_el, ".player-teamname")
    pos = _first_text(block_el, ".position-eligibility")
    rank = _first_text(block_el, ".playerInfo__rank .table--cell, .playerInfo__rank")

    # Stat table: the first <table> found under .player__fullprojections__stats
    stat_tables = block_el.find_elements(By.CSS_SELECTOR, ".player__fullprojections__stats table")
    headers, year_rows = [], []
    if stat_tables:
        # typically 2 rows: [2025 STATISTICS], [2026 PROJECTIONS]
        headers, year_rows = parse_stat_table(stat_tables[0])

    # Outlook text
    outlook = _first_text(block_el, ".full-projection-player-outlook__content")

    # Identity keys go first so they lead the column order downstream.
    out = {
        "rank": rank,
        "player": player_name,
        "team": team,
        "pos": pos,
        "outlook_2026": outlook,
    }

    for row in year_rows:
        # A row whose cell count differs from the header count cannot be
        # matched to column names, so it is skipped.
        if len(row) != len(headers):
            continue
        stats = dict(zip(headers, row))

        # Pick a simple key prefix based on the row's label.
        year_label = stats.get("year", "")
        label_upper = year_label.upper()
        if "STATISTICS" in label_upper:
            tag = "y2025_stats"
        elif "PROJECTIONS" in label_upper:
            tag = "y2026_proj"
        else:
            tag = year_label.replace(" ", "_").lower()

        for header, value in stats.items():
            if header == "year":
                continue  # the label is encoded in the prefix, not stored as a value
            out[f"{tag}_{header.lower()}"] = value
    return out


In [None]:
# cell 3 — parse the current page into records

def parse_current_page(driver) -> List[Dict]:
    """
    Turn every ".full-projection-table" element on the current page into a record.

    A block whose parsing raises is reported on stdout and left out, so one
    odd block does not fail the whole page. Prints the number of records
    parsed and returns them as a list of dicts from parse_player_block.
    """
    records = []
    for block in driver.find_elements(By.CSS_SELECTOR, ".full-projection-table"):
        try:
            record = parse_player_block(block)
        except Exception as e:
            print("Parse error on a block:", e)
        else:
            records.append(record)
    print(f"🧾 Parsed {len(records)} players from this page.")
    return records

# Parse whichever page the browser is currently showing.
page_data = parse_current_page(driver)
# Trailing bare expression: as the last statement of a notebook cell, this is
# what gets displayed (the first two parsed records as a table).
pd.DataFrame(page_data).head(2)


In [None]:
# cell 4 — paginate through all pages and save to CSV

def click_next(driver) -> bool:
    """
    Click the 'next' pagination button and wait for the next page to render.

    Returns True only once the page has actually changed: the active-page
    marker is present and the text of the first player block differs from
    what was shown before the click. Returns False if the button is missing
    or disabled, or if the change could not be confirmed within 20 seconds
    (in that last case the reason is printed so a truncated scrape is visible).
    """
    # Local import: these exception classes are not imported at the top of the notebook.
    from selenium.common.exceptions import (
        NoSuchElementException,
        StaleElementReferenceException,
    )

    def first_block_text(drv):
        # Text of the first player block, or None when no block is rendered.
        blocks = drv.find_elements(By.CSS_SELECTOR, ".full-projection-table")
        return blocks[0].text if blocks else None

    try:
        next_btn = driver.find_element(By.CSS_SELECTOR, ".Pagination__Button--next")

        # get_attribute returns None for an absent attribute; fall back to ""
        # so the substring test cannot raise.
        css_classes = next_btn.get_attribute("class") or ""
        if next_btn.get_attribute("aria-disabled") == "true" or "Button--disabled" in css_classes:
            return False

        # Snapshot taken before the click. Merely waiting for a block or the
        # active-page marker to be present would succeed immediately, because
        # both already exist on the page we are leaving.
        before = first_block_text(driver)
        driver.execute_script("arguments[0].click();", next_btn)

        def page_changed(drv):
            if not drv.find_elements(By.CSS_SELECTOR, ".Pagination__list__item[aria-current='page']"):
                return False
            current = first_block_text(drv)
            return current is not None and current != before

        # Elements can be replaced while the page re-renders; treat a stale
        # reference as "not ready yet" and keep polling.
        WebDriverWait(
            driver, 20, ignored_exceptions=(StaleElementReferenceException,)
        ).until(page_changed)
        return True
    except NoSuchElementException:
        # No 'next' button on the page: nothing further to visit.
        return False
    except Exception as exc:
        # Best effort, as before: stop paginating rather than crash, but say why.
        print("Could not advance to the next page:", exc)
        return False

# Walk the pagination from the page currently shown: parse it, then advance
# until click_next reports that there is no further page.
all_rows = []
seen_pages = 0

while True:
    seen_pages += 1
    all_rows.extend(parse_current_page(driver))
    moved = click_next(driver)
    if not moved:
        break

df = pd.DataFrame(all_rows)

# Stat columns are named "<tag>_<header>"; these are the lower-cased header
# suffixes whose values should be numbers.
NUMERIC_SUFFIXES = (
    "_gp", "_min", "_fg%", "_ft%", "_3pm", "_reb",
    "_ast", "_a/to", "_stl", "_blk", "_to", "_pts",
)

# gently coerce numeric columns: drop any "%" sign, and turn values that
# still do not parse as numbers into NaN instead of raising
for col in df.columns:
    if col.endswith(NUMERIC_SUFFIXES):
        df[col] = pd.to_numeric(df[col].str.replace("%", "", regex=False), errors="coerce")

df.to_csv("espn_fantasy_projections_2026.csv", index=False)
print(f"✅ Scraped {len(df)} players across {seen_pages} pages → espn_fantasy_projections_2026.csv")
# NOTE(review): the browser session is left open here; call driver.quit()
# once no further cells need it.
df.head(10)
