Load Helper Function

In [58]:
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.common.by import By
from bs4 import BeautifulSoup
import json
import time
import pandas as pd
import re
import matplotlib.pyplot as plt
import numpy as np

In [None]:
# Setup headless browser
options = webdriver.ChromeOptions()
options.add_argument('--headless=new')
options.add_argument('--disable-gpu')
options.add_experimental_option('excludeSwitches', ['enable-logging'])

driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=options)
print('Done Opening Driver')

all_regular_season_ids = []
all_playoff_ids = []

for year in range(2016, 2024):  # 2016 to 2024 inclusive
    regular_ids = set()
    playoff_ids = set()

    # 🔁 Regular season: Weeks 1–18
    for week in range(1, 19):
        url = f"https://www.espn.com/nfl/scoreboard/_/week/{week}/year/{year}/seasontype/2"
        driver.get(url)
        time.sleep(3)

        soup = BeautifulSoup(driver.page_source, "html.parser")
        links = soup.find_all("a", href=True)

        for link in links:
            href = link['href']
            match = re.search(r"/game/_/gameId/(\d+)", href)
            if match:
                game_id = match.group(1)
                regular_ids.add(game_id)

    # 🔁 Playoffs: Weeks 1–5 (excluding 4)
    for week in [1, 2, 3, 5]:
        url = f"https://www.espn.com/nfl/scoreboard/_/week/{week}/year/{year}/seasontype/3"
        driver.get(url)
        time.sleep(3)

        soup = BeautifulSoup(driver.page_source, "html.parser")
        links = soup.find_all("a", href=True)

        for link in links:
            href = link['href']
            match = re.search(r"/game/_/gameId/(\d+)", href)
            if match:
                game_id = match.group(1)
                playoff_ids.add(game_id)

    # Store the results by year
    all_regular_season_ids.append(list(regular_ids))
    all_playoff_ids.append(list(playoff_ids))

    print(f"{year} — Regular Season: {len(regular_ids)} games, Playoffs: {len(playoff_ids)} games")

driver.quit()



Done Opening Driver
2016 — Regular Season: 273 games, Playoffs: 28 games
2017 — Regular Season: 274 games, Playoffs: 28 games
2
2


In [64]:
print(all_regular_season_ids)
print(all_playoff_ids)

[['400874653', '400874646', '400874686', '401695796', '401776937', '400874484', '401764547', '400874541', '400874429', '401695798', '400874567', '400874612', '400874584', '400874499', '400874680', '400874729', '400874717', '400874556', '400874660', '400874591', '400874576', '400874566', '400874732', '400874657', '400874579', '400874542', '400874560', '400874545', '400874671', '400874734', '400874547', '400874570', '400874564', '400874651', '400874500', '400874658', '400874507', '400874676', '400874551', '400874517', '400874677', '400874650', '400874730', '400874597', '400874502', '400874577', '400874655', '401695800', '400874675', '400874568', '400874661', '400874549', '400874736', '400874735', '400874601', '400874665', '400874659', '400874602', '400874544', '400874724', '400874727', '400874546', '400874637', '400874627', '400874487', '400874682', '401776917', '400874625', '400874552', '400874716', '400874707', '400874703', '400874428', '400874590', '400874731', '400874621', '400874710

Functions to Web Scrape

In [49]:
def getText(url):

    driver.get(url)
    time.sleep(5)  # wait for JS to load


    soup = BeautifulSoup(driver.page_source, "html.parser")
    scripts = soup.find_all("script")

    # Find raw JSON from the correct script block
    script_text = ""
    for script in scripts:
        if 'wnPrb' in script.text and 'plys' in script.text:
            script_text = script.text
            break

    if not script_text:
        raise Exception("Could not find script containing wnPrb and plys")
    
    return script_text

# Function to convert prd and clck (e.g., "15:00") to seconds elapsed
def compute_time_seconds(prd, clck):
    if clck is None or prd is None:
        return None
    try:
        minutes, seconds = map(int, clck.strip().split(":"))
        quarter_offset = (prd - 1) * 900
        clock_remaining = minutes * 60 + seconds
        time_elapsed = quarter_offset + (900 - clock_remaining)
        return time_elapsed
    except:
        return None

In [50]:
def getData(script_text):

    start = script_text.find('"wnPrb":{"pts":')

    i = start
    while i < len(script_text):
        if script_text[i] == '}':
            end = i + 1
            break
        i += 1

    # Step 3: Extract the JSON string
    wnprb_json_text = script_text[start:end]

    # Clean header off
    prefix = '"wnPrb":{"pts":{'
    wnprb_clean = wnprb_json_text[len(prefix):]

    matrix = []
    i = 0
    length = len(wnprb_clean)

    while i < length:
        # Stop if we reach the end
        if wnprb_clean[i] == '}':
            break

        # === Parse ID ===
        while wnprb_clean[i] in ['"', ' ']:
            i += 1
        id_start = i
        while wnprb_clean[i] != ':':
            i += 1
        id_str = wnprb_clean[id_start:i]
        id_num = int(id_str.strip('"'))  # 🛠️ Fix: strip trailing quote

        i += 1  # skip the colon

        # === Parse PTS ===
        while wnprb_clean[i] == ' ':
            i += 1
        pts_start = i
        while i < length and wnprb_clean[i] not in [',', '}']:
            i += 1
        pts_str = wnprb_clean[pts_start:i]
        pts_val = float(pts_str)

        # Store
        matrix.append([id_num, pts_val])

        # Skip comma if there is one
        if i < length and wnprb_clean[i] == ',':
            i += 1

    # Convert to DataFrame
    df_pts = pd.DataFrame(matrix, columns=["id", "pts"])

        # Step 1: Find start of "plys":[
    plys_start = script_text.find('"plys":[')

    if plys_start == -1:
        raise Exception("Could not find 'plys':[")

    # Step 2: Find matching closing bracket ']'
    i = plys_start
    bracket_count = 0
    found_start = False
    length = len(script_text)

    while i < length:
        char = script_text[i]

        if char == '[':
            bracket_count += 1
            found_start = True
        elif char == ']':
            bracket_count -= 1
            if found_start and bracket_count == 0:
                plys_end = i + 1  # include the closing bracket
                break
        i += 1

    # Step 3: Slice the plys string
    plys_text = script_text[plys_start:plys_end]
    

        # Clean the prefix: remove `"plys":[`
    plys_clean = plys_text[len('"plys":['):]

    # Make a fast lookup for pts using the parsed DataFrame
    id_to_pts = dict(zip(df_pts["id"], df_pts["pts"]))

    # Initialize final matrix
    final_matrix = []

    i = 0
    length = len(plys_clean)

    while i < length:
        # Stop at end of list
        if plys_clean[i] == ']':
            break

        # === Find the next play object ===
        if plys_clean[i] != '{':
            i += 1
            continue

        obj_start = i
        brace_count = 0
        while i < length:
            if plys_clean[i] == '{':
                brace_count += 1
            elif plys_clean[i] == '}':
                brace_count -= 1
                if brace_count == 0:
                    obj_end = i + 1
                    break
            i += 1

        obj_str = plys_clean[obj_start:obj_end]

        # === Extract fields manually ===
        id_match = re.search(r'"id":"?(\d+)"?', obj_str)
        clck_match = re.search(r'"clck":"([^"]+)"', obj_str)
        prd_match = re.search(r'"prd":(\d+)', obj_str)

        if id_match:
            id_val = int(id_match.group(1))
            if id_val in id_to_pts:
                clck_val = clck_match.group(1) if clck_match else None
                prd_val = int(prd_match.group(1)) if prd_match else None
                final_matrix.append([id_val, prd_val, clck_val, id_to_pts[id_val]])

    # Convert to DataFrame
    df_final = pd.DataFrame(final_matrix, columns=["id", "prd", "clck", "pts"])
    # Apply to DataFrame
    df_final["time_sec"] = df_final.apply(lambda row: compute_time_seconds(row["prd"], row["clck"]), axis=1)

    # Preview
    df_final = df_final.sort_values("time_sec").reset_index(drop=True)

    return df_final

In [None]:
def createTensor(game_ids):
    # options = webdriver.ChromeOptions()
    # options.add_experimental_option('excludeSwitches', ['enable-logging'])
    # options.add_argument('--headless=new')   # headless = "blind mode"
    # options.add_argument('--disable-gpu')    # safer rendering

    # driver = webdriver.Chrome(
    #     service=Service(ChromeDriverManager().install()),
    #     options=options
    # )

    all_data = []

    for game_id in game_ids:
        try: 
            url = f"https://www.espn.com/nfl/game/_/gameId/{game_id}"
            script_text = getText(url)
            df = getData(script_text)
            all_data.append(df[["time_sec", "pts"]],int(game_id))

        except Exception as e:
            print(f"Error with game {game_id}: {e}")

    return np.array(all_data)

    

In [None]:
createTensor(playoff_ids)
driver.quit()

Error with game 401695796: HTTPConnectionPool(host='localhost', port=62275): Max retries exceeded with url: /session/29e45d47f4040819a81b47440ce37457/url (Caused by NewConnectionError('<urllib3.connection.HTTPConnection object at 0x14332e8b0>: Failed to establish a new connection: [Errno 61] Connection refused'))
Error with game 401776937: HTTPConnectionPool(host='localhost', port=62275): Max retries exceeded with url: /session/29e45d47f4040819a81b47440ce37457/url (Caused by NewConnectionError('<urllib3.connection.HTTPConnection object at 0x1433ca0a0>: Failed to establish a new connection: [Errno 61] Connection refused'))
Error with game 401774563: HTTPConnectionPool(host='localhost', port=62275): Max retries exceeded with url: /session/29e45d47f4040819a81b47440ce37457/url (Caused by NewConnectionError('<urllib3.connection.HTTPConnection object at 0x1433caf70>: Failed to establish a new connection: [Errno 61] Connection refused'))
Error with game 401764547: HTTPConnectionPool(host='loc

array([], dtype=float64)

In [None]:
def plot_game_from_tensor(tensor, game_index):
    """
    Plot win probability over time for the nth unique game in the tensor.

    Parameters:
    - tensor: np.array with shape (N, 3) where columns are [time_sec, win_prob, game_id]
    - game_index: the index of the unique game to plot (e.g. 0, 1, ..., 155)
    """
    # Get list of unique game IDs
    unique_game_ids = np.unique(tensor[:, 2])
    
    if game_index >= len(unique_game_ids):
        print(f"Error: game_index {game_index} is out of range. Only {len(unique_game_ids)} games available.")
        return
    
    # Get the game ID corresponding to the requested index
    game_id = unique_game_ids[game_index]

    # Filter tensor to only that game's data
    game_data = tensor[tensor[:, 2] == game_id]

    # Sort by time
    game_data = game_data[game_data[:, 0].argsort()]

    # Plot
    plt.figure(figsize=(10, 5))
    plt.plot(game_data[:, 0], game_data[:, 1], marker='o', linestyle='-')
    plt.title(f"Win Probability Over Time (Game ID: {int(game_id)})")
    plt.xlabel("Time (seconds)")
    plt.ylabel("Win Probability (%)")
    plt.xlim(0, 3600)
    plt.ylim(0, 100)
    plt.grid(True)
    plt.tight_layout()
    plt.show()


In [37]:
unique_game_ids = np.unique(playoff_tensor[:, 2])
print(playoff_tensor[:, 2])

[4.01671885e+08 4.01671885e+08 4.01671885e+08 ... 4.01671882e+08
 4.01671882e+08 4.01671882e+08]


In [None]:

regular_tensor = get_game_tensor(regular_ids)
print(regular_tensor.shape)  # Should be (N, 3)


In [None]:
playoff_tensor = get_game_tensor(playoff_ids)
print(playoff_tensor.shape)  # Should be (N, 3)

Error with game 401695796: string index out of range
Error with game 401776937: string index out of range
Error with game 401774563: string index out of range
Error with game 401764547: string index out of range
Error with game 401695798: string index out of range
Error with game 401774561: string index out of range
Error with game 401695801: string index out of range
Error with game 401695797: string index out of range
Error with game 401774564: string index out of range
Error with game 401776967: string index out of range
Error with game 401695800: string index out of range
Error with game 401776962: string index out of range
Error with game 401695795: string index out of range
Error with game 401776957: string index out of range
Error with game 401776912: string index out of range
Error with game 401776917: string index out of range
Error with game 401776927: string index out of range
(2353, 3)


In [None]:
plot_game_from_tensor(playoff_tensor, 14)  # Plot the 156th unique game


Error: game_index 14 is out of range. Only 13 games available.
