# Merge SportVU and PBP Data for LSTM Shot Prediction
This notebook downloads SportVU player-tracking data, merges it with NBA Play-by-Play (PBP) shot events fetched from the stats.nba.com API, and writes train/validation/test CSV files for LSTM shot prediction.

## Steps:
1. Install dependencies.
2. Import libraries and define constants.
3. Download the zipped SportVU data.
4. Unzip the data.
5. Define utility functions for processing.
6. Merge and process the data.
7. Run the pipeline.

In [1]:
# Cell 1: Install Dependencies
!pip install py7zr
!apt-get install -y p7zip-full  # Install 7z command-line tool

Collecting py7zr
  Downloading py7zr-0.22.0-py3-none-any.whl.metadata (16 kB)
Collecting texttable (from py7zr)
  Downloading texttable-1.7.0-py2.py3-none-any.whl.metadata (9.8 kB)
Collecting pycryptodomex>=3.16.0 (from py7zr)
  Downloading pycryptodomex-3.22.0-cp37-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (3.4 kB)
Collecting pyzstd>=0.15.9 (from py7zr)
  Downloading pyzstd-0.16.2-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (2.4 kB)
Collecting pyppmd<1.2.0,>=1.1.0 (from py7zr)
  Downloading pyppmd-1.1.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (5.5 kB)
Collecting pybcj<1.1.0,>=1.0.0 (from py7zr)
  Downloading pybcj-1.0.6-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (3.7 kB)
Collecting multivolumefile>=0.2.3 (from py7zr)
  Downloading multivolumefile-0.2.3-py3-none-any.whl.metadata (6.3 kB)
Collecting inflate64<1.1.0,>=1.0.0 (from py7zr)
  Downloading inflate64-1.0.1-cp311-cp311-manylinux_2_17_

In [2]:
# Cell 2: Import Libraries and Define Constants
import json
import pandas as pd
import numpy as np
import requests
from sklearn.model_selection import train_test_split
import gc
import random
import zipfile
import os
import subprocess  # Added for running 7z command
import shutil      # Added for directory cleanup

# Request headers sent with every stats.nba.com call (used by load_pbp_shots).
# NOTE(review): presumably the API only answers browser-like requests -- confirm.
HEADERS = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36",
    "Referer": "https://stats.nba.com/",
    "Accept": "application/json",
    "Accept-Language": "en-US,en;q=0.9",
}

# Configuration
INPUT_DIR = "/content/sportvu_data/"  # Where the downloaded zip is unpacked
# Folder holding the per-game .7z archives; derived from INPUT_DIR so the two
# paths cannot drift apart.
INPUT_FILE_DIR = os.path.join(INPUT_DIR, "2016.NBA.Raw.SportVU.Game.Logs")
OUTPUT_DIR = "/content/merged_data/"  # Receives the CSV splits and the quality report
TEMP_DIR = "/content/temp_extract/"  # Scratch directory for per-archive extraction
NUM_MATCHES = 200  # Number of matches to sample
MAX_SHOTS = 7000  # No further games are started once this many shots are collected
WINDOW = 1.5  # Time window in seconds before each shot (1.5 s = 38 frames at 25 Hz)
# Note: For Task 3 (optimal trajectories), increase WINDOW to 2.0 seconds (~50 frames)

In [None]:
# Cell 3: Download Data
!gdown --id 1HFm6KKRVD5SGZZ3YzlkdVBPnU0C2ogE5 --output /content/sportvu_all.zip
print("Data downloaded successfully.")

Downloading...
From (original): https://drive.google.com/uc?id=1HFm6KKRVD5SGZZ3YzlkdVBPnU0C2ogE5
From (redirected): https://drive.google.com/uc?id=1HFm6KKRVD5SGZZ3YzlkdVBPnU0C2ogE5&confirm=t&uuid=8089e2f0-54ce-4501-b2f3-464d16f897be
To: /content/sportvu_all.zip
 15% 570M/3.79G [00:08<00:47, 67.3MB/s]

In [None]:
# Cell 4: Unzip Data
# The extraction target must exist before the archive is unpacked into it.
os.makedirs(INPUT_DIR, exist_ok=True)

# Unpack every member of the downloaded archive under INPUT_DIR.
with zipfile.ZipFile("/content/sportvu_all.zip", mode='r') as sportvu_archive:
    sportvu_archive.extractall(path=INPUT_DIR)

# Prepare the folder that later cells write their results to.
os.makedirs(OUTPUT_DIR, exist_ok=True)

print("Data unzipped successfully.")

In [None]:
# Cell 5: Define Utility Functions

def extract_gameid(sportvu_data):
    """Return the ``"gameid"`` value of a parsed SportVU JSON document.

    Args:
        sportvu_data: Parsed SportVU JSON; expected to be dict-like (has ``get``).

    Returns:
        The value stored under ``"gameid"``, or ``None`` when the key is absent
        or ``sportvu_data`` is not dict-like (for example ``None`` or a list).
    """
    try:
        return sportvu_data.get("gameid", None)
    except (AttributeError, TypeError):
        # Deliberately narrow: a bare ``except`` would also swallow
        # KeyboardInterrupt and SystemExit.
        return None

def parse_clock(clock_str):
    """Convert a play-by-play clock string to seconds.

    Two layouts are understood:
      * duration-style strings such as ``"PT12M00.00S"`` or ``"PT5M30.5S"``
      * ``"MM:SS"`` strings such as ``"11:30"``

    Args:
        clock_str: The clock value as text.

    Returns:
        ``minutes * 60 + seconds`` as a float, or ``None`` when the value is
        not a string or cannot be parsed.
    """
    try:
        if "PT" in clock_str:
            # Split on the unit letters instead of fixed character offsets so
            # that minutes without zero padding ("PT5M30.5S") parse correctly.
            duration = clock_str[clock_str.index("PT") + 2:]
            minutes_part, _, seconds_part = duration.partition("M")
            minutes = int(minutes_part)
            seconds = float(seconds_part.rstrip("S"))
        else:
            minutes, seconds = map(float, clock_str.split(":"))
        return minutes * 60 + seconds
    except (AttributeError, TypeError, ValueError):
        # Non-string input (None, NaN) or malformed text.
        return None


# Progress marker: reaching this line means the definitions above in this cell ran without raising.
print("Utility functions defined successfully.")

In [None]:
def extract_sequence(moments, target_gameclock, period, window=WINDOW, game_stats=None):
    """Extract the run of tracking frames that leads up to a shot.

    Frames are selected by game clock, which counts down: everything from
    ``target_gameclock + window`` (earliest) down to ``target_gameclock - 0.08``
    (latest) is kept, ordered from earliest to latest. The shot is rejected
    (``None`` sequences) when the moments are missing, when any game-clock
    value is non-numeric, when the window contains repeated game-clock values,
    or when fewer than ``int(window * 25) + 1`` frames remain.

    Args:
        moments: List of SportVU moments, each a 6-item list laid out as
            ``[period, timestamp, gameclock, shotclock, unknown, positions]``.
            pandas raises if the rows cannot be mapped onto these six columns.
        target_gameclock: Game clock of the shot in seconds.
        period: Period of the shot. Accepted for interface compatibility;
            it is not used to filter the moments.
        window: Length of the look-back window in seconds.
        game_stats: Optional dict of running counters; updated in place and
            also returned. A new dict is created when omitted.

    Returns:
        tuple: ``(ball_seq, players_seq, game_stats)``. ``ball_seq`` holds one
        ``positions[0][2:]`` slice per frame and ``players_seq`` holds
        ``positions[1:]`` per frame; both are ``None`` when the shot is rejected.
    """
    frame_rate_hz = 25   # sampling rate assumed throughout this notebook
    buffer = 0.08        # 0.08 seconds (2 frames at 25 Hz)

    if game_stats is None:
        game_stats = {}

    def _bump(key, amount=1):
        # Increment a numeric counter, creating it on first use.
        game_stats[key] = game_stats.get(key, 0) + amount

    def _record(key, value):
        # Append to a list-valued statistic, creating it on first use.
        game_stats.setdefault(key, []).append(value)

    def _reject(reason):
        # Count the rejection reason and hand back the "no sequence" result.
        _bump(reason)
        return None, None, game_stats

    _bump("total_shots")

    if not moments or not isinstance(moments, list):
        return _reject("no_moments")

    moments_df = pd.DataFrame(
        moments,
        columns=["period", "timestamp", "gameclock", "shotclock", "unknown", "positions"],
    )
    if moments_df.empty:
        return _reject("empty_df")

    # Coerce the game clock to numbers; anything unparsable becomes NaN and is dropped.
    initial_count = len(moments_df)
    moments_df["gameclock"] = pd.to_numeric(moments_df["gameclock"], errors="coerce")
    moments_df = moments_df.dropna(subset=["gameclock"])
    dropped_count = initial_count - len(moments_df)

    # Check the "nothing left" case first: previously the partial-NaN check
    # returned before this one, so all_nan_gameclock was never incremented.
    if moments_df.empty:
        _bump("nan_gameclock_rows", dropped_count)
        return _reject("all_nan_gameclock")

    # Any NaN game clock disqualifies the whole shot (reported as a dropped shot).
    if dropped_count > 0:
        _bump("nan_gameclock_rows", dropped_count)
        return _reject("nan_gameclock")

    # The game clock decreases, so the window runs from the larger value
    # (start) down to just past the shot's own clock value (end).
    end_gameclock = target_gameclock - buffer
    start_gameclock = end_gameclock + window + buffer

    in_window = (moments_df["gameclock"] <= start_gameclock) & (
        moments_df["gameclock"] >= end_gameclock
    )
    # Descending game clock == chronological order, latest frame last.
    sequence_df = moments_df[in_window].sort_values("gameclock", ascending=False)

    if sequence_df.empty:
        return _reject("no_moments_in_window")

    orig_len_sequence = len(sequence_df)
    _record("orig_frame_counts", orig_len_sequence)

    # Repeated game-clock values inside the window disqualify the shot.
    # NOTE(review): the duplicates are counted and the shot is discarded rather
    # than continuing with the de-duplicated frames -- confirm this is intended.
    sequence_df = sequence_df.drop_duplicates(subset=["gameclock"])
    dropped_seq = orig_len_sequence - len(sequence_df)
    if dropped_seq > 0:
        _bump("duplicate_frames_dropped", dropped_seq)
        return _reject("duplicate_frames")

    # Require a full window of frames, e.g. 38 frames for 1.5 seconds at 25 fps.
    expected_frames = int(window * frame_rate_hz) + 1
    if len(sequence_df) < expected_frames:
        _record("insufficient_length_counts", len(sequence_df))
        return _reject("insufficient_length")

    # Keep only the frames closest to the shot when the window holds extra ones.
    sequence_df = sequence_df.tail(expected_frames)
    _bump("used_shots")
    _record("final_frame_counts", len(sequence_df))

    positions = sequence_df["positions"].tolist()
    ball_seq = [frame[0][2:] for frame in positions]   # [x, y, z]
    players_seq = [frame[1:] for frame in positions]   # [[teamid, playerid, x, y, z], ...]
    return ball_seq, players_seq, game_stats

In [None]:
def load_pbp_shots(gameid, timeout=30):
    """Fetch play-by-play data for one game and return its field-goal attempts.

    Calls the ``playbyplayv3`` endpoint of stats.nba.com, keeps the actions
    whose ``isFieldGoal`` equals 1 and reshapes them for merging with SportVU
    events.

    Args:
        gameid: Game identifier sent as the ``GameID`` query parameter.
        timeout: Seconds to wait for the HTTP request before giving up.
            Without a timeout a stalled connection would block the whole
            pipeline indefinitely.

    Returns:
        A DataFrame with the columns ``gameid``, ``actionId``, ``period``,
        ``gameclock`` (clock parsed to seconds), ``personId``, ``shotResult``
        (``"Made Shot"`` / ``"Missed Shot"``) and ``shotDistance``, plus any of
        ``xLegacy``, ``yLegacy``, ``shotValue``, ``teamId`` present in the
        response. ``None`` when the request fails, the payload lacks the
        expected structure or columns, or the game has no shot events.
    """
    url = "https://stats.nba.com/stats/playbyplayv3"
    params = {
        "GameID": gameid,
        "StartPeriod": 0,
        "EndPeriod": 14
    }

    try:
        response = requests.get(url, params=params, headers=HEADERS, timeout=timeout)
        response.raise_for_status()
        pbp_data = response.json()

        # Check if the expected structure exists
        if not pbp_data or "game" not in pbp_data or "actions" not in pbp_data["game"]:
            print(f"No PBP data returned for gameid: {gameid}")
            return None

        actions = pbp_data["game"]["actions"]
        if not actions:
            print(f"No actions found in PBP data for gameid: {gameid}")
            return None

        df_pbp = pd.DataFrame(actions)

        # Columns the rest of the pipeline cannot work without, and extras
        # that are passed through when the API provides them.
        required_cols = ["actionNumber", "period", "clock", "isFieldGoal", "shotResult", "personId", "shotDistance"]
        optional_cols = ["xLegacy", "yLegacy", "shotValue", "teamId"]
        missing_required_cols = [col for col in required_cols if col not in df_pbp.columns]
        if missing_required_cols:
            print(f"Missing required columns in PBP data for gameid {gameid}: {missing_required_cols}")
            return None

        # Filter for shots (isFieldGoal = 1)
        df_shots = df_pbp[df_pbp["isFieldGoal"] == 1].copy()
        if df_shots.empty:
            print(f"No shot events found in PBP data for gameid: {gameid}")
            return None

        # Warn about unexpected results; they are still kept and end up
        # labelled "Missed Shot" by the mapping below.
        invalid_shots = df_shots[~df_shots["shotResult"].isin(["Made", "Missed"])]
        if not invalid_shots.empty:
            print(f"Warning: Invalid shot results in PBP data for gameid {gameid}: {invalid_shots['shotResult'].unique()}")

        # Map columns to the names the merge step expects.
        df_shots["gameid"] = pbp_data["game"]["gameId"]  # Use gameId from the response
        df_shots["actionId"] = df_shots["actionNumber"]
        df_shots["clock_seconds"] = df_shots["clock"].apply(parse_clock)
        # Shots whose clock could not be parsed cannot be aligned with tracking data.
        df_shots = df_shots.dropna(subset=["clock_seconds"])
        df_shots["gameclock"] = df_shots["clock_seconds"]
        df_shots["shotResult"] = df_shots["shotResult"].apply(lambda x: "Made Shot" if x == "Made" else "Missed Shot")

        # Required output columns plus whichever optional ones are available.
        return_cols = ["gameid", "actionId", "period", "gameclock", "personId", "shotResult", "shotDistance"]
        return_cols.extend(col for col in optional_cols if col in df_pbp.columns)

        return df_shots[return_cols]
    except Exception as e:
        # Best-effort by design: one failing game must not abort the whole run.
        print(f"Error fetching PBP data for gameid {gameid}: {str(e)}")
        return None

In [None]:
def load_sportvu_event(archive_path):
    """Load one game's SportVU data from a .7z archive, one row per event.

    The archive is extracted with the ``7z`` command-line tool into a
    sub-directory of ``TEMP_DIR`` (removed again before returning), the first
    ``.json`` file found is parsed, and every event with a usable ``moments``
    list and a numeric ``eventId`` becomes one row.

    Args:
        archive_path: Path to a ``.7z`` archive.

    Returns:
        tuple: ``(sportvu_df, gameid)`` where ``sportvu_df`` has the columns
        ``eventId`` (int), ``gameid`` and ``moments`` (the raw list), or
        ``(None, None)`` when the file is missing, not a ``.7z`` file, smaller
        than 1024 bytes, cannot be extracted or parsed, or has no valid events.
    """
    # Validate file existence and type
    if not os.path.exists(archive_path):
        print(f"Error: {archive_path} does not exist.")
        return None, None
    if not os.path.isfile(archive_path):
        print(f"Error: {archive_path} is not a file.")
        return None, None
    if not archive_path.endswith('.7z'):
        print(f"Error: {archive_path} does not end with .7z.")
        return None, None

    # Check file size
    file_size = os.path.getsize(archive_path)
    if file_size < 1024:  # Arbitrary threshold for a valid .7z file
        print(f"Error: {archive_path} is too small ({file_size} bytes), likely corrupted.")
        return None, None

    # Per-archive scratch directory. splitext removes only the trailing
    # extension, whereas str.replace would strip ".7z" anywhere in the name.
    archive_stem = os.path.splitext(os.path.basename(archive_path))[0]
    temp_extract_dir = os.path.join(TEMP_DIR, archive_stem)
    os.makedirs(temp_extract_dir, exist_ok=True)

    try:
        # Arguments are passed as a list (no shell), so the path needs no quoting.
        result = subprocess.run(
            ['7z', 'x', archive_path, f'-o{temp_extract_dir}', '-y'],
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
            text=True
        )
        if result.returncode != 0:
            print(f"Error extracting {archive_path}: {result.stderr}")
            return None, None

        # Use the first JSON file found anywhere in the extracted tree.
        json_file = next(
            (
                os.path.join(root, file)
                for root, _, files in os.walk(temp_extract_dir)
                for file in files
                if file.endswith('.json')
            ),
            None,
        )
        if not json_file:
            print(f"No JSON file found in {archive_path}")
            return None, None

        with open(json_file, 'r') as f:
            data = json.load(f)

        gameid = extract_gameid(data)
        if not gameid:
            print(f"No gameid found in {archive_path}")
            return None, None

        event_data = []
        bad_id_count = 0
        for event in data["events"]:
            moments = event.get("moments")
            # Skip events with no moments or invalid moments
            if not moments or not isinstance(moments, list) or not all(isinstance(m, list) for m in moments):
                continue

            # pd.to_numeric on a scalar may return a plain Python number (no
            # .astype) or NaN (which casts to a meaningless integer), so
            # convert explicitly and skip ids that are not numeric.
            try:
                numeric_id = pd.to_numeric(event.get("eventId"), errors='coerce')
                if pd.isna(numeric_id):
                    raise ValueError("non-numeric eventId")
                event_id = int(numeric_id)
            except (TypeError, ValueError):
                bad_id_count += 1
                continue

            event_data.append({
                "eventId": event_id,
                "gameid": gameid,
                "moments": moments  # Store the entire moments list
            })

        if bad_id_count:
            print(f"Skipped {bad_id_count} events with a missing or non-numeric eventId in {archive_path}")

        if not event_data:
            print(f"No valid events found in {archive_path}")
            return None, None

        sportvu_df = pd.DataFrame(event_data)
        print(f"Successfully parsed SportVU data from {archive_path}")
        return sportvu_df, gameid

    except Exception as e:
        # Best-effort by design: a broken archive must not abort the whole run.
        print(f"Error processing {archive_path}: {str(e)}")
        return None, None

    finally:
        # Clean up the temporary directory
        if os.path.exists(temp_extract_dir):
            shutil.rmtree(temp_extract_dir)

In [None]:
def _write_quality_report(report_path, games_stats, shots_processed):
    """Write the per-game and summary data-quality report to ``report_path``."""
    with open(report_path, 'w') as f:
        f.write("NBA SportVU Data Quality Report\n")
        f.write("==============================\n\n")
        f.write(f"Total games processed: {len(games_stats)}\n")
        f.write(f"Total shots processed: {shots_processed}\n\n")
        f.write("Per-Game Statistics:\n")
        f.write("-------------------\n\n")

        for gameid, stats in games_stats.items():
            f.write(f"GameID: {gameid} (File: {stats['archive_file']})\n")
            f.write(f"  SportVU events: {stats['events_count']}\n")
            f.write(f"  PBP shots: {stats.get('pbp_shots_count', 0)}\n")
            f.write(f"  Matched shots: {stats.get('matched_shots', 0)}\n")
            f.write(f"  Total shots examined: {stats.get('total_shots', 0)}\n")
            f.write(f"  Used shots: {stats.get('used_shots', 0)} (final: {stats.get('final_shots_used', 0)})\n")
            f.write("  Dropped shots breakdown:\n")
            f.write(f"    - No moments data: {stats.get('no_moments', 0)}\n")
            f.write(f"    - Empty DataFrame: {stats.get('empty_df', 0)}\n")
            f.write(f"    - NaN gameclock: {stats.get('nan_gameclock', 0)} (rows: {stats.get('nan_gameclock_rows', 0)})\n")
            f.write(f"    - All NaN gameclock: {stats.get('all_nan_gameclock', 0)}\n")
            f.write(f"    - No moments in window: {stats.get('no_moments_in_window', 0)}\n")
            f.write(f"    - Duplicate frames: {stats.get('duplicate_frames', 0)} (frames: {stats.get('duplicate_frames_dropped', 0)})\n")
            f.write(f"    - Insufficient sequence length: {stats.get('insufficient_length', 0)}\n")

            # Averages are only written when the underlying list is non-empty.
            for key, label in [
                ("orig_frame_counts", "Average original frames per shot"),
                ("final_frame_counts", "Average final frames per shot"),
                ("insufficient_length_counts", "Average frames in insufficient sequences"),
            ]:
                counts = stats.get(key)
                if counts:
                    f.write(f"  {label}: {sum(counts) / len(counts):.2f}\n")

            f.write("\n")

        # Add summary statistics
        f.write("\nSummary Statistics:\n")
        f.write("------------------\n")
        total_events = sum(stats['events_count'] for stats in games_stats.values())
        total_pbp_shots = sum(stats.get('pbp_shots_count', 0) for stats in games_stats.values())
        total_matched = sum(stats.get('matched_shots', 0) for stats in games_stats.values())
        total_examined = sum(stats.get('total_shots', 0) for stats in games_stats.values())
        total_used = sum(stats.get('final_shots_used', 0) for stats in games_stats.values())

        f.write(f"Total SportVU events: {total_events}\n")
        f.write(f"Total PBP shots: {total_pbp_shots}\n")
        f.write(f"Total matched shots: {total_matched}\n")
        f.write(f"Total shots examined: {total_examined}\n")
        f.write(f"Total shots used in final dataset: {total_used}\n")
        # Guard the ratio: when no shot was examined (e.g. every PBP request
        # failed) the division used to raise ZeroDivisionError.
        utilization = (total_used / total_examined * 100) if total_examined else 0.0
        f.write(f"Overall data utilization rate: {utilization:.2f}%\n")


def merge_sportvu_pbp(window=WINDOW, seed=None):
    """Merge SportVU data with PBP data and extract sequences for shot events.

    Samples up to ``NUM_MATCHES`` archives from ``INPUT_FILE_DIR``, loads each
    game's tracking events and play-by-play shots, joins them on
    ``gameid`` + ``actionId``/``eventId`` and extracts one frame sequence per
    shot. No further games are started once ``MAX_SHOTS`` shots have been
    collected (the cap is checked between games, so the final count can
    exceed it). Writes ``data_quality_report.txt`` and stratified
    70/15/15 ``train.csv`` / ``val.csv`` / ``test.csv`` files to ``OUTPUT_DIR``.

    Args:
        window: Look-back window in seconds passed to ``extract_sequence``.
        seed: Optional seed for the game sampling. ``None`` (the default)
            keeps the previous behaviour of using the global ``random`` state.

    Returns:
        The merged DataFrame (with a binary ``target`` column, 1 = made shot),
        or ``None`` when no archives are found or nothing could be merged.
    """
    os.makedirs(OUTPUT_DIR, exist_ok=True)

    # Create a fresh temporary directory
    if os.path.exists(TEMP_DIR):
        shutil.rmtree(TEMP_DIR)
    os.makedirs(TEMP_DIR, exist_ok=True)

    # Sorted so that a given seed always selects the same games, regardless of
    # the order in which the file system lists the directory.
    archive_files = sorted(f for f in os.listdir(INPUT_FILE_DIR) if f.endswith(".7z"))
    if not archive_files:
        print("No .7z files found in the input directory.")
        return None

    sampler = random.Random(seed) if seed is not None else random
    sampled_files = sampler.sample(archive_files, min(NUM_MATCHES, len(archive_files)))
    print(f"Processing {len(sampled_files)} matches out of {len(archive_files)}")

    all_merged_data = []
    shots_processed = 0

    # Dictionary to store statistics for each game
    games_stats = {}

    # Report file path
    report_path = os.path.join(OUTPUT_DIR, "data_quality_report.txt")

    for archive_file in sampled_files:
        if shots_processed >= MAX_SHOTS:
            break

        archive_path = os.path.join(INPUT_FILE_DIR, archive_file)
        sportvu_df, gameid = load_sportvu_event(archive_path)
        if sportvu_df is None or gameid is None:
            continue

        # Initialize game stats dictionary
        games_stats[gameid] = {
            "archive_file": archive_file,
            "events_count": len(sportvu_df),
            "total_shots": 0,
            "used_shots": 0,
            "nan_gameclock": 0,
            "nan_gameclock_rows": 0,
            "all_nan_gameclock": 0,
            "no_moments": 0,
            "empty_df": 0,
            "no_moments_in_window": 0,
            "duplicate_frames": 0,
            "duplicate_frames_dropped": 0,
            "insufficient_length": 0,
            "orig_frame_counts": [],
            "final_frame_counts": [],
            "insufficient_length_counts": []
        }

        pbp_shots = load_pbp_shots(gameid)
        if pbp_shots is None:
            games_stats[gameid]["pbp_shots_count"] = 0
            continue

        games_stats[gameid]["pbp_shots_count"] = len(pbp_shots)

        # Merge on actionId and eventId
        merged = pd.merge(
            pbp_shots,
            sportvu_df,
            left_on=["gameid", "actionId"],
            right_on=["gameid", "eventId"],
            how="inner"
        )

        if merged.empty:
            print(f"No matching shots found for gameid: {gameid}")
            games_stats[gameid]["matched_shots"] = 0
            continue

        games_stats[gameid]["matched_shots"] = len(merged)

        # Extract sequences for each shot
        merged_data = []
        for _, row in merged.iterrows():
            ball_seq, players_seq, game_stats = extract_sequence(
                row["moments"], row["gameclock"], row["period"], window, games_stats[gameid]
            )
            games_stats[gameid] = game_stats  # Update with latest stats

            if ball_seq is None or players_seq is None:
                continue

            shot_data = row.drop(["eventId", "moments"]).to_dict()
            shot_data["ball_seq"] = ball_seq
            shot_data["players_seq"] = players_seq
            merged_data.append(shot_data)

        if not merged_data:
            print(f"No sequences extracted for gameid: {gameid}")
            continue

        game_df = pd.DataFrame(merged_data)
        all_merged_data.append(game_df)
        shots_processed += len(game_df)
        print(f"Processed {len(game_df)} shots for gameid: {gameid}. Total shots: {shots_processed}")

        # Update game stats with final count of shots actually used
        games_stats[gameid]["final_shots_used"] = len(game_df)

    # Write stats to report file
    _write_quality_report(report_path, games_stats, shots_processed)
    print(f"Data quality report written to {report_path}")

    if not all_merged_data:
        print("No data merged after processing all matches.")
        return None

    final_df = pd.concat(all_merged_data, ignore_index=True)
    final_df["target"] = (final_df["shotResult"] == "Made Shot").astype(int)

    # Whole-dataset class counts. Kept under their own names because the
    # per-split loop below uses separate variables; the report previously
    # printed the test split's counts as if they were the full dataset's.
    total_made = (final_df["target"] == 1).sum()
    total_missed = (final_df["target"] == 0).sum()
    print(f"Before splitting - Made shots: {total_made}, Missed shots: {total_missed}, Proportion Made: {total_made / (total_made + total_missed):.2f}")

    # Stratified split to ensure balanced Made/Missed shots
    train_df, temp_df = train_test_split(final_df, test_size=0.3, stratify=final_df["target"], random_state=42)
    val_df, test_df = train_test_split(temp_df, test_size=0.5, stratify=temp_df["target"], random_state=42)

    # Log the distribution in each split
    for split_name, split_df in [("Training", train_df), ("Validation", val_df), ("Test", test_df)]:
        split_made = (split_df["target"] == 1).sum()
        split_missed = (split_df["target"] == 0).sum()
        print(f"{split_name} set - Made shots: {split_made}, Missed shots: {split_missed}, Proportion Made: {split_made / (split_made + split_missed):.2f}")

    train_df.to_csv(os.path.join(OUTPUT_DIR, "train.csv"), index=False)
    val_df.to_csv(os.path.join(OUTPUT_DIR, "val.csv"), index=False)
    test_df.to_csv(os.path.join(OUTPUT_DIR, "test.csv"), index=False)

    # Add dataset statistics to report
    with open(report_path, 'a') as f:
        f.write("\nFinal Dataset Statistics:\n")
        f.write("------------------------\n")
        f.write(f"Total dataset size: {len(final_df)}\n")
        f.write(f"Made shots: {total_made} ({total_made / len(final_df) * 100:.2f}%)\n")
        f.write(f"Missed shots: {total_missed} ({total_missed / len(final_df) * 100:.2f}%)\n\n")

        f.write("Dataset Splits:\n")
        f.write(f"  Training set: {len(train_df)} ({len(train_df) / len(final_df) * 100:.2f}%)\n")
        f.write(f"  Validation set: {len(val_df)} ({len(val_df) / len(final_df) * 100:.2f}%)\n")
        f.write(f"  Test set: {len(test_df)} ({len(test_df) / len(final_df) * 100:.2f}%)\n")

    return final_df

In [None]:
# Cell 7: Run the Pipeline
# merge_sportvu_pbp returns None when no .7z archives are found or when nothing
# could be merged, so merged_df may be None even though the message below prints.
merged_df = merge_sportvu_pbp(window=WINDOW)
print("Pipeline completed.")