# NFL Player Position Prediction - Feature Engineering

This notebook processes the NFL tracking data into three normalized feature sets:
1. Player-level features (per player, per frame)
2. Team-frame features (per team, per frame)
3. Play-level features (per play summary)

In [None]:
from __future__ import annotations
import os
import sys
import logging
from pathlib import Path
import polars as pl
import pandas as pd
import numpy as np
import psutil
import gc

# Configure logging once for the whole notebook
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)

# Locate the project root: the nearest directory, starting at the current
# working directory and walking upwards, that contains a "src" folder.
proj = Path.cwd()
root = next((p for p in [proj, *proj.parents] if (p / "src").exists()), None)
if root is None:
    # Fail with an actionable message instead of a bare StopIteration
    raise RuntimeError(f"Could not locate a project root containing 'src' at or above {proj}")

# Run from the root and make it importable so `src.*` imports resolve
os.chdir(root)
if str(root) not in sys.path:
    sys.path.insert(0, str(root))
logger.info(f"Project root configured: {root}")

# Verify critical paths exist
for path in ["src", "data", "data/raw", "data/processed", "notebooks"]:
    if not (root / path).exists():
        raise RuntimeError(f"Missing required path: {root / path}")

2025-11-01 18:00:25,834 - INFO - Project root configured: e:\OneDrive\Documents\Courses\Artificial Intelligence\Project\UF_CAP4261_F25_TEAM9


## Import Feature Engineering Functions

Import the pre-implemented feature engineering functions from `src.data.features`, then define the input and output paths.

In [46]:
from src.data.features import (
    height_to_inches, age_years, bmi,
    angle_sin_cos_deg, angle_deg_to_rad, velocity_components, acceleration_components,
    normalize_rightward, FIELD_LENGTH, FIELD_WIDTH,
    add_formation_features, add_coverage_features, add_temporal_features,
)

# Input parquet file and output locations (relative to the working directory)
IN_PARQUET = Path("data", "parquet", "test_input.parquet")
OUT_DIR = Path("data", "processed")
OUT_PLAYERS, OUT_TEAMFRAME, OUT_PLAYS = (
    OUT_DIR / f"{table}_test.parquet" for table in ("players", "teamframe", "plays")
)

# Make sure the output directory exists before anything is written to it
OUT_DIR.mkdir(parents=True, exist_ok=True)

In [47]:
def check_memory_usage(threshold_gb: float = 8.0) -> float:
    """Report this process's resident memory and collect garbage when it is high.

    Args:
        threshold_gb: Resident set size (in units of 1024**3 bytes) above which
            a warning is logged and a garbage-collection pass is triggered.
            Defaults to 8, the threshold this notebook has always used.

    Returns:
        The resident set size of the current process in units of 1024**3
        bytes, measured before any garbage collection triggered by this call.
    """
    rss_bytes = psutil.Process().memory_info().rss
    memory_gb = rss_bytes / 1024 / 1024 / 1024
    if memory_gb > threshold_gb:
        logger.warning(f"High memory usage: {memory_gb:.1f}GB")
        logger.info("Triggering garbage collection...")
        gc.collect()
    return memory_gb

def safe_load_parquet(path: Path) -> pl.DataFrame:
    """Read a parquet file into a polars DataFrame, logging progress and failures.

    Args:
        path: Location of the parquet file to read.

    Returns:
        The loaded DataFrame.

    Raises:
        Exception: Whatever ``pl.read_parquet`` raised; the error is logged
            and then re-raised unchanged.
    """
    logger.info(f"Loading parquet file: {path}")
    try:
        frame = pl.read_parquet(path)
    except Exception as exc:
        logger.error(f"Error loading {path}: {exc}")
        raise
    n_rows, n_cols = frame.shape
    logger.info(f"Successfully loaded {n_rows} rows and {n_cols} columns")
    return frame

def validate_numeric_features(df: pl.DataFrame, numeric_cols: list[str]) -> None:
    """Log warnings about nulls, infinities and extreme values in numeric columns.

    Columns listed in ``numeric_cols`` that are absent from ``df`` are skipped.
    Nothing is modified or raised by the checks themselves; findings are only
    reported through the module logger.

    Args:
        df: Frame to inspect.
        numeric_cols: Names of the columns to check.
    """
    for name in numeric_cols:
        if name not in df.columns:
            continue

        column = pl.col(name)
        summary = df.select(
            mean=column.mean(),
            std=column.std(),
            min=column.min(),
            max=column.max(),
            nulls=column.null_count(),
        ).row(0, named=True)

        if summary["nulls"]:
            logger.warning(f"Column {name} has {summary['nulls']} null values")

        any_infinite = df.select(column.is_infinite().any().alias("_any")).row(0)[0]
        if any_infinite:
            logger.warning(f"Column {name} contains infinite values")

        # The outlier check only makes sense when a positive spread is available
        std = summary["std"]
        if std is None or not std > 0:
            continue

        lower = summary["mean"] - 5 * std
        upper = summary["mean"] + 5 * std
        too_low = summary["min"] is not None and summary["min"] < lower
        too_high = summary["max"] is not None and summary["max"] > upper
        if too_low or too_high:
            logger.warning(f"Column {name} has values >5 std from mean")

## Helper Functions

Define utility functions for data normalization and feature engineering.

In [52]:
def rightward_cols(df: pl.DataFrame) -> pl.DataFrame:
    """
    Add direction-normalized copies of the position and angle columns.

    For rows whose ``play_direction`` is "left" (case-insensitive), ``x`` and
    ``y`` are mirrored across the field (``FIELD_LENGTH - x``, ``FIELD_WIDTH - y``)
    and ``dir`` / ``o`` are rotated by 180 degrees, wrapped with modulo 360.
    All other rows keep their original values (angles are passed through
    without wrapping).

    Adds columns: x_norm, y_norm (cast to Float64), dir_norm, o_norm.
    """
    moving_left = pl.col("play_direction").str.to_lowercase() == "left"

    def _mirrored(source: str, extent: float, target: str) -> pl.Expr:
        # Reflect a coordinate across the field when the play runs left
        original = pl.col(source)
        return (
            pl.when(moving_left)
              .then(pl.lit(extent) - original)
              .otherwise(original)
              .cast(pl.Float64)
              .alias(target)
        )

    def _rotated(source: str, target: str) -> pl.Expr:
        # Turn an angle (degrees) by half a circle when the play runs left
        original = pl.col(source)
        return (
            pl.when(moving_left)
              .then((original + 180.0) % 360.0)
              .otherwise(original)
              .alias(target)
        )

    return df.with_columns([
        _mirrored("x", FIELD_LENGTH, "x_norm"),
        _mirrored("y", FIELD_WIDTH, "y_norm"),
        _rotated("dir", "dir_norm"),
        _rotated("o", "o_norm"),
    ])


def add_basic_player_scalars(df: pl.DataFrame) -> pl.DataFrame:
    """Add basic player features and derived motion components.

    Reads ``player_height``, ``player_birth_date``, ``player_weight``,
    ``dir_norm``, ``o_norm``, ``s`` and ``a`` and adds:

    * ``height_in``, ``player_age``, ``player_bmi`` (via the imported
      ``height_to_inches``, ``age_years`` and ``bmi`` helpers),
    * ``dir_sin``, ``dir_cos``, ``o_sin``, ``o_cos``,
    * ``vx``, ``vy``, ``ax``, ``ay`` (speed / acceleration multiplied by the
      cosine / sine of the movement direction).

    Args:
        df: Frame that already carries ``dir_norm`` and ``o_norm``
            (see ``rightward_cols``).

    Returns:
        The input frame with the columns above appended.
    """
    # Every Python UDF gets an explicit return dtype so polars does not have
    # to infer it from the first values.
    df = df.with_columns([
        pl.col("player_height").map_elements(height_to_inches, return_dtype=pl.Float64).alias("height_in"),
        pl.col("player_birth_date").map_elements(age_years, return_dtype=pl.Float64).alias("player_age"),
    ]).with_columns([
        pl.struct(["player_weight", "height_in"]).map_elements(
            lambda s: bmi(s["player_weight"], s["height_in"]),
            return_dtype=pl.Float64,
        ).alias("player_bmi"),
    ])

    # The radian columns are only intermediates for the trig features
    df = df.with_columns([
        pl.col("dir_norm").map_elements(angle_deg_to_rad, return_dtype=pl.Float64).alias("dir_rad"),
        pl.col("o_norm").map_elements(angle_deg_to_rad, return_dtype=pl.Float64).alias("o_rad"),
    ]).with_columns([
        pl.col("dir_rad").sin().alias("dir_sin"),
        pl.col("dir_rad").cos().alias("dir_cos"),
        pl.col("o_rad").sin().alias("o_sin"),
        pl.col("o_rad").cos().alias("o_cos"),
    ]).drop(["dir_rad","o_rad"])

    # NOTE(review): x components use cos and y components use sin, i.e. the
    # angle is treated as measured from the +x axis. Confirm this matches the
    # convention of the tracking data / angle_deg_to_rad.
    df = df.with_columns([
        (pl.col("s") * pl.col("dir_cos")).alias("vx"),
        (pl.col("s") * pl.col("dir_sin")).alias("vy"),
        (pl.col("a") * pl.col("dir_cos")).alias("ax"),
        (pl.col("a") * pl.col("dir_sin")).alias("ay"),
    ])
    return df

## Feature Table Generation

Create functions to build our three main feature tables:
1. Player features
2. Team-frame features
3. Play summary features

In [49]:
def build_players_table(df: pl.DataFrame) -> pl.DataFrame:
    """
    Build the per-player, per-frame feature table.
    Keys: (game_id, play_id, frame_id, nfl_id)

    The input is sorted per player by frame, direction-normalized, enriched
    with scalar and temporal features, validated, and finally reduced to the
    feature columns listed below (columns missing from the data are skipped).
    """
    logger.info("Building player features table...")

    # temporal order per player
    frame = df.sort(["game_id", "play_id", "nfl_id", "frame_id"])
    frame = rightward_cols(frame)
    logger.info("✓ Normalized coordinates")
    frame = add_basic_player_scalars(frame)
    logger.info("✓ Added basic player features")

    frame = add_temporal_features(
        frame,
        window_sizes=(3,5),
        game="game_id", play="play_id", pid="nfl_id", frame="frame_id",
        s="s", a="a", dirc="dir_norm", x="x_norm", y="y_norm",
    )
    logger.info("✓ Added temporal features")

    # Column groups, in the order they appear in the output table
    key_cols = ["game_id", "play_id", "frame_id", "nfl_id"]
    context_cols = ["player_side", "player_role", "player_position", "player_to_predict"]
    motion_cols = ["x_norm", "y_norm", "dir_norm", "o_norm", "s", "a", "vx", "vy", "ax", "ay"]
    trig_cols = ["dir_sin", "dir_cos", "o_sin", "o_cos"]
    physical_cols = ["height_in", "player_weight", "player_age", "player_bmi"]
    other_cols = ["absolute_yardline_number", "num_frames_output", "ball_land_x", "ball_land_y"]
    rolling_cols = [
        f"{signal}_rolling_{stat}_{window}"
        for window in (3, 5)
        for signal in ("speed", "accel")
        for stat in ("mean", "std")
    ]
    temporal_cols = rolling_cols + ["angular_velocity", "delta_x", "delta_y", "cumulative_distance"]

    validate_numeric_features(frame, motion_cols + physical_cols + temporal_cols)

    wanted = (
        key_cols + context_cols + motion_cols + trig_cols
        + physical_cols + other_cols + temporal_cols
    )
    available = set(frame.columns)
    result = frame.select([name for name in wanted if name in available])
    logger.info(f"✓ Player features table complete: {result.shape}")
    return result

def build_teamframe_table(df: pl.DataFrame) -> pl.DataFrame:
    """
    Build per-team, per-frame feature table.
    Keys: (game_id, play_id, frame_id, player_side)

    Coordinates are direction-normalized with ``rightward_cols``, per-player
    formation and coverage features are added by ``add_formation_features``
    and ``add_coverage_features``, and the rows are then aggregated to one
    row per key.

    Args:
        df: Tracking rows containing the key columns, ``nfl_id`` and the
            columns ``rightward_cols`` reads.

    Returns:
        One row per (game_id, play_id, frame_id, player_side), sorted by
        those keys so the output is reproducible between runs.
    """
    logger.info("Building team-frame features table...")
    df = df.sort(["game_id", "play_id", "frame_id", "player_side"])
    df = rightward_cols(df)
    logger.info("✓ Normalized coordinates")

    df_fm = add_formation_features(
        df,
        game="game_id", play="play_id", frame="frame_id", side="player_side",
        x="x_norm", y="y_norm",
    )
    logger.info("✓ Added formation features")

    df_cov = add_coverage_features(
        df_fm,
        radii=(3.0,5.0,7.0),
        game="game_id", play="play_id", frame="frame_id",
        side="player_side", pid="nfl_id",
        x="x_norm", y="y_norm",
    )
    logger.info("✓ Added coverage features")

    keys = ["game_id","play_id","frame_id","player_side"]
    result = (
        df_cov.group_by(keys)
             .agg([
                 # presumably constant within a team-frame, hence first() — verify against add_formation_features
                 pl.col("formation_width").first(),
                 pl.col("formation_depth").first(),
                 pl.col("formation_x_mean").first(),
                 pl.col("formation_y_mean").first(),
                 pl.col("distance_to_formation_center").mean().alias("team_spread_mean"),
                 pl.col("relative_formation_depth").std().alias("depth_std"),
                 pl.col("relative_formation_width").std().alias("width_std"),
                 pl.col("distance_to_nearest_teammate").mean().alias("nn_teammate_mean"),
                 pl.col("distance_to_nearest_opponent").mean().alias("nn_opponent_mean"),
                 pl.col("coverage_density").mean().alias("coverage_density_mean"),
                 pl.col("opponents_within_3yds").sum().alias("opp_within_3yds_sum"),
                 pl.col("opponents_within_5yds").sum().alias("opp_within_5yds_sum"),
                 pl.col("opponents_within_7yds").sum().alias("opp_within_7yds_sum"),
                 pl.col("teammates_within_3yds").sum().alias("tm_within_3yds_sum"),
                 pl.col("teammates_within_5yds").sum().alias("tm_within_5yds_sum"),
                 pl.col("teammates_within_7yds").sum().alias("tm_within_7yds_sum"),
             ])
             # group_by does not preserve row order by default; sort so the
             # saved table has a deterministic order
             .sort(keys)
    )

    numeric_cols = [
        "formation_width","formation_depth","formation_x_mean","formation_y_mean",
        "team_spread_mean","depth_std","width_std","nn_teammate_mean",
        "nn_opponent_mean","coverage_density_mean",
    ]
    validate_numeric_features(result, numeric_cols)
    logger.info(f"✓ Team-frame features table complete: {result.shape}")
    return result

def build_plays_table(players: pl.DataFrame, teamframe: pl.DataFrame) -> pl.DataFrame:
    """
    Build per-play summary feature table using last frame of each side.
    Keys: (game_id, play_id)

    For every play and ``player_side`` the row(s) with the highest
    ``frame_id`` are taken from both inputs. Player speed, acceleration and
    normalized position are averaged per side, formation width/depth are read
    from the team-frame table, and both are pivoted so each side becomes its
    own set of columns.

    Args:
        players: Per-player, per-frame table (see ``build_players_table``).
        teamframe: Per-team, per-frame table (see ``build_teamframe_table``).

    Returns:
        One row per (game_id, play_id) with a single pair of key columns,
        sorted by those keys. Plays present in only one of the two inputs
        are kept, with nulls for the missing side of the join.
    """
    logger.info("Building play features table...")
    play_keys = ["game_id", "play_id"]
    side_keys = play_keys + ["player_side"]

    # last frames for teamframe
    tf_last_key = (
        teamframe.group_by(side_keys).agg(pl.col("frame_id").max().alias("frame_id"))
    )
    last_tf = teamframe.join(tf_last_key, on=side_keys + ["frame_id"], how="inner")

    # last frames for players
    pl_last_key = (
        players.group_by(side_keys).agg(pl.col("frame_id").max().alias("frame_id"))
    )
    last_players = players.join(pl_last_key, on=side_keys + ["frame_id"], how="inner")

    # side-specific player summaries
    play_agg = (
        last_players.group_by(side_keys)
                    .agg([
                        pl.col("s").mean().alias("mean_speed_last"),
                        pl.col("a").mean().alias("mean_accel_last"),
                        pl.col("x_norm").mean().alias("mean_x_last"),
                        pl.col("y_norm").mean().alias("mean_y_last"),
                    ])
    )

    play_pivot = play_agg.pivot(
        values=["mean_speed_last","mean_accel_last","mean_x_last","mean_y_last"],
        index=play_keys,
        on="player_side",
        aggregate_function="first",
    )

    fm_pivot = (
        last_tf.select(side_keys + ["formation_width","formation_depth"])
               .pivot(
                   values=["formation_width","formation_depth"],
                   index=play_keys,
                   on="player_side",
                   aggregate_function="first",
               )
    )

    # "full" replaces the deprecated "outer" spelling. coalesce=True merges the
    # key columns of both sides, so the result has one non-null
    # (game_id, play_id) pair per row instead of extra *_right key columns.
    result = (
        play_pivot.join(fm_pivot, on=play_keys, how="full", coalesce=True)
                  # the group_by steps above do not guarantee row order
                  .sort(play_keys)
    )
    numeric_cols = [c for c in result.columns if c not in play_keys]
    validate_numeric_features(result, numeric_cols)
    logger.info(f"✓ Play features table complete: {result.shape}")
    return result

## Process and Save Feature Tables

Load the input data and generate all feature tables.

In [53]:
# End-to-end run: load the test input, build the three feature tables and
# write them to parquet. Any failure is logged and re-raised; memory is
# released afterwards whether or not the run succeeded.
try:
    logger.info("\nProcessing test data...")
    logger.info(f"Loading test data from {IN_PARQUET}")
    df = safe_load_parquet(IN_PARQUET)
    initial_memory = check_memory_usage()
    logger.info(f"Input test data: {df.shape}")

    logger.info("\nBuilding feature tables...")

    # safe_load_parquet returns the result of pl.read_parquet, so df is
    # always a polars DataFrame here.
    print("DEBUG: df type:", type(df))
    print("DEBUG: Polars DF shape:", df.shape)
    print("DEBUG: columns:", df.columns)

    players = build_players_table(df)
    logger.info(f"✓ Player features: {players.shape}")

    teamframe = build_teamframe_table(df)
    logger.info(f"✓ Team-frame features: {teamframe.shape}")

    plays = build_plays_table(players, teamframe)
    logger.info(f"✓ Play features: {plays.shape}")

    logger.info("\nSaving feature tables...")
    players.write_parquet(OUT_PLAYERS)
    teamframe.write_parquet(OUT_TEAMFRAME)
    plays.write_parquet(OUT_PLAYS)

    logger.info("\n✓ Feature tables saved successfully:")
    logger.info(f" - {OUT_PLAYERS}")
    logger.info(f" - {OUT_TEAMFRAME}")
    logger.info(f" - {OUT_PLAYS}")

except Exception as e:
    logger.error(f"Error during feature processing: {e}")
    raise
finally:
    logger.info("\nCleaning up memory...")
    # Drop the large frames from the notebook namespace. globals() is used
    # explicitly because mutating locals() only works at module scope.
    for name in ("df","players","teamframe","plays"):
        globals().pop(name, None)
    gc.collect()
    final_memory = check_memory_usage()
    logger.info(f"Final memory usage: {final_memory:.1f}GB")

2025-11-01 18:09:25,447 - INFO - 
Processing test data...
2025-11-01 18:09:25,449 - INFO - Loading test data from data\parquet\test_input.parquet
2025-11-01 18:09:25,450 - INFO - Loading parquet file: data\parquet\test_input.parquet
2025-11-01 18:09:25,449 - INFO - Loading test data from data\parquet\test_input.parquet
2025-11-01 18:09:25,450 - INFO - Loading parquet file: data\parquet\test_input.parquet
2025-11-01 18:09:25,472 - INFO - Successfully loaded 49753 rows and 24 columns
2025-11-01 18:09:25,473 - INFO - Input test data: (49753, 24)
2025-11-01 18:09:25,474 - INFO - 
Building feature tables...
2025-11-01 18:09:25,475 - INFO - Building player features table...
2025-11-01 18:09:25,472 - INFO - Successfully loaded 49753 rows and 24 columns
2025-11-01 18:09:25,473 - INFO - Input test data: (49753, 24)
2025-11-01 18:09:25,474 - INFO - 
Building feature tables...
2025-11-01 18:09:25,475 - INFO - Building player features table...
2025-11-01 18:09:25,509 - INFO - ✓ Normalized coordina

DEBUG: df type: <class 'polars.dataframe.frame.DataFrame'>
DEBUG: Polars DF shape: (49753, 24)
DEBUG: columns: ['game_id', 'play_id', 'player_to_predict', 'nfl_id', 'frame_id', 'play_direction', 'absolute_yardline_number', 'player_name', 'player_height', 'player_weight', 'player_birth_date', 'player_position', 'player_side', 'player_role', 'x', 'y', 's', 'a', 'dir', 'o', 'num_frames_output', 'ball_land_x', 'ball_land_y', 'week']


2025-11-01 18:09:25,753 - INFO - ✓ Added basic player features
2025-11-01 18:09:25,814 - INFO - ✓ Added temporal features
2025-11-01 18:09:25,814 - INFO - ✓ Added temporal features
2025-11-01 18:09:25,901 - INFO - ✓ Player features table complete: (49753, 42)
2025-11-01 18:09:25,903 - INFO - ✓ Player features: (49753, 42)
2025-11-01 18:09:25,904 - INFO - Building team-frame features table...
2025-11-01 18:09:25,941 - INFO - ✓ Normalized coordinates
2025-11-01 18:09:25,901 - INFO - ✓ Player features table complete: (49753, 42)
2025-11-01 18:09:25,903 - INFO - ✓ Player features: (49753, 42)
2025-11-01 18:09:25,904 - INFO - Building team-frame features table...
2025-11-01 18:09:25,941 - INFO - ✓ Normalized coordinates
2025-11-01 18:09:25,994 - INFO - ✓ Added formation features
2025-11-01 18:09:25,994 - INFO - ✓ Added formation features
2025-11-01 18:09:26,380 - INFO - ✓ Added coverage features
2025-11-01 18:09:26,380 - INFO - ✓ Added coverage features
2025-11-01 18:09:26,406 - INFO - ✓ Te

## Feature Table Summary

Let's examine the structure and content of our generated feature tables.

In [54]:
def summarize_feature_table(df: pl.DataFrame, name: str):
    """Print a short overview of a feature table.

    The overview consists of a titled header, the table's shape, one line per
    column name, and the first three rows.

    Args:
        df: Table to describe.
        name: Title shown in the header line.
    """
    overview = [
        f"\n=== {name} ===",
        f"Shape: {df.shape}",
        "\nColumns:",
        *(f"- {column}" for column in df.columns),
        "\nSample data:",
    ]
    print(*overview, sep="\n")
    print(df.head(3))

# Uncomment to view summaries after running pipeline above
# players = pl.read_parquet(OUT_PLAYERS)
# teamframe = pl.read_parquet(OUT_TEAMFRAME)
# plays = pl.read_parquet(OUT_PLAYS)
# summarize_feature_table(players,  "Player Features")
# summarize_feature_table(teamframe, "Team-Frame Features")
# summarize_feature_table(plays,     "Play Features")

### Sanity Check

In [56]:
# Locations of the three feature tables written by the pipeline
processed_dir = Path("data/processed")
players_pq = processed_dir / "players_test.parquet"
teamframe_pq = processed_dir / "teamframe_test.parquet"
plays_pq = processed_dir / "plays_test.parquet"

# 1) Files exist
for table_path in (players_pq, teamframe_pq, plays_pq):
    assert table_path.exists(), f"Missing: {table_path}"

# Re-read the tables from disk so the checks below run on what was saved
players = pl.read_parquet(players_pq)
teamframe = pl.read_parquet(teamframe_pq)
plays = pl.read_parquet(plays_pq)

def assert_pk_unique(df: pl.DataFrame, keys: list[str], name: str):
    """Assert that ``keys`` form a unique (primary) key of ``df``.

    Uniqueness is checked on the key columns themselves rather than on a
    concatenated string, so null key values and values containing a separator
    character cannot produce false results.

    Args:
        df: Table to check.
        keys: Column names that together should identify a row.
        name: Table name used in the error message.

    Raises:
        AssertionError: If any combination of key values occurs more than
            once; the message includes up to ten offending key combinations.
    """
    key_frame = df.select(keys)
    dup_cnt = int(key_frame.is_duplicated().sum())
    if dup_cnt == 0:
        return

    # show a few duplicates for debugging
    dups = (
        key_frame.group_by(keys)
                 .len()
                 .filter(pl.col("len") > 1)
                 .head(10)
    )
    raise AssertionError(
        f"{name} primary key not unique on {keys}. Found {dup_cnt} duplicate rows.\n"
        f"Examples:\n{dups}"
    )

# 2) Primary key uniqueness: each table paired with the columns that must identify a row
pk_checks = (
    (players, ["game_id","play_id","frame_id","nfl_id"], "players"),
    (teamframe, ["game_id","play_id","frame_id","player_side"], "teamframe"),
    (plays, ["game_id","play_id"], "plays"),
)
for table, key_columns, table_name in pk_checks:
    assert_pk_unique(table, key_columns, table_name)

print("✅ tables present and keys are unique")

✅ tables present and keys are unique
