<a href="https://colab.research.google.com/github/andreviniciusmb/nfl-big-data-bowl-2026-prediction/blob/main/Prediction.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# NFL Big Data Bowl 2026 - Prediction

This notebook predicts a player's upcoming movement in a play (future x/y positions) from the tracking frames provided as input.

Future improvements:
- Normalize coordinates relative to the football field.
- Add play-level averages of the features.
- Add a zone-coverage indicator: a constant distance from the line of scrimmage suggests zone coverage.
- Compare against a human-trajectory (geometric) baseline.

References
- [Kaggle](https://www.kaggle.com/competitions/nfl-big-data-bowl-2026-prediction/overview)

In [None]:
import os
import itertools
import numpy as np
import pandas as pd
from tqdm import tqdm
import tensorflow as tf
import matplotlib.pyplot as plt
from tensorflow.keras.callbacks import EarlyStopping
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, TimeDistributed, Dropout, Masking, Reshape

def plot(point, num_pred = 10, position = None):
    """Plot input history, ground truth and prediction for consecutive samples.

    One subplot is drawn per sample, for ``meta_array[point]`` up to
    ``meta_array[point + num_pred - 1]``. The function reads the notebook
    globals ``meta_array``, ``df_input``, ``df_output`` and ``pred``.

    Args:
        point: Index into ``meta_array`` of the first sample to draw.
        num_pred: Number of consecutive samples (one subplot each).
        position: Unused; kept so existing calls keep working.

    Returns:
        The ``df_input`` rows (sorted by ``frame_id``) of the last sample drawn.
    """
    def _rows_for(frame, sample):
        # Rows of `frame` that belong to one (game, play, player) sample.
        return frame.loc[
            (frame['game_id'] == sample['game_id']) &
            (frame['play_id'] == sample['play_id']) &
            (frame['nfl_id'] == sample['nfl_id'])]

    # squeeze=False always yields a 2-D array of axes, so num_pred == 1 works too.
    _, axs = plt.subplots(num_pred, 1, figsize=(8, num_pred*3), squeeze=False)

    for offset, ax in enumerate(axs[:, 0]):
        sample = meta_array[point + offset]

        df_input_crop = _rows_for(df_input, sample).sort_values('frame_id')
        last_frame = df_input_crop.iloc[-1]

        out_tmp = _rows_for(df_output, sample).sort_values('frame_id')
        pred_tmp = _rows_for(pred, sample)

        # Title shows the metadata of the sample drawn in THIS subplot.
        ax.set_title(f"{last_frame['player_name']} ({last_frame['player_position']})\n{sample}")
        ax.plot(df_input_crop['ball_land_x'], df_input_crop['ball_land_y'], 'y*', label='Bola')
        ax.plot(df_input_crop['x'], df_input_crop['y'], 'bo-', label='Histórico')
        ax.plot(out_tmp['x'], out_tmp['y'], 'yo-', label='Real')
        ax.plot(pred_tmp['x'], pred_tmp['y'], 'ro--', label='Previsto')
        ax.legend()
        ax.grid()

    plt.tight_layout()
    return df_input_crop


def get_outputs(df_input, df_output, meta_array):
    """Collect the ground-truth output rows for the given samples.

    Args:
        df_input: Input tracking frame; must contain ``game_id``, ``play_id``,
            ``nfl_id`` and ``player_position``.
        df_output: Output (target) frame with the same three id columns.
        meta_array: Iterable of dicts with keys ``game_id``, ``play_id`` and
            ``nfl_id``; one entry per sample, processed in order.

    Returns:
        The matching ``df_output`` rows stacked in ``meta_array`` order, with
        their original index labels, plus two extra columns:
        ``player_position`` (taken from the first matching ``df_input`` row)
        and ``row`` (0..n-1 position in the returned frame).

    Raises:
        IndexError: If a sample has no rows in ``df_input``.
    """
    groups = []

    for meta in meta_array:
        game_id = meta['game_id']
        play_id = meta['play_id']
        nfl_id  = meta['nfl_id']

        player_position = df_input.loc[
            (df_input['game_id'] == game_id) &
            (df_input['play_id'] == play_id) &
            (df_input['nfl_id'] == nfl_id), 'player_position'].iloc[0]

        group = df_output.loc[
            (df_output['game_id'] == game_id) &
            (df_output['play_id'] == play_id) &
            (df_output['nfl_id'] == nfl_id)].copy()

        group['player_position'] = player_position
        groups.append(group)

    # Concatenate once: growing a frame inside the loop copies it every iteration.
    y_train = pd.concat(groups) if groups else pd.DataFrame()

    y_train['row'] = range(len(y_train))
    return y_train

def rebuilding_predictions(df_input, y_pred, meta_array, is_test=False):
    """Turn predicted per-frame displacements back into absolute positions.

    ``y_pred[i]`` is taken to belong to ``meta_array[i]``. For each sample the
    displacements are cumulatively summed, offset by the player's last input
    position, and clipped to ``x`` in [0, 120] and ``y`` in [0, 53.3].

    Args:
        df_input: Input tracking frame with ``game_id``, ``play_id``,
            ``nfl_id``, ``frame_id``, ``x``, ``y`` and ``num_frames_output``.
        y_pred: Array-like indexed as ``y_pred[i][frame, 0|1]`` holding the
            predicted (delta_x, delta_y) per frame.
        meta_array: Sequence of dicts with keys ``game_id``, ``play_id`` and
            ``nfl_id``, aligned with ``y_pred``.
        is_test: When False, the id columns are added to the result.

    Returns:
        DataFrame with columns ``x`` and ``y`` (plus ``game_id``, ``play_id``
        and ``nfl_id`` unless ``is_test``). Each sample contributes at most
        ``num_frames_output`` rows, stacked in ``meta_array`` order. Samples
        without input rows are skipped with a printed message.
    """
    frames = []

    # enumerate keeps y_pred aligned with meta_array even when a sample is skipped.
    for idx, meta in enumerate(meta_array):
        game_id = meta['game_id']
        play_id = meta['play_id']
        nfl_id  = meta['nfl_id']

        group = df_input.loc[
            (df_input['game_id'] == game_id) &
            (df_input['play_id'] == play_id) &
            (df_input['nfl_id'] == nfl_id)]

        if group.empty:
            print(f'No input frames for {meta}; skipping')
            continue

        n_pred_max = int(group['num_frames_output'].iloc[0])
        last_frame = group.sort_values('frame_id').iloc[-1]

        steps = np.asarray(y_pred[idx])[:n_pred_max]
        data_pred = pd.DataFrame({
            'x': (np.cumsum(steps[:, 0], axis=0) + last_frame['x']).clip(0, 120),
            'y': (np.cumsum(steps[:, 1], axis=0) + last_frame['y']).clip(0, 53.3),
        })

        if not is_test:
            data_pred['game_id'] = game_id
            data_pred['play_id'] = play_id
            data_pred['nfl_id'] = nfl_id

        frames.append(data_pred)

    if not frames:
        return pd.DataFrame()
    # Single concat instead of growing the frame inside the loop.
    return pd.concat(frames, ignore_index=True)

2025-12-01 23:43:31.555110: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1764632611.789890      38 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1764632611.870715      38 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


In [None]:
pd.set_option('display.max_columns', None)

# Read all weekly input files (weeks 01-16) and concatenate them once;
# concatenating inside the loop would re-copy the accumulated frame every week.
df_input = pd.concat([
    pd.read_csv(f'/kaggle/input/nfl-big-data-bowl-2026-prediction/train/input_2023_w{week:02d}.csv')
    for week in range(1, 17)
])
print(df_input.shape)
df_input.head(2)

(4348080, 23)


(0, 23)

In [None]:
# Read all weekly output (target) files (weeks 01-16) and concatenate them once;
# concatenating inside the loop would re-copy the accumulated frame every week.
# Each file keeps its own row labels, so index labels repeat across weeks.
df_output = pd.concat([
    pd.read_csv(f'/kaggle/input/nfl-big-data-bowl-2026-prediction/train/output_2023_w{week:02d}.csv')
    for week in range(1, 17)
])
print(df_output.shape)
df_output.head(2)

In [None]:
# Competition test file; the path is a plain string (no f-string needed).
test = pd.read_csv('/kaggle/input/nfl-big-data-bowl-2026-prediction/test.csv')
print(test.shape)
test.head(2)

In [None]:
# Tracking input for the test plays; the path is a plain string (no f-string needed).
test_input = pd.read_csv('/kaggle/input/nfl-big-data-bowl-2026-prediction/test_input.csv')
print(test_input.shape)
test_input.head(2)

### Modeling

In [None]:
def preprocessing(df):
    """Derive model features for the rows flagged ``player_to_predict``.

    The caller's frame is not modified. Steps:

    1. Attach the passer's position (``qb_x``, ``qb_y``) to every row of the
       same game/play/frame and derive ``dx_qb``, ``dy_qb``, ``angle_to_qb``
       and ``angle_diff_qb`` (wrapped to [-180, 180)).
    2. Keep only rows where ``player_to_predict`` is True.
    3. Add ball-relative features, height in centimetres, velocity and
       acceleration components, role/direction flags and an integer code for
       ``player_position``.

    Args:
        df: Tracking frame with at least ``game_id``, ``play_id``,
            ``frame_id``, ``x``, ``y``, ``s``, ``a``, ``dir``, ``o``,
            ``player_role``, ``player_position``, ``player_height``
            (``"feet-inches"`` strings), ``play_direction``,
            ``player_to_predict``, ``ball_land_x`` and ``ball_land_y``.

    Returns:
        A new DataFrame holding the filtered rows with the added columns.
        ``dir`` and ``o`` are overwritten with their wrapped values; the
        original coordinates are also copied to ``old_x`` / ``old_y``.
    """
    print('Processing...')
    frame_keys = ['game_id', 'play_id', 'frame_id']

    # Passer position per frame. One row per key guarantees the left merge
    # cannot multiply player rows.
    qb = (
        df.loc[df['player_role'] == 'Passer', frame_keys + ['x', 'y']]
        .rename(columns={'x': 'qb_x', 'y': 'qb_y'})
        .drop_duplicates(subset=frame_keys)
    )
    df = df.merge(qb, on=frame_keys, how='left')

    df['dx_qb'] = df['x'] - df['qb_x']
    df['dy_qb'] = df['y'] - df['qb_y']
    # NOTE(review): arctan2 angles are counter-clockwise from +x; confirm that
    # `dir` uses the same zero reference and orientation before trusting the
    # angle differences and the components below.
    df['angle_to_qb'] = np.degrees(np.arctan2(df['dy_qb'], df['dx_qb']))
    df['angle_diff_qb'] = (df['dir'] - df['angle_to_qb'] + 180) % 360 - 180

    df_tmp = df.loc[df['player_to_predict'] == True].copy()

    df_tmp["angle_to_ball"] = np.degrees(np.arctan2(
        df_tmp["ball_land_y"] - df_tmp["y"],
        df_tmp["ball_land_x"] - df_tmp["x"]
    ))

    df_tmp["angle_diff"] = df_tmp["dir"] - df_tmp["angle_to_ball"]

    # Angle normalization to [-180, 180)
    for col in ["angle_diff", "dir", "o"]:
        df_tmp[col] = (df_tmp[col] + 180) % 360 - 180

    df_tmp['delta_ball_x'] = df_tmp['ball_land_x'] - df_tmp['x']
    df_tmp['delta_ball_y'] = df_tmp['ball_land_y'] - df_tmp['y']

    # "feet-inches" -> centimetres
    height_split = [n.split('-') for n in df_tmp['player_height']]
    df_tmp['height_cm'] = [int(n[0]) * 30.48 + int(n[1]) * 2.54 for n in height_split]

    df_tmp['old_x'] = df_tmp['x']
    df_tmp['old_y'] = df_tmp['y']

    # Speed and acceleration components along `dir`. `dir` is in degrees,
    # while np.cos / np.sin expect radians.
    dir_rad = np.radians(df_tmp['dir'])
    df_tmp['s_x'] = df_tmp['s'] * np.cos(dir_rad)
    df_tmp['s_y'] = df_tmp['s'] * np.sin(dir_rad)
    df_tmp['a_x'] = df_tmp['a'] * np.cos(dir_rad)
    df_tmp['a_y'] = df_tmp['a'] * np.sin(dir_rad)

    df_tmp['is_defensive_coverage'] = (df_tmp['player_role'] == 'Defensive Coverage').astype(int)
    df_tmp['is_targeted_receiver'] = (df_tmp['player_role'] == 'Targeted Receiver').astype(int)

    # 1 when the play goes left (the flag used to be set for 'right').
    df_tmp['is_left'] = (df_tmp['play_direction'] == 'left').astype(int)

    # NOTE(review): the encoder is fitted on every call, so the integer codes
    # are only comparable between frames that contain the same set of positions.
    le = LabelEncoder()
    df_tmp['player_position_cat'] = le.fit_transform(df_tmp['player_position'])

    return df_tmp

In [None]:
# Columns fed to the model. create_sequences selects them in this order, so
# the order here fixes the layout of the feature axis.
features = [
    'is_defensive_coverage',
    'player_position_cat',
    'is_targeted_receiver',
    'is_left',
    's',
    'a',
    'dir',
    'o',
    'num_frames_output',
    'delta_ball_x',
    'delta_ball_y',
    'a_x',
    'a_y',
    'angle_diff_qb',
    'dx_qb',
    'dy_qb',
]

df_input_processed = preprocessing(df_input)

print(f'Processed: {df_input_processed.shape}')
df_input_processed[features].head()

In [None]:
def create_sequences(df, df_out, features, sequence_length=15, pred_length=10, is_test=False):
    """Build fixed-length input tensors (and targets) per player-play.

    Rows of ``df`` are grouped by (``game_id``, ``play_id``, ``nfl_id``) and
    sorted by ``frame_id``. Per group the function adds frame-to-frame deltas
    (``delta_x``, ``delta_y``, ``ddx``, ``ddy``, ``delta_dir``,
    ``angular_vel``), keeps the last ``sequence_length`` frames and left-pads
    shorter histories with NaN. Selected columns are standardised and every
    remaining NaN is replaced by -999 (the mask value).

    Targets are per-frame displacements of the output trajectory: the first
    step is measured from the last input position, later steps from the
    previous output frame. Targets are truncated / right-padded to
    ``pred_length`` and NaN-filled positions become -999.

    Args:
        df: Preprocessed input frame containing ``features`` plus the id
            columns, ``frame_id``, ``x``, ``y`` and ``dir``.
        df_out: Output frame with the id columns, ``frame_id``, ``x``, ``y``.
            Ignored when ``is_test`` is True.
        features: Feature column names; ``delta_dir``, ``angular_vel``,
            ``delta_x`` and ``delta_y`` are appended to this list.
        sequence_length: Number of input frames per sample.
        pred_length: Number of target frames per sample.
        is_test: When True no targets are built.

    Returns:
        Tuple ``(X_array, y_array, meta)``:
        ``X_array`` has shape (n_samples, sequence_length, n_features);
        ``y_array`` has shape (n_samples, pred_length, 2), or
        (0, pred_length, 2) when ``is_test``;
        ``meta`` is a list of ``{'game_id', 'play_id', 'nfl_id'}`` dicts in
        sample order.
    """
    print('Creating sequences...')

    delta_t = 0.1       # divisor that turns the per-frame heading change into angular_vel
    mask_value = -999   # value written wherever data is missing / padded
    keys = ['game_id', 'play_id', 'nfl_id']
    features = features + ['delta_dir', 'angular_vel', 'delta_x', 'delta_y']
    n_features = len(features)

    # Index the targets once instead of scanning df_out for every group.
    out_lookup = {}
    if not is_test:
        out_lookup = {key: grp for key, grp in df_out.groupby(keys, sort=False)}

    samples_meta = []
    y = []
    X_dfs = []

    # groupby yields the groups in sorted key order; samples keep that order.
    for (game_id, play_id, nfl_id), group in tqdm(df.groupby(keys)):
        group = group.sort_values('frame_id')

        group['delta_x'] = group['x'].diff().fillna(0)
        group['delta_y'] = group['y'].diff().fillna(0)

        group[['ddx','ddy']] = group[['delta_x','delta_y']].diff().fillna(0)

        group['delta_dir'] = group['dir'].diff().fillna(0)
        group['delta_dir'] = (group['delta_dir'] + 180) % 360 - 180
        group['angular_vel'] = group['delta_dir'] / delta_t

        # Last observed position: reference point for the first target step.
        last_xy = group[['x', 'y']].iloc[-1].to_numpy(dtype=float)

        # Left-pad short histories so every sample has sequence_length rows.
        pad_length = sequence_length - len(group)
        if pad_length > 0:
            pad_df = pd.DataFrame(np.nan, index=range(pad_length), columns=group.columns)
            group = pd.concat([pad_df, group], ignore_index=True)

        X_dfs.append(group[features].iloc[-sequence_length:])
        samples_meta.append((game_id, play_id, nfl_id))

        if not is_test:
            # NaN marks "no target for this frame"; converted to mask_value below.
            target = np.full((pred_length, 2), np.nan)
            out_group = out_lookup.get((game_id, play_id, nfl_id))
            if out_group is not None:
                xy = out_group.sort_values('frame_id')[['x', 'y']].to_numpy(dtype=float)
                steps = np.diff(xy, axis=0, prepend=last_xy[np.newaxis, :])
                n_steps = min(len(steps), pred_length)
                target[:n_steps] = steps[:n_steps]
            y.append(target)

    if X_dfs:
        X_df = pd.concat(X_dfs, ignore_index=True)

        # StandardScaler standardises each column independently, so one call
        # covers all scaled columns.
        # NOTE(review): the scaler is fitted on whatever frame is passed in,
        # so separate calls (e.g. train vs. test) use different statistics.
        scale_cols = [
            col for col in [
                'dir', 'o', 'a_x', 'a_y', 'angle_diff_qb', 'angular_vel',
                'delta_ball_x', 'delta_ball_y', 'dx_qb', 'dy_qb',
                'delta_dir', 'delta_x', 'delta_y',
            ]
            if col in features
        ]
        if scale_cols:
            X_df[scale_cols] = StandardScaler().fit_transform(X_df[scale_cols])

        # Convert to tensor (n, seq, features)
        X_array = X_df[features].to_numpy(dtype=float).reshape(
            len(samples_meta), sequence_length, n_features)
        X_array = np.nan_to_num(X_array, nan=mask_value)
    else:
        X_array = np.empty((0, sequence_length, n_features))

    if is_test:
        y_array = np.empty((0, pred_length, 2))
    else:
        y_array = np.nan_to_num(
            np.array(y, dtype=float).reshape(-1, pred_length, 2), nan=mask_value)

    meta = [
        {"game_id": g, "play_id": p, "nfl_id": n}
        for (g, p, n) in samples_meta
    ]

    return X_array, y_array, meta


In [None]:
# Window sizes: input frames per sample and target frames per sample.
sequence_length = 15
pred_length = 30

X_array, y_array, meta_array = create_sequences(
    df_input_processed,
    df_output,
    features,
    sequence_length=sequence_length,
    pred_length=pred_length,
)
X_array.shape, y_array.shape

In [None]:
# Stacked-LSTM regressor: (sequence_length, n_features) -> (pred_length, 2).
# Everything comes from the `tf.keras` namespace already imported at the top,
# so layers and optimizer come from the same Keras implementation.
model = Sequential([
    tf.keras.Input(shape=(sequence_length, X_array.shape[2])),
    # Timesteps whose features all equal -999 (the padding value) are masked.
    Masking(mask_value=-999),
    # return_sequences=True on the first LSTM helps capture long patterns before summarising.
    LSTM(128, return_sequences=True),
    Dropout(0.2),
    LSTM(64, return_sequences=True),
    Dropout(0.1),
    LSTM(32),
    Dense(pred_length * 2),  # 2 values (x, y) per predicted frame
    Reshape((pred_length, 2))
])

model.compile(
    optimizer=tf.keras.optimizers.AdamW(learning_rate=1e-3, weight_decay=1e-4),
    loss=tf.keras.losses.Huber(delta=1.0)
)
model.summary()

In [None]:
# Per-frame weights: 1.0 where the target frame holds real values, 0.0 where
# both coordinates are the -999 padding value, so padding adds nothing to the loss.
has_target = (y_array != -999).any(axis=-1)
sample_weights = has_target.astype(float)

early = EarlyStopping(
    monitor='val_loss',
    patience=10,
    min_delta=0.001,
    restore_best_weights=True,
)

model.fit(
    X_array,
    y_array,
    epochs=20,
    batch_size=32,
    validation_split=0.2,
    verbose=1,
    sample_weight=sample_weights,
    callbacks=[early],
)

In [None]:
# Share of target entries that are padding (-999) rather than real displacements.
mask_ratio = (y_array == -999).mean()
print(f"{mask_ratio*100:.2f}% dos valores estão mascarados")

In [None]:
# Evaluate on a small window of samples: [START_PRED, START_PRED + NUM_PRED).
START_PRED = 0
NUM_PRED = 50
window = slice(START_PRED, START_PRED + NUM_PRED)

y_pred = model.predict(X_array[window])

# Absolute predicted positions and the matching ground truth for the same samples.
pred = rebuilding_predictions(df_input, y_pred, meta_array[window])
y_train = get_outputs(df_input, df_output, meta_array[window])
pred.shape

In [None]:
# Pair every predicted frame with its ground-truth frame by player-play and
# step number. Subtracting the raw columns would align on index labels, and
# `pred` has a fresh RangeIndex while `y_train` keeps its `df_output` labels.
eval_keys = ['game_id', 'play_id', 'nfl_id']
pred_steps = pred.assign(step=pred.groupby(eval_keys).cumcount())
true_sorted = y_train.sort_values(eval_keys + ['frame_id'])
true_steps = true_sorted.assign(step=true_sorted.groupby(eval_keys).cumcount())
eval_df = pred_steps.merge(true_steps, on=eval_keys + ['step'], suffixes=('_hat', '_true'))

mse_x = np.nanmean((eval_df['x_hat'] - eval_df['x_true'])**2)
mse_y = np.nanmean((eval_df['y_hat'] - eval_df['y_true'])**2)
rmse = np.sqrt((mse_x + mse_y) / 2)

print(f'RMSE: {rmse:.2f} (x: {mse_x:.2f}, y: {mse_y:.2f})')

In [None]:
# RMSE per player position. Frames are paired by player-play and step number
# instead of by index label: `pred` has a fresh RangeIndex while `y_train`
# keeps its `df_output` labels, so label-based alignment can pair wrong rows.
eval_keys = ['game_id', 'play_id', 'nfl_id']
pred_steps = pred.assign(step=pred.groupby(eval_keys).cumcount())
true_sorted = y_train.sort_values(eval_keys + ['frame_id'])
true_steps = true_sorted.assign(step=true_sorted.groupby(eval_keys).cumcount())
eval_df = pred_steps.merge(true_steps, on=eval_keys + ['step'], suffixes=('_hat', '_true'))

# groupby iterates the positions in sorted order.
for r, rows in eval_df.groupby('player_position'):
    mse_x = np.nanmean((rows['x_hat'] - rows['x_true'])**2)
    mse_y = np.nanmean((rows['y_hat'] - rows['y_true'])**2)
    rmse = np.sqrt((mse_x + mse_y) / 2)
    print(f'{r}: {rmse:.2f}')

In [None]:
# Draw the window of samples selected by START_PRED / NUM_PRED.
POINT = START_PRED
df_ = plot(point=POINT, num_pred=NUM_PRED)