In [1]:
def create_sequences(df, id_cols, feature_cols, target_cols, sort_col='frame_id'):
    """
    Split a long-format frame into one ordered sequence per composite ID.

    Rows are grouped by ``id_cols``. Inside every group the rows are ordered
    by ``sort_col`` and the feature / target columns are pulled out as arrays.

    Parameters:
    -----------
    df : DataFrame
        One row per (ID, time step).
    id_cols : list of str
        Column names to group by (e.g., ['nfl_id', 'play_id', 'game_id'])
    feature_cols : list of str
        Feature column names
    target_cols : list of str or str
        Target column name(s)
    sort_col : str, default 'frame_id'
        Column that defines the order of the rows inside each group.

    Returns:
    --------
    sequences_X : list of ndarray
        One array per group, shape (rows_in_group, len(feature_cols)).
    sequences_y : list of ndarray
        One array per group; 2-D when ``target_cols`` is a list, 1-D when it
        is a single column name.
    ids : list
        The groupby key of each sequence, in the same order as the sequences
        (groupby's default sorted key order).

    An empty ``df`` yields three empty lists.
    """
    sequences_X = []
    sequences_y = []
    ids = []

    for id_val, group in df.groupby(id_cols):
        # A stable sort keeps rows that share the same sort key in their
        # original relative order, so the result is deterministic.
        ordered = group.sort_values(sort_col, kind='stable')
        sequences_X.append(ordered[feature_cols].to_numpy())
        sequences_y.append(ordered[target_cols].to_numpy())
        ids.append(id_val)

    return sequences_X, sequences_y, ids

In [2]:
# Model input columns, in the order they are fed to the network.
# The three *_encoded names are not raw CSV columns: they are created by the
# LabelEncoder cell further down, which must run before these are selected.
feature_columns = (
    ['frame_id']
    + ['play_direction_encoded', 'absolute_yardline_number']
    + ['player_height_encoded', 'player_weight', 'player_role_encoded']
    # 'x' and 'y' are also the prediction targets chosen later in the notebook.
    + ['x', 'y', 's', 'a', 'dir', 'o']
    + ['num_frames_output']
)

In [3]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load
from sklearn.model_selection import train_test_split
#from sklearn.preprocessing import StandardScaler
import numpy as np # linear algebra
import pandas as pd
import tensorflow as tf
from sklearn.preprocessing import LabelEncoder, StandardScaler
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Dropout, Masking


# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found under the Kaggle input directory.
for folder, _subdirs, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(folder, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

2025-10-20 23:17:50.699770: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1761002270.960274      13 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1761002271.029747      13 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


/kaggle/input/nfl-big-data-bowl-2026-prediction/sample_submission.csv
/kaggle/input/nfl-big-data-bowl-2026-prediction/test_input.csv
/kaggle/input/nfl-big-data-bowl-2026-prediction/test.csv
/kaggle/input/nfl-big-data-bowl-2026-prediction/train/input_2023_w17.csv
/kaggle/input/nfl-big-data-bowl-2026-prediction/train/output_2023_w05.csv
/kaggle/input/nfl-big-data-bowl-2026-prediction/train/output_2023_w10.csv
/kaggle/input/nfl-big-data-bowl-2026-prediction/train/input_2023_w03.csv
/kaggle/input/nfl-big-data-bowl-2026-prediction/train/output_2023_w18.csv
/kaggle/input/nfl-big-data-bowl-2026-prediction/train/input_2023_w05.csv
/kaggle/input/nfl-big-data-bowl-2026-prediction/train/output_2023_w11.csv
/kaggle/input/nfl-big-data-bowl-2026-prediction/train/output_2023_w12.csv
/kaggle/input/nfl-big-data-bowl-2026-prediction/train/output_2023_w16.csv
/kaggle/input/nfl-big-data-bowl-2026-prediction/train/output_2023_w06.csv
/kaggle/input/nfl-big-data-bowl-2026-prediction/train/input_2023_w18.csv


In [4]:
# `plt` must be the pyplot module: the top-level `matplotlib` package does not
# expose plot()/subplots()/show(), so `import matplotlib as plt` breaks on first use.
import matplotlib.pyplot as plt

# Week-1 tracking rows (the data the model below is trained on) and the
# competition's test inputs.
week_01 = pd.read_csv('/kaggle/input/nfl-big-data-bowl-2026-prediction/train/input_2023_w01.csv')
test_input = pd.read_csv('/kaggle/input/nfl-big-data-bowl-2026-prediction/test_input.csv')

In [5]:
# Show the last five rows of the test inputs as a quick sanity check.
test_input.tail(n=5)

Unnamed: 0,game_id,play_id,player_to_predict,nfl_id,frame_id,play_direction,absolute_yardline_number,player_name,player_height,player_weight,...,player_role,x,y,s,a,dir,o,num_frames_output,ball_land_x,ball_land_y
49748,2025010515,3902,True,55112,35,right,61,Tay Martin,6-3,186,...,Targeted Receiver,77.83,16.06,7.44,3.09,92.5,90.16,30,104.5,29.01
49749,2025010515,3902,True,55112,36,right,61,Tay Martin,6-3,186,...,Targeted Receiver,78.58,16.06,7.43,3.78,88.79,79.59,30,104.5,29.01
49750,2025010515,3902,True,55112,37,right,61,Tay Martin,6-3,186,...,Targeted Receiver,79.32,16.11,7.45,4.54,83.87,69.95,30,104.5,29.01
49751,2025010515,3902,True,55112,38,right,61,Tay Martin,6-3,186,...,Targeted Receiver,80.07,16.23,7.52,4.88,79.43,57.29,30,104.5,29.01
49752,2025010515,3902,True,55112,39,right,61,Tay Martin,6-3,186,...,Targeted Receiver,80.8,16.4,7.6,5.06,74.66,49.85,30,104.5,29.01


In [6]:
# Order (not group) the week-1 rows by player, play, game and finally frame,
# so each player's frames within a play sit together in frame order.
week_01 = week_01.sort_values(
    by=['nfl_id', 'play_id', 'game_id', 'frame_id'],
)

In [7]:
#print(week_01)

In [8]:
# One LabelEncoder per categorical column, each kept in its own variable.
le_direction, le_role, le_height = LabelEncoder(), LabelEncoder(), LabelEncoder()

# Each encoder is fitted on week_01 and its integer codes are stored in a new
# *_encoded column next to the original text column.
# NOTE(review): LabelEncoder numbers classes in sorted order. If heights are
# strings such as '6-3' (as in the test_input preview), the codes follow string
# order, not physical height ('6-10' sorts before '6-2') — confirm this is acceptable.
week_01['play_direction_encoded'] = le_direction.fit_transform(
    week_01['play_direction']
)
week_01['player_role_encoded'] = le_role.fit_transform(
    week_01['player_role']
)
week_01['player_height_encoded'] = le_height.fit_transform(
    week_01['player_height']
)

In [9]:
# Feature columns selected from week_01 at this point, i.e. before the scaling
# cell below overwrites those columns.
# NOTE(review): `X` is not referenced by any cell visible here — confirm it is still needed.
X = week_01[feature_columns]

In [10]:
# Standardise every feature column and write the scaled values back into week_01.
# NOTE(review): the scaler is fitted on all of week_01, i.e. before the
# train/validation split further down, so validation rows contribute to the
# fitted statistics.
# NOTE(review): 'x' and 'y' are in feature_columns, so targets read from
# week_01 after this cell are in scaled units, not the original units.
scaler = StandardScaler()
week_01[feature_columns] = scaler.fit_transform(week_01[feature_columns])

In [11]:
target_columns = ['x', 'y']
# Targets are both coordinates, x and y (originally described as those of an aerial football).
# NOTE(review): the test_input preview lists separate ball_land_x / ball_land_y
# columns — confirm what the x / y columns actually locate.
# NOTE(review): both names are also in feature_columns, so each target is part
# of the model input at the same timestep — confirm this is intended.

In [12]:
# Build one feature sequence and one target sequence per (player, play, game);
# `ids` holds the matching (nfl_id, play_id, game_id) key of each sequence.
sequences_X, sequences_y, ids = create_sequences(
    df=week_01,
    id_cols=['nfl_id', 'play_id', 'game_id'],
    feature_cols=feature_columns,
    target_cols=target_columns,
)

In [13]:
# Right-pad every sequence up to the longest one so that features and targets
# stack into dense float32 arrays (pad_sequences fills with its default value, 0).
max_len = max(map(len, sequences_X))
X_padded = pad_sequences(
    sequences_X, maxlen=max_len, padding='post', dtype='float32'
)
y_padded = pad_sequences(
    sequences_y, maxlen=max_len, padding='post', dtype='float32'
)

In [14]:
# Hold out 20% of the sequences for validation (fixed seed for a repeatable split).
# NOTE(review): the split is over individual (nfl_id, play_id, game_id)
# sequences, so sequences from the same play can land on both sides — confirm
# this is acceptable for validation.
train_idx, val_idx = train_test_split(
    range(len(sequences_X)), test_size=0.2, random_state=42
)
X_train, y_train = X_padded[train_idx], y_padded[train_idx]
X_val, y_val = X_padded[val_idx], y_padded[val_idx]

In [15]:
# Report the training array shapes: (sequences, padded timesteps, columns).
print(
    f"X_train shape: {X_train.shape}",
    f"y_train shape: {y_train.shape}",
    sep="\n",
)

X_train shape: (8071, 74, 13)
y_train shape: (8071, 74, 2)


In [16]:
# Two stacked LSTMs that emit one 2-value prediction (one per target column) at every timestep.
# The input shape is declared with an explicit Input layer: passing
# `input_shape=` to the first layer is what triggers the Keras warning raised
# from `super().__init__(**kwargs)`.
model = Sequential([
    tf.keras.Input(shape=(max_len, len(feature_columns))),
    # Timesteps whose features are all exactly 0 (the padding value) are masked
    # for the recurrent layers that follow.
    Masking(mask_value=0.),
    LSTM(64, return_sequences=True),
    Dropout(0.3),
    LSTM(64, return_sequences=True),
    Dropout(0.3),
    Dense(2)
])

  super().__init__(**kwargs)
2025-10-20 23:18:17.971678: E external/local_xla/xla/stream_executor/cuda/cuda_driver.cc:152] failed call to cuInit: INTERNAL: CUDA error: Failed call to cuInit: UNKNOWN ERROR (303)


In [17]:
# Optimise mean squared error with Adam and log mean absolute error alongside it.
# NOTE(review): in the training log of this notebook, mae (~0.55-0.61) is far
# above sqrt(loss) (e.g. sqrt(0.02) ~ 0.14). That cannot happen when both are
# averaged over the same elements, so the loss and the metric presumably treat
# the zero-padded timesteps differently — confirm before trusting either number.
model.compile(optimizer='adam', loss='mse', metrics=['mae'])
model.summary()

In [18]:
# Train for five epochs in mini-batches of 32, scoring the held-out sequences
# after every epoch; `history` keeps the per-epoch loss/metric values.
history = model.fit(
    x=X_train,
    y=y_train,
    batch_size=32,
    epochs=5,
    validation_data=(X_val, y_val),
    verbose=1,
)

Epoch 1/5
[1m253/253[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m26s[0m 78ms/step - loss: 0.2505 - mae: 0.6090 - val_loss: 0.0335 - val_mae: 0.5736
Epoch 2/5
[1m253/253[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m19s[0m 75ms/step - loss: 0.0506 - mae: 0.5981 - val_loss: 0.0131 - val_mae: 0.5657
Epoch 3/5
[1m253/253[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m19s[0m 76ms/step - loss: 0.0325 - mae: 0.5902 - val_loss: 0.0049 - val_mae: 0.5541
Epoch 4/5
[1m253/253[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m19s[0m 75ms/step - loss: 0.0242 - mae: 0.5793 - val_loss: 0.0029 - val_mae: 0.5619
Epoch 5/5
[1m253/253[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m19s[0m 76ms/step - loss: 0.0213 - mae: 0.5783 - val_loss: 0.0020 - val_mae: 0.5537
