# Creating Final Aggregated Dataset
## Process used for weeks 1 through 9

In [1]:
!pip install google-cloud-bigquery google-cloud-storage



In [2]:
from google.colab import auth
auth.authenticate_user()



In [3]:
from google.cloud import bigquery
import pandas as pd

# Initialize BigQuery client
project_id = "bigdatabowl2025"
client = bigquery.Client(project=project_id, location='us-central1')

In [4]:
# List of table names and corresponding DataFrame names
tables = {
    "players": "players_df",
    "games" : "games_df",
    "player_play" : "player_play_df",
    "plays" : "plays_df",
    "tracking_week_1": "tracking"  #  Will use combined_tracking later
}

# Dataset details
dataset_id = "bdb_tables"

# Query each table and assign to DataFrame variables
for table_name, df_name in tables.items():
    query = f"SELECT * FROM `{client.project}.{dataset_id}.{table_name}`"
    print(f"Querying table: {table_name}")
    globals()[df_name] = client.query(query).to_dataframe()

    print(f"Loaded {len(globals()[df_name])} rows into DataFrame: {df_name}")

Querying table: players
Loaded 1697 rows into DataFrame: players_df
Querying table: games
Loaded 136 rows into DataFrame: games_df
Querying table: player_play
Loaded 354727 rows into DataFrame: player_play_df
Querying table: plays
Loaded 16124 rows into DataFrame: plays_df
Querying table: tracking_week_1
Loaded 7104700 rows into DataFrame: tracking


### Importing Data

In [5]:
!pip install bayesian-optimization

Collecting bayesian-optimization
  Downloading bayesian_optimization-2.0.0-py3-none-any.whl.metadata (8.9 kB)
Collecting colorama<0.5.0,>=0.4.6 (from bayesian-optimization)
  Downloading colorama-0.4.6-py2.py3-none-any.whl.metadata (17 kB)
Downloading bayesian_optimization-2.0.0-py3-none-any.whl (30 kB)
Downloading colorama-0.4.6-py2.py3-none-any.whl (25 kB)
Installing collected packages: colorama, bayesian-optimization
Successfully installed bayesian-optimization-2.0.0 colorama-0.4.6


In [6]:
import numpy as np
import pandas as pd
import os
import matplotlib.pyplot as plt
import matplotlib.image as mpimg
import plotly.graph_objects as go
from sklearn.model_selection import train_test_split, KFold, cross_val_score
from sklearn.metrics import accuracy_score, roc_auc_score, confusion_matrix, roc_curve
from sklearn.preprocessing import LabelEncoder, StandardScaler
from bayes_opt import BayesianOptimization
import xgboost as xgb
import seaborn as sns
pd.set_option('display.max_columns', None)


In [7]:
# Select only necessary columns from each DataFrame for this analysis
plays_df = plays_df[['gameId', 'playId', 'rushLocationType', 'quarter', 'down', 'yardsToGo', 'gameClock',
       'possessionTeam', 'preSnapHomeScore', 'preSnapVisitorScore', 'absoluteYardlineNumber',
       'preSnapHomeTeamWinProbability', 'preSnapVisitorTeamWinProbability',
       'expectedPoints', 'offenseFormation', 'receiverAlignment', 'pff_passCoverage', 'pff_manZone']]
players_df = players_df[['nflId', 'position']]
games_df = games_df[['gameId', 'gameDate', 'gameTimeEastern', 'homeTeamAbbr']]
player_play_df = player_play_df[['gameId', 'playId', 'nflId', 'inMotionAtBallSnap', 'shiftSinceLineset', 'motionSinceLineset']]
tracking_df = tracking[['gameId', 'playId', 'nflId', 'displayName', 'x', 'y', 'event']]
# merged_df = players_df.merge(player_play_df, on='nflId', how='left')
# base_df = plays_df.merge(merged_df, on=['gameId', 'playId'], how='left')
# print(base_df.head())

In [8]:
# Check for and handle invalid values in nflId
tracking_df['nflId'] = pd.to_numeric(tracking_df['nflId'], errors='coerce')  # Convert non-numeric to NaN
tracking_df['nflId'] = tracking_df['nflId'].fillna(-1).astype('int32')       # Fill NaN with -1 and convert to int32

# Adjust data types for memory efficiency using .loc[] to avoid SettingWithCopyWarning
games_df.loc[:, 'gameId'] = games_df['gameId'].astype('int32')
plays_df.loc[:, 'gameId'] = plays_df['gameId'].astype('int32')
plays_df.loc[:, 'playId'] = plays_df['playId'].astype('int32')
player_play_df.loc[:, 'gameId'] = player_play_df['gameId'].astype('int32')
player_play_df.loc[:, 'playId'] = player_play_df['playId'].astype('int32')
tracking_df.loc[:, 'gameId'] = tracking_df['gameId'].astype('int32')
tracking_df.loc[:, 'playId'] = tracking_df['playId'].astype('int32')
players_df.loc[:, 'nflId'] = players_df['nflId'].astype('int32')
player_play_df.loc[:, 'nflId'] = player_play_df['nflId'].astype('int32')


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  tracking_df['nflId'] = pd.to_numeric(tracking_df['nflId'], errors='coerce')  # Convert non-numeric to NaN
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  tracking_df['nflId'] = tracking_df['nflId'].fillna(-1).astype('int32')       # Fill NaN with -1 and convert to int32


In [9]:
# Join the DataFrames

# Join plays and tracking on 'gameId' and 'playId'
join_all = pd.merge(plays_df, tracking_df, how="inner", on=["gameId", "playId"])

# Join the resulting DataFrame with players on 'nflId'
join_all = join_all.merge(players_df, how="left", on="nflId")

join_all = join_all.merge(player_play_df, how="left", on=["gameId", "playId", "nflId"])

join_all = join_all.merge(games_df, how="left", on='gameId')

# Display information on the joined DataFrame
print("Final joined DataFrame shape:", join_all.shape)
print(join_all.head())

Final joined DataFrame shape: (7104700, 30)
       gameId  playId rushLocationType  quarter  down  yardsToGo gameClock  \
0  2022090800    1406     INSIDE_RIGHT        2     1          7     03:42   
1  2022090800    1406     INSIDE_RIGHT        2     1          7     03:42   
2  2022090800    1406     INSIDE_RIGHT        2     1          7     03:42   
3  2022090800    1406     INSIDE_RIGHT        2     1          7     03:42   
4  2022090800    1406     INSIDE_RIGHT        2     1          7     03:42   

  possessionTeam  preSnapHomeScore  preSnapVisitorScore  \
0             LA                 0                   10   
1             LA                 0                   10   
2             LA                 0                   10   
3             LA                 0                   10   
4             LA                 0                   10   

   absoluteYardlineNumber  preSnapHomeTeamWinProbability  \
0                      17                       0.263124   
1           

In [10]:
pred_df = join_all
print(pred_df['playId'].nunique())
print(pred_df['gameId'].nunique())
print(len(pred_df))

1574
16
7104700


In [11]:
# Fill 'position' with 'FBall' when 'displayName_x' is 'football'
pred_df.loc[pred_df['displayName'] == 'football', 'position'] = 'FBall'

# Filter pred_df for specific events and positions
filtered_pred_df = pred_df[(pred_df['event'] == 'ball_snap')]  # testing something here
filtered_pred_df = filtered_pred_df[filtered_pred_df['position'].isin(['TE', 'WR', 'FB', 'RB', 'QB', 'FBall'])]

# Create new feature identifying if the home team has possession
filtered_pred_df['homeTeamPossession'] = (filtered_pred_df['possessionTeam'] == filtered_pred_df['homeTeamAbbr']).astype(int)

# Drop uneeded columns
filtered_pred_df = filtered_pred_df.drop(columns=['displayName', 'possessionTeam', 'homeTeamAbbr'])

# Display the first few rows of the filtered DataFrame
print(filtered_pred_df.head())

         gameId  playId rushLocationType  quarter  down  yardsToGo gameClock  \
128  2022090800    1406     INSIDE_RIGHT        2     1          7     03:42   
340  2022090800    1406     INSIDE_RIGHT        2     1          7     03:42   
360  2022090800    1406     INSIDE_RIGHT        2     1          7     03:42   
364  2022090800    1406     INSIDE_RIGHT        2     1          7     03:42   
556  2022090800    1406     INSIDE_RIGHT        2     1          7     03:42   

     preSnapHomeScore  preSnapVisitorScore  absoluteYardlineNumber  \
128                 0                   10                      17   
340                 0                   10                      17   
360                 0                   10                      17   
364                 0                   10                      17   
556                 0                   10                      17   

     preSnapHomeTeamWinProbability  preSnapVisitorTeamWinProbability  \
128                       

In [12]:
cols = ['event', 'rushLocationType', 'position', 'receiverAlignment', 'offenseFormation', 'inMotionAtBallSnap', 'shiftSinceLineset', 'motionSinceLineset']

for i in cols:
    print(filtered_pred_df[i].value_counts())

event
ball_snap    13601
Name: count, dtype: int64
rushLocationType
NA               8117
INSIDE_LEFT      1495
INSIDE_RIGHT     1481
OUTSIDE_RIGHT    1274
OUTSIDE_LEFT     1094
UNKNOWN           140
Name: count, dtype: int64
position
WR       4998
TE       2500
RB       1989
QB       1963
FBall    1948
FB        203
Name: count, dtype: int64
receiverAlignment
2x2    5817
3x1    4942
2x1    1453
3x2     896
NA      196
4x1     147
1x1      96
2x0      36
1x0      18
Name: count, dtype: int64
offenseFormation
SHOTGUN       7636
SINGLEBACK    3197
EMPTY         1036
I_FORM         926
PISTOL         553
NA             196
JUMBO           57
Name: count, dtype: int64
inMotionAtBallSnap
FALSE    10688
TRUE       661
NA         304
Name: count, dtype: int64
shiftSinceLineset
FALSE    11496
TRUE       157
Name: count, dtype: int64
motionSinceLineset
FALSE    9915
TRUE     1424
NA        314
Name: count, dtype: int64


In [13]:
import numpy as np

# Create the target column
filtered_pred_df['target_rush'] = np.where(filtered_pred_df['rushLocationType'].notna(), 1, 0)

# Convert string booleans and other invalid values to integers robustly
columns_to_convert = ['inMotionAtBallSnap', 'shiftSinceLineset', 'motionSinceLineset']
for col in columns_to_convert:
    filtered_pred_df[col] = (
        pd.to_numeric(filtered_pred_df[col], errors='coerce')  # Convert invalid entries to NaN
        .fillna(0)                                            # Replace NaN with 0
        .astype(int)                                          # Convert to integer
    )

if filtered_pred_df is None:
    raise ValueError("filtered_pred_df is None. Check previous operations.")

# Step 1: Create new columns for 'inMotionAtBallSnap'
positions = ['RB', 'FB', 'WR', 'TE']
for pos in positions:
    col_name = f"{pos.lower()}_snap_motion"  # Column name for inMotionAtBallSnap
    filtered_pred_df[col_name] = (filtered_pred_df['position'] == pos).astype(int) * filtered_pred_df['inMotionAtBallSnap']

# Step 2: Create new columns for 'shiftSinceLineset' (including 'QB')
positions_with_qb = ['RB', 'FB', 'WR', 'TE', 'QB']
for pos in positions_with_qb:
    col_name = f"{pos.lower()}_shift"  # Column name for shiftSinceLineset
    filtered_pred_df[col_name] = (filtered_pred_df['position'] == pos).astype(int) * filtered_pred_df['shiftSinceLineset']

# Step 3: Create new columns for 'motionSinceLineset'
for pos in positions:
    col_name = f"{pos.lower()}_motion"  # Column name for motionSinceLineset
    filtered_pred_df[col_name] = (filtered_pred_df['position'] == pos).astype(int) * filtered_pred_df['motionSinceLineset']

# Verify the result
print(filtered_pred_df[[
    'position',
    'rb_snap_motion', 'fb_snap_motion', 'wr_snap_motion', 'te_snap_motion',
    'rb_shift', 'fb_shift', 'wr_shift', 'te_shift', 'qb_shift',
    'rb_motion', 'fb_motion', 'wr_motion', 'te_motion'
]].head())

    position  rb_snap_motion  fb_snap_motion  wr_snap_motion  te_snap_motion  \
128       RB               0               0               0               0   
340       WR               0               0               0               0   
360       QB               0               0               0               0   
364       WR               0               0               0               0   
556       TE               0               0               0               0   

     rb_shift  fb_shift  wr_shift  te_shift  qb_shift  rb_motion  fb_motion  \
128         0         0         0         0         0          0          0   
340         0         0         0         0         0          0          0   
360         0         0         0         0         0          0          0   
364         0         0         0         0         0          0          0   
556         0         0         0         0         0          0          0   

     wr_motion  te_motion  
128          0  

In [14]:
import numpy as np

# Step 2: Filter for rows where the event is 'ball_snap' (only keep plays where the ball was snapped)
ball_snap_df = filtered_pred_df[filtered_pred_df['event'] == 'ball_snap']

# Step 3: Drop the 'event' column now that we have filtered for 'ball_snap'
ball_snap_df = ball_snap_df.drop(columns=['event'])

# Step 4: Identify FBall x-coordinates for each playId (filtered to ball_snap plays)
fb_positions = ball_snap_df[ball_snap_df['position'] == 'FBall'][['gameId', 'playId', 'x']].rename(columns={'x': 'x_fball'})
ball_snap_df = ball_snap_df.merge(fb_positions, on=['gameId', 'playId'], how='left')

# Step 5: Calculate absolute x distances for QB, RB, and FB relative to FBall's x-coordinate
ball_snap_df['qb_depth'] = np.where(ball_snap_df['position'] == 'QB', abs(ball_snap_df['x'] - ball_snap_df['x_fball']), np.nan)
ball_snap_df['rb_depth'] = np.where(ball_snap_df['position'] == 'RB', abs(ball_snap_df['x'] - ball_snap_df['x_fball']), np.nan)
ball_snap_df['fb_depth'] = np.where(ball_snap_df['position'] == 'FB', abs(ball_snap_df['x'] - ball_snap_df['x_fball']), np.nan)

# Step 6: Group by 'gameId' and 'playId' to aggregate, keeping max and first values as specified
columns_to_aggregate_max = [
    'rb_snap_motion', 'fb_snap_motion', 'wr_snap_motion', 'te_snap_motion',
    'rb_shift', 'fb_shift', 'wr_shift', 'te_shift', 'qb_shift',
    'rb_motion', 'fb_motion', 'wr_motion', 'te_motion',
    'qb_depth', 'rb_depth', 'fb_depth', 'target_rush'
]

columns_to_aggregate_first = [
    'quarter', 'down', 'yardsToGo', 'gameClock', 'preSnapHomeScore', 'preSnapVisitorScore',
    'absoluteYardlineNumber', 'preSnapHomeTeamWinProbability',
    'preSnapVisitorTeamWinProbability', 'expectedPoints', 'offenseFormation',
    'receiverAlignment', 'pff_passCoverage', 'pff_manZone'
]

# Aggregate the data
aggregated_df_final = ball_snap_df.groupby(['gameId', 'playId'], as_index=False).agg(
    {**{col: 'max' for col in columns_to_aggregate_max},
     **{col: 'first' for col in columns_to_aggregate_first}}
)

# Verify the results
print(aggregated_df_final.head())
print("Number of unique playId combinations with ball snap:", aggregated_df_final[['gameId', 'playId']].drop_duplicates().shape[0])
print(len(aggregated_df_final))
print(aggregated_df_final['target_rush'].value_counts())

       gameId  playId  rb_snap_motion  fb_snap_motion  wr_snap_motion  \
0  2022090800      56               0               0               0   
1  2022090800      80               0               0               0   
2  2022090800     101               0               0               0   
3  2022090800     122               0               0               0   
4  2022090800     167               0               0               0   

   te_snap_motion  rb_shift  fb_shift  wr_shift  te_shift  qb_shift  \
0               0         0         0         0         0         0   
1               0         0         0         0         0         0   
2               0         0         0         0         0         0   
3               0         0         0         0         0         0   
4               0         0         0         0         0         0   

   rb_motion  fb_motion  wr_motion  te_motion  qb_depth  rb_depth  fb_depth  \
0          0          0          0          0  4.929996

In [15]:
# Fill missing depth values with zero without using inplace
aggregated_df_final['fb_depth'] = aggregated_df_final['fb_depth'].fillna(0)
aggregated_df_final['rb_depth'] = aggregated_df_final['rb_depth'].fillna(0)
aggregated_df_final['qb_depth'] = aggregated_df_final['qb_depth'].fillna(0)

# Add binary indicators for FB and RB presence
aggregated_df_final['fb_present'] = (aggregated_df_final['fb_depth'] > 0).astype(int)
aggregated_df_final['rb_present'] = (aggregated_df_final['rb_depth'] > 0).astype(int)

print(aggregated_df_final.head())

       gameId  playId  rb_snap_motion  fb_snap_motion  wr_snap_motion  \
0  2022090800      56               0               0               0   
1  2022090800      80               0               0               0   
2  2022090800     101               0               0               0   
3  2022090800     122               0               0               0   
4  2022090800     167               0               0               0   

   te_snap_motion  rb_shift  fb_shift  wr_shift  te_shift  qb_shift  \
0               0         0         0         0         0         0   
1               0         0         0         0         0         0   
2               0         0         0         0         0         0   
3               0         0         0         0         0         0   
4               0         0         0         0         0         0   

   rb_motion  fb_motion  wr_motion  te_motion  qb_depth  rb_depth  fb_depth  \
0          0          0          0          0  4.929996

In [16]:
# Check for duplicate gameId and playId pairs
duplicate_pairs = aggregated_df_final[aggregated_df_final.duplicated(subset=['gameId', 'playId'], keep=False)]

# Count duplicates
print(f"Number of duplicate records: {len(duplicate_pairs)}")

# Display the duplicate pairs
print(duplicate_pairs[['gameId', 'playId']].value_counts())

Number of duplicate records: 0
Series([], Name: count, dtype: int64)


## Modeling Preprocessing

In [17]:
!pip install xgboost scikit-learn bayesian-optimization



In [18]:
from sklearn.preprocessing import LabelEncoder

# Encode categorical features
le_offenseFormation = LabelEncoder()
le_receiverAlignment = LabelEncoder()
le_pff_passCoverage = LabelEncoder()
le_pff_manZone = LabelEncoder()

# Apply label encoding to the relevant columns
aggregated_df_final['offenseFormation'] = le_offenseFormation.fit_transform(aggregated_df_final['offenseFormation'])
aggregated_df_final['receiverAlignment'] = le_receiverAlignment.fit_transform(aggregated_df_final['receiverAlignment'])
aggregated_df_final['pff_passCoverage'] = le_pff_passCoverage.fit_transform(aggregated_df_final['pff_passCoverage'])
aggregated_df_final['pff_manZone'] = le_pff_manZone.fit_transform(aggregated_df_final['pff_manZone'])

X_encoded = pd.get_dummies(aggregated_df_final, columns=['offenseFormation', 'receiverAlignment', 'pff_passCoverage', 'pff_manZone'], drop_first=True)

# Feature and target separation
X = aggregated_df_final.drop(columns=['target_rush', 'gameId', 'playId'])
y = aggregated_df_final['target_rush'].astype(int)

# Verify the encoded DataFrame
print(aggregated_df_final[['offenseFormation', 'receiverAlignment', 'pff_passCoverage', 'pff_manZone']].head())

   offenseFormation  receiverAlignment  pff_passCoverage  pff_manZone
0                 5                  4                 2            3
1                 0                  6                 2            3
2                 1                  3                11            3
3                 5                  3                 6            3
4                 0                  6                 6            3


In [19]:
import pandas as pd
from sklearn.preprocessing import StandardScaler

# Identify continuous columns to scale
columns_to_scale = [
    'qb_depth', 'rb_depth', 'fb_depth',
    'preSnapHomeTeamWinProbability', 'preSnapVisitorTeamWinProbability',
    'expectedPoints', 'absoluteYardlineNumber',
    'preSnapHomeScore', 'preSnapVisitorScore'
]

# Initialize scaler
scaler = StandardScaler()

# Scale the selected continuous columns
aggregated_df_final[columns_to_scale] = scaler.fit_transform(aggregated_df_final[columns_to_scale])


In [20]:
from google.cloud import bigquery
from google.oauth2 import service_account
import pandas_gbq

# Define project ID, dataset, and table
project_id = 'bigdatabowl2025'  # Replace with your project ID
dataset_id = 'bdb_tables'  # Replace with your dataset ID
table_id = f"{dataset_id}.aggregated_final_data"  # Full table path

# Save DataFrame to BigQuery
pandas_gbq.to_gbq(
    aggregated_df_final,
    table_id,
    project_id=project_id,
    if_exists='replace'  # Options: 'fail', 'replace', 'append'
)




100%|██████████| 1/1 [00:00<00:00, 7169.75it/s]


In [22]:
# Save the DataFrame as a CSV file in Colab's file system
aggregated_df_final.to_csv("aggregated_df_1.csv", index=False)  # This was performed for weeks 1 through 9
print("File saved as 'aggregated_df_1.csv'")

from google.colab import files

# Download the file to your local computer
files.download("aggregated_df_1.csv")

File saved as 'aggregated_df_1.csv'


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>