In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/competitions/spl-utspan-data-challenge-2026/tutorial.ipynb
/kaggle/input/competitions/spl-utspan-data-challenge-2026/scaler_depth.pkl
/kaggle/input/competitions/spl-utspan-data-challenge-2026/scaler_angle.pkl
/kaggle/input/competitions/spl-utspan-data-challenge-2026/submission.csv
/kaggle/input/competitions/spl-utspan-data-challenge-2026/train.csv
/kaggle/input/competitions/spl-utspan-data-challenge-2026/test.csv
/kaggle/input/competitions/spl-utspan-data-challenge-2026/scaler_left_right.pkl


## Methodology and Overview
This notebook utilizes an XGBoost regression model. The objective is to predict the scaled angle, depth, and left right deviation of the shot. The full code is available on my public GitHub repository [**here**](https://github.com/adel-dot-jpg/SPLxUTSPAN-2026-Data-Challenge)

In [2]:
import pandas as pd
import numpy as np
import ast
import xgboost as xgb
from sklearn.model_selection import GroupKFold
from sklearn.metrics import mean_squared_error
from sklearn.multioutput import MultiOutputRegressor
import warnings

warnings.filterwarnings('ignore') # Clean up output

BASE_PATH = '/kaggle/input/competitions/spl-utspan-data-challenge-2026/'
TRAIN_PATH = f'{BASE_PATH}train.csv'
TEST_PATH = f'{BASE_PATH}test.csv'
SUBMISSION_PATH = f'{BASE_PATH}submission.csv'

SCALER_BOUNDS = {
    'angle': {'min': 30, 'max': 60},
    'depth': {'min': -12, 'max': 30},
    'left_right': {'min': -16, 'max': 16}
}

## Feature Extraction

The raw dataset provides spatial coordinates stored as string representations of lists. To process this, my pipeline uses the ast library (abstract syntax tree) to safely evaluate these strings into NumPy arrays. Because a basketball shot varies in duration and frame count, extracting summary statistics is simpler than feeding raw time series data. For every joint coordinate across the entire duration of the shot, we calculate the mean and standard deviation. This flattens the variable length sequences into a fixed length tabular format suitable for tree based models, capturing both the average position and the physical variability of the shooter's movement.

In [3]:
def parse_and_extract_features(path, is_train=True):
    print(f"Loading {path}")
    df = pd.read_csv(path)
    
    # Identify feature columns (everything that isn't ID or Target)
    exclude_cols = ['id', 'shot_id', 'participant_id', 'angle', 'depth', 'left_right', 'Unnamed: 0']
    feature_cols = [c for c in df.columns if c not in exclude_cols]
    
    # Arrays to hold clean data
    X_features = []
    y_targets = []
    groups = []
    ids = [] # Keep track of IDs for submission
    
    print(f"Parsing {len(df)} shots... this might take a moment.")
    
    for idx, row in df.iterrows():
        row_stats = []
        
        for col in feature_cols:
            raw_val = row[col]
            
            try:
                if isinstance(raw_val, str):
                    series = np.array(ast.literal_eval(raw_val))
                else:
                    series = np.array([float(raw_val)]) # Handle single numbers if any
            except:
                series = np.zeros(1) # Fallback
            
            if len(series) > 0:
                row_stats.append(np.mean(series))
                row_stats.append(np.std(series))
            else:
                row_stats.append(0.0)
                row_stats.append(0.0)
                
        X_features.append(row_stats)
        ids.append(row.get('id', idx))
        
        if is_train:
            y_targets.append([
                row['angle'],
                row['depth'],
                row['left_right']
            ])
            # Append participant ID for groupkfold
            groups.append(row['participant_id'])
            
    # Convert to dataframes
    X_df = pd.DataFrame(X_features, columns=[f"{c}_{stat}" for c in feature_cols for stat in ['mean', 'std']])
    
    if is_train:
        return X_df, pd.DataFrame(y_targets, columns=['angle', 'depth', 'left_right']), np.array(groups)
    else:
        return X_df, ids

# Execute loading
print("--- BEEP, PROCESSING TRAIN DATA ---")
X, y, participants = parse_and_extract_features(TRAIN_PATH, is_train=True)
print(f"Training Data Ready: {X.shape}")

print("--- BOOP, PROCESSING TEST DATA ---")
X_test, test_ids = parse_and_extract_features(TEST_PATH, is_train=False)
print(f"Test Data Ready: {X_test.shape}")

--- BEEP, PROCESSING TRAIN DATA ---
Loading /kaggle/input/competitions/spl-utspan-data-challenge-2026/train.csv
Parsing 345 shots... this might take a moment.
Training Data Ready: (345, 414)
--- BOOP, PROCESSING TEST DATA ---
Loading /kaggle/input/competitions/spl-utspan-data-challenge-2026/test.csv
Parsing 113 shots... this might take a moment.
Test Data Ready: (113, 414)


## Model Strategy and Validation

For the predictive model, I utilize XGBoost regressor wrapped in a MultiOutputRegressor to handle the three continuous target variables simultaneously. Given the small sample size of 458 shots across only 5 participants, if the model sees the same participant in both the training and validation sets, there is the risk of the model memorizing player body mechanics instead of shot physics. To prevent this, Group K Fold cross validation, grouping by participant ID is employed to validate output consistency. This checks if the model is learning generalized shooting mechanics and evaluates its performance on an entirely unseen participant during each fold.

In [4]:
# XGBoost setup
model = MultiOutputRegressor(xgb.XGBRegressor(
    n_estimators=100,
    learning_rate=0.05,
    max_depth=3, # Keep shallow to prevent overfitting
    subsample=0.8,
    colsample_bytree=0.8,
    random_state=67
))

# Validation logic
gkf = GroupKFold(n_splits=5)
scores = []

print("--- HOLDUP, STARTING CROSS-VALIDATION ---")
for fold, (train_idx, val_idx) in enumerate(gkf.split(X, y, groups=participants)):
    X_train_fold, X_val_fold = X.iloc[train_idx], X.iloc[val_idx]
    y_train_fold, y_val_fold = y.iloc[train_idx], y.iloc[val_idx]
    
    model.fit(X_train_fold, y_train_fold)
    preds = model.predict(X_val_fold)
    
    # Scale the predictions to calculate the true competition metric
    mse = mean_squared_error(y_val_fold, preds)
    scores.append(mse)
    
    # Helper to see which player we tested on
    val_p = np.unique(participants[val_idx])
    print("Fold: ", fold+1, "Player: ", val_p, "Raw MSE = ", mse)

print("Average Raw MSE: ", np.mean(scores))

# Final training on all data
print("Retraining...")
model.fit(X, y)
final_preds = model.predict(X_test)

--- HOLDUP, STARTING CROSS-VALIDATION ---
Fold:  1 Player:  [5] Raw MSE =  48.71630859375
Fold:  2 Player:  [1] Raw MSE =  28.639997482299805
Fold:  3 Player:  [3] Raw MSE =  8.465067863464355
Fold:  4 Player:  [4] Raw MSE =  52.89833450317383
Fold:  5 Player:  [2] Raw MSE =  32.639408111572266
Average Raw MSE:  34.27182331085205
Retraining...


## Final Prediction and Scaling

After validating the model architecture, it is retrained on the entire training dataset to maximize the amount of data it learns from before inferring on the test set. Finally, the raw predictions must be scaled to adhere to the competition requirements. A MinMax scaling formula is applied using the provided bounds for angle, depth, and left right deviation, clipping the final results between 0 and 1 to generate the final submission file.

In [5]:
def scale_output(values, col_name):
    """Applies the competition MinMax scaling formula."""
    mini = SCALER_BOUNDS[col_name]['min']
    maxi = SCALER_BOUNDS[col_name]['max']
    
    # Formula: (x - min) / (max - min)
    scaled = (values - mini) / (maxi - mini)
    
    # Clip to ensure we don't go outside [0, 1]
    return np.clip(scaled, 0, 1)

# Create submission DataFrame
submission = pd.DataFrame()
submission['id'] = test_ids
submission['scaled_angle'] = scale_output(final_preds[:, 0], 'angle')
submission['scaled_depth'] = scale_output(final_preds[:, 1], 'depth')
submission['scaled_left_right'] = scale_output(final_preds[:, 2], 'left_right')

# Save
submission.to_csv('submission.csv', index=False)
print("submission.csv created successfully!")
print(submission.head())

submission.csv created successfully!
                                     id  scaled_angle  scaled_depth  \
0  d5cc9ade-6bfd-42d2-8404-99d7506e535c      0.483936      0.591083   
1  6fb475ff-1732-42bc-8385-9f80956199fe      0.484560      0.523309   
2  39f95c12-deab-4d77-8a9c-feecda4d5a66      0.521334      0.538571   
3  5ec65bf7-4892-4076-a572-e01b4b8ff038      0.476318      0.522425   
4  52ffbd2a-969c-4e52-af66-c4b4be3c3cbb      0.489406      0.595713   

   scaled_left_right  
0           0.417449  
1           0.511225  
2           0.492368  
3           0.530073  
4           0.376594  
