In [1]:
"""
Final Submission Notebook for CMI Kaggle Competition.
This notebook integrates the trained model and the finalized feature engineering
function within the required Kaggle API loop structure.
"""

import pandas as pd
import numpy as np
import data.kaggle_evaluation
import os
import pickle
import joblib
import catboost as cat

In [2]:
from src.config import PROJECT_PATH, DATA_PATH
from src.feature_engineering import create_wave1_features
# File names of the test-set inputs.
# NOTE(review): presumably joined with DATA_PATH further down; the usage is not visible in this cell.
TEST_DATA_FILE = "test.csv"
TEST_DEMOGRAPHICS_FILE = "test_demographics.csv"

In [3]:
# --- Submission configuration: edit these when the final model or feature function changes ---
# Location of the serialized CatBoost model (a relative path, so it resolves against the working directory).
MODEL_PATH = "models_rev/wave1-catboost-best.cbm"
# Name of the final feature-engineering function.
# NOTE(review): the function itself is imported directly from src.feature_engineering; confirm this string is still used.
FEATURE_FUNCTION_NAME = "create_wave1_features"

In [4]:
# --- GLOBAL MAPS: lookup tables derived from the training data ---
# Read the training sensor rows and the matching demographics table.
df_train = pd.read_csv(os.path.join(DATA_PATH, 'train.csv'))
df_train_demos = pd.read_csv(os.path.join(DATA_PATH, 'train_demographics.csv'))

# Distinct (gesture, sequence_type) pairs; the custom F1 score function relies on
# the lookups built from them.
metadata = df_train.loc[:, ['gesture', 'sequence_type']].drop_duplicates()

# gesture string -> sequence type (Target vs. Non-Target)
gesture_to_seq_type_map = dict(zip(metadata['gesture'], metadata['sequence_type']))

# integer code <-> gesture string. Codes follow the order in which gestures first
# appear in train.csv.
# NOTE(review): this ordering must match the label encoding used when the model
# was trained -- confirm against the training notebook.
inv_gesture_map = dict(enumerate(metadata['gesture'].unique()))
gesture_map = {label: code for code, label in inv_gesture_map.items()}

# Print all three maps for a visual sanity check.
print(f"Gesture Map: {gesture_map}")
print(f"\nInverted Gesture Map: {inv_gesture_map}")
print(f"\nGesture To Sequence Type: {gesture_to_seq_type_map}")

Gesture Map: {'Cheek - pinch skin': 0, 'Forehead - pull hairline': 1, 'Write name on leg': 2, 'Feel around in tray and pull out an object': 3, 'Neck - scratch': 4, 'Neck - pinch skin': 5, 'Eyelash - pull hair': 6, 'Eyebrow - pull hair': 7, 'Forehead - scratch': 8, 'Above ear - pull hair': 9, 'Wave hello': 10, 'Write name in air': 11, 'Text on phone': 12, 'Pull air toward your face': 13, 'Pinch knee/leg skin': 14, 'Scratch knee/leg skin': 15, 'Drink from bottle/cup': 16, 'Glasses on/off': 17}

Inverted Gesture Map: {0: 'Cheek - pinch skin', 1: 'Forehead - pull hairline', 2: 'Write name on leg', 3: 'Feel around in tray and pull out an object', 4: 'Neck - scratch', 5: 'Neck - pinch skin', 6: 'Eyelash - pull hair', 7: 'Eyebrow - pull hair', 8: 'Forehead - scratch', 9: 'Above ear - pull hair', 10: 'Wave hello', 11: 'Write name in air', 12: 'Text on phone', 13: 'Pull air toward your face', 14: 'Pinch knee/leg skin', 15: 'Scratch knee/leg skin', 16: 'Drink from bottle/cup', 17: 'Glasses on/of

## Load model and feature function

In [5]:
# Create an empty classifier, then restore the trained model stored at MODEL_PATH.
model = cat.CatBoostClassifier()
model.load_model(MODEL_PATH)

# Echo the stored parameters and the class labels as a sanity check.
print(f"Model parameters: {model.get_all_params()}")
print(f"\nModel classes: {model.classes_}")

Model parameters: {'nan_mode': 'Min', 'gpu_ram_part': 0.95, 'eval_metric': 'MultiClass', 'iterations': 1000, 'leaf_estimation_method': 'Newton', 'observations_to_bootstrap': 'TestOnly', 'od_pval': 0, 'random_score_type': 'NormalWithModelSizeDecrease', 'grow_policy': 'SymmetricTree', 'penalties_coefficient': 1, 'boosting_type': 'Plain', 'feature_border_type': 'GreedyLogSum', 'bayesian_matrix_reg': 0.1000000015, 'devices': '-1', 'eval_fraction': 0, 'pinned_memory_bytes': '104857600', 'force_unit_auto_pair_weights': False, 'l2_leaf_reg': 3, 'random_strength': 1, 'od_type': 'Iter', 'rsm': 1, 'boost_from_average': False, 'gpu_cat_features_storage': 'GpuRam', 'fold_size_loss_normalization': False, 'model_size_reg': 0.5, 'pool_metainfo_options': {'tags': {}}, 'use_best_model': True, 'meta_l2_frequency': 0, 'od_wait': 50, 'class_names': [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17], 'random_seed': 42, 'depth': 6, 'border_count': 128, 'min_fold_size': 100, 'data_partition': 'Do

In [6]:
def create_final_submission_features(single_sequence_df: pd.DataFrame) -> pd.DataFrame:
    """
    Build the single-row Wave 1 feature frame for one sequence supplied by the API.

    Steps:
    1. Validate the input: at least one 'tof_*' column, every required base
       column, and at least one row.
    2. Delegate the feature computation to `create_wave1_features`.
    3. Verify the result is a pandas DataFrame with exactly one row.
    4. Make sure a 'sequence_id' column is present so it can be dropped later.

    Note: handling of -1.0 values in the ToF columns is expected to happen inside
    `create_wave1_features`; it is not checked here.

    Parameters
    ----------
    single_sequence_df : pd.DataFrame
        Raw rows of one sequence (e.g. `test_sequence_df` from the API).

    Returns
    -------
    pd.DataFrame
        One row of features, including a 'sequence_id' column.

    Raises
    ------
    ValueError
        If no ToF column exists, a required base column is missing, the input
        has no rows, or the feature function does not return exactly one row.
    TypeError
        If the feature function returns something other than a DataFrame.
    Exception
        Anything raised by `create_wave1_features` is logged and re-raised unchanged.
    """
    print(f"  [Feature Eng.] Processing sequence data of shape {single_sequence_df.shape}...")

    # --- Proactive State Management: Validate Input ---
    # NOTE(review): 'phase' is required here; confirm the API-provided test data
    # actually carries this column.
    required_base_cols = ['sequence_id', 'phase', 'subject', 'acc_x', 'acc_y', 'acc_z',
                          'rot_w', 'rot_x', 'rot_y', 'rot_z', 'thm_1', 'thm_2',
                          'thm_3', 'thm_4', 'thm_5', 'sequence_counter']
    # Check for ToF columns; str() keeps the check safe for non-string column labels.
    tof_cols_exist = any(str(col).startswith('tof_') for col in single_sequence_df.columns)
    if not tof_cols_exist:
        raise ValueError("No ToF columns (starting with 'tof_') found in API-provided data.")
    missing_base_cols = [col for col in required_base_cols if col not in single_sequence_df.columns]
    if missing_base_cols:
        raise ValueError(f"Missing required base columns for Wave 1 feature engineering: {missing_base_cols}")
    # Fail fast with a clear message instead of letting an empty frame reach the
    # feature function or the `.iloc[0]` lookup below.
    if single_sequence_df.empty:
        raise ValueError("API-provided sequence data contains no rows.")

    # Call the feature engineering function from src.feature_engineering.
    # It groups by 'sequence_id' and is expected to produce one output row.
    try:
        # This function is expected to handle -1.0 in ToF internally.
        features_df = create_wave1_features(single_sequence_df)
    except Exception as e:
        print(f"  [Feature Eng.] Error in modular function: {e}")
        # Bare raise re-raises the active exception without adding a frame.
        raise

    # Post processing - ensure the output is a single-row DataFrame.
    # The type check runs before `.shape` is touched, so a non-DataFrame return
    # (e.g. None) surfaces as the TypeError below rather than an AttributeError.
    if not isinstance(features_df, pd.DataFrame):
        raise TypeError("Modular feature function must return a pandas Dataframe!")
    print(f"  [Feature Eng.] FE function returned shape: {features_df.shape}")
    if len(features_df) != 1:
        raise ValueError(f"Modular feature function should return 1 row, got {len(features_df)}.")

    # Ensure sequence_id is present (will be dropped later)
    if 'sequence_id' not in features_df.columns:
        # Take the id from the first input row (assumes the input holds one sequence).
        seq_id_from_input = single_sequence_df['sequence_id'].iloc[0]
        features_df['sequence_id'] = seq_id_from_input
        print(f"  [Feature Eng.] Added missing 'sequence_id' column with value {seq_id_from_input}.")

    print(f"  [Feature Eng.] Final features ready. Shape: {features_df.shape}")
    return features_df

## Submission loop