In [None]:
"""
Final Submission Notebook for CMI Kaggle Competition.
This notebook integrates the trained model and the finalized feature engineering
function within the required Kaggle API loop structure.
"""

import pandas as pd
import numpy as np
from  kaggle_evaluation.cmi_gateway import CMIGateway
import os, sys
import pickle
import joblib
import catboost as cat

In [None]:
from src.config import PROJECT_PATH#, DATA_PATH
from src.feature_engineering import create_wave1_features

In [None]:
# Directory holding the test files, plus the file names expected inside it.
DATA_PATH = './data/'
TEST_DATA_FILE = 'test.csv'
TEST_DEMOGRAPHICS_FILE = 'test_demographics.csv'

# Fail fast: stop at the first required file that is absent from DATA_PATH.
required_files = [TEST_DATA_FILE, TEST_DEMOGRAPHICS_FILE]
for file in required_files:
    candidate_path = os.path.join(DATA_PATH, file)
    if os.path.exists(candidate_path):
        continue
    raise FileNotFoundError(f"Required test file not found: {candidate_path}")

In [None]:
# --- Artifact locations: update these to point at the final model and feature function ---
# Relative path of the serialized CatBoost model file; it is handed to
# `model.load_model(...)` in the model-loading cell.
MODEL_PATH = 'models_rev/wave1-catboost-best.cbm'
# Name of the final feature-engineering function. Informational only in the
# visible cells: nothing looks the function up by this string.
FEATURE_FUNCTION_NAME = 'create_wave1_features' # Name of the final function

In [None]:
# GLOBAL MAPS
# The training tables are read here so the label vocabularies can be rebuilt
# at inference time.
df_train = pd.read_csv(os.path.join(DATA_PATH, 'train.csv'))
df_train_demos = pd.read_csv(os.path.join(DATA_PATH, 'train_demographics.csv'))

# --- Helper mappings for the evaluation metric (custom F1 score) ---
# One row per distinct (gesture, sequence_type) pair.
metadata = df_train.loc[:, ['gesture', 'sequence_type']].drop_duplicates()

# gesture string -> sequence type (Target vs. Non-Target)
gesture_to_seq_type_map = metadata.set_index('gesture')['sequence_type'].to_dict()

# gesture string <-> integer code, numbered in order of first appearance
gesture_labels = metadata['gesture'].unique()
gesture_map = dict(zip(gesture_labels, range(len(gesture_labels))))
inv_gesture_map = dict(enumerate(gesture_labels))

# Print the three maps for a visual sanity check.
print(f"Gesture Map: {gesture_map}")
print(f"\nInverted Gesture Map: {inv_gesture_map}")
print(f"\nGesture To Sequence Type: {gesture_to_seq_type_map}")

## Load model and feature function

In [None]:
# Create an empty CatBoost classifier and populate it from the file at MODEL_PATH.
model = cat.CatBoostClassifier()
model.load_model(MODEL_PATH);
# Print the loaded model's parameters and class labels so they can be checked
# by eye against the gesture maps built earlier in the notebook.
print(f"Model parameters: {model.get_all_params()}")
print(f"\nModel classes: {model.classes_}")

In [None]:
def create_final_submission_features(single_sequence_df: pd.DataFrame, feature_func=None) -> pd.DataFrame:
    """
    Build the single-row feature frame for one sequence using the Wave 1 logic.

    Steps:
    1. Validate that the input is non-empty and carries the columns that are
       available at inference time.
    2. Delegate to the modular feature function (it groups by 'sequence_id',
       so a frame holding one sequence is expected to yield one row).
    3. Check that exactly one row came back and make sure a 'sequence_id'
       column is present so that later code can drop it.

    Handling of -1.0 values in ToF columns is expected to happen inside the
    feature function; nothing is done about them here.

    Parameters
    ----------
    single_sequence_df : pd.DataFrame
        Sensor rows for one sequence (e.g. the frame served by the API).
    feature_func : callable, optional
        Maps the sequence frame to a one-row feature DataFrame. Defaults to
        `create_wave1_features`.

    Returns
    -------
    pd.DataFrame
        One row of features including a 'sequence_id' column.

    Raises
    ------
    ValueError
        If the input is empty, has no 'tof_' columns, lacks a required base
        column, or the feature function does not return exactly one row.
    TypeError
        If the feature function does not return a DataFrame.
    """
    if feature_func is None:
        # Resolved lazily so a caller-supplied function never touches the default.
        feature_func = create_wave1_features

    print(f"  [Feature Eng.] Processing sequence data of shape {single_sequence_df.shape}...")

    # --- Proactive State Management: Validate Input ---
    if single_sequence_df.empty:
        raise ValueError("Received an empty sequence DataFrame; cannot engineer features.")

    # 'phase' is deliberately NOT required: the submission loop treats it as a
    # train-only column, so demanding it would reject the data served by the API.
    required_base_cols = ['sequence_id', 'subject', 'acc_x', 'acc_y', 'acc_z',
                          'rot_w', 'rot_x', 'rot_y', 'rot_z', 'thm_1', 'thm_2',
                          'thm_3', 'thm_4', 'thm_5', 'sequence_counter']

    # At least one ToF column must be present.
    if not any(col.startswith('tof_') for col in single_sequence_df.columns):
        raise ValueError("No ToF columns (starting with 'tof_') found in API-provided data.")

    missing_base_cols = [col for col in required_base_cols if col not in single_sequence_df.columns]
    if missing_base_cols:
        raise ValueError(f"Missing required base columns for Wave 1 feature engineering: {missing_base_cols}")

    # Call the modular feature engineering function.
    # It groups by 'sequence_id' and should produce one output row.
    try:
        features_df = feature_func(single_sequence_df)
    except Exception as e:
        print(f"  [Feature Eng.] Error in modular function: {e}")
        raise  # bare raise keeps the original traceback intact

    # Post processing - ensure the output is a single-row DataFrame.
    if not isinstance(features_df, pd.DataFrame):
        raise TypeError("Modular feature function must return a pandas Dataframe!")
    print(f"  [Feature Eng.] FE function returned shape: {features_df.shape}")
    if len(features_df) != 1:
        raise ValueError(f"Modular feature function should return 1 row, got {len(features_df)}.")

    # Ensure sequence_id is present (will be dropped later).
    if 'sequence_id' not in features_df.columns:
        # Taken from the first input row, assuming the id is constant within the frame.
        seq_id_from_input = single_sequence_df['sequence_id'].iloc[0]
        # assign() returns a new frame, so the feature function's result is not mutated.
        features_df = features_df.assign(sequence_id=seq_id_from_input)
        print(f"  [Feature Eng.] Added missing 'sequence_id' column with value {seq_id_from_input}.")

    print(f"  [Feature Eng.] Final features ready. Shape: {features_df.shape}")
    return features_df

## Submission loop

In [None]:
# --- Cell 3a: Per-sequence prediction helper (used by the main submission loop in the next cell) ---
"""
Prediction helper for the main Kaggle submission loop.
This cell defines the function that turns one sequence served by CMIGateway
into a gesture prediction; the loop that drives the gateway follows in the next cell.
Includes enhanced debugging output and a fallback prediction for failures.
"""

# --- Define the prediction function (CORRECTED IMPLEMENTATION) ---
def _decode_predicted_class(model, class_index: int, inv_map: dict) -> str:
    """Translate a predict_proba column index into a gesture string.

    The probability columns follow the order of `model.classes_`, which is not
    necessarily the order in which `inv_map` was enumerated. When the model
    exposes `classes_`, the index is first resolved to the model's own label:
    a string label is returned directly, a numeric label is looked up in
    `inv_map`. If `classes_` is missing or the label cannot be resolved, the
    positional lookup `inv_map[class_index]` is used.
    """
    classes = getattr(model, 'classes_', None)
    if classes is not None and 0 <= class_index < len(classes):
        label = classes[class_index]
        if isinstance(label, str):
            # The model was trained on gesture strings, so the label is the answer.
            return str(label)
        try:
            return inv_map[int(label)]
        except (KeyError, TypeError, ValueError):
            pass  # unknown label encoding: fall through to the positional lookup
    return inv_map[class_index]


def predict_single_sequence_for_submission(model, single_sequence_df: pd.DataFrame, feature_func, inv_map: dict) -> str:
    """
    Performs feature engineering and prediction for a single sequence DataFrame for the final submission.

    Parameters
    ----------
    model
        Fitted classifier exposing `predict_proba`; `feature_names_` (or
        `get_feature_names()`) and `classes_` are used when available.
    single_sequence_df : pd.DataFrame
        Raw rows of one sequence.
    feature_func : callable
        Maps `single_sequence_df` to a feature DataFrame (one row expected).
    inv_map : dict
        Integer class code -> gesture string.

    Returns
    -------
    str
        The predicted gesture. On any error during feature engineering or
        prediction the error is printed with its traceback and a default
        gesture is returned instead (`inv_map[0]` if present, otherwise the
        first value of `inv_map`), so one bad sequence does not abort the run.
    """
    try:
        print(f"      Feature engineering for sequence with {len(single_sequence_df)} rows...")
        features_df = feature_func(single_sequence_df)
        print(f"      Feature engineering complete. Output shape: {features_df.shape}")

        # --- Proactive Debugging: Check feature names ---
        # Determine which feature names the model was trained on.
        if getattr(model, 'feature_names_', None) is not None:
            expected_feature_names = list(model.feature_names_)
        elif callable(getattr(model, 'get_feature_names', None)):
            # Some models expose a get_feature_names method instead.
            expected_feature_names = list(model.get_feature_names())
        else:
            # Risky fallback: trust the engineered columns, minus known
            # non-feature columns. Ideally this branch is never reached.
            print("      WARNING: Could not determine model's expected feature names from model object attributes.")
            non_feature_cols = ['sequence_id', 'subject', 'gesture', 'gesture_encoded']  # Common ones
            expected_feature_names = [f for f in features_df.columns if f not in non_feature_cols]

        print(f"      Model expects {len(expected_feature_names)} features.")

        # Ensure all expected features are present.
        missing_features = set(expected_feature_names) - set(features_df.columns)
        if missing_features:
            print(f"      ERROR: Missing features for model: {missing_features}")
            print(f"      First few missing: {list(missing_features)[:5]}")
            raise ValueError(f"Feature mismatch: Missing features {missing_features}")

        # Select only the features the model expects, in the model's order.
        model_input_features = features_df[expected_feature_names]

        # The feature function should produce one row per sequence; if it does
        # not, keep only the first row (still as a DataFrame).
        if model_input_features.shape[0] != 1:
            print(f"      WARNING: Feature function returned {model_input_features.shape[0]} rows. Expected 1. Using first row.")
            model_input_features = model_input_features.iloc[[0]]

        print(f"      Features selected for model input. Shape: {model_input_features.shape}")

        # --- Model Prediction ---
        print("      Calling model.predict_proba...")
        y_pred_proba = model.predict_proba(model_input_features)
        print(f"      Model prediction complete. Proba shape: {y_pred_proba.shape}")
        # Column index of the most probable class for the first (only) row.
        predicted_class_index = int(np.argmax(y_pred_proba, axis=1)[0])
        # Decode through model.classes_ rather than assuming column i == inv_map key i.
        predicted_gesture_string = _decode_predicted_class(model, predicted_class_index, inv_map)
        print(f"      Prediction made: '{predicted_gesture_string}'")
        return predicted_gesture_string

    except Exception as e:
        print(f"      ERROR during feature engineering or prediction for sequence: {e}")
        import traceback
        traceback.print_exc()
        # --- Proactive Debugging: Fallback in Final Submission ---
        # In the final submission it is better to predict a default gesture
        # than to crash the entire process or return None.
        default_gesture_index = 0  # Or choose based on training frequency
        if default_gesture_index in inv_map:
            default_gesture = inv_map[default_gesture_index]
        else:
            # Key 0 is absent: use the first gesture in the map instead.
            default_gesture = next(iter(inv_map.values()))
        print(f"      Fallback prediction due to error: '{default_gesture}'")
        return default_gesture  # Return fallback string, not None

In [None]:
# --- Cell 3: Main Submission Loop (Revised for CMIGateway - Ensuring Path Unpacking with Tuple) ---
# ... (keep the predict_single_sequence_for_submission function as is) ...

# --- Initialize the API (Corrected: Using Explicit Local Paths AND calling unpack_data_paths) ---
# Purpose: build a CMIGateway over local copies of the test files, sanity-check
# that both files can be read, and obtain the batch generator (`data_generator`)
# consumed by the prediction loop further down. Any failure is printed with a
# traceback and re-raised, so the loop never runs on a half-initialized gateway.
print("\nInitializing Kaggle API (CMIGateway) with explicit LOCAL data paths...")
try:
    # --- Proactive State Management: Define and Verify Local Paths ---
    # The data directory is derived from the PARENT of the current working
    # directory: <parent>/notebooks/kaggle/input/cmi-detect-behavior-with-sensor-data.
    # Note this is a different location from DATA_PATH ('./data/') used by the
    # earlier cells.
    project_root = os.path.abspath(os.path.join(os.getcwd(), ".."))
    local_data_base_dir = os.path.join(project_root, "notebooks", "kaggle", "input", "cmi-detect-behavior-with-sensor-data")

    local_test_path = os.path.join(local_data_base_dir, "test.csv")
    local_demo_path = os.path.join(local_data_base_dir, "test_demographics.csv")

    # Normalize to absolute paths (expanding any '~').
    local_test_path = os.path.abspath(os.path.expanduser(local_test_path))
    local_demo_path = os.path.abspath(os.path.expanduser(local_demo_path))

    print(f"  Configured local test path: {local_test_path}")
    print(f"  Configured local demo path: {local_demo_path}")

    # Check if files exist at the specified local paths BEFORE passing to gateway
    if not os.path.exists(local_test_path):
        raise FileNotFoundError(f"Local test file not found at: {local_test_path}")
    if not os.path.exists(local_demo_path):
        raise FileNotFoundError(f"Local demographics file not found at: {local_demo_path}")
    print("  Local test files confirmed to exist.")

    # --- Pass the paths as a tuple: (test csv, demographics csv) ---
    env = CMIGateway(data_paths=(local_test_path, local_demo_path))
    print("API (CMIGateway) instantiated with explicit local paths tuple.")

    # --- CRITICAL: Explicitly call unpack_data_paths to set internal attributes ---
    # NOTE(review): CMIGateway's implementation is not visible here; this presumably
    # copies data_paths[0] / data_paths[1] into env.test_path / env.demographics_path
    # (the attributes checked right below) - confirm against cmi_gateway.py.
    print("Calling unpack_data_paths() to set internal paths...")
    env.unpack_data_paths() # This should now correctly use self.data_paths[0] and [1]
    print("unpack_data_paths() completed.")

    # --- Proactive Debugging: Verify Internal Paths are Set ---
    if not hasattr(env, 'test_path') or not hasattr(env, 'demographics_path'):
         raise AttributeError("CMIGateway failed to set test_path or demographics_path after unpack_data_paths().")
    print(f"  Internal paths set by gateway: test_path='{env.test_path}', demographics_path='{env.demographics_path}'")
    # Optional: a mismatch is only reported as a warning, it does not abort.
    if env.test_path != local_test_path or env.demographics_path != local_demo_path:
        print("  WARNING: Internal paths do not exactly match provided paths, but they should point to the same files.")

    # --- Enhanced Debugging: Check File Reading (using gateway's internal paths) ---
    # Both files are read once with Polars purely as a smoke test; the frames
    # read here (test_pl, demos_pl) are not used by the prediction loop below.
    print("\n--- Enhanced Debugging: Checking File Reading with Gateway's Paths ---")
    import polars as pl

    print("Attempting to read test.csv with Polars (using gateway's internal path)...")
    try:
        test_pl = pl.read_csv(env.test_path)
        print(f"  Successfully read test.csv. Shape: {test_pl.shape}")
        if test_pl.is_empty():
            print("  WARNING: test.csv is empty!")
        else:
             unique_seq_ids = test_pl['sequence_id'].unique()
             print(f"  Number of unique sequence_ids found: {len(unique_seq_ids)}")
    except Exception as read_error:
        print(f"  ERROR reading test.csv with Polars: {read_error}")
        raise

    print("Attempting to read test_demographics.csv with Polars (using gateway's internal path)...")
    try:
        demos_pl = pl.read_csv(env.demographics_path)
        print(f"  Successfully read test_demographics.csv. Shape: {demos_pl.shape}")
        if demos_pl.is_empty():
            print("  WARNING: test_demographics.csv is empty!")
    except Exception as read_error:
        print(f"  ERROR reading test_demographics.csv with Polars: {read_error}")
        raise
    print("--- End Enhanced Debugging ---\n")

    # --- Get the generator ---
    # The loop below unpacks each yielded item as (batch_data, row_ids_df).
    data_generator = env.generate_data_batches()
    print("Data generator created.")

except Exception as e:
    # Report the failure, then re-raise so the cell stops here.
    print(f"Error during API initialization or data check: {e}")
    import traceback
    traceback.print_exc()
    raise


# --- The Submission Loop (Interacting with the generator) ---
# Test harness: pulls batches from `data_generator`, predicts a gesture for each
# sequence and hands it to the gateway, stopping after MAX_SEQUENCES_TO_TEST
# sequences. Errors inside the loop are caught by the outer `except`, printed,
# and NOT re-raised, so the final banner is printed even after a failure.
print("Entering main prediction loop (testing first few sequences)...")
sequence_count = 0
MAX_SEQUENCES_TO_TEST = 5
sequences_processed = 0 # New counter to track actual loop entries

try:
    for batch_data, row_ids_df in data_generator:
        # Both counters are incremented BEFORE the limit check, so when the
        # limit triggers they equal MAX_SEQUENCES_TO_TEST + 1 (the iteration
        # that breaks is counted although nothing is predicted in it).
        sequences_processed += 1 # Increment if loop body is entered
        sequence_count += 1
        if sequence_count > MAX_SEQUENCES_TO_TEST:
            print(f"Reached test limit of {MAX_SEQUENCES_TO_TEST} sequences. Stopping.")
            break

        # --- Extract sequence_id and data ---
        # batch_data is unpacked as (sequence frame, demographics frame).
        sequence_pl_df, sequence_demos_pl_df = batch_data
        sequence_id_series = row_ids_df['sequence_id'] # This should be a Polars Series
        sequence_id = sequence_id_series[0] # Get the single sequence ID for this batch

        print(f"\nProcessing sequence {sequence_count}: {sequence_id}")
        print(f"  Sequence data shape (Polars): {sequence_pl_df.shape}")
        # print(f"  Sequence columns: {sequence_pl_df.columns}") # Uncomment for detailed inspection
        print(f"  Demographics data shape (Polars): {sequence_demos_pl_df.shape}")

        # --- Convert Polars DataFrames to Pandas ---
        # The API serves data using Polars. Convert for our functions.
        # Only the sequence frame is converted and used; the demographics frame
        # is not passed to the prediction pipeline.
        if hasattr(sequence_pl_df, 'to_pandas'):
            test_sequence_df = sequence_pl_df.to_pandas()
            # NOTE(review): the original author assumed the gateway merges
            # demographics on 'subject' inside generate_data_batches, so that
            # test_sequence_df already carries demographic columns. That is not
            # verifiable from this notebook - confirm against cmi_gateway.py.
            # To use demographics separately, convert sequence_demos_pl_df too:
            # demos_df = sequence_demos_pl_df.to_pandas()
        else:
            # No prediction is submitted for a sequence skipped here.
            print(f"      ERROR: Expected Polars DataFrame from API.")
            continue # Skip this sequence

        # --- Proactive Debugging: Check for train-only columns ---
        # 'phase' is treated as a train-only column; its presence is only
        # reported, processing continues regardless.
        if 'phase' in test_sequence_df.columns:
             print(f"      CRITICAL WARNING: 'phase' column found in test data served by API. This column is train-only. Feature engineering MUST NOT use it.")

        # --- Feature Engineering and Model Prediction ---
        # Note: `create_wave1_features` is passed directly as the feature
        # function; the `create_final_submission_features` wrapper defined
        # earlier in this notebook is not used here.
        try:
            predicted_gesture_string = predict_single_sequence_for_submission(
                model=model,
                single_sequence_df=test_sequence_df, # This is the full data for one sequence
                feature_func=create_wave1_features, # <-- Ensure this function is designed for single-sequence input
                inv_map=inv_gesture_map
            )
            print(f"  Predicted gesture string: '{predicted_gesture_string}' (Type: {type(predicted_gesture_string)})")
            # --- Proactive Debugging: Ensure prediction is a string ---
            if not isinstance(predicted_gesture_string, str):
                print(f"  CRITICAL ERROR: predict_single_sequence_for_submission returned a non-string: {predicted_gesture_string}")
                raise TypeError(f"Prediction must be a string, got {type(predicted_gesture_string)}")

        except Exception as e:
            # Reached for the non-string TypeError above or for anything that
            # escapes the helper; a default gesture is substituted so a
            # prediction is still submitted.
            print(f"  ERROR in prediction pipeline for {sequence_id}: {e}")
            import traceback
            traceback.print_exc()
            # Fallback prediction (should be a string now)
            default_gesture_index = 0
            default_gesture = inv_gesture_map.get(default_gesture_index, list(inv_gesture_map.values())[0])
            predicted_gesture_string = default_gesture
            print(f"  Using fallback prediction for {sequence_id}: '{predicted_gesture_string}'")

        # --- Submit Prediction ---
        try:
            print(f"  Submitting prediction '{predicted_gesture_string}' for {sequence_id}...")
            # NOTE(review): the signature of CMIGateway.predict is not visible in
            # this notebook. Two call shapes are attempted below:
            #   1. env.predict(prediction, row_ids_df)  - modeled on the
            #      (prediction, row_ids) order the author attributes to
            #      BaseGateway.validate_prediction_batch;
            #   2. env.predict(sequence_id, prediction) - tried only if the first
            #      call raises TypeError.
            # Confirm the correct usage against the kaggle_evaluation sources.

            # --- Proactive Debugging: Check prediction validity before submission ---
            # If the gateway exposes `all_gestures`, warn (without aborting) when
            # the prediction is not one of them.
            if hasattr(env, 'all_gestures') and predicted_gesture_string not in env.all_gestures:
                print(f"  WARNING: Predicted gesture '{predicted_gesture_string}' not in gateway's known list. Submission might fail.")

            # Try the primary signature based on BaseGateway validation logic
            env.predict(predicted_gesture_string, row_ids_df) # Try passing row_ids_df as per base validation
            # If the above fails, try: env.predict(sequence_id, predicted_gesture_string)
            print(f"  Prediction submitted for {sequence_id}.")
        except TypeError as te:
            # Any TypeError from the first call (not only a signature mismatch)
            # lands here and triggers the alternative call.
            print(f"  TypeError on env.predict (likely wrong signature): {te}")
            print("  Trying alternative submission method: env.predict(sequence_id, prediction)...")
            try:
                # Fallback signature based on initial schema
                env.predict(sequence_id, predicted_gesture_string)
                print(f"  Prediction submitted for {sequence_id} (using fallback signature).")
            except Exception as te2:
                print(f"  Fallback submission also failed: {te2}")
                raise te2 # Re-raise the second error
        except Exception as e:
            # Submission errors abort the loop: the exception propagates to the
            # outer handler below.
            print(f"  ERROR submitting prediction for {sequence_id}: {e}")
            import traceback
            traceback.print_exc()
            raise e # Consider if you want to stop on submission error or continue

    # Reached both when the generator is exhausted and after the `break` above.
    # min() compensates for sequence_count overshooting the limit by one.
    print(f"\nPrediction loop completed successfully. Iterated through loop body {sequences_processed} times.")
    print(f"Tested {min(sequence_count, MAX_SEQUENCES_TO_TEST)} sequences.")

except Exception as e:
    # Errors are reported but swallowed here (no re-raise), so the cell itself
    # finishes without an exception even when the loop failed.
    print(f"\n!!! CRITICAL ERROR in main loop !!!")
    print(f"Error: {e}")
    import traceback
    traceback.print_exc()

print("\n=== Final Submission Notebook Execution (Test Loop) Complete ===")