In [None]:
# Detect Behavior with Sensor Data – Detach-ROCKET ensemble + Demographics
# ------------------------------------------------------------------
# Leave-one-subject-out evaluation of a binary (BFRB-like gesture vs.
# any other gesture) classifier built on DetachEnsemble.
# The seven demographic/anthropometric columns from
# train_demographics.csv are merged onto every row of the sensor frame
# and treated as extra numeric channels.
# NOTE: the CNN / Bi-LSTM (Keras) model named in the earlier revision of
# this notebook is not used here; its imports are left commented out below.

import os
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder

#from tensorflow.keras.models import Sequential, load_model
#from tensorflow.keras.layers import (
#    Conv1D, MaxPooling1D, Flatten, Dense, Dropout, BatchNormalization,
#    LSTM, Bidirectional, GlobalAveragePooling1D
#)
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.preprocessing.sequence import pad_sequences
#from tensorflow.keras.optimizers import Adam
#from tensorflow.keras.callbacks import ReduceLROnPlateau, EarlyStopping
#import tensorflow as tf
import polars as pl
#import kaggle_evaluation.cmi_inference_server  # noqa: F401   | Kaggle runner hook
from collections import Counter

import os
import pickle
from copy import deepcopy
from collections import Counter

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.pipeline import make_pipeline

from sklearn.metrics import (
    accuracy_score,
    balanced_accuracy_score,
    precision_recall_fscore_support,
    confusion_matrix,
    roc_auc_score,
    average_precision_score,
    brier_score_loss,
    classification_report
)

from sklearn.linear_model import RidgeClassifierCV, LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import (
    RandomForestClassifier,
    GradientBoostingClassifier,
    HistGradientBoostingClassifier
)
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier

from sktime.transformations.panel.rocket import Rocket, MiniRocket
from sklearn.metrics import accuracy_score, balanced_accuracy_score, precision_score, recall_score, f1_score


# custom model
from detach_rocket.detach_classes import DetachEnsemble
import time


In [None]:
# -----------------------------
# Helper: compute metrics safely
# -----------------------------
def compute_binary_metrics(y_true, y_pred, y_proba=None):
    """Summarise binary-classification performance for one evaluation fold.

    Parameters
    ----------
    y_true : array-like
        Ground-truth labels (0 = negative, 1 = positive).
    y_pred : array-like
        Predicted hard labels (0 / 1).
    y_proba : array-like, optional
        Scores for the positive class. When omitted, or when ``y_true``
        holds a single class, the score-based metrics are reported as None.

    Returns
    -------
    dict
        Trial count, confusion-matrix cells (tn/fp/fn/tp), accuracy,
        balanced accuracy, precision, recall, specificity, F1, and the
        optional AUROC / AUPRC / Brier score (None when unavailable).

    Side effects
    ------------
    Prints the 2x2 confusion matrix.
    """
    # Pinning labels=[0, 1] keeps the matrix 2x2 (so it unpacks into
    # TN, FP, FN, TP) even when one class is absent from the fold.
    conf_mat = confusion_matrix(y_true, y_pred, labels=[0, 1])
    print(conf_mat)
    true_neg, false_pos, false_neg, true_pos = conf_mat.ravel()

    accuracy = accuracy_score(y_true, y_pred)
    balanced = balanced_accuracy_score(y_true, y_pred)

    # Positive-class precision / recall / F1; zero_division=0 avoids NaNs
    # when the model never predicts the positive class.
    precision, recall, f_score, _ = precision_recall_fscore_support(
        y_true, y_pred, average='binary', zero_division=0
    )

    # Specificity = TN / (TN + FP), defined as 0.0 when there are no negatives.
    negatives = true_neg + false_pos
    if negatives > 0:
        specificity = true_neg / negatives
    else:
        specificity = 0.0

    def _try_score(scorer):
        """Apply a score-based metric; any failure yields None."""
        try:
            value = scorer(y_true, y_proba)
        except Exception:
            return None
        return None if value is None else float(value)

    # Score-based metrics need scores and both classes present in y_true.
    scores_usable = y_proba is not None and len(np.unique(y_true)) > 1
    if scores_usable:
        auroc = _try_score(roc_auc_score)
        auprc = _try_score(average_precision_score)
        brier = _try_score(brier_score_loss)
    else:
        auroc = auprc = brier = None

    summary = {"n_trials": int(len(y_true))}
    summary.update(
        tn=int(true_neg), fp=int(false_pos), fn=int(false_neg), tp=int(true_pos)
    )
    summary["accuracy"] = float(accuracy)
    summary["balanced_accuracy"] = float(balanced)
    summary["precision"] = float(precision)
    summary["recall"] = float(recall)          # sensitivity
    summary["specificity"] = float(specificity)
    summary["f1"] = float(f_score)
    summary["auroc"] = auroc
    summary["auprc"] = auprc
    summary["brier"] = brier
    return summary


In [None]:


def get_numpy_array(df, pad_len=None, drop_thermal_and_tof=False):
    """Convert a long-format sensor frame into padded per-sequence tensors.

    Rows are grouped by ``sequence_id``. Each sequence is imputed
    (forward-fill, back-fill, then 0), standardised with its own
    ``StandardScaler`` and post-padded / post-truncated to a common length.
    The ``gesture`` column is collapsed to a binary target
    (1 = BFRB-like gesture, 0 = anything else).

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``sequence_id`` and ``gesture``. Every column that is
        not in the exclusion set below is used as a feature channel.
        The frame is not modified by this function.
    pad_len : int, optional
        Fixed sequence length. When None (default) the 90th percentile of
        the sequence lengths found in ``df`` is used, as before. Pass an
        explicit value to give train and test tensors the same length.
    drop_thermal_and_tof : bool, default False
        When True, ``thm_*`` and ``tof_*`` columns are excluded.

    Returns
    -------
    X : numpy.ndarray, float32
        Padded tensor, one entry per sequence (sequences ordered by
        sorted ``sequence_id``).
    y : numpy.ndarray
        One-hot labels with exactly two columns
        (column 0 = non-target, column 1 = target).

    Raises
    ------
    ValueError
        If ``df`` contains no sequences.
    """
    # ------------------------------------------------------------------
    # 2. BINARY LABEL-ENCODE GESTURE TARGET
    # ------------------------------------------------------------------

    # Target gestures (BFRB-like = 1); everything else maps to 0
    bfrb_gestures = [
        "Above ear - pull hair",
        "Forehead - pull hairline",
        "Forehead - scratch",
        "Eyebrow - pull hair",
        "Eyelash - pull hair",
        "Neck - pinch skin",
        "Neck - scratch",
        "Cheek - pinch skin",
    ]

    # Labels live in their own Series: overwriting df["gesture"] would
    # mutate the caller's frame and make a second call on the same frame
    # return all zeros (the strings would already have been replaced).
    labels = df["gesture"].isin(bfrb_gestures).astype("int64")

    # Optional: print class distribution
    print("Binary label distribution:")
    print(labels.value_counts().rename(index={0: "non-target", 1: "target"}))

    # ------------------------------------------------------------------
    # 3.  FEATURE LIST CONSTRUCTION
    # ------------------------------------------------------------------
    excluded_cols = {
        "gesture", "sequence_type", "behavior", "orientation",  # train-only targets
        "row_id", "subject", "phase",                            # meta
        "sequence_id", "sequence_counter"                         # ids
    }

    if drop_thermal_and_tof:
        excluded_cols.update(
            c for c in df.columns if c.startswith(("thm_", "tof_"))
        )

    # Demographic columns are expected to have been merged in by the caller;
    # they are picked up automatically because they are not excluded.
    demographic_cols = [
        "adult_child", "age", "sex", "handedness",
        "height_cm", "shoulder_to_wrist_cm", "elbow_to_wrist_cm",
    ]
    missing_demographics = [c for c in demographic_cols if c not in df.columns]
    if missing_demographics:
        print(f"Warning: demographic columns missing from frame: {missing_demographics}")

    # Combine sensor + demographic feature list
    feature_cols = [c for c in df.columns if c not in excluded_cols]
    print(f"Using {len(feature_cols)} feature columns for training, including demographics:")
    print(sorted(feature_cols)[:15], "…")

    # Check missing values
    nan_total = df[feature_cols].isna().sum().sum()
    print(f"Total NaNs inside feature matrix: {nan_total:,}")

    # ------------------------------------------------------------------
    # 4.  SEQUENCE BUILDING HELPERS
    # ------------------------------------------------------------------

    def preprocess_sequence(df_seq: pd.DataFrame, feature_columns: list[str]) -> np.ndarray:
        """Fill→scale a *single* sequence dataframe and return float32 numpy."""
        data = df_seq[feature_columns].copy()
        data = data.ffill().bfill().fillna(0.0)
        # NOTE(review): the scaler is fitted per sequence, so any column that
        # is constant within a sequence (which the per-subject demographic
        # columns appear to be) is centred to all zeros and carries no
        # signal. Confirm whether that is intended.
        scaled = StandardScaler().fit_transform(data)   # per-sequence scaler (unchanged)
        return scaled.astype("float32")

    print("Constructing padded tensor dataset …")
    seq_groups = df.groupby("sequence_id")

    X = [preprocess_sequence(seq, feature_cols) for _, seq in seq_groups]
    if not X:
        raise ValueError("get_numpy_array(): `df` contains no sequences to convert.")
    seq_lengths = [arr.shape[0] for arr in X]

    if pad_len is None:
        pad_len = int(np.percentile(seq_lengths, 90))
        print(f"90th-percentile length = {pad_len} → fixed pad length chosen")
    else:
        pad_len = int(pad_len)
        print(f"Using caller-supplied pad length = {pad_len}")

    X = pad_sequences(X, maxlen=pad_len, dtype="float32", padding="post", truncating="post")

    # One label per sequence. Grouping by the raw key values (positional)
    # uses the same sorted sequence_id order as iterating over `seq_groups`.
    y = labels.groupby(df["sequence_id"].to_numpy()).first().to_numpy()

    # Always encode two classes. Deriving the width from np.unique(y) gave an
    # (n, 1) array for a single-class fold, and failed outright when that
    # single class was 1 (index 1 does not exist in a one-column matrix).
    y = to_categorical(y, num_classes=2)

    return X, y

In [None]:
def save_results(rows, num_kernels=None, results_dir="results"):
    """Write per-subject and aggregated LOSO metrics to CSV files.

    Parameters
    ----------
    rows : list of dict
        One metrics dict per held-out subject. Each dict must contain the
        keys listed in ``cols`` below.
    num_kernels : int, optional
        Kernel count embedded in the output file names. When None, the
        notebook-level global ``num_kernels`` is used, which is what the
        original one-argument call relied on.
    results_dir : str, default "results"
        Output directory; created if it does not exist.

    Returns
    -------
    tuple of str
        ``(per_subject_csv, agg_csv)`` – paths of the two files written.

    Raises
    ------
    NameError
        If ``num_kernels`` is not passed and no global of that name exists.
    ValueError
        If ``rows`` is empty.
    """
    if num_kernels is None:
        # Backward compatibility with callers that set a notebook global.
        try:
            num_kernels = globals()["num_kernels"]
        except KeyError:
            raise NameError(
                "num_kernels was not passed to save_results() and no "
                "notebook-level `num_kernels` is defined"
            ) from None

    if not rows:
        # An empty frame has no columns, so the column selection below would
        # otherwise fail with an opaque KeyError.
        raise ValueError("save_results() received no metric rows; nothing to write.")

    os.makedirs(results_dir, exist_ok=True)

    # Order columns nicely
    cols = [
        "test_subject", "train_n_trials", "test_n_trials",
        "tn","fp","fn","tp",
        "accuracy","balanced_accuracy","precision","recall","specificity","f1",
        "auroc","auprc","brier"
    ]
    df_subjects = pd.DataFrame(rows)[cols].sort_values("test_subject")

    per_subject_csv = os.path.join(results_dir, f"DetachR_num_kernels{num_kernels}_RidgeClassifierCV_loso_subject_metrics.csv")
    df_subjects.to_csv(per_subject_csv, index=False)
    print(f"Per-subject metrics written to: {per_subject_csv}")

    # -----------------------------
    # Aggregate across subjects
    # -----------------------------
    # Only numeric columns are aggregated; columns holding None (e.g. the
    # score-based metrics when no probabilities were supplied) drop out.
    metric_values = df_subjects.drop(columns=["test_subject"])
    agg_metrics = metric_values.mean(numeric_only=True).to_frame("mean")
    agg_metrics["std"] = metric_values.std(numeric_only=True)

    # Reset index so metrics become a column
    df_agg = agg_metrics.reset_index().rename(columns={"index": "metric"})

    agg_csv = os.path.join(results_dir, f"DetachR_num_kernels{num_kernels}_RidgeClassifierCV_loso_agg_metrics.csv")
    df_agg.to_csv(agg_csv, index=False)
    print(f"Aggregated metrics written to: {agg_csv}")

    return per_subject_csv, agg_csv

In [None]:

print("Imports loaded")

# ------------------------------------------------------------------
# 1.  LOAD TRAIN SENSOR DATA + DEMOGRAPHICS
# ------------------------------------------------------------------
print("Loading sensor dataset …")
# Data directory. Set the CMI_DATA_DIR environment variable to point at a
# different copy of the competition files; the fallback is the original
# author's local path (kept verbatim, including the space before "Detect").
root = os.environ.get(
    "CMI_DATA_DIR",
    '/Users/ashhadulislam/projects/general_data/CMI/ Detect Behavior with Sensor Data/cmi-detect-behavior-with-sensor-data/',
)

df = pd.read_csv(os.path.join(root, "train.csv"))
print(f"Loaded {len(df):,} rows of sensor frames")

# --- NEW: merge participant demographics on the key `subject` --------
print("Merging demographic attributes …")
demographics = pd.read_csv(os.path.join(root, "train_demographics.csv"))
# validate="many_to_one" makes pandas raise if a subject appears more than
# once in the demographics table, which would otherwise silently duplicate
# sensor rows in the left join.
df = df.merge(demographics, on="subject", how="left", validate="many_to_one")



In [None]:
# Preview the first rows of the merged sensor + demographics frame
df.head()

In [None]:
# Distinct subject identifiers; each one is held out once in the loop below
subjects = df["subject"].unique()



In [None]:
# ------------------------------------------------------------------
# Leave-one-subject-out evaluation with a DetachEnsemble classifier
# ------------------------------------------------------------------
# Ensemble configuration does not change between folds, so set it once.
# NOTE: save_results() reads the notebook-level `num_kernels` to build its
# output file names, so it must be defined before the first call.
num_models = 5
num_kernels = 100

# Number of held-out subjects to evaluate. The original cell stopped after
# four subjects; set to None to run the full leave-one-subject-out sweep.
max_test_subjects = 4

rows = []                 # per-test-subject metric rows
for test_subject in subjects[:max_test_subjects]:
    start = time.time()
    print(f'Test subject: {test_subject}')

    is_test = df['subject'] == test_subject
    train_df = df[~is_test]
    test_df = df[is_test]
    print(train_df.shape, test_df.shape)

    X_test, y_test = get_numpy_array(test_df)
    X_train, y_train = get_numpy_array(train_df)
    print('Obtained tensors for train and test')

    # Make y 1-D labels: argmax for one-hot input, ravel for class ids.
    y_train_1d = y_train.argmax(axis=1) if y_train.ndim == 2 else y_train.ravel()
    y_test_1d  = y_test.argmax(axis=1)  if y_test.ndim == 2 else y_test.ravel()

    # Flag folds whose held-out subject has a single class. The check uses
    # the decoded labels, so it does not depend on get_numpy_array()
    # rewriting the `gesture` column of the frame it was given.
    test_classes = np.unique(y_test_1d)
    if len(test_classes) == 1:
        print(test_subject, df.shape[0], train_df.shape[0], test_df.shape[0])
        print(Counter(y_train_1d.tolist()), len(test_classes))
    print(X_test.shape, y_test.shape, X_train.shape, y_train.shape)

    # 1) Transpose to (n_samples, n_channels, n_timepoints)
    X_train = np.transpose(X_train, (0, 2, 1))
    X_test  = np.transpose(X_test,  (0, 2, 1))

    # 2) Train and test are padded independently, so crop both to the
    #    shorter time length to give them a common T.
    T = min(X_train.shape[-1], X_test.shape[-1])
    X_train = X_train[..., :T]
    X_test  = X_test[..., :T]

    print('going to train on detach')
    clf = DetachEnsemble(num_models=num_models, num_kernels=num_kernels)
    clf.fit(X_train, y_train_1d)
    y_test_pred = clf.predict(X_test)

    # No probability scores are passed, so AUROC / AUPRC / Brier stay None.
    subject_metrics = compute_binary_metrics(y_test_1d, y_test_pred, None)
    subject_metrics.update({
        "test_subject": test_subject,
        "train_n_trials": int(len(y_train)),
        "test_n_trials": int(len(y_test)),
    })
    print(subject_metrics)
    rows.append(subject_metrics)

    end = time.time()
    # Rewrite the result files after every fold so partial progress
    # survives an interrupted run.
    save_results(rows)
    print(end - start)
    
    

