In [1]:
# Detect Behavior with Sensor Data – CNN + Bi-LSTM + Demographics
# ------------------------------------------------------------------
# This is a minimally-intrusive revision of your original notebook.
# The only functional addition is that the seven demographic/anthro-
# pometric columns from train_demographics.csv are merged onto every
# row of the sensor frame and treated as extra numeric channels.
# Nothing else in the pipeline changes, so you can reuse previous
# hyper-parameters and checkpoints if desired.

import os
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from tensorflow.keras.models import Sequential, load_model
from tensorflow.keras.layers import (
    Conv1D, MaxPooling1D, Flatten, Dense, Dropout, BatchNormalization,
    LSTM, Bidirectional, GlobalAveragePooling1D
)
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import ReduceLROnPlateau, EarlyStopping
import tensorflow as tf
import polars as pl
#import kaggle_evaluation.cmi_inference_server  # noqa: F401   | Kaggle runner hook

print("Imports loaded")

# ------------------------------------------------------------------
# 1.  LOAD TRAIN SENSOR DATA + DEMOGRAPHICS
# ------------------------------------------------------------------
print("Loading sensor dataset …")
root = '/Users/ashhadulislam/projects/general_data/CMI/ Detect Behavior with Sensor Data/cmi-detect-behavior-with-sensor-data/'

df = pd.read_csv(f"{root}/train.csv")
print(f"Loaded {len(df):,} rows of sensor frames")

# --- NEW: merge participant demographics on the key `subject` --------
print("Merging demographic attributes …")
demographics = pd.read_csv(f"{root}/train_demographics.csv")
df = df.merge(demographics, on="subject", how="left")



Imports loaded
Loading sensor dataset …
Loaded 574,945 rows of sensor frames
Merging demographic attributes …


In [2]:
# ------------------------------------------------------------------
# 2. BINARY LABEL-ENCODE GESTURE TARGET
# ------------------------------------------------------------------

# Define target gestures (BFRB-like = 1) and map others to 0
bfrb_gestures = [
    "Above ear - pull hair",
    "Forehead - pull hairline",
    "Forehead - scratch",
    "Eyebrow - pull hair",
    "Eyelash - pull hair",
    "Neck - pinch skin",
    "Neck - scratch",
    "Cheek - pinch skin",
]

# Assign binary labels
df["gesture"] = df["gesture"].apply(lambda g: 1 if g in bfrb_gestures else 0)

# Save the binary class names
binary_classes = np.array(["non_target", "target"])
np.save("gesture_classes_binary.npy", binary_classes)

# Optional: print class distribution
print("Binary label distribution:")
print(df["gesture"].value_counts().rename(index={0: "non-target", 1: "target"}))

Binary label distribution:
gesture
target        344058
non-target    230887
Name: count, dtype: int64


In [3]:

# ------------------------------------------------------------------
# 3.  FEATURE LIST CONSTRUCTION
# ------------------------------------------------------------------
# Optionally skip thermal/TOF values → set to False to use them.

drop_thermal_and_tof = False

excluded_cols = {
    "gesture", "sequence_type", "behavior", "orientation",  # train-only targets
    "row_id", "subject", "phase",                            # meta
    "sequence_id", "sequence_counter"                         # ids
}

thermal_tof_cols = [c for c in df.columns if c.startswith(("thm_", "tof_"))]

if drop_thermal_and_tof:
    excluded_cols.update(thermal_tof_cols)
    print(f"Ignoring {len(thermal_tof_cols)} thermopile/TOF channels → set drop_thermal_and_tof=False to use them.")

# --- NEW: demographic numeric columns --------------------------------
demographic_cols = [
    "adult_child", "age", "sex", "handedness",
    "height_cm", "shoulder_to_wrist_cm", "elbow_to_wrist_cm",
]

# Combine sensor + demographic feature list
feature_cols = [c for c in df.columns if c not in excluded_cols]
print(f"Using {len(feature_cols)} feature columns for training, including demographics:")
print(sorted(feature_cols)[:15], "…")

# Check missing values
nan_total = df[feature_cols].isna().sum().sum()
print(f"Total NaNs inside feature matrix: {nan_total:,}")

Using 339 feature columns for training, including demographics:
['acc_x', 'acc_y', 'acc_z', 'adult_child', 'age', 'elbow_to_wrist_cm', 'handedness', 'height_cm', 'rot_w', 'rot_x', 'rot_y', 'rot_z', 'sex', 'shoulder_to_wrist_cm', 'thm_1'] …
Total NaNs inside feature matrix: 3,597,807


In [4]:
# ------------------------------------------------------------------
# 4.  SEQUENCE BUILDING HELPERS
# ------------------------------------------------------------------

def preprocess_sequence(df_seq: pd.DataFrame, feature_columns: list[str]) -> np.ndarray:
    """Fill→scale a *single* sequence dataframe and return float32 numpy."""
    data = df_seq[feature_columns].copy()
    data = data.ffill().bfill().fillna(0.0)
    scaled = StandardScaler().fit_transform(data)   # per-sequence scaler (unchanged)
    return scaled.astype("float32")

print("Constructing padded tensor dataset …")
seq_groups = df.groupby("sequence_id")

X, seq_lengths = [], []
for i, (_, seq) in enumerate(seq_groups):
    if i and i % 500 == 0:
        print(f"  processed {i} sequences …")
    arr = preprocess_sequence(seq, feature_cols)
    X.append(arr)
    seq_lengths.append(arr.shape[0])

pad_len = int(np.percentile(seq_lengths, 90))
print(f"90th-percentile length = {pad_len} → fixed pad length chosen")
np.save("sequence_maxlen.npy", pad_len)



Constructing padded tensor dataset …
  processed 500 sequences …
  processed 1000 sequences …
  processed 1500 sequences …
  processed 2000 sequences …
  processed 2500 sequences …
  processed 3000 sequences …
  processed 3500 sequences …
  processed 4000 sequences …
  processed 4500 sequences …
  processed 5000 sequences …
  processed 5500 sequences …
  processed 6000 sequences …
  processed 6500 sequences …
  processed 7000 sequences …
  processed 7500 sequences …
  processed 8000 sequences …
90th-percentile length = 103 → fixed pad length chosen


In [5]:
X = pad_sequences(X, maxlen=pad_len, dtype="float32", padding="post", truncating="post")

y = seq_groups["gesture"].first().values
num_classes = len(np.unique(y))
y = to_categorical(y, num_classes=num_classes)

# ------------------------------------------------------------------
# 5.  TRAIN/VAL SPLIT & MODEL
# ------------------------------------------------------------------
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.20, random_state=42, stratify=y
)

print(X.shape,y.shape,X_train.shape,y_train.shape,X_val.shape,y_val.shape)

(8151, 103, 339) (8151, 2) (6520, 103, 339) (6520, 2) (1631, 103, 339) (1631, 2)


In [6]:
# Flatten the time series data
X_train_flat = X_train.reshape(X_train.shape[0], -1)
X_val_flat = X_val.reshape(X_val.shape[0], -1)

In [7]:
y_train_labels = np.argmax(y_train, axis=1)
y_val_labels = np.argmax(y_val, axis=1)

In [8]:
import cebra
from cebra import CEBRA

cebra_model = CEBRA(
    model_architecture="offset10-model",
    output_dimension=16,
    batch_size=256,
    temperature=1.0,
    learning_rate=1e-3,
    max_iterations=1000,
    conditional="time",
    device="mps"  # or "cpu"
)

cebra_model.fit(X_train_flat)

In [9]:
X_train_latents = cebra_model.transform(X_train_flat)
X_val_latents = cebra_model.transform(X_val_flat)

In [10]:
print(X_train_latents.shape)

(6520, 16)


In [11]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score, classification_report

clf = LogisticRegression(max_iter=1000, multi_class="multinomial")
clf.fit(X_train_latents, y_train_labels)

y_val_pred = clf.predict(X_val_latents)

f1_macro = f1_score(y_val_labels, y_val_pred, average="macro")
print(f"Macro F1 Score (Validation): {f1_macro:.4f}")
print(classification_report(y_val_labels, y_val_pred))

Macro F1 Score (Validation): 0.4187
              precision    recall  f1-score   support

           0       0.56      0.04      0.07       608
           1       0.63      0.98      0.77      1023

    accuracy                           0.63      1631
   macro avg       0.60      0.51      0.42      1631
weighted avg       0.61      0.63      0.51      1631





In [12]:
# ------------------------------------------------------------------
# 6.  LOCAL VALIDATION METRIC
# ------------------------------------------------------------------
print("Computing validation hierarchical-F1 …")
from cmi_2025_metric_copy_for_import import CompetitionMetric  # local helper

probs_val = clf.predict_proba(X_val_latents)
labels_val_pred = np.argmax(probs_val, axis=1)
labels_val_true = np.argmax(y_val, axis=1)

cls = np.load("gesture_classes_binary.npy", allow_pickle=True)
# Do this (keep binary as integers: 0 = non-target, 1 = target)
val_pred_df = pd.DataFrame({"gesture_binary": labels_val_pred})
val_true_df = pd.DataFrame({"gesture_binary": labels_val_true})

metric = CompetitionMetric()
score = metric.calculate_binary_f1(val_true_df, val_pred_df)
print(f"Estimated public-LB score on held-out fold: {score:.4f}")

Computing validation hierarchical-F1 …
Estimated public-LB score on held-out fold: 0.7694
