# Data loading

This cell loads the raw CSV files into pandas DataFrames.
- Inputs: `../data/train.csv`, `../data/test.csv`
- Outputs: `train_df`, `test_df`
- Purpose: quickly inspect shapes to confirm files were read correctly.

In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
import os

# Load data
train_df = pd.read_csv("../data/train.csv")
test_df = pd.read_csv("../data/test.csv")

print('Original shapes:')
print(f'Train: {train_df.shape}')
print(f'Test: {test_df.shape}')

Original shapes:
Train: (574945, 341)
Test: (107, 336)


# Column comparison

Compare columns between `train_df` and `test_df` to identify mismatches and common features.
This helps align feature sets before training.

In [None]:
print('='*80)
print('COLUMN COMPARISON')
print('='*80)

print(f'\nTrain columns: {len(train_df.columns)}')
print(f'Test columns: {len(test_df.columns)}')

# Find columns present in train but not in test
missing_in_test = set(train_df.columns) - set(test_df.columns)
print(f'\nColumns in TRAIN but NOT in TEST: {missing_in_test}')

# Find common columns
common_cols = set(train_df.columns) & set(test_df.columns)
print(f'Common columns: {len(common_cols)}')

# Print all test columns
print('\nAll TEST columns:')
print(test_df.columns.tolist())

COLUMN COMPARISON

Train columns: 341
Test columns: 336

Columns in TRAIN but NOT in TEST: {'orientation', 'gesture', 'behavior', 'sequence_type', 'phase'}
Common columns: 336

All TEST columns:
['row_id', 'sequence_id', 'sequence_counter', 'subject', 'acc_x', 'acc_y', 'acc_z', 'rot_w', 'rot_x', 'rot_y', 'rot_z', 'thm_1', 'thm_2', 'thm_3', 'thm_4', 'thm_5', 'tof_1_v0', 'tof_1_v1', 'tof_1_v2', 'tof_1_v3', 'tof_1_v4', 'tof_1_v5', 'tof_1_v6', 'tof_1_v7', 'tof_1_v8', 'tof_1_v9', 'tof_1_v10', 'tof_1_v11', 'tof_1_v12', 'tof_1_v13', 'tof_1_v14', 'tof_1_v15', 'tof_1_v16', 'tof_1_v17', 'tof_1_v18', 'tof_1_v19', 'tof_1_v20', 'tof_1_v21', 'tof_1_v22', 'tof_1_v23', 'tof_1_v24', 'tof_1_v25', 'tof_1_v26', 'tof_1_v27', 'tof_1_v28', 'tof_1_v29', 'tof_1_v30', 'tof_1_v31', 'tof_1_v32', 'tof_1_v33', 'tof_1_v34', 'tof_1_v35', 'tof_1_v36', 'tof_1_v37', 'tof_1_v38', 'tof_1_v39', 'tof_1_v40', 'tof_1_v41', 'tof_1_v42', 'tof_1_v43', 'tof_1_v44', 'tof_1_v45', 'tof_1_v46', 'tof_1_v47', 'tof_1_v48', 'tof_1_v49', 

# Feature selection and splitting

Define the label column and select feature columns by excluding metadata columns. Create `X_train`, `y_train` and align `X_test` with available test features.

In [None]:
# Label column
label_col = 'sequence_type'

# Feature columns (exclude metadata)
metadata_cols = ['row_id', 'sequence_id', 'sequence_counter', 'subject', 
                 'orientation', label_col]
feature_cols = [col for col in train_df.columns if col not in metadata_cols]

print(f'Feature columns: {len(feature_cols)}')
print(f'Label column: {label_col}')

# Split features and label
X_train = train_df[feature_cols].copy()
y_train = train_df[label_col].copy()

# Important: for test_df, only select columns that exist in test_df
feature_cols_test = [col for col in feature_cols if col in test_df.columns]

X_test = test_df[feature_cols_test].copy()

print(f'\nX_train shape: {X_train.shape}')
print(f'y_train shape: {y_train.shape}')
print(f'X_test shape: {X_test.shape}')

print(f'\nColumns in train but not test: {set(feature_cols) - set(feature_cols_test)}')

Feature columns: 335
Label column: sequence_type

X_train shape: (574945, 335)
y_train shape: (574945,)
X_test shape: (107, 332)

Columns in train but not test: {'behavior', 'phase', 'gesture'}


# Missing value handling

Inspect missing values and apply imputation:
- Drop columns with >50% missing values.
- Fill numeric columns using the training median.
- Fill categorical/string columns using the training mode.

In [None]:
print('='*80)
print('HANDLING MISSING VALUES')
print('='*80)

# Check missing values (only common columns)
common_feature_cols = [col for col in X_train.columns if col in X_test.columns]

print(f'\nMissing values in X_train:')
missing_counts = X_train[common_feature_cols].isnull().sum()
missing_pct = (missing_counts / len(X_train)) * 100
missing_df = pd.DataFrame({
    'Missing_Count': missing_counts,
    'Missing_Percentage': missing_pct
})
missing_df = missing_df[missing_df['Missing_Count'] > 0].sort_values('Missing_Count', ascending=False)
print(missing_df.head(20))

# Strategy: drop columns with >50% missing values
cols_to_drop = missing_df[missing_df['Missing_Percentage'] > 50].index.tolist()
print(f'\nColumns to drop (>50% missing): {len(cols_to_drop)}')

# Drop these columns
X_train = X_train.drop(columns=cols_to_drop)
X_test = X_test.drop(columns=[col for col in cols_to_drop if col in X_test.columns])

# Fill remaining missing values: numeric -> median, string/object -> mode
numeric_cols = X_train.select_dtypes(include=[np.number]).columns.tolist()
string_cols = X_train.select_dtypes(include=['str', 'object']).columns.tolist()

print(f'\nNumeric columns: {len(numeric_cols)}')
print(f'String columns: {len(string_cols)}')
if string_cols:
    print(f'String column examples: {string_cols[:5]}')

# Compute medians on training numeric columns
train_medians = X_train[numeric_cols].median()

# Fill numeric columns with medians
X_train[numeric_cols] = X_train[numeric_cols].fillna(train_medians)
for col in numeric_cols:
    if col in X_test.columns:
        X_test[col] = X_test[col].fillna(train_medians[col])

# Fill string/object columns with mode
for col in string_cols:
    mode_value = X_train[col].mode()[0]
    X_train[col] = X_train[col].fillna(mode_value)
    if col in X_test.columns:
        X_test[col] = X_test[col].fillna(mode_value)

print(f'\nAfter handling missing values:')
print(f'X_train shape: {X_train.shape}')
print(f'X_test shape: {X_test.shape}')
print(f'Remaining missing values in X_train: {X_train.isnull().sum().sum()}')
print(f'Remaining missing values in X_test: {X_test.isnull().sum().sum()}')

HANDLING MISSING VALUES

Missing values in X_train:
           Missing_Count  Missing_Percentage
thm_5              33286            5.789423
tof_5_v63          30142            5.242588
tof_5_v24          30142            5.242588
tof_5_v18          30142            5.242588
tof_5_v19          30142            5.242588
tof_5_v20          30142            5.242588
tof_5_v21          30142            5.242588
tof_5_v22          30142            5.242588
tof_5_v23          30142            5.242588
tof_5_v25          30142            5.242588
tof_5_v16          30142            5.242588
tof_5_v26          30142            5.242588
tof_5_v27          30142            5.242588
tof_5_v28          30142            5.242588
tof_5_v29          30142            5.242588
tof_5_v30          30142            5.242588
tof_5_v31          30142            5.242588
tof_5_v17          30142            5.242588
tof_5_v15          30142            5.242588
tof_5_v33          30142            5.242588

Co

# Encode target variable

Map categorical target labels to numeric values for modeling.
- Example mapping: `Target` -> 1, `Non-Target` -> 0.

In [None]:
print('='*80)
print('ENCODING TARGET VARIABLE')
print('='*80)

# View label distribution
print('\nOriginal label distribution:')
print(y_train.value_counts())

# Encode: Target=1, Non-Target=0
label_mapping = {'Target': 1, 'Non-Target': 0}
y_train_encoded = y_train.map(label_mapping)

print('\nEncoded label distribution:')
print(y_train_encoded.value_counts())
print(f'\nLabel mapping: {label_mapping}')

y_train = y_train_encoded

ENCODING TARGET VARIABLE

Original label distribution:
sequence_type
Target        344058
Non-Target    230887
Name: count, dtype: int64

Encoded label distribution:
sequence_type
1    344058
0    230887
Name: count, dtype: int64

Label mapping: {'Target': 1, 'Non-Target': 0}


# Train / Validation split

Split the processed training data into training and validation sets (80/20) while preserving class proportions using stratification.

In [None]:
print('='*80)
print('TRAIN-VALIDATION SPLIT')
print('='*80)

# 80-20 split
X_train_split, X_val, y_train_split, y_val = train_test_split(
    X_train, y_train,
    test_size=0.2,
    random_state=42,
    stratify=y_train  # preserve class proportions
)

print(f'\nTrain split: {X_train_split.shape}')
print(f'Validation split: {X_val.shape}')
print(f'Test: {X_test.shape}')

print(f'\nClass distribution in train split:')
print(y_train_split.value_counts())
print(f'\nClass distribution in validation split:')
print(y_val.value_counts())

TRAIN-VALIDATION SPLIT

Train split: (459956, 335)
Validation split: (114989, 335)
Test: (107, 332)

Class distribution in train split:
sequence_type
1    275246
0    184710
Name: count, dtype: int64

Class distribution in validation split:
sequence_type
1    68812
0    46177
Name: count, dtype: int64


# Remove non-numeric columns

Detect and remove `object`/string columns from each split to ensure only numeric data is scaled.

In [None]:
# Ensure removal of any string/object columns
print('Checking for string columns...')

# Remove string/object columns from X_train_split
string_cols_train = X_train_split.select_dtypes(include=['str','object']).columns.tolist()
if string_cols_train:
    print(f'Removing {len(string_cols_train)} string columns from X_train_split: {string_cols_train}')
    X_train_split = X_train_split.drop(columns=string_cols_train)

# Remove string/object columns from X_val
string_cols_val = X_val.select_dtypes(include=['str','object']).columns.tolist()
if string_cols_val:
    print(f'Removing {len(string_cols_val)} string columns from X_val: {string_cols_val}')
    X_val = X_val.drop(columns=string_cols_val)

# Remove string/object columns from X_test
string_cols_test = X_test.select_dtypes(include=['str','object']).columns.tolist()
if string_cols_test:
    print(f'Removing {len(string_cols_test)} string columns from X_test: {string_cols_test}')
    X_test = X_test.drop(columns=string_cols_test)

print(f'\nFinal shapes:')
print(f'X_train_split: {X_train_split.shape} - dtypes: {X_train_split.dtypes.value_counts().to_dict()}')
print(f'X_val: {X_val.shape} - dtypes: {X_val.dtypes.value_counts().to_dict()}')
print(f'X_test: {X_test.shape} - dtypes: {X_test.dtypes.value_counts().to_dict()}')

print('\n Ready for scaling!')

Checking for string columns...

Final shapes:
X_train_split: (459956, 332) - dtypes: {dtype('float64'): 332}
X_val: (114989, 332) - dtypes: {dtype('float64'): 332}
X_test: (107, 332) - dtypes: {dtype('float64'): 332}

✅ Ready for scaling!


# Standardization

Fit a `StandardScaler` on the training split and apply it to validation and test sets. Convert results back to DataFrames for convenience.

In [26]:
print('='*80)
print('STANDARDIZATION')
print('='*80)

# Fit scaler on train split
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train_split)
X_val_scaled = scaler.transform(X_val)
X_test_scaled = scaler.transform(X_test)

print('Scaler fitted on train split')
print(f'Mean of scaled train: {X_train_scaled.mean():.6f}')
print(f'Std of scaled train: {X_train_scaled.std():.6f}')

# Convert back to DataFrame
X_train_scaled = pd.DataFrame(X_train_scaled, columns=X_train_split.columns)
X_val_scaled = pd.DataFrame(X_val_scaled, columns=X_val.columns)
X_test_scaled = pd.DataFrame(X_test_scaled, columns=X_test.columns)

print('\nScaled shapes:')
print(f'X_train_scaled: {X_train_scaled.shape}')
print(f'X_val_scaled: {X_val_scaled.shape}')
print(f'X_test_scaled: {X_test_scaled.shape}')

STANDARDIZATION
Scaler fitted on train split
Mean of scaled train: 0.000000
Std of scaled train: 1.000000

Scaled shapes:
X_train_scaled: (459956, 332)
X_val_scaled: (114989, 332)
X_test_scaled: (107, 332)


# Save preprocessed data

Persist the preprocessed datasets and the fitted scaler to `../data/` using pickle for downstream modeling and inference.

In [None]:
import pickle

print('='*80)
print('SAVING PREPROCESSED DATA')
print('='*80)

os.makedirs('../data', exist_ok=True)

# Save as pickle (preserve column info)
with open('../data/X_train_preprocessed.pkl', 'wb') as f:
    pickle.dump(X_train_scaled, f)

with open('../data/X_val_preprocessed.pkl', 'wb') as f:
    pickle.dump(X_val_scaled, f)

with open('../data/X_test_preprocessed.pkl', 'wb') as f:
    pickle.dump(X_test_scaled, f)

with open('../data/y_train_preprocessed.pkl', 'wb') as f:
    pickle.dump(y_train_split, f)

with open('../data/y_val_preprocessed.pkl', 'wb') as f:
    pickle.dump(y_val, f)

# Also save the scaler (for later inference)
with open('../data/scaler.pkl', 'wb') as f:
    pickle.dump(scaler, f)

print('  Preprocessed data saved:')
print('  - X_train_preprocessed.pkl')
print('  - X_val_preprocessed.pkl')
print('  - X_test_preprocessed.pkl')
print('  - y_train_preprocessed.pkl')
print('  - y_val_preprocessed.pkl')
print('  - scaler.pkl')

SAVING PREPROCESSED DATA
✅ Preprocessed data saved:
  - X_train_preprocessed.pkl
  - X_val_preprocessed.pkl
  - X_test_preprocessed.pkl
  - y_train_preprocessed.pkl
  - y_val_preprocessed.pkl
  - scaler.pkl


# Preprocessing summary

Summarize preprocessing results (rows, dropped columns, final feature count, split sizes, and class counts).

In [None]:
print('='*80)
print('PREPROCESSING SUMMARY')
print('='*80)

summary = {
    'Original Train Rows': len(train_df),
    'Columns Dropped': len(cols_to_drop),
    'Final Features': X_train_scaled.shape[1],
    'Train Split': X_train_scaled.shape[0],
    'Validation Split': X_val_scaled.shape[0],
    'Test': X_test_scaled.shape[0],
    'Target=1': (y_train_split == 1).sum(),
    'Target=0': (y_train_split == 0).sum(),
}

for key, value in summary.items():
    print(f'{key}: {value}')

print('\n Preprocessing Complete!')

PREPROCESSING SUMMARY
Original Train Rows: 574945
Columns Dropped: 0
Final Features: 332
Train Split: 459956
Validation Split: 114989
Test: 107
Target=1: 275246
Target=0: 184710

✅ Preprocessing Complete!
