In [1]:
# Standard library imports
import os
import warnings
import gc
import json
from datetime import datetime
from pathlib import Path
import re
from collections import Counter
from typing import List, Dict, Tuple
import pickle
import joblib
import hashlib
import pyarrow.parquet as pq
import pyarrow as pa

# Core data science libraries
import numpy as np
import pandas as pd
from math import ceil
from scipy import stats

# Visualization libraries
import matplotlib.pyplot as plt
import seaborn as sns

# Progress bar
from tqdm import tqdm

# Scikit-learn imports
from sklearn.model_selection import train_test_split, StratifiedKFold, KFold
from sklearn.metrics import (
    mean_absolute_error, mean_squared_error, r2_score,
    accuracy_score, precision_score, recall_score, f1_score, 
    roc_auc_score, classification_report, confusion_matrix,
    mean_absolute_percentage_error, precision_recall_curve
)
from sklearn.preprocessing import LabelEncoder, StandardScaler, PowerTransformer, QuantileTransformer, MinMaxScaler
from sklearn.cluster import KMeans
from sklearn.utils.class_weight import compute_class_weight
from sklearn.inspection import permutation_importance
from sklearn.impute import KNNImputer

# Catboost
import catboost as cb
from catboost import CatBoostRegressor, CatBoostClassifier

# Optuna for hyperparameter optimization
import optuna
from optuna.samplers import TPESampler

# Added tensorflow libraries
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers, models, optimizers, callbacks
from tensorflow.keras import backend as K

# Configuration
# Install a global 'ignore' filter with no category or module restriction, so
# every warning raised after this point in the session is suppressed.
# NOTE(review): this also hides deprecation warnings from the libraries
# imported above; consider narrowing the filter once the notebook is stable.
warnings.filterwarnings('ignore')

In [2]:
# Read the two raw training tables (main table and demographics) from input/
train_df = pd.read_csv(filepath_or_buffer='input/train.csv', low_memory=False)
train_demo_df = pd.read_csv(filepath_or_buffer='input/train_demographics.csv', low_memory=False)

In [3]:
# Output folders: one for saved plots, one for processed data
plot_output_path = Path('plots')
processed_data_output_path = Path('processed_data')

# Create each folder if it is missing (an existing folder is fine) and report it
for output_dir in (plot_output_path, processed_data_output_path):
    output_dir.mkdir(exist_ok=True)
    print(f"Directory '{output_dir}' is ready.")

Directory 'plots' is ready.
Directory 'processed_data' is ready.


In [5]:
# feature testing
def feature_engineering(df):
    """Return a copy of ``df`` with derived gesture/behaviour features added.

    Columns added:
      * ``orientation_gesture`` (category): ``"<orientation>_<gesture>"``, or
        the literal ``'N/A'`` when the string form of either source value
        contains ``'N/A'``.
      * ``performs_gesture``, ``move_hand_to_target``, ``hand_at_target``,
        ``relaxes_moves_hand_to_target`` (bool): case-insensitive substring
        matches against ``behavior``; missing values count as ``False``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must provide ``orientation``, ``gesture`` and ``behavior`` columns.

    Returns
    -------
    pandas.DataFrame
        A new frame with the extra columns; the input frame is not modified.
    """
    df = df.copy()

    # checks N/A before combining
    def combine_or_na(a, b):
        if 'N/A' in str(a) or 'N/A' in str(b):
            return 'N/A'
        return f"{a}_{b}"

    # combine orientation and gesture
    # Walk the two columns in lockstep instead of DataFrame.apply(axis=1):
    # apply builds a full row Series for every record just to read two cells,
    # which is needlessly slow on large frames. The resulting values are the same.
    combined = [combine_or_na(a, b) for a, b in zip(df['orientation'], df['gesture'])]
    df['orientation_gesture'] = pd.Series(combined, index=df.index).astype('category')

    # behavioural boolean columns: new column name -> phrase searched in `behavior`
    # NOTE(review): matching is a case-insensitive *substring* test, so a
    # behaviour of 'Relaxes and moves hand to target location' also sets
    # `move_hand_to_target` (it contains 'moves hand to target location').
    # Kept as-is to preserve existing feature values; confirm this is intended.
    behaviour_flags = {
        'performs_gesture': 'Performs gesture',
        'move_hand_to_target': 'Moves hand to target location',
        'hand_at_target': 'Hand at target location',
        'relaxes_moves_hand_to_target': 'Relaxes and moves hand to target location',
    }
    for flag_name, phrase in behaviour_flags.items():
        df[flag_name] = df['behavior'].str.contains(phrase, case=False, na=False)

    return df

# Build the feature-engineered training frame; train_df itself is left
# untouched because feature_engineering works on a copy of its input.
fe_train_df = feature_engineering(train_df)

In [None]:
# numerical distributions
def plot_numerical_distributions(df, output_path, exclude_zeros=False):
    """Save one histogram-with-KDE PNG per numeric column of ``df``.

    Each figure is annotated with the mean, median, standard deviation and
    length of the plotted series and written to
    ``<output_path>/distribution_<col>.png``; ``_no_zeros`` is appended to the
    file stem when ``exclude_zeros`` is true.

    Parameters
    ----------
    df : pandas.DataFrame
        Source data; only int64/float64/int32/float32 columns are plotted.
    output_path : str or pathlib.Path
        Directory that receives the PNG files. It is not created here.
    exclude_zeros : bool, default False
        When true, values equal to 0 are removed from each column before
        plotting and the number of removed zeros is added to the annotation.

    Returns
    -------
    None
    """
    numerical_cols = df.select_dtypes(include=['int64', 'float64', 'int32', 'float32']).columns

    # These depend only on the flag, so compute them once outside the loop
    title_suffix = " (excluding zeros)" if exclude_zeros else ""
    filename_suffix = "_no_zeros" if exclude_zeros else ""

    # Create progress bar for numerical distributions
    for col in tqdm(numerical_cols, desc="Creating distribution plots"):
        column = df[col]

        # Filter the single column (not the whole frame) when zeros are excluded
        plot_data = column[column != 0] if exclude_zeros else column

        # Skip before any figure is created; only mention zero-exclusion in
        # the warning when zeros were actually excluded
        if len(plot_data) == 0:
            reason = " after excluding zeros" if exclude_zeros else ""
            print(f"Warning: No data remaining for {col}{reason}")
            continue

        fig, ax = plt.subplots(figsize=(10, 8))
        try:
            # Histogram with KDE overlay
            sns.histplot(data=plot_data, kde=True, ax=ax)
            ax.set_title(f'Distribution of {col}{title_suffix}')
            ax.set_xlabel(col)
            ax.set_ylabel('Count')

            # Statistical annotations based on the (possibly filtered) data
            stats_text = f'Mean: {plot_data.mean():.2f}\n'
            stats_text += f'Median: {plot_data.median():.2f}\n'
            stats_text += f'Std: {plot_data.std():.2f}\n'
            stats_text += f'Count: {len(plot_data)}'

            # Report how many zeros were dropped
            if exclude_zeros:
                zero_count = (column == 0).sum()
                stats_text += f'\nZeros excluded: {zero_count}'

            # Place the annotation box in the top-right corner of the axes
            ax.text(0.95, 0.95, stats_text,
                    transform=ax.transAxes,
                    verticalalignment='top',
                    horizontalalignment='right',
                    bbox=dict(boxstyle='round', facecolor='white', alpha=0.8))

            fig.tight_layout()
            fig.savefig(f'{output_path}/distribution_{col}{filename_suffix}.png')
        finally:
            # Always release the figure, even if plotting or saving failed,
            # so a long run over many columns cannot accumulate open figures
            plt.close(fig)

# Plot every numeric column of the raw training data with zeros kept
# (exclude_zeros=False); pass exclude_zeros=True to drop zero values instead.
plot_numerical_distributions(train_df, plot_output_path, exclude_zeros=False)

In [None]:
# categorical distributions
def plot_categorical_distributions(df, output_path, max_unique=100):
    """Save one horizontal count-plot PNG per categorical column of ``df``.

    Bars are ordered by ``value_counts()`` of the column. Each figure is
    annotated with the number of unique values, the count of the most frequent
    value and the length of the column, then written to
    ``<output_path>/distribution_<col>.png``.

    Parameters
    ----------
    df : pandas.DataFrame
        Source data; only ``object`` and ``category`` columns are plotted.
    output_path : str or pathlib.Path
        Directory that receives the PNG files. It is not created here.
    max_unique : int, default 100
        Columns with this many or more unique values are skipped with a
        printed message instead of being plotted.

    Returns
    -------
    None
    """
    categorical_cols = df.select_dtypes(include=['object', 'category']).columns

    # Create progress bar for categorical distributions
    for col in tqdm(categorical_cols, desc="Creating distribution plots"):
        plot_data = df[col]

        # Computed once and reused for both the skip check and the annotation
        n_unique = plot_data.nunique()

        # Skip high-cardinality columns
        if n_unique >= max_unique:
            print(f"Skipping {col}: too many unique values ({n_unique})")
            continue

        # Skip if no data remains (checked before any figure is created)
        if len(plot_data) == 0:
            print(f"Warning: No data remaining for {col}")
            continue

        # Frequency table, reused for the bar order and the top-frequency stat
        value_counts = plot_data.value_counts()
        top_freq = value_counts.iloc[0] if len(value_counts) > 0 else 0

        fig, ax = plt.subplots(figsize=(18, 10))
        try:
            # Horizontal count plot ordered by value_counts()
            sns.countplot(data=plot_data.to_frame(), y=col, order=value_counts.index, ax=ax)
            ax.set_title(f'Distribution of {col}')
            ax.set_xlabel('Count')
            ax.set_ylabel(col)

            # Statistical annotations
            stats_text = f'Unique: {n_unique}\n'
            stats_text += f'Top freq: {top_freq}\n'
            stats_text += f'Count: {len(plot_data)}'

            # Place the annotation box near the bottom-right corner of the axes
            ax.text(0.95, 0.10, stats_text,
                    transform=ax.transAxes,
                    verticalalignment='top',
                    horizontalalignment='right',
                    bbox=dict(boxstyle='round', facecolor='white', alpha=0.8))

            fig.tight_layout()
            fig.savefig(f'{output_path}/distribution_{col}.png')
        finally:
            # Always release the figure, even if plotting or saving failed
            plt.close(fig)

# Plot categorical distributions of the raw training data (train_df) and save
# the PNG files under plot_output_path
plot_categorical_distributions(train_df, plot_output_path)