In [None]:
import numpy as np
import pandas as pd
import os
import random
from sklearn.base import clone
from sklearn.metrics import cohen_kappa_score,  make_scorer
from sklearn.model_selection import StratifiedKFold, cross_val_score, train_test_split
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.ensemble import VotingRegressor, RandomForestClassifier, VotingClassifier
from sklearn.impute import SimpleImputer, KNNImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler
from sklearn.decomposition import PCA
from scipy.optimize import minimize, minimize_scalar
from scipy import stats
from concurrent.futures import ThreadPoolExecutor
from tqdm import tqdm
from colorama import Fore, Style
from IPython.display import clear_output
from lightgbm import LGBMRegressor, LGBMClassifier
from xgboost import XGBRegressor, XGBClassifier
from catboost import CatBoostRegressor, CatBoostClassifier
from imblearn.over_sampling import SMOTE
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
import optuna
import json
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
import time

# Seed the three random number generators used in this notebook
# (PyTorch, NumPy's legacy global RNG, and the stdlib `random` module).
torch.manual_seed(42)
np.random.seed(42)
random.seed(42)

In [None]:
# Show (as the cell output) whether PyTorch can see a CUDA device.
torch.cuda.is_available()

In [None]:
# Create output folders.
# exist_ok=True keeps every call idempotent (safe on re-run) and avoids the
# check-then-create race of `if not os.path.exists(...)`.
output_folder = 'output'
os.makedirs(output_folder, exist_ok=True)

# Create separate analysis output folders
analysis_output_folder = 'analysis_output'
os.makedirs(analysis_output_folder, exist_ok=True)

physical_analysis_output_folder = 'analysis_output/physical'
os.makedirs(physical_analysis_output_folder, exist_ok=True)

fitness_analysis_output_folder = 'analysis_output/fitness'
os.makedirs(fitness_analysis_output_folder, exist_ok=True)

bia_analysis_output_folder = 'analysis_output/bia'
os.makedirs(bia_analysis_output_folder, exist_ok=True)

child_info_analysis_output_folder = 'analysis_output/child_info'
os.makedirs(child_info_analysis_output_folder, exist_ok=True)

actigraphy_analysis_output_folder = 'analysis_output/actigraphy'
os.makedirs(actigraphy_analysis_output_folder, exist_ok=True)

# Show every column when a DataFrame is displayed
pd.options.display.max_columns = None

# Suppress warnings (applies to the whole kernel session from here on)
warnings.filterwarnings('ignore')

In [None]:
# Model classes
class ResidualBlock(nn.Module):
    """Fully connected residual block.

    The main path is Linear -> BatchNorm1d -> LeakyReLU(0.2) -> Dropout(0.2)
    -> Linear -> BatchNorm1d. Its output is added to a shortcut of the input
    and passed through a final LeakyReLU(0.2).

    Args:
        in_features: Width of the incoming feature vector.
        out_features: Width produced by the block.
    """

    def __init__(self, in_features, out_features):
        super().__init__()

        main_path = [
            nn.Linear(in_features, out_features),
            nn.BatchNorm1d(out_features),
            nn.LeakyReLU(0.2),
            nn.Dropout(0.2),
            nn.Linear(out_features, out_features),
            nn.BatchNorm1d(out_features),
        ]
        self.block = nn.Sequential(*main_path)

        # The shortcut only needs a projection when the width changes;
        # otherwise the input is added back untouched.
        if in_features == out_features:
            self.skip = nn.Identity()
        else:
            self.skip = nn.Sequential(
                nn.Linear(in_features, out_features),
                nn.BatchNorm1d(out_features),
            )

        self.activation = nn.LeakyReLU(0.2)

    def forward(self, x):
        """Return activation(main_path(x) + shortcut(x))."""
        transformed = self.block(x)
        shortcut = self.skip(x)
        return self.activation(transformed + shortcut)

class AdvancedAutoencoder(nn.Module):
    """Variational-style autoencoder made of dense layers and residual blocks.

    The encoder emits ``2 * latent_dim`` values per sample, which are split
    into a mean and a log-variance. A latent sample is drawn with the
    reparameterization trick and fed to the decoder. The decoder ends in a
    Sigmoid, so reconstructions always lie in (0, 1).

    Args:
        input_dim: Number of input features.
        latent_dim: Size of the latent vector (default 32).
        dropout_rate: Dropout probability used after the first encoder and
            decoder layers (default 0.2).
    """

    def __init__(self, input_dim, latent_dim=32, dropout_rate=0.2):
        super().__init__()

        # Encoder: input projection, two residual blocks, bottleneck, then a
        # head that produces mean and log-variance side by side.
        encoder_layers = [
            nn.Linear(input_dim, 128),
            nn.BatchNorm1d(128),
            nn.LeakyReLU(0.2),
            nn.Dropout(dropout_rate),
            ResidualBlock(128, 96),
            ResidualBlock(96, 64),
            nn.Linear(64, 32),
            nn.BatchNorm1d(32),
            nn.LeakyReLU(0.2),
            nn.Linear(32, latent_dim * 2),
        ]
        self.encoder = nn.Sequential(*encoder_layers)

        # Decoder: expands the latent vector back to the input width.
        decoder_layers = [
            nn.Linear(latent_dim, 32),
            nn.BatchNorm1d(32),
            nn.LeakyReLU(0.2),
            nn.Dropout(dropout_rate),
            ResidualBlock(32, 64),
            ResidualBlock(64, 96),
            nn.Linear(96, input_dim),
            nn.Sigmoid(),
        ]
        self.decoder = nn.Sequential(*decoder_layers)

    def reparameterize(self, mu, log_var):
        """Draw ``mu + eps * std`` with ``eps ~ N(0, 1)`` and ``std = exp(log_var / 2)``.

        Noise is drawn on every call, regardless of train/eval mode.
        """
        std = torch.exp(0.5 * log_var)
        noise = torch.randn_like(std)
        return mu + noise * std

    def forward(self, x):
        """Return ``(reconstructed, mu, log_var, z)`` for a batch ``x``."""
        encoded = self.encoder(x)
        # First half of the encoder output is the mean, second half the log-variance.
        mu, log_var = torch.chunk(encoded, 2, dim=1)
        z = self.reparameterize(mu, log_var)
        reconstructed = self.decoder(z)
        return reconstructed, mu, log_var, z

In [None]:
# Helper functions
def train_model(model, train_loader, val_loader, epochs=150, learning_rate=0.001, beta=0.05, device='cuda'):
    """Train a VAE-style model and return it with its best-validation weights.

    The model must return ``(reconstructed, mu, log_var, z)`` from its forward
    pass. The loss is ``MSE(reconstructed, x) + beta * KL`` where the KL term
    is summed over every element of the batch (not averaged).

    Args:
        model: ``nn.Module`` to train; moved to ``device``.
        train_loader: Iterable of batches; ``batch[0]`` is the input tensor.
        val_loader: Iterable of validation batches, same layout.
        epochs: Number of passes over ``train_loader``.
        learning_rate: Initial AdamW learning rate (cosine-annealed per epoch).
        beta: Weight of the KL term.
        device: Device string or ``torch.device`` used for training.

    Returns:
        ``(model, history)`` where ``history`` has per-epoch ``'train_loss'``
        and ``'val_loss'`` lists (mean of the per-batch losses).
    """
    model = model.to(device)
    optimizer = optim.AdamW(model.parameters(), lr=learning_rate, weight_decay=1e-4)
    scheduler = optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=epochs)

    reconstruction_criterion = nn.MSELoss()
    best_val_loss = float('inf')
    best_model = None

    history = {'train_loss': [], 'val_loss': []}

    def _batch_loss(batch):
        # Shared by the train and validation loops so both use the same objective.
        x = batch[0].to(device)
        reconstructed, mu, log_var, z = model(x)
        recon_loss = reconstruction_criterion(reconstructed, x)
        kl_loss = -0.5 * torch.sum(1 + log_var - mu.pow(2) - log_var.exp())
        return recon_loss + beta * kl_loss

    for epoch in tqdm(range(epochs), desc="Training"):
        # Training phase
        model.train()
        train_loss = 0
        for batch in train_loader:
            total_loss = _batch_loss(batch)

            optimizer.zero_grad()
            total_loss.backward()
            optimizer.step()

            train_loss += total_loss.item()

        # Validation phase
        model.eval()
        val_loss = 0
        with torch.no_grad():
            for batch in val_loader:
                val_loss += _batch_loss(batch).item()

        # Update learning rate
        scheduler.step()

        # Record losses
        avg_train_loss = train_loss / len(train_loader)
        avg_val_loss = val_loss / len(val_loader)
        history['train_loss'].append(avg_train_loss)
        history['val_loss'].append(avg_val_loss)

        # Save best model. state_dict() hands back references to the live
        # parameter tensors, and dict.copy() is shallow, so the tensors must be
        # cloned; otherwise the "best" snapshot keeps changing as training
        # continues and ends up equal to the final weights.
        if avg_val_loss < best_val_loss:
            best_val_loss = avg_val_loss
            best_model = {
                name: tensor.detach().clone()
                for name, tensor in model.state_dict().items()
            }

        if (epoch + 1) % 10 == 0:
            print(f'Epoch [{epoch+1}/{epochs}], Train Loss: {avg_train_loss:.4f}, Val Loss: {avg_val_loss:.4f}')

    # Load best model. No snapshot exists when epochs == 0 or every validation
    # loss was NaN; keep the current weights in that case.
    if best_model is not None:
        model.load_state_dict(best_model)
    return model, history

def encode_data(model, data, scaler, device='cuda'):
    """Scale ``data`` with ``scaler`` and run it through ``model``.

    Returns:
        ``(mu, z)`` as NumPy arrays: the second and fourth outputs of the
        model's forward pass.
    """
    model.eval()
    inputs = torch.FloatTensor(scaler.transform(data)).to(device)

    with torch.no_grad():
        _recon, latent_mean, _log_var, latent_sample = model(inputs)

    mean_array = latent_mean.cpu().numpy()
    sample_array = latent_sample.cpu().numpy()
    return mean_array, sample_array

def decode_data(model, latent_vectors, scaler, device='cuda'):
    """Decode latent vectors with ``model.decoder`` and undo the scaling.

    Returns:
        The result of ``scaler.inverse_transform`` applied to the decoder output.
    """
    model.eval()
    latent = torch.FloatTensor(latent_vectors).to(device)

    with torch.no_grad():
        decoded = model.decoder(latent)

    decoded_array = decoded.cpu().numpy()
    return scaler.inverse_transform(decoded_array)

In [None]:
def process_file(filename, dirname):
    """Summarise one time-series partition.

    Reads ``<dirname>/<filename>/part-0.parquet``, drops the ``step`` column
    and flattens ``DataFrame.describe()`` into a 1-D array.

    Returns:
        ``(flat_describe_values, id)`` where ``id`` is the text after the
        first ``=`` in ``filename``.
    """
    parquet_path = os.path.join(dirname, filename, 'part-0.parquet')
    series = pd.read_parquet(parquet_path).drop(columns='step')
    summary = series.describe().to_numpy().reshape(-1)
    series_id = filename.split('=')[1]
    return summary, series_id

def load_time_series(dirname) -> pd.DataFrame:
    """Build one row of summary statistics per entry in ``dirname``.

    Every directory entry is passed to ``process_file`` on a thread pool.
    The result has columns ``stat_0 .. stat_{k-1}`` plus an ``id`` column.
    """
    entries = os.listdir(dirname)

    def _summarise(entry):
        return process_file(entry, dirname)

    with ThreadPoolExecutor() as pool:
        outcomes = list(tqdm(pool.map(_summarise, entries), total=len(entries)))

    # Split the (values, id) pairs into two parallel tuples.
    feature_rows, series_ids = zip(*outcomes)

    column_names = [f"stat_{i}" for i in range(len(feature_rows[0]))]
    frame = pd.DataFrame(feature_rows, columns=column_names)
    frame['id'] = series_ids
    return frame

In [None]:
def train_timeseries_autoencoder(df_train, df_test, verbose=True):
    """Train autoencoder on time series statistics

    Fits a StandardScaler on ``df_train``, trains an ``AdvancedAutoencoder``
    with ``train_model``, then encodes both frames.

    Args:
        df_train: Numeric feature frame used to fit the scaler and train the model.
        df_test: Frame with the same columns; used as the validation loader
            and encoded afterwards.
        verbose: When True, print the device and show three diagnostic plots.

    Returns:
        ``(model, train_encoded, test_encoded, scaler)``. The encoded frames
        hold ``ae_feature_<i>`` columns (the sampled latent ``z``) plus a
        ``reconstruction_error`` column.

    NOTE(review): the validation loader is built from ``df_test``, so the
    best-epoch selection inside ``train_model`` is driven by the test set.
    NOTE(review): the decoder ends in a Sigmoid (outputs in (0, 1)) while the
    inputs are standardised, so negative targets cannot be reconstructed --
    confirm this is intended.
    """
    # Set device
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    if verbose:
        print(f"Using device: {device}")

    # Initialize model
    input_dim = df_train.shape[1]
    model = AdvancedAutoencoder(input_dim=input_dim, latent_dim=32, dropout_rate=0.2)
    
    # Prepare data (scaler statistics come from the training frame only)
    scaler = StandardScaler()
    train_scaled = scaler.fit_transform(df_train)
    test_scaled = scaler.transform(df_test)
    
    # Convert to tensors
    train_tensor = torch.FloatTensor(train_scaled)
    test_tensor = torch.FloatTensor(test_scaled)
    
    # Create data loaders
    train_loader = DataLoader(TensorDataset(train_tensor), batch_size=64, shuffle=True)
    val_loader = DataLoader(TensorDataset(test_tensor), batch_size=64)
    
    # Train model
    model, history = train_model(
        model=model,
        train_loader=train_loader,
        val_loader=val_loader,
        epochs=150,
        learning_rate=0.001,
        beta=0.05,
        device=device
    )
    
    # Generate encodings (encode_data re-applies the scaler to the raw frames)
    train_mu, train_z = encode_data(model, df_train, scaler, device)
    test_mu, test_z = encode_data(model, df_test, scaler, device)
    
    # Create DataFrames with encoded features.
    # The sampled z (mean plus noise) is used here, not the mean mu.
    train_encoded = pd.DataFrame(
        train_z,
        columns=[f'ae_feature_{i}' for i in range(train_z.shape[1])]
    )
    test_encoded = pd.DataFrame(
        test_z,
        columns=[f'ae_feature_{i}' for i in range(test_z.shape[1])]
    )
    
    # Calculate reconstruction error: per-row mean squared difference between
    # the original values and the decoded, inverse-scaled values
    train_reconstructed = decode_data(model, train_z, scaler, device)
    test_reconstructed = decode_data(model, test_z, scaler, device)
    
    train_mse = np.mean((df_train.values - train_reconstructed) ** 2, axis=1)
    test_mse = np.mean((df_test.values - test_reconstructed) ** 2, axis=1)
    
    train_encoded['reconstruction_error'] = train_mse
    test_encoded['reconstruction_error'] = test_mse
    
    if verbose:
        # Plot training history
        plt.figure(figsize=(12, 6))
        plt.plot(history['train_loss'], label='Training Loss')
        plt.plot(history['val_loss'], label='Validation Loss')
        plt.xlabel('Epoch')
        plt.ylabel('Loss')
        plt.title('Autoencoder Training History')
        plt.legend()
        plt.show()
        
        # Plot reconstruction error distribution
        plt.figure(figsize=(12, 6))
        plt.hist(train_mse, bins=50, alpha=0.5, label='Train')
        plt.hist(test_mse, bins=50, alpha=0.5, label='Test')
        plt.xlabel('Reconstruction Error')
        plt.ylabel('Count')
        plt.title('Reconstruction Error Distribution')
        plt.legend()
        plt.show()
        
        # Plot the absolute correlation of every latent dimension with latent
        # dimension 0 (first row of the latent correlation matrix). Despite the
        # plot title this is not a feature-importance measure.
        corr = np.corrcoef(train_z.T)[0]
        plt.figure(figsize=(12, 6))
        plt.bar(range(len(corr)), np.abs(corr))
        plt.xlabel('Latent Dimension')
        plt.ylabel('Absolute Correlation with First Component')
        plt.title('Latent Space Feature Importance')
        plt.show()
    
    return model, train_encoded, test_encoded, scaler

In [None]:
# Load the tabular train/test data and the sample submission
print("Loading data...")
train = pd.read_csv('input/train.csv')
test = pd.read_csv('input/test.csv')
sample = pd.read_csv('input/sample_submission.csv')

# Summarise every time-series partition into one row of describe() statistics
print("Loading time series data...")
train_ts = load_time_series("input/series_train.parquet")
test_ts = load_time_series("input/series_test.parquet")

# Autoencoder inputs: the stat_* columns only (the id column is dropped)
print("Preparing features...")
df_train = train_ts.drop('id', axis=1)
df_test = test_ts.drop('id', axis=1)

In [None]:
# Train autoencoder on the time-series statistics
print("Training autoencoder...")
model, train_encoded, test_encoded, scaler = train_timeseries_autoencoder(
    df_train, 
    df_test,
    verbose=True
)

# Add encoded features to original dataframes.
# The concat is positional: train_encoded/test_encoded carry a fresh
# RangeIndex, which lines up with train_ts/test_ts as built by load_time_series.
print("Adding encoded features...")
train_ts_with_features = pd.concat([train_ts, train_encoded], axis=1)
test_ts_with_features = pd.concat([test_ts, test_encoded], axis=1)

print("\nTrain shape after encoding:", train_ts_with_features.shape)
print("Test shape after encoding:", test_ts_with_features.shape)

print("Done!")

In [None]:
# Skew removal for specified columns.
# Candidate columns for Box-Cox transformation. process_skewed_columns only
# transforms the names that are present in the frame it is given, so entries
# missing from that frame are skipped.
skewed_columns = [
    'BIA-BIA_BMC', 'BIA-BIA_BMR', 'BIA-BIA_DEE', 'BIA-BIA_ECW', 'BIA-BIA_Fat',
    'BIA-BIA_FFM', 'BIA-BIA_FFMI', 'BIA-BIA_FMI', 'BIA-BIA_ICW', 'BIA-BIA_LDM', 'BIA-BIA_LST', 
    'BIA-BIA_TBW', 'CGAS-CGAS_Score', 'stat_23', 'stat_35', 'stat_38', 'stat_40', 'stat_47',
    'stat_54', 'stat_66', 'stat_78', 'stat_80', 'stat_88', 'stat_90'
]

def box_cox_transform(df, column, lambda_param=None, verbose=True):
    """Box-Cox transform one column of a copy of ``df``.

    Rows where ``column`` is NaN are dropped first. If the smallest remaining
    value is <= 0, the column is shifted so that its minimum becomes 1.

    Args:
        df: Source frame (not modified).
        column: Name of the column to transform.
        lambda_param: Lambda to apply; when None it is fitted by SciPy.
        verbose: Print the lambda used and the row counts.

    Returns:
        ``(frame, lambda)``: the reduced copy with a new ``<column>_boxcox``
        column and the lambda used. On any error the message is printed and
        ``(frame, None)`` is returned without the new column.
    """
    working = df.copy().dropna(subset=[column])

    # Box-Cox requires strictly positive values
    lowest = working[column].min()
    if lowest <= 0:
        working[column] = working[column] - lowest + 1

    try:
        if lambda_param is not None:
            transformed_values = stats.boxcox(working[column], lmbda=lambda_param)
            if verbose:
                print(f"Applied lambda {lambda_param:.4f} to {column}")
        else:
            transformed_values, lambda_param = stats.boxcox(working[column])
            if verbose:
                print(f"Optimal lambda for {column}: {lambda_param:.4f}")

        working[f'{column}_boxcox'] = transformed_values

        if verbose:
            print(f"Rows before/after: {len(df)}/{len(working)}")
    except Exception as e:
        print(f"Error transforming {column}: {str(e)}")
        return working, None

    return working, lambda_param

def replace_inf_with_max(df):
    """Return a copy of ``df`` with +/-inf replaced by each column's finite maximum.

    Only ``float64`` columns are touched. Both ``inf`` and ``-inf`` become the
    largest non-infinite value of that column. Columns made up entirely of
    infinities are left as they are.
    """
    cleaned = df.copy()
    for name in cleaned.columns:
        if cleaned[name].dtype != 'float64':
            continue
        not_infinite = ~np.isinf(cleaned[name])
        if not not_infinite.any():
            continue
        ceiling = cleaned.loc[not_infinite, name].max()
        cleaned[name] = cleaned[name].replace([np.inf, -np.inf], ceiling)
    return cleaned

def process_skewed_columns(train_df, test_df, skewed_columns, verbose=True):
    """Box-Cox transform skewed columns, fitting on train and applying to test.

    For every name in ``skewed_columns`` that exists in ``train_df`` a lambda
    is fitted on the training data and a ``<column>_boxcox`` column is added
    to the training copy. The same lambda, and the same positivity shift that
    the training column received, are then applied to the test column so
    both frames go through one consistent transform.

    Args:
        train_df: Training frame (not modified).
        test_df: Test frame (not modified).
        skewed_columns: Column names to try.
        verbose: Forwarded to ``box_cox_transform``; also prints a summary.

    Returns:
        ``(train_processed, test_processed, lambda_params, transformed_columns)``.
        ``lambda_params`` maps each attempted column to its fitted lambda, or
        None when the fit failed. ``transformed_columns`` lists the
        ``*_boxcox`` names present in the training output.
    """
    lambda_params = {}
    train_processed = train_df.copy()
    test_processed = test_df.copy()

    # Process training data and store lambda values
    print("Processing training data...")
    for column in tqdm(skewed_columns):
        if column not in train_df.columns:
            continue
        transformed_train, lambda_params[column] = box_cox_transform(
            train_df,
            column,
            verbose=verbose
        )
        if lambda_params[column] is not None:
            train_processed[f'{column}_boxcox'] = transformed_train[f'{column}_boxcox']

    # Process test data using stored lambda values
    print("\nProcessing test data...")
    for column in tqdm(skewed_columns):
        fitted_lambda = lambda_params.get(column)
        # Skip columns that failed on train: passing lambda_param=None would
        # silently fit a fresh lambda on the test data.
        if column not in test_df.columns or fitted_lambda is None:
            continue

        # box_cox_transform shifts a column by (1 - min) when min <= 0. Apply
        # the shift derived from the training column so test values are mapped
        # exactly like train values. If test values are still <= 0 afterwards,
        # box_cox_transform applies its own additional shift as a fallback.
        train_min = train_df[column].min()
        shift = 1 - train_min if train_min <= 0 else 0
        test_input = test_df[[column]] + shift

        transformed_test, applied_lambda = box_cox_transform(
            test_input,
            column,
            lambda_param=fitted_lambda,
            verbose=verbose
        )
        boxcox_name = f'{column}_boxcox'
        # A failed transform returns no *_boxcox column; do not index into it.
        if applied_lambda is not None and boxcox_name in transformed_test.columns:
            test_processed[boxcox_name] = transformed_test[boxcox_name]

    # Handle infinite values
    train_processed = replace_inf_with_max(train_processed)
    test_processed = replace_inf_with_max(test_processed)

    # Get list of successfully transformed columns
    transformed_columns = [f'{col}_boxcox' for col in skewed_columns
                           if f'{col}_boxcox' in train_processed.columns]

    if verbose:
        print(f"\nSuccessfully transformed {len(transformed_columns)} columns")

    return train_processed, test_processed, lambda_params, transformed_columns

# Function to visualize transformations
def plot_transformations(df_before, df_after, columns, n_cols=3, max_rows=5):
    """Plot histograms before and after transformation

    Each column gets a pair of side-by-side histograms: the original values
    from ``df_before`` and ``<column>_boxcox`` from ``df_after``.

    Args:
        df_before: Frame holding the original columns.
        df_after: Frame holding the ``*_boxcox`` columns.
        columns: Original column names to plot.
        n_cols: Maximum number of column pairs per row.
        max_rows: Maximum number of rows; extra columns are not plotted.

    Returns:
        None. Does nothing except print a notice when ``columns`` is empty.
    """
    columns = list(columns)
    if not columns:
        # Without this guard the row computation below divides by zero.
        print("No transformed columns to plot.")
        return

    n_cols = max(1, min(n_cols, len(columns)))
    n_rows = min(max_rows, (len(columns) + n_cols - 1) // n_cols)

    fig, axes = plt.subplots(n_rows, n_cols * 2, figsize=(n_cols * 8, n_rows * 4))
    axes = axes.ravel()

    shown = columns[:n_rows * n_cols]
    for i, column in enumerate(shown):
        # Original distribution
        sns.histplot(df_before[column], ax=axes[i*2], kde=True)
        axes[i*2].set_title(f'Original: {column}')
        axes[i*2].tick_params(axis='x', rotation=45)

        # Transformed distribution
        sns.histplot(df_after[f'{column}_boxcox'], ax=axes[i*2+1], kde=True)
        axes[i*2+1].set_title(f'Transformed: {column}')
        axes[i*2+1].tick_params(axis='x', rotation=45)

    # Hide grid cells that were allocated but not used by any column
    for unused_ax in axes[2 * len(shown):]:
        unused_ax.set_visible(False)

    plt.tight_layout()
    plt.show()

# Apply transformations to the time-series statistic frames.
# Only skewed_columns entries that exist in df_train are transformed.
print("Applying Box-Cox transformations...")
train_processed, test_processed, lambda_params, transformed_cols = process_skewed_columns(
    df_train, 
    df_test, 
    skewed_columns,
    verbose=True
)

# Visualize some transformations
print("\nPlotting transformations...")
plot_transformations(df_train, train_processed, 
                    [col for col in skewed_columns if f'{col}_boxcox' in transformed_cols])

# Rebind df_train/df_test to the frames that now also carry the *_boxcox columns.
# Note: this rebinding makes the cell non-idempotent if it is run twice.
df_train = train_processed
df_test = test_processed

print("\nData preprocessing complete!")

In [None]:
# Ensure IDs are properly formatted (string ids on both sides of the merge)
train_ts_with_features['id'] = train_ts_with_features['id'].astype(str)
test_ts_with_features['id'] = test_ts_with_features['id'].astype(str)
train['id'] = train['id'].astype(str)
test['id'] = test['id'].astype(str)

# Get lists of our different feature types.
# NOTE(review): encoded_cols matches 'ae_feature_' only, so the
# 'reconstruction_error' column is not carried into the final frames.
encoded_cols = [col for col in train_ts_with_features.columns if 'ae_feature_' in col]
boxcox_cols = [col for col in train_processed.columns if '_boxcox' in col]
stats_cols = [col for col in train_processed.columns if col not in boxcox_cols + ['id']]  # Keep original stats

# Create DataFrame with all features: original stats, box-cox transformations, and encoded features.
# The concat is index-aligned; all pieces derive from the same time-series frames.
train_all_features = pd.concat([
    train_ts_with_features[['id'] + encoded_cols],  # Encoded features
    train_processed[boxcox_cols],                   # Box-cox transformations
    train_processed[stats_cols]                     # Original stats
], axis=1)

test_all_features = pd.concat([
    test_ts_with_features[['id'] + encoded_cols],   # Encoded features
    test_processed[boxcox_cols],                    # Box-cox transformations
    test_processed[stats_cols]                      # Original stats
], axis=1)

# Merge everything with original dataframes.
# Left merge keeps every row of train/test; validate='1:1' raises if ids repeat on either side.
print("Merging all features with original data...")
train_final = train.merge(
    train_all_features,
    on='id',
    how='left',
    validate='1:1'
)

test_final = test.merge(
    test_all_features,
    on='id',
    how='left',
    validate='1:1'
)

# Verify the merges and feature presence
print("\nShape Check:")
print(f"Original train shape: {train.shape}")
print(f"Train with all features shape: {train_final.shape}")
print(f"Original test shape: {test.shape}")
print(f"Test with all features shape: {test_final.shape}")

# Verify feature counts
print("\nFeature Counts:")
print(f"Number of original stats: {len(stats_cols)}")
print(f"Number of encoded features: {len(encoded_cols)}")
print(f"Number of box-cox transformed features: {len(boxcox_cols)}")
print(f"Total features: {len(stats_cols) + len(encoded_cols) + len(boxcox_cols)}")

# Check that all types of features are present
print("\nSample of feature types in final training data:")
print("\nOriginal stats (first 3):")
print(train_final[stats_cols[:3]].head())
print("\nEncoded features (first 3):")
print(train_final[encoded_cols[:3]].head())
print("\nBox-Cox transformed features (first 3):")
print(train_final[boxcox_cols[:3]].head())

# Check for any missing values after merge (only columns with at least one NaN are listed)
print("\nMissing Values Check:")
print("Train missing values:")
print(train_final.isnull().sum()[train_final.isnull().sum() > 0])
print("\nTest missing values:")
print(test_final.isnull().sum()[test_final.isnull().sum() > 0])

print("\nMerge complete!")

# Print summary of all features
print("\nSummary of features:")
feature_types = {
    'Original Stats': stats_cols,
    'Encoded Features': encoded_cols,
    'Box-Cox Transformed': boxcox_cols
}
for feature_type, features in feature_types.items():
    print(f"\n{feature_type} ({len(features)}):")
    for feat in features[:5]:  # Show first 5 of each type
        print(f"  - {feat}")
    if len(features) > 5:
        print(f"  ... and {len(features)-5} more")

# Optional: Basic feature analysis
print("\nFeature Analysis:")
print("\nMemory usage by feature type:")
for feature_type, features in feature_types.items():
    memory = train_final[features].memory_usage(deep=True).sum() / 1024**2  # Convert to MB
    print(f"{feature_type}: {memory:.2f} MB")
print("\nCorrelation between original and transformed features:")
# Derive each original column name from its *_boxcox column. Zipping
# skewed_columns with boxcox_cols pairs unrelated columns, because not every
# entry of skewed_columns ends up transformed.
for transformed in boxcox_cols[:3]:
    original = transformed[:-len('_boxcox')]
    if original in train_final.columns and transformed in train_final.columns:
        corr = train_final[original].corr(train_final[transformed])
        print(f"{original} vs {transformed}: {corr:.3f}")

In [None]:
# Supplement missing data with data from WHO
# Load who data and group
def load_who_bmi_data(file_path):
    """Load the BMI-for-age CSV and average it per (age, sex).

    Returns:
        DataFrame indexed by ``(sex, age)`` with the mean of the ``L``,
        ``mean_bmi`` and ``S`` columns for each group.
    """
    aggregations = dict.fromkeys(('L', 'mean_bmi', 'S'), 'mean')
    raw = pd.read_csv(file_path)
    per_group = raw.groupby(['age', 'sex']).agg(aggregations).reset_index()
    return per_group.set_index(['sex', 'age'])

def load_who_height_data(file_path):
    """Load the height-for-age CSV and average ``mean_height`` per (age, sex).

    Returns:
        DataFrame indexed by ``(sex, age)`` with a single ``mean_height`` column.
    """
    raw = pd.read_csv(file_path)
    per_group = raw.groupby(['age', 'sex']).agg({'mean_height': 'mean'})
    per_group = per_group.reset_index()
    return per_group.set_index(['sex', 'age'])

# Reference tables indexed by (sex, age); read by get_who_stats as module-level globals.
who_bmi_data = load_who_bmi_data('supplemental_data/bmi_for_age_5_to_19.csv')
who_height_data = load_who_height_data('supplemental_data/height_for_age_5_to_19.csv')

In [None]:
# Defining functions to Impute with data from WHO
def get_who_stats(age, sex, data_type='bmi'):
    """Look up WHO reference values for an (age, sex) pair.

    Args:
        age: Age key used in the reference table index.
        sex: Sex key used in the reference table index.
        data_type: ``'bmi'`` or ``'height'``.

    Returns:
        For ``'bmi'``: ``(mean_bmi, S)``, or ``(None, None)`` when the pair is
        not in the table. For ``'height'``: the ``mean_height`` value, or
        ``None`` when the pair is not in the table. Any other ``data_type``
        returns ``None``.
    """
    try:
        if data_type == 'bmi':
            row = who_bmi_data.loc[(sex, age), ['mean_bmi', 'S']]
            return row['mean_bmi'], row['S']
        elif data_type == 'height':
            return who_height_data.loc[(sex, age), 'mean_height']
    except KeyError:
        # Parentheses matter: without them the expression parses as
        # `None, (None if ... else None)` and always yields a 2-tuple, which
        # callers of the height lookup then mistake for a valid value.
        return (None, None) if data_type == 'bmi' else None
    
def impute_bmi(age, sex):
    """Draw a BMI from a normal distribution built from the WHO lookup.

    Returns None when the lookup has no entry for ``(age, sex)``; otherwise a
    value rounded to 8 decimals. Consumes one draw from NumPy's global RNG.

    NOTE(review): the table's ``S`` value is used directly as the standard
    deviation -- confirm that is what ``S`` represents in the source data.
    """
    mean_bmi, sd = get_who_stats(age, sex, 'bmi')
    if mean_bmi is None or sd is None:
        return None
    return round(np.random.normal(mean_bmi, sd), 8)

def impute_height(age, sex):
    """Return the WHO mean height for ``(age, sex)`` in inches, or None.

    The looked-up value is treated as centimetres and divided by 2.54, then
    rounded to 2 decimals.
    """
    mean_height_cm = get_who_stats(age, sex, 'height')
    if mean_height_cm is None:
        return None
    inches = mean_height_cm / 2.54  # Convert cm to inches
    return round(inches, 2)
    
def impute_weight(bmi, height_inches):
    """Derive weight in pounds from BMI and height in inches.

    Uses ``weight_kg = bmi * height_m ** 2`` and converts to pounds, rounded
    to 2 decimals. Returns None if either argument is None.
    """
    if bmi is None or height_inches is None:
        return None
    height_meters = height_inches * 0.0254  # Convert inches to meters
    weight_kg = bmi * (height_meters ** 2)
    return round(weight_kg * 2.20462, 2)  # Convert kg to lbs

def impute_waist_circumference(weight, height, age, sex):
    """Estimate waist circumference from weight, height, age and sex.

    Uses one linear formula for sex code 0 and another for every other code.
    Returns the estimate rounded to 2 decimals, or None when ``weight`` or
    ``height`` is None.

    The sex code is accepted both as the number 0 and as the string '0'.
    A plain ``sex == '0'`` comparison never matches a numeric code, which
    made the first formula unreachable for numeric input.
    """
    if weight is not None and height is not None:
        if sex in (0, '0'):
            waist = (weight * 0.5) + (height * 0.2) - (age * 0.1)
        else:
            waist = (weight * 0.4) + (height * 0.3) - (age * 0.1)
        return round(waist, 2)
    else:
        return None

def impute_body_fat(bmi, age, sex):
    """Estimate body-fat percentage from BMI, age and sex.

    Uses ``1.20 * bmi + 0.23 * age`` minus 16.2 for sex code 0 and minus 5.4
    for every other code. The result is floored at 0 and rounded to
    2 decimals. Returns None when ``bmi`` is None.

    The sex code is accepted both as the number 0 and as the string '0'.
    A plain ``sex == '0'`` comparison never matches a numeric code, which
    made the first formula unreachable for numeric input.
    """
    if bmi is not None:
        if sex in (0, '0'):
            body_fat = (1.20 * bmi) + (0.23 * age) - 16.2
        else:
            body_fat = (1.20 * bmi) + (0.23 * age) - 5.4
        return round(max(body_fat, 0), 2)  # Ensure body fat is not negative
    else:
        return None
    
def impute_sds_total_raw():
    """Draw an SDS raw total from N(40, 12), limited to [20, 90] and rounded.

    Consumes one draw from NumPy's global RNG.
    """
    mu, sigma = 40, 12  # Estimated from the graph
    draw = np.random.normal(mu, sigma)
    capped = min(draw, 90)
    bounded = max(capped, 20)
    return round(bounded)

def impute_sds_total_t(sds_total_raw):
    """Estimate the SDS T score, limited to [40, 100] and rounded.

    With a raw score, applies the linear map ``50 + (raw - 40) * 10 / 12``.
    With ``None``, draws from N(60, 15) using NumPy's global RNG.
    """
    if sds_total_raw is None:
        # No raw score: fall back to a random draw
        mu, sigma = 60, 15  # Estimated from the graph
        estimate = np.random.normal(mu, sigma)
    else:
        # Rough linear transformation between raw and T scores
        estimate = 50 + (sds_total_raw - 40) * (10 / 12)

    upper_limited = min(estimate, 100)
    return round(max(upper_limited, 40))

def impute_sii_function(bmi, waist_circumference, body_fat, sds_total_raw, sds_total_t):
    """Heuristically impute a binary ``sii`` value from a point score.

    Scoring:
        * BMI: +3 if >= 30, +2 if >= 25, +1 if < 18.5.
        * Waist circumference: +1 if > 60.
        * SDS T score: +1 if > 80, and a further +1 if > 90.

    ``body_fat`` and ``sds_total_raw`` are accepted for interface
    compatibility but do not contribute to the score.

    Returns:
        0 when the score is <= 2, otherwise 1.
    """
    # Create a simple scoring system
    score = 0

    # BMI-based score. The highest threshold is tested first; with `>= 25`
    # ahead of `>= 30` the +3 branch could never be reached.
    if bmi is not None:
        if bmi >= 30:
            score += 3
        elif bmi >= 25:
            score += 2
        elif bmi < 18.5:
            score += 1

    # Waist circumference-based score (using general guidelines)
    if waist_circumference is not None:
        if waist_circumference > 60:  # This is a general threshold, might need adjustment
            score += 1

    if sds_total_t is not None:
        if sds_total_t > 80:
            score += 1
        if sds_total_t > 90:
            score += 1

    # Map the score to sii values without forcing a 3
    return 0 if score <= 2 else 1

def apply_imputation(df, impute_sii=True):
    """Fill missing physical / SDS fields row by row and return a new frame.

    Args:
        df: Frame containing the 'Basic_Demos-*', 'Physical-*', 'BIA-BIA_Fat'
            and 'SDS-SDS_Total_*' columns (plus 'sii' when ``impute_sii`` is True).
        impute_sii: When True, a missing 'sii' is filled via ``impute_sii_function``.

    Returns:
        The result of ``df.apply(..., axis=1)`` with the imputed values.

    The steps are order-dependent: later fields use values imputed earlier in
    the same row, and several helpers draw from NumPy's global RNG.
    """
    def impute_if_missing(row):
        age = row['Basic_Demos-Age']
        sex = row['Basic_Demos-Sex']
        
        # BMI and height are only imputed for ages 5 to 19 inclusive
        if pd.isna(row['Physical-BMI']) and 5 <= age <= 19:
            row['Physical-BMI'] = impute_bmi(age, sex)
        
        if pd.isna(row['Physical-Height']) and 5 <= age <= 19:
            row['Physical-Height'] = impute_height(age, sex)
        
        # NOTE: `is not None` is also True for NaN, so a NaN BMI/height still
        # reaches impute_weight and yields a NaN weight.
        if pd.isna(row['Physical-Weight']) and row['Physical-BMI'] is not None and row['Physical-Height'] is not None:
            row['Physical-Weight'] = impute_weight(row['Physical-BMI'], row['Physical-Height'])
        
        if pd.isna(row['Physical-Waist_Circumference']):
            row['Physical-Waist_Circumference'] = impute_waist_circumference(
                row['Physical-Weight'], row['Physical-Height'], age, sex
            )
        
        if pd.isna(row['BIA-BIA_Fat']):
            row['BIA-BIA_Fat'] = impute_body_fat(row['Physical-BMI'], age, sex)

        if pd.isna(row['SDS-SDS_Total_Raw']):
            row['SDS-SDS_Total_Raw'] = impute_sds_total_raw()
        
        # Uses the raw score, which may itself have just been imputed above
        if pd.isna(row['SDS-SDS_Total_T']):
            row['SDS-SDS_Total_T'] = impute_sds_total_t(row['SDS-SDS_Total_Raw'])
        
        # Impute sii if it's missing and impute_sii is True
        if impute_sii and pd.isna(row['sii']):
            row['sii'] = impute_sii_function(
                row['Physical-BMI'],
                row['Physical-Waist_Circumference'],
                row['BIA-BIA_Fat'],
                row['SDS-SDS_Total_Raw'],
                row['SDS-SDS_Total_T']
            )
        
        return row
    
    return df.apply(impute_if_missing, axis=1)

In [None]:
# Impute all values including 'sii' for train data
train_final = apply_imputation(train_final, impute_sii=True)

# Impute all values except 'sii' for test data
test_final = apply_imputation(test_final, impute_sii=False)

# Check the results: remaining NaN counts per imputed column (train only)
print("Number of missing values after imputation:")
print("BMI:", train_final['Physical-BMI'].isna().sum())
print("Height:", train_final['Physical-Height'].isna().sum())
print("Weight:", train_final['Physical-Weight'].isna().sum())
print("Waist Circumference:", train_final['Physical-Waist_Circumference'].isna().sum())
print("Body Fat Percentage:", train_final['BIA-BIA_Fat'].isna().sum())
print("SDS Total Raw:", train_final['SDS-SDS_Total_Raw'].isna().sum())
print("SDS Total T:", train_final['SDS-SDS_Total_T'].isna().sum())

# Select rows that have at least one of the columns filled AND whose
# null/non-null status differs from the previous row in at least one column,
# then show 5 random rows. .sample(5) raises if fewer than 5 rows match.
# NOTE(review): the filter runs on post-imputation data, so it does not
# identify which values were actually imputed.
print("\nSample of imputed values:")
imputed_sample = train_final[
    (train_final['Physical-BMI'].notnull() | train_final['Physical-Height'].notnull() | 
     train_final['Physical-Weight'].notnull() | train_final['Physical-Waist_Circumference'].notnull() | 
     train_final['BIA-BIA_Fat'].notnull() | train_final['SDS-SDS_Total_Raw'].notnull() | 
     train_final['SDS-SDS_Total_T'].notnull()) &
    ((train_final['Physical-BMI'].notnull() != train_final['Physical-BMI'].notnull().shift()) |
     (train_final['Physical-Height'].notnull() != train_final['Physical-Height'].notnull().shift()) |
     (train_final['Physical-Weight'].notnull() != train_final['Physical-Weight'].notnull().shift()) |
     (train_final['Physical-Waist_Circumference'].notnull() != train_final['Physical-Waist_Circumference'].notnull().shift()) |
     (train_final['BIA-BIA_Fat'].notnull() != train_final['BIA-BIA_Fat'].notnull().shift()) |
     (train_final['SDS-SDS_Total_Raw'].notnull() != train_final['SDS-SDS_Total_Raw'].notnull().shift()) |
     (train_final['SDS-SDS_Total_T'].notnull() != train_final['SDS-SDS_Total_T'].notnull().shift()))
].sample(5)[['Basic_Demos-Age', 'Basic_Demos-Sex', 'Physical-BMI', 'Physical-Height', 'Physical-Weight',
              'Physical-Waist_Circumference', 'BIA-BIA_Fat', 'SDS-SDS_Total_Raw', 'SDS-SDS_Total_T']]
print(imputed_sample)

In [None]:
# Additional analysis of imputation results:
# per-age count/mean/std/min/max of BMI, height and weight after imputation,
# plus the number of rows with a sex value for each age.
print("\nImputation summary by age:")
age_summary = train_final.groupby('Basic_Demos-Age').agg({
    'Physical-BMI': ['count', 'mean', 'std', 'min', 'max'],
    'Physical-Height': ['count', 'mean', 'std', 'min', 'max'],
    'Physical-Weight': ['count', 'mean', 'std', 'min', 'max'],
    'Basic_Demos-Sex': 'count'
})
print(age_summary)

In [None]:
# Scatter plots of BMI, height and weight against age, coloured by sex.
# These show the data after imputation only; original and imputed points are
# not distinguished. Each figure is saved to disk and closed without display.
plt.figure(figsize=(12, 6))
sns.scatterplot(data=train_final, x='Basic_Demos-Age', y='Physical-BMI', hue='Basic_Demos-Sex', alpha=0.5)
plt.title('BMI vs Age (After Imputation)')
plt.savefig('analysis_output/bmi_vs_age_imputed.png')
plt.close()

plt.figure(figsize=(12, 6))
sns.scatterplot(data=train_final, x='Basic_Demos-Age', y='Physical-Height', hue='Basic_Demos-Sex', alpha=0.5)
plt.title('Height vs Age (After Imputation)')
plt.savefig('analysis_output/height_vs_age_imputed.png')
plt.close()

plt.figure(figsize=(12, 6))
sns.scatterplot(data=train_final, x='Basic_Demos-Age', y='Physical-Weight', hue='Basic_Demos-Sex', alpha=0.5)
plt.title('Weight vs Age (After Imputation)')
plt.savefig('analysis_output/weight_vs_age_imputed.png')
plt.close()

print("Imputation analysis plots saved in the 'analysis_output' folder.")

In [None]:
# Feature engineering
def engineer_features(df, is_train=True):
    """Add derived fitness, body-composition and interaction features to ``df``.

    The new columns are written onto ``df`` itself (the frame is modified in
    place) and the same object is returned, so both ``engineer_features(df)``
    and ``df = engineer_features(df)`` work. Calling the function again simply
    recomputes the derived columns from the current raw columns.

    Args:
        df: DataFrame containing the raw FGC, PAQ, BIA, Physical,
            Fitness_Endurance, SDS, CGAS, PreInt_EduHx and Basic_Demos columns
            referenced below.
        is_train: Unused. Kept so existing callers that pass it keep working.

    Returns:
        ``df`` with the engineered columns added. Ratio and log features that
        evaluate to +/-inf (division by zero, log of zero) are stored as NaN.
    """
    age = df['Basic_Demos-Age']
    sex = df['Basic_Demos-Sex']
    bmi = df['Physical-BMI']
    height = df['Physical-Height']
    weight = df['Physical-Weight']
    waist = df['Physical-Waist_Circumference']
    heart_rate = df['Physical-HeartRate']
    internet_hours = df['PreInt_EduHx-computerinternet_hoursday']
    sleep_raw = df['SDS-SDS_Total_Raw']
    cgas = df['CGAS-CGAS_Score']
    max_stage = df['Fitness_Endurance-Max_Stage']

    bia_bmi = df['BIA-BIA_BMI']
    bia_fat = df['BIA-BIA_Fat']
    bia_ffmi = df['BIA-BIA_FFMI']
    bia_fmi = df['BIA-BIA_FMI']
    bia_lst = df['BIA-BIA_LST']
    bia_tbw = df['BIA-BIA_TBW']
    bia_icw = df['BIA-BIA_ICW']
    bia_bmr = df['BIA-BIA_BMR']
    bia_dee = df['BIA-BIA_DEE']
    bia_smm = df['BIA-BIA_SMM']

    # Collected in insertion order so the columns are appended in a stable order.
    features = {}

    # Combine grip strength zones (dominant + non-dominant hand)
    features['FGC-FGC_GS'] = df['FGC-FGC_GSD_Zone'] + df['FGC-FGC_GSND_Zone']

    # Combine sit-and-reach zones (left + right)
    features['FGC-FGC_SR'] = df['FGC-FGC_SRL_Zone'] + df['FGC-FGC_SRR_Zone']

    # Fitness score: sum of all zone indicators
    features['fitness_score'] = (
        features['FGC-FGC_GS'] + features['FGC-FGC_SR'] + df['FGC-FGC_CU_Zone']
        + df['FGC-FGC_PU_Zone'] + df['FGC-FGC_TL_Zone']
    )

    # One PAQ total: the PAQ_A value where present, otherwise the PAQ_C value
    features['PAQ_Total'] = df['PAQ_A-PAQ_A_Total'].combine_first(df['PAQ_C-PAQ_C_Total'])

    # Age / BMI / internet-use interactions
    features['BMI_Age'] = bmi * age
    features['Internet_Hours_Age'] = internet_hours * age
    features['Age_Internet_Hours'] = internet_hours / age
    features['BMI_Internet_Hours'] = bmi * internet_hours
    features['BIA_BMI_Internet_Hours_Age'] = (bia_bmi * internet_hours) / age

    # Body-composition ratios
    # (disabled in the original notebook: BFP_BMI, BFP_BMI_Age, BFP_BMR)
    features['FFMI_BFP'] = bia_ffmi / bia_fat
    features['FMI_BFP'] = bia_fmi / bia_fat
    features['LST_TBW'] = bia_lst / bia_tbw
    features['BFP_DEE'] = bia_fat * bia_dee

    features['BMR_Weight'] = bia_bmr / weight
    features['DEE_Weight'] = bia_dee / weight
    features['SMM_Height'] = bia_smm / height
    features['BMR_BMI'] = bia_bmr / bmi
    features['DEE_BMI'] = bia_dee / bmi
    features['SMM_BMI'] = bia_smm / bmi

    features['Muscle_to_Fat'] = bia_smm / bia_fmi
    features['Hydration_Status'] = weight / bia_tbw
    features['ICW_TBW'] = bia_icw / bia_tbw

    features['Age_Weight'] = age * weight
    features['Age_Weight_BMI'] = (age * weight) / bmi
    features['Sex_BMI'] = sex * bmi
    features['Sex_HeartRate'] = sex * heart_rate

    features['Age_WaistCirc'] = age * waist
    # log10 of zero/negative ratios is handled by the inf -> NaN step below;
    # silence the corresponding numpy warnings here.
    with np.errstate(divide='ignore', invalid='ignore'):
        features['Age_WaistCirc_BMI'] = np.log10((age * waist) / bmi)

    features['BMI_FitnessMaxStage'] = bmi * max_stage
    features['Weight_GripStrengthDominant'] = weight * df['FGC-FGC_GSD']
    features['Weight_GripStrengthNonDominant'] = weight * df['FGC-FGC_GSND']

    # Endurance time is recorded as separate minutes and seconds columns;
    # convert to total seconds instead of adding the two units together.
    endurance_seconds = (
        df['Fitness_Endurance-Time_Mins'] * 60 + df['Fitness_Endurance-Time_Sec']
    )
    features['HeartRate_FitnessTime'] = heart_rate * endurance_seconds

    features['Age_PushUp'] = age * df['FGC-FGC_PU']
    features['FFMI_Age'] = bia_ffmi * age
    features['InternetUse_SleepDisturbance'] = internet_hours * sleep_raw
    features['InternetUse_SleepDisturbance_BMI'] = (internet_hours * sleep_raw) / bmi
    features['CGAS_BMI'] = cgas * bmi
    features['CGAS_FitnessMaxStage'] = cgas * max_stage

    # Store infinities as missing values: downstream imputers and boosters
    # reject +/-inf but handle NaN.
    for name, values in features.items():
        df[name] = values.replace([np.inf, -np.inf], np.nan)

    return df

# Apply feature engineering to train data.
# engineer_features writes the derived columns onto the frame it receives and
# returns that same frame; the reassignment just keeps the name bound to it.
train_final = engineer_features(train_final, is_train=True)

# Apply feature engineering to test data
test_final = engineer_features(test_final, is_train=False)

print("Feature engineering completed for both train and test data.")

In [None]:
# Get column names as a list
# NOTE(review): column_names is not referenced again in the visible part of
# the notebook - confirm it is still needed.
column_names = train_final.columns.tolist()

# Last expression of the cell: renders a preview of the first rows
train_final.head()

In [None]:
# Isolate the physical attribute columns and some contextual columns for analysis
# Mixes raw Physical/BIA/demographic columns with engineered features
# (e.g. 'BMI_Age', 'DEE_Weight') and the target 'sii'. Used below for the
# column profiles, missing-data heatmap, correlation matrix and distributions.
physical_columns = [
    'Basic_Demos-Enroll_Season', 'BMI_Age', 'Basic_Demos-Age', 'Basic_Demos-Sex', 'DEE_Weight',
    'CGAS-Season', 'Physical-Season', 'Physical-BMI', 'Internet_Hours_Age', 'Age_Internet_Hours', 
    'Physical-Height', 'Physical-Weight', 'Physical-Waist_Circumference',
    'Physical-Diastolic_BP', 'Physical-HeartRate', 'Physical-Systolic_BP', 'BMI_Internet_Hours', 'BMR_Weight',
    'fitness_score', 'BIA-BIA_Frame_num', 'BIA-BIA_BMI', 'PreInt_EduHx-computerinternet_hoursday',
    'BIA_BMI_Internet_Hours_Age', 
    'Muscle_to_Fat', 'Hydration_Status', 'ICW_TBW', 'Age_Weight', 'Age_Weight_BMI', 'Sex_BMI', 'Sex_HeartRate',
    'sii'
]

# Isolate the fitness attributes
# Removed columns: 'FGC-FGC_CU' 'FGC-FGC_PU', 'FGC-FGC_SRL_Zone', 'FGC-FGC_SRR', 'FGC-FGC_SRR_Zone', 'FGC-FGC_SRL' 
# 'FGC-FGC_GSND_Zone' 'FGC-FGC_GSND' 'FGC-FGC_GSD' 'FGC-FGC_GSD_Zone' 'Fitness_Endurance-Time_Sec'
# Each column is listed exactly once: a repeated name would produce duplicate
# columns in train_final[fitness_columns] and therefore duplicated rows/columns
# in the heatmap and correlation matrix plus a redundant distribution plot.
fitness_columns = [
    'Fitness_Endurance-Season', 'Fitness_Endurance-Max_Stage', 'Fitness_Endurance-Time_Mins', 'SMM_Height',
    'FGC-Season', 'FGC-FGC_CU_Zone', 'FGC-FGC_SR', 'FGC-FGC_PU_Zone', 'FGC-FGC_TL',
    'FGC-FGC_TL_Zone', 'FGC-FGC_GS', 'fitness_score', 'BIA-BIA_BMI', 'Physical-BMI', 'FMI_BFP', 'LST_TBW',
    'BIA-BIA_Frame_num', 'PreInt_EduHx-computerinternet_hoursday', 'FFMI_BFP', 'BFP_DEE', 'BMR_Weight',
    'Muscle_to_Fat', 'Hydration_Status', 'ICW_TBW', 'Age_Weight_BMI', 'Sex_HeartRate',
    'sii'
]

# Isolate the BIA attributes
# Raw BIA columns plus BIA-derived engineered ratios and the target 'sii';
# consumed by the profile, heatmap, correlation and distribution cells below.
bia_columns = [
    'BIA-Season', 'BIA-BIA_Activity_Level_num', 'BIA-BIA_Frame_num', 'DEE_Weight', 'SMM_Height',
    'BIA-BIA_BMI', 'fitness_score', 'Physical-BMI', 'PreInt_EduHx-computerinternet_hoursday', 
    'BMR_BMI', 'DEE_BMI', 'SMM_BMI', 'ICW_TBW', 'sii'
]

# Isolate the PAQ, PCIAT, and SDS
# Each column is listed exactly once: repeated names would yield duplicate
# columns in train_final[child_info_columns], inflating the heatmap and
# correlation matrix and re-plotting the same distribution.
child_info_columns = [
    'PreInt_EduHx-computerinternet_hoursday', 'PAQ_A-Season', 'PAQ_A-PAQ_A_Total', 'PAQ_C-PAQ_C_Total',
    'PAQ_Total', 'PCIAT-Season', 'PCIAT-PCIAT_01', 'PCIAT-PCIAT_02', 'PCIAT-PCIAT_03', 'PCIAT-PCIAT_04',
    'PCIAT-PCIAT_05', 'PCIAT-PCIAT_06', 'PCIAT-PCIAT_07', 'PCIAT-PCIAT_08', 'PCIAT-PCIAT_09',
    'PCIAT-PCIAT_10', 'PCIAT-PCIAT_11', 'PCIAT-PCIAT_12', 'PCIAT-PCIAT_13', 'PCIAT-PCIAT_14',
    'PCIAT-PCIAT_15', 'PCIAT-PCIAT_16', 'PCIAT-PCIAT_17', 'PCIAT-PCIAT_18', 'PCIAT-PCIAT_19',
    'PCIAT-PCIAT_20', 'PCIAT-PCIAT_Total', 'SDS-Season', 'SDS-SDS_Total_Raw', 'SDS-SDS_Total_T',
    'PreInt_EduHx-Season', 'BIA-BIA_BMI', 'fitness_score',
    'Physical-BMI', 'Sex_BMI', 'sii'
]

# Isolate the Actigraphy data
# The stat_* columns (some with a '_boxcox' suffix) are listed together with a
# few contextual columns and the target 'sii'.
# NOTE(review): the stat_* / *_boxcox columns are presumably created earlier in
# the notebook from the actigraphy series - confirm against that cell.
# removed columns: 'stat_41', 'stat_42', 'stat_39','stat_92_boxcox' 'stat_0', 'stat_1', 'stat_2', 'stat_3', 'stat_4', 'stat_5', 'stat_6', 'stat_7', 
# 'stat_8', 'stat_9', 'stat_10','stat_11'
actigraphy_columns = [
    'stat_12', 'stat_13', 'stat_14', 'stat_15', 'stat_16', 'stat_17', 'stat_18', 'stat_19', 'stat_20',
    'stat_21', 'stat_22', 'stat_23_boxcox', 'stat_24', 'stat_25', 'stat_26', 'stat_27', 'stat_28', 'stat_29', 'stat_30',
    'stat_31', 'stat_32', 'stat_33', 'stat_34', 'stat_35_boxcox', 'stat_36', 'stat_37', 'stat_38_boxcox', 'stat_40_boxcox',
    'stat_43', 'stat_44', 'stat_45', 'stat_46', 'stat_47_boxcox', 'stat_48', 'stat_49', 'stat_50',
    'stat_51', 'stat_52', 'stat_53', 'stat_54_boxcox', 'stat_55', 'stat_56', 'stat_57', 'stat_58', 'stat_59', 'stat_60',
    'stat_61', 'stat_62', 'stat_63', 'stat_64', 'stat_65', 'stat_66_boxcox', 'stat_67', 'stat_68', 'stat_69', 'stat_70',
    'stat_71', 'stat_72', 'stat_73', 'stat_74', 'stat_75', 'stat_76', 'stat_77', 'stat_78_boxcox', 'stat_79', 'stat_80_boxcox',
    'stat_81', 'stat_82', 'stat_83', 'stat_84', 'stat_85', 'stat_86', 'stat_87', 'stat_88_boxcox', 'stat_89', 'stat_90_boxcox',
    'stat_91', 'stat_93', 'stat_94', 'stat_95', 'PreInt_EduHx-computerinternet_hoursday', 
    'BIA-BIA_Frame_num', 'SDS-SDS_Total_T', 'BIA-BIA_BMI', 'Physical-BMI', 'sii'
]

In [None]:
# Function to analyze columns
def analyze_column(column):
    """Build a one-row profile of ``train_final[column]``.

    Reads the module-level ``train_final`` frame.

    Args:
        column: Name of the column to profile.

    Returns:
        dict with the column name, row count, missing count/percentage, number
        of unique values and dtype. Numeric columns additionally carry mean,
        median, standard deviation, minimum and maximum; all other columns
        carry their three most frequent values instead.
    """
    series = train_final[column]
    n_rows = len(train_final)
    n_missing = series.isnull().sum()

    # Fields shared by numeric and non-numeric columns
    profile = {
        "Column": column,
        "Total Count": n_rows,
        "Missing Count": n_missing,
        "Missing Percentage": f"{(n_missing / n_rows) * 100:.2f}%",
        "Unique Values": series.nunique(),
        "Data Type": series.dtype,
    }

    if not pd.api.types.is_numeric_dtype(series):
        profile["Top 3 Values"] = series.value_counts().head(3).to_dict()
        return profile

    profile["Mean"] = series.mean()
    profile["Median"] = series.median()
    profile["Standard Deviation"] = series.std()
    profile["Minimum"] = series.min()
    profile["Maximum"] = series.max()
    return profile

def _export_column_profiles(columns, folder, filename):
    """Profile ``columns`` with analyze_column and write the result to CSV.

    Returns the list of profile dicts and the DataFrame built from it.
    """
    profiles = [analyze_column(name) for name in columns]
    profiles_df = pd.DataFrame(profiles)
    csv_path = os.path.join(folder, filename)
    profiles_df.to_csv(csv_path, index=False)
    print(f"Column profiles saved to {csv_path}")
    return profiles, profiles_df


# NOTE(review): only the physical profile goes to its category sub-folder; the
# other four are written to the top-level analysis folder - confirm intended.

# Physical column profiles
physical_column_profiles, physical_column_profiles_df = _export_column_profiles(
    physical_columns, physical_analysis_output_folder, 'physical_column_profiles.csv')

# Fitness column profiles
fitness_column_profiles, fitness_column_profiles_df = _export_column_profiles(
    fitness_columns, analysis_output_folder, 'fitness_column_profiles.csv')

# BIA column profiles
bia_column_profiles, bia_column_profiles_df = _export_column_profiles(
    bia_columns, analysis_output_folder, 'bia_column_profiles.csv')

# Child info column profiles
child_info_column_profiles, child_info_column_profiles_df = _export_column_profiles(
    child_info_columns, analysis_output_folder, 'child_info_column_profiles.csv')

# Actigraphy info column profiles
actigraphy_column_profiles, actigraphy_column_profiles_df = _export_column_profiles(
    actigraphy_columns, analysis_output_folder, 'actigraphy_column_profiles.csv')

In [None]:
# Visualize missing data
def _save_missing_data_heatmap(columns, title, folder, filename, figsize=(12, 6)):
    """Save a null/non-null heatmap of ``train_final[columns]`` as a PNG in ``folder``."""
    plt.figure(figsize=figsize)
    sns.heatmap(train_final[columns].isnull(), cbar=False, yticklabels=False, cmap='viridis')
    plt.title(title)
    plt.xlabel('Columns')
    plt.ylabel('Rows')
    plt.tight_layout()
    image_path = os.path.join(folder, filename)
    plt.savefig(image_path)
    plt.close()
    print(f"Missing data heatmap saved to {image_path}")


# Physical columns
_save_missing_data_heatmap(
    physical_columns, 'Missing Data in Physical Attribute Columns',
    physical_analysis_output_folder, 'physical_missing_data_heatmap.png')

# Fitness columns
_save_missing_data_heatmap(
    fitness_columns, 'Missing Data in Fitness Attribute Columns',
    fitness_analysis_output_folder, 'fitness_missing_data_heatmap.png')

# BIA columns
_save_missing_data_heatmap(
    bia_columns, 'Missing Data in BIA Attribute Columns',
    bia_analysis_output_folder, 'bia_missing_data_heatmap.png')

# Child info columns
_save_missing_data_heatmap(
    child_info_columns, 'Missing Data in Child info Attribute Columns',
    child_info_analysis_output_folder, 'child_info_missing_data_heatmap.png')

# Actigraphy info columns (many columns, hence the much wider figure)
_save_missing_data_heatmap(
    actigraphy_columns, 'Missing Data in Actigraphy info Attribute Columns',
    actigraphy_analysis_output_folder, 'actigraphy_missing_data_heatmap.png',
    figsize=(40, 6))

In [None]:
def _save_correlation_matrix(columns, label, folder, filename, figsize=(20, 18)):
    """Plot and save the correlation heatmap of the numeric columns in ``columns``.

    Reads the module-level ``train_final`` frame.

    Args:
        columns: Column names to consider; non-numeric ones are dropped.
        label: Category label used in the plot title and the log message.
        folder: Directory the PNG is written to.
        filename: File name of the PNG.
        figsize: Matplotlib figure size.

    Returns:
        (numeric_columns, correlation_matrix) for the selected columns.
    """
    numeric_columns = train_final[columns].select_dtypes(include=[np.number]).columns
    correlation_matrix = train_final[numeric_columns].corr()

    plt.figure(figsize=figsize)
    sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', linewidths=0.5)
    plt.title(f'Correlation Matrix of Numeric {label} Attributes')
    plt.tight_layout()
    # Build the path once so the message always names the file actually written
    image_path = os.path.join(folder, filename)
    plt.savefig(image_path)
    plt.close()
    print(f"{label} correlation matrix saved to {image_path}")
    return numeric_columns, correlation_matrix


# Correlation matrix for physical numeric columns
physical_numeric_columns, physical_correlation_matrix = _save_correlation_matrix(
    physical_columns, 'Physical', physical_analysis_output_folder,
    'physical_correlation_matrix.png')

# Correlation matrix for fitness numeric columns
fitness_numeric_columns, fitness_correlation_matrix = _save_correlation_matrix(
    fitness_columns, 'Fitness', fitness_analysis_output_folder,
    'fitness_correlation_matrix.png')

# Correlation matrix for bia numeric columns
bia_numeric_columns, bia_correlation_matrix = _save_correlation_matrix(
    bia_columns, 'BIA', bia_analysis_output_folder,
    'bia_correlation_matrix.png')

# Correlation matrix for child info numeric columns
child_info_numeric_columns, child_info_correlation_matrix = _save_correlation_matrix(
    child_info_columns, 'Child info', child_info_analysis_output_folder,
    'child_info_correlation_matrix.png', figsize=(24, 22))

# Correlation matrix for actigraphy numeric columns (very large figure: many columns)
actigraphy_numeric_columns, actigraphy_correlation_matrix = _save_correlation_matrix(
    actigraphy_columns, 'Actigraphy', actigraphy_analysis_output_folder,
    'actigraphy_correlation_matrix.png', figsize=(80, 78))

In [None]:
# Flatten the numeric columns of every category into one list.
# A column that belongs to several categories appears once per category, so
# len(all_columns) equals the number of distribution plots drawn below.
all_columns = [
    *physical_numeric_columns,
    *fitness_numeric_columns,
    *bia_numeric_columns,
    *child_info_numeric_columns,
    *actigraphy_numeric_columns,
]

# Function to create and save distribution plot
def create_distribution_plot(column, category, output_folder, data):
    """Save a histogram (with KDE overlay) of ``data[column]`` as a PNG.

    Args:
        column: Column of ``data`` to plot; missing values are dropped first.
        category: Label used in the log line, the title and the file name.
        output_folder: Directory the PNG is written to.
        data: DataFrame holding ``column``.
    """
    print(f"\nProcessing {category} column: {column}")  # Print current column
    fig, ax = plt.subplots(figsize=(10, 6))
    sns.histplot(data[column].dropna(), kde=True, ax=ax)
    ax.set_title(f'Distribution of {category} {column}')
    ax.set_xlabel(column)
    ax.set_ylabel('Count')
    fig.tight_layout()
    fig.savefig(os.path.join(output_folder, f'{category}_{column}_distribution.png'))
    # Close explicitly: this runs once per column and figures would pile up
    plt.close(fig)

# Create a dictionary mapping columns to their categories and output folders
column_info = {
    'physical': (physical_numeric_columns, physical_analysis_output_folder),
    'fitness': (fitness_numeric_columns, fitness_analysis_output_folder),
    'bia': (bia_numeric_columns, bia_analysis_output_folder),
    'child_info': (child_info_numeric_columns, child_info_analysis_output_folder),
    'actigraphy': (actigraphy_numeric_columns, actigraphy_analysis_output_folder)
}

# Create progress bar for all columns
with tqdm(total=len(all_columns), desc="Creating distribution plots") as pbar:
    # Process each category.
    # The folder is bound to `category_folder`, not `output_folder`, so the loop
    # does not overwrite the notebook-level `output_folder` used for CSV export.
    for category, (columns, category_folder) in column_info.items():
        for column in columns:
            # Update progress description to show current column
            pbar.set_description(f"Processing {category}: {column}")

            # Create and save plot
            create_distribution_plot(column, category, category_folder, train_final)

            # Update progress bar (no artificial sleep: the progress bar and the
            # per-column log line already show what is being processed)
            pbar.update(1)

print("\nAll analyses completed and saved to the 'analysis_output' folder.")

# Print summary of processed columns
print("\nSummary of processed columns:")
for category, (columns, _) in column_info.items():
    print(f"\n{category.upper()} columns processed ({len(columns)}):")
    for col in columns:
        print(f"- {col}")

In [None]:
def handle_categorical_columns(df):
    """Integer-encode the season columns and the other known categorical columns.

    Every column whose name contains ``'Season'``, plus ``'Basic_Demos-Sex'``,
    ``'BIA-BIA_Activity_Level_num'`` and ``'BIA-BIA_Frame_num'``, has its
    missing values replaced by the placeholder ``'Missing'`` and is then mapped
    to integer codes.

    The value -> code mapping of each column is kept on
    ``handle_categorical_columns.mappings`` and reused by later calls, so a
    value receives the same code in every frame that is encoded (e.g. train
    first, then test). Values not seen before are appended with the next free
    code. On the first call the codes follow the order of first appearance.

    Args:
        df: DataFrame to encode. It is not modified.

    Returns:
        A copy of ``df`` with the categorical columns converted to ``int``.
    """
    # Identify all season columns and other known categorical columns
    season_columns = [col for col in df.columns if 'Season' in col]
    categorical_columns = season_columns + [
        'Basic_Demos-Sex',
        'BIA-BIA_Activity_Level_num',
        'BIA-BIA_Frame_num'
    ]

    # Shared across calls so every encoded frame uses the same codes
    if not hasattr(handle_categorical_columns, 'mappings'):
        handle_categorical_columns.mappings = {}

    df_processed = df.copy()

    for col in categorical_columns:
        if col not in df_processed.columns:
            continue

        # Fill missing values with a placeholder category
        filled = df_processed[col].fillna('Missing')

        # Extend the stored mapping with values not encountered so far;
        # existing codes are never changed.
        mapping = handle_categorical_columns.mappings.setdefault(col, {})
        for value in filled.unique():
            if value not in mapping:
                mapping[value] = len(mapping)

        df_processed[col] = filled.map(mapping).astype(int)

    return df_processed

def check_data_types(df, verbose=True):
    """Report object-dtype columns that look like they need conversion.

    An object column is flagged as ``'categorical'`` when its name contains
    ``'Season'`` or is one of the known categorical columns, and as
    ``'numeric'`` when its first non-null value can be parsed by
    ``pd.to_numeric``. Only that first value is probed. Other object columns
    (including entirely empty ones) are counted as plain object columns, and
    every non-object column is counted as numeric.

    Args:
        df: DataFrame to inspect.
        verbose: When True, print the counts and details of flagged columns.

    Returns:
        List of ``(column, 'categorical' | 'numeric')`` tuples.
    """
    known_categoricals = ['Basic_Demos-Sex', 'BIA-BIA_Activity_Level_num', 'BIA-BIA_Frame_num']
    object_cols = []
    numeric_cols = []
    problematic_cols = []

    for col in df.columns:
        if df[col].dtype == 'object':
            # Check if column should be categorical (contains 'Season' or other categorical indicators)
            if 'Season' in col or col in known_categoricals:
                problematic_cols.append((col, 'categorical'))
            else:
                # Try converting the first non-null value to numeric
                try:
                    pd.to_numeric(df[col].dropna().iloc[0])
                    problematic_cols.append((col, 'numeric'))
                except (ValueError, TypeError, IndexError):
                    # ValueError/TypeError: value is not numeric;
                    # IndexError: the column holds no non-null value at all.
                    object_cols.append(col)
        else:
            numeric_cols.append(col)

    if verbose:
        print(f"Total columns: {len(df.columns)}")
        print(f"Numeric columns: {len(numeric_cols)}")
        print(f"Object columns: {len(object_cols)}")
        print(f"Potentially problematic columns: {len(problematic_cols)}")
        if problematic_cols:
            print("\nProblematic columns that should be converted:")
            for col, col_type in problematic_cols:
                print(f"- {col}: current={df[col].dtype}, should be={col_type}")
                print(f"  Sample values: {df[col].dropna().head().tolist()}")

    return problematic_cols

def fix_data_types(df):
    """Return a copy of ``df`` whose columns have been coerced to numeric types.

    Steps, in order:
      1. Integer-encode the categorical columns via handle_categorical_columns.
      2. Run ``pd.to_numeric(errors='coerce')`` on every column that is not in
         the explicit int/float lists below.
      3. Cast the explicit integer columns to ``int`` (missing values become
         -999 first; for the target ``'sii'`` that sentinel is turned back into
         NaN afterwards, which makes the column float again).
      4. Cast the explicit float columns to ``float``.

    NOTE(review): step 2 turns every value that cannot be parsed as a number
    into NaN, so any remaining free-text / identifier column would be blanked -
    confirm no such column reaches this function.

    Args:
        df: DataFrame to convert. It is not modified.

    Returns:
        The converted copy.
    """
    df_fixed = df.copy()

    # First handle categorical columns
    df_fixed = handle_categorical_columns(df_fixed)

    # Then handle numeric columns
    cols_to_int = [
        'Basic_Demos-Sex', 'BIA-BIA_Activity_Level_num',
        'BIA-BIA_Frame_num', 'sii'
    ]

    cols_to_float = [
        'Physical-BMI', 'Physical-Height', 'Physical-Weight',
        'Physical-Waist_Circumference', 'Physical-HeartRate',
        'Physical-Diastolic_BP', 'Physical-Systolic_BP',
        'BIA-BIA_BMI', 'BIA-BIA_Fat', 'BIA-BIA_FFM', 'BIA-BIA_FFMI',
        'BIA-BIA_FMI', 'BIA-BIA_ICW', 'BIA-BIA_ECW', 'BIA-BIA_TBW',
        'BIA-BIA_BMR', 'BIA-BIA_DEE', 'BIA-BIA_SMM', 'BIA-BIA_BMC',
        'BIA-BIA_LST', 'BIA-BIA_LDM'
    ]

    # Convert remaining numeric columns
    for col in df_fixed.columns:
        if col not in cols_to_int and col not in cols_to_float:
            try:
                df_fixed[col] = pd.to_numeric(df_fixed[col], errors='coerce')
            except (TypeError, ValueError):
                # Column cannot be handed to to_numeric at all; leave it as is.
                continue

    # Convert specific columns to integer
    for col in cols_to_int:
        if col in df_fixed.columns:
            df_fixed[col] = df_fixed[col].fillna(-999).astype(int)
            if col == 'sii':  # Special handling for target variable
                df_fixed[col] = df_fixed[col].replace(-999, np.nan)

    # Convert specific columns to float
    for col in cols_to_float:
        if col in df_fixed.columns:
            df_fixed[col] = df_fixed[col].astype(float)

    return df_fixed

# Usage
# Convert both frames, then re-check them: check_data_types returns the list of
# object columns that still look categorical/numeric (empty list = all fixed).
print("Checking train data types...")
train_final = fix_data_types(train_final)
problematic_train = check_data_types(train_final)

print("\nChecking test data types...")
test_final = fix_data_types(test_final)
problematic_test = check_data_types(test_final)

if not problematic_train and not problematic_test:
    print("\nAll data types have been fixed successfully!")
else:
    print("\nWarning: Some columns still have incorrect data types!")
    
# Print sample of categorical mappings
# The mappings live as an attribute on the function object; it only exists once
# handle_categorical_columns has encoded at least one column.
if hasattr(handle_categorical_columns, 'mappings'):
    print("\nCategorical mappings:")
    for col, mapping in handle_categorical_columns.mappings.items():
        print(f"\n{col}:")
        for original, encoded in mapping.items():
            print(f"  {original} -> {encoded}")

In [None]:
# Enumerate category columns
# Every name contains 'Season', so these are the columns that
# handle_categorical_columns (via fix_data_types) integer-encodes.
cat_columns = ['Basic_Demos-Enroll_Season', 'CGAS-Season', 'Physical-Season', 
          'Fitness_Endurance-Season', 'FGC-Season', 'BIA-Season', 'PCIAT-Season',
          'PAQ_A-Season', 'PAQ_C-Season', 'SDS-Season', 'PreInt_EduHx-Season']

def replace_inf_nan(df):
    """Return a copy of ``df`` without infinities and with numeric gaps median-filled.

    +/-inf are first turned into NaN everywhere; then, for each float64/int64
    column, NaN is replaced by that column's median. Columns of other dtypes
    keep their missing values. The input frame is left unchanged.
    """
    cleaned = df.replace([np.inf, -np.inf], np.nan)

    # Only float64 / int64 columns are median-filled
    for name in cleaned.select_dtypes(include=['float64', 'int64']).columns:
        column = cleaned[name]
        cleaned[name] = column.fillna(column.median())

    return cleaned

def create_mapping(column, dataset):
    """Map each distinct value of ``dataset[column]`` to its order of first appearance."""
    distinct = dataset[column].unique()
    return dict(zip(distinct, range(len(distinct))))

# Encode the season columns with ONE mapping per column, built on the training
# data and reused for the test data, so a value gets the same code in both
# frames. Values that only occur in the test data get the next free codes.
for col in cat_columns:
    mapping = create_mapping(col, train_final)
    unseen = test_final.loc[~test_final[col].isin(list(mapping)), col].unique()
    for value in unseen:
        mapping[value] = len(mapping)

    train_final[col] = train_final[col].replace(mapping).astype(int)
    test_final[col] = test_final[col].replace(mapping).astype(int)

# KNN-impute every float64/int64 column of the training data.
# NOTE(review): this also imputes the target 'sii' (rounded to int below) -
# confirm that filling in missing labels is intended.
imputer = KNNImputer(n_neighbors=5)
numeric_cols = train_final.select_dtypes(include=['float64', 'int64']).columns
# KNNImputer rejects infinite values; treat them as missing instead
numeric_values = train_final[numeric_cols].replace([np.inf, -np.inf], np.nan)
imputed_data = imputer.fit_transform(numeric_values)
# Keep the original index so the non-numeric columns re-attached below align
# row-for-row even when train_final does not carry a default RangeIndex.
train_imputed = pd.DataFrame(imputed_data, columns=numeric_cols, index=train_final.index)
train_imputed['sii'] = train_imputed['sii'].round().astype(int)
for col in train_final.columns:
    if col not in numeric_cols:
        train_imputed[col] = train_final[col]

train_final = train_imputed

# Recompute the engineered features from the imputed values
train_final = engineer_features(train_final)
# Drop rows that have fewer than 10 non-missing values
train_final = train_final.dropna(thresh=10, axis=0)
test_final = engineer_features(test_final)

In [None]:
# Preview the imputed, feature-engineered training frame
train_final.head()

In [None]:
# reset output folder
# Re-bind the name to the export directory in case an earlier cell re-used it.
output_folder = 'output'

# Export train_data to CSV
train_output_path = os.path.join(output_folder, 'train_data_imputed.csv')
train_final.to_csv(train_output_path, index=False)
print(f"Imputed train data exported to: {train_output_path}")

# Export test_data to CSV
# NOTE(review): in the visible cells only the training frame goes through
# KNNImputer; the "imputed" in this file name/message may be inaccurate - confirm.
test_output_path = os.path.join(output_folder, 'test_data_imputed.csv')
test_final.to_csv(test_output_path, index=False)
print(f"Imputed test data exported to: {test_output_path}")

# Make copies of data for other submissions
# Independent copies, so later cells can modify them without touching train_final/test_final
train2 = train_final.copy()
test2 = test_final.copy()

train3 = train_final.copy()
test3 = test_final.copy()

print("Data export completed.")

In [None]:
# Random seed shared by the StratifiedKFold splitter in TrainML and the model configs
SEED = 42
# Number of stratified cross-validation folds used by TrainML
n_splits = 5

# Function definitions
def quadratic_weighted_kappa(y_true, y_pred):
    """Return Cohen's kappa between ``y_true`` and ``y_pred`` with quadratic weights."""
    kappa = cohen_kappa_score(y1=y_true, y2=y_pred, weights='quadratic')
    return kappa

def threshold_Rounder(oof_non_rounded, thresholds):
    """Bucket continuous predictions into the classes 0-3 using three cut points.

    The comparisons are applied in order: a value below ``thresholds[0]``
    becomes 0, otherwise below ``thresholds[1]`` becomes 1, otherwise below
    ``thresholds[2]`` becomes 2, and everything else (including NaN) becomes 3.
    """
    first_cut, second_cut, third_cut = thresholds[0], thresholds[1], thresholds[2]
    # np.select takes the first condition that holds, mirroring a nested where
    conditions = [
        oof_non_rounded < first_cut,
        oof_non_rounded < second_cut,
        oof_non_rounded < third_cut,
    ]
    return np.select(conditions, [0, 1, 2], default=3)

def evaluate_predictions(thresholds, y_true, oof_non_rounded):
    """Objective for threshold tuning: the negated QWK of the thresholded predictions.

    The sign is flipped so that a minimizer maximizes the kappa score.
    """
    predicted_classes = threshold_Rounder(oof_non_rounded, thresholds)
    kappa = quadratic_weighted_kappa(y_true, predicted_classes)
    return -kappa

def TrainML(model_class, test_data):
    """Cross-validate a model on ``train_final`` and build a submission frame.

    Reads the module-level ``train_final`` (features plus target ``'sii'``),
    ``n_splits``, ``SEED`` and ``sample`` (source of the submission ids).

    For every stratified fold a fresh clone of ``model_class`` is fitted, its
    out-of-fold predictions are stored and ``test_data`` is predicted. The three
    class cut points are then tuned with Nelder-Mead on the out-of-fold
    predictions and applied to the fold-averaged test predictions.

    Args:
        model_class: Unfitted estimator instance; cloned once per fold.
        test_data: Feature frame to predict for the submission.

    Returns:
        DataFrame with the columns ``'id'`` and ``'sii'``.

    Raises:
        AssertionError: If the threshold optimization reports no success.
    """
    target = train_final['sii']
    features = train_final.drop(columns=['sii'])

    splitter = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=SEED)

    train_scores, val_scores = [], []
    n_samples = len(target)
    oof_raw = np.zeros(n_samples, dtype=float)
    oof_int = np.zeros(n_samples, dtype=int)
    fold_test_preds = np.zeros((len(test_data), n_splits))

    fold_iter = tqdm(splitter.split(features, target), desc="Training Folds", total=n_splits)
    for fold, (fit_idx, val_idx) in enumerate(fold_iter):
        X_fit, y_fit = features.iloc[fit_idx], target.iloc[fit_idx]
        X_val, y_val = features.iloc[val_idx], target.iloc[val_idx]

        estimator = clone(model_class)
        estimator.fit(X_fit, y_fit)

        fit_pred = estimator.predict(X_fit)
        val_pred = estimator.predict(X_val)

        # Keep the raw out-of-fold values for threshold tuning, plus a
        # naively rounded version for the per-fold score
        val_pred_int = val_pred.round(0).astype(int)
        oof_raw[val_idx] = val_pred
        oof_int[val_idx] = val_pred_int

        fit_kappa = quadratic_weighted_kappa(y_fit, fit_pred.round(0).astype(int))
        val_kappa = quadratic_weighted_kappa(y_val, val_pred_int)
        train_scores.append(fit_kappa)
        val_scores.append(val_kappa)

        fold_test_preds[:, fold] = estimator.predict(test_data)

        print(f"Fold {fold+1} - Train QWK: {fit_kappa:.4f}, Validation QWK: {val_kappa:.4f}")
        clear_output(wait=True)

    print(f"Mean Train QWK --> {np.mean(train_scores):.4f}")
    print(f"Mean Validation QWK ---> {np.mean(val_scores):.4f}")

    # Tune the three cut points on the out-of-fold predictions
    threshold_search = minimize(
        evaluate_predictions,
        x0=[0.5, 1.5, 2.5],
        args=(target, oof_raw),
        method='Nelder-Mead',
    )
    assert threshold_search.success, "Optimization did not converge."
    tuned_thresholds = threshold_search.x

    tuned_kappa = quadratic_weighted_kappa(target, threshold_Rounder(oof_raw, tuned_thresholds))

    print(f"----> || Optimized QWK SCORE :: {Fore.CYAN}{Style.BRIGHT} {tuned_kappa:.3f}{Style.RESET_ALL}")

    # Average the per-fold test predictions, then apply the tuned cut points
    mean_test_pred = fold_test_preds.mean(axis=1)
    test_classes = threshold_Rounder(mean_test_pred, tuned_thresholds)

    return pd.DataFrame({
        'id': sample['id'],
        'sii': test_classes
    })

In [None]:
# parameters
# LightGBM settings (passed to LGBMRegressor below).
# NOTE(review): per the LightGBM parameter docs, 'min_child_samples' is an alias
# of 'min_data_in_leaf' and 'colsample_bytree' is an alias of 'feature_fraction';
# both members of each pair are set here with different values, so only one of
# each pair can take effect - confirm which value is intended.
# NOTE(review): with max_depth=3 a tree cannot have more than 2**3 = 8 leaves,
# so num_leaves=843 is presumably never the binding limit - confirm.
Params = {
    'n_estimators': 682,
    'learning_rate': 0.022201704131134002,
    'max_depth': 3,
    'num_leaves': 843,
    'min_data_in_leaf': 13,
    'feature_fraction': 0.893,
    'bagging_fraction': 0.8918812900108436,
    'bagging_freq': 10,
    'lambda_l1': 4.79779460021304e-07,
    'lambda_l2': 2.0055171376757653e-06,
    'min_child_samples': 39,
    'colsample_bytree': 0.9264391369678474
}


# XGBoost settings (passed to XGBRegressor below); seeded with SEED
XGB_Params = {
    'learning_rate': 0.02829495436971426,
    'max_depth': 8,
    'n_estimators': 484,
    'subsample': 0.9834706801888403,
    'colsample_bytree': 0.7681852816292032,
    'reg_alpha': 0.010495697417466835,
    'reg_lambda': 0.022138771647168275,
    'random_state': SEED,
    'tree_method': 'exact',
    'min_child_weight': 9
}


# CatBoost settings (passed to CatBoostRegressor below); seeded with SEED, silent training
CatBoost_Params = {
    'learning_rate': 0.09981560408272906,
    'depth': 6,
    'iterations': 43,
    'random_seed': SEED,
    'verbose': 0,
    'l2_leaf_reg': 3.4155853181132496,
    'bootstrap_type': 'MVS',
    'random_strength': 0.06930624569348895
}

# Create model instances
# LightGBM gets its seed and verbosity here because Params does not contain them
Light = LGBMRegressor(**Params, random_state=SEED, verbose=-1)
XGB_Model = XGBRegressor(**XGB_Params)
CatBoost_Model = CatBoostRegressor(**CatBoost_Params)

# Combine models using Voting Regressor
# No weights are passed, so the three regressors are combined with the
# VotingRegressor defaults.
voting_model = VotingRegressor(estimators=[
    ('lightgbm', Light),
    ('xgboost', XGB_Model),
    ('catboost', CatBoost_Model)
])

In [None]:
# Cross-validate the voting ensemble on train_final and predict test_final;
# TrainML returns a frame with the columns 'id' and 'sii'.
Submission1 = TrainML(voting_model, test_final)

# Save submission
Submission1.to_csv('submission_1.csv', index=False)