In [1]:
import numpy as np
import pandas as pd
import os
import re
from sklearn.base import clone
from sklearn.metrics import cohen_kappa_score
from sklearn.model_selection import StratifiedKFold
from scipy.optimize import minimize
from concurrent.futures import ThreadPoolExecutor
from tqdm import tqdm
from sklearn.preprocessing import MinMaxScaler, StandardScaler, RobustScaler
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
from keras.models import Model
from keras.layers import Input, Dense
from keras.optimizers import Adam
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
from colorama import Fore, Style
from IPython.display import clear_output
import warnings
from lightgbm import LGBMRegressor
from xgboost import XGBRegressor
from catboost import CatBoostRegressor
from sklearn.ensemble import VotingRegressor, RandomForestRegressor, GradientBoostingRegressor
from sklearn.impute import SimpleImputer, KNNImputer
from sklearn.pipeline import Pipeline
from scipy import stats

In [2]:
# Create output folders
# All folders are created with exist_ok=True so the cell can be re-run safely
# and there is no check-then-create race.
output_folder = 'output'

# Create separate analysis output folders
analysis_output_folder = 'analysis_output'
physical_analysis_output_folder = 'analysis_output/physical'
fitness_analysis_output_folder = 'analysis_output/fitness'
bia_analysis_output_folder = 'analysis_output/bia'
child_info_analysis_output_folder = 'analysis_output/child_info'
actigraphy_analysis_output_folder = 'analysis_output/actigraphy'

for folder in (
    output_folder,
    analysis_output_folder,
    physical_analysis_output_folder,
    fitness_analysis_output_folder,
    bia_analysis_output_folder,
    child_info_analysis_output_folder,
    actigraphy_analysis_output_folder,
):
    os.makedirs(folder, exist_ok=True)

# Set display all columns in dataframes property
pd.options.display.max_columns = None

# Suppress warnings
warnings.filterwarnings('ignore')

In [3]:
# Load and process data files
def process_file(filename, dirname):
    """Summarise one series directory.

    Reads `<dirname>/<filename>/part-0.parquet`, removes the 'step' column and
    returns a tuple of (flattened `describe()` values, the text after the first
    '=' in `filename`).
    """
    parquet_path = os.path.join(dirname, filename, 'part-0.parquet')
    frame = pd.read_parquet(parquet_path)
    frame.drop(columns=['step'], inplace=True)
    flat_stats = frame.describe().values.reshape(-1)
    series_id = filename.split('=')[1]
    return flat_stats, series_id

# Load time series data
def load_time_series(dirname) -> pd.DataFrame:
    """Build one row of summary statistics per entry of `dirname`.

    Every entry returned by `os.listdir(dirname)` is passed to `process_file`
    on a thread pool. The result has columns 'stat_0'..'stat_{k-1}' plus an
    'id' column holding the identifier returned by `process_file`.
    """
    entries = os.listdir(dirname)

    def summarise(entry):
        return process_file(entry, dirname)

    with ThreadPoolExecutor() as pool:
        summaries = list(tqdm(pool.map(summarise, entries), total=len(entries)))

    stat_rows, series_ids = zip(*summaries)

    # Column count is taken from the first row.
    stat_names = [f"stat_{i}" for i in range(len(stat_rows[0]))]
    table = pd.DataFrame(stat_rows, columns=stat_names)
    table['id'] = series_ids
    return table

In [None]:
# Load data
# Tabular files; paths are relative to the notebook's working directory.
train = pd.read_csv('input/train.csv')
test = pd.read_csv('input/test.csv')
sample = pd.read_csv('input/sample_submission.csv')

# One row of describe() statistics per series directory (see load_time_series).
train_ts = load_time_series("input/series_train.parquet")
test_ts = load_time_series("input/series_test.parquet")

# Statistic columns only (the 'id' key is removed); these feed the autoencoder.
df_train = train_ts.drop('id', axis=1)
df_test = test_ts.drop('id', axis=1)

In [5]:
# Sparse Autoencoder Model
class SparseAutoencoder(nn.Module):
    """Fully connected autoencoder: input_dim -> 64 -> 32 -> 16 -> 32 -> 64 -> input_dim.

    The encoder uses ReLU after every layer; the decoder ends with a Sigmoid so
    reconstructions lie in [0, 1]. `sparsity_weight` is only stored on the
    module; the penalty itself is applied by the training code.
    """

    def __init__(self, input_dim, sparsity_weight=1e-5):
        super().__init__()
        self.sparsity_weight = sparsity_weight

        encoder_layers = [
            nn.Linear(input_dim, 64),
            nn.ReLU(),
            nn.Linear(64, 32),
            nn.ReLU(),
            nn.Linear(32, 16),
            nn.ReLU(),
        ]
        self.encoder = nn.Sequential(*encoder_layers)

        decoder_layers = [
            nn.Linear(16, 32),
            nn.ReLU(),
            nn.Linear(32, 64),
            nn.ReLU(),
            nn.Linear(64, input_dim),
            nn.Sigmoid(),  # Outputs in the range [0, 1]
        ]
        self.decoder = nn.Sequential(*decoder_layers)

    def forward(self, x):
        """Return (latent code, reconstruction) for the batch `x`."""
        latent = self.encoder(x)
        reconstruction = self.decoder(latent)
        return latent, reconstruction

# Preparing Data
# Option to use different scalers: MinMaxScaler, StandardScaler, RobustScaler
def prepare_data(data, scaler_type='MinMaxScaler'):
    """Scale `data` and return it as a float32 tensor together with the fitted scaler.

    `scaler_type` selects 'StandardScaler' or 'RobustScaler'; any other value
    falls back to MinMaxScaler.
    """
    scaler_classes = {
        'StandardScaler': StandardScaler,
        'RobustScaler': RobustScaler,
    }
    scaler = scaler_classes.get(scaler_type, MinMaxScaler)()

    scaled = scaler.fit_transform(data)
    tensor = torch.tensor(scaled, dtype=torch.float32)
    return tensor, scaler

# Apply PCA for Dimensionality Reduction
# This can help focus the autoencoder on the most relevant features
def apply_pca(data, n_components=0.95):
    """Fit a PCA with `n_components` on `data`; return (transformed data, fitted PCA)."""
    reducer = PCA(n_components=n_components)
    reduced = reducer.fit_transform(data)
    return reduced, reducer

# Early Stopping Functionality
def early_stopping(patience):
    """Create an early-stopping tracker.

    The returned object is called with a loss after every epoch. A loss that
    is strictly lower than the best seen so far resets the counter; anything
    else increments it. Once the counter reaches `patience`, `early_stop` is
    set to True and is never cleared again.
    """
    class EarlyStopping:
        def __init__(self, patience=patience):
            self.patience = patience
            self.counter = 0
            self.best_loss = float('inf')
            self.early_stop = False

        def __call__(self, loss):
            improved = loss < self.best_loss
            if improved:
                self.best_loss = loss
                self.counter = 0
                return
            self.counter += 1
            if self.counter >= self.patience:
                self.early_stop = True

    tracker = EarlyStopping()
    return tracker

# Training the Sparse Autoencoder with DataFrame Output
def perform_autoencoder(data, epochs=100, batch_size=32, learning_rate=0.001, patience=10, scaler_type='MinMaxScaler', use_pca=False, sparsity_weight=1e-5):
    """Train a SparseAutoencoder on `data` and return its reconstruction as a DataFrame.

    Parameters
    ----------
    data : array-like or DataFrame of shape (n_samples, n_features)
    epochs, batch_size, learning_rate : training hyper-parameters (Adam optimizer).
    patience : epochs without validation improvement before training stops.
    scaler_type : forwarded to `prepare_data`.
    use_pca : if True, `data` is first reduced with `apply_pca`.
    sparsity_weight : weight of the L1 penalty on the latent activations.

    Returns
    -------
    DataFrame with columns 'feature_0'..'feature_{d-1}' holding the DECODER
    output (reconstruction in scaled space) for every input row, in input
    order. Despite the historical `df_encoded` name this is not the
    16-dimensional latent code.

    NOTE(review): the decoder ends in a Sigmoid, so reconstructions are limited
    to [0, 1]; that range matches MinMaxScaler output but presumably not
    StandardScaler/RobustScaler output -- confirm before using those scalers.
    """
    # Preprocess Data
    if use_pca:
        data, _ = apply_pca(data)

    data_tensor, _ = prepare_data(data, scaler_type=scaler_type)
    train_data, val_data = train_test_split(data_tensor, test_size=0.2, random_state=42)

    train_loader = DataLoader(TensorDataset(train_data), batch_size=batch_size, shuffle=True)
    val_loader = DataLoader(TensorDataset(val_data), batch_size=batch_size, shuffle=False)

    model = SparseAutoencoder(input_dim=data.shape[1], sparsity_weight=sparsity_weight)
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model.to(device)

    criterion = nn.SmoothL1Loss()
    optimizer = optim.Adam(model.parameters(), lr=learning_rate)
    stopper = early_stopping(patience=patience)

    for epoch in range(epochs):
        model.train()
        train_loss = 0.0
        for batch in train_loader:
            batch = batch[0].to(device)
            optimizer.zero_grad()
            encoded, outputs = model(batch)

            # Reconstruction loss
            loss = criterion(outputs, batch)

            # Sparsity penalty (L1 regularization on encoded activations)
            l1_penalty = torch.mean(torch.abs(encoded))
            loss += sparsity_weight * l1_penalty

            loss.backward()
            optimizer.step()
            # Weight by batch size so the epoch loss is a per-sample average.
            train_loss += loss.item() * batch.size(0)

        train_loss /= len(train_loader.dataset)

        # Validation (reconstruction loss only, no sparsity penalty)
        model.eval()
        val_loss = 0.0
        with torch.no_grad():
            for batch in val_loader:
                batch = batch[0].to(device)
                _, outputs = model(batch)
                loss = criterion(outputs, batch)
                val_loss += loss.item() * batch.size(0)

        val_loss /= len(val_loader.dataset)
        print(f"Epoch {epoch+1}, Train Loss: {train_loss:.4f}, Validation Loss: {val_loss:.4f}")

        # Early stopping
        stopper(val_loss)
        if stopper.early_stop:
            print(f"Early stopping at epoch {epoch + 1}")
            break

    # Final pass over the full dataset: inference only, so run in eval mode and
    # without building an autograd graph for the whole dataset.
    model.eval()
    with torch.no_grad():
        _, data_decoded = model(data_tensor.to(device))
    data_decoded = data_decoded.cpu().numpy()
    df_encoded = pd.DataFrame(data_decoded, columns=[f'feature_{i}' for i in range(data_decoded.shape[1])])
    return df_encoded

# Usage example
# Assuming 'data' is your input dataset as a NumPy array or pandas DataFrame.
# df_encoded = perform_autoencoder(data, epochs=100, batch_size=32, learning_rate=0.001, patience=10, scaler_type='StandardScaler', use_pca=True, sparsity_weight=1e-5)


In [None]:
# Encode time series data
# NOTE(review): two independent autoencoders are trained here, one on the train
# statistics and one on the test statistics, so 'feature_i' in train and test
# comes from different models -- confirm this is intended.
train_ts_encoded = perform_autoencoder(df_train, epochs=100, batch_size=32, learning_rate=0.001, patience=10, use_pca=False, scaler_type='MinMaxScaler', sparsity_weight=1e-5)
test_ts_encoded = perform_autoencoder(df_test, epochs=100, batch_size=32, learning_rate=0.001, patience=10, use_pca=False, scaler_type='MinMaxScaler', sparsity_weight=1e-5)

# Re-attach the series ids; this relies on the output rows being in the same
# order as train_ts / test_ts.
train_ts_encoded["id"]=train_ts["id"]
test_ts_encoded['id']=test_ts["id"]

# Merge data
# Left join: rows without a matching series get NaN in the feature_* columns.
train = pd.merge(train, train_ts_encoded, how="left", on='id')
test = pd.merge(test, test_ts_encoded, how="left", on='id')

# 'id' is only a join key and is not used as a model feature.
train = train.drop('id', axis=1)
test = test.drop('id', axis=1)

In [None]:
# Skew removal for some columns
# Every column listed here gets a '<column>_boxcox' companion column in the
# loops further down this cell.
skewed_columns = [
    'BIA-BIA_BMC', 'BIA-BIA_BMR', 'BIA-BIA_DEE', 'BIA-BIA_ECW', 'BIA-BIA_Fat',
    'BIA-BIA_FFM', 'BIA-BIA_FFMI', 'BIA-BIA_FMI', 'BIA-BIA_ICW', 'BIA-BIA_LDM', 'BIA-BIA_LST', 
    'BIA-BIA_TBW', 'CGAS-CGAS_Score', 'feature_23', 'feature_35', 'feature_38', 'feature_40', 'feature_47',
    'feature_54', 'feature_66', 'feature_78', 'feature_80', 'feature_88', 'feature_90'
]
# column name -> Box-Cox lambda estimated on the training data
lambda_params = {}

# Define the box-cox function to remove skew
def box_cox_transform(df, column, lambda_param=None):
    """Box-Cox transform one column of `df` on a NaN-free copy.

    Parameters
    ----------
    df : DataFrame; it is not modified.
    column : name of the column to transform.
    lambda_param : Box-Cox lambda to apply. When None, the optimal lambda is
        estimated from the data by `scipy.stats.boxcox`.

    Returns
    -------
    (df_copy, lambda_param)
        `df_copy` is a copy of `df` without the rows where `column` is NaN and
        with an extra '<column>_boxcox' column. `lambda_param` is the lambda
        that was used, or the value passed in (possibly None) when the column
        had no non-NaN values and nothing could be estimated.

    NOTE(review): when the minimum is <= 0 the column is shifted by
    `-min + 1` using the minimum of THIS frame, so train and test may be
    shifted by different amounts -- confirm whether that is acceptable.
    """
    boxcox_column = f'{column}_boxcox'

    # Create a copy of the dataframe
    df_copy = df.copy()

    # Drop NaN values for the specific column
    df_copy = df_copy.dropna(subset=[column])

    if df_copy.empty:
        # Nothing to transform. Without this guard, stats.boxcox returns a bare
        # empty array when asked to estimate lambda, and the tuple unpacking
        # below would fail.
        df_copy[boxcox_column] = np.empty(0, dtype=float)
        print(f"Skipping column: {column} (no non-NaN values)")
        print(f"Number of rows before transformation: {len(df)}")
        print(f"Number of rows after removing NaN values: {len(df_copy)}")
        return df_copy, lambda_param

    # Ensure all values are positive
    min_value = df_copy[column].min()
    if min_value <= 0:
        df_copy[column] = df_copy[column] - min_value + 1  # Add 1 to ensure all values are positive

    # Perform Box-Cox transformation
    values = df_copy[column].to_numpy()
    if lambda_param is None:
        df_copy[boxcox_column], lambda_param = stats.boxcox(values)
        print(f"Transforming column: {column}")
        print(f"Optimal lambda for Box-Cox transformation: {lambda_param}")
    else:
        df_copy[boxcox_column] = stats.boxcox(values, lmbda=lambda_param)
        print(f"Applying transformation to column: {column} with lambda: {lambda_param}")

    print(f"Number of rows before transformation: {len(df)}")
    print(f"Number of rows after removing NaN values: {len(df_copy)}")

    return df_copy, lambda_param

# Apply Box-Cox transformation to train data and store lambda values
for column in skewed_columns:
    transformed_train_data, lambda_params[column] = box_cox_transform(train, column)
    # Update only the new transformed column in the original dataframe
    # (assignment aligns on the index, so rows dropped for NaN stay NaN)
    train[f'{column}_boxcox'] = transformed_train_data[f'{column}_boxcox']

# Apply the same transformation to test data using stored lambda values
for column in skewed_columns:
    transformed_test_data, _ = box_cox_transform(test, column, lambda_param=lambda_params[column])
    # Update only the new transformed column in the original dataframe
    # (assignment aligns on the index, so rows dropped for NaN stay NaN)
    test[f'{column}_boxcox'] = transformed_test_data[f'{column}_boxcox']

# Function to handle infinite values
def replace_inf_with_max(df):
    """Replace +inf and -inf in every float64 column with that column's largest non-infinite value.

    The frame is modified in place and also returned. Columns of other dtypes
    are left untouched; NaN values are kept.
    """
    for name in df.columns:
        series = df[name]
        if series.dtype != 'float64':
            continue
        not_infinite = ~np.isinf(series)
        ceiling = series[not_infinite].max()
        # Both +inf and -inf are mapped to the same ceiling value.
        df[name] = series.where(not_infinite, ceiling)
    return df

# Replace infinite values with the maximum non-infinite value in each column
# replace_inf_with_max modifies its argument in place and returns it, so
# train_data / test_data are the same objects as train / test.
train_data = replace_inf_with_max(train)
test_data = replace_inf_with_max(test)

In [None]:
train.head(10)

In [9]:
# Feature engineering
def feature_engineering(df):
    """Add engineered features to `df` in place and return it.

    Adds combined fitness-zone scores, a merged PAQ total, the sum of the
    actigraphy columns feature_0..feature_11, and a set of product/ratio
    interaction features. Ratio features can produce inf/NaN where the
    denominator is 0 or missing; nothing here cleans that up.
    """
    # Combine all grip strength
    df['FGC-FGC_GS'] = df['FGC-FGC_GSD_Zone'] + df['FGC-FGC_GSND_Zone']

    # Combine all sit and reach
    df['FGC-FGC_SR'] = df['FGC-FGC_SRL_Zone'] + df['FGC-FGC_SRR_Zone']

    # Create a fitness score by adding the zone fitness data
    df['fitness_score'] = df['FGC-FGC_GS'] + df['FGC-FGC_SR'] + df['FGC-FGC_CU_Zone'] + df['FGC-FGC_PU_Zone'] + df['FGC-FGC_TL_Zone']

    # Combine PAQ_A-PAQ_A_Total and PAQ_C-PAQ_C_Total into one column
    # (PAQ_A wins where both are present)
    df['PAQ_Total'] = df['PAQ_A-PAQ_A_Total'].combine_first(df['PAQ_C-PAQ_C_Total'])

    # Combine up to stat 11 of actigraphy stats.
    # Built in a loop so every column from feature_0 to feature_11 is included
    # exactly once (the hand-written sum added feature_7 twice and skipped
    # feature_8). Plain '+' is kept so NaN propagates as before.
    actigraphy_cols = [f'feature_{i}' for i in range(12)]
    combined = df[actigraphy_cols[0]]
    for name in actigraphy_cols[1:]:
        combined = combined + df[name]
    df['combined_actigraphy_stat'] = combined

    # Features from other notebook
    df['BMI_Age'] = df['Physical-BMI'] * df['Basic_Demos-Age']
    df['Internet_Hours_Age'] = df['PreInt_EduHx-computerinternet_hoursday'] * df['Basic_Demos-Age']
    df['BMI_Internet_Hours'] = df['Physical-BMI'] * df['PreInt_EduHx-computerinternet_hoursday']
    df['BFP_BMI'] = df['BIA-BIA_Fat'] / df['BIA-BIA_BMI']
    df['FFMI_BFP'] = df['BIA-BIA_FFMI'] / df['BIA-BIA_Fat']
    df['FMI_BFP'] = df['BIA-BIA_FMI'] / df['BIA-BIA_Fat']
    df['LST_TBW'] = df['BIA-BIA_LST'] / df['BIA-BIA_TBW']
    df['BFP_BMR'] = df['BIA-BIA_Fat'] * df['BIA-BIA_BMR']
    df['BFP_DEE'] = df['BIA-BIA_Fat'] * df['BIA-BIA_DEE']
    df['BMR_Weight'] = df['BIA-BIA_BMR'] / df['Physical-Weight']
    df['DEE_Weight'] = df['BIA-BIA_DEE'] / df['Physical-Weight']
    df['SMM_Height'] = df['BIA-BIA_SMM'] / df['Physical-Height']
    df['Muscle_to_Fat'] = df['BIA-BIA_SMM'] / df['BIA-BIA_FMI']
    df['Hydration_Status'] = df['BIA-BIA_TBW'] / df['Physical-Weight']
    df['ICW_TBW'] = df['BIA-BIA_ICW'] / df['BIA-BIA_TBW']
    df['Age_Weight'] = df['Basic_Demos-Age'] * df['Physical-Weight']
    df['Sex_BMI'] = df['Basic_Demos-Sex'] * df['Physical-BMI']
    df['Sex_HeartRate'] = df['Basic_Demos-Sex'] * df['Physical-HeartRate']
    df['Age_WaistCirc'] = df['Basic_Demos-Age'] * df['Physical-Waist_Circumference']
    df['BMI_FitnessMaxStage'] = df['Physical-BMI'] * df['Fitness_Endurance-Max_Stage']
    df['Weight_GripStrengthDominant'] = df['Physical-Weight'] * df['FGC-FGC_GSD']
    df['Weight_GripStrengthNonDominant'] = df['Physical-Weight'] * df['FGC-FGC_GSND']
    # NOTE(review): Time_Mins and Time_Sec are added without unit conversion;
    # if they are the minute and second parts of one duration this presumably
    # should be Time_Mins * 60 + Time_Sec -- confirm before changing.
    df['HeartRate_FitnessTime'] = df['Physical-HeartRate'] * (df['Fitness_Endurance-Time_Mins'] + df['Fitness_Endurance-Time_Sec'])
    df['Age_PushUp'] = df['Basic_Demos-Age'] * df['FGC-FGC_PU']
    df['FFMI_Age'] = df['BIA-BIA_FFMI'] * df['Basic_Demos-Age']
    df['InternetUse_SleepDisturbance'] = df['PreInt_EduHx-computerinternet_hoursday'] * df['SDS-SDS_Total_Raw']
    df['CGAS_BMI'] = df['CGAS-CGAS_Score'] * df['Physical-BMI']
    df['CGAS_FitnessMaxStage'] = df['CGAS-CGAS_Score'] * df['Fitness_Endurance-Max_Stage']

    return df

In [10]:
# Season columns that are treated as categorical features.
cat_columns = [
    'Basic_Demos-Enroll_Season',
    'CGAS-Season',
    'Physical-Season',
    'Fitness_Endurance-Season',
    'FGC-Season',
    'BIA-Season',
    'PAQ_A-Season',
    'PAQ_C-Season',
    'SDS-Season',
    'PreInt_EduHx-Season',
]

def update(df):
    """Fill missing values in every `cat_columns` column with 'Missing' and cast it to category.

    `df` is modified in place and returned.
    """
    for season_col in cat_columns:
        df[season_col] = df[season_col].fillna('Missing').astype('category')
    return df
        
# update() mutates the frame it is given and returns that same frame.
train = update(train)
test = update(test)

def create_mapping(column, dataset):
    """Map each distinct value of `dataset[column]` to an integer code.

    Codes are 0, 1, 2, ... in order of first appearance in the column.
    """
    distinct = dataset[column].unique()
    return dict(zip(distinct, range(len(distinct))))

# Integer-encode the season columns with ONE mapping shared by train and test.
# Building a separate order-of-appearance mapping per frame would give the same
# season different codes in train and test.
for col in cat_columns:
    mapping = create_mapping(col, train)
    # Give categories that only occur in test the next free codes.
    for value in test[col].unique():
        mapping.setdefault(value, len(mapping))

    # Go through object dtype so the lookup does not depend on how
    # categorical columns handle value replacement.
    train[col] = train[col].astype(object).map(mapping).astype(int)
    test[col] = test[col].astype(object).map(mapping).astype(int)

In [11]:
# Impute missing numeric values in train with a 5-nearest-neighbour imputer.
imputer = KNNImputer(n_neighbors=5)
numeric_cols = train.select_dtypes(include=['float64', 'int64']).columns
imputed_data = imputer.fit_transform(train[numeric_cols])
train_imputed = pd.DataFrame(imputed_data, columns=numeric_cols)
# NOTE(review): 'sii' is one of the imputed columns, so missing targets are
# filled from neighbours and rounded to an integer label; a later
# dropna(subset='sii') then presumably removes nothing -- confirm intended.
train_imputed['sii'] = train_imputed['sii'].round().astype(int)
# Carry over the non-numeric columns unchanged. Assignment aligns on the index;
# train_imputed has a fresh default index, so this assumes train does too.
for col in train.columns:
    if col not in numeric_cols:
        train_imputed[col] = train[col]
        
train = train_imputed

# Engineered features are computed after imputation for train; test is not
# imputed in this cell.
train = feature_engineering(train)
# Keep only rows with at least 10 non-NaN values.
train = train.dropna(thresh=10, axis=0)
test = feature_engineering(test)

In [None]:
train.head(10)

In [None]:
# Export train_data to CSV
train_output_path = os.path.join(output_folder, 'train_data_imputed.csv')
train.to_csv(train_output_path, index=False)
print(f"Imputed train data exported to: {train_output_path}")

# Export test_data to CSV
# (the file name says "imputed", but test has not been through the KNN imputer)
test_output_path = os.path.join(output_folder, 'test_data_imputed.csv')
test.to_csv(test_output_path, index=False)
print(f"Imputed test data exported to: {test_output_path}")

# Make copies of data for other submissions
# Taken before feature selection, so they still contain every column.
train2 = train.copy()
test2 = test.copy()

train3 = train.copy()
test3 = test.copy()

print("Data export completed.")

In [14]:
# Feature selection
# The column lists are defined once and shared by train and test so the two
# selections cannot drift apart; train additionally keeps the 'sii' target at
# its original position.

# Actigraphy reconstruction columns:
#   * feature_0..feature_11 are not used directly ('combined_actigraphy_stat' is),
#   * feature_92 is removed,
#   * the skewed columns are used in their Box-Cox form.
boxcox_feature_ids = [23, 35, 38, 40, 47, 54, 66, 78, 80, 88, 90]
removed_feature_ids = {92}
time_series_cols = (
    ['combined_actigraphy_stat']
    + [f'feature_{i}' for i in range(12, 96)
       if i not in boxcox_feature_ids and i not in removed_feature_ids]
    + [f'feature_{i}_boxcox' for i in boxcox_feature_ids]
)

# Dropped columns 'FGC-FGC_CU', 'FGC-FGC_PU', 'FGC-FGC_SRR', 'FGC-FGC_SRR_Zone',
base_feature_cols = [
    'Basic_Demos-Age', 'Basic_Demos-Sex', 'FGC-FGC_GS', 'FGC-FGC_SR',
    'CGAS-CGAS_Score', 'Physical-BMI', 'fitness_score',
    'Physical-Height', 'Physical-Weight', 'Physical-Waist_Circumference',
    'Physical-Diastolic_BP', 'Physical-HeartRate', 'Physical-Systolic_BP',
    'Fitness_Endurance-Max_Stage',
    'Fitness_Endurance-Time_Mins', 'Fitness_Endurance-Time_Sec',
    'FGC-FGC_CU_Zone', 'FGC-FGC_GSND',
    'FGC-FGC_GSD',
    'FGC-FGC_PU_Zone',
    'FGC-FGC_TL', 'FGC-FGC_TL_Zone',
    'BIA-BIA_Activity_Level_num', 'BIA-BIA_BMC_boxcox', 'BIA-BIA_BMI',
    'BIA-BIA_BMR_boxcox', 'BIA-BIA_DEE_boxcox', 'BIA-BIA_ECW_boxcox', 'BIA-BIA_FFM_boxcox',
    'BIA-BIA_FFMI_boxcox', 'BIA-BIA_FMI_boxcox', 'BIA-BIA_Fat', 'BIA-BIA_Frame_num',
    'BIA-BIA_ICW_boxcox', 'BIA-BIA_LDM_boxcox', 'BIA-BIA_LST_boxcox', 'BIA-BIA_SMM',
    'BIA-BIA_TBW', 'PAQ_A-PAQ_A_Total', 'PAQ_Total',
    'PAQ_C-PAQ_C_Total', 'SDS-SDS_Total_Raw',
    'SDS-SDS_Total_T',
    'PreInt_EduHx-computerinternet_hoursday',
]

# Interaction features created by feature_engineering().
engineered_feature_cols = [
    'BMI_Age', 'Internet_Hours_Age', 'BMI_Internet_Hours',
    'BFP_BMI', 'FFMI_BFP', 'FMI_BFP', 'LST_TBW', 'BFP_BMR', 'BFP_DEE', 'BMR_Weight', 'DEE_Weight',
    'SMM_Height', 'Muscle_to_Fat', 'Hydration_Status', 'ICW_TBW', 'Age_Weight', 'Sex_BMI', 'Sex_HeartRate', 'Age_WaistCirc',
    'BMI_FitnessMaxStage', 'Weight_GripStrengthDominant', 'Weight_GripStrengthNonDominant', 'HeartRate_FitnessTime',
    'Age_PushUp', 'FFMI_Age', 'InternetUse_SleepDisturbance', 'CGAS_BMI', 'CGAS_FitnessMaxStage',
]

target_col = 'sii'

# Train: features plus the target (kept between the base and engineered
# columns, as before); rows without a target are dropped.
train = train[base_feature_cols + [target_col] + engineered_feature_cols + time_series_cols]
train = train.dropna(subset=[target_col])

# Test: the same features in the same order, without the target.
featuresCols = base_feature_cols + engineered_feature_cols + time_series_cols
test = test[featuresCols]

In [None]:
train.head(10)

In [None]:
test.head(10)

In [17]:
# Replace inf
# Ratio features from feature_engineering() can be +/-inf (division by zero).
# Turn them into NaN in BOTH frames so train and test are preprocessed the same
# way. replace() is a no-op when there is nothing to replace, so no guard is
# needed.
train = train.replace([np.inf, -np.inf], np.nan)
test = test.replace([np.inf, -np.inf], np.nan)

In [18]:
def quadratic_weighted_kappa(y_true, y_pred):
    """Cohen's kappa between `y_true` and `y_pred` with quadratic weights."""
    kappa = cohen_kappa_score(y_true, y_pred, weights='quadratic')
    return kappa

def threshold_Rounder(oof_non_rounded, thresholds):
    """Turn continuous predictions into labels 0..3 using three cut points.

    A value gets the label of the first threshold it is strictly below
    (checked in the order thresholds[0], [1], [2]); values below none get 3.
    """
    first, second, third = thresholds[0], thresholds[1], thresholds[2]
    # Built from the last cut point backwards so earlier thresholds take
    # precedence, exactly like a nested if/elif chain.
    labels = np.where(oof_non_rounded < third, 2, 3)
    labels = np.where(oof_non_rounded < second, 1, labels)
    labels = np.where(oof_non_rounded < first, 0, labels)
    return labels

def evaluate_predictions(thresholds, y_true, oof_non_rounded):
    """Objective for threshold search: negative quadratic weighted kappa.

    The predictions are discretised with `thresholds` and scored against
    `y_true`; the sign is flipped so a minimiser maximises kappa.
    """
    labels = threshold_Rounder(oof_non_rounded, thresholds)
    kappa = quadratic_weighted_kappa(y_true, labels)
    return -kappa

def TrainML(model_class, test_data, train_data=None):
    """Cross-validate a regressor, tune label thresholds and build a submission.

    Parameters
    ----------
    model_class : unfitted estimator; it is cloned for every fold.
    test_data : feature frame to predict for, with the same columns as the
        training features.
    train_data : frame containing the features and the 'sii' target. Defaults
        to the notebook-level `train` frame when omitted.

    Also reads the notebook-level `n_splits`, `SEED` and `sample` (for the
    submission ids).

    Returns
    -------
    DataFrame with columns 'id' and 'sii' (thresholded mean of the fold
    predictions on `test_data`).

    Raises
    ------
    AssertionError if the threshold optimisation does not report success.
    """
    if train_data is None:
        train_data = train

    X = train_data.drop(['sii'], axis=1)
    y = train_data['sii']

    SKF = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=SEED)

    train_S = []
    test_S = []

    # Out-of-fold raw predictions, used to tune the thresholds afterwards.
    oof_non_rounded = np.zeros(len(y), dtype=float)
    # One column of test predictions per fold.
    test_preds = np.zeros((len(test_data), n_splits))

    for fold, (train_idx, test_idx) in enumerate(tqdm(SKF.split(X, y), desc="Training Folds", total=n_splits)):
        X_train, X_val = X.iloc[train_idx], X.iloc[test_idx]
        y_train, y_val = y.iloc[train_idx], y.iloc[test_idx]

        model = clone(model_class)
        model.fit(X_train, y_train)

        y_train_pred = model.predict(X_train)
        y_val_pred = model.predict(X_val)

        oof_non_rounded[test_idx] = y_val_pred
        y_val_pred_rounded = y_val_pred.round(0).astype(int)

        # Per-fold scores use plain rounding; tuned thresholds come later.
        train_kappa = quadratic_weighted_kappa(y_train, y_train_pred.round(0).astype(int))
        val_kappa = quadratic_weighted_kappa(y_val, y_val_pred_rounded)

        train_S.append(train_kappa)
        test_S.append(val_kappa)

        test_preds[:, fold] = model.predict(test_data)

        print(f"Fold {fold+1} - Train QWK: {train_kappa:.4f}, Validation QWK: {val_kappa:.4f}")
        clear_output(wait=True)

    print(f"Mean Train QWK --> {np.mean(train_S):.4f}")
    print(f"Mean Validation QWK ---> {np.mean(test_S):.4f}")

    # Search the three cut points that maximise QWK on the out-of-fold predictions.
    KappaOPtimizer = minimize(evaluate_predictions,
                              x0=[0.5, 1.5, 2.5], args=(y, oof_non_rounded),
                              method='Nelder-Mead')
    assert KappaOPtimizer.success, "Optimization did not converge."

    oof_tuned = threshold_Rounder(oof_non_rounded, KappaOPtimizer.x)
    tKappa = quadratic_weighted_kappa(y, oof_tuned)

    print(f"----> || Optimized QWK SCORE :: {Fore.CYAN}{Style.BRIGHT} {tKappa:.3f}{Style.RESET_ALL}")

    # Average the fold models on the test set, then apply the tuned thresholds.
    tpm = test_preds.mean(axis=1)
    tpTuned = threshold_Rounder(tpm, KappaOPtimizer.x)

    submission = pd.DataFrame({
        'id': sample['id'],
        'sii': tpTuned
    })

    return submission

In [19]:
# Model parameters from hyper-tuning
SEED = 42
n_splits = 5

# LightGBM
Params = dict(
    n_estimators=656,
    learning_rate=0.046,
    max_depth=12,
    num_leaves=478,
    min_data_in_leaf=13,
    feature_fraction=0.893,
    bagging_fraction=0.784,
    bagging_freq=4,
    lambda_l1=10,    # Increased from 6.59
    lambda_l2=0.01,  # Increased from 2.68e-06
)

# XGBoost
XGB_Params = dict(
    learning_rate=0.05,
    max_depth=6,
    n_estimators=200,
    subsample=0.8,
    colsample_bytree=0.8,
    reg_alpha=1,   # Increased from 0.1
    reg_lambda=5,  # Increased from 1
    random_state=SEED,
    tree_method='exact',
)

# CatBoost
CatBoost_Params = dict(
    learning_rate=0.05,
    depth=6,
    iterations=200,
    random_seed=SEED,
    verbose=0,
    l2_leaf_reg=10,  # Increase this value
)

# Create model instances
Light = LGBMRegressor(random_state=SEED, verbose=-1, **Params)
XGB_Model = XGBRegressor(**XGB_Params)
CatBoost_Model = CatBoostRegressor(**CatBoost_Params)

# Combine models using Voting Regressor
ensemble_members = [
    ('lightgbm', Light),
    ('xgboost', XGB_Model),
    ('catboost', CatBoost_Model),
]
voting_model = VotingRegressor(estimators=ensemble_members)

In [None]:
# Cross-validate the voting ensemble on `train` and predict the selected test features.
Submission1 = TrainML(voting_model, test)

# Save submission
Submission1.to_csv('submission_1.csv', index=False)

In [None]:
Submission1

In [None]:
# Submission #2
# The column lists are defined once and shared by train2 and test2 so the two
# selections cannot drift apart; train2 additionally keeps the 'sii' target.

# Actigraphy reconstruction columns:
#   * feature_0..feature_11 are not used directly ('combined_actigraphy_stat' is),
#   * feature_92 is removed,
#   * the skewed columns are used in their Box-Cox form.
boxcox_feature_ids = [23, 35, 38, 40, 47, 54, 66, 78, 80, 88, 90]
removed_feature_ids = {92}
time_series_cols = (
    ['combined_actigraphy_stat']
    + [f'feature_{i}' for i in range(12, 96)
       if i not in boxcox_feature_ids and i not in removed_feature_ids]
    + [f'feature_{i}_boxcox' for i in boxcox_feature_ids]
)

# Raw tabular columns (including the season columns), without engineered features.
base_feature_cols = [
    'Basic_Demos-Enroll_Season', 'Basic_Demos-Age', 'Basic_Demos-Sex',
    'CGAS-Season', 'CGAS-CGAS_Score', 'Physical-Season', 'Physical-BMI',
    'Physical-Height', 'Physical-Weight', 'Physical-Waist_Circumference',
    'Physical-Diastolic_BP', 'Physical-HeartRate', 'Physical-Systolic_BP',
    'Fitness_Endurance-Season', 'Fitness_Endurance-Max_Stage',
    'Fitness_Endurance-Time_Mins', 'Fitness_Endurance-Time_Sec',
    'FGC-Season', 'FGC-FGC_CU', 'FGC-FGC_CU_Zone', 'FGC-FGC_GSND',
    'FGC-FGC_GSND_Zone', 'FGC-FGC_GSD', 'FGC-FGC_GSD_Zone', 'FGC-FGC_PU',
    'FGC-FGC_PU_Zone', 'FGC-FGC_SRL', 'FGC-FGC_SRL_Zone', 'FGC-FGC_SRR',
    'FGC-FGC_SRR_Zone', 'FGC-FGC_TL', 'FGC-FGC_TL_Zone', 'BIA-Season',
    'BIA-BIA_Activity_Level_num', 'BIA-BIA_BMC_boxcox', 'BIA-BIA_BMI',
    'BIA-BIA_BMR_boxcox', 'BIA-BIA_DEE_boxcox', 'BIA-BIA_ECW_boxcox', 'BIA-BIA_FFM_boxcox',
    'BIA-BIA_FFMI_boxcox', 'BIA-BIA_FMI_boxcox', 'BIA-BIA_Fat', 'BIA-BIA_Frame_num',
    'BIA-BIA_ICW_boxcox', 'BIA-BIA_LDM_boxcox', 'BIA-BIA_LST_boxcox', 'BIA-BIA_SMM',
    'BIA-BIA_TBW_boxcox', 'PAQ_A-Season', 'PAQ_A-PAQ_A_Total', 'PAQ_C-Season',
    'PAQ_C-PAQ_C_Total', 'SDS-Season', 'SDS-SDS_Total_Raw',
    'SDS-SDS_Total_T', 'PreInt_EduHx-Season',
    'PreInt_EduHx-computerinternet_hoursday',
]

target_col = 'sii'

# Train: features plus the target (kept right after the tabular columns, as
# before); rows without a target are dropped.
train2 = train2[base_feature_cols + [target_col] + time_series_cols]
train2 = train2.dropna(subset=[target_col])

# Test: the same features in the same order, without the target.
featuresCols = base_feature_cols + time_series_cols
test2 = test2[featuresCols]

# Season columns that are treated as categorical features.
cat_c = [
    'Basic_Demos-Enroll_Season',
    'CGAS-Season',
    'Physical-Season',
    'Fitness_Endurance-Season',
    'FGC-Season',
    'BIA-Season',
    'PAQ_A-Season',
    'PAQ_C-Season',
    'SDS-Season',
    'PreInt_EduHx-Season',
]

def update(df):
    """Fill missing values in every `cat_c` column with 'Missing' and cast it to category.

    `df` is modified in place and returned. This redefines the `update`
    helper from the earlier encoding cell, now reading `cat_c`.
    """
    for season_col in cat_c:
        df[season_col] = df[season_col].fillna('Missing').astype('category')
    return df
        
# update() mutates the frame it is given and returns that same frame.
# NOTE(review): the season columns were already integer-encoded in an earlier
# cell, so the category cast here presumably acts on numeric codes -- confirm.
train2 = update(train2)
test2 = update(test2)

def create_mapping(column, dataset):
    """Map each distinct value of `dataset[column]` to an integer code.

    Codes are 0, 1, 2, ... in order of first appearance in the column.
    Same behaviour as the `create_mapping` defined in the earlier encoding cell.
    """
    distinct = dataset[column].unique()
    return dict(zip(distinct, range(len(distinct))))

# Integer-encode the season columns with ONE mapping shared by train2 and test2.
# Building a separate order-of-appearance mapping per frame would assign
# different codes to the same value in train2 and test2.
for col in cat_c:
    mapping = create_mapping(col, train2)
    # Give values that only occur in test2 the next free codes.
    for value in test2[col].unique():
        mapping.setdefault(value, len(mapping))

    # Go through object dtype so the lookup does not depend on how
    # categorical columns handle value replacement.
    train2[col] = train2[col].astype(object).map(mapping).astype(int)
    test2[col] = test2[col].astype(object).map(mapping).astype(int)

def quadratic_weighted_kappa(y_true, y_pred):
    """Cohen's kappa between `y_true` and `y_pred` with quadratic weights."""
    kappa = cohen_kappa_score(y_true, y_pred, weights='quadratic')
    return kappa

def threshold_Rounder(oof_non_rounded, thresholds):
    """Turn continuous predictions into labels 0..3 using three cut points.

    A value gets the label of the first threshold it is strictly below
    (checked in the order thresholds[0], [1], [2]); values below none get 3.
    """
    first, second, third = thresholds[0], thresholds[1], thresholds[2]
    # Built from the last cut point backwards so earlier thresholds take
    # precedence, exactly like a nested if/elif chain.
    labels = np.where(oof_non_rounded < third, 2, 3)
    labels = np.where(oof_non_rounded < second, 1, labels)
    labels = np.where(oof_non_rounded < first, 0, labels)
    return labels

def evaluate_predictions(thresholds, y_true, oof_non_rounded):
    """Objective for the threshold search.

    Buckets ``oof_non_rounded`` with ``thresholds`` and returns the negated
    quadratic weighted kappa against ``y_true``, so that minimising this
    value maximises the kappa.
    """
    labels = threshold_Rounder(oof_non_rounded, thresholds)
    score = quadratic_weighted_kappa(y_true, labels)
    return -score

def TrainML(model_class, test_data):
    """Cross-validate ``model_class`` on ``train2`` and build the Submission 2 frame.

    Reads the module-level ``train2`` (features plus the ``sii`` target),
    ``n_splits``, ``SEED`` and ``sample``.  For every stratified fold a fresh
    clone of ``model_class`` is fitted; its validation predictions are kept as
    out-of-fold values and its predictions for ``test_data`` are recorded.
    Class thresholds are then tuned on the out-of-fold values with Nelder-Mead
    and applied to the fold-averaged test predictions.

    Parameters
    ----------
    model_class : estimator
        Unfitted estimator; it is cloned for each fold and never fitted itself.
    test_data : feature matrix
        Rows to predict with each fold's model.

    Returns
    -------
    pandas.DataFrame
        Columns ``id`` (taken from ``sample['id']``) and ``sii`` (tuned class labels).

    Raises
    ------
    AssertionError
        If the threshold optimiser does not report success.
    """
    X = train2.drop(['sii'], axis=1)
    y = train2['sii']

    SKF = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=SEED)

    train_scores = []
    val_scores = []

    # Raw (continuous) out-of-fold predictions, and one column of test predictions per fold.
    oof_non_rounded = np.zeros(len(y), dtype=float)
    test_preds = np.zeros((len(test_data), n_splits))

    for fold, (train_idx, test_idx) in enumerate(tqdm(SKF.split(X, y), desc="Training Folds", total=n_splits)):
        X_train, X_val = X.iloc[train_idx], X.iloc[test_idx]
        y_train, y_val = y.iloc[train_idx], y.iloc[test_idx]

        model = clone(model_class)
        model.fit(X_train, y_train)

        y_train_pred = model.predict(X_train)
        y_val_pred = model.predict(X_val)

        oof_non_rounded[test_idx] = y_val_pred

        # Per-fold scores use plain rounding; tuned thresholds come after the loop.
        train_kappa = quadratic_weighted_kappa(y_train, y_train_pred.round(0).astype(int))
        val_kappa = quadratic_weighted_kappa(y_val, y_val_pred.round(0).astype(int))

        train_scores.append(train_kappa)
        val_scores.append(val_kappa)

        test_preds[:, fold] = model.predict(test_data)

        print(f"Fold {fold+1} - Train QWK: {train_kappa:.4f}, Validation QWK: {val_kappa:.4f}")
        clear_output(wait=True)

    print(f"Mean Train QWK --> {np.mean(train_scores):.4f}")
    print(f"Mean Validation QWK ---> {np.mean(val_scores):.4f}")

    # Search for the three cut points that maximise QWK on the out-of-fold predictions.
    KappaOPtimizer = minimize(evaluate_predictions,
                              x0=[0.5, 1.5, 2.5], args=(y, oof_non_rounded),
                              method='Nelder-Mead')
    assert KappaOPtimizer.success, "Optimization did not converge."

    oof_tuned = threshold_Rounder(oof_non_rounded, KappaOPtimizer.x)
    tKappa = quadratic_weighted_kappa(y, oof_tuned)

    print(f"----> || Optimized QWK SCORE :: {Fore.CYAN}{Style.BRIGHT} {tKappa:.3f}{Style.RESET_ALL}")

    # Average the per-fold test predictions, then bucket them with the tuned cut points.
    tpm = test_preds.mean(axis=1)
    tpTuned = threshold_Rounder(tpm, KappaOPtimizer.x)

    submission = pd.DataFrame({
        'id': sample['id'],
        'sii': tpTuned
    })

    return submission

# LightGBM hyper-parameters.
Params = dict(
    learning_rate=0.046,
    max_depth=12,
    num_leaves=478,
    min_data_in_leaf=13,
    feature_fraction=0.893,
    bagging_fraction=0.784,
    bagging_freq=4,
    lambda_l1=10,    # raised from 6.59
    lambda_l2=0.01,  # raised from 2.68e-06
)


# XGBoost hyper-parameters; the seed comes from the module-level SEED.
XGB_Params = dict(
    learning_rate=0.05,
    max_depth=6,
    n_estimators=200,
    subsample=0.8,
    colsample_bytree=0.8,
    reg_alpha=1,   # raised from 0.1
    reg_lambda=5,  # raised from 1
    random_state=SEED,
)


# CatBoost hyper-parameters; the season columns in `cat_c` are declared as
# categorical features and the seed comes from the module-level SEED.
CatBoost_Params = dict(
    learning_rate=0.05,
    depth=6,
    iterations=200,
    random_seed=SEED,
    cat_features=cat_c,
    verbose=0,
    l2_leaf_reg=10,  # NOTE(review): the author marked this as a value to increase
)

# Instantiate the three boosted-tree regressors from their parameter dicts.
Light = LGBMRegressor(n_estimators=300, random_state=SEED, verbose=-1, **Params)
XGB_Model = XGBRegressor(**XGB_Params)
CatBoost_Model = CatBoostRegressor(**CatBoost_Params)

# The voting ensemble combines the three regressors under these names.
voting_model = VotingRegressor(
    estimators=[
        ('lightgbm', Light),
        ('xgboost', XGB_Model),
        ('catboost', CatBoost_Model),
    ]
)

# Cross-validate the ensemble and get the Submission 2 frame.
Submission2 = TrainML(voting_model, test2)

# Persist the result, then show it as the cell output.
Submission2.to_csv('submission_2.csv', index=False)
Submission2

In [None]:
# Submission 3
# Time-series columns, in order:
#   1. the combined actigraphy statistic,
#   2. the plain `feature_N` columns for N in 12..95, except those listed below,
#   3. the Box-Cox versions of eleven features.
# Feature 92 appears in neither form.
time_series_cols = (
    ['combined_actigraphy_stat']
    + [f'feature_{i}' for i in range(12, 96)
       if i not in (23, 35, 38, 40, 47, 54, 66, 78, 80, 88, 90, 92)]
    + [f'feature_{i}_boxcox' for i in (23, 35, 38, 40, 47, 54, 66, 78, 80, 88, 90)]
)

# Tabular predictors shared by the Submission 3 train and test frames.
# The list is written once; the train selection adds the 'sii' target to it.
# NOTE(review): every other Box-Cox column here ends in `_boxcox`, and the
# Submission 2 list uses 'BIA-BIA_TBW_boxcox'.  Confirm that train3/test3 really
# carry a column named 'BIA-BIA_TBW_box'; if not, the selections below raise KeyError.
featuresCols = ['Basic_Demos-Enroll_Season', 'Basic_Demos-Age', 'Basic_Demos-Sex',
                'CGAS-Season', 'CGAS-CGAS_Score', 'Physical-Season', 'Physical-BMI',
                'Physical-Height', 'Physical-Weight', 'Physical-Waist_Circumference',
                'Physical-Diastolic_BP', 'Physical-HeartRate', 'Physical-Systolic_BP',
                'Fitness_Endurance-Season', 'Fitness_Endurance-Max_Stage',
                'Fitness_Endurance-Time_Mins', 'Fitness_Endurance-Time_Sec',
                'FGC-Season', 'FGC-FGC_CU', 'FGC-FGC_CU_Zone', 'FGC-FGC_GSND',
                'FGC-FGC_GSND_Zone', 'FGC-FGC_GSD', 'FGC-FGC_GSD_Zone', 'FGC-FGC_PU',
                'FGC-FGC_PU_Zone', 'FGC-FGC_SRL', 'FGC-FGC_SRL_Zone', 'FGC-FGC_SRR',
                'FGC-FGC_SRR_Zone', 'FGC-FGC_TL', 'FGC-FGC_TL_Zone', 'BIA-Season',
                'BIA-BIA_Activity_Level_num', 'BIA-BIA_BMC_boxcox', 'BIA-BIA_BMI',
                'BIA-BIA_BMR_boxcox', 'BIA-BIA_DEE_boxcox', 'BIA-BIA_ECW_boxcox', 'BIA-BIA_FFM_boxcox',
                'BIA-BIA_FFMI_boxcox', 'BIA-BIA_FMI_boxcox', 'BIA-BIA_Fat', 'BIA-BIA_Frame_num',
                'BIA-BIA_ICW_boxcox', 'BIA-BIA_LDM_boxcox', 'BIA-BIA_LST_boxcox', 'BIA-BIA_SMM',
                'BIA-BIA_TBW_box', 'PAQ_A-Season', 'PAQ_A-PAQ_A_Total', 'PAQ_C-Season',
                'PAQ_C-PAQ_C_Total', 'SDS-Season', 'SDS-SDS_Total_Raw',
                'SDS-SDS_Total_T', 'PreInt_EduHx-Season',
                'PreInt_EduHx-computerinternet_hoursday']

# Training frame: tabular predictors, then the target, then the time-series
# columns.  Rows without a target are dropped.
train3 = train3[featuresCols + ['sii'] + time_series_cols]
# A list for `subset` works on every pandas version (a bare string needs a newer one).
train3 = train3.dropna(subset=['sii'])

# Test frame: the same predictors in the same order, without the target.
featuresCols += time_series_cols
test3 = test3[featuresCols]

def quadratic_weighted_kappa(y_true, y_pred):
    """Return Cohen's kappa between two label vectors, using quadratic weights."""
    kappa = cohen_kappa_score(y_true, y_pred, weights='quadratic')
    return kappa

def threshold_Rounder(oof_non_rounded, thresholds):
    """Bucket continuous predictions into the classes 0-3 using three cut points.

    Each value receives the label of the first threshold it is strictly below
    (thresholds are checked in the order 0, 1, 2); a value below none of them
    receives 3.
    """
    below = [oof_non_rounded < thresholds[k] for k in range(3)]
    # np.select takes the first matching condition, like the nested np.where form.
    return np.select(below, [0, 1, 2], default=3)

def evaluate_predictions(thresholds, y_true, oof_non_rounded):
    """Objective for the threshold search.

    Buckets ``oof_non_rounded`` with ``thresholds`` and returns the negated
    quadratic weighted kappa against ``y_true``, so that minimising this
    value maximises the kappa.
    """
    labels = threshold_Rounder(oof_non_rounded, thresholds)
    score = quadratic_weighted_kappa(y_true, labels)
    return -score

def TrainML(model_class, test_data):
    """Cross-validate ``model_class`` on ``train3`` and predict ``test_data``.

    Reads the module-level ``train3`` (features plus the ``sii`` target),
    ``n_splits`` and ``SEED``.  For every stratified fold a fresh clone of
    ``model_class`` is fitted; its validation predictions are kept as
    out-of-fold values and its predictions for ``test_data`` are recorded.
    Class thresholds are then tuned on the out-of-fold values with Nelder-Mead
    and applied to the fold-averaged test predictions.

    Parameters
    ----------
    model_class : estimator
        Unfitted estimator; it is cloned for each fold and never fitted itself.
    test_data : feature matrix
        Rows to predict with each fold's model.

    Returns
    -------
    numpy.ndarray
        Tuned class labels (0-3), one per row of ``test_data``.

    Raises
    ------
    AssertionError
        If the threshold optimiser does not report success.
    """
    X = train3.drop(['sii'], axis=1)
    y = train3['sii']

    SKF = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=SEED)

    train_S = []
    test_S = []

    # Raw (continuous) out-of-fold predictions, and one column of test predictions per fold.
    oof_non_rounded = np.zeros(len(y), dtype=float)
    test_preds = np.zeros((len(test_data), n_splits))

    for fold, (train_idx, test_idx) in enumerate(tqdm(SKF.split(X, y), desc="Training Folds", total=n_splits)):
        X_train, X_val = X.iloc[train_idx], X.iloc[test_idx]
        y_train, y_val = y.iloc[train_idx], y.iloc[test_idx]

        model = clone(model_class)
        model.fit(X_train, y_train)

        y_train_pred = model.predict(X_train)
        y_val_pred = model.predict(X_val)

        oof_non_rounded[test_idx] = y_val_pred

        # Per-fold scores use plain rounding; tuned thresholds come after the loop.
        train_kappa = quadratic_weighted_kappa(y_train, y_train_pred.round(0).astype(int))
        val_kappa = quadratic_weighted_kappa(y_val, y_val_pred.round(0).astype(int))

        train_S.append(train_kappa)
        test_S.append(val_kappa)

        test_preds[:, fold] = model.predict(test_data)

        print(f"Fold {fold+1} - Train QWK: {train_kappa:.4f}, Validation QWK: {val_kappa:.4f}")
        clear_output(wait=True)

    print(f"Mean Train QWK --> {np.mean(train_S):.4f}")
    print(f"Mean Validation QWK ---> {np.mean(test_S):.4f}")

    # Search for the three cut points that maximise QWK on the out-of-fold predictions.
    KappaOPtimizer = minimize(evaluate_predictions,
                              x0=[0.5, 1.5, 2.5], args=(y, oof_non_rounded),
                              method='Nelder-Mead')
    assert KappaOPtimizer.success, "Optimization did not converge."

    oof_tuned = threshold_Rounder(oof_non_rounded, KappaOPtimizer.x)
    tKappa = quadratic_weighted_kappa(y, oof_tuned)

    print(f"----> || Optimized QWK SCORE :: {Fore.CYAN}{Style.BRIGHT} {tKappa:.3f}{Style.RESET_ALL}")

    # Average the per-fold test predictions, then bucket them with the tuned cut points.
    tpm = test_preds.mean(axis=1)
    tp_rounded = threshold_Rounder(tpm, KappaOPtimizer.x)

    return tp_rounded

# Every base regressor is wrapped in a pipeline whose first step is this
# median imputer.
imputer = SimpleImputer(strategy='median')

# Five regressors, each behind the imputer, combined in a voting ensemble.
ensemble = VotingRegressor(estimators=[
    (label, Pipeline(steps=[('imputer', imputer), ('regressor', regressor)]))
    for label, regressor in (
        ('lgb', LGBMRegressor(random_state=SEED)),
        ('xgb', XGBRegressor(random_state=SEED)),
        ('cat', CatBoostRegressor(random_state=SEED, silent=True)),
        ('rf', RandomForestRegressor(random_state=SEED)),
        ('gb', GradientBoostingRegressor(random_state=SEED)),
    )
])

# Cross-validate the ensemble; the result is the array of tuned test labels.
Submission3 = TrainML(ensemble, test3)

In [None]:
# Wrap the predicted labels in a submission frame keyed by the sample ids.
# The guard keeps the cell re-runnable: once `Submission3` is already a
# DataFrame, wrapping it a second time would fail.
if not isinstance(Submission3, pd.DataFrame):
    Submission3 = pd.DataFrame({
        'id': sample['id'],
        'sii': Submission3
    })

Submission3

In [None]:
# Sort the three submissions by id and renumber their rows, so that row k
# refers to the same position in each of them.
sub1, sub2, sub3 = (
    frame.sort_values(by='id').reset_index(drop=True)
    for frame in (Submission1, Submission2, Submission3)
)

# One row per id, with the label proposed by each submission side by side.
combined = pd.DataFrame({
    'id': sub1['id'],
    **{f'sii_{k}': sub['sii'] for k, sub in enumerate((sub1, sub2, sub3), start=1)},
})

def majority_vote(row):
    """Return the most frequent value in ``row``.

    ``Series.mode`` lists tied values in ascending order, so when several
    values are equally frequent the smallest one is returned.
    """
    most_common = row.mode()
    return most_common[0]

# Row-wise vote across the three submissions.
combined['final_sii'] = combined[['sii_1', 'sii_2', 'sii_3']].apply(majority_vote, axis=1)

# Keep only the id and the voted label, under the column name 'sii'.
final_submission = combined[['id', 'final_sii']].rename(columns={'final_sii': 'sii'})

final_submission.to_csv('submission.csv', index=False)

# The message names the file that is actually written above.
print("Majority voting completed and saved to 'submission.csv'")

In [None]:
# Show the final majority-vote submission as the cell's output.
final_submission