In [1]:
import numpy as np
import pandas as pd
import os
import json
from sklearn.base import clone
from sklearn.metrics import cohen_kappa_score, make_scorer
from sklearn.model_selection import StratifiedKFold, cross_val_score
from scipy.optimize import minimize
from concurrent.futures import ThreadPoolExecutor
from tqdm import tqdm
from sklearn.preprocessing import MinMaxScaler, StandardScaler, RobustScaler
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import keras
from keras.models import Model
from keras.layers import Input, Dense
from keras.optimizers import Adam
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
from colorama import Fore, Style
from IPython.display import clear_output
import warnings
from lightgbm import LGBMRegressor, LGBMClassifier
from xgboost import XGBRegressor, XGBClassifier
from catboost import CatBoostRegressor, CatBoostClassifier
from sklearn.ensemble import VotingRegressor, RandomForestRegressor, GradientBoostingRegressor, VotingClassifier
from sklearn.impute import SimpleImputer, KNNImputer
from sklearn.pipeline import Pipeline
from scipy import stats
import optuna

In [2]:
# Create output folders.
# Every directory is created with `exist_ok=True`, which makes the cell
# idempotent (safe to re-run) and avoids a separate existence check.
output_folder = 'output'
os.makedirs(output_folder, exist_ok=True)

# Create separate analysis output folders (one per data domain)
analysis_output_folder = 'analysis_output'
os.makedirs(analysis_output_folder, exist_ok=True)

physical_analysis_output_folder = 'analysis_output/physical'
os.makedirs(physical_analysis_output_folder, exist_ok=True)

fitness_analysis_output_folder = 'analysis_output/fitness'
os.makedirs(fitness_analysis_output_folder, exist_ok=True)

bia_analysis_output_folder = 'analysis_output/bia'
os.makedirs(bia_analysis_output_folder, exist_ok=True)

child_info_analysis_output_folder = 'analysis_output/child_info'
os.makedirs(child_info_analysis_output_folder, exist_ok=True)

actigraphy_analysis_output_folder = 'analysis_output/actigraphy'
os.makedirs(actigraphy_analysis_output_folder, exist_ok=True)

# Show every column when a DataFrame is displayed
pd.options.display.max_columns = None

# Suppress warnings (this silences ALL warnings for the rest of the session)
warnings.filterwarnings('ignore')

In [3]:
# Load and process data files
def process_file(filename, dirname):
    """Summarise one time-series partition.

    Reads `<dirname>/<filename>/part-0.parquet`, discards the `step` column and
    flattens the `describe()` table of the remaining columns into a 1-D array.

    Returns:
        (flat_stats, id) where `id` is the text after the first '=' in
        `filename`.
    """
    parquet_path = os.path.join(dirname, filename, 'part-0.parquet')
    series = pd.read_parquet(parquet_path).drop(columns='step')
    flat_stats = series.describe().values.reshape(-1)
    return flat_stats, filename.split('=')[1]

# Load time series data
def load_time_series(dirname) -> pd.DataFrame:
    """Build one row of summary statistics per entry found in `dirname`.

    Each directory entry is handed to `process_file` on a thread pool. The
    result has columns `stat_0 ... stat_{k-1}` plus an `id` column.
    """
    entries = os.listdir(dirname)

    def _summarise(fname):
        return process_file(fname, dirname)

    with ThreadPoolExecutor() as pool:
        results = list(tqdm(pool.map(_summarise, entries), total=len(entries)))

    # Split the (stats, id) pairs into two parallel tuples.
    stat_rows, id_values = zip(*results)

    stat_names = [f"stat_{i}" for i in range(len(stat_rows[0]))]
    df = pd.DataFrame(stat_rows, columns=stat_names)
    df['id'] = id_values
    return df

In [4]:
# Load the tabular files
train, test, sample = (
    pd.read_csv(csv_path)
    for csv_path in ('input/train.csv', 'input/test.csv', 'input/sample_submission.csv')
)

# Per-id summary statistics of the time series partitions
train_ts = load_time_series("input/series_train.parquet")
test_ts = load_time_series("input/series_test.parquet")

# Statistics only (id removed); these frames are fed to the autoencoder
df_train = train_ts.drop(columns='id')
df_test = test_ts.drop(columns='id')

100%|██████████| 996/996 [01:03<00:00, 15.67it/s]
100%|██████████| 2/2 [00:00<00:00,  7.62it/s]


In [5]:
# Sparse Autoencoder Model
class SparseAutoencoder(nn.Module):
    """Fully connected autoencoder: input_dim -> 64 -> 32 -> 16 -> 32 -> 64 -> input_dim.

    Every hidden layer is followed by ReLU; the last decoder layer is followed
    by a Sigmoid, so reconstructions lie in [0, 1]. `sparsity_weight` is only
    stored on the instance; this class does not apply it itself.
    """

    def __init__(self, input_dim, sparsity_weight=1e-5):
        super().__init__()
        self.sparsity_weight = sparsity_weight

        widths = (input_dim, 64, 32, 16)

        # Encoder: Linear + ReLU for each consecutive pair of widths.
        encoder_layers = []
        for fan_in, fan_out in zip(widths, widths[1:]):
            encoder_layers += [nn.Linear(fan_in, fan_out), nn.ReLU()]
        self.encoder = nn.Sequential(*encoder_layers)

        # Decoder: mirror image of the encoder, ending in Sigmoid instead of ReLU.
        mirrored = widths[::-1]
        decoder_layers = []
        for fan_in, fan_out in zip(mirrored, mirrored[1:]):
            decoder_layers += [nn.Linear(fan_in, fan_out), nn.ReLU()]
        decoder_layers[-1] = nn.Sigmoid()  # Outputs in the range [0, 1]
        self.decoder = nn.Sequential(*decoder_layers)

    def forward(self, x):
        """Return `(encoded, decoded)` for the batch `x`."""
        code = self.encoder(x)
        return code, self.decoder(code)

# Preparing Data
# Option to use different scalers: MinMaxScaler, StandardScaler, RobustScaler
def prepare_data(data, scaler_type='MinMaxScaler'):
    """Scale `data` and convert it to a float32 tensor.

    `scaler_type` selects 'StandardScaler' or 'RobustScaler'; any other value
    falls back to MinMaxScaler.

    Returns:
        (tensor, fitted_scaler)
    """
    scaler = (
        StandardScaler() if scaler_type == 'StandardScaler'
        else RobustScaler() if scaler_type == 'RobustScaler'
        else MinMaxScaler()
    )
    scaled = scaler.fit_transform(data)
    tensor = torch.tensor(scaled, dtype=torch.float32)
    return tensor, scaler

# Apply PCA for Dimensionality Reduction
# This can help focus the autoencoder on the most relevant features
def apply_pca(data, n_components=0.95):
    """Fit a PCA on `data` and return `(projected_data, fitted_pca)`."""
    reducer = PCA(n_components=n_components)
    return reducer.fit_transform(data), reducer

# Early Stopping Functionality
def early_stopping(patience):
    """Create a callable tracker that signals when the loss stops improving.

    Call the returned object with each new loss. A strictly lower loss resets
    the counter; anything else increments it. Once the counter reaches
    `patience`, `early_stop` becomes True (and is never reset).
    """
    class EarlyStopping:
        def __init__(self, patience=patience):
            self.patience, self.counter = patience, 0
            self.best_loss = float('inf')
            self.early_stop = False

        def __call__(self, loss):
            # Improvement: remember it and start counting from zero again.
            if loss < self.best_loss:
                self.best_loss, self.counter = loss, 0
                return
            # No improvement: one more strike.
            self.counter += 1
            if self.counter >= self.patience:
                self.early_stop = True

    return EarlyStopping()

# Training the Sparse Autoencoder with DataFrame Output
def perform_autoencoder(data, epochs=100, batch_size=32, learning_rate=0.001, patience=10, scaler_type='MinMaxScaler', use_pca=False, sparsity_weight=1e-5, seed=None):
    """Train a SparseAutoencoder on `data` and return its reconstruction.

    Args:
        data: 2-D array-like (NumPy array or DataFrame) of numeric features.
        epochs: Maximum number of training epochs.
        batch_size: Mini-batch size for both training and validation loaders.
        learning_rate: Adam learning rate.
        patience: Epochs without validation improvement before stopping early.
        scaler_type: Forwarded to `prepare_data`.
        use_pca: When True, `data` is first reduced with `apply_pca`.
        sparsity_weight: Weight of the L1 penalty on the encoded activations.
        seed: Optional int. When given, `torch.manual_seed(seed)` is called
            first so weight initialisation and batch shuffling are repeatable.
            This sets torch's global RNG state. `None` keeps the previous
            (unseeded) behaviour.

    Returns:
        DataFrame with columns `feature_0 ... feature_{d-1}` holding the
        DECODER output (the reconstruction in scaled space, one column per
        input column), not the 16-dimensional bottleneck code. The frame has
        a fresh RangeIndex.

    Note:
        The decoder ends in a Sigmoid, so reconstructions are confined to
        [0, 1]; that range matches MinMaxScaler output but generally not
        StandardScaler / RobustScaler output.
    """
    if seed is not None:
        torch.manual_seed(seed)

    # Preprocess data (the fitted PCA / scaler objects are not needed afterwards)
    if use_pca:
        data, _ = apply_pca(data)

    data_tensor, _ = prepare_data(data, scaler_type=scaler_type)
    train_data, val_data = train_test_split(data_tensor, test_size=0.2, random_state=42)

    train_loader = DataLoader(TensorDataset(train_data), batch_size=batch_size, shuffle=True)
    val_loader = DataLoader(TensorDataset(val_data), batch_size=batch_size, shuffle=False)

    model = SparseAutoencoder(input_dim=data.shape[1], sparsity_weight=sparsity_weight)
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model.to(device)

    criterion = nn.SmoothL1Loss()
    optimizer = optim.Adam(model.parameters(), lr=learning_rate)
    stopper = early_stopping(patience=patience)

    for epoch in range(epochs):
        model.train()
        train_loss = 0.0
        for batch in train_loader:
            batch = batch[0].to(device)
            optimizer.zero_grad()
            encoded, outputs = model(batch)

            # Reconstruction loss plus sparsity penalty
            # (L1 regularization on the encoded activations)
            l1_penalty = torch.mean(torch.abs(encoded))
            loss = criterion(outputs, batch) + sparsity_weight * l1_penalty

            loss.backward()
            optimizer.step()
            # Weight by batch size so the epoch value is a per-sample mean
            train_loss += loss.item() * batch.size(0)

        train_loss /= len(train_loader.dataset)

        # Validation (reconstruction loss only, no sparsity term)
        model.eval()
        val_loss = 0.0
        with torch.no_grad():
            for batch in val_loader:
                batch = batch[0].to(device)
                _, outputs = model(batch)
                loss = criterion(outputs, batch)
                val_loss += loss.item() * batch.size(0)

        val_loss /= len(val_loader.dataset)
        print(f"Epoch {epoch+1}, Train Loss: {train_loss:.4f}, Validation Loss: {val_loss:.4f}")

        # Early stopping
        stopper(val_loss)
        if stopper.early_stop:
            print(f"Early stopping at epoch {epoch + 1}")
            break

    # Final pass over the full dataset. Inference only, so no autograd graph
    # is built for the whole dataset.
    model.eval()
    with torch.no_grad():
        _, data_decoded = model(data_tensor.to(device))
    data_decoded = data_decoded.cpu().numpy()

    # Convert back to a DataFrame for consistency with the rest of the pipeline
    df_encoded = pd.DataFrame(data_decoded, columns=[f'feature_{i}' for i in range(data_decoded.shape[1])])
    return df_encoded

# Usage example
# Assuming 'data' is your input dataset as a NumPy array or pandas DataFrame.
# df_encoded = perform_autoencoder(data, epochs=100, batch_size=32, learning_rate=0.001, patience=10, scaler_type='StandardScaler', use_pca=True, sparsity_weight=1e-5)


In [6]:
# Encode time series data
# NOTE(review): the two calls below fit two independent autoencoders (and
# scalers), one on the train statistics and one on the test statistics, so
# `feature_i` in train and `feature_i` in test come from different models.
# Confirm this is intended.
autoencoder_settings = dict(
    epochs=100, batch_size=32, learning_rate=0.001, patience=10,
    use_pca=False, scaler_type='MinMaxScaler', sparsity_weight=1e-5,
)
train_ts_encoded = perform_autoencoder(df_train, **autoencoder_settings)
test_ts_encoded = perform_autoencoder(df_test, **autoencoder_settings)

# Re-attach the ids (aligned on the row index) so the features can be joined
train_ts_encoded["id"] = train_ts["id"]
test_ts_encoded["id"] = test_ts["id"]

# Left-join onto the tabular data, then drop the join key
train = train.merge(train_ts_encoded, how="left", on='id').drop(columns='id')
test = test.merge(test_ts_encoded, how="left", on='id').drop(columns='id')

Epoch 1, Train Loss: 0.0480, Validation Loss: 0.0419
Epoch 2, Train Loss: 0.0299, Validation Loss: 0.0214
Epoch 3, Train Loss: 0.0192, Validation Loss: 0.0184
Epoch 4, Train Loss: 0.0182, Validation Loss: 0.0182
Epoch 5, Train Loss: 0.0180, Validation Loss: 0.0179
Epoch 6, Train Loss: 0.0179, Validation Loss: 0.0178
Epoch 7, Train Loss: 0.0176, Validation Loss: 0.0173
Epoch 8, Train Loss: 0.0166, Validation Loss: 0.0157
Epoch 9, Train Loss: 0.0141, Validation Loss: 0.0129
Epoch 10, Train Loss: 0.0122, Validation Loss: 0.0116
Epoch 11, Train Loss: 0.0113, Validation Loss: 0.0108
Epoch 12, Train Loss: 0.0108, Validation Loss: 0.0105
Epoch 13, Train Loss: 0.0105, Validation Loss: 0.0101
Epoch 14, Train Loss: 0.0103, Validation Loss: 0.0100
Epoch 15, Train Loss: 0.0100, Validation Loss: 0.0097
Epoch 16, Train Loss: 0.0097, Validation Loss: 0.0090
Epoch 17, Train Loss: 0.0087, Validation Loss: 0.0077
Epoch 18, Train Loss: 0.0072, Validation Loss: 0.0061
Epoch 19, Train Loss: 0.0062, Validat

In [7]:
# Skew removal for some columns
# Columns that get a Box-Cox transformed companion column:
# the BIA measurements, the CGAS score and selected autoencoder features.
skewed_columns = (
    [
        f'BIA-BIA_{measure}'
        for measure in ('BMC', 'BMR', 'DEE', 'ECW', 'Fat', 'FFM', 'FFMI', 'FMI', 'ICW', 'LDM', 'LST', 'TBW')
    ]
    + ['CGAS-CGAS_Score']
    + [f'feature_{idx}' for idx in (23, 35, 38, 40, 47, 54, 66, 78, 80, 88, 90)]
)
# column name -> Box-Cox lambda (filled while transforming the train data)
lambda_params = dict()

# Define the box-cox function to remove skew
def box_cox_transform(df, column, lambda_param=None):
    """Box-Cox transform one column on a NaN-free copy of `df`.

    Rows where `column` is NaN are dropped from the copy. If the smallest
    remaining value is <= 0, the column (in the copy) is shifted so that its
    minimum becomes 1. The transformed values are stored in `<column>_boxcox`.

    Args:
        df: Source frame; it is not modified.
        column: Name of the column to transform.
        lambda_param: Box-Cox lambda to apply. When None, the lambda is
            estimated from the data by `stats.boxcox`.

    Returns:
        (copy_with_boxcox_column, lambda_param)
    """
    target = f'{column}_boxcox'

    # Work on a copy that only holds the rows with a value in `column`
    df_copy = df.dropna(subset=[column]).copy()

    # Box-Cox needs strictly positive input: shift so the minimum becomes 1
    smallest = df_copy[column].min()
    if smallest <= 0:
        df_copy[column] = df_copy[column] - smallest + 1

    if lambda_param is not None:
        # Re-use a previously fitted lambda
        df_copy[target] = stats.boxcox(df_copy[column], lmbda=lambda_param)
        print(f"Applying transformation to column: {column} with lambda: {lambda_param}")
    else:
        # Fit lambda on this data
        transformed, lambda_param = stats.boxcox(df_copy[column])
        df_copy[target] = transformed
        print(f"Transforming column: {column}")
        print(f"Optimal lambda for Box-Cox transformation: {lambda_param}")

    print(f"Number of rows before transformation: {len(df)}")
    print(f"Number of rows after removing NaN values: {len(df_copy)}")

    return df_copy, lambda_param

# Fit Box-Cox on the train data and remember each column's lambda.
# Rows that were NaN in the source column stay NaN in the new column because
# the assignment aligns on the row index.
for column in skewed_columns:
    transformed_train_data, fitted_lambda = box_cox_transform(train, column)
    lambda_params[column] = fitted_lambda
    result_column = f'{column}_boxcox'
    train[result_column] = transformed_train_data[result_column]

# Transform the test data with the lambdas fitted on train.
# NOTE(review): box_cox_transform derives its positivity shift from the frame
# it is given, so a test column containing values <= 0 is shifted by the test
# minimum rather than the train minimum.
for column in skewed_columns:
    transformed_test_data, _ = box_cox_transform(test, column, lambda_param=lambda_params[column])
    result_column = f'{column}_boxcox'
    test[result_column] = transformed_test_data[result_column]

# Function to handle infinite values
def replace_inf_with_max(df):
    """Replace +inf and -inf in every float64 column with that column's largest finite value.

    The frame is modified in place and also returned. Columns of any other
    dtype are left untouched.
    """
    float_columns = [name for name in df.columns if df[name].dtype == 'float64']
    for name in float_columns:
        values = df[name]
        finite_max = values[~np.isinf(values)].max()
        df[name] = values.replace([np.inf, -np.inf], finite_max)
    return df

# Swap +/-inf for each float column's largest finite value. The helper edits
# its argument in place and returns it, so `train_data` / `test_data` refer to
# the same objects as `train` / `test`.
train_data, test_data = replace_inf_with_max(train), replace_inf_with_max(test)

Transforming column: BIA-BIA_BMC
Optimal lambda for Box-Cox transformation: -0.26544288750244394
Number of rows before transformation: 3960
Number of rows after removing NaN values: 1991
Transforming column: BIA-BIA_BMR
Optimal lambda for Box-Cox transformation: -2.024016452566404
Number of rows before transformation: 3960
Number of rows after removing NaN values: 1991
Transforming column: BIA-BIA_DEE
Optimal lambda for Box-Cox transformation: -0.9862196352522961
Number of rows before transformation: 3960
Number of rows after removing NaN values: 1991
Transforming column: BIA-BIA_ECW
Optimal lambda for Box-Cox transformation: -0.11312798067663181
Number of rows before transformation: 3960
Number of rows after removing NaN values: 1991
Transforming column: BIA-BIA_Fat
Optimal lambda for Box-Cox transformation: 27.718481796974547
Number of rows before transformation: 3960
Number of rows after removing NaN values: 1991
Transforming column: BIA-BIA_FFM
Optimal lambda for Box-Cox transforma

In [8]:
# Preview the first 10 train rows, including the merged `feature_*` and `*_boxcox` columns.
train.head(10)

Unnamed: 0,Basic_Demos-Enroll_Season,Basic_Demos-Age,Basic_Demos-Sex,CGAS-Season,CGAS-CGAS_Score,Physical-Season,Physical-BMI,Physical-Height,Physical-Weight,Physical-Waist_Circumference,Physical-Diastolic_BP,Physical-HeartRate,Physical-Systolic_BP,Fitness_Endurance-Season,Fitness_Endurance-Max_Stage,Fitness_Endurance-Time_Mins,Fitness_Endurance-Time_Sec,FGC-Season,FGC-FGC_CU,FGC-FGC_CU_Zone,FGC-FGC_GSND,FGC-FGC_GSND_Zone,FGC-FGC_GSD,FGC-FGC_GSD_Zone,FGC-FGC_PU,FGC-FGC_PU_Zone,FGC-FGC_SRL,FGC-FGC_SRL_Zone,FGC-FGC_SRR,FGC-FGC_SRR_Zone,FGC-FGC_TL,FGC-FGC_TL_Zone,BIA-Season,BIA-BIA_Activity_Level_num,BIA-BIA_BMC,BIA-BIA_BMI,BIA-BIA_BMR,BIA-BIA_DEE,BIA-BIA_ECW,BIA-BIA_FFM,BIA-BIA_FFMI,BIA-BIA_FMI,BIA-BIA_Fat,BIA-BIA_Frame_num,BIA-BIA_ICW,BIA-BIA_LDM,BIA-BIA_LST,BIA-BIA_SMM,BIA-BIA_TBW,PAQ_A-Season,PAQ_A-PAQ_A_Total,PAQ_C-Season,PAQ_C-PAQ_C_Total,PCIAT-Season,PCIAT-PCIAT_01,PCIAT-PCIAT_02,PCIAT-PCIAT_03,PCIAT-PCIAT_04,PCIAT-PCIAT_05,PCIAT-PCIAT_06,PCIAT-PCIAT_07,PCIAT-PCIAT_08,PCIAT-PCIAT_09,PCIAT-PCIAT_10,PCIAT-PCIAT_11,PCIAT-PCIAT_12,PCIAT-PCIAT_13,PCIAT-PCIAT_14,PCIAT-PCIAT_15,PCIAT-PCIAT_16,PCIAT-PCIAT_17,PCIAT-PCIAT_18,PCIAT-PCIAT_19,PCIAT-PCIAT_20,PCIAT-PCIAT_Total,SDS-Season,SDS-SDS_Total_Raw,SDS-SDS_Total_T,PreInt_EduHx-Season,PreInt_EduHx-computerinternet_hoursday,sii,feature_0,feature_1,feature_2,feature_3,feature_4,feature_5,feature_6,feature_7,feature_8,feature_9,feature_10,feature_11,feature_12,feature_13,feature_14,feature_15,feature_16,feature_17,feature_18,feature_19,feature_20,feature_21,feature_22,feature_23,feature_24,feature_25,feature_26,feature_27,feature_28,feature_29,feature_30,feature_31,feature_32,feature_33,feature_34,feature_35,feature_36,feature_37,feature_38,feature_39,feature_40,feature_41,feature_42,feature_43,feature_44,feature_45,feature_46,feature_47,feature_48,feature_49,feature_50,feature_51,feature_52,feature_53,feature_54,feature_55,feature_56,feature_57,feature_58,feature_59,feature_60,feature_61,feature_62,feature_63,feature_64,fe
ature_65,feature_66,feature_67,feature_68,feature_69,feature_70,feature_71,feature_72,feature_73,feature_74,feature_75,feature_76,feature_77,feature_78,feature_79,feature_80,feature_81,feature_82,feature_83,feature_84,feature_85,feature_86,feature_87,feature_88,feature_89,feature_90,feature_91,feature_92,feature_93,feature_94,feature_95,BIA-BIA_BMC_boxcox,BIA-BIA_BMR_boxcox,BIA-BIA_DEE_boxcox,BIA-BIA_ECW_boxcox,BIA-BIA_Fat_boxcox,BIA-BIA_FFM_boxcox,BIA-BIA_FFMI_boxcox,BIA-BIA_FMI_boxcox,BIA-BIA_ICW_boxcox,BIA-BIA_LDM_boxcox,BIA-BIA_LST_boxcox,BIA-BIA_TBW_boxcox,CGAS-CGAS_Score_boxcox,feature_23_boxcox,feature_35_boxcox,feature_38_boxcox,feature_40_boxcox,feature_47_boxcox,feature_54_boxcox,feature_66_boxcox,feature_78_boxcox,feature_80_boxcox,feature_88_boxcox,feature_90_boxcox
0,Fall,5,0,Winter,51.0,Fall,16.877316,46.0,50.8,,,,,,,,,Fall,0.0,0.0,,,,,0.0,0.0,7.0,0.0,6.0,0.0,6.0,1.0,Fall,2.0,2.66855,16.8792,932.498,1492.0,8.25598,41.5862,13.8177,3.06143,9.21377,1.0,24.4349,8.89536,38.9177,19.5413,32.6909,,,,,Fall,5.0,4.0,4.0,0.0,4.0,0.0,0.0,4.0,0.0,0.0,4.0,0.0,4.0,4.0,4.0,4.0,4.0,4.0,2.0,4.0,55.0,,,,Fall,3.0,2.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,1.795346,0.494067,1.013221,1.877806,6.776256e+107,1.210694,0.378516,13083250000000.0,0.836481,1.443716,1.461227,1.296982,2.887303,,,,,,,,,,,
1,Summer,9,0,,,Fall,14.03559,48.0,46.0,22.0,75.0,70.0,122.0,,,,,Fall,3.0,0.0,,,,,5.0,0.0,11.0,1.0,11.0,1.0,3.0,0.0,Winter,2.0,2.57949,14.0371,936.656,1498.65,6.01993,42.0291,12.8254,1.21172,3.97085,1.0,21.0352,14.974,39.4497,15.4107,27.0552,,,Fall,2.34,Fall,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Fall,46.0,64.0,Summer,0.0,0.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,1.791257,0.494067,1.013225,1.624546,6.664675e+107,1.211268,0.378436,12361790000000.0,0.832549,1.635205,1.462669,1.279664,,,,,,,,,,,,
2,Summer,10,1,Fall,71.0,Fall,16.648696,56.5,75.6,,65.0,94.0,117.0,Fall,5.0,7.0,33.0,Fall,20.0,1.0,10.2,1.0,14.7,2.0,7.0,1.0,10.0,1.0,10.0,1.0,5.0,0.0,,,,,,,,,,,,,,,,,,,,Summer,2.17,Fall,5.0,2.0,2.0,1.0,2.0,1.0,1.0,2.0,1.0,1.0,1.0,0.0,1.0,1.0,1.0,0.0,2.0,2.0,1.0,1.0,28.0,Fall,38.0,54.0,Summer,2.0,0.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,3.054867,,,,,,,,,,,
3,Winter,9,0,Fall,71.0,Summer,18.292347,56.0,81.6,,60.0,97.0,117.0,Summer,6.0,9.0,37.0,Summer,18.0,1.0,,,,,5.0,0.0,7.0,0.0,7.0,0.0,7.0,1.0,Summer,3.0,3.84191,18.2943,1131.43,1923.44,15.5925,62.7757,14.074,4.22033,18.8243,2.0,30.4041,16.779,58.9338,26.4798,45.9966,,,Winter,2.451,Summer,4.0,2.0,4.0,0.0,5.0,1.0,0.0,3.0,2.0,2.0,3.0,0.0,3.0,0.0,0.0,3.0,4.0,3.0,4.0,1.0,44.0,Summer,31.0,45.0,Winter,0.0,1.0,0.059962,0.063853,0.061115,0.057507,0.063705,0.058935,0.063302,0.06184,0.066272,0.059864,0.061409,0.061272,0.482902,0.449879,0.477025,0.360325,0.486286,0.000282,0.194878,0.761338,0.527233,0.599881,0.673393,0.202446,0.669721,0.634308,0.545043,0.321878,0.4432,0.00045,0.323264,0.383129,0.43089,0.631043,0.14509,0.218223,0.782131,0.62118,0.988499,6e-06,0.031951,0.000417,0.000496,0.704413,0.010682,0.005581,0.517803,0.201068,0.386576,0.306167,0.229836,0.188153,0.293499,2.761214e-08,0.021041,0.690482,0.523655,0.348302,0.622966,0.187875,0.462905,0.494512,0.448863,0.408981,0.460584,4.503229e-08,0.070129,0.739498,0.635275,0.523313,0.701488,0.199662,0.549184,0.654837,0.668681,0.423425,0.617634,4.158378e-07,0.062502,0.830508,0.581014,0.718914,0.73783,0.199819,0.164361,0.159703,0.198518,0.296697,0.994268,0.000273,0.101636,0.082785,0.983878,0.998519,0.821466,0.222241,1.845723,0.494067,1.013388,2.360996,6.985483e+107,1.229847,0.378534,13552920000000.0,0.841114,1.671884,1.5003,1.322946,3.054867,-0.935339,-1.227946,-0.009383,-5.353568,-1.031675,-5.306281,-2.761255,-1.590256,-1.645198,-0.004695,-0.641349
4,Spring,18,1,Summer,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,Summer,1.04,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
5,Spring,13,1,Winter,50.0,Summer,22.279952,59.5,112.2,,60.0,73.0,102.0,,,,,Summer,12.0,0.0,16.5,2.0,17.9,2.0,6.0,0.0,10.0,1.0,11.0,1.0,8.0,0.0,Summer,2.0,4.33036,30.1865,1330.97,1996.45,30.2124,84.0285,16.6877,13.4988,67.9715,2.0,32.9141,20.902,79.6982,35.3804,63.1265,,,Spring,4.11,Summer,3.0,3.0,3.0,0.0,2.0,1.0,0.0,2.0,2.0,1.0,0.0,1.0,3.0,3.0,2.0,1.0,3.0,1.0,2.0,1.0,34.0,Summer,40.0,56.0,Spring,0.0,1.0,0.545608,0.564775,0.551163,0.546069,0.54449,0.541436,0.538521,0.550583,0.549428,0.564151,0.55119,0.557732,0.534399,0.466696,0.247047,0.111138,0.275329,0.651414,0.105707,0.395468,0.272272,0.588416,0.686763,0.257021,0.381962,0.49778,0.63104,0.161748,0.60355,0.860121,0.166548,0.677725,0.706236,0.727261,0.212168,0.156329,0.768742,0.599602,0.984312,0.000559,0.04066,2.7e-05,2e-05,0.040235,6e-06,0.001475,0.502317,0.256872,0.643131,0.439047,0.076362,0.022589,0.091091,0.04402062,0.046633,0.365689,0.190693,0.188637,0.567003,0.254875,0.510128,0.488092,0.153818,0.07395,0.203218,0.9812259,0.070672,0.315606,0.410554,0.480558,0.677895,0.244116,0.40882,0.574972,0.367852,0.082781,0.374608,0.9999955,0.055663,0.465196,0.552358,0.821534,0.740147,0.260989,0.158272,0.18059,0.207435,0.310252,0.991287,1.0,0.101612,0.095628,0.999995,0.999714,0.793989,0.257868,1.864977,0.494067,1.013409,2.828091,8.156581e+107,1.240129,0.378661,17846420000000.0,0.842524,1.738032,1.523098,1.342026,2.876979,-0.855591,-1.431325,-0.011925,-4.818598,-0.930557,-3.935685,-2.752938,-1.623008,-2.022604,-0.006468,-0.641357
6,Fall,10,0,,,Fall,19.66076,55.0,84.6,,123.0,83.0,163.0,,,,,Fall,9.0,1.0,,,,,2.0,0.0,11.0,1.0,11.0,1.0,11.0,1.0,Fall,2.0,3.78271,19.6629,1135.86,1817.38,16.3275,63.247,14.7,4.96291,21.353,2.0,30.8936,16.0259,59.4643,26.1957,47.2211,,,Winter,3.67,Winter,1.0,4.0,1.0,0.0,2.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,1.0,4.0,1.0,0.0,20.0,Winter,27.0,40.0,Fall,3.0,0.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,1.843325,0.494067,1.013354,2.394666,7.041561e+107,1.230141,0.378572,13861210000000.0,0.841408,1.65729,1.501039,1.324696,,,,,,,,,,,,
7,Fall,10,1,,,Fall,16.861286,59.25,84.2,27.0,71.0,90.0,116.0,,,,,Fall,0.0,0.0,12.6,2.0,11.1,1.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,0.0,Fall,3.0,4.05726,16.8631,1180.04,1888.06,21.94,67.9527,13.6092,3.25395,16.2474,2.0,28.5367,17.476,63.8954,28.768,50.4767,,,Fall,1.27,,,,,,,,,,,,,,,,,,,,,,,,,,Fall,2.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,1.854326,0.494067,1.013377,2.606525,6.92878e+107,1.232877,0.378501,13160320000000.0,0.83989,1.684591,1.506823,1.328996,,,,,,,,,,,,
8,Summer,15,0,,,Spring,,,,,,,,,,,,Spring,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,Summer,2.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
9,Summer,19,1,Summer,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,


In [9]:
# Feature engineering
def feature_engineering(df):
    """Add derived feature columns to `df` in place and return it.

    Expects the raw tabular columns, `feature_0 ... feature_11` and the
    `*_boxcox` columns created earlier. Missing inputs propagate as NaN; ratios
    with a zero denominator produce +/-inf.

    Changed from the earlier version: `HeartRate_FitnessTime` now uses the
    endurance time in total seconds (`Time_Mins * 60 + Time_Sec`). Previously
    minutes and seconds were added without unit conversion.
    """
    age = df['Basic_Demos-Age']
    sex = df['Basic_Demos-Sex']
    bmi = df['Physical-BMI']
    weight = df['Physical-Weight']
    heart_rate = df['Physical-HeartRate']
    internet_hours = df['PreInt_EduHx-computerinternet_hoursday']
    fat = df['BIA-BIA_Fat']
    cgas = df['CGAS-CGAS_Score']
    max_stage = df['Fitness_Endurance-Max_Stage']

    # Combine all grip strength
    df['FGC-FGC_GS'] = df['FGC-FGC_GSD_Zone'] + df['FGC-FGC_GSND_Zone']

    # Combine all sit and reach
    df['FGC-FGC_SR'] = df['FGC-FGC_SRL_Zone'] + df['FGC-FGC_SRR_Zone']

    # Create a fitness score by adding the zone fitness data
    df['fitness_score'] = df['FGC-FGC_GS'] + df['FGC-FGC_SR'] + df['FGC-FGC_CU_Zone'] + df['FGC-FGC_PU_Zone'] + df['FGC-FGC_TL_Zone']

    # Combine PAQ_A-PAQ_A_Total and PAQ_C-PAQ_C_Total into one column
    # (PAQ_A wins where both are present)
    df['PAQ_Total'] = df['PAQ_A-PAQ_A_Total'].combine_first(df['PAQ_C-PAQ_C_Total'])

    # Sum feature_0 .. feature_11, added left to right; any NaN makes the sum NaN
    combined = df['feature_0']
    for idx in range(1, 12):
        combined = combined + df[f'feature_{idx}']
    df['combined_actigraphy_stat'] = combined

    # Reworking of features from other notebook
    df['BMI_Age'] = bmi * age

    df['Internet_Hours_Age'] = internet_hours * age
    df['Age_Internet_Hours'] = internet_hours / age
    df['BMI_Internet_Hours'] = bmi * internet_hours
    df['BIA_BMI_Internet_Hours_Age'] = (df['BIA-BIA_BMI'] * internet_hours) / age

    df['BFP_BMI'] = fat / df['BIA-BIA_BMI']
    df['FFMI_BFP'] = df['BIA-BIA_FFMI_boxcox'] / fat
    df['FMI_BFP'] = df['BIA-BIA_FMI_boxcox'] / fat
    df['LST_TBW'] = df['BIA-BIA_LST_boxcox'] / df['BIA-BIA_TBW_boxcox']
    df['BFP_BMR'] = fat * df['BIA-BIA_BMR_boxcox']
    df['BFP_DEE'] = fat * df['BIA-BIA_DEE_boxcox']

    df['BMR_Weight'] = df['BIA-BIA_BMR_boxcox'] / weight
    df['DEE_Weight'] = df['BIA-BIA_DEE_boxcox'] / weight
    df['SMM_Height'] = df['BIA-BIA_SMM'] / df['Physical-Height']

    df['Muscle_to_Fat'] = df['BIA-BIA_SMM'] / df['BIA-BIA_FMI_boxcox']
    df['Hydration_Status'] = df['BIA-BIA_TBW_boxcox'] / weight
    df['ICW_TBW'] = df['BIA-BIA_ICW_boxcox'] / df['BIA-BIA_TBW_boxcox']
    df['Age_Weight'] = age * weight
    df['Sex_BMI'] = sex * bmi
    df['Sex_HeartRate'] = sex * heart_rate
    df['Age_WaistCirc'] = age * df['Physical-Waist_Circumference']
    df['BMI_FitnessMaxStage'] = bmi * max_stage
    df['Weight_GripStrengthDominant'] = weight * df['FGC-FGC_GSD']
    df['Weight_GripStrengthNonDominant'] = weight * df['FGC-FGC_GSND']

    # Total endurance time in seconds: minutes must be converted before they
    # are combined with the seconds component.
    endurance_seconds = df['Fitness_Endurance-Time_Mins'] * 60 + df['Fitness_Endurance-Time_Sec']
    df['HeartRate_FitnessTime'] = heart_rate * endurance_seconds

    df['Age_PushUp'] = age * df['FGC-FGC_PU']
    df['FFMI_Age'] = df['BIA-BIA_FFMI_boxcox'] * age
    df['InternetUse_SleepDisturbance'] = internet_hours * df['SDS-SDS_Total_Raw']
    df['CGAS_BMI'] = cgas * bmi
    df['CGAS_FitnessMaxStage'] = cgas * max_stage

    return df

In [10]:
# Categorical "season" columns: the enrolment season plus one per instrument
cat_c = ['Basic_Demos-Enroll_Season'] + [
    f'{instrument}-Season'
    for instrument in ('CGAS', 'Physical', 'Fitness_Endurance', 'FGC', 'BIA',
                       'PAQ_A', 'PAQ_C', 'SDS', 'PreInt_EduHx')
]

def update(df):
    """Fill missing values in the `cat_c` columns with 'Missing' and cast them to category.

    Modifies `df` in place and returns it.
    """
    for season_col in cat_c:
        df[season_col] = df[season_col].fillna('Missing').astype('category')
    return df
        
# Normalise the season columns in both frames (train first, then test)
train, test = update(train), update(test)

def create_mapping(column, dataset):
    """Map each distinct value of `dataset[column]` to its order of first appearance (0, 1, 2, ...)."""
    distinct = dataset[column].unique()
    return dict(zip(distinct, range(len(distinct))))

# Integer-encode the season columns with ONE mapping shared by train and test.
# Building a separate mapping per frame (from order of first appearance) could
# assign different integers to the same season in train and test.
for col in cat_c:
    # Train defines the codes, so train's encoding is unchanged.
    mapping = create_mapping(col, train)
    # Values that occur only in test get the next free codes.
    for value in test[col].unique():
        mapping.setdefault(value, len(mapping))
    mappingTe = mapping  # kept so the name still exists; now identical to `mapping`

    # Map through object dtype so the result is a plain integer column.
    train[col] = train[col].astype(object).map(mapping).astype(int)
    test[col] = test[col].astype(object).map(mapping).astype(int)

In [11]:
# Fill missing numeric values in the train frame with a 5-nearest-neighbours imputer.
imputer = KNNImputer(n_neighbors=5)
numeric_cols = train.select_dtypes(include=['float64', 'int64']).columns
imputed_data = imputer.fit_transform(train[numeric_cols])
train_imputed = pd.DataFrame(imputed_data, columns=numeric_cols)
# NOTE(review): 'sii' is among the numeric columns, so missing target values
# are filled by the imputer and then rounded to an integer class. Confirm that
# imputing the label is intended.
train_imputed['sii'] = train_imputed['sii'].round().astype(int)
# Carry over every column that was not part of the numeric selection.
for col in train.columns:
    if col not in numeric_cols:
        train_imputed[col] = train[col]
        
train = train_imputed

# Derived features are computed after imputation for train. The test frame is
# feature-engineered without being imputed in this cell.
train = feature_engineering(train)
# Keep only rows with at least 10 non-missing values.
train = train.dropna(thresh=10, axis=0)
test = feature_engineering(test)

In [None]:
# Preview the first 10 train rows after imputation and feature engineering.
train.head(10)

In [None]:
# Destination files inside the output folder
train_output_path = os.path.join(output_folder, 'train_data_imputed.csv')
test_output_path = os.path.join(output_folder, 'test_data_imputed.csv')

# Export train to CSV
train.to_csv(train_output_path, index=False)
print(f"Imputed train data exported to: {train_output_path}")

# Export test to CSV
test.to_csv(test_output_path, index=False)
print(f"Imputed test data exported to: {test_output_path}")

# Independent copies of the data for other submissions
train2, train3 = train.copy(), train.copy()
test2, test3 = test.copy(), test.copy()

print("Data export completed.")

In [14]:
# Feature selection
# Autoencoder columns used by the model, in the order given here.
# Removed stats: 'feature_92', 'combined_actigraphy_stat'.
# Indices 23, 35, 38, 40, 47, 54, 66, 78, 80, 88 and 90 are used through their
# Box-Cox versions (second part of the list) instead of the raw columns.
# The index order below is deliberate: 39 comes after 42.
time_series_cols = [
    f'feature_{idx}'
    for idx in (
        *range(0, 23), *range(24, 35),
        36, 37, 41, 42, 39, 43, 44, 45, 46,
        *range(48, 54), *range(55, 66), *range(67, 78),
        79, *range(81, 88), 89, 91, 93, 94, 95,
    )
] + [
    f'feature_{idx}_boxcox'
    for idx in (23, 35, 38, 40, 47, 54, 66, 78, 80, 88, 90)
]

# Columns kept for modelling. The train selection includes the target 'sii'.
# Dropped columns: 'FGC-FGC_CU', 'FGC-FGC_SRR', 'FGC-FGC_SRR_Zone', 'Fitness_Endurance-Time_Sec'
featuresCols = [
    'Basic_Demos-Age', 'Basic_Demos-Sex', 'FGC-FGC_GS', 'FGC-FGC_SR',
    'CGAS-CGAS_Score', 'Physical-BMI', 'fitness_score', 'FGC-FGC_PU',
    'Physical-Height', 'Physical-Weight', 'Physical-Waist_Circumference',
    'Physical-Diastolic_BP', 'Physical-HeartRate', 'Physical-Systolic_BP',
    'Fitness_Endurance-Max_Stage', 'Fitness_Endurance-Time_Mins', 'FGC-FGC_CU_Zone',
    'FGC-FGC_GSND', 'FGC-FGC_GSD', 'FGC-FGC_PU_Zone', 'FGC-FGC_TL', 'FGC-FGC_TL_Zone',
    'BIA-BIA_Activity_Level_num', 'BIA-BIA_BMC_boxcox', 'BIA-BIA_BMI',
    'BIA-BIA_BMR_boxcox', 'BIA-BIA_DEE_boxcox', 'BIA-BIA_ECW_boxcox', 'BIA-BIA_FFM_boxcox',
    'BIA-BIA_FFMI_boxcox', 'BIA-BIA_FMI_boxcox', 'BIA-BIA_Fat', 'BIA-BIA_Frame_num',
    'BIA-BIA_ICW_boxcox', 'BIA-BIA_LDM_boxcox', 'BIA-BIA_LST_boxcox', 'BIA-BIA_SMM',
    'BIA-BIA_TBW', 'PAQ_A-PAQ_A_Total', 'PAQ_Total', 'PAQ_C-PAQ_C_Total', 'SDS-SDS_Total_Raw',
    'SDS-SDS_Total_T', 'PreInt_EduHx-computerinternet_hoursday', 'sii', 'BMI_Age',
    'Internet_Hours_Age', 'BMI_Internet_Hours', 'BFP_BMI', 'FFMI_BFP', 'FMI_BFP', 'LST_TBW',
    'BFP_BMR', 'BFP_DEE', 'BMR_Weight', 'DEE_Weight',
    'SMM_Height', 'Muscle_to_Fat', 'Hydration_Status', 'ICW_TBW', 'Age_Weight', 'Sex_BMI',
    'Sex_HeartRate', 'Age_WaistCirc',
    'BMI_FitnessMaxStage', 'Weight_GripStrengthDominant', 'Weight_GripStrengthNonDominant',
    'HeartRate_FitnessTime',
    'Age_PushUp', 'FFMI_Age', 'InternetUse_SleepDisturbance', 'CGAS_BMI', 'CGAS_FitnessMaxStage',
    'Age_Internet_Hours', 'BIA_BMI_Internet_Hours_Age',
]
featuresCols += time_series_cols

train = train[featuresCols]
train = train.dropna(subset='sii')

# The test frame uses exactly the same columns in the same order, minus the
# target. After this cell `featuresCols` holds the test (target-free) list.
featuresCols = [name for name in featuresCols if name != 'sii']
test = test[featuresCols]

In [None]:
# Preview the first 10 train rows after feature selection.
train.head(10)

In [None]:
# Preview the first 10 test rows after feature selection.
test.head(10)

In [17]:
# Replace inf
# The ratio features from feature_engineering yield +/-inf wherever a
# denominator is zero. Both frames receive those features, so both are cleaned
# (previously only `train` was). `replace` is a no-op when no inf is present,
# so no prior check is needed.
train = train.replace([np.inf, -np.inf], np.nan)
test = test.replace([np.inf, -np.inf], np.nan)

In [None]:
# Model parameters for tuning
def quadratic_weighted_kappa(y_true, y_pred):
    """Return Cohen's kappa between ``y_true`` and ``y_pred`` using quadratic weights."""
    kappa = cohen_kappa_score(y_true, y_pred, weights='quadratic')
    return kappa

def threshold_Rounder(oof_non_rounded, thresholds):
    """Bucket continuous scores into the classes 0-3 using three cut points.

    A score is assigned the index of the first threshold it is strictly below
    (checked in the order thresholds[0], thresholds[1], thresholds[2]); scores that
    are below none of them, including NaN, become 3.
    """
    first_cut, second_cut, third_cut = thresholds[0], thresholds[1], thresholds[2]
    is_below = [
        oof_non_rounded < first_cut,
        oof_non_rounded < second_cut,
        oof_non_rounded < third_cut,
    ]
    # np.select takes the first matching condition, mirroring a nested np.where chain.
    return np.select(is_below, [0, 1, 2], default=3)

def evaluate_predictions(thresholds, y_true, oof_non_rounded):
    """Loss for threshold search: the negated quadratic weighted kappa.

    The continuous predictions are discretised with ``thresholds`` and scored against
    ``y_true``; the sign is flipped so that a minimiser maximises the kappa.
    """
    discrete_preds = threshold_Rounder(oof_non_rounded, thresholds)
    kappa = quadratic_weighted_kappa(y_true, discrete_preds)
    return -kappa

# Hyperparameter tuning
def tune_hyperparameters(X, y, n_trials=400, seed=42):
    """Run one Optuna study per classifier family and collect the best parameters.

    Parameters
    ----------
    X, y :
        Features and target forwarded unchanged to ``objective``.
    n_trials : int, default 400
        Number of Optuna trials per model class (previously hard-coded).
    seed : int or None, default 42
        Seed for the TPE sampler so repeated runs explore the same trial sequence.
        Pass ``None`` to get an unseeded sampler.

    Returns
    -------
    dict
        Maps each model class name to ``study.best_params`` of its study.
    """
    best_params = {}

    for model_class in (LGBMClassifier, XGBClassifier, CatBoostClassifier):
        sampler = optuna.samplers.TPESampler(seed=seed)
        study = optuna.create_study(direction='maximize', sampler=sampler)
        # Bind model_class as a default so the callback does not depend on the loop variable.
        study.optimize(
            lambda trial, model_class=model_class: objective(trial, X, y, model_class),
            n_trials=n_trials,
        )
        best_params[model_class.__name__] = study.best_params

    return best_params

def objective(trial, X, y, model_class):
    """Optuna objective: mean 5-fold quadratic weighted kappa for one sampled configuration.

    Parameters
    ----------
    trial : optuna trial used to sample the hyperparameters.
    X, y : features and target passed to ``cross_val_score``.
    model_class : one of ``LGBMClassifier``, ``XGBClassifier``, ``CatBoostClassifier``.

    Returns
    -------
    float
        Mean of the per-fold scores.

    Raises
    ------
    ValueError
        If ``model_class`` is not one of the supported classes (previously this
        surfaced as an unbound ``params`` variable).
    """
    if model_class is LGBMClassifier:
        params = {
            'n_estimators': trial.suggest_int('n_estimators', 650, 900),
            'learning_rate': trial.suggest_float('learning_rate', 1e-3, 0.05, log=True),
            'num_leaves': trial.suggest_int('num_leaves', 700, 1200),
            'max_depth': trial.suggest_int('max_depth', 3, 15),
            'min_child_samples': trial.suggest_int('min_child_samples', 5, 100),
            'colsample_bytree': trial.suggest_float('colsample_bytree', 0.6, 1.0),
            'bagging_fraction': trial.suggest_float('bagging_fraction', 0.5, 1.0),
            'bagging_freq': trial.suggest_int('bagging_freq', 1, 10),
            'lambda_l1': trial.suggest_float('lambda_l1', 1e-8, 10, log=True),
            'lambda_l2': trial.suggest_float('lambda_l2', 1e-8, 10, log=True),
            # Fixed (not tuned): silence per-fit logging, as done for the regressors in this notebook.
            'verbose': -1,
        }
    elif model_class is XGBClassifier:
        params = {
            'max_depth': trial.suggest_int('max_depth', 7, 15),
            'learning_rate': trial.suggest_float('learning_rate', 1e-3, 0.1, log=True),
            'n_estimators': trial.suggest_int('n_estimators', 100, 500),
            'min_child_weight': trial.suggest_int('min_child_weight', 1, 10),
            'subsample': trial.suggest_float('subsample', 0.6, 1.0),
            'colsample_bytree': trial.suggest_float('colsample_bytree', 0.6, 1.0),
            'reg_alpha': trial.suggest_float('reg_alpha', 1e-8, 10.0, log=True),
            'reg_lambda': trial.suggest_float('reg_lambda', 1e-8, 1.0, log=True),
            'tree_method': 'exact'
        }
    elif model_class is CatBoostClassifier:
        params = {
            'iterations': trial.suggest_int('iterations', 300, 600),
            'learning_rate': trial.suggest_float('learning_rate', 1e-3, 0.1, log=True),
            'depth': trial.suggest_int('depth', 5, 10),
            'l2_leaf_reg': trial.suggest_float('l2_leaf_reg', 1, 10, log=True),
            'bootstrap_type': trial.suggest_categorical('bootstrap_type', ['Bayesian', 'Bernoulli', 'MVS']),
            'random_strength': trial.suggest_float('random_strength', 1e-9, 1),
            # Fixed (not tuned): silence per-iteration logging.
            'verbose': 0,
        }
        # These two parameters are only sampled for the bootstrap type they belong to.
        if params['bootstrap_type'] == 'Bayesian':
            params['bagging_temperature'] = trial.suggest_float('bagging_temperature', 0, 10)
        elif params['bootstrap_type'] == 'Bernoulli':
            params['subsample'] = trial.suggest_float('subsample', 0.1, 1)
    else:
        raise ValueError(f"Unsupported model class: {model_class!r}")

    model = model_class(**params, random_state=42)
    score = cross_val_score(model, X, y, cv=5, scoring=make_scorer(quadratic_weighted_kappa))
    return score.mean()

In [None]:
# Hypertuning model
def TrainML(X, y, X_test, n_splits=5, SEED=42):
    """Tune, cross-validate and apply a soft-voting classifier ensemble.

    Parameters
    ----------
    X : DataFrame
        Training features (indexed positionally with ``.iloc``).
    y : Series
        Training target; the code assumes four classes.
    X_test : DataFrame
        Rows to predict for the submission.
    n_splits : int, default 5
        Number of stratified folds.
    SEED : int, default 42
        Random state for the models and the fold splitter.

    Returns
    -------
    tuple
        ``(submission, overall_score, best_params)``: the submission frame, the
        out-of-fold QWK, and the tuned parameters (also written to
        ``best_params.json`` in ``output_folder``).
    """
    # The X and y arguments are used as passed; they were previously overwritten
    # from the notebook-level `train` frame, silently ignoring the caller's data.

    # Tune hyperparameters
    print("Tuning hyperparameters...")
    best_params = tune_hyperparameters(X, y)

    # Create models with tuned hyperparameters
    lgbm_model = LGBMClassifier(**best_params['LGBMClassifier'], random_state=SEED)
    xgb_model = XGBClassifier(**best_params['XGBClassifier'], random_state=SEED)
    catboost_model = CatBoostClassifier(**best_params['CatBoostClassifier'], random_state=SEED)

    # Create VotingClassifier
    voting_model = VotingClassifier([
        ('lgbm', lgbm_model),
        ('xgb', xgb_model),
        ('catboost', catboost_model)
    ], voting='soft')

    SKF = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=SEED)

    oof_preds = np.zeros((len(X), 4))  # 4 classes
    test_preds = np.zeros((len(X_test), 4))

    for fold, (train_idx, val_idx) in enumerate(tqdm(SKF.split(X, y), total=n_splits, desc="Training folds")):
        X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
        y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]

        voting_model.fit(X_train, y_train)

        oof_preds[val_idx] = voting_model.predict_proba(X_val)
        # Accumulate the fold-averaged class probabilities for the test rows.
        test_preds += voting_model.predict_proba(X_test) / n_splits

        val_preds = np.argmax(oof_preds[val_idx], axis=1)
        val_score = quadratic_weighted_kappa(y_val, val_preds)
        print(f"Fold {fold + 1} Validation QWK: {val_score:.4f}")

    oof_preds_class = np.argmax(oof_preds, axis=1)
    overall_score = quadratic_weighted_kappa(y, oof_preds_class)
    print(f"Overall QWK Score: {overall_score:.4f}")

    # NOTE(review): the other TrainML variants in this notebook read ids from `sample`;
    # confirm `sample_submission` is defined before this runs.
    submission = pd.DataFrame({
        'id': sample_submission['id'],
        'sii': np.argmax(test_preds, axis=1)
    })

    with open(os.path.join(output_folder, 'best_params.json'), 'w') as f:
        json.dump(best_params, f)

    return submission, overall_score, best_params

In [None]:
# TrainML(X, y, X_test) needs all three inputs and returns
# (submission, overall_score, best_params); unpack it so the frame can be saved.
Submission_hyper_tuned, hyper_tuned_qwk, hyper_tuned_params = TrainML(
    train.drop(['sii'], axis=1), train['sii'], test
)

# Save submission
Submission_hyper_tuned.to_csv('submission_1.csv', index=False)

In [18]:
# Shared settings read as globals by the TrainML function defined below.
SEED = 42  # random_state for the StratifiedKFold splitter and the boosted models
n_splits = 5  # number of stratified cross-validation folds

def TrainML(model_class, test_data, train_data=None):
    """Cross-validate a regressor, tune rounding thresholds and build a submission.

    Parameters
    ----------
    model_class : estimator
        Unfitted scikit-learn compatible regressor; it is cloned for every fold.
    test_data : DataFrame
        Feature rows to predict for the submission.
    train_data : DataFrame, optional
        Frame holding the features plus the ``sii`` target. Defaults to the
        notebook-level ``train`` frame, which is what this function always used.

    Returns
    -------
    DataFrame
        Columns ``id`` (taken from the notebook-level ``sample`` frame) and ``sii``
        (fold-averaged test predictions discretised with the tuned thresholds).

    Notes
    -----
    Reads the notebook-level ``n_splits`` and ``SEED``.
    """
    frame = train if train_data is None else train_data
    X = frame.drop(['sii'], axis=1)
    y = frame['sii']

    SKF = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=SEED)

    train_S = []
    test_S = []

    oof_non_rounded = np.zeros(len(y), dtype=float)
    test_preds = np.zeros((len(test_data), n_splits))

    for fold, (train_idx, val_idx) in enumerate(tqdm(SKF.split(X, y), desc="Training Folds", total=n_splits)):
        X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
        y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]

        # Fresh unfitted copy per fold so nothing carries over between folds.
        model = clone(model_class)
        model.fit(X_train, y_train)

        y_train_pred = model.predict(X_train)
        y_val_pred = model.predict(X_val)

        # Keep the raw out-of-fold predictions; thresholds are tuned on them below.
        oof_non_rounded[val_idx] = y_val_pred

        train_kappa = quadratic_weighted_kappa(y_train, y_train_pred.round(0).astype(int))
        val_kappa = quadratic_weighted_kappa(y_val, y_val_pred.round(0).astype(int))

        train_S.append(train_kappa)
        test_S.append(val_kappa)

        test_preds[:, fold] = model.predict(test_data)

        print(f"Fold {fold+1} - Train QWK: {train_kappa:.4f}, Validation QWK: {val_kappa:.4f}")
        clear_output(wait=True)

    print(f"Mean Train QWK --> {np.mean(train_S):.4f}")
    print(f"Mean Validation QWK ---> {np.mean(test_S):.4f}")

    # Search for the three cut points that maximise QWK on the out-of-fold predictions.
    KappaOPtimizer = minimize(evaluate_predictions,
                              x0=[0.5, 1.5, 2.5], args=(y, oof_non_rounded),
                              method='Nelder-Mead')
    assert KappaOPtimizer.success, "Optimization did not converge."

    oof_tuned = threshold_Rounder(oof_non_rounded, KappaOPtimizer.x)
    tKappa = quadratic_weighted_kappa(y, oof_tuned)

    print(f"----> || Optimized QWK SCORE :: {Fore.CYAN}{Style.BRIGHT} {tKappa:.3f}{Style.RESET_ALL}")

    # Average the per-fold test predictions, then apply the tuned thresholds.
    tpm = test_preds.mean(axis=1)
    tpTuned = threshold_Rounder(tpm, KappaOPtimizer.x)

    submission = pd.DataFrame({
        'id': sample['id'],
        'sii': tpTuned
    })

    return submission

In [19]:
# Model parameters from hyper-tuning
Params = dict(
    n_estimators=656,
    learning_rate=0.046,
    max_depth=12,
    num_leaves=478,
    min_data_in_leaf=13,
    feature_fraction=0.893,
    bagging_fraction=0.784,
    bagging_freq=4,
    lambda_l1=10,  # Increased from 6.59
    lambda_l2=0.01,  # Increased from 2.68e-06
)

XGB_Params = dict(
    learning_rate=0.05,
    max_depth=6,
    n_estimators=200,
    subsample=0.8,
    colsample_bytree=0.8,
    reg_alpha=1,  # Increased from 0.1
    reg_lambda=5,  # Increased from 1
    random_state=SEED,
    tree_method='exact',
)

CatBoost_Params = dict(
    learning_rate=0.05,
    depth=6,
    iterations=200,
    random_seed=SEED,
    verbose=0,
    l2_leaf_reg=10,  # Increase this value
)

# One regressor per boosting library, each built from its parameter set above.
Light = LGBMRegressor(random_state=SEED, verbose=-1, **Params)
XGB_Model = XGBRegressor(**XGB_Params)
CatBoost_Model = CatBoostRegressor(**CatBoost_Params)

# Combine the three regressors in a single VotingRegressor.
voting_model = VotingRegressor(
    estimators=[
        ('lightgbm', Light),
        ('xgboost', XGB_Model),
        ('catboost', CatBoost_Model),
    ]
)

In [None]:
# Cross-validate the voting ensemble and predict the test frame; TrainML returns a
# DataFrame with `id` and `sii` columns.
Submission1 = TrainML(voting_model, test)

# Save submission
Submission1.to_csv('submission_1.csv', index=False)

In [None]:
# Display the first submission frame.
Submission1

In [None]:
# Submission #2
# Time-series feature names kept for this submission (raw and Box-Cox variants).
time_series_cols = [
    'feature_0', 'feature_1', 'feature_2', 'feature_3', 'feature_4','feature_5', 'feature_6',
    'feature_7', 'feature_8', 'feature_9','feature_10', 'feature_11',
    'feature_12', 'feature_13', 'feature_14',
    'feature_15', 'feature_16', 'feature_17', 'feature_18', 'feature_19',
    'feature_20', 'feature_21', 'feature_22', 'feature_24',
    'feature_25', 'feature_26', 'feature_27', 'feature_28', 'feature_29',
    'feature_30', 'feature_31', 'feature_32', 'feature_33', 'feature_34',
    'feature_36', 'feature_37', 'feature_41', 'feature_42', 'feature_39',
    'feature_43', 'feature_44',
    'feature_45', 'feature_46', 'feature_48', 'feature_49',
    'feature_50', 'feature_51', 'feature_52', 'feature_53',
    'feature_55', 'feature_56', 'feature_57', 'feature_58', 'feature_59',
    'feature_60', 'feature_61', 'feature_62', 'feature_63', 'feature_64',
    'feature_65', 'feature_67', 'feature_68', 'feature_69',
    'feature_70', 'feature_71', 'feature_72', 'feature_73', 'feature_74',
    'feature_75', 'feature_76', 'feature_77', 'feature_79',
    'feature_81', 'feature_82', 'feature_83', 'feature_84',
    'feature_85', 'feature_86', 'feature_87', 'feature_89',
    'feature_91', 'feature_93', 'feature_94',
    'feature_95', 'feature_23_boxcox', 'feature_35_boxcox', 'feature_38_boxcox',
    'feature_40_boxcox', 'feature_47_boxcox', 'feature_54_boxcox',
    'feature_66_boxcox', 'feature_78_boxcox', 'feature_80_boxcox',
    'feature_88_boxcox', 'feature_90_boxcox'
]

# Training columns: tabular features plus the `sii` target.
featuresCols = ['Basic_Demos-Enroll_Season', 'Basic_Demos-Age', 'Basic_Demos-Sex',
                'CGAS-Season', 'CGAS-CGAS_Score', 'Physical-Season', 'Physical-BMI',
                'Physical-Height', 'Physical-Weight', 'Physical-Waist_Circumference',
                'Physical-Diastolic_BP', 'Physical-HeartRate', 'Physical-Systolic_BP',
                'Fitness_Endurance-Season', 'Fitness_Endurance-Max_Stage',
                'Fitness_Endurance-Time_Mins',
                'FGC-Season', 'FGC-FGC_CU', 'FGC-FGC_CU_Zone', 'FGC-FGC_GSND',
                'FGC-FGC_GSND_Zone', 'FGC-FGC_GSD', 'FGC-FGC_GSD_Zone', 'FGC-FGC_PU',
                'FGC-FGC_PU_Zone', 'FGC-FGC_SRL', 'FGC-FGC_SRL_Zone', 'FGC-FGC_SRR',
                'FGC-FGC_SRR_Zone', 'FGC-FGC_TL', 'FGC-FGC_TL_Zone', 'BIA-Season',
                'BIA-BIA_Activity_Level_num', 'BIA-BIA_BMC_boxcox', 'BIA-BIA_BMI',
                'BIA-BIA_BMR_boxcox', 'BIA-BIA_DEE_boxcox', 'BIA-BIA_ECW_boxcox', 'BIA-BIA_FFM_boxcox',
                'BIA-BIA_FFMI_boxcox', 'BIA-BIA_FMI_boxcox', 'BIA-BIA_Fat', 'BIA-BIA_Frame_num',
                'BIA-BIA_ICW_boxcox', 'BIA-BIA_LDM_boxcox', 'BIA-BIA_LST_boxcox', 'BIA-BIA_SMM',
                'BIA-BIA_TBW_boxcox', 'PAQ_A-Season', 'PAQ_A-PAQ_A_Total', 'PAQ_C-Season',
                'PAQ_C-PAQ_C_Total', 'SDS-Season', 'SDS-SDS_Total_Raw',
                'SDS-SDS_Total_T', 'PreInt_EduHx-Season', 'fitness_score',
                'PreInt_EduHx-computerinternet_hoursday', 'sii',
                'Age_Internet_Hours', 'BIA_BMI_Internet_Hours_Age']

featuresCols += time_series_cols

train2 = train2[featuresCols]
# Rows without a target cannot be used for supervised training.
train2 = train2.dropna(subset=['sii'])

# The test frame uses exactly the training columns minus the target. Deriving the list
# (instead of repeating the literal) keeps the two selections from drifting apart.
featuresCols = [col for col in featuresCols if col != 'sii']
test2 = test2[featuresCols]

# Season columns treated as categoricals: the enrolment season plus one
# '<instrument>-Season' column per instrument.
cat_c = ['Basic_Demos-Enroll_Season'] + [
    f'{instrument}-Season'
    for instrument in (
        'CGAS', 'Physical', 'Fitness_Endurance', 'FGC', 'BIA',
        'PAQ_A', 'PAQ_C', 'SDS', 'PreInt_EduHx',
    )
]

def update(df):
    """Fill missing values in the ``cat_c`` columns with 'Missing' and cast them to category.

    The frame is modified in place and also returned.
    """
    for season_col in cat_c:
        df[season_col] = df[season_col].fillna('Missing').astype('category')
    return df
        
# Fill and cast the season columns of both frames (update mutates and returns its argument).
train2 = update(train2)
test2 = update(test2)

def create_mapping(column, dataset):
    """Map each distinct value of ``dataset[column]`` to its order of first appearance."""
    codes = {}
    for code, value in enumerate(dataset[column].unique()):
        codes[value] = code
    return codes

for col in cat_c:
    # Build the category -> integer code mapping from the training data and reuse it for
    # the test data. Separate per-frame mappings (as before) can give the same season a
    # different code in train and test, so the models would see inconsistent values.
    mapping = create_mapping(col, train2)

    # Categories that only occur in the test frame get fresh codes after the training ones.
    for value in test2[col].unique():
        if value not in mapping:
            mapping[value] = len(mapping)

    # Go through object dtype so map() yields plain integer codes rather than a categorical.
    train2[col] = train2[col].astype(object).map(mapping).astype(int)
    test2[col] = test2[col].astype(object).map(mapping).astype(int)

def quadratic_weighted_kappa(y_true, y_pred):
    """Return Cohen's kappa between ``y_true`` and ``y_pred`` using quadratic weights."""
    kappa = cohen_kappa_score(y_true, y_pred, weights='quadratic')
    return kappa

def threshold_Rounder(oof_non_rounded, thresholds):
    """Bucket continuous scores into the classes 0-3 using three cut points.

    A score is assigned the index of the first threshold it is strictly below
    (checked in the order thresholds[0], thresholds[1], thresholds[2]); scores that
    are below none of them, including NaN, become 3.
    """
    first_cut, second_cut, third_cut = thresholds[0], thresholds[1], thresholds[2]
    is_below = [
        oof_non_rounded < first_cut,
        oof_non_rounded < second_cut,
        oof_non_rounded < third_cut,
    ]
    # np.select takes the first matching condition, mirroring a nested np.where chain.
    return np.select(is_below, [0, 1, 2], default=3)

def evaluate_predictions(thresholds, y_true, oof_non_rounded):
    """Loss for threshold search: the negated quadratic weighted kappa.

    The continuous predictions are discretised with ``thresholds`` and scored against
    ``y_true``; the sign is flipped so that a minimiser maximises the kappa.
    """
    discrete_preds = threshold_Rounder(oof_non_rounded, thresholds)
    kappa = quadratic_weighted_kappa(y_true, discrete_preds)
    return -kappa

def TrainML(model_class, test_data, train_data=None):
    """Cross-validate a regressor, tune rounding thresholds and build a submission.

    Parameters
    ----------
    model_class : estimator
        Unfitted scikit-learn compatible regressor; it is cloned for every fold.
    test_data : DataFrame
        Feature rows to predict for the submission.
    train_data : DataFrame, optional
        Frame holding the features plus the ``sii`` target. Defaults to the
        notebook-level ``train2`` frame, which is what this function always used.

    Returns
    -------
    DataFrame
        Columns ``id`` (taken from the notebook-level ``sample`` frame) and ``sii``
        (fold-averaged test predictions discretised with the tuned thresholds).

    Notes
    -----
    Reads the notebook-level ``n_splits`` and ``SEED``.
    """
    frame = train2 if train_data is None else train_data
    X = frame.drop(['sii'], axis=1)
    y = frame['sii']

    SKF = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=SEED)

    train2_S = []
    test2_S = []

    oof_non_rounded = np.zeros(len(y), dtype=float)
    test_preds = np.zeros((len(test_data), n_splits))

    for fold, (train_idx, val_idx) in enumerate(tqdm(SKF.split(X, y), desc="Training Folds", total=n_splits)):
        X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
        y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]

        # Fresh unfitted copy per fold so nothing carries over between folds.
        model = clone(model_class)
        model.fit(X_train, y_train)

        y_train_pred = model.predict(X_train)
        y_val_pred = model.predict(X_val)

        # Keep the raw out-of-fold predictions; thresholds are tuned on them below.
        oof_non_rounded[val_idx] = y_val_pred

        train_kappa = quadratic_weighted_kappa(y_train, y_train_pred.round(0).astype(int))
        val_kappa = quadratic_weighted_kappa(y_val, y_val_pred.round(0).astype(int))

        train2_S.append(train_kappa)
        test2_S.append(val_kappa)

        test_preds[:, fold] = model.predict(test_data)

        print(f"Fold {fold+1} - Train QWK: {train_kappa:.4f}, Validation QWK: {val_kappa:.4f}")
        clear_output(wait=True)

    print(f"Mean Train QWK --> {np.mean(train2_S):.4f}")
    print(f"Mean Validation QWK ---> {np.mean(test2_S):.4f}")

    # Search for the three cut points that maximise QWK on the out-of-fold predictions.
    KappaOPtimizer = minimize(evaluate_predictions,
                              x0=[0.5, 1.5, 2.5], args=(y, oof_non_rounded),
                              method='Nelder-Mead')
    assert KappaOPtimizer.success, "Optimization did not converge."

    oof_tuned = threshold_Rounder(oof_non_rounded, KappaOPtimizer.x)
    tKappa = quadratic_weighted_kappa(y, oof_tuned)

    print(f"----> || Optimized QWK SCORE :: {Fore.CYAN}{Style.BRIGHT} {tKappa:.3f}{Style.RESET_ALL}")

    # Average the per-fold test predictions, then apply the tuned thresholds.
    tpm = test_preds.mean(axis=1)
    tpTuned = threshold_Rounder(tpm, KappaOPtimizer.x)

    submission = pd.DataFrame({
        'id': sample['id'],
        'sii': tpTuned
    })

    return submission

# Model parameters for LightGBM
# 'min_data_in_leaf' and 'feature_fraction' were removed: they are LightGBM's primary
# names for 'min_child_samples' and 'colsample_bytree'. When both spellings are
# supplied LightGBM keeps the primary name and ignores the alias, so the stale values
# (13 and 0.893) silently overrode the tuned ones below.
Params = {
    'n_estimators': 682,
    'learning_rate': 0.022201704131134002,
    'max_depth': 3,
    'num_leaves': 843,
    'bagging_fraction': 0.8918812900108436,
    'bagging_freq': 10,
    'lambda_l1': 4.79779460021304e-07,
    'lambda_l2': 2.0055171376757653e-06,
    'min_child_samples': 39,
    'colsample_bytree': 0.9264391369678474
}


# XGBoost parameters
XGB_Params = dict(
    learning_rate=0.02829495436971426,
    max_depth=8,
    n_estimators=484,
    subsample=0.9834706801888403,
    colsample_bytree=0.7681852816292032,
    reg_alpha=0.010495697417466835,
    reg_lambda=0.022138771647168275,
    random_state=SEED,
    tree_method='exact',
    min_child_weight=9,
)

# CatBoost parameters
CatBoost_Params = dict(
    learning_rate=0.09981560408272906,
    depth=6,
    iterations=43,
    random_seed=SEED,
    verbose=0,
    l2_leaf_reg=3.4155853181132496,
    bootstrap_type='MVS',
    random_strength=0.06930624569348895,
)

# One regressor per boosting library, each built from its parameter set.
Light = LGBMRegressor(random_state=SEED, verbose=-1, **Params)
XGB_Model = XGBRegressor(**XGB_Params)
CatBoost_Model = CatBoostRegressor(**CatBoost_Params)

# Combine the three regressors in a single VotingRegressor.
voting_model = VotingRegressor(
    estimators=[
        ('lightgbm', Light),
        ('xgboost', XGB_Model),
        ('catboost', CatBoost_Model),
    ]
)

# Train the ensemble model
# TrainML (defined above in this cell) cross-validates on train2 and predicts test2.
Submission2 = TrainML(voting_model, test2)

# Save submission
Submission2.to_csv('submission_2.csv', index=False)
# Last expression of the cell: display the submission frame.
Submission2

In [None]:
# Submission 3
# Time-series feature names kept for this submission (raw and Box-Cox variants).
time_series_cols = [
    'feature_0', 'feature_1', 'feature_2', 'feature_3', 'feature_4','feature_5', 'feature_6',
    'feature_7', 'feature_8', 'feature_9','feature_10', 'feature_11',
    'feature_12', 'feature_13', 'feature_14',
    'feature_15', 'feature_16', 'feature_17', 'feature_18', 'feature_19',
    'feature_20', 'feature_21', 'feature_22', 'feature_24',
    'feature_25', 'feature_26', 'feature_27', 'feature_28', 'feature_29',
    'feature_30', 'feature_31', 'feature_32', 'feature_33', 'feature_34',
    'feature_36', 'feature_37', 'feature_41', 'feature_42', 'feature_39',
    'feature_43', 'feature_44',
    'feature_45', 'feature_46', 'feature_48', 'feature_49',
    'feature_50', 'feature_51', 'feature_52', 'feature_53',
    'feature_55', 'feature_56', 'feature_57', 'feature_58', 'feature_59',
    'feature_60', 'feature_61', 'feature_62', 'feature_63', 'feature_64',
    'feature_65', 'feature_67', 'feature_68', 'feature_69',
    'feature_70', 'feature_71', 'feature_72', 'feature_73', 'feature_74',
    'feature_75', 'feature_76', 'feature_77', 'feature_79',
    'feature_81', 'feature_82', 'feature_83', 'feature_84',
    'feature_85', 'feature_86', 'feature_87', 'feature_89',
    'feature_91', 'feature_93', 'feature_94',
    'feature_95', 'feature_23_boxcox', 'feature_35_boxcox', 'feature_38_boxcox',
    'feature_40_boxcox', 'feature_47_boxcox', 'feature_54_boxcox',
    'feature_66_boxcox', 'feature_78_boxcox', 'feature_80_boxcox',
    'feature_88_boxcox', 'feature_90_boxcox'
]

# Training columns: tabular features (untransformed BIA measurements) plus the `sii` target.
featuresCols = ['Basic_Demos-Enroll_Season', 'Basic_Demos-Age', 'Basic_Demos-Sex',
                'CGAS-Season', 'CGAS-CGAS_Score', 'Physical-Season', 'Physical-BMI',
                'Physical-Height', 'Physical-Weight', 'Physical-Waist_Circumference',
                'Physical-Diastolic_BP', 'Physical-HeartRate', 'Physical-Systolic_BP',
                'Fitness_Endurance-Season', 'Fitness_Endurance-Max_Stage',
                'Fitness_Endurance-Time_Mins',
                'FGC-Season', 'FGC-FGC_CU', 'FGC-FGC_CU_Zone', 'FGC-FGC_GSND',
                'FGC-FGC_GSND_Zone', 'FGC-FGC_GSD', 'FGC-FGC_GSD_Zone', 'FGC-FGC_PU',
                'FGC-FGC_PU_Zone', 'FGC-FGC_SRL', 'FGC-FGC_SRL_Zone', 'FGC-FGC_SRR',
                'FGC-FGC_SRR_Zone', 'FGC-FGC_TL', 'FGC-FGC_TL_Zone', 'BIA-Season',
                'BIA-BIA_Activity_Level_num', 'BIA-BIA_BMC', 'BIA-BIA_BMI',
                'BIA-BIA_BMR', 'BIA-BIA_DEE', 'BIA-BIA_ECW', 'BIA-BIA_FFM',
                'BIA-BIA_FFMI', 'BIA-BIA_FMI', 'BIA-BIA_Fat', 'BIA-BIA_Frame_num',
                'BIA-BIA_ICW', 'BIA-BIA_LDM', 'BIA-BIA_LST', 'BIA-BIA_SMM',
                'BIA-BIA_TBW', 'PAQ_A-Season', 'PAQ_A-PAQ_A_Total', 'PAQ_C-Season',
                'PAQ_C-PAQ_C_Total', 'SDS-Season', 'SDS-SDS_Total_Raw',
                'SDS-SDS_Total_T', 'PreInt_EduHx-Season', 'fitness_score',
                'PreInt_EduHx-computerinternet_hoursday', 'sii',
                'Age_Internet_Hours', 'BIA_BMI_Internet_Hours_Age']

featuresCols += time_series_cols

train3 = train3[featuresCols]
# Rows without a target cannot be used for supervised training.
train3 = train3.dropna(subset=['sii'])

# The test frame uses exactly the training columns minus the target. Deriving the list
# (instead of repeating the literal) keeps the two selections from drifting apart.
featuresCols = [col for col in featuresCols if col != 'sii']
test3 = test3[featuresCols]

def quadratic_weighted_kappa(y_true, y_pred):
    """Return Cohen's kappa between ``y_true`` and ``y_pred`` using quadratic weights."""
    kappa = cohen_kappa_score(y_true, y_pred, weights='quadratic')
    return kappa

def threshold_Rounder(oof_non_rounded, thresholds):
    """Bucket continuous scores into the classes 0-3 using three cut points.

    A score is assigned the index of the first threshold it is strictly below
    (checked in the order thresholds[0], thresholds[1], thresholds[2]); scores that
    are below none of them, including NaN, become 3.
    """
    first_cut, second_cut, third_cut = thresholds[0], thresholds[1], thresholds[2]
    is_below = [
        oof_non_rounded < first_cut,
        oof_non_rounded < second_cut,
        oof_non_rounded < third_cut,
    ]
    # np.select takes the first matching condition, mirroring a nested np.where chain.
    return np.select(is_below, [0, 1, 2], default=3)

def evaluate_predictions(thresholds, y_true, oof_non_rounded):
    """Loss for threshold search: the negated quadratic weighted kappa.

    The continuous predictions are discretised with ``thresholds`` and scored against
    ``y_true``; the sign is flipped so that a minimiser maximises the kappa.
    """
    discrete_preds = threshold_Rounder(oof_non_rounded, thresholds)
    kappa = quadratic_weighted_kappa(y_true, discrete_preds)
    return -kappa

def TrainML(model_class, test_data, train_data=None):
    """Cross-validate a regressor, tune rounding thresholds and predict ``test_data``.

    Parameters
    ----------
    model_class : estimator
        Unfitted scikit-learn compatible regressor; it is cloned for every fold.
    test_data : DataFrame
        Feature rows to predict.
    train_data : DataFrame, optional
        Frame holding the features plus the ``sii`` target. Defaults to the
        notebook-level ``train3`` frame, which is what this function always used.

    Returns
    -------
    numpy.ndarray
        Fold-averaged test predictions discretised with the tuned thresholds
        (unlike the earlier variants, no submission frame is built here).

    Notes
    -----
    Reads the notebook-level ``n_splits`` and ``SEED``.
    """
    frame = train3 if train_data is None else train_data
    X = frame.drop(['sii'], axis=1)
    y = frame['sii']

    SKF = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=SEED)

    train_S = []
    test_S = []

    oof_non_rounded = np.zeros(len(y), dtype=float)
    test_preds = np.zeros((len(test_data), n_splits))

    for fold, (train_idx, val_idx) in enumerate(tqdm(SKF.split(X, y), desc="Training Folds", total=n_splits)):
        X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
        y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]

        # Fresh unfitted copy per fold so nothing carries over between folds.
        model = clone(model_class)
        model.fit(X_train, y_train)

        y_train_pred = model.predict(X_train)
        y_val_pred = model.predict(X_val)

        # Keep the raw out-of-fold predictions; thresholds are tuned on them below.
        oof_non_rounded[val_idx] = y_val_pred

        train_kappa = quadratic_weighted_kappa(y_train, y_train_pred.round(0).astype(int))
        val_kappa = quadratic_weighted_kappa(y_val, y_val_pred.round(0).astype(int))

        train_S.append(train_kappa)
        test_S.append(val_kappa)

        test_preds[:, fold] = model.predict(test_data)

        print(f"Fold {fold+1} - Train QWK: {train_kappa:.4f}, Validation QWK: {val_kappa:.4f}")
        clear_output(wait=True)

    print(f"Mean Train QWK --> {np.mean(train_S):.4f}")
    print(f"Mean Validation QWK ---> {np.mean(test_S):.4f}")

    # Search for the three cut points that maximise QWK on the out-of-fold predictions.
    KappaOPtimizer = minimize(evaluate_predictions,
                              x0=[0.5, 1.5, 2.5], args=(y, oof_non_rounded),
                              method='Nelder-Mead')
    assert KappaOPtimizer.success, "Optimization did not converge."

    oof_tuned = threshold_Rounder(oof_non_rounded, KappaOPtimizer.x)
    tKappa = quadratic_weighted_kappa(y, oof_tuned)

    print(f"----> || Optimized QWK SCORE :: {Fore.CYAN}{Style.BRIGHT} {tKappa:.3f}{Style.RESET_ALL}")

    # Average the per-fold test predictions, then apply the tuned thresholds.
    tpm = test_preds.mean(axis=1)
    tp_rounded = threshold_Rounder(tpm, KappaOPtimizer.x)

    return tp_rounded

# Median imputation step placed in front of every regressor in the ensemble.
imputer = SimpleImputer(strategy='median')

# (name, regressor) pairs; each is wrapped in an imputer -> regressor pipeline below.
base_regressors = [
    ('lgb', LGBMRegressor(random_state=SEED)),
    ('xgb', XGBRegressor(random_state=SEED)),
    ('cat', CatBoostRegressor(random_state=SEED, silent=True)),
    ('rf', RandomForestRegressor(random_state=SEED)),
    ('gb', GradientBoostingRegressor(random_state=SEED)),
]

ensemble = VotingRegressor(
    estimators=[
        (name, Pipeline(steps=[('imputer', imputer), ('regressor', regressor)]))
        for name, regressor in base_regressors
    ]
)

# TrainML in this cell returns the discretised test predictions as an array.
Submission3 = TrainML(ensemble, test3)

In [None]:
# Wrap the raw prediction array in a submission frame. `Submission3` is rebound from an
# array to a DataFrame here, so the guard keeps the cell safe to re-run: a second run
# would otherwise try to build a frame whose 'sii' column is itself a DataFrame.
if not isinstance(Submission3, pd.DataFrame):
    Submission3 = pd.DataFrame({
        'id': sample['id'],
        'sii': Submission3
    })

Submission3

In [None]:
# Sort every submission by id and reset the index so the rows line up positionally.
sub1 = Submission1.sort_values(by='id').reset_index(drop=True)
sub2 = Submission2.sort_values(by='id').reset_index(drop=True)
sub3 = Submission3.sort_values(by='id').reset_index(drop=True)

# One row per id with the three candidate predictions side by side.
combined = pd.DataFrame(
    {
        'id': sub1['id'],
        'sii_1': sub1['sii'],
        'sii_2': sub2['sii'],
        'sii_3': sub3['sii'],
    }
)

def majority_vote(row):
    """Return the most frequent value in ``row``; ties resolve to the smallest value."""
    # Series.mode() returns all modes sorted ascending, so label 0 is the smallest one.
    modes = row.mode()
    return modes[0]

# Row-wise vote across the three submissions' predictions.
combined['final_sii'] = combined[['sii_1', 'sii_2', 'sii_3']].apply(majority_vote, axis=1)

final_submission = combined[['id', 'final_sii']].rename(columns={'final_sii': 'sii'})

final_submission.to_csv('submission.csv', index=False)

# The message now names the file that is actually written above.
print("Majority voting completed and saved to 'submission.csv'")

In [None]:
# Display the majority-voted submission frame.
final_submission