### change from previous:

1. use progress report 2 method
   
2. no autoencoder, no feature engineer. don't handle missing numerical values. models = lightgbm, xgboost & catboost which auto handles.

3. no autoencoder, no feature engineer. use imputer = SimpleImputer(strategy='median') in models.

## Progress Report 2 Method

In [1]:
import os
import warnings
from IPython.display import clear_output
from colorama import Fore, Style, init

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm import tqdm
from concurrent.futures import ThreadPoolExecutor

from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import KNNImputer, IterativeImputer
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import (RandomForestRegressor, VotingRegressor, GradientBoostingRegressor)
from sklearn.metrics import (classification_report, confusion_matrix, mean_squared_error,
                             accuracy_score, cohen_kappa_score, r2_score, make_scorer)
from sklearn.decomposition import PCA
from sklearn.tree import DecisionTreeRegressor 
from scipy.optimize import minimize
from sklearn.base import clone

from keras.models import Model
from keras.layers import Input, Dense

from xgboost import XGBRegressor
from catboost import CatBoostRegressor
from lightgbm import LGBMRegressor
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline

import torch
import torch.nn as nn
import torch.optim as optim

In [2]:
# Feature Engineering
def feature_engineering(df):
    # Calculate new features
    new_features = pd.DataFrame({
        'BMI_Age': df['Physical-BMI'] * df['Basic_Demos-Age'],
        'Internet_Hours_Age': df['PreInt_EduHx-computerinternet_hoursday'] * df['Basic_Demos-Age'],
        'BMI_Internet_Hours': df['Physical-BMI'] * df['PreInt_EduHx-computerinternet_hoursday'],
        'BFP_BMI': df['BIA-BIA_Fat'] / df['BIA-BIA_BMI'],
        'FFMI_BFP': df['BIA-BIA_FFMI'] / df['BIA-BIA_Fat'],
        'FMI_BFP': df['BIA-BIA_FMI'] / df['BIA-BIA_Fat'],
        'LST_TBW': df['BIA-BIA_LST'] / df['BIA-BIA_TBW'],
        'BFP_BMR': df['BIA-BIA_Fat'] * df['BIA-BIA_BMR'],
        'BFP_DEE': df['BIA-BIA_Fat'] * df['BIA-BIA_DEE'],
        'BMR_Weight': df['BIA-BIA_BMR'] / df['Physical-Weight'],
        'DEE_Weight': df['BIA-BIA_DEE'] / df['Physical-Weight'],
        'SMM_Height': df['BIA-BIA_SMM'] / df['Physical-Height'],
        'Muscle_to_Fat': df['BIA-BIA_SMM'] / df['BIA-BIA_FMI'],
        'Hydration_Status': df['BIA-BIA_TBW'] / df['Physical-Weight'],
        'ICW_TBW': df['BIA-BIA_ICW'] / df['BIA-BIA_TBW']
    })

    # Concatenate new features with the original DataFrame
    df = pd.concat([df, new_features], axis=1)
    
    return df

def load_and_process_data(directory):
    files = os.listdir(directory)
    all_stats = []

    with ThreadPoolExecutor() as executor:
        futures = [executor.submit(pd.read_parquet, os.path.join(directory, file, 'part-0.parquet')) for file in files]
        for future in tqdm(futures):
            data = future.result()
            if 'step' in data.columns:
                data.drop('step', axis=1, inplace=True)

            # Calculate summary statistics
            stats = data.describe().values.reshape(-1)
            all_stats.append(stats)

    # Create a DataFrame for summary statistics
    stat_columns = [f"stat_{i}" for i in range(len(all_stats[0]))]
    summary_df = pd.DataFrame(all_stats, columns=stat_columns)
    summary_df['id'] = [file.split('=')[1] for file in files]  # Extract 'id' from filenames

    return summary_df

class SimpleAutoEncoder(nn.Module):
    def __init__(self, input_dim, encoding_dim):
        super(SimpleAutoEncoder, self).__init__()
        self.encoder = nn.Sequential(
            nn.Linear(input_dim, encoding_dim * 2),
            nn.ReLU(),
            nn.Linear(encoding_dim * 2, encoding_dim),
            nn.ReLU()
        )
        self.decoder = nn.Sequential(
            nn.Linear(encoding_dim, encoding_dim * 2),
            nn.ReLU(),
            nn.Linear(encoding_dim * 2, input_dim),
            nn.Sigmoid()
        )

    def forward(self, x):
        return self.decoder(self.encoder(x))

def train_autoencoder(data, encoding_dim=10, epochs=20, batch_size=16):
    scaler = StandardScaler()
    scaled_data = scaler.fit_transform(data)
    tensor_data = torch.FloatTensor(scaled_data)

    autoencoder = SimpleAutoEncoder(input_dim=tensor_data.shape[1], encoding_dim=encoding_dim)
    criterion = nn.MSELoss()
    optimizer = optim.Adam(autoencoder.parameters())

    for epoch in range(epochs):
        for i in range(0, len(tensor_data), batch_size):
            batch = tensor_data[i:i + batch_size]
            optimizer.zero_grad()
            loss = criterion(autoencoder(batch), batch)
            loss.backward()
            optimizer.step()
        if (epoch + 1) % 5 == 0:
            print(f'Epoch {epoch + 1}/{epochs}, Loss: {loss.item():.4f}')

    with torch.no_grad():
        encoded_data = autoencoder.encoder(tensor_data).numpy()
    
    return pd.DataFrame(encoded_data, columns=[f'Enc_{i+1}' for i in range(encoded_data.shape[1])])
    
def impute_missing_values(data, season_columns, season_mapping):
    # Encode Seasons
    data[season_columns] = data[season_columns].map(lambda x: season_mapping.get(x, x))
    
    # Identify numeric columns
    numeric_cols = data.select_dtypes(include=['float64', 'float32', 'int64']).columns
    
    # Scale numeric features for KNN imputation
    scaler = StandardScaler()
    data_scaled = data.copy()
    data_scaled[numeric_cols] = scaler.fit_transform(data[numeric_cols])
    
    # Initialize the imputer and apply it only on numeric columns with missing values
    imputer = KNNImputer(n_neighbors=5)
    imputed_numeric_data = imputer.fit_transform(data_scaled[numeric_cols])
    imputed_scaled_df = pd.DataFrame(imputed_numeric_data, columns=numeric_cols)
    
    # Invert scaling to original scale for imputed numeric columns
    imputed_data = data.copy()
    imputed_data[numeric_cols] = scaler.inverse_transform(imputed_scaled_df)
    
    # Clip and convert 'sii' to integers
    if 'sii' in imputed_data.columns:
        imputed_data['sii'] = imputed_data['sii'].clip(lower=0, upper=3).round().astype(int)
    
    # Ensure other columns remain intact
    for col in imputed_data.columns:
        if col not in numeric_cols:
            imputed_data[col] = data[col]
    
    # Convert season columns to integers
    imputed_data[season_columns] = imputed_data[season_columns].clip(lower=1, upper=4).round().astype(int)
    
    return imputed_data

# Define QWK calculation function
def quadratic_weighted_kappa(y_true, y_pred):
    return cohen_kappa_score(y_true, y_pred, weights='quadratic')

# Function to apply threshold-based rounding to predictions
def threshold_rounder(predictions, thresholds):
    return np.where(predictions < thresholds[0], 0,
                    np.where(predictions < thresholds[1], 1,
                             np.where(predictions < thresholds[2], 2, 3)))

# Threshold optimization to maximize QWK
def optimize_qwk_thresholds(predictions, y_true):
    def evaluate_thresholds(thresholds):
        rounded_preds = threshold_rounder(predictions, thresholds)
        return -quadratic_weighted_kappa(y_true, rounded_preds)
    
    # Optimize using the Nelder-Mead method
    result = minimize(evaluate_thresholds, x0=[0.5, 1.5, 2.5], method='Nelder-Mead')
    return result.x if result.success else [0.5, 1.5, 2.5]

def evaluate_predictions(thresholds, y_true, oof_non_rounded):
    rounded_p = threshold_rounder(oof_non_rounded, thresholds)
    return -quadratic_weighted_kappa(y_true, rounded_p)

In [3]:
n_splits = 5
SEED = 42

In [4]:
# Model
def train_and_evaluate(train, test, model):
    X = train.drop(['sii'], axis=1)
    y = train['sii']

    train_S = []
    test_S = []
    
    # Arrays to store out-of-fold predictions and test predictions
    oof_non_rounded = np.zeros(len(y), dtype=float) 
    oof_rounded = np.zeros(len(y), dtype=int) 
    test_preds = np.zeros((len(test), n_splits))
    
    # Set up cross-validation
    skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=SEED)
    
    for fold, (train_idx, test_idx) in enumerate(tqdm(skf.split(X, y), desc="Training Folds", total=n_splits)):
        print(f"Training fold {fold + 1}/{n_splits}...")
        
        # Split the data into training and validation sets
        X_train, X_val = X.iloc[train_idx], X.iloc[test_idx]
        y_train, y_val = y.iloc[train_idx], y.iloc[test_idx]
        
        # Standardize the features using StandardScaler
        scaler = StandardScaler()
        X_train_scaled = scaler.fit_transform(X_train)
        X_val_scaled = scaler.transform(X_val)
        test_scaled = scaler.transform(test)
        
        # Fit the model on the scaled training data
        model = clone(model)
        model.fit(X_train_scaled, y_train)

        y_train_pred = model.predict(X_train_scaled)
        y_val_pred = model.predict(X_val_scaled)

        oof_non_rounded[test_idx] = y_val_pred
        y_val_pred_rounded = y_val_pred.round(0).astype(int)
        oof_rounded[test_idx] = y_val_pred_rounded

        train_kappa = quadratic_weighted_kappa(y_train, y_train_pred.round(0).astype(int))
        val_kappa = quadratic_weighted_kappa(y_val, y_val_pred_rounded)

        train_S.append(train_kappa)
        test_S.append(val_kappa)

        test_preds[:, fold] = model.predict(test)

        print(f"Fold {fold+1} - Train QWK: {train_kappa:.4f}, Validation QWK: {val_kappa:.4f}")
        clear_output(wait=True)

    print(f"Mean Train QWK: {np.mean(train_S):.4f}")
    print(f"Mean Validation QWK: {np.mean(test_S):.4f}")

    KappaOPtimizer = minimize(evaluate_predictions,
                              x0=[0.5, 1.5, 2.5], args=(y, oof_non_rounded), 
                              method='Nelder-Mead')
    assert KappaOPtimizer.success, "Optimization did not converge."

    oof_tuned = threshold_rounder(oof_non_rounded, KappaOPtimizer.x)
    tKappa = quadratic_weighted_kappa(y, oof_tuned)

    print(f"Optimized QWK SCORE: {Fore.CYAN}{Style.BRIGHT} {tKappa:.3f}{Style.RESET_ALL}")

    tpm = test_preds.mean(axis=1)
    tpTuned = threshold_rounder(tpm, KappaOPtimizer.x)

    # Prepare submission DataFrame
    submission = pd.DataFrame({
        'id': sample_submission['id'],  # Use 'id' from sample submission
        'sii': tpTuned
    })
    
    return submission

In [5]:
# Load Data

train_og = pd.read_csv('/kaggle/input/child-mind-institute-problematic-internet-use/train.csv')
test_og = pd.read_csv('/kaggle/input/child-mind-institute-problematic-internet-use/test.csv')
sample_submission = pd.read_csv('/kaggle/input/child-mind-institute-problematic-internet-use/sample_submission.csv')

# Load actigraphy time series data
train_ts = load_and_process_data("/kaggle/input/child-mind-institute-problematic-internet-use/series_train.parquet")
test_ts = load_and_process_data("/kaggle/input/child-mind-institute-problematic-internet-use/series_test.parquet")

df_train = train_ts.drop('id', axis=1)
df_test = test_ts.drop('id', axis=1)

# Autoencode Data
train_ts_encoded = train_autoencoder(df_train, encoding_dim=60, epochs=100, batch_size=32)
test_ts_encoded = train_autoencoder(df_test, encoding_dim=60, epochs=100, batch_size=32)

time_series_cols = train_ts_encoded.columns.tolist()
# Add 'id' back to the encoded DataFrame
train_ts_encoded["id"]=train_ts["id"]
test_ts_encoded['id']=test_ts["id"]

# Merge Data
train = pd.merge(train_og, train_ts_encoded, how="left", on='id')
test = pd.merge(test_og, test_ts_encoded, how="left", on='id')

# Impute Missing Data
# Define season mapping
season_mapping = {'Spring': 1, 'Summer': 2, 'Fall': 3, 'Winter': 4}
# For the train set
season_columns_train = [col for col in train.columns if 'Season' in col]
train_imputed = impute_missing_values(train, season_columns_train, season_mapping)
# For the test set
season_columns_test = [col for col in test.columns if 'Season' in col]
test_imputed = impute_missing_values(test, season_columns_test, season_mapping)

# Perform feature engineering
train_imputed = feature_engineering(train_imputed)
train_imputed.dropna(thresh=1, axis=0, inplace=True)
train_imputed.replace([np.inf, -np.inf], 0, inplace=True)
test_imputed = feature_engineering(test_imputed)

# Get the columns from both DataFrames
train_cols = set(train_og.columns)
test_cols = set(test_og.columns)

# Find common columns
common_cols = train_cols.intersection(test_cols)
featuresCols = [col for col in common_cols if col != 'id']
featuresCols += time_series_cols

test_imputed = test_imputed[featuresCols]
featuresCols.append('sii')
train_imputed = train_imputed[featuresCols]

100%|██████████| 996/996 [02:31<00:00,  6.55it/s]
100%|██████████| 2/2 [00:00<00:00,  8.67it/s]


Epoch 5/100, Loss: 1.4662
Epoch 10/100, Loss: 1.3984
Epoch 15/100, Loss: 1.3948
Epoch 20/100, Loss: 1.3920
Epoch 25/100, Loss: 1.3885
Epoch 30/100, Loss: 1.3863
Epoch 35/100, Loss: 1.3864
Epoch 40/100, Loss: 1.3870
Epoch 45/100, Loss: 1.3790
Epoch 50/100, Loss: 1.3753
Epoch 55/100, Loss: 1.3751
Epoch 60/100, Loss: 1.3688
Epoch 65/100, Loss: 1.3739
Epoch 70/100, Loss: 1.3681
Epoch 75/100, Loss: 1.3650
Epoch 80/100, Loss: 1.3630
Epoch 85/100, Loss: 1.3655
Epoch 90/100, Loss: 1.3634
Epoch 95/100, Loss: 1.3643
Epoch 100/100, Loss: 1.3621
Epoch 5/100, Loss: 1.0808
Epoch 10/100, Loss: 1.0187
Epoch 15/100, Loss: 0.8837
Epoch 20/100, Loss: 0.6916
Epoch 25/100, Loss: 0.5345
Epoch 30/100, Loss: 0.4522
Epoch 35/100, Loss: 0.4298
Epoch 40/100, Loss: 0.4274
Epoch 45/100, Loss: 0.4271
Epoch 50/100, Loss: 0.4271
Epoch 55/100, Loss: 0.4271
Epoch 60/100, Loss: 0.4271
Epoch 65/100, Loss: 0.4271
Epoch 70/100, Loss: 0.4271
Epoch 75/100, Loss: 0.4271
Epoch 80/100, Loss: 0.4271
Epoch 85/100, Loss: 0.4271
Ep

In [6]:
# Model hyperparameters
xgb_params = {
    'n_estimators': 200,
    'learning_rate': 0.05,
    'max_depth': 6,
    'min_child_weight': 1,
    'subsample': 0.8,
    'colsample_bytree': 0.8,
    'gamma': 0,
    'reg_alpha': 1,
    'reg_lambda': 5,
    'random_state': 42
}
cat_params = {
    'iterations': 200,
    'learning_rate': 0.05,
    'depth': 6,
    'l2_leaf_reg': 10,
    'subsample': 0.8,
    'rsm': 0.8,
    'border_count': 32,
    'random_state': 42,
    'silent': True
}
rf_params = {
    'n_estimators': 200,
    'max_depth': None,
    'min_samples_split': 2,
    'min_samples_leaf': 1,
    'bootstrap': True,
    'random_state': 42
}
gb_params = {
    'n_estimators': 200,
    'learning_rate': 0.05,
    'max_depth': 3,
    'min_samples_split': 2,
    'min_samples_leaf': 1,
    'subsample': 1.0,
    'random_state': 42
}

# Initialize the models
rf_model = RandomForestRegressor(**rf_params)
xgb_model = XGBRegressor(**xgb_params)
cat_model = CatBoostRegressor(**cat_params)
gb_model = GradientBoostingRegressor(**gb_params)

# Initialize the voting regressor ensemble
voting_regressor = VotingRegressor(estimators=[
    ('rf', rf_model),
    ('xgboost', xgb_model),
    ('catboost', cat_model),
    ('gb', gb_model)
])

submission1 = train_and_evaluate(train_imputed, test_imputed, voting_regressor)

# Display the submission DataFrame
submission1

Training Folds: 100%|██████████| 5/5 [05:51<00:00, 70.35s/it]


Mean Train QWK: 0.8501
Mean Validation QWK: 0.5074
Optimized QWK SCORE: [36m[1m 0.567[0m


Unnamed: 0,id,sii
0,00008ff9,2
1,000fd460,2
2,00105258,2
3,00115b9f,2
4,0016bb22,2
5,001f3379,2
6,0038ba98,2
7,0068a485,2
8,0069fbed,2
9,0083e397,2


## No autoencoder. Don't handle missing numerical values. Models = lightgbm, xgboost & catboost

In [7]:
def train_and_evaluate(train, test, model):
    X = train.drop(['sii'], axis=1)
    y = train['sii']

    SKF = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=SEED)
    
    train_S = []
    test_S = []
    
    oof_non_rounded = np.zeros(len(y), dtype=float) 
    oof_rounded = np.zeros(len(y), dtype=int) 
    test_preds = np.zeros((len(test), n_splits))

    for fold, (train_idx, test_idx) in enumerate(tqdm(SKF.split(X, y), desc="Training Folds", total=n_splits)):
        X_train, X_val = X.iloc[train_idx], X.iloc[test_idx]
        y_train, y_val = y.iloc[train_idx], y.iloc[test_idx]

        model = clone(model)
        model.fit(X_train, y_train)

        y_train_pred = model.predict(X_train)
        y_val_pred = model.predict(X_val)

        oof_non_rounded[test_idx] = y_val_pred
        y_val_pred_rounded = y_val_pred.round(0).astype(int)
        oof_rounded[test_idx] = y_val_pred_rounded

        train_kappa = quadratic_weighted_kappa(y_train, y_train_pred.round(0).astype(int))
        val_kappa = quadratic_weighted_kappa(y_val, y_val_pred_rounded)

        train_S.append(train_kappa)
        test_S.append(val_kappa)
        
        test_preds[:, fold] = model.predict(test)

        print(f"Fold {fold+1} - Train QWK: {train_kappa:.4f}, Validation QWK: {val_kappa:.4f}")
        clear_output(wait=True)

    print(f"Mean Train QWK: {np.mean(train_S):.4f}")
    print(f"Mean Validation QWK: {np.mean(test_S):.4f}")

    KappaOPtimizer = minimize(evaluate_predictions,
                              x0=[0.5, 1.5, 2.5], args=(y, oof_non_rounded), 
                              method='Nelder-Mead')
    assert KappaOPtimizer.success, "Optimization did not converge."
    
    oof_tuned = threshold_rounder(oof_non_rounded, KappaOPtimizer.x)
    tKappa = quadratic_weighted_kappa(y, oof_tuned)

    print(f"Optimized QWK SCORE: {Fore.CYAN}{Style.BRIGHT} {tKappa:.3f}{Style.RESET_ALL}")

    tpm = test_preds.mean(axis=1)
    tpTuned = threshold_rounder(tpm, KappaOPtimizer.x)
    
    submission = pd.DataFrame({
        'id': sample_submission['id'],
        'sii': tpTuned
    })

    return submission
    
# Load Data

train_og = pd.read_csv('/kaggle/input/child-mind-institute-problematic-internet-use/train.csv')
test_og = pd.read_csv('/kaggle/input/child-mind-institute-problematic-internet-use/test.csv')
sample_submission = pd.read_csv('/kaggle/input/child-mind-institute-problematic-internet-use/sample_submission.csv')

# Load actigraphy time series data
train_ts = load_and_process_data("/kaggle/input/child-mind-institute-problematic-internet-use/series_train.parquet")
test_ts = load_and_process_data("/kaggle/input/child-mind-institute-problematic-internet-use/series_test.parquet")

time_series_cols = train_ts.columns.tolist()
time_series_cols.remove("id")

train = pd.merge(train_og, train_ts, how="left", on='id')
test = pd.merge(test_og, test_ts, how="left", on='id')

train = train.drop('id', axis=1)
test = test.drop('id', axis=1) 

# Get the columns from both DataFrames
train_cols = set(train_og.columns)
test_cols = set(test_og.columns)

# Find common columns
common_cols = train_cols.intersection(test_cols)
featuresCols = [col for col in common_cols if col != 'id']
featuresCols += time_series_cols

test = test[featuresCols]

featuresCols.append('sii')

train = train[featuresCols]
train = train.dropna(subset='sii')

cat_c = ['Basic_Demos-Enroll_Season', 'CGAS-Season', 'Physical-Season', 
          'Fitness_Endurance-Season', 'FGC-Season', 'BIA-Season', 
          'PAQ_A-Season', 'PAQ_C-Season', 'SDS-Season', 'PreInt_EduHx-Season']

def update(df):
    global cat_c
    for c in cat_c: 
        df[c] = df[c].fillna('Missing')
        df[c] = df[c].astype('category')
    return df
        
train = update(train)
test = update(test)

def create_mapping(column, dataset):
    unique_values = dataset[column].unique()
    return {value: idx for idx, value in enumerate(unique_values)}

for col in cat_c:
    mapping = create_mapping(col, train)
    mappingTe = create_mapping(col, test)
    
    train[col] = train[col].replace(mapping).astype(int)
    test[col] = test[col].replace(mappingTe).astype(int)

# Model hyperparameters (keeping your original hyperparameters)
# XGBoost parameters
XGB_Params = {
    'learning_rate': 0.05,
    'max_depth': 6,
    'n_estimators': 200,
    'subsample': 0.8,
    'colsample_bytree': 0.8,
    'reg_alpha': 1,  # Increased from 0.1
    'reg_lambda': 5,  # Increased from 1
    'random_state': 42
}

# Model parameters for LightGBM
Params = {
    'learning_rate': 0.046,
    'max_depth': 12,
    'num_leaves': 478,
    'min_data_in_leaf': 13,
    'feature_fraction': 0.893,
    'bagging_fraction': 0.784,
    'bagging_freq': 4,
    'lambda_l1': 10,  # Increased from 6.59
    'lambda_l2': 0.01  # Increased from 2.68e-06
}

# Model parameters for CatBoost
CatBoost_Params = {
    'learning_rate': 0.05,
    'depth': 6,
    'iterations': 200,
    'random_seed': 42,
    'cat_features': cat_c,
    'verbose': 0,
    'l2_leaf_reg': 10  # Increase this value
}

# Initialize the models
Light = LGBMRegressor(**Params, random_state=42, verbose=-1, n_estimators=300)
XGB_Model = XGBRegressor(**XGB_Params)
CatBoost_Model = CatBoostRegressor(**CatBoost_Params)

# Initialize the voting regressor ensemble
voting_regressor = VotingRegressor(estimators=[
    ('lightgbm', Light),
    ('xgboost', XGB_Model),
    ('catboost', CatBoost_Model)
])

submission2 = train_and_evaluate(train, test, voting_regressor)

# Display the submission DataFrame
submission2

Training Folds: 100%|██████████| 5/5 [01:34<00:00, 18.87s/it]

Mean Train QWK: 0.7591
Mean Validation QWK: 0.3917





Optimized QWK SCORE: [36m[1m 0.463[0m


Unnamed: 0,id,sii
0,00008ff9,1
1,000fd460,0
2,00105258,0
3,00115b9f,0
4,0016bb22,0
5,001f3379,1
6,0038ba98,0
7,0068a485,0
8,0069fbed,1
9,0083e397,0


## No autoencoder. Use imputer = SimpleImputer(strategy='median'). Use more models.

In [8]:
# Load Data
train_og = pd.read_csv('/kaggle/input/child-mind-institute-problematic-internet-use/train.csv')
test_og = pd.read_csv('/kaggle/input/child-mind-institute-problematic-internet-use/test.csv')
sample_submission = pd.read_csv('/kaggle/input/child-mind-institute-problematic-internet-use/sample_submission.csv')

# Load actigraphy time series data
train_ts = load_and_process_data("/kaggle/input/child-mind-institute-problematic-internet-use/series_train.parquet")
test_ts = load_and_process_data("/kaggle/input/child-mind-institute-problematic-internet-use/series_test.parquet")

time_series_cols = train_ts.columns.tolist()
time_series_cols.remove("id")

train = pd.merge(train_og, train_ts, how="left", on='id')
test = pd.merge(test_og, test_ts, how="left", on='id')

train = train.drop('id', axis=1)
test = test.drop('id', axis=1) 

# Get the columns from both DataFrames
train_cols = set(train_og.columns)
test_cols = set(test_og.columns)

# Find common columns
common_cols = train_cols.intersection(test_cols)
featuresCols = [col for col in common_cols if col != 'id']
featuresCols += time_series_cols

test = test[featuresCols]

featuresCols.append('sii')

train = train[featuresCols]
train = train.dropna(subset='sii')

cat_c = ['Basic_Demos-Enroll_Season', 'CGAS-Season', 'Physical-Season', 
          'Fitness_Endurance-Season', 'FGC-Season', 'BIA-Season', 
          'PAQ_A-Season', 'PAQ_C-Season', 'SDS-Season', 'PreInt_EduHx-Season']
        
train = update(train)
test = update(test)

for col in cat_c:
    mapping = create_mapping(col, train)
    mappingTe = create_mapping(col, test)
    
    train[col] = train[col].replace(mapping).astype(int)
    test[col] = test[col].replace(mappingTe).astype(int)
    
imputer = SimpleImputer(strategy='median')

ensemble = VotingRegressor(estimators=[
    ('lgb', Pipeline(steps=[('imputer', imputer), ('regressor', LGBMRegressor(random_state=SEED))])),
    ('xgb', Pipeline(steps=[('imputer', imputer), ('regressor', XGBRegressor(random_state=SEED))])),
    ('cat', Pipeline(steps=[('imputer', imputer), ('regressor', CatBoostRegressor(random_state=SEED, silent=True))])),
    ('rf', Pipeline(steps=[('imputer', imputer), ('regressor', RandomForestRegressor(random_state=SEED))])),
    ('gb', Pipeline(steps=[('imputer', imputer), ('regressor', GradientBoostingRegressor(random_state=SEED))]))
])

submission3 = train_and_evaluate(train, test, ensemble)

# Display the submission DataFrame
submission3

Training Folds: 100%|██████████| 5/5 [02:38<00:00, 31.78s/it]

Mean Train QWK: 0.9161
Mean Validation QWK: 0.3866





Optimized QWK SCORE: [36m[1m 0.452[0m


Unnamed: 0,id,sii
0,00008ff9,2
1,000fd460,0
2,00105258,0
3,00115b9f,0
4,0016bb22,0
5,001f3379,1
6,0038ba98,0
7,0068a485,0
8,0069fbed,2
9,0083e397,0


### TPOT to find best parameters. Stacked Regressor.

In [9]:
# Load Data
train_og = pd.read_csv('/kaggle/input/child-mind-institute-problematic-internet-use/train.csv')
test_og = pd.read_csv('/kaggle/input/child-mind-institute-problematic-internet-use/test.csv')
sample_submission = pd.read_csv('/kaggle/input/child-mind-institute-problematic-internet-use/sample_submission.csv')

# Load actigraphy time series data
train_ts = load_and_process_data("/kaggle/input/child-mind-institute-problematic-internet-use/series_train.parquet")
test_ts = load_and_process_data("/kaggle/input/child-mind-institute-problematic-internet-use/series_test.parquet")

df_train = train_ts.drop('id', axis=1)
df_test = test_ts.drop('id', axis=1)

# Autoencode Data
train_ts_encoded = train_autoencoder(df_train, encoding_dim=60, epochs=100, batch_size=32)
test_ts_encoded = train_autoencoder(df_test, encoding_dim=60, epochs=100, batch_size=32)

time_series_cols = train_ts_encoded.columns.tolist()

# Add 'id' back to the encoded DataFrame
train_ts_encoded["id"]=train_ts["id"]
test_ts_encoded['id']=test_ts["id"]

# Merge Data
train = pd.merge(train_og, train_ts_encoded, how="left", on='id')
test = pd.merge(test_og, test_ts_encoded, how="left", on='id')

# Impute Missing Data
# Define season mapping
season_mapping = {'Spring': 1, 'Summer': 2, 'Fall': 3, 'Winter': 4}

# For the train set
season_columns_train = [col for col in train.columns if 'Season' in col]
train_imputed = impute_missing_values(train, season_columns_train, season_mapping)

# For the test set
season_columns_test = [col for col in test.columns if 'Season' in col]
test_imputed = impute_missing_values(test, season_columns_test, season_mapping)

# Perform feature engineering
train_imputed = feature_engineering(train_imputed)
train_imputed.dropna(thresh=1, axis=0, inplace=True)
train_imputed.replace([np.inf, -np.inf], 0, inplace=True)
test_imputed = feature_engineering(test_imputed)

# Get the columns from both DataFrames
train_cols = set(train_og.columns)
test_cols = set(test_og.columns)

# Find common columns
common_cols = train_cols.intersection(test_cols)
featuresCols = [col for col in common_cols if col != 'id']
featuresCols += time_series_cols

test_imputed = test_imputed[featuresCols]
featuresCols.append('sii')
train_imputed = train_imputed[featuresCols]

def train_and_evaluate(train, test, models, use_stacking=False, final_estimator_class=None):
    X = train.drop(['sii'], axis=1)
    y = train['sii']

    train_S = []
    test_S = []
    
    oof_non_rounded = np.zeros(len(y), dtype=float) 
    oof_rounded = np.zeros(len(y), dtype=int) 
    test_preds = np.zeros((len(test), n_splits))

    # Set up cross-validation
    skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=SEED)
    for fold, (train_idx, test_idx) in enumerate(tqdm(skf.split(X, y), desc="Training Folds", total=n_splits)):
        print(f"Training fold {fold + 1}/{n_splits}...")

        # Split the data into training and validation sets
        X_train, X_val = X.iloc[train_idx], X.iloc[test_idx]
        y_train, y_val = y.iloc[train_idx], y.iloc[test_idx]

        # Standardize the features using StandardScaler
        scaler = StandardScaler()
        X_train_scaled = scaler.fit_transform(X_train)
        X_val_scaled = scaler.transform(X_val)
        test_scaled = scaler.transform(test)

        if use_stacking:
            # Initialize the stacked regressor with the specified final estimator
            stacked_regressor = StackingRegressor(
                estimators=[(model_name, model_class()) for model_class, model_name in models],
                final_estimator=final_estimator_class(),
                n_jobs=-1
            )
            model = stacked_regressor

        else:
            # Initialize the voting regressor with the specified models
            voting_regressor = VotingRegressor(
                estimators=[(model_name, model_class()) for model_class, model_name in models]
            )
            model = voting_regressor

        # Fit the model on the scaled training data
        model = clone(model)
        model.fit(X_train_scaled, y_train)

        # Generate predictions for the validation
        y_train_pred = model.predict(X_train_scaled)
        y_val_pred = model.predict(X_val_scaled)

        oof_non_rounded[test_idx] = y_val_pred
        y_val_pred_rounded = y_val_pred.round(0).astype(int)
        oof_rounded[test_idx] = y_val_pred_rounded

        train_kappa = quadratic_weighted_kappa(y_train, y_train_pred.round(0).astype(int))
        val_kappa = quadratic_weighted_kappa(y_val, y_val_pred_rounded)

        train_S.append(train_kappa)
        test_S.append(val_kappa)
        
        test_preds[:, fold] = model.predict(test)

        print(f"Fold {fold+1} - Train QWK: {train_kappa:.4f}, Validation QWK: {val_kappa:.4f}")
        clear_output(wait=True)

    print(f"Mean Train QWK ---> {np.mean(train_S):.4f}")
    print(f"Mean Validation QWK ---> {np.mean(test_S):.4f}")

    KappaOPtimizer = minimize(evaluate_predictions,
                              x0=[0.5, 1.5, 2.5], args=(y, oof_non_rounded), 
                              method='Nelder-Mead')
    assert KappaOPtimizer.success, "Optimization did not converge."
    
    oof_tuned = threshold_rounder(oof_non_rounded, KappaOPtimizer.x)
    tKappa = quadratic_weighted_kappa(y, oof_tuned)

    print(f"Optimized QWK SCORE :: {Fore.CYAN}{Style.BRIGHT} {tKappa:.3f}{Style.RESET_ALL}")  
    tpm = test_preds.mean(axis=1)
    tpTuned = threshold_rounder(tpm, KappaOPtimizer.x)

    # Prepare submission DataFrame
    submission = pd.DataFrame({
        'id': sample_submission['id'],  # Use 'id' from sample submission
        'sii': tpTuned
    })

    return submission

models = [
    (lambda: RandomForestRegressor(), 'Random Forest'),
    (lambda: XGBRegressor(verbosity=0), 'XGBoost'),
    (lambda: CatBoostRegressor(verbose=0), 'CatBoost'),
    (lambda: GradientBoostingRegressor(), 'Gradient Boosting')
]

from sklearn.ensemble import StackingRegressor
from sklearn.ensemble import GradientBoostingRegressor

# Stacking Regressor

submission4 = train_and_evaluate(
    train_imputed, 
    test_imputed, 
    models, 
    use_stacking=True,
    final_estimator_class=GradientBoostingRegressor
)

submission4

Training Folds: 100%|██████████| 5/5 [22:46<00:00, 273.37s/it]


Mean Train QWK ---> 0.9671
Mean Validation QWK ---> 0.5382
Optimized QWK SCORE :: [36m[1m 0.579[0m


Unnamed: 0,id,sii
0,00008ff9,2
1,000fd460,2
2,00105258,2
3,00115b9f,2
4,0016bb22,2
5,001f3379,2
6,0038ba98,2
7,0068a485,2
8,0069fbed,2
9,0083e397,2


In [10]:
sub1 = submission1
sub2 = submission2
sub3 = submission3
sub4 = submission4

sub1 = sub1.sort_values(by='id').reset_index(drop=True)
sub2 = sub2.sort_values(by='id').reset_index(drop=True)
sub3 = sub3.sort_values(by='id').reset_index(drop=True)
sub4 = sub4.sort_values(by='id').reset_index(drop=True)

combined = pd.DataFrame({
    'id': sub1['id'],
    'sii_1': sub1['sii'],
    'sii_2': sub2['sii'],
    'sii_3': sub3['sii'],
    'sii_4': sub4['sii']
})

def majority_vote(row):
    return row.mode()[0]

combined['final_sii'] = combined[['sii_1', 'sii_2', 'sii_3', 'sii_4']].apply(majority_vote, axis=1)

final_submission = combined[['id', 'final_sii']].rename(columns={'final_sii': 'sii'})

final_submission.to_csv('submission.csv', index=False)

print("Majority voting completed and saved to 'Submission.csv'")
final_submission

Majority voting completed and saved to 'Submission.csv'


Unnamed: 0,id,sii
0,00008ff9,2
1,000fd460,0
2,00105258,0
3,00115b9f,0
4,0016bb22,0
5,001f3379,1
6,0038ba98,0
7,0068a485,0
8,0069fbed,2
9,0083e397,0
