In [53]:
import os
import warnings

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm import tqdm
from concurrent.futures import ThreadPoolExecutor

from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import KNNImputer, IterativeImputer
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import (RandomForestClassifier, RandomForestRegressor,
                              VotingClassifier, VotingRegressor, GradientBoostingClassifier,
                              GradientBoostingRegressor, AdaBoostClassifier)
from sklearn.metrics import (classification_report, confusion_matrix, mean_squared_error,
                             accuracy_score, cohen_kappa_score, r2_score, make_scorer)
from sklearn.decomposition import PCA
from sklearn.tree import DecisionTreeRegressor 
from scipy.optimize import minimize

from keras.models import Model
from keras.layers import Input, Dense

from xgboost import XGBRegressor
from catboost import CatBoostRegressor

import torch
import torch.nn as nn
import torch.optim as optim

# Progress Report #1

# Importing Data

In [None]:
# Load Data

train_df = pd.read_csv('/kaggle/input/child-mind-institute-problematic-internet-use/train.csv')
test_df = pd.read_csv('/kaggle/input/child-mind-institute-problematic-internet-use/test.csv')
data_dict = pd.read_csv('/kaggle/input/child-mind-institute-problematic-internet-use/data_dictionary.csv')

# Data Preparation

In [None]:
# Get Statistical details

train_df.describe().transpose()

In [None]:
train_df.info()

In [None]:
test_df.info()

In [None]:
data_dict.head()

## EDA

In [None]:
# Check the distribution of the target variable 'sii'
class_distribution = train_df['sii'].value_counts().sort_index()
print("Class Distribution:\n", class_distribution)

# Set the style for the plot
sns.set(style="whitegrid")

# Create a bar plot of class distribution
plt.figure(figsize=(10, 6))
sns.barplot(x=class_distribution.index, y=class_distribution.values, palette='viridis', alpha=0.8)

# Set the title and labels
plt.title('Class Distribution of Target Variable (sii)', fontsize=16)
plt.xlabel('Classes', fontsize=14)
plt.ylabel('Number of Instances', fontsize=14)
plt.xticks(rotation=45)
plt.grid(axis='y', linestyle='--', alpha=0.7)  # Optional: add gridlines for better readability

# Show the plot
plt.show()

In [None]:
def calculate_stats(data, columns):
    if isinstance(columns, str):
        columns = [columns]

    stats = []
    for col in columns:
        if data[col].dtype in ['object', 'category']:
            counts = data[col].value_counts(dropna=False, sort=False)
            percents = data[col].value_counts(normalize=True, dropna=False, sort=False) * 100
            formatted = counts.astype(str) + ' (' + percents.round(2).astype(str) + '%)'
            stats_col = pd.DataFrame({'count (%)': formatted})
            stats.append(stats_col)

        else:
            stats_col = data[col].describe().to_frame().transpose()
            stats_col['missing'] = data[col].isnull().sum()
            stats_col.index.name = col
            stats.append(stats_col)

    return pd.concat(stats, axis=0)

In [None]:
# Temporary variable to hold the new column
temp_data = train_df.copy()
temp_data['Age Group'] = pd.cut(
    temp_data['Basic_Demos-Age'],
    bins=[4, 12, 18, 22],
    labels=['Children (5-12)', 'Adolescents (13-18)', 'Adults (19-22)']
)

calculate_stats(temp_data, 'Age Group')

In [None]:
sex_counts_renamed = train_df['Basic_Demos-Sex'].replace({0: 'Male', 1: 'Female'}).value_counts()
sex_counts_renamed

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Create the boxplot
fig, ax = plt.subplots(figsize=(5, 5))

# Create the boxplot for SII by Age without hue
sns.boxplot(y=train_df['Basic_Demos-Age'], x=train_df['sii'], ax=ax, palette="Set3")

# Set the title and labels
ax.set_title('SII by Age')  # Set the title on the ax object
ax.set_ylabel('Age')  # Set the y-axis label
ax.set_xlabel('SII')  # Set the x-axis label

# Show the plot
plt.show()

In [None]:
# Create the boxplot

fig, ax = plt.subplots(figsize=(5, 5)) 
sns.boxplot(y=train_df['Physical-BMI'], x=train_df['sii'], ax=ax, palette="Set3")  # Pass ax directly

# Set the title and labels
ax.set_title('SII by Physical BMI')  
ax.set_ylabel('BMI')  # Set the y-axis label
ax.set_xlabel('SII')  # Set the x-axis label

# Show the plot
plt.show()

---

# Data Preprocessing

In [None]:
common_columns = train_df.columns.intersection(test_df.columns)

In [None]:
# Create a copy of the original train_df for reference
train_original = train_df.copy()

# Filter train_df to keep only the common columns plus the 'sii' column
train_df = train_df[common_columns.union(['sii'])]

In [None]:
train_df

## Check for Duplicate Data

In [None]:
train_df.duplicated().sum()

There are no duplicates in this data.

## Missing Values

In [None]:
train_df.isnull()

In [None]:
train_df.isnull().sum()

In [None]:
# Calculate missing values by columns

def check_missing_values(row):

    """ functions that check and verifies if there are missing values in dataframe """

    counter = 0
    for element in row:
        if element == True:
            counter+=1

    return ("The amount of missing records is: ", counter)

train_df.isnull().apply(lambda x: check_missing_values(x))

In [None]:
# Calculate missing values in every record

train_df.isnull().apply(lambda x: check_missing_values(x), axis=1)

## Encoding Seasons as Numbers

In [None]:
# Check for unique values

print(train_df['Basic_Demos-Enroll_Season'].unique())

In [None]:
# Display columns containing 'Season'

season_columns = [col for col in train_df.columns if 'Season' in col]
print("Columns containing 'Season':", season_columns)

In [None]:
season_mapping = {'Spring': 1, 'Summer': 2, 'Fall': 3, 'Winter': 4}

for column in season_columns:
    train_df[column] = train_df[column].map(season_mapping)

In [None]:
# target
train_df["sii"].value_counts()

## Predict Missing Values

In [None]:
# Impute missing values

imputer = KNNImputer(n_neighbors=5, weights='distance', metric='nan_euclidean')
numeric_cols = train_df.select_dtypes(include=['float64', 'int64']).columns
imputed_data = imputer.fit_transform(train_df[numeric_cols])
train_imputed = pd.DataFrame(imputed_data, columns=numeric_cols)

# Convert the 'sii' column back to integers and ensure it stays between 0 and 3

if 'sii' in numeric_cols:
    train_imputed['sii'] = train_imputed['sii'].clip(lower=0, upper=3).round().astype(int)

# Convert season_columns to integers

for column in season_columns:
    if column in train_imputed.columns:
        train_imputed[column] = train_imputed[column].round().astype(int)

# Retain other columns from the original DataFrame

for col in train_df.columns:
    if col not in numeric_cols:
        train_imputed[col] = train_df[col]

In [None]:
train_imputed.isnull().apply(lambda x: check_missing_values(x))

---

## Feature Engineering


### Add variables

The dataset contains features related to physical characteristics (eg, BMI, Height, Weight), behavioral aspects (eg, internet usage), and fitness data (eg, endurance time).

In [54]:
# Feature Engineering
import pandas as pd

def feature_engineering(df):
    # Calculate new features
    new_features = pd.DataFrame({
        'BMI_Age': df['Physical-BMI'] * df['Basic_Demos-Age'],
        'Internet_Hours_Age': df['PreInt_EduHx-computerinternet_hoursday'] * df['Basic_Demos-Age'],
        'BMI_Internet_Hours': df['Physical-BMI'] * df['PreInt_EduHx-computerinternet_hoursday'],
        'BFP_BMI': df['BIA-BIA_Fat'] / df['BIA-BIA_BMI'],
        'FFMI_BFP': df['BIA-BIA_FFMI'] / df['BIA-BIA_Fat'],
        'FMI_BFP': df['BIA-BIA_FMI'] / df['BIA-BIA_Fat'],
        'LST_TBW': df['BIA-BIA_LST'] / df['BIA-BIA_TBW'],
        'BFP_BMR': df['BIA-BIA_Fat'] * df['BIA-BIA_BMR'],
        'BFP_DEE': df['BIA-BIA_Fat'] * df['BIA-BIA_DEE'],
        'BMR_Weight': df['BIA-BIA_BMR'] / df['Physical-Weight'],
        'DEE_Weight': df['BIA-BIA_DEE'] / df['Physical-Weight'],
        'SMM_Height': df['BIA-BIA_SMM'] / df['Physical-Height'],
        'Muscle_to_Fat': df['BIA-BIA_SMM'] / df['BIA-BIA_FMI'],
        'Hydration_Status': df['BIA-BIA_TBW'] / df['Physical-Weight'],
        'ICW_TBW': df['BIA-BIA_ICW'] / df['BIA-BIA_TBW']
    })

    # Concatenate new features with the original DataFrame
    df = pd.concat([df, new_features], axis=1)
    
    return df

In [None]:
# Apply feature engineering and clean data

train_imputed = feature_engineering(train_imputed)
train_imputed.dropna(thresh=1, axis=0, inplace=True)
train_imputed.replace([np.inf, -np.inf], 0, inplace=True)

---

### PCA

In [None]:
# PCA Implementation

# Split the dataset into features and target variable
X = train_imputed.drop('id', axis=1)  # Drop 'id' column from features
y = train_imputed['sii']  # Target variable

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Standardize the features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Apply PCA
pca = PCA(n_components=0.95)  # Preserve 95% of variance
X_train_pca = pca.fit_transform(X_train_scaled)
X_test_pca = pca.transform(X_test_scaled)

# Check the shape of the PCA output
print("Original shape:", X_train.shape)
print("Transformed shape after PCA:", X_train_pca.shape)

### Ada Model

In [None]:
# Initialize the AdaBoost model
ada_model = AdaBoostClassifier(n_estimators=100, random_state=42)

# Fit the model on PCA-transformed training data
ada_model.fit(X_train_pca, y_train)

# Make predictions on the test set
y_pred_ada = ada_model.predict(X_test_pca)

# Calculate accuracy
accuracy_ada = accuracy_score(y_test, y_pred_ada)
print(f'AdaBoost Model Accuracy: {accuracy_ada:.4f}')

### Gradient Boost

In [None]:
# Initialize the Gradient Boosting model
gb_model = GradientBoostingClassifier(n_estimators=100, random_state=42)

# Fit the model on PCA-transformed training data
gb_model.fit(X_train_pca, y_train)

# Make predictions on the test set
y_pred_gb = gb_model.predict(X_test_pca)

# Calculate accuracy
accuracy_gb = accuracy_score(y_test, y_pred_gb)
print(f'Gradient Boosting Model Accuracy: {accuracy_gb:.4f}')

### Random Forest

In [None]:
# Initialize the Random Forest model
rf_model = RandomForestClassifier(n_estimators=100, random_state=42, class_weight='balanced')

# Fit the model on PCA-transformed training data
rf_model.fit(X_train_pca, y_train)

# Make predictions on the test set
y_pred_rf = rf_model.predict(X_test_pca)

# Calculate accuracy
accuracy_rf = accuracy_score(y_test, y_pred_rf)
print(f'Random Forest Model Accuracy: {accuracy_rf:.4f}')

In [None]:
# Initialize the voting classifier with the three models
voting_model = VotingClassifier(
    estimators=[
        ('rf', rf_model),
        ('ada', ada_model),
        ('gb', gb_model)
    ],
    voting='soft'  # Use soft voting to consider predicted probabilities
)

# Fit the voting model on PCA-transformed training data
voting_model.fit(X_train_pca, y_train)

# Make predictions on the test set
y_pred_voting = voting_model.predict(X_test_pca)

# Calculate accuracy
accuracy_voting = accuracy_score(y_test, y_pred_voting)
print(f'Voting Classifier Model Accuracy: {accuracy_voting:.4f}')

---

# Progress Report #2

In [55]:
# Load Data

train_og = pd.read_csv('/kaggle/input/child-mind-institute-problematic-internet-use/train.csv')
test_og = pd.read_csv('/kaggle/input/child-mind-institute-problematic-internet-use/test.csv')

### Add in Actigraphy Time Series Data

In [56]:
def load_and_process_data(directory):
    files = os.listdir(directory)
    all_stats = []

    with ThreadPoolExecutor() as executor:
        futures = [executor.submit(pd.read_parquet, os.path.join(directory, file, 'part-0.parquet')) for file in files]
        for future in tqdm(futures):
            data = future.result()
            if 'step' in data.columns:
                data.drop('step', axis=1, inplace=True)

            # Calculate summary statistics
            stats = data.describe().values.reshape(-1)
            all_stats.append(stats)

    # Create a DataFrame for summary statistics
    stat_columns = [f"stat_{i}" for i in range(len(all_stats[0]))]
    summary_df = pd.DataFrame(all_stats, columns=stat_columns)
    summary_df['id'] = [file.split('=')[1] for file in files]  # Extract 'id' from filenames

    return summary_df

In [57]:
# Load actigraphy time series data
train_ts = load_and_process_data("/kaggle/input/child-mind-institute-problematic-internet-use/series_train.parquet")
test_ts = load_and_process_data("/kaggle/input/child-mind-institute-problematic-internet-use/series_test.parquet")

df_train = train_ts.drop('id', axis=1)
df_test = test_ts.drop('id', axis=1)

100%|██████████| 996/996 [02:28<00:00,  6.71it/s]
100%|██████████| 2/2 [00:00<00:00,  6.55it/s]


In [58]:
class SimpleAutoEncoder(nn.Module):
    def __init__(self, input_dim, encoding_dim):
        super(SimpleAutoEncoder, self).__init__()
        self.encoder = nn.Sequential(
            nn.Linear(input_dim, encoding_dim * 2),
            nn.ReLU(),
            nn.Linear(encoding_dim * 2, encoding_dim),
            nn.ReLU()
        )
        self.decoder = nn.Sequential(
            nn.Linear(encoding_dim, encoding_dim * 2),
            nn.ReLU(),
            nn.Linear(encoding_dim * 2, input_dim),
            nn.Sigmoid()
        )

    def forward(self, x):
        return self.decoder(self.encoder(x))

def train_autoencoder(data, encoding_dim=10, epochs=20, batch_size=16):
    scaler = StandardScaler()
    scaled_data = scaler.fit_transform(data)
    tensor_data = torch.FloatTensor(scaled_data)

    autoencoder = SimpleAutoEncoder(input_dim=tensor_data.shape[1], encoding_dim=encoding_dim)
    criterion = nn.MSELoss()
    optimizer = optim.Adam(autoencoder.parameters())

    for epoch in range(epochs):
        for i in range(0, len(tensor_data), batch_size):
            batch = tensor_data[i:i + batch_size]
            optimizer.zero_grad()
            loss = criterion(autoencoder(batch), batch)
            loss.backward()
            optimizer.step()
        if (epoch + 1) % 5 == 0:
            print(f'Epoch {epoch + 1}/{epochs}, Loss: {loss.item():.4f}')

    with torch.no_grad():
        encoded_data = autoencoder.encoder(tensor_data).numpy()
    
    return pd.DataFrame(encoded_data, columns=[f'Enc_{i+1}' for i in range(encoded_data.shape[1])])

In [59]:
# Autoencode Data
train_ts_encoded = train_autoencoder(df_train, encoding_dim=60, epochs=100, batch_size=32)
test_ts_encoded = train_autoencoder(df_test, encoding_dim=60, epochs=100, batch_size=32)

Epoch 5/100, Loss: 1.4830
Epoch 10/100, Loss: 1.3856
Epoch 15/100, Loss: 1.3797
Epoch 20/100, Loss: 1.3719
Epoch 25/100, Loss: 1.3707
Epoch 30/100, Loss: 1.3620
Epoch 35/100, Loss: 1.3590
Epoch 40/100, Loss: 1.3573
Epoch 45/100, Loss: 1.3582
Epoch 50/100, Loss: 1.3575
Epoch 55/100, Loss: 1.3595
Epoch 60/100, Loss: 1.3518
Epoch 65/100, Loss: 1.3535
Epoch 70/100, Loss: 1.3515
Epoch 75/100, Loss: 1.3519
Epoch 80/100, Loss: 1.3508
Epoch 85/100, Loss: 1.3529
Epoch 90/100, Loss: 1.3520
Epoch 95/100, Loss: 1.3517
Epoch 100/100, Loss: 1.3499
Epoch 5/100, Loss: 1.0821
Epoch 10/100, Loss: 1.0216
Epoch 15/100, Loss: 0.8897
Epoch 20/100, Loss: 0.6921
Epoch 25/100, Loss: 0.5248
Epoch 30/100, Loss: 0.4460
Epoch 35/100, Loss: 0.4290
Epoch 40/100, Loss: 0.4273
Epoch 45/100, Loss: 0.4271
Epoch 50/100, Loss: 0.4271
Epoch 55/100, Loss: 0.4271
Epoch 60/100, Loss: 0.4271
Epoch 65/100, Loss: 0.4271
Epoch 70/100, Loss: 0.4271
Epoch 75/100, Loss: 0.4271
Epoch 80/100, Loss: 0.4271
Epoch 85/100, Loss: 0.4271
Ep

In [60]:
time_series_cols = train_ts_encoded.columns.tolist()
# Add 'id' back to the encoded DataFrame
train_ts_encoded["id"]=train_ts["id"]
test_ts_encoded['id']=test_ts["id"]

train = pd.merge(train_og, train_ts_encoded, how="left", on='id')
test = pd.merge(test_og, test_ts_encoded, how="left", on='id')

In [61]:
# Calculate % of Missing sii

# Count the total number of rows
total_rows = len(train)

# Count the missing values in the 'sii' column
missing_sii = train['sii'].isna().sum()

# Calculate the percentage of missing values
missing_percentage = (missing_sii / total_rows) * 100

print(f"Percentage of missing values in 'sii': {missing_percentage:.2f}%")

Percentage of missing values in 'sii': 30.91%


Since more than 30% of the data is missing, imputation might be a better choice than removing the rows. 

### Impute Missing Data
#### Use KNN Imputer 

Given the large dataset and the convergence warning upon MICE, switching from IterativeImputer to a simpler imputer, like KNNImputer, might be beneficial for handling such large datasets. 
KNNImputer can be faster and less likely to run into convergence issues, especially with higher-dimensional data.

In [62]:
def impute_missing_values(data, season_columns, season_mapping):
    # Encode Seasons
    data[season_columns] = data[season_columns].map(lambda x: season_mapping.get(x, x))
    
    # Identify numeric columns
    numeric_cols = data.select_dtypes(include=['float64', 'float32', 'int64']).columns
    
    # Scale numeric features for KNN imputation
    scaler = StandardScaler()
    data_scaled = data.copy()
    data_scaled[numeric_cols] = scaler.fit_transform(data[numeric_cols])
    
    # Initialize the imputer and apply it only on numeric columns with missing values
    imputer = KNNImputer(n_neighbors=5)
    imputed_numeric_data = imputer.fit_transform(data_scaled[numeric_cols])
    imputed_scaled_df = pd.DataFrame(imputed_numeric_data, columns=numeric_cols)
    
    # Invert scaling to original scale for imputed numeric columns
    imputed_data = data.copy()
    imputed_data[numeric_cols] = scaler.inverse_transform(imputed_scaled_df)
    
    # Clip and convert 'sii' to integers
    if 'sii' in imputed_data.columns:
        imputed_data['sii'] = imputed_data['sii'].clip(lower=0, upper=3).round().astype(int)
    
    # Ensure other columns remain intact
    for col in imputed_data.columns:
        if col not in numeric_cols:
            imputed_data[col] = data[col]
    
    # Convert season columns to integers
    imputed_data[season_columns] = imputed_data[season_columns].astype(int)
    
    return imputed_data

In [63]:
# Define season mapping
season_mapping = {'Spring': 1, 'Summer': 2, 'Fall': 3, 'Winter': 4}

# For the train set
season_columns_train = [col for col in train.columns if 'Season' in col]
train_imputed = impute_missing_values(train, season_columns_train, season_mapping)

# For the test set
season_columns_test = [col for col in test.columns if 'Season' in col]
test_imputed = impute_missing_values(test, season_columns_test, season_mapping)

In [64]:
# Perform feature engineering
train_imputed = feature_engineering(train_imputed)
train_imputed.dropna(thresh=1, axis=0, inplace=True)
train_imputed.replace([np.inf, -np.inf], 0, inplace=True)

test_imputed = feature_engineering(test_imputed)

In [65]:
# Get the columns from both DataFrames
train_cols = set(train_og.columns)
test_cols = set(test_og.columns)

# Find common columns
common_cols = train_cols.intersection(test_cols)
featuresCols = [col for col in common_cols if col != 'id']
featuresCols += time_series_cols

test_imputed = test_imputed[featuresCols]
featuresCols.append('sii')
train_imputed = train_imputed[featuresCols]

In [66]:
# Split the dataset into features and target variable
X = train_imputed.drop('sii', axis=1)
y = train_imputed['sii']  # Target variable

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [67]:
# Standardize the features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

test_scaled = scaler.transform(test_imputed)

In [68]:
# Apply PCA
pca = PCA(n_components=0.95)  # Preserve 95% of variance
X_train_pca = pca.fit_transform(X_train_scaled)
X_test_pca = pca.transform(X_test_scaled)
test_pca = pca.transform(test_scaled)

# Models

The target (sii) has an ordinal nature, with values representing categories: 0 for None, 1 for Mild, 2 for Moderate, and 3 for Severe. 
Therefore, regression is appropriate for this analysis to capture the inherent order in the severity levels.

### Voting Regressor

### Hyperparameters

In [69]:
# XGBoost Hyperparameters
xgb_params = {
    'n_estimators': 200,
    'learning_rate': 0.05,
    'max_depth': 6,
    'min_child_weight': 1,
    'subsample': 0.8,
    'colsample_bytree': 0.8,
    'gamma': 0,
    'reg_alpha': 1,
    'reg_lambda': 5,
    'random_state': 42
}

# CatBoost Hyperparameters
cat_params = {
    'iterations': 200,
    'learning_rate': 0.05,
    'depth': 6,
    'l2_leaf_reg': 10,
    'subsample': 0.8,
    'rsm': 0.8,
    'border_count': 32,
    'random_state': 42,
    'silent': True
}

# Random Forest Hyperparameters
rf_params = {
    'n_estimators': 200,
    'max_depth': None,
    'min_samples_split': 2,
    'min_samples_leaf': 1,
    'bootstrap': True,
    'random_state': 42
}

# Gradient Boosting Hyperparameters
gb_params = {
    'n_estimators': 200,
    'learning_rate': 0.05,
    'max_depth': 3,
    'min_samples_split': 2,
    'min_samples_leaf': 1,
    'subsample': 1.0,
    'random_state': 42
}

### QWK Metric

In [None]:
# Define QWK calculation function
def quadratic_weighted_kappa(y_true, y_pred, num_classes=4):
    y_pred = y_pred.round().astype(int).clip(0, num_classes - 1)
    return cohen_kappa_score(y_true, y_pred, weights='quadratic')

# Define a function to round predictions based on optimized thresholds
def threshold_rounder(predictions, thresholds):
    return np.where(predictions < thresholds[0], 0,
                    np.where(predictions < thresholds[1], 1,
                             np.where(predictions < thresholds[2], 2, 3)))

# Optimization function for QWK thresholds
def optimize_qwk_thresholds(predictions, y_true):
    def evaluate_thresholds(thresholds):
        rounded_preds = threshold_rounder(predictions, thresholds)
        return -quadratic_weighted_kappa(y_true, rounded_preds)
    
    # Optimize thresholds
    result = minimize(evaluate_thresholds, x0=[0.5, 1.5, 2.5], method='Nelder-Mead')
    return result.x if result.success else [0.5, 1.5, 2.5]

In [None]:
# Initialize models with specified hyperparameters
rf_model = RandomForestRegressor(**rf_params)
xgb_model = XGBRegressor(**xgb_params)
cat_model = CatBoostRegressor(**cat_params)
gb_model = GradientBoostingRegressor(**gb_params)

In [None]:
# Initialize the voting regressor with optimized thresholds
voting_regressor = VotingRegressor(estimators=[
    ('rf', rf_model),
    ('xgboost', xgb_model),
    ('catboost', cat_model),
    ('gb', gb_model)
])

# Fit and predict with voting regressor
voting_regressor.fit(X_train_scaled, y_train)
y_pred = voting_regressor.predict(X_test_scaled)

# Optimize thresholds to maximize QWK
optimal_thresholds = optimize_qwk_thresholds(y_pred, y_test)
y_pred_rounded = threshold_rounder(y_pred, optimal_thresholds)

# Calculate QWK with optimized thresholds
qwk_score = quadratic_weighted_kappa(y_test, y_pred_rounded)
print(f'Optimized Voting Regressor QWK: {qwk_score:.4f}')

In [None]:
# Round predictions to nearest integer as QWK requires discrete categories
y_pred_rounded = np.round(y_pred).astype(int)

# Calculate the Quadratic Weighted Kappa score
qwk_score = cohen_kappa_score(y_test, y_pred_rounded, weights='quadratic')
print("Quadratic Weighted Kappa score:", qwk_score)

In [None]:
# Predict on test data and prepare for submission
submission1 = voting_regressor.predict(test_scaled)

# Convert predictions to integers by rounding
submission1 = submission1.round().astype(int)

# Create the submission DataFrame with 'id' from reloaded test_df and 'sii' as integer type
submission = pd.DataFrame({
    'id': test_og['id'],  # Use the original 'id' column from the reloaded test data
    'sii': submission1  # 'sii' is now integer as required
})

# Save the submission file
submission.to_csv('/kaggle/working/submission.csv', index=False)
print("Submission file created: submission.csv")
submission

In [None]:
## trial

In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import cohen_kappa_score
from scipy.optimize import minimize
from tqdm import tqdm  # For progress bars
from colorama import Fore, Style  # For colored output (if needed)

# Define constants
N_SPLITS = 5
SEED = 42

# Define QWK calculation function
def quadratic_weighted_kappa(y_true, y_pred, num_classes=4):
    y_pred = y_pred.round().astype(int).clip(0, num_classes - 1)
    return cohen_kappa_score(y_true, y_pred, weights='quadratic')

# Define a function to round predictions based on optimized thresholds
def threshold_rounder(predictions, thresholds):
    return np.where(predictions < thresholds[0], 0,
                    np.where(predictions < thresholds[1], 1,
                             np.where(predictions < thresholds[2], 2, 3)))

# Optimization function for QWK thresholds
def optimize_thresholds(y_true, preds):
    return minimize(lambda th: -quadratic_weighted_kappa(y_true, threshold_rounder(preds, th)), 
                    x0=[0.5, 1.5, 2.5], method='Nelder-Mead').x

def train_model(model, train, test):
    X = train.drop(['sii'], axis=1)
    y = train['sii']
    test_preds = np.zeros((len(test), N_SPLITS))
    oof_preds = np.zeros(len(y))

    skf = StratifiedKFold(n_splits=N_SPLITS, shuffle=True, random_state=SEED)
    for fold, (train_idx, val_idx) in enumerate(tqdm(skf.split(X, y), desc="Training Folds", total=N_SPLITS)):
        X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
        y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]
        
        model.fit(X_train, y_train)
        oof_preds[val_idx] = model.predict(X_val)
        test_preds[:, fold] = model.predict(test)

    # Optimize thresholds for out-of-fold predictions
    optimized_thresholds = optimize_thresholds(y, oof_preds)
    oof_preds_rounded = threshold_rounder(oof_preds, optimized_thresholds)

    print(f"Mean Train QWK --> {quadratic_weighted_kappa(y, oof_preds_rounded):.4f}")
    
    final_test_preds = threshold_rounder(test_preds.mean(axis=1), optimized_thresholds)
    
    # Print optimized QWK score
    optimized_qwk_score = quadratic_weighted_kappa(y, oof_preds_rounded)
    print(f"----> || Optimized QWK SCORE :: {Fore.CYAN}{Style.BRIGHT}{optimized_qwk_score:.3f}{Style.RESET_ALL}")

    return pd.DataFrame({'id': test['id'], 'sii': final_test_preds})

# Example of usage
# submission = train_model(your_model, train_data, test_data)