In [None]:
import numpy as np
import pandas as pd
import os
import re
from sklearn.base import clone
from sklearn.metrics import cohen_kappa_score
from sklearn.model_selection import StratifiedKFold
from scipy.optimize import minimize
from concurrent.futures import ThreadPoolExecutor
from tqdm import tqdm

from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt
from keras.models import Model
from keras.layers import Input, Dense
from keras.optimizers import Adam

from colorama import Fore, Style
from IPython.display import clear_output
import warnings
from lightgbm import LGBMRegressor
from xgboost import XGBRegressor
from catboost import CatBoostRegressor
from sklearn.ensemble import VotingRegressor, RandomForestRegressor, GradientBoostingRegressor
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
warnings.filterwarnings('ignore')
pd.options.display.max_columns = None
import torch
from torch import nn

# Random seed shared by the cross-validation splitter and the boosted-tree model configs.
SEED = 42
# Number of folds used by the stratified cross-validation in TrainML.
n_splits = 5

In [None]:
def process_file(filename, dirname):
    """Summarise one time-series partition as a flat statistics vector.

    Reads ``<dirname>/<filename>/part-0.parquet``, discards the ``step``
    column and flattens the ``DataFrame.describe()`` table of the remaining
    columns into a 1-D array.

    Returns:
        tuple: ``(flattened_stats, identifier)`` where ``identifier`` is the
        piece of ``filename`` that follows its first ``'='``.
    """
    parquet_path = os.path.join(dirname, filename, 'part-0.parquet')
    frame = pd.read_parquet(parquet_path).drop(columns='step')
    flattened_stats = frame.describe().values.ravel()
    identifier = filename.split('=')[1]
    return flattened_stats, identifier

def load_time_series(dirname) -> pd.DataFrame:
    """Build one row of summary statistics per entry found in ``dirname``.

    Every directory entry is handed to ``process_file`` on a thread pool
    (a tqdm bar reports progress). The flattened statistics become columns
    ``stat_0 .. stat_{k-1}`` and the identifier returned by ``process_file``
    is stored in the ``id`` column.
    """
    entries = os.listdir(dirname)

    with ThreadPoolExecutor() as pool:
        # map() pairs each entry with the (repeated) directory name.
        mapped = pool.map(process_file, entries, [dirname] * len(entries))
        results = list(tqdm(mapped, total=len(entries)))

    stats, indexes = zip(*results)

    stat_columns = [f"stat_{i}" for i in range(len(stats[0]))]
    df = pd.DataFrame(stats, columns=stat_columns)
    df['id'] = indexes
    return df

In [None]:
# Tabular competition files: labelled train set, unlabelled test set, submission template.
train, test, sample = (
    pd.read_csv('/kaggle/input/child-mind-institute-problematic-internet-use/train.csv'),
    pd.read_csv('/kaggle/input/child-mind-institute-problematic-internet-use/test.csv'),
    pd.read_csv('/kaggle/input/child-mind-institute-problematic-internet-use/sample_submission.csv'),
)

# Per-id summary statistics of the time-series parquet partitions.
train_ts = load_time_series("/kaggle/input/child-mind-institute-problematic-internet-use/series_train.parquet")
test_ts = load_time_series("/kaggle/input/child-mind-institute-problematic-internet-use/series_test.parquet")

# Statistics only (identifier column removed).
df_train = train_ts.drop(columns='id')
df_test = test_ts.drop(columns='id')

In [None]:
# Attach the `sii` label from the tabular train set to each time-series row.
temp_df = train[['id', 'sii']]
time_series_df_with_target = train_ts.merge(temp_df, how="left", on='id')

# The test side has no label; the left merge on `id` keeps one row per time-series id.
temp_df = test[['id']]
time_series_df_without_target = test_ts.merge(temp_df, how="left", on='id')

In [None]:
from torch.utils.data import Dataset,DataLoader
from sklearn.preprocessing import StandardScaler

class CustomDataset(Dataset):
    """Torch dataset over standardised time-series summary features.

    Args:
        dataframe: Frame with an ``id`` column, the feature columns and,
            optionally, the ``sii`` target column. When ``sii`` is present
            the dataset yields ``(features, target)`` pairs, otherwise it
            yields the features only.
        scaler: Optional, already fitted scaler exposing ``transform``.
            When given, it is applied as-is so that another dataset (for
            example the test set) can reuse the statistics learned on the
            training set. When omitted, a new ``StandardScaler`` is fitted
            on ``dataframe`` itself, which is the original behaviour.

    Attributes:
        scaler: The scaler used to produce ``scaled_data``.
        train: ``True`` when targets are available.
        targets: Raw ``sii`` values (only set when ``train`` is ``True``).
        scaled_data: Scaled feature matrix, one row per sample.
    """

    def __init__(self, dataframe, scaler=None):
        self.train = 'sii' in dataframe.columns
        if self.train:
            # Features are everything except the identifier and the target.
            features = dataframe.drop(['id', 'sii'], axis=1)
            self.targets = dataframe['sii'].values
        else:
            features = dataframe.drop(['id'], axis=1)

        if scaler is None:
            # No scaler supplied: learn the scaling from this frame.
            self.scaler = StandardScaler()
            self.scaled_data = self.scaler.fit_transform(features)
        else:
            # Reuse the caller's fitted scaler; never re-fit it here.
            self.scaler = scaler
            self.scaled_data = self.scaler.transform(features)

    def __len__(self):
        return len(self.scaled_data)

    def __getitem__(self, idx):
        """Return the float32 feature tensor, plus the long target tensor when labelled."""
        sample_features = torch.tensor(self.scaled_data[idx], dtype=torch.float32)
        if self.train:
            # Targets are cast to long because they are used as class indices.
            # NOTE(review): a missing (NaN) `sii` cannot be represented as long — confirm
            # the frame passed in has no unlabeled rows.
            return sample_features, torch.tensor(self.targets[idx], dtype=torch.long)
        return sample_features
        
# Training data: scaler is fitted here; batches are shuffled for SGD.
train_dataset = CustomDataset(time_series_df_with_target)
train_dataloader = DataLoader(train_dataset, batch_size=32, shuffle=True)

test_dataset = CustomDataset(time_series_df_without_target)
# The encoder is trained on features standardised with the *training* statistics,
# so the test features must go through that same fitted scaler rather than one
# fitted on the test rows themselves.
test_dataset.scaled_data = train_dataset.scaler.transform(
    time_series_df_without_target.drop(['id'], axis=1)
)
# Inference must keep dataset order: the extracted latents are later matched to
# ids by position, so shuffling here would pair embeddings with the wrong ids.
test_dataloader = DataLoader(test_dataset, batch_size=1, shuffle=False)

In [None]:
class LSTMEncoder(nn.Module):
    """LSTM encoder with a latent projection and a classification head.

    Args:
        input_size: Number of features per time step.
        hidden_size: Width of the LSTM hidden state.
        latent_dim: Size of the latent vector produced by ``fc_latent``.
        num_classes: Number of logits produced by ``fc_class``.
        num_layers: Number of stacked LSTM layers (default 1).
    """

    def __init__(self, input_size, hidden_size, latent_dim, num_classes, num_layers=1):
        super().__init__()
        # Recurrent backbone; inputs are laid out as (batch, seq, feature).
        self.lstm = nn.LSTM(input_size, hidden_size, num_layers, batch_first=True)
        # Hidden state -> latent vector.
        self.fc_latent = nn.Linear(hidden_size, latent_dim)
        # Latent vector -> class logits.
        self.fc_class = nn.Linear(latent_dim, num_classes)

    def forward(self, x):
        """Return ``(latent, logits)`` for the input batch ``x``."""
        _outputs, (hidden_states, _cell_states) = self.lstm(x)
        # Only the final hidden state of the last LSTM layer is projected.
        top_layer_hidden = hidden_states[-1]
        latent = self.fc_latent(top_layer_hidden)
        logits = self.fc_class(latent)
        return latent, logits

In [None]:
# Encoder hyper-parameters
input_size = time_series_df_with_target.shape[1] - 2  # Feature count: every column except 'id' and 'sii'
hidden_size = 64  # Number of hidden units in LSTM
latent_dim = 30  # Number of dimensions to encode into
# nunique() ignores NaN, so a missing label is never counted as an extra class.
num_classes = time_series_df_with_target['sii'].nunique()
num_layers = 2  # LSTM layers

# Seed torch so weight initialisation (and therefore the extracted latents) is reproducible.
torch.manual_seed(SEED)

# num_layers is passed explicitly; previously it was declared above but never
# forwarded, so the model silently fell back to the constructor default of 1.
model = LSTMEncoder(input_size, hidden_size, latent_dim, num_classes, num_layers=num_layers)

# Define a loss function and optimizer
criterion = nn.CrossEntropyLoss()  # For multi-class classification
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

In [None]:
# Training loop
epochs = 40  # Number of epochs to train

model.train()
num_batches = len(train_dataloader)

for epoch in range(epochs):
    total_loss = 0.0
    for batch_data, batch_targets in train_dataloader:
        optimizer.zero_grad()
        # unsqueeze(1) inserts a length-1 sequence axis: the LSTM is batch_first,
        # so each sample is fed as a single time step.
        latent, predictions = model(batch_data.unsqueeze(1))
        loss = criterion(predictions, batch_targets)
        total_loss += loss.item()
        loss.backward()  # Backpropagate the error
        optimizer.step()  # Update the weights

    # Each loss.item() is already a per-batch mean, so the epoch average is the
    # sum divided by the number of batches (not by the batch size).
    print(f"Epoch [{epoch+1}], Loss: {total_loss/num_batches}")

In [None]:
import pandas as pd

# The latent rows are matched to ids by position afterwards, so they must be
# produced in dataset order. `train_dataloader` shuffles, hence a dedicated
# non-shuffling loader over the same dataset is used for extraction.
ordered_train_loader = DataLoader(train_dataset, batch_size=32, shuffle=False)

final_list = []

model.eval()
with torch.no_grad():
    for batch_data, batch_targets in ordered_train_loader:
        batch_data = batch_data.unsqueeze(1)  # Add the length-1 sequence axis expected by the LSTM
        latent, predictions = model(batch_data)
        final_list.append(latent)

# Stack the per-batch latents into one (n_samples, latent_dim) tensor.
all_latents = torch.cat(final_list, dim=0)

# Convert to NumPy and wrap in a DataFrame with columns enc_1 .. enc_<latent_dim>.
latent_array = all_latents.numpy()
num_latent_dims = latent_array.shape[1]
train_latent = pd.DataFrame(latent_array, columns=[f'enc_{i + 1}' for i in range(num_latent_dims)])

In [None]:
# Positional pairing: row i of train_latent is given the id found in row i of
# time_series_df_with_target, so both frames must share the same row order.
train_latent = train_latent.assign(id=time_series_df_with_target['id'])

In [None]:
# Left join keeps every tabular row; ids without a time series get NaN in the enc_* columns.
train = train.merge(train_latent, how="left", on='id')

In [None]:
from sklearn.impute import SimpleImputer, KNNImputer

# KNN-impute the float64/int64 columns of the training frame.
# NOTE(review): `sii` is among the imputed columns (it is rounded back to an
# integer below), so rows without a label receive an imputed one — confirm
# this is intended.
# NOTE(review): only float64/int64 columns are selected; numeric columns of any
# other dtype (e.g. float32) are passed through untouched — confirm this is the
# desired treatment for the enc_* columns.
imputer = KNNImputer(n_neighbors=5)
numeric_cols = train.select_dtypes(include=['float64', 'int64']).columns
imputed_data = imputer.fit_transform(train[numeric_cols])
train_imputed = pd.DataFrame(imputed_data, columns=numeric_cols)
train_imputed['sii'] = train_imputed['sii'].round().astype(int)

# Carry over every column that was not part of the imputation, unchanged.
passthrough_cols = [name for name in train.columns if name not in numeric_cols]
for col in passthrough_cols:
    train_imputed[col] = train[col]

train = train_imputed

In [None]:
# Display the training frame produced by the imputation cell for a visual sanity check.
train

In [None]:
# The latent rows are matched to ids by position afterwards, so they must be
# produced in dataset order. A dedicated non-shuffling loader guarantees that
# regardless of how `test_dataloader` was configured.
ordered_test_loader = DataLoader(test_dataset, batch_size=1, shuffle=False)

final_list = []

model.eval()
with torch.no_grad():
    for batch_data in ordered_test_loader:
        batch_data = batch_data.unsqueeze(1)  # Add the length-1 sequence axis expected by the LSTM
        latent, predictions = model(batch_data)
        final_list.append(latent)

# Stack the per-batch latents into one (n_samples, latent_dim) tensor.
all_latents = torch.cat(final_list, dim=0)

# Convert to NumPy and wrap in a DataFrame with columns enc_1 .. enc_<latent_dim>.
latent_array = all_latents.numpy()
num_latent_dims = latent_array.shape[1]
test_latent = pd.DataFrame(latent_array, columns=[f'enc_{i + 1}' for i in range(num_latent_dims)])

In [None]:
# Positional pairing: row i of test_latent is given the id found in row i of
# time_series_df_without_target, so both frames must share the same row order.
test_latent = test_latent.assign(id=time_series_df_without_target['id'])
# Left join keeps every tabular test row; ids without a time series get NaN in the enc_* columns.
test = test.merge(test_latent, how="left", on='id')

In [None]:
# Tabular columns used for modelling, plus the `sii` target.
featuresCols = ['Basic_Demos-Enroll_Season', 'Basic_Demos-Age', 'Basic_Demos-Sex',
                'CGAS-Season', 'CGAS-CGAS_Score', 'Physical-Season', 'Physical-BMI',
                'Physical-Height', 'Physical-Weight', 'Physical-Waist_Circumference',
                'Physical-Diastolic_BP', 'Physical-HeartRate', 'Physical-Systolic_BP',
                'Fitness_Endurance-Season', 'Fitness_Endurance-Max_Stage',
                'Fitness_Endurance-Time_Mins', 'Fitness_Endurance-Time_Sec',
                'FGC-Season', 'FGC-FGC_CU', 'FGC-FGC_CU_Zone', 'FGC-FGC_GSND',
                'FGC-FGC_GSND_Zone', 'FGC-FGC_GSD', 'FGC-FGC_GSD_Zone', 'FGC-FGC_PU',
                'FGC-FGC_PU_Zone', 'FGC-FGC_SRL', 'FGC-FGC_SRL_Zone', 'FGC-FGC_SRR',
                'FGC-FGC_SRR_Zone', 'FGC-FGC_TL', 'FGC-FGC_TL_Zone', 'BIA-Season',
                'BIA-BIA_Activity_Level_num', 'BIA-BIA_BMC', 'BIA-BIA_BMI',
                'BIA-BIA_BMR', 'BIA-BIA_DEE', 'BIA-BIA_ECW', 'BIA-BIA_FFM',
                'BIA-BIA_FFMI', 'BIA-BIA_FMI', 'BIA-BIA_Fat', 'BIA-BIA_Frame_num',
                'BIA-BIA_ICW', 'BIA-BIA_LDM', 'BIA-BIA_LST', 'BIA-BIA_SMM',
                'BIA-BIA_TBW', 'PAQ_A-Season', 'PAQ_A-PAQ_A_Total', 'PAQ_C-Season',
                'PAQ_C-PAQ_C_Total', 'SDS-Season', 'SDS-SDS_Total_Raw',
                'SDS-SDS_Total_T', 'PreInt_EduHx-Season',
                'PreInt_EduHx-computerinternet_hoursday', 'sii']

# Encoder columns are derived from latent_dim (enc_1 .. enc_<latent_dim>) so the
# list cannot drift out of sync with the encoder configuration.
featuresCols += [f'enc_{i}' for i in range(1, latent_dim + 1)]

# .copy() gives an independent frame so later in-place column edits do not
# operate on a slice of the previous `train`.
train = train[featuresCols].copy()
train = train.dropna(subset=['sii'])

# Select the test columns explicitly, in the same order as the training
# features, instead of relying on "everything except id" happening to match.
# This also keeps the cell re-runnable.
test = test[[col for col in featuresCols if col != 'sii']].copy()

# Season columns are categorical strings and are encoded separately.
cat_c = ['Basic_Demos-Enroll_Season', 'CGAS-Season', 'Physical-Season', 
          'Fitness_Endurance-Season', 'FGC-Season', 'BIA-Season', 
          'PAQ_A-Season', 'PAQ_C-Season', 'SDS-Season', 'PreInt_EduHx-Season']

In [None]:
def update(df):
    """Fill gaps in the columns listed in ``cat_c`` and make them categorical.

    For each column named in the module-level ``cat_c`` list, missing values
    are replaced by the string ``'Missing'`` and the column is converted to
    the ``category`` dtype. ``df`` is modified in place and also returned.
    """
    for column in cat_c:
        df[column] = df[column].fillna('Missing').astype('category')
    return df
        
# Apply the categorical clean-up to both frames (train first, then test).
train, test = update(train), update(test)

def create_mapping(column, dataset):
    """Map each distinct value of ``dataset[column]`` to an integer code.

    Codes are assigned 0, 1, 2, ... in the order the values are returned by
    ``Series.unique()``.
    """
    distinct_values = dataset[column].unique()
    codes = {}
    for position, value in enumerate(distinct_values):
        codes[value] = position
    return codes

for col in cat_c:
    # Build ONE mapping per column and apply it to both frames. Separate,
    # order-of-appearance mappings for train and test would encode the same
    # category as different integers in the two frames, so the model would
    # see inconsistent features at prediction time.
    mapping = create_mapping(col, train)
    # Categories that appear only in test get fresh codes after the train ones.
    for value in test[col].unique():
        mapping.setdefault(value, len(mapping))

    # Cast to object first so the lookup works on plain values rather than on
    # the categorical's category index.
    train[col] = train[col].astype(object).map(mapping).astype(int)
    test[col] = test[col].astype(object).map(mapping).astype(int)

def quadratic_weighted_kappa(y_true, y_pred):
    """Return Cohen's kappa between ``y_true`` and ``y_pred`` with quadratic weights."""
    kappa = cohen_kappa_score(y_true, y_pred, weights='quadratic')
    return kappa

def threshold_Rounder(oof_non_rounded, thresholds):
    """Bucket continuous predictions into the labels 0-3 using three cut points.

    A value gets the label of the first condition it satisfies, checked in
    order: ``< thresholds[0]`` -> 0, ``< thresholds[1]`` -> 1,
    ``< thresholds[2]`` -> 2; anything else -> 3.
    """
    first_cut, second_cut, third_cut = thresholds[0], thresholds[1], thresholds[2]
    conditions = [
        oof_non_rounded < first_cut,
        oof_non_rounded < second_cut,
        oof_non_rounded < third_cut,
    ]
    # np.select picks the first matching condition, mirroring a nested np.where.
    return np.select(conditions, [0, 1, 2], default=3)

def evaluate_predictions(thresholds, y_true, oof_non_rounded):
    """Objective for threshold search: negative quadratic weighted kappa.

    The continuous predictions are bucketed with ``thresholds`` and scored
    against ``y_true``; the score is negated so that a minimiser maximises it.
    """
    bucketed = threshold_Rounder(oof_non_rounded, thresholds)
    score = quadratic_weighted_kappa(y_true, bucketed)
    return -score

In [None]:
def TrainML(model_class, test_data):
    """Cross-validate a regressor on ``train`` and build a submission frame.

    Uses the module-level ``train`` frame (target column ``sii``), ``n_splits``,
    ``SEED`` and ``sample``. For each stratified fold a clone of ``model_class``
    is fitted, its out-of-fold predictions are stored and it predicts
    ``test_data``. Cut points for turning continuous predictions into labels
    are then tuned with Nelder-Mead on the out-of-fold predictions and applied
    to the fold-averaged test predictions.

    Args:
        model_class: Unfitted estimator; it is cloned for every fold.
        test_data: Feature frame to predict for the submission.

    Returns:
        pd.DataFrame with columns ``id`` (from ``sample``) and ``sii``.

    Raises:
        AssertionError: If the threshold optimisation reports failure.
    """
    target = train['sii']
    features = train.drop(['sii'], axis=1)

    splitter = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=SEED)

    fold_train_scores, fold_val_scores = [], []
    oof_non_rounded = np.zeros(len(target), dtype=float)
    # One column of test predictions per fold; averaged after the loop.
    test_preds = np.zeros((len(test_data), n_splits))

    fold_iterator = tqdm(splitter.split(features, target), desc="Training Folds", total=n_splits)
    for fold, (fit_rows, val_rows) in enumerate(fold_iterator):
        X_fit, y_fit = features.iloc[fit_rows], target.iloc[fit_rows]
        X_val, y_val = features.iloc[val_rows], target.iloc[val_rows]

        fold_model = clone(model_class)
        fold_model.fit(X_fit, y_fit)

        fit_pred = fold_model.predict(X_fit)
        val_pred = fold_model.predict(X_val)
        oof_non_rounded[val_rows] = val_pred

        # Per-fold scores use plain rounding; tuned thresholds come later.
        train_kappa = quadratic_weighted_kappa(y_fit, fit_pred.round(0).astype(int))
        val_kappa = quadratic_weighted_kappa(y_val, val_pred.round(0).astype(int))
        fold_train_scores.append(train_kappa)
        fold_val_scores.append(val_kappa)

        test_preds[:, fold] = fold_model.predict(test_data)

        print(f"Fold {fold+1} - Train QWK: {train_kappa:.4f}, Validation QWK: {val_kappa:.4f}")
        clear_output(wait=True)

    print(f"Mean Train QWK --> {np.mean(fold_train_scores):.4f}")
    print(f"Mean Validation QWK ---> {np.mean(fold_val_scores):.4f}")

    # Tune the three cut points on the out-of-fold predictions.
    KappaOPtimizer = minimize(
        evaluate_predictions,
        x0=[0.5, 1.5, 2.5],
        args=(target, oof_non_rounded),
        method='Nelder-Mead',
    )
    assert KappaOPtimizer.success, "Optimization did not converge."
    tuned_thresholds = KappaOPtimizer.x

    oof_tuned = threshold_Rounder(oof_non_rounded, tuned_thresholds)
    tKappa = quadratic_weighted_kappa(target, oof_tuned)

    print(f"----> || Optimized QWK SCORE :: {Fore.CYAN}{Style.BRIGHT} {tKappa:.3f}{Style.RESET_ALL}")

    # Average the per-fold test predictions, then bucket with the tuned cut points.
    mean_test_pred = test_preds.mean(axis=1)
    tuned_test_labels = threshold_Rounder(mean_test_pred, tuned_thresholds)

    return pd.DataFrame({
        'id': sample['id'],
        'sii': tuned_test_labels
    })

In [None]:
# LightGBM hyper-parameters
Params = dict(
    learning_rate=0.046,
    max_depth=12,
    num_leaves=478,
    min_data_in_leaf=13,
    feature_fraction=0.893,
    bagging_fraction=0.784,
    bagging_freq=4,
    lambda_l1=10,  # Increased from 6.59
    lambda_l2=0.01,  # Increased from 2.68e-06
)

# XGBoost hyper-parameters
XGB_Params = dict(
    learning_rate=0.05,
    max_depth=6,
    n_estimators=200,
    subsample=0.8,
    colsample_bytree=0.8,
    reg_alpha=1,  # Increased from 0.1
    reg_lambda=5,  # Increased from 1
    random_state=SEED,
    tree_method='exact',
)

# CatBoost hyper-parameters
CatBoost_Params = dict(
    learning_rate=0.05,
    depth=6,
    iterations=200,
    random_seed=SEED,
    verbose=0,
    l2_leaf_reg=10,  # Increase this value
)

# Instantiate the three base regressors
Light = LGBMRegressor(**Params, random_state=SEED, verbose=-1, n_estimators=300)
XGB_Model = XGBRegressor(**XGB_Params)
CatBoost_Model = CatBoostRegressor(**CatBoost_Params)

# Ensemble the base regressors with a VotingRegressor
base_estimators = [
    ('lightgbm', Light),
    ('xgboost', XGB_Model),
    ('catboost', CatBoost_Model),
]
voting_model = VotingRegressor(estimators=base_estimators)

In [None]:
# Cross-validate the voting ensemble and predict the test set.
Submission1 = TrainML(model_class=voting_model, test_data=test)

# Write the submission file without the DataFrame index.
Submission1.to_csv('submission.csv', index=False)