In [None]:
import numpy as np
import pandas as pd
import os
import re
from tqdm import tqdm
import polars as pl
import polars.selectors as cs
import matplotlib.pyplot as plt
from matplotlib.ticker import MaxNLocator, FormatStrFormatter, PercentFormatter
import seaborn as sns
import plotly.subplots as sp
import plotly.express as px
from concurrent.futures import ThreadPoolExecutor
from colorama import Fore, Style
from IPython.display import clear_output
from IPython.display import display
import warnings
warnings.filterwarnings('ignore')
pd.options.display.max_columns = None

In [None]:
from sklearn.base import clone, BaseEstimator, RegressorMixin
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.ensemble import StackingRegressor
from sklearn.linear_model import Ridge
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, f1_score, roc_curve, auc
from sklearn.metrics import cohen_kappa_score
from sklearn.model_selection import StratifiedKFold
from scipy.optimize import minimize
from sklearn.ensemble import VotingRegressor, RandomForestRegressor, GradientBoostingRegressor, HistGradientBoostingRegressor, ExtraTreesRegressor
from sklearn.impute import SimpleImputer, KNNImputer
from sklearn.pipeline import Pipeline
from lightgbm import LGBMRegressor
from xgboost import XGBRegressor
from catboost import CatBoostRegressor
from sklearn.model_selection import GridSearchCV

In [None]:
from pytorch_tabnet.tab_model import TabNetRegressor
from pytorch_tabnet.callbacks import Callback

from keras.models import Model
from keras.layers import Input, Dense
from keras.optimizers import Adam

import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
import pytorch_tabnet

In [None]:
# Load the competition's train and test tables from the Kaggle input directory.
train_csv_path = '/kaggle/input/child-mind-institute-problematic-internet-use/train.csv'
test_csv_path = '/kaggle/input/child-mind-institute-problematic-internet-use/test.csv'
train = pd.read_csv(train_csv_path)
test = pd.read_csv(test_csv_path)

In [None]:
# Rows that carry a total for BOTH PAQ questionnaires are ambiguous once the
# two columns are merged, so they are identified here and removed from train.
has_both_paq_totals = train['PAQ_A-PAQ_A_Total'].notna() & train['PAQ_C-PAQ_C_Total'].notna()
conflict_rows = train[has_both_paq_totals]

# Report how many rows are affected
print(f"Number of conflict rows: {len(conflict_rows)}")

# Only drop when at least one conflicting row exists
if len(conflict_rows) > 0:
    train = train.drop(index=conflict_rows.index)

In [None]:
train

In [None]:
def _merge_paq_columns(df):
    """Collapse the PAQ_A / PAQ_C column pairs of ``df`` into a single PAQ pair.

    PAQ_A values are kept where present and gaps are filled from the PAQ_C
    columns (this assignment happens on ``df`` itself). The PAQ_C columns are
    then dropped and the PAQ_A columns renamed to 'PAQ-Season' and
    'PAQ-PAQ_Total'. Returns the resulting frame.
    """
    df['PAQ_A-PAQ_A_Total'] = df['PAQ_A-PAQ_A_Total'].fillna(df['PAQ_C-PAQ_C_Total'])
    df['PAQ_A-Season'] = df['PAQ_A-Season'].fillna(df['PAQ_C-Season'])

    # Drop the PAQ_C columns now that their values have been folded into PAQ_A
    without_paq_c = df.drop(columns=['PAQ_C-PAQ_C_Total', 'PAQ_C-Season'])

    return without_paq_c.rename(columns={
        'PAQ_A-Season': 'PAQ-Season',
        'PAQ_A-PAQ_A_Total': 'PAQ-PAQ_Total',
    })


train = _merge_paq_columns(train)
test = _merge_paq_columns(test)

In [None]:
train

In [None]:
def process_file(filename, dirname):
    """Summarise one time-series parquet partition.

    Reads ``<dirname>/<filename>/part-0.parquet``, discards the 'step' column
    and flattens the ``describe()`` table of the remaining columns into a 1-D
    array.

    Returns a tuple ``(stats, series_id)`` where ``series_id`` is the text
    after the first '=' in ``filename``.
    """
    parquet_path = os.path.join(dirname, filename, 'part-0.parquet')
    series = pd.read_parquet(parquet_path).drop(columns='step')
    flat_stats = series.describe().values.reshape(-1)
    series_id = filename.split('=')[1]
    return flat_stats, series_id

def load_time_series(dirname) -> pd.DataFrame:
    """Build a per-id table of summary statistics from a time-series directory.

    Every entry of ``dirname`` is passed to ``process_file`` on a thread pool
    (with a tqdm progress bar). The flattened statistics become columns
    ``Stat_0 .. Stat_k`` and the extracted identifiers become the ``id`` column.
    """
    entries = os.listdir(dirname)

    with ThreadPoolExecutor() as pool:
        # One (filename, dirname) pair per entry; same call as process_file(fname, dirname)
        pending = pool.map(process_file, entries, [dirname] * len(entries))
        results = list(tqdm(pending, total=len(entries)))

    stats, indexes = zip(*results)

    stat_names = [f"Stat_{i}" for i in range(len(stats[0]))]
    df = pd.DataFrame(stats, columns=stat_names)
    df['id'] = indexes
    return df

In [None]:
train

In [None]:
# Sample submission (provides the 'id' column used for the final submission frame)
sample_csv_path = '/kaggle/input/child-mind-institute-problematic-internet-use/sample_submission.csv'
sample = pd.read_csv(sample_csv_path)

# Per-id summary statistics of the time-series parquet partitions
train_series_dir = "/kaggle/input/child-mind-institute-problematic-internet-use/series_train.parquet"
test_series_dir = "/kaggle/input/child-mind-institute-problematic-internet-use/series_test.parquet"
train_ts = load_time_series(train_series_dir)
test_ts = load_time_series(test_series_dir)

In [None]:
# Names of the time-series statistic columns: every train_ts column except the join key
time_series_cols = list(train_ts.columns)
time_series_cols.remove("id")

In [None]:
# Columns kept from the merged training table. The last entry,
# 'PCIAT-PCIAT_Total', is the regression target; the time-series statistic
# columns are appended to this list in a later cell.
featuresCols = ['Basic_Demos-Enroll_Season', 'Basic_Demos-Age', 'Basic_Demos-Sex',
                'CGAS-Season', 'CGAS-CGAS_Score', 'Physical-Season', 'Physical-BMI',
                'Physical-Height', 'Physical-Weight', 'Physical-Waist_Circumference',
                'Physical-Diastolic_BP', 'Physical-HeartRate', 'Physical-Systolic_BP',
                'Fitness_Endurance-Season', 'Fitness_Endurance-Max_Stage',
                'Fitness_Endurance-Time_Mins', 'Fitness_Endurance-Time_Sec',
                'FGC-Season', 'FGC-FGC_CU', 'FGC-FGC_CU_Zone', 'FGC-FGC_GSND',
                'FGC-FGC_GSND_Zone', 'FGC-FGC_GSD', 'FGC-FGC_GSD_Zone', 'FGC-FGC_PU',
                'FGC-FGC_PU_Zone', 'FGC-FGC_SRL', 'FGC-FGC_SRL_Zone', 'FGC-FGC_SRR',
                'FGC-FGC_SRR_Zone', 'FGC-FGC_TL', 'FGC-FGC_TL_Zone', 'BIA-Season',
                'BIA-BIA_Activity_Level_num', 'BIA-BIA_BMC', 'BIA-BIA_BMI',
                'BIA-BIA_BMR', 'BIA-BIA_DEE', 'BIA-BIA_ECW', 'BIA-BIA_FFM',
                'BIA-BIA_FFMI', 'BIA-BIA_FMI', 'BIA-BIA_Fat', 'BIA-BIA_Frame_num',
                'BIA-BIA_ICW', 'BIA-BIA_LDM', 'BIA-BIA_LST', 'BIA-BIA_SMM',
                'BIA-BIA_TBW', 'PAQ-Season',
                'PAQ-PAQ_Total',
                'SDS-Season', 'SDS-SDS_Total_Raw',
                'SDS-SDS_Total_T', 'PreInt_EduHx-Season',
                'PreInt_EduHx-computerinternet_hoursday', 'PCIAT-PCIAT_Total'] 
                # The target is the raw 'PCIAT-PCIAT_Total' score (previously 'sii');
                # predictions are converted to sii labels after modelling.

# Season columns; later cells fill their gaps with 'Missing' and encode them as integers.
SEASON_COL = ['Basic_Demos-Enroll_Season', 'CGAS-Season', 'Physical-Season', 
          'Fitness_Endurance-Season', 'FGC-Season', 'BIA-Season', 
          'PAQ-Season', 'SDS-Season', 'PreInt_EduHx-Season']

In [None]:
train

In [None]:
test

In [None]:
# Attach the time-series statistics to each participant (left join keeps
# participants without a series), then discard the join key.
train = train.merge(train_ts, how="left", on='id').drop(columns='id')
test = test.merge(test_ts, how="left", on='id').drop(columns='id')

# The statistic columns become model features alongside the tabular ones
featuresCols.extend(time_series_cols)

# Keep only the selected columns and the rows whose target
# 'PCIAT-PCIAT_Total' is present.
train = train[featuresCols].dropna(subset='PCIAT-PCIAT_Total')

In [None]:
train

In [None]:
test.shape

In [None]:
train.shape

In [None]:
def update(df):
    """Fill gaps in the season columns with 'Missing' and cast them to category.

    The columns processed are those listed in the module-level ``SEASON_COL``.
    ``df`` is modified through column assignment and also returned.
    """
    for season_col in SEASON_COL:
        df[season_col] = df[season_col].fillna('Missing').astype('category')
    return df

train = update(train)
test = update(test)

In [None]:
train

In [None]:
# Integer code for each season name; 'Missing' (the fill value for gaps) gets its own code.
season_mapping = {
    'Spring': 0,
    'Summer': 1,
    'Fall': 2,
    'Winter': 3,
    'Missing': 4,
}

# Apply the same encoding to every season column of both frames
for frame in (train, test):
    for col in SEASON_COL:
        frame[col] = frame[col].map(season_mapping)


In [None]:
train

## Fine-Tune LightGBM

In [None]:
# import optuna
# from lightgbm import LGBMRegressor
# from sklearn.base import clone
# from sklearn.model_selection import StratifiedKFold

# X = train.drop(['PCIAT-PCIAT_Total'], axis=1) # change 'sii' to 'PCIAT-PCIAT_Total'
# y = train['PCIAT-PCIAT_Total'] # change 'sii' to 'PCIAT-PCIAT_Total'

# # Optuna Objective Function
# def optuna_objective(trial):
#     # Define the hyperparameter search space
#     params = {
#         'n_estimators': trial.suggest_int('n_estimators', 100, 1000),
#         'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.3),
#         'max_depth': trial.suggest_int('max_depth', 3, 15),
#         'num_leaves': trial.suggest_int('num_leaves', 20, 100),
#         'colsample_bytree': trial.suggest_float('colsample_bytree', 0.5, 1.0),
#         'subsample': trial.suggest_float('subsample', 0.5, 1.0),
#         'reg_alpha': trial.suggest_float('reg_alpha', 0.0, 10.0),
#         'reg_lambda': trial.suggest_float('reg_lambda', 0.0, 10.0),
#         'min_child_weight': trial.suggest_float('min_child_weight', 0.01, 10.0)
#     }

#     # Model Initialization
#     model = LGBMRegressor(random_state=42, force_col_wise=True, **params)
#     oof_preds = np.zeros(len(y))  # Out-of-fold predictions

#     # Stratified K-Fold Cross-Validation
#     SKF = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
#     for train_idx, val_idx in SKF.split(X, y):
#         X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
#         y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]

#         model.fit(X_train, y_train)
#         oof_preds[val_idx] = model.predict(X_val)

#     # Convert predictions to sii labels
#     oof_rounded = np.vectorize(pciat_to_sii)(oof_preds)
#     true_rounded = np.vectorize(pciat_to_sii)(y)

#     # Return QWK as-is; the study below is created with direction='maximize'
#     return quadratic_weighted_kappa(true_rounded, oof_rounded)

# # Run Optuna Optimization
# study = optuna.create_study(direction='maximize')
# study.optimize(optuna_objective, n_trials=50)  # Number of trials for optimization

# # Print Best Parameters and Score
# print(f"Best Parameters: {study.best_params}")
# print(f"Best QWK Score: {study.best_value}")

# # Use Best Parameters in Final Model
# best_params = study.best_params
# optimized_lgb_model = LGBMRegressor(random_state=42, force_col_wise=True, **best_params)

## Ignore Warning

In [None]:
# Silence all warnings for the remaining cells. This repeats the import and
# filter already applied in the first import cell of the notebook.
import warnings
warnings.filterwarnings('ignore')

## Fine-Tune XGBoost

In [None]:
# # Import required libraries
# import optuna
# from xgboost import XGBRegressor
# from sklearn.model_selection import StratifiedKFold

# # Prepare your data
# X = train.drop(['PCIAT-PCIAT_Total'], axis=1)  # Features
# y = train['PCIAT-PCIAT_Total']  # Target

# # Optuna Objective Function for XGBoost
# def optuna_objective(trial):
#     # Define the hyperparameter search space
#     params = {
#         'n_estimators': trial.suggest_int('n_estimators', 100, 1000),
#         'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.3),
#         'max_depth': trial.suggest_int('max_depth', 3, 15),
#         'subsample': trial.suggest_float('subsample', 0.5, 1.0),
#         'colsample_bytree': trial.suggest_float('colsample_bytree', 0.5, 1.0),
#         'reg_alpha': trial.suggest_float('reg_alpha', 0.0, 10.0),
#         'reg_lambda': trial.suggest_float('reg_lambda', 0.0, 10.0),
#         'min_child_weight': trial.suggest_float('min_child_weight', 0.01, 10.0),
#         'gamma': trial.suggest_float('gamma', 0.0, 5.0)  # Additional XGBoost parameter
#     }

#     # Initialize the XGBoost model
#     model = XGBRegressor(
#         random_state=42, 
#         enable_categorical=True,  # Enable categorical support
#         **params)
#     oof_preds = np.zeros(len(y))  # Out-of-fold predictions

#     # Stratified K-Fold Cross-Validation
#     SKF = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
#     for train_idx, val_idx in SKF.split(X, y):
#         X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
#         y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]

#         model.fit(X_train, y_train, eval_set=[(X_val, y_val)], eval_metric='rmse', verbose=False, early_stopping_rounds=50)
#         oof_preds[val_idx] = model.predict(X_val)

#     # Convert predictions to sii labels
#     oof_rounded = np.vectorize(pciat_to_sii)(oof_preds)
#     true_rounded = np.vectorize(pciat_to_sii)(y)

#     # Return QWK as-is; the study below is created with direction='maximize'
#     return quadratic_weighted_kappa(true_rounded, oof_rounded)

# # Run Optuna Optimization
# study = optuna.create_study(direction='maximize')
# study.optimize(optuna_objective, n_trials=50)  # Number of trials for optimization

# # Print Best Parameters and Score
# print(f"Best Parameters for XGBoost: {study.best_params}")
# print(f"Best QWK Score for XGBoost: {study.best_value}")

# # Use Best Parameters in Final Model
# best_params = study.best_params
# optimized_xgb_model = XGBRegressor(random_state=42, **best_params)

## Fine-Tune CatBoost

In [None]:
# # Import required libraries
# import optuna
# from catboost import CatBoostRegressor
# from sklearn.model_selection import StratifiedKFold

# # Prepare your data
# X = train.drop(['PCIAT-PCIAT_Total'], axis=1)  # Features
# y = train['PCIAT-PCIAT_Total']  # Target

# # Optuna Objective Function for CatBoost
# def optuna_objective(trial):
#     # Define the hyperparameter search space
#     params = {
#         'iterations': trial.suggest_int('iterations', 100, 1000),
#         'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.3),
#         'depth': trial.suggest_int('depth', 3, 12),
#         'l2_leaf_reg': trial.suggest_float('l2_leaf_reg', 1.0, 10.0),
#         'bagging_temperature': trial.suggest_float('bagging_temperature', 0.0, 1.0),
#         'random_strength': trial.suggest_float('random_strength', 0.0, 10.0),
#         'border_count': trial.suggest_int('border_count', 32, 255),  # Number of splits used to quantize numerical features
#         'grow_policy': trial.suggest_categorical('grow_policy', ['SymmetricTree', 'Depthwise', 'Lossguide']),
#     }

#     # Initialize the CatBoost model
#     model = CatBoostRegressor(
#         random_state=42,
#         silent=True,  # Suppress verbose output
#         cat_features=X.select_dtypes(include=['category']).columns.tolist(),  # Identify categorical features
#         **params
#     )
#     oof_preds = np.zeros(len(y))  # Out-of-fold predictions

#     # Stratified K-Fold Cross-Validation
#     SKF = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
#     for train_idx, val_idx in SKF.split(X, y):
#         X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
#         y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]

#         model.fit(X_train, y_train, eval_set=[(X_val, y_val)], early_stopping_rounds=50)
#         oof_preds[val_idx] = model.predict(X_val)

#     # Convert predictions to sii labels
#     oof_rounded = np.vectorize(pciat_to_sii)(oof_preds)
#     true_rounded = np.vectorize(pciat_to_sii)(y)

#     # Return QWK as-is; the study below is created with direction='maximize'
#     return quadratic_weighted_kappa(true_rounded, oof_rounded)

# # Run Optuna Optimization
# study = optuna.create_study(direction='maximize')
# study.optimize(optuna_objective, n_trials=50)  # Number of trials for optimization

# # Print Best Parameters and Score
# print(f"Best Parameters for CatBoost: {study.best_params}")
# print(f"Best QWK Score for CatBoost: {study.best_value}")

# # Use Best Parameters in Final Model
# best_params = study.best_params
# optimized_cat_model = CatBoostRegressor(random_state=42, silent=True, **best_params)

## Train ML

In [None]:
# Number of cross-validation folds used by TrainML
n_splits = 5

def pciat_to_sii(pciat_score):
    """
    Map a single PCIAT-PCIAT_Total score to its sii label.

    Scores up to 30 give 0, up to 49 give 1, up to 79 give 2 and anything
    else gives 3 (upper bounds are inclusive).
    """
    inclusive_upper_bounds = (30, 49, 79)
    for label, upper in enumerate(inclusive_upper_bounds):
        if pciat_score <= upper:
            return label
    return 3

def quadratic_weighted_kappa(y_true, y_pred):
    """Return Cohen's kappa between ``y_true`` and ``y_pred`` using quadratic weights."""
    kappa = cohen_kappa_score(y_true, y_pred, weights='quadratic')
    return kappa

def threshold_Rounder(oof_non_rounded, thresholds):
    """
    Convert continuous PCIAT-PCIAT_Total predictions into sii labels using cut points.

    The three thresholds are tested in order with a strict ``<``: the first
    one a prediction falls below decides its label (0, 1 or 2); predictions
    below none of them get label 3.
    """
    first_cut, second_cut, third_cut = thresholds[0], thresholds[1], thresholds[2]
    below_cut = [
        oof_non_rounded < first_cut,
        oof_non_rounded < second_cut,
        oof_non_rounded < third_cut,
    ]
    # np.select takes the first matching condition, like the nested-where form
    return np.select(below_cut, [0, 1, 2], default=3)

def evaluate_predictions(thresholds, y_true, oof_non_rounded):
    """
    Objective for the threshold search: negated QWK of thresholded predictions.

    The continuous predictions are turned into sii labels with
    ``threshold_Rounder`` and scored against ``y_true``; the sign is flipped so
    that a minimiser maximises QWK.
    """
    predicted_labels = threshold_Rounder(oof_non_rounded, thresholds)
    kappa = quadratic_weighted_kappa(y_true, predicted_labels)
    return -kappa

def TrainML(model_class, test_data):
    """Cross-validate a regressor on PCIAT-PCIAT_Total and predict sii labels for the test set.

    Reads the module-level ``train`` frame and ``n_splits``. For each fold a
    fresh clone of ``model_class`` is fitted, out-of-fold predictions are
    collected, and the test set is predicted. Thresholds that convert the
    continuous out-of-fold predictions into sii labels are then tuned to
    maximise quadratic weighted kappa (QWK) and applied to the fold-averaged
    test predictions.

    Parameters
    ----------
    model_class : estimator
        Unfitted scikit-learn compatible regressor; it is cloned per fold.
    test_data : pandas.DataFrame or array-like
        Rows to predict. A DataFrame is re-ordered/restricted to the training
        feature columns before prediction.

    Returns
    -------
    numpy.ndarray
        One sii label (0-3) per row of ``test_data``.

    Notes
    -----
    * Train and validation QWK are both computed by rounding predictions to
      the nearest integer score before mapping them with ``pciat_to_sii``, so
      the two figures are comparable.
    * If none of the optimisers reports success, the initial thresholds
      ``[30, 49, 79]`` are used instead of failing.
    """
    initial_thresholds = [30, 49, 79]
    # otypes makes the vectorised mapper well-defined even for empty input
    to_sii = np.vectorize(pciat_to_sii, otypes=[int])

    X = train.drop(['PCIAT-PCIAT_Total'], axis=1)
    y = train['PCIAT-PCIAT_Total']

    # Guard against column-order / extra-column mismatches between train and test
    if isinstance(test_data, pd.DataFrame):
        test_data = test_data[X.columns]

    # NOTE(review): folds are stratified on the raw PCIAT total rather than on
    # the derived sii label — confirm this is intended.
    SKF = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)

    train_S = []
    test_S = []

    oof_non_rounded = np.zeros(len(y), dtype=float)
    test_preds = np.zeros((len(test_data), n_splits))

    print("Starting Cross-Validation...")
    for fold, (train_idx, test_idx) in enumerate(tqdm(SKF.split(X, y), desc="Training Folds", total=n_splits)):
        X_train, X_val = X.iloc[train_idx], X.iloc[test_idx]
        y_train, y_val = y.iloc[train_idx], y.iloc[test_idx]

        model = clone(model_class)
        model.fit(X_train, y_train)

        y_train_pred = model.predict(X_train)
        y_val_pred = model.predict(X_val)

        # Raw (continuous) out-of-fold predictions feed the threshold search below
        oof_non_rounded[test_idx] = y_val_pred

        # Same score -> sii mapping for both splits so the metrics are comparable
        train_kappa = quadratic_weighted_kappa(
            to_sii(y_train),
            to_sii(np.round(y_train_pred))
        )
        val_kappa = quadratic_weighted_kappa(
            to_sii(y_val),
            to_sii(np.round(y_val_pred))
        )

        train_S.append(train_kappa)
        test_S.append(val_kappa)

        test_preds[:, fold] = model.predict(test_data)

        print(f"Fold {fold+1} - Train QWK: {train_kappa:.4f}, Validation QWK: {val_kappa:.4f}")
        clear_output(wait=True)

    # Log fold-specific and overall QWK statistics
    print("\nFold-wise QWK Scores:")
    for fold_idx, (train_qwk, val_qwk) in enumerate(zip(train_S, test_S), start=1):
        print(f"Fold {fold_idx}: Train QWK = {train_qwk:.4f}, Validation QWK = {val_qwk:.4f}")

    print(f"\nMean Train QWK --> {np.mean(train_S):.4f} ± {np.std(train_S):.4f}")
    print(f"Mean Validation QWK ---> {np.mean(test_S):.4f} ± {np.std(test_S):.4f}")

    # True sii labels used as the reference for threshold optimisation
    sii_labels = to_sii(y)

    # Try several optimisers and keep the thresholds with the best QWK
    methods = ['Nelder-Mead', 'Powell', 'TNC']
    best_thresholds = None
    best_score = float('-inf')

    for method in methods:
        result = minimize(evaluate_predictions,
                          x0=initial_thresholds,
                          args=(sii_labels, oof_non_rounded),
                          method=method)
        if result.success:
            score = -result.fun  # objective is negated QWK
            if score > best_score:
                best_score = score
                best_thresholds = result.x

    if best_thresholds is None:
        # No optimiser converged: fall back to the untuned cut points rather
        # than passing None to threshold_Rounder.
        best_thresholds = np.asarray(initial_thresholds, dtype=float)
        best_score = -evaluate_predictions(best_thresholds, sii_labels, oof_non_rounded)

    print(f"\nBest thresholds: {best_thresholds}")
    print(f"Best QWK score after optimization: {best_score:.4f}")

    oof_tuned = threshold_Rounder(oof_non_rounded, best_thresholds)
    tKappa = quadratic_weighted_kappa(sii_labels, oof_tuned)

    print(f"----> || Optimized QWK SCORE :: {Fore.CYAN}{Style.BRIGHT} {tKappa:.3f}{Style.RESET_ALL}")

    # Average the per-fold test predictions, then apply the tuned thresholds
    tpm = test_preds.mean(axis=1)
    tp_rounded = threshold_Rounder(tpm, best_thresholds)

    return tp_rounded


In [None]:
# Median imputation step placed in front of every base regressor
imputer = SimpleImputer(strategy='median')

# LightGBM with hyper-parameters taken from an earlier tuning run
lgb_params = dict(
    n_estimators=107,
    learning_rate=0.1370718043598246,
    max_depth=4,
    num_leaves=68,
    colsample_bytree=0.7238253848440545,
    subsample=0.5543321896833934,
    reg_alpha=7.854917349399392,
    reg_lambda=8.769761580128085,
    min_child_weight=1.105089752114477,
    random_state=42,
    force_col_wise=True,
    verbose=-1,
)
optimized_lgb_model = LGBMRegressor(**lgb_params)

# Tuned XGBoost parameters kept for reference (currently unused; the stack
# below uses a default XGBRegressor):
# optimized_xgb_model = XGBRegressor(
#     n_estimators=541,
#     learning_rate=0.03143204336162494,
#     max_depth=6,
#     subsample=0.8414548026219018,
#     colsample_bytree=0.8774468113199907,
#     reg_alpha=0.9106427018084773,
#     reg_lambda=5.725763474248076,
#     min_child_weight=3.6121329951143517,
#     gamma=4.644504781492626,
#     random_state=42
# )


def _impute_then(regressor):
    """Wrap ``regressor`` in a pipeline that runs the shared median imputer first."""
    return Pipeline(steps=[('imputer', imputer), ('regressor', regressor)])


# Base learners of the stack, each preceded by the imputer
base_learners = [
    ('lgb', _impute_then(optimized_lgb_model)),
    ('xgb', _impute_then(XGBRegressor(random_state=42))),
    ('cat', _impute_then(CatBoostRegressor(random_state=42, silent=True))),
    ('rf', _impute_then(RandomForestRegressor(random_state=42))),
    ('gb', _impute_then(GradientBoostingRegressor(random_state=42))),
]
ensemble = StackingRegressor(estimators=base_learners)

# Cross-validate, tune thresholds and predict sii labels for the test rows
predicted_sii = TrainML(ensemble, test)
Submission = pd.DataFrame({
    'id': sample['id'],
    'sii': predicted_sii
})

Submission

## Submission

In [None]:
# Write the submission frame to 'submission.csv' without the DataFrame index column
Submission.to_csv('submission.csv', index=False)