In [6]:
#import basic libraries
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt, matplotlib.gridspec as gridspe
import seaborn as sns
import polars as pl

import lightgbm as lgb
import xgboost as xgb
import catboost as cb
from sklearn.model_selection import StratifiedKFold
from sklearn.ensemble import VotingClassifier
from sklearn.base import clone
from sklearn.metrics import cohen_kappa_score
from sklearn.model_selection import train_test_split
from tqdm import tqdm
from scipy.optimize import minimize

# I. Exploratory Data Analysis (EDA) & Data Preprocessing

## 1. Data Loading

In [7]:
# Root folder of the competition dataset, defined once so all loaders share it.
DATA_DIR = "/kaggle/input/child-mind-institute-problematic-internet-use"

# Data dictionary plus the tabular train / test splits.
dic = pd.read_csv(f"{DATA_DIR}/data_dictionary.csv")
train_df = pd.read_csv(f"{DATA_DIR}/train.csv")
test_df = pd.read_csv(f"{DATA_DIR}/test.csv")

##  2. Data Preview

### 2.1. Preview Data_dictionary
The HBN dataset comprises 11 instruments. Each instrument is represented by the fields listed below. In total, 80 fields are collected (excluding id).

In [None]:
# Preview data_dictionary.
# Lift both pandas display limits (number of columns, cell text width) for the
# rest of the session so wide frames and long descriptions are shown in full.
for display_option in ('display.max_columns', 'display.max_colwidth'):
    pd.set_option(display_option, None)

dic.head()

In [None]:
# Summarise data_dictionary: one row per instrument (in file order) with a
# comma-separated list of its fields and the number of fields.
fields_by_instrument = dic.groupby('Instrument', sort=False)['Field']
grouped = fields_by_instrument.apply(lambda fields: ', '.join(fields)).reset_index()
grouped['NumFields'] = fields_by_instrument.size().values

# Left-align both the cells and the header row of the rendered table.
header_style = [{'selector': 'th', 'props': [('text-align', 'left')]}]
grouped = (
    grouped.style
    .set_properties(subset=['Instrument', 'Field', 'NumFields'], **{'text-align': 'left'})
    .set_table_styles(header_style)
)

grouped

### 2.2. Preview Train & Test data

In [None]:
# Training data: column names, dtypes and non-null counts
train_df.info()

`sii` has 2736 non-null values, so about 30% of the rows are missing the target.
Dropping that share is acceptable, so only the rows with a valid `sii` are used to train the supervised model.

In [None]:
# Keep only the rows that carry a target value. The explicit copy makes this
# frame independent of train_df, so the column assignments done on it later
# never target a derived slice.
usable_train_df = train_df.dropna(subset=['sii']).copy()
usable_train_df.info()

In [None]:
# Percentage of missing values per column among the labelled training rows
n_labelled_rows = len(usable_train_df)
missing_percentage = (usable_train_df.isnull().sum() / n_labelled_rows) * 100

# Bar chart, most incomplete columns first
ax = missing_percentage.sort_values(ascending=False).plot(kind='bar', color='#A3C8FF', figsize=(24, 6))
ax.set_title('Percentage of Missing Data per Column in Training data')
ax.set_ylabel('Percentage (%)')
ax.set_xlabel('Columns');

In [None]:
# Testing data: column names, dtypes and non-null counts
test_df.info()
# 58 fields measured (excluding id)

In [None]:
# Percentage of missing values per column of the test set
missing_percentage_test = (test_df.isnull().sum() / len(test_df)) * 100

# Bar chart, most incomplete columns first
missing_percentage_test.sort_values(ascending=False).plot(kind='bar', color='orange', figsize=(24, 6))
plt.title('Percentage of Missing Data per Column in Testing data')
plt.ylabel('Percentage (%)')
plt.xlabel('Columns');  # the x axis lists column names, as in the training-data chart

In [None]:
# Columns that exist in the labelled training frame but not in the test frame
columns_train = set(usable_train_df.columns)
columns_test = set(test_df.columns)
difference = sorted(columns_train.difference(columns_test))
print(difference) # 23

In [None]:
# Actigraphy (time series) data
# Example: read the parquet partition stored under id=0417c91e and display it
actigraphy = pl.read_parquet('/kaggle/input/child-mind-institute-problematic-internet-use/series_train.parquet/id=0417c91e/part-0.parquet')
actigraphy

### 2.3. Features Classification

In [None]:
# Identify the type of fields (features to train)
def print_column_info(column_type, column_name):
    """Print how many data-dictionary fields have a given type, and their names.

    column_type is matched exactly against the 'Type' column of the global
    data dictionary ``dic``; column_name is only the heading used in the output.
    """
    matching_fields = dic.loc[dic['Type'] == column_type, 'Field']
    print(f"{column_name}: {len(matching_fields)}")
    print(matching_fields.values)
    print("\n")

# One report per dictionary type, each under a readable heading
for dictionary_type, heading in (
    ('float', 'Continuous'),
    ('int', 'Discrete'),
    ('str', 'Categorical'),
    ('categorical int', 'Categorical Int'),
):
    print_column_info(dictionary_type, heading)


In [None]:
# Rebuild the labelled training frame from train_df and compare its layout with
# the test frame. The explicit copy keeps later column assignments on this
# frame from targeting a derived slice of train_df.
usable_train_df = train_df.dropna(subset=['sii']).copy()
usable_train_df.info()
test_df.info()

### 2.4. Distribution of features

In [None]:
# Distribution of numerical features: summary statistics per column
usable_train_df.describe()

In [None]:
# Summary of the object-dtype columns only
usable_train_df.select_dtypes(include=["object"]).describe()

In [None]:
# sii distribution → sii is imbalanced, half of it is 0
# value_counts() orders rows by frequency, not by class. Sorting by the class
# value guarantees that bar i, colour i and legend label i all describe the
# same sii class (0=None, 1=Mild, 2=Moderate, 3=Severe).
sii_counts = usable_train_df['sii'].value_counts().sort_index()
colors = plt.cm.Blues(np.linspace(0.3, 1, len(sii_counts)))

sii_counts.plot(kind='bar', color=colors)
plt.title('"sii distribution"')
plt.xlabel('Values')
plt.ylabel('Frequency')
plt.xticks(rotation=0)

# Legend built from proxy rectangles, one per class colour
legend_labels = ['None', 'Mild', 'Moderate', 'Severe']
colors_for_legend = colors
plt.legend(handles=[plt.Rectangle((0, 0), 1, 1, color=color) for color in colors_for_legend],
           labels=legend_labels, title="Categories", loc='upper left', bbox_to_anchor=(1, 1));


### 3.1. Encode Season columns

In [None]:
from sklearn.preprocessing import LabelEncoder

encoder = LabelEncoder()

# Every "<instrument>-Season" column of the training data
columns_to_encode = [
    'Basic_Demos-Enroll_Season', 'CGAS-Season', 'Physical-Season',
    'Fitness_Endurance-Season', 'FGC-Season', 'BIA-Season', 'PAQ_A-Season',
    'PAQ_C-Season', 'PCIAT-Season', 'SDS-Season', 'PreInt_EduHx-Season'
]

# Work on an explicit copy so the assignments below never target a frame that
# pandas still regards as derived from train_df.
usable_train_df = usable_train_df.copy()

for col in columns_to_encode:
    # Replace the whole column instead of writing through `.loc[:, col]`:
    # recent pandas writes `.loc[:, col] = values` into the existing object
    # column, which would leave the integer codes stored with object dtype.
    # The encoder is refitted per column, so codes are independent per column.
    usable_train_df[col] = encoder.fit_transform(usable_train_df[col])

# NOTE(review): only the training frame is encoded here; test_df keeps its raw
# season values. Confirm that no season column reaches the model through test_df.

### 3.2. Encode Actigraphy data


## EDA

In [None]:
# Spread of the PCIAT total score within each sii class
pciat_total = usable_train_df['PCIAT-PCIAT_Total']

ax = sns.boxplot(data=usable_train_df, x='sii', y='PCIAT-PCIAT_Total')
ax.set_title('Correlation between PCIAT-PCIAT_Total và sii')
ax.set_xlabel('sii')
ax.set_ylabel('PCIAT-PCIAT_Total')
# Tick every 10 points from the smallest observed score upwards
ax.set_yticks(np.arange(pciat_total.min(), pciat_total.max(), 10));

The target ```sii``` is available exactly for those participants for whom we have results of the Parent-Child Internet Addiction Test (PCIAT), and it is a function of the PCIAT total score.
* 0-30 Normal (0)
* 31-49 Mild (1)
* 50-79 Moderate (2)
* 80-100 Severe (3)

Proof: https://digitalwellnesslab.org/wp-content/uploads/Scoring-Overview.pdf

In [None]:
# Check correlation between numerical features and PCIAT Total Scores

# First group of columns entering the correlation matrix (includes the PCIAT total itself)
measurement_cols = [
    'Basic_Demos-Age', 'Physical-BMI', 'Physical-Height', 'Physical-Weight',
    'FGC-FGC_GSND', 'FGC-FGC_GSD', 'FGC-FGC_SRL', 'FGC-FGC_SRR', 'BIA-BIA_BMC',
    'BIA-BIA_BMI', 'BIA-BIA_BMR', 'BIA-BIA_DEE', 'BIA-BIA_ECW', 'BIA-BIA_FFM',
    'BIA-BIA_FFMI', 'BIA-BIA_FMI', 'BIA-BIA_Fat', 'BIA-BIA_ICW', 'BIA-BIA_LDM',
    'BIA-BIA_LST', 'BIA-BIA_SMM', 'BIA-BIA_TBW', 'PAQ_A-PAQ_A_Total',
    'PAQ_C-PAQ_C_Total', 'CGAS-CGAS_Score', 'Physical-Waist_Circumference', 'Physical-Diastolic_BP',
    'Physical-HeartRate', 'Physical-Systolic_BP', 'Fitness_Endurance-Max_Stage',
    'Fitness_Endurance-Time_Mins', 'Fitness_Endurance-Time_Sec', 'FGC-FGC_CU',
    'FGC-FGC_PU', 'FGC-FGC_TL', 'PCIAT-PCIAT_Total', 'SDS-SDS_Total_Raw',
    'SDS-SDS_Total_T', 'PreInt_EduHx-computerinternet_hoursday',
]

# Second group: sex, fitness zones, BIA activity level and frame
coded_cols = [
    'Basic_Demos-Sex', 'FGC-FGC_CU_Zone', 'FGC-FGC_GSND_Zone',
    'FGC-FGC_GSD_Zone', 'FGC-FGC_PU_Zone', 'FGC-FGC_SRL_Zone',
    'FGC-FGC_SRR_Zone', 'FGC-FGC_TL_Zone', 'BIA-BIA_Activity_Level_num',
    'BIA-BIA_Frame_num',
]

# Third group: the season columns
season_cols = [
    'Basic_Demos-Enroll_Season', 'CGAS-Season', 'Physical-Season',
    'Fitness_Endurance-Season', 'FGC-Season', 'BIA-Season', 'PAQ_A-Season',
    'PAQ_C-Season', 'PCIAT-Season', 'SDS-Season', 'PreInt_EduHx-Season',
]

corr_matrix = usable_train_df[measurement_cols + coded_cols + season_cols].corr()

# Keep the columns whose correlation with PCIAT-PCIAT_Total is > 0.1 or < -0.1
sii_corr = corr_matrix['PCIAT-PCIAT_Total'].drop('PCIAT-PCIAT_Total')
filtered_corr = sii_corr[sii_corr.abs() > 0.1]

print(filtered_corr)

# Horizontal bars, weakest (most negative) correlation at the bottom
fig, ax = plt.subplots(figsize=(8, 6))
filtered_corr.sort_values().plot(kind='barh', color='#55B197', ax=ax)
ax.set_title('Features with Correlation > 0.1 or < -0.1 with PCIAT-PCIAT_Total')
ax.set_xlabel('Correlation coefficient')
ax.set_ylabel('Features')
fig.tight_layout()
plt.show()

# II. Feature Engineering

### 2.1 Feature selection

In [None]:
# Candidate features
featuresCols = ['Basic_Demos-Age', 'Physical-BMI', 'Physical-Height', 'Physical-Weight', 
         'FGC-FGC_GSND', 'FGC-FGC_GSD', 'BIA-BIA_BMI', 'BIA-BIA_FFMI', 'Physical-Waist_Circumference', 
         'Physical-Systolic_BP', 'FGC-FGC_CU', 'FGC-FGC_PU', 'FGC-FGC_TL', 
         'SDS-SDS_Total_Raw', 'SDS-SDS_Total_T', 'PreInt_EduHx-computerinternet_hoursday', 
         'FGC-FGC_SRL_Zone', 'FGC-FGC_SRR_Zone', 'BIA-BIA_Frame_num', 'Fitness_Endurance-Season', 'PAQ_A-Season']

# Columns with more than 50% missing values in the labelled training rows
columns_with_missing = missing_percentage[missing_percentage > 50].index
missCols = usable_train_df[columns_with_missing]

# Drop the sparse columns but keep the order of featuresCols. A set difference
# would yield an arbitrary, hash-dependent column order, so the feature matrix
# (and every model fitted on it) could change between kernel restarts.
sparse_columns = set(missCols.columns)
selectedFeatures = [col for col in featuresCols if col not in sparse_columns]

Selected_df = usable_train_df[selectedFeatures]
Selected_df.info() #final training df

### 2.2. Clean Data

In [None]:
# IterativeImputer is experimental in scikit-learn: the enabling import has to
# run before the estimator itself can be imported.
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer

imputer = IterativeImputer(max_iter=20, random_state=0)

# fit_transform returns a bare array. Rebuild the frame with the original
# column names AND the original row index, so the imputed features keep the
# same row labels as usable_train_df (and therefore as the target column).
Selected_df = pd.DataFrame(
    imputer.fit_transform(Selected_df),
    columns=Selected_df.columns,
    index=Selected_df.index,
)

In [None]:
# Check dtypes and non-null counts of the imputed feature frame
Selected_df.info()

In [None]:
# Histogram (20 bins) of every column of the imputed feature frame
Selected_df.hist(figsize=(15, 12), bins=20, color='#A3C8FF');

# III. Model training

In [None]:
# Quadratic weighted kappa (QWK)
def quadratic_weighted_kappa(y_true, y_pred):
    """Return Cohen's kappa between y_true and y_pred with quadratic weights."""
    kappa = cohen_kappa_score(y_true, y_pred, weights='quadratic')
    return kappa

# Map continuous scores to the 4 classes (0-3) using 3 cut points
def threshold_Rounder(oof_non_rounded, thresholds):
    """Assign each score the index of the first threshold it falls below.

    Scores below thresholds[0] become 0, otherwise below thresholds[1] become 1,
    otherwise below thresholds[2] become 2; everything else becomes 3. The
    thresholds are tested in the given order, so they need not be sorted.
    """
    below_cut = [oof_non_rounded < thresholds[level] for level in range(3)]
    return np.select(below_cut, [0, 1, 2], default=3)

def evaluate_predictions(thresholds, y_true, oof_non_rounded):
    """Return the negated QWK obtained when the scores are cut at `thresholds`.

    The sign is flipped so that a minimiser maximises the kappa.
    """
    predicted_labels = threshold_Rounder(oof_non_rounded, thresholds)
    kappa = quadratic_weighted_kappa(y_true, predicted_labels)
    return -kappa

In [None]:
# define input
# X: imputed training features; y: target of the labelled rows;
# test: the same feature columns taken from the test frame.
# NOTE(review): test is neither imputed nor season-encoded in this cell — confirm
# that the models are meant to receive it in raw form.
X = Selected_df;
y = usable_train_df['sii']
test = test_df[selectedFeatures]

In [None]:
# Compare columns, dtypes and non-null counts of the test and training inputs
test.info()
X.info()

### Model

In [None]:
# LightGBM hyper-parameters (unpacked into lgb.LGBMClassifier)
Params_LGB = {
    'learning_rate': 0.046,
    'max_depth': 12,
    'num_leaves': 478,
    'min_data_in_leaf': 13,
    'feature_fraction': 0.893,
    'bagging_fraction': 0.784,
    'bagging_freq': 4,
    'lambda_l1': 10,
    'lambda_l2': 0.01,
    'device': 'cpu'  # 'cpu' replaces 'gpu'
}

# XGBoost hyper-parameters (unpacked into xgb.XGBClassifier)
XGB_Params = {
    'learning_rate': 0.05,
    'max_depth': 6,
    'n_estimators': 200,
    'subsample': 0.8,
    'colsample_bytree': 0.8,
    'reg_alpha': 1,
    'reg_lambda': 5,
    'random_state': 0,
    'tree_method': 'hist',  # 'hist' replaces 'gpu_hist' so training runs on the CPU
}

# CatBoost hyper-parameters (unpacked into cb.CatBoostClassifier)
CatBoost_Params = {
    'learning_rate': 0.05,
    'depth': 6,
    'iterations': 200,
    'random_seed': 0,
    'verbose': 0,
    'l2_leaf_reg': 10,
    'task_type': 'CPU'  # 'CPU' replaces 'GPU'
}


In [None]:
# Base learners, each built from its hyper-parameter dict
lgb_model = lgb.LGBMClassifier(**Params_LGB)
xgb_model = xgb.XGBClassifier(**XGB_Params)
catboost_model = cb.CatBoostClassifier(**CatBoost_Params)

# Voting Classifier
# No `voting` argument is passed here, so the estimator's default mode applies.
# NOTE(review): the following cell rebuilds all four objects with voting='soft',
# which supersedes this definition — consider removing this cell.
voting_clf = VotingClassifier(
    estimators=[
        ('lgb', lgb_model),
        ('xgb', xgb_model),
        ('catboost', catboost_model)
    ]
)

In [1]:
# Base learners, each built from its hyper-parameter dict
lgb_model = lgb.LGBMClassifier(**Params_LGB)
xgb_model = xgb.XGBClassifier(**XGB_Params)
catboost_model = cb.CatBoostClassifier(**CatBoost_Params)

# Soft-voting ensemble: the class probabilities of the three learners are
# averaged and the class with the highest mean probability is predicted.
base_estimators = [
    ('lgb', lgb_model),
    ('xgb', xgb_model),
    ('catboost', catboost_model),
]
voting_clf = VotingClassifier(estimators=base_estimators, voting='soft')

def _expected_severity(model, features):
    """Return the probability-weighted mean class label for every row of `features`."""
    class_values = np.asarray(model.classes_, dtype=float)
    return model.predict_proba(features) @ class_values


def ModelTraining(X, y, test_data, n_splits=10):
    """Cross-validate the global ``voting_clf`` and build a thresholded submission.

    In every stratified fold the ensemble is refitted and scored with a
    continuous severity estimate: the probability-weighted mean of the class
    labels. Those out-of-fold scores are what the class cut points are tuned
    on. Hard ``predict()`` labels are already discrete, so tuning thresholds on
    them could not change any prediction.

    Parameters
    ----------
    X : pandas.DataFrame
        Training features; rows are selected by position (``.iloc``).
    y : pandas.Series
        Numeric class labels, positionally aligned with ``X``.
    test_data : pandas.DataFrame
        Features to predict for the submission, in the row order of the global
        ``test_df`` (its ``id`` column is used for the output).
    n_splits : int, default 10
        Number of stratified folds.

    Returns
    -------
    pandas.DataFrame
        Columns ``id`` and ``sii`` (thresholded average of the fold predictions).

    Notes
    -----
    Requires ``voting_clf`` to expose ``predict_proba`` (i.e. soft voting).
    ``voting_clf`` is refitted in place and ends up fitted on the last fold.
    """
    SKF = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=0)

    # Per-fold QWK on the training part and on the held-out part
    train_S = []
    test_S = []

    # Out-of-fold scores (continuous and rounded) and one test-set column per fold
    oof_non_rounded = np.zeros(len(y), dtype=float)
    oof_rounded = np.zeros(len(y), dtype=int)
    test_preds = np.zeros((len(test_data), n_splits))

    # training loop
    for fold, (train_idx, test_idx) in enumerate(tqdm(SKF.split(X, y), desc="Training Folds", total=n_splits)):
        X_train, X_val = X.iloc[train_idx], X.iloc[test_idx]
        y_train, y_val = y.iloc[train_idx], y.iloc[test_idx]

        voting_clf.fit(X_train, y_train)

        # Continuous severity scores on the training and held-out rows
        y_train_pred = _expected_severity(voting_clf, X_train)
        y_val_pred = _expected_severity(voting_clf, X_val)

        # Store the held-out scores, raw and rounded to the nearest class
        oof_non_rounded[test_idx] = y_val_pred
        y_val_pred_rounded = y_val_pred.round(0).astype(int)
        oof_rounded[test_idx] = y_val_pred_rounded

        # QWK with plain rounding, before any threshold tuning
        train_kappa = quadratic_weighted_kappa(y_train, y_train_pred.round(0).astype(int))
        val_kappa = quadratic_weighted_kappa(y_val, y_val_pred_rounded)

        train_S.append(train_kappa)
        test_S.append(val_kappa)

        # Continuous test-set scores of this fold's model
        test_preds[:, fold] = _expected_severity(voting_clf, test_data)

    # After all folds
    print(f"Mean Train QWK --> {np.mean(train_S):.4f}")
    print(f"Mean Validation QWK --> {np.mean(test_S):.4f}")

    # Tune the three cut points on the out-of-fold scores (maximise QWK)
    KappaOPtimizer = minimize(evaluate_predictions,
                              x0=[0.5, 1.5, 2.5], args=(y, oof_non_rounded),
                              method='Nelder-Mead')

    # QWK of the out-of-fold scores under the tuned cut points
    oof_tuned = threshold_Rounder(oof_non_rounded, KappaOPtimizer.x)
    tKappa = quadratic_weighted_kappa(y, oof_tuned)

    print(f"----> || Optimized QWK SCORE :: {tKappa:.3f}")

    # Average the fold scores per test row, then apply the tuned cut points
    tpm = test_preds.mean(axis=1)
    tpTuned = threshold_Rounder(tpm, KappaOPtimizer.x)

    # Create the submission (ids come from the global test_df)
    submission = pd.DataFrame({
        'id': test_df['id'],
        'sii': tpTuned
    })

    return submission


NameError: name 'lgb' is not defined

In [None]:
# Run the cross-validated training and obtain the submission frame
submission = ModelTraining(X, y, test)

In [None]:
# Write the submission without the DataFrame index
submission.to_csv("submission.csv", index=False)