In [1]:
!pip -q install /kaggle/input/pytorchtabnet/pytorch_tabnet-4.1.0-py3-none-any.whl

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer, KNNImputer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from concurrent.futures import ThreadPoolExecutor
from tqdm import tqdm
from pytorch_tabnet.tab_model import TabNetClassifier, TabNetRegressor
from xgboost import XGBRegressor
from pytorch_tabnet.callbacks import Callback
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.preprocessing import StandardScaler

In [3]:
train_df = pd.read_csv('/kaggle/input/child-mind-institute-problematic-internet-use/train.csv')
test_df = pd.read_csv('/kaggle/input/child-mind-institute-problematic-internet-use/test.csv')

In [4]:
conflict_rows = train_df[(train_df['PAQ_A-PAQ_A_Total'].notna()) & (train_df['PAQ_C-PAQ_C_Total'].notna())]

# 判斷是否存在衝突行
if not conflict_rows.empty:
    train_df = train_df.drop(conflict_rows.index)

In [5]:
# 將合併結果存回 column1
train_df['PAQ_A-PAQ_A_Total'] = train_df['PAQ_A-PAQ_A_Total'].fillna(train_df['PAQ_C-PAQ_C_Total'])
train_df['PAQ_A-Season'] = train_df['PAQ_A-Season'].fillna(train_df['PAQ_C-Season'])
test_df['PAQ_A-PAQ_A_Total'] = test_df['PAQ_A-PAQ_A_Total'].fillna(test_df['PAQ_C-PAQ_C_Total'])
test_df['PAQ_A-Season'] = test_df['PAQ_A-Season'].fillna(test_df['PAQ_C-Season'])

# 刪除 column2
train_df = train_df.drop(columns=['PAQ_C-PAQ_C_Total', 'PAQ_C-Season'])
test_df = test_df.drop(columns=['PAQ_C-PAQ_C_Total', 'PAQ_C-Season'])

train_df = train_df.rename(columns={'PAQ_A-Season': 'PAQ-Season'})
train_df = train_df.rename(columns={'PAQ_A-PAQ_A_Total': 'PAQ-PAQ_Total'})
test_df = test_df.rename(columns={'PAQ_A-Season': 'PAQ-Season'})
test_df = test_df.rename(columns={'PAQ_A-PAQ_A_Total': 'PAQ-PAQ_Total'})

In [6]:
df = train_df.dropna(axis=1, thresh=len(train_df) - 3000)

In [7]:
def process_file(filename, dirname):
    df = pd.read_parquet(os.path.join(dirname, filename, 'part-0.parquet'))
    df.drop('step', axis=1, inplace=True)
    return df.describe().values.reshape(-1), filename.split('=')[1]

In [8]:
def load_time_series(dirname) -> pd.DataFrame:
    ids = os.listdir(dirname)
    
    with ThreadPoolExecutor() as executor:
        results = list(tqdm(executor.map(lambda fname: process_file(fname, dirname), ids), total=len(ids)))
    
    stats, indexes = zip(*results)
    
    df = pd.DataFrame(stats, columns=[f"Stat_{i}" for i in range(len(stats[0]))])
    df['id'] = indexes
    
    return df

class AutoEncoder(nn.Module):
    def __init__(self, input_dim, encoding_dim):
        super(AutoEncoder, self).__init__()
        self.encoder = nn.Sequential(
            nn.Linear(input_dim, encoding_dim*3),
            nn.ReLU(),
            nn.Linear(encoding_dim*3, encoding_dim*2),
            nn.ReLU(),
            nn.Linear(encoding_dim*2, encoding_dim),
            nn.ReLU()
        )
        self.decoder = nn.Sequential(
            nn.Linear(encoding_dim, input_dim*2),
            nn.ReLU(),
            nn.Linear(input_dim*2, input_dim*3),
            nn.ReLU(),
            nn.Linear(input_dim*3, input_dim),
            nn.Sigmoid()
        )
        
    def forward(self, x):
        encoded = self.encoder(x)
        decoded = self.decoder(encoded)
        return decoded

def perform_autoencoder(df, encoding_dim=50, epochs=50, batch_size=32):
    scaler = StandardScaler()
    df_scaled = scaler.fit_transform(df)
    
    data_tensor = torch.FloatTensor(df_scaled)
    
    input_dim = data_tensor.shape[1]
    autoencoder = AutoEncoder(input_dim, encoding_dim)
    
    criterion = nn.MSELoss()
    optimizer = optim.Adam(autoencoder.parameters())
    
    for epoch in range(epochs):
        for i in range(0, len(data_tensor), batch_size):
            batch = data_tensor[i : i + batch_size]
            optimizer.zero_grad()
            reconstructed = autoencoder(batch)
            loss = criterion(reconstructed, batch)
            loss.backward()
            optimizer.step()
            
        if (epoch + 1) % 10 == 0:
            print(f'Epoch [{epoch + 1}/{epochs}], Loss: {loss.item():.4f}]')
                 
    with torch.no_grad():
        encoded_data = autoencoder.encoder(data_tensor).numpy()
        
    df_encoded = pd.DataFrame(encoded_data, columns=[f'Enc_{i + 1}' for i in range(encoded_data.shape[1])])
    
    return df_encoded

def feature_engineering(df):
    season_cols = [col for col in df.columns if 'Season' in col]
    df = df.drop(season_cols, axis=1) 
    df['BMI_Age'] = df['Physical-BMI'] * df['Basic_Demos-Age']
    df['Internet_Hours_Age'] = df['PreInt_EduHx-computerinternet_hoursday'] * df['Basic_Demos-Age']
    df['BMI_Internet_Hours'] = df['Physical-BMI'] * df['PreInt_EduHx-computerinternet_hoursday']
    df['BFP_BMI'] = df['BIA-BIA_Fat'] / df['BIA-BIA_BMI']
    df['FFMI_BFP'] = df['BIA-BIA_FFMI'] / df['BIA-BIA_Fat']
    df['FMI_BFP'] = df['BIA-BIA_FMI'] / df['BIA-BIA_Fat']
    df['LST_TBW'] = df['BIA-BIA_LST'] / df['BIA-BIA_TBW']
    df['BFP_BMR'] = df['BIA-BIA_Fat'] * df['BIA-BIA_BMR']
    df['BFP_DEE'] = df['BIA-BIA_Fat'] * df['BIA-BIA_DEE']
    # df['BMR_Weight'] = df['BIA-BIA_BMR'] / df['Physical-Weight']
    # df['DEE_Weight'] = df['BIA-BIA_DEE'] / df['Physical-Weight']
    df['SMM_Height'] = df['BIA-BIA_SMM'] / df['Physical-Height']
    df['Muscle_to_Fat'] = df['BIA-BIA_SMM'] / df['BIA-BIA_FMI']
    # df['Hydration_Status'] = df['BIA-BIA_TBW'] / df['Physical-Weight']
    df['ICW_TBW'] = df['BIA-BIA_ICW'] / df['BIA-BIA_TBW']
    
    return df

In [9]:
# 把SII是空的column刪除
train_df = train_df.dropna(subset=['sii'])

In [10]:
# PCIAT 有些欄位是空的，會影響最後SII結果，把若填滿PCIAT有可能改變SII的column刪除
PCIAT_cols = [f'PCIAT-PCIAT_{i+1:02d}' for i in range(20)]
def IncorrectRows(row):
    if pd.isna(row['PCIAT-PCIAT_Total']):
        return np.nan
    max_possible = row['PCIAT-PCIAT_Total'] + row[PCIAT_cols].isna().sum() * 5
    if row['PCIAT-PCIAT_Total'] <= 30 and max_possible <= 30:
        return 0
    elif 31 <= row['PCIAT-PCIAT_Total'] <= 49 and max_possible <= 49:
        return 1
    elif 50 <= row['PCIAT-PCIAT_Total'] <= 79 and max_possible <= 79:
        return 2
    elif row['PCIAT-PCIAT_Total'] >= 80 and max_possible >= 80:
        return 3
    return np.nan

train_df['recal_sii'] = train_df.apply(IncorrectRows, axis=1)

In [11]:
mismatch_rows = train_df[
    (train_df['recal_sii'] != train_df['sii']) & train_df['sii'].notna()
]
mismatch_indexes = mismatch_rows.index
train_df = train_df.drop(mismatch_indexes)
train_df = train_df.drop(['recal_sii'], axis=1)

In [12]:
# 把有關Season的column做mapping 
SEASON_COLS = [
    "Basic_Demos-Enroll_Season", 
    "CGAS-Season", 
    "Physical-Season", 
    "Fitness_Endurance-Season", 
    "FGC-Season", 
    "BIA-Season", 
    "PAQ-Season",
    "SDS-Season",
    "PreInt_EduHx-Season", 
    ]
def update(df):
    for c in SEASON_COLS: 
        df[c] = df[c].fillna('Missing')
        df[c] = df[c].astype('category')
    return df
train_df = update(train_df)
test_df = update(test_df)
season_mapping = {'Spring': 0, 'Summer': 1, 'Fall': 2, 'Winter': 3, 'Missing': 4}
for col in SEASON_COLS:
    train_df[col] = train_df[col].map(season_mapping)
    test_df[col] = test_df[col].map(season_mapping)
train_df['PCIAT-Season'] = train_df['PCIAT-Season'].map(season_mapping)

In [13]:
# 做Imputer
train_id = train_df['id']
test_id = test_df['id']
train_features = train_df.drop(columns=['id'])
test_features = test_df.drop(columns=['id'])

imputer = SimpleImputer(strategy='median')
train_features_imputed = pd.DataFrame(imputer.fit_transform(train_features), columns=train_features.columns, index=train_features.index)
test_features_imputed = pd.DataFrame(imputer.fit_transform(test_features), columns=test_features.columns, index=test_features.index)

train_df = pd.concat([train_id, train_features_imputed], axis=1)
test_df = pd.concat([test_id, test_features_imputed], axis=1)

In [14]:
train_cor = train_df.drop('id', axis=1)
test_cor = test_df.drop('id', axis=1)

In [15]:
# # 尋找和PCIAT_Total相關性低的column並刪除 
# corr_matrix = train_cor[['PCIAT-PCIAT_Total', 'Basic_Demos-Age', 'Basic_Demos-Sex', 'Physical-BMI', 
#                         'Physical-Height', 'Physical-Weight', 'Physical-Waist_Circumference',
#                         'Physical-Diastolic_BP', 'Physical-Systolic_BP', 'Physical-HeartRate',
#                         'PreInt_EduHx-computerinternet_hoursday', 'SDS-SDS_Total_T', 'PAQ-PAQ_Total',
#                         'Fitness_Endurance-Max_Stage', 'Fitness_Endurance-Time_Mins', 
#                         'Fitness_Endurance-Time_Sec', 'FGC-FGC_CU', 'FGC-FGC_GSND', 'FGC-FGC_GSD', 
#                         'FGC-FGC_PU', 'FGC-FGC_SRL', 'FGC-FGC_SRR', 'FGC-FGC_TL', 'BIA-BIA_Activity_Level_num', 
#                         'BIA-BIA_BMC', 'BIA-BIA_BMI', 'BIA-BIA_BMR', 'BIA-BIA_DEE', 'BIA-BIA_ECW', 'BIA-BIA_FFM',
#                         'BIA-BIA_FFMI', 'BIA-BIA_FMI', 'BIA-BIA_Fat', 'BIA-BIA_Frame_num', 'BIA-BIA_ICW', 
#                         'BIA-BIA_LDM', 'BIA-BIA_LST', 'BIA-BIA_SMM', 'BIA-BIA_TBW']].corr()
# sii_corr = corr_matrix['PCIAT-PCIAT_Total'].drop('PCIAT-PCIAT_Total')
# filtered_corr = sii_corr[(sii_corr > 0.1) | (sii_corr < -0.1)]
# other_corr = sii_corr[(sii_corr <= 0.1) & (sii_corr >= -0.1)]
# other_corr_columns = other_corr.index.tolist()
# print(other_corr)

In [16]:
# plt.figure(figsize=(8, 6))
# filtered_corr.sort_values().plot(kind='barh', color='coral')
# plt.title('Features with Correlation > 0.1 or < -0.1 with PCIAT-PCIAT_Total')
# plt.xlabel('Correlation coefficient')
# plt.ylabel('Features')
# plt.show()

In [17]:
# 把parquet data加進去 
train_ts = load_time_series("/kaggle/input/child-mind-institute-problematic-internet-use/series_train.parquet")
test_ts = load_time_series("/kaggle/input/child-mind-institute-problematic-internet-use/series_test.parquet")

train_ts_noID = train_ts.drop('id', axis=1)
test_ts_noID = test_ts.drop('id', axis=1)

train_ts_encoded = perform_autoencoder(train_ts_noID, encoding_dim=60, epochs=100, batch_size=32)
test_ts_encoded = perform_autoencoder(test_ts_noID, encoding_dim=60, epochs=100, batch_size=32)

time_series_cols = train_ts_encoded.columns.tolist()
train_ts_encoded["id"]=train_ts["id"]
test_ts_encoded['id']=test_ts["id"]

100%|██████████| 996/996 [01:21<00:00, 12.24it/s]
100%|██████████| 2/2 [00:00<00:00, 10.34it/s]


Epoch [10/100], Loss: 1.6444]
Epoch [20/100], Loss: 1.5530]
Epoch [30/100], Loss: 1.5247]
Epoch [40/100], Loss: 1.5126]
Epoch [50/100], Loss: 1.5120]
Epoch [60/100], Loss: 1.5064]
Epoch [70/100], Loss: 1.5068]
Epoch [80/100], Loss: 1.5042]
Epoch [90/100], Loss: 1.4803]
Epoch [100/100], Loss: 1.4700]
Epoch [10/100], Loss: 0.9721]
Epoch [20/100], Loss: 0.4846]
Epoch [30/100], Loss: 0.4271]
Epoch [40/100], Loss: 0.4271]
Epoch [50/100], Loss: 0.4271]
Epoch [60/100], Loss: 0.4271]
Epoch [70/100], Loss: 0.4271]
Epoch [80/100], Loss: 0.4271]
Epoch [90/100], Loss: 0.4271]
Epoch [100/100], Loss: 0.4271]


In [18]:
TARGET_COLS = [
    "PCIAT-Season",
    "PCIAT-PCIAT_01",
    "PCIAT-PCIAT_02",
    "PCIAT-PCIAT_03",
    "PCIAT-PCIAT_04",
    "PCIAT-PCIAT_05",
    "PCIAT-PCIAT_06",
    "PCIAT-PCIAT_07",
    "PCIAT-PCIAT_08",
    "PCIAT-PCIAT_09",
    "PCIAT-PCIAT_10",
    "PCIAT-PCIAT_11",
    "PCIAT-PCIAT_12",
    "PCIAT-PCIAT_13",
    "PCIAT-PCIAT_14",
    "PCIAT-PCIAT_15",
    "PCIAT-PCIAT_16",    
    "PCIAT-PCIAT_17",
    "PCIAT-PCIAT_18",
    "PCIAT-PCIAT_19",
    "PCIAT-PCIAT_20",
    "PCIAT-PCIAT_Total"
]
train_df = train_df.drop(TARGET_COLS,axis=1)

In [19]:
train_df = pd.merge(train_df, train_ts_encoded, how="left", on='id')
test_df = pd.merge(test_df, test_ts_encoded, how="left", on='id')

In [20]:
imputer = KNNImputer(n_neighbors=5)
numeric_cols = train_df.select_dtypes(include=['float64', 'int64','float32', 'int32']).columns
imputed_data = imputer.fit_transform(train_df[numeric_cols])
train_imputed = pd.DataFrame(imputed_data, columns=numeric_cols)
train_imputed['sii'] = train_imputed['sii'].round().astype(int)
for col in train_df.columns:
    if col not in numeric_cols:
        train_imputed[col] = train_df[col]
        
train_df = train_imputed

train_df = feature_engineering(train_df)
train_df = train_df.dropna(thresh=10, axis=0)
test_df = feature_engineering(test_df)

In [21]:
# train_df = train_df.drop(columns=other_corr_columns)
# test_df = test_df.drop(columns=other_corr_columns)

In [22]:
featuresCols = ['Basic_Demos-Age', 'Basic_Demos-Sex',
                'CGAS-CGAS_Score', 'Physical-BMI',
                'Physical-Height', 'Physical-Weight', 'Physical-Waist_Circumference',
                'Physical-Diastolic_BP', 'Physical-HeartRate', 'Physical-Systolic_BP',
                'Fitness_Endurance-Max_Stage',
                'Fitness_Endurance-Time_Mins', 'Fitness_Endurance-Time_Sec',
                'FGC-FGC_CU', 'FGC-FGC_CU_Zone', 'FGC-FGC_GSND',
                'FGC-FGC_GSND_Zone', 'FGC-FGC_GSD', 'FGC-FGC_GSD_Zone', 'FGC-FGC_PU',
                'FGC-FGC_PU_Zone', 'FGC-FGC_SRL', 'FGC-FGC_SRL_Zone', 'FGC-FGC_SRR',
                'FGC-FGC_SRR_Zone', 'FGC-FGC_TL', 'FGC-FGC_TL_Zone',
                'BIA-BIA_Activity_Level_num', 'BIA-BIA_BMC', 'BIA-BIA_BMI',
                'BIA-BIA_BMR', 'BIA-BIA_DEE', 'BIA-BIA_ECW', 'BIA-BIA_FFM',
                'BIA-BIA_FFMI', 'BIA-BIA_FMI', 'BIA-BIA_Fat', 'BIA-BIA_Frame_num',
                'BIA-BIA_ICW', 'BIA-BIA_LDM', 'BIA-BIA_LST', 'BIA-BIA_SMM',
                'BIA-BIA_TBW', 'PAQ-PAQ_Total',
                'SDS-SDS_Total_Raw','SDS-SDS_Total_T',
                'PreInt_EduHx-computerinternet_hoursday', 'sii', 'BMI_Age','Internet_Hours_Age','BMI_Internet_Hours',
                'BFP_BMI', 'FFMI_BFP', 'FMI_BFP', 'LST_TBW', 'BFP_BMR', 'BFP_DEE','SMM_Height', 'Muscle_to_Fat', 'ICW_TBW']

featuresCols += time_series_cols

train_df = train_df[featuresCols]
train_df = train_df.dropna(subset='sii')

featuresCols = ['Basic_Demos-Age', 'Basic_Demos-Sex',
                'CGAS-CGAS_Score', 'Physical-BMI',
                'Physical-Height', 'Physical-Weight', 'Physical-Waist_Circumference',
                'Physical-Diastolic_BP', 'Physical-HeartRate', 'Physical-Systolic_BP',
                'Fitness_Endurance-Max_Stage',
                'Fitness_Endurance-Time_Mins', 'Fitness_Endurance-Time_Sec',
                'FGC-FGC_CU', 'FGC-FGC_CU_Zone', 'FGC-FGC_GSND',
                'FGC-FGC_GSND_Zone', 'FGC-FGC_GSD', 'FGC-FGC_GSD_Zone', 'FGC-FGC_PU',
                'FGC-FGC_PU_Zone', 'FGC-FGC_SRL', 'FGC-FGC_SRL_Zone', 'FGC-FGC_SRR',
                'FGC-FGC_SRR_Zone', 'FGC-FGC_TL', 'FGC-FGC_TL_Zone',
                'BIA-BIA_Activity_Level_num', 'BIA-BIA_BMC', 'BIA-BIA_BMI',
                'BIA-BIA_BMR', 'BIA-BIA_DEE', 'BIA-BIA_ECW', 'BIA-BIA_FFM',
                'BIA-BIA_FFMI', 'BIA-BIA_FMI', 'BIA-BIA_Fat', 'BIA-BIA_Frame_num',
                'BIA-BIA_ICW', 'BIA-BIA_LDM', 'BIA-BIA_LST', 'BIA-BIA_SMM',
                'BIA-BIA_TBW', 'PAQ-PAQ_Total',
                'SDS-SDS_Total_Raw',
                'SDS-SDS_Total_T',
                'PreInt_EduHx-computerinternet_hoursday', 'BMI_Age','Internet_Hours_Age','BMI_Internet_Hours',
                'BFP_BMI', 'FFMI_BFP', 'FMI_BFP', 'LST_TBW', 'BFP_BMR', 'BFP_DEE',
                'SMM_Height', 'Muscle_to_Fat', 'ICW_TBW']

featuresCols += time_series_cols
test_df = test_df[featuresCols]

In [23]:
# train_df = train_df.drop('id', axis=1)
# test_df = test_df.drop('id', axis=1)

In [24]:
print(f'Train Shape : {train_df.shape} || Test Shape : {test_df.shape}')

Train Shape : (2718, 120) || Test Shape : (20, 119)


In [25]:
train_df = train_df.fillna(0)
test_df = test_df.fillna(0)

In [26]:
from imblearn.over_sampling import SMOTE
X_train = train_df.drop(columns=['sii'])  # 假設 'sii' 是目標欄位
y_train = train_df['sii']

# 使用 SMOTE 進行過採樣
smote = SMOTE(random_state=42)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)

In [27]:
X_train_resampled

Unnamed: 0,Basic_Demos-Age,Basic_Demos-Sex,CGAS-CGAS_Score,Physical-BMI,Physical-Height,Physical-Weight,Physical-Waist_Circumference,Physical-Diastolic_BP,Physical-HeartRate,Physical-Systolic_BP,...,Enc_51,Enc_52,Enc_53,Enc_54,Enc_55,Enc_56,Enc_57,Enc_58,Enc_59,Enc_60
0,5.000000,0.000000,51.000000,16.877316,46.000000,50.800000,26.000000,68.000000,81.000000,114.000000,...,1.261670,3.364906,1.822499,3.199418,1.867914,0.601543,1.124913,1.397312,2.965155,1.810345
1,9.000000,0.000000,65.000000,14.035590,48.000000,46.000000,22.000000,75.000000,70.000000,122.000000,...,0.877661,2.240514,2.114751,1.985586,2.209620,0.819748,2.199565,1.322297,1.787001,1.673745
2,10.000000,1.000000,71.000000,16.648696,56.500000,75.600000,26.000000,65.000000,94.000000,117.000000,...,1.659091,0.704123,1.858778,1.130553,1.789839,1.242414,6.432002,1.905194,0.867131,2.600431
3,9.000000,0.000000,71.000000,18.292347,56.000000,81.600000,26.000000,60.000000,97.000000,117.000000,...,1.258142,2.517515,0.000000,2.423617,2.779365,0.000000,0.000000,0.000000,2.142169,0.294263
4,13.000000,1.000000,50.000000,22.279952,59.500000,112.200000,26.000000,60.000000,73.000000,102.000000,...,2.321296,0.205333,3.107339,0.000000,3.114447,1.274335,3.261590,2.929590,1.146064,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6339,12.599601,0.000000,71.005984,28.680067,65.599601,175.479681,26.000000,74.897407,86.599601,134.201197,...,0.525903,0.324441,3.958662,0.538026,1.577378,1.804936,2.348030,3.283607,2.516716,2.757017
6340,13.646459,0.646459,57.929175,22.659126,63.136403,131.784379,26.000000,97.929175,87.363743,154.192073,...,0.722748,0.678223,3.279521,0.606808,2.857975,2.727269,1.942552,1.797713,1.541758,3.451328
6341,14.000000,0.974194,65.077417,18.905577,60.601618,98.790989,26.000000,104.045194,81.000000,144.200028,...,0.100740,0.387067,3.715885,0.029209,3.591292,3.906484,1.904498,1.364211,1.504025,4.319537
6342,17.000000,0.152939,53.058783,22.445850,66.464713,141.155331,26.000000,79.694122,79.529392,128.847061,...,1.780591,0.936604,2.920789,0.641967,2.601398,1.152416,3.379448,2.633135,0.877951,1.667096


In [28]:
y_train_resampled

0       2
1       0
2       0
3       1
4       1
       ..
6339    3
6340    3
6341    3
6342    3
6343    3
Name: sii, Length: 6344, dtype: int64

In [29]:
X_train = X_train_resampled.values
y_train = y_train_resampled.values.reshape(-1, 1)
X_test = test_df.values

# model = MLPClassifier(hidden_layer_sizes=(128, 64, 32), max_iter=500, random_state=42)
# model.fit(X_train, y_train)

# model = RandomForestClassifier(random_state=0)
# model.fit(X_train, y_train)

X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, random_state=42)

model = TabNetRegressor(
    n_d=64,              # Width of the decision prediction layer
    n_a=64,              # Width of the attention embedding for each step
    n_steps=5,           # Number of steps in the architecture
    gamma=1.5,           # Coefficient for feature selection regularization
    n_independent=2,     # Number of independent GLU layer in each GLU block
    n_shared=2,          # Number of shared GLU layer in each GLU block
    lambda_sparse=1e-4,  # Sparsity regularization
    optimizer_fn=torch.optim.Adam,
    optimizer_params=dict(lr=2e-2, weight_decay=1e-5),
    mask_type='entmax',
    scheduler_params=dict(mode="min", patience=10, min_lr=1e-5, factor=0.5),
    scheduler_fn=torch.optim.lr_scheduler.ReduceLROnPlateau,
    verbose=1,
    device_name='cuda' if torch.cuda.is_available() else 'cpu'
)


# 訓練 TabNet 模型
model.fit(
    X_train, y_train,
    eval_set=[(X_val, y_val)],    # 指定驗證集
    eval_name=['val'],            # 命名驗證集
    eval_metric=['mae'],          # 設定評估指標，例如 MAE
    max_epochs=500,      
    patience=50,          
    batch_size=1024,
    virtual_batch_size=128,
    num_workers=0,
    drop_last=False,
)

# model = XGBRegressor(
#     n_estimators=200,          # 設置樹的數量
#     learning_rate=0.05,         # 設置學習率
#     max_depth=6,                # 最大樹深
#     subsample=0.8,              # 隨機採樣比例
#     colsample_bytree=0.8,       # 每棵樹的列采樣率
#     reg_alpha=1,  # Increased from 0.1
#     reg_lambda=5,  # Increased from 1
#     random_state=42
# )

# # 訓練 XGBRegressor，並設置早停
# model.fit(
#     X_train, y_train,
#     eval_set=[(X_val, y_val)],             # 指定驗證集
#     early_stopping_rounds=50,              # 如果在 50 個 rounds 中，驗證集上的結果不再改善則停止訓練
#     verbose=True                           # 顯示訓練過程
# )


test_df['sii'] = model.predict(X_test)



epoch 0  | loss: 6.48088 | val_mae: 11.4224 |  0:00:01s
epoch 1  | loss: 3.0014  | val_mae: 5.97052 |  0:00:01s
epoch 2  | loss: 2.18413 | val_mae: 4.40949 |  0:00:02s
epoch 3  | loss: 1.87663 | val_mae: 3.81592 |  0:00:02s
epoch 4  | loss: 1.22423 | val_mae: 3.40053 |  0:00:02s
epoch 5  | loss: 1.08828 | val_mae: 2.61743 |  0:00:03s
epoch 6  | loss: 0.97629 | val_mae: 3.78685 |  0:00:03s
epoch 7  | loss: 0.89271 | val_mae: 5.3196  |  0:00:04s
epoch 8  | loss: 0.85532 | val_mae: 5.82879 |  0:00:04s
epoch 9  | loss: 0.74888 | val_mae: 2.86027 |  0:00:04s
epoch 10 | loss: 0.6972  | val_mae: 1.41034 |  0:00:05s
epoch 11 | loss: 0.69719 | val_mae: 1.28311 |  0:00:05s
epoch 12 | loss: 0.69969 | val_mae: 1.14508 |  0:00:06s
epoch 13 | loss: 0.59078 | val_mae: 1.08213 |  0:00:06s
epoch 14 | loss: 0.60524 | val_mae: 1.15833 |  0:00:07s
epoch 15 | loss: 0.61787 | val_mae: 0.88686 |  0:00:07s
epoch 16 | loss: 0.62558 | val_mae: 0.93153 |  0:00:07s
epoch 17 | loss: 0.57334 | val_mae: 0.93411 |  0



In [30]:
submit_df = pd.concat([test_id, test_df['sii']], axis=1)
submit_df['sii'] = submit_df['sii'].astype(int)

In [31]:
submit_df

Unnamed: 0,id,sii
0,00008ff9,0
1,000fd460,0
2,00105258,0
3,00115b9f,1
4,0016bb22,0
5,001f3379,1
6,0038ba98,0
7,0068a485,0
8,0069fbed,0
9,0083e397,0


In [32]:
submit_df.to_csv('submission.csv', index=False)