'''
Author:
        
        PARK, JunHo, junho@ccnets.org

        
        KIM, JeongYoong, jeongyoong@ccnets.org
        
    COPYRIGHT (c) 2024. CCNets. All Rights reserved.
'''

# Heart Disease Dataset

Data Source: https://www.kaggle.com/datasets/mahdifaour/heart-disease-dataset/data

<a id="1"></a>
> <h1 style = 'font-family: Times New Roman'><b> <b style = 'color: #42c2f5'>1.</b> Import Necessary Libraries </b></h1>

In [1]:
import os
import sys
import warnings

# Silence every Python warning for the rest of the session.
# NOTE(review): this is a blanket filter; it also hides deprecation and
# numerical warnings raised by the libraries used further down.
warnings.filterwarnings("ignore")

# Relative prefix that is (a) added to the module search path so the project
# packages one directory up can be imported, and (b) reused below to build
# the dataset path.
path_append = "../"

# Only register the prefix once so that re-running this cell does not keep
# growing sys.path with duplicate entries.
if path_append not in sys.path:
    sys.path.append(path_append)  # Go up one directory from where you are.

In [2]:
import pandas  as pd

# Build the CSV location from the `path_append` prefix defined in the setup
# cell, then load it into a DataFrame.
csv_path = path_append + '../data/Heart Disease Dataset/Heart_Disease.csv'
df = pd.read_csv(csv_path)

# Show the first rows as a quick sanity check of the loaded columns.
df.head()

Unnamed: 0,sex,age,education,smokingStatus,cigsPerDay,BPMeds,prevalentStroke,prevalentHyp,diabetes,totChol,sysBP,diaBP,BMI,heartRate,glucose,CHDRisk
0,male,39,4,no,0,0,0,0,no,195,106.0,70.0,26.97,80,77,no
1,female,46,2,no,0,0,0,0,no,250,121.0,81.0,28.73,95,76,no
2,male,48,1,yes,20,0,0,0,no,245,127.5,80.0,25.34,75,70,no
3,female,61,3,yes,30,0,0,1,no,225,150.0,95.0,28.58,65,103,yes
4,female,46,3,yes,23,0,0,0,no,285,130.0,84.0,23.1,85,85,no


<a id="2"></a>
> <h1 style = 'font-family: Times New Roman'><b> <b style = 'color: #4290f5'>2.</b> Modeling: Preprocess </b></h1>

In [3]:
from tools.preprocessing.data_frame import auto_preprocess_dataframe

# Column used as the prediction target, and the column passed explicitly
# through `encode_columns`.
target_columns = ['CHDRisk']
encode_columns = ['education']

# The helper returns the processed frame together with a description object;
# later cells read 'num_features' and 'num_classes' from that description.
# NOTE(review): `df` is overwritten with the processed frame, so re-running
# this cell without re-running the loading cell passes already-processed
# data back into the helper.
preprocess_result = auto_preprocess_dataframe(df, target_columns, encode_columns=encode_columns)
df, description = preprocess_result

Number of missing values in each column:
sex              11
smokingStatus    13
dtype: int64
Number of rows dropped due to missing values: 22

Column 'diabetes' has 2 unique values.
Column 'education' has 4 unique values.
Column 'sex' has 2 unique values.
Column 'smokingStatus' has 2 unique values.
Column 'CHDRisk' has 2 unique values.


Unnamed: 0,Min,Max,Mean,Std,Null Count,Scaled,Encoded
age,-5.808315,4.592238,-1.0,2.339339,0,Minmax,
cigsPerDay,0.0,3.5,0.4546687,0.596618,0,Robust,
BPMeds,-0.03012,0.96988,4.864066e-19,0.170942,0,,
prevalentStroke,-0.00575,0.99425,1.702423e-18,0.075623,0,,
prevalentHyp,-0.310789,0.689211,1.1673760000000001e-17,0.46288,0,,
totChol,-2.122807,6.421053,0.04844257,0.773303,0,Robust,
sysBP,-1.679245,6.301887,0.1652907,0.831567,0,Robust,
diaBP,-2.324786,4.136752,0.0616171,0.815829,0,Robust,
BMI,-1.998987,6.361702,0.07715471,0.820912,0,Robust,
heartRate,-2.214286,4.857143,0.05329761,0.854885,0,Robust,


In [4]:
import torch
from sklearn.model_selection import train_test_split
from tools.preprocessing.template_dataset import TemplateDataset

# Shuffle and hold out 20% of the rows for evaluation; the fixed random_state
# makes the split repeatable.
train_df, test_df = train_test_split(df, test_size=0.2, shuffle=True, random_state=42)

# Positional column split shared by both partitions: everything except the
# last column is treated as input, the last column as the label.
# NOTE(review): this assumes the preprocessing step leaves the target as the
# final column — confirm against auto_preprocess_dataframe.
input_columns = slice(None, -1)
label_columns = slice(-1, None)

train_df_x, train_df_y = train_df.iloc[:, input_columns], train_df.iloc[:, label_columns]
test_df_x, test_df_y = test_df.iloc[:, input_columns], test_df.iloc[:, label_columns]

print('train df shape: ', train_df.shape)
print('test df shape: ', test_df.shape)

# Wrap each (inputs, labels) pair in the project's dataset class.
trainset = TemplateDataset(train_df_x, train_df_y)
testset = TemplateDataset(test_df_x, test_df_y)

train df shape:  (2921, 19)
test df shape:  (731, 19)


In [5]:
from tools.setting.data_config import DataConfig
from tools.setting.ml_params import MLParameters
from trainer_hub import TrainerHub

# Describe the dataset for the trainer; the feature and class counts are read
# from the description returned by the preprocessing cell.
num_features = description['num_features']
num_classes = description['num_classes']
data_config = DataConfig(
    dataset_name = 'heart-disease',
    task_type='binary_classification',
    obs_shape=[num_features],
    label_size=num_classes,
)

# Create the MLParameters object with 'tabnet' selected for both networks,
# then override the individual settings this experiment changes.
ml_params = MLParameters(ccnet_network = 'tabnet', encoder_network = 'tabnet')
ml_params.training.num_epoch = 1000
ml_params.model.ccnet_config.num_layers = 4
ml_params.model.encoder_config.num_layers = 4
ml_params.model.encoder_config.d_model = 512

# Use the GPU when CUDA is available, otherwise fall back to the CPU.
device_name = "cuda" if torch.cuda.is_available() else "cpu"
device = torch.device(device_name)

# Build the trainer from the parameter object, dataset description and device;
# console printing is disabled and Weights & Biases logging is enabled.
trainer_hub = TrainerHub(
    ml_params,
    data_config,
    device,
    use_print=False,
    use_wandb=True,
)

Trainer Name: causal_trainer


[1mModelParameters Parameters:[0m


Unnamed: 0,ccnet_config,ccnet_network,encoder_config,encoder_network
0,See details below,tabnet,See details below,tabnet


[3m
Detailed ccnet_config Configuration:[0m


Unnamed: 0,ccnet_config_model_name,ccnet_config_num_layers,ccnet_config_d_model,ccnet_config_dropout,ccnet_config_obs_shape,ccnet_config_condition_dim,ccnet_config_z_dim
0,tabnet,4,256,0.05,[256],2,8


[3m
Detailed encoder_config Configuration:[0m


Unnamed: 0,encoder_config_model_name,encoder_config_num_layers,encoder_config_d_model,encoder_config_dropout,encoder_config_obs_shape,encoder_config_condition_dim,encoder_config_z_dim
0,tabnet,4,512,0.05,[18],256,256


[1mTrainingParameters Parameters:[0m


Unnamed: 0,batch_size,max_iters,max_seq_len,min_seq_len,num_epoch
0,64,100000,,,1000


[1mOptimizationParameters Parameters:[0m


Unnamed: 0,clip_grad_range,decay_rate_100k,learning_rate,max_grad_norm,scheduler_type
0,,0.05,0.0002,1.0,exponential


[1mAlgorithmParameters Parameters:[0m


Unnamed: 0,enable_diffusion,error_function,reset_pretrained
0,False,mse,False


[1mDataConfig Parameters:[0m


Unnamed: 0,dataset_name,task_type,obs_shape,label_size,explain_size,explain_layer,state_size,show_image_indices
0,heart-disease,binary_classification,[18],2,8,tanh,256,








In [6]:
# Start training, passing the training dataset and the held-out dataset built
# in the split cell. The trainer was created with use_wandb=True, and the
# captured output below shows wandb log lines followed by progress bars.
trainer_hub.train(trainset, testset)

Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33mjunhopark[0m. Use [1m`wandb login --relogin`[0m to force relogin


[34m[1mwandb[0m: Adding directory to artifact (.\..\saved\heart-disease\causal-learning)... Done. 0.0s


Epochs:   0%|          | 0/1000 [00:00<?, ?it/s]

Iterations:   0%|          | 0/45 [00:00<?, ?it/s]

In [None]:
# Run the trainer's test routine on the held-out dataset.
# NOTE(review): this cell has no execution count or output in the saved
# notebook, so it had not been run when the notebook was saved.
trainer_hub.test(testset)