# DA- VAE, Diffusion (Wisconsin breast cancer dataset)

* Using Keras/Tensorflow

In [None]:
import os
import time
# import shap ## for XAI
import numpy as np
import pandas as pd
import seaborn as sns
import tensorflow as tf
# import pingouin as pg
import matplotlib.pyplot as plt
import sklearn.metrics as metrics

In [None]:
from numpy import dot
from numpy.linalg import norm

from keras.models import Sequential
from keras.layers import Dense , Dropout , Lambda, Flatten
from keras.layers import Dense , Activation, Dropout, BatchNormalization
from keras.optimizers import Adam ,RMSprop
from keras.callbacks import Callback

from scipy.special import rel_entr
from scipy.stats import gaussian_kde
from scipy.spatial.distance import jensenshannon

from sklearn.model_selection import train_test_split, StratifiedKFold, KFold, train_test_split, ParameterGrid
from sklearn import decomposition, metrics
from sklearn.cluster import KMeans
from sklearn.metrics import cohen_kappa_score,f1_score, confusion_matrix, roc_auc_score
from sklearn.inspection import permutation_importance
from sklearn.preprocessing import Normalizer, MinMaxScaler
from sklearn.manifold import TSNE

from tensorflow.keras.utils import to_categorical
from tensorflow import keras
from tensorflow.keras.callbacks import LearningRateScheduler, EarlyStopping, ReduceLROnPlateau
from tensorflow.keras.layers import Input, Dense, Lambda, Conv1D, Flatten, Reshape, UpSampling1D
from tensorflow.keras.models import Model
from tensorflow.keras import backend as K
from tensorflow.keras.losses import mse, MeanSquaredError

***

---

## Original data check

In [None]:
### In this research, the dataset is breast cancer dataset (tabular format)

# The CSV location can be overridden with the BREAST_CANCER_CSV environment
# variable so the notebook runs on other machines; the original path is the default.
DATA_PATH = os.environ.get('BREAST_CANCER_CSV', 'E:/RESEARCH/Datasets/bio_data/breast_cancer_wisconsin/data.csv')
data_ori = pd.read_csv(DATA_PATH)

In [None]:
### data shape, variables check
print("The shape of the public breast cancer wisconsin dataset is:",data_ori.shape)
# print(public.columns)
data_ori.head()

In [None]:
# data_ori.columns

In [None]:
# Count the rows that contain at least one missing value
rows_with_na = data_ori.isnull().any(axis="columns")
num_missing_rows = rows_with_na.sum()

print(f"The number of rows that contains at least one missing value: {num_missing_rows}")

In [None]:
# y holds the diagnosis labels and x holds the feature columns
y = data_ori.diagnosis                          # M or B
# Named `drop_cols` rather than `list` so the built-in `list` type is not
# shadowed for the rest of the notebook session.
drop_cols = ['Unnamed: 32', 'id', 'diagnosis']
x = data_ori.drop(drop_cols, axis=1)

In [None]:
y.value_counts()

In [None]:
x.columns

In [None]:
# Look the counts up by label instead of unpacking positionally:
# value_counts() orders by frequency, so positional unpacking would silently
# swap B and M whenever malignant rows outnumber benign ones.
class_counts = y.value_counts()
B = class_counts.get('B', 0)
M = class_counts.get('M', 0)
print('Number of Benign: ', B)
print('Number of Malignant : ', M)

In [None]:
x.describe()

In [None]:
x.shape

--------------

----------

## 1D VAE model

In [None]:
data_ori["diagnosis"].value_counts()

> Checking the breast cancer dataset: the under-represented class needs to be balanced. \
> Below, the binary class is labeled 0 (Benign), 1 (Malignant). \
> Classification: the class-weight ratio should be 1 * (Benign), 1.68 * (Malignant).

In [None]:
data = data_ori

In [None]:
data_B = data[data['diagnosis']=="B"]
data_M = data[data['diagnosis']=="M"]

In [None]:
class Args:
    """Configuration values for the VAE section of the notebook."""

    # reproducibility
    seed = 710674

    # optimisation
    epochs = 100
    bs = 32          # batch size
    lr = 0.0001      # learning rate
    momentum = 0.9

    # model dimensions
    num_classes = 2
    latent_dim = 16
    inter_dim1 = 32
    inter_dim2 = 16


args = Args()

# np.random.seed(args.seed)
# random.seed(args.seed)
# torch.manual_seed(args.seed)

In [None]:
data_vae = data_ori.copy()
# data_vae = data_B.copy()
# data_vae = data_M.copy()

In [None]:
# y holds the diagnosis labels and x holds the feature columns
y = data_vae.diagnosis  # M or B
# Named `drop_cols` rather than `list` so the built-in `list` type is not
# shadowed for the rest of the notebook session.
drop_cols = ['Unnamed: 32', 'id', 'diagnosis']
x = data_vae.drop(drop_cols, axis=1)

In [None]:
data_x = x.copy()

In [None]:
# Rescale every feature column to the [0, 1] range, then keep 5 decimal places
scaler = MinMaxScaler()
scaled_values = scaler.fit_transform(data_x)
data_x = pd.DataFrame(scaled_values, index=data_x.index, columns=data_x.columns).round(5)

In [None]:
# data_x = (data_x - np.mean(data_x, axis=0)) / np.std(data_x, axis=0)  # data normalization (standardization)

In [None]:
# data_x.head()

In [None]:
# # check the data
# print(np.isnan(data_x).any())  # should be False
# print(np.isinf(data_x).any())  # should be False

In [None]:
# data_x = data_x.fillna(data_x.mean())

In [None]:
### breast cancer wisconsin ####### BINARY CLASSIFICATION ##########

# Encode the diagnosis in a single pass (B -> 0, M -> 1). Chained replace()
# calls leave an intermediate mixed-type column and rely on pandas' implicit
# downcasting, which is deprecated; map() yields a clean integer Series.
label = y.map({'B': 0, 'M': 1})
if label.isna().any():
    # Fail early instead of handing unexpected labels to to_categorical()
    raise ValueError("diagnosis column contains values other than 'B' and 'M'")
label = label.astype(int)

data_y = to_categorical(label, 2) ## into the format of one-hot encoding

In [None]:
print("The size of x dataset is:", data_x.shape)
print("The size of y dataset is:", data_y.shape)

### encoder networks

In [None]:
# Encoder: one hidden layer feeding two linear heads (mean and log-variance)
# that the sampling step consumes.
input_dim = data_x.shape[1]
latent_dim = 2  # NOTE(review): Args.latent_dim is 16 but is not used here — confirm which is intended

inputs = Input(shape=(input_dim,))
h = Dense(32, activation='relu')(inputs)
z_mean = Dense(latent_dim)(h)
z_log_var = Dense(latent_dim)(h)

In [None]:
# ## deeper model
# input_dim = data_x.shape[1]
# latent_dim = 2  

# inputs = Input(shape=(input_dim,))
# h = Dense(128, activation='relu')(inputs)
# h = Dense(64, activation='relu')(h)
# h = Dense(32, activation='relu')(h)
# z_mean = Dense(latent_dim)(h)
# z_log_var = Dense(latent_dim)(h)

In [None]:
## latent space sampling
def sampling(args):
    """Reparameterisation step: return mean + sigma * eps with eps ~ N(0, I).

    `args` is the pair (z_mean, z_log_var); sigma is exp(0.5 * z_log_var).
    """
    mean, log_var = args
    n_rows = K.shape(mean)[0]
    n_latent = K.int_shape(mean)[1]
    noise = K.random_normal(shape=(n_rows, n_latent))
    std_dev = K.exp(0.5 * log_var)
    return mean + std_dev * noise

z = Lambda(sampling, output_shape=(latent_dim,))([z_mean, z_log_var])

### decoder networks

In [None]:
decoder_h = Dense(32, activation='relu')
decoder_mean = Dense(input_dim, activation='sigmoid')

h_decoded = decoder_h(z)
x_decoded_mean = decoder_mean(h_decoded)

In [None]:
# ## deeper model
# decoder_h1 = Dense(32, activation='relu')
# decoder_h2 = Dense(64, activation='relu')
# decoder_h3 = Dense(128, activation='relu')
# decoder_mean = Dense(input_dim, activation='sigmoid')

# h_decoded = decoder_h1(z)
# h_decoded = decoder_h2(h_decoded)
# h_decoded = decoder_h3(h_decoded)
# x_decoded_mean = decoder_mean(h_decoded)

### model define

In [None]:
vae = Model(inputs, x_decoded_mean)
vae.summary()

### loss function

In [None]:
# Reconstruction term: mean squared error between the input and its reconstruction.
reconstruction_loss = MeanSquaredError()(inputs, x_decoded_mean)

# KL divergence between N(z_mean, exp(z_log_var)) and N(0, I), summed over the
# latent axis. Closed form: -0.5 * sum(1 + log_var - mean^2 - exp(log_var)).
# The epsilon that used to sit inside the exponent only rescaled the variance
# term (exp(a + eps) = exp(a) * exp(eps)); it gave no numerical protection, so
# it is removed.
kl_loss = -0.5 * K.sum(1 + z_log_var - K.square(z_mean) - K.exp(z_log_var), axis=-1)
# NOTE(review): the reconstruction term is an average while the KL term is a
# sum over latent dimensions — confirm the relative weighting is intended.
vae_loss = K.mean(reconstruction_loss + kl_loss)
vae.add_loss(vae_loss)

In [None]:
vae.compile(optimizer=Adam(learning_rate=0.001))

In [None]:
vae.summary()

### model training

In [None]:
vae.fit(data_x, epochs=args.epochs, batch_size=args.bs, validation_split=0.2, verbose=2)

### extracting encoder, decoder

In [None]:
## encoder model extraction
encoder = Model(inputs, z_mean)

In [None]:
## decoder model extraction
decoder_input = Input(shape=(latent_dim,))
_h_decoded = decoder_h(decoder_input)
_x_decoded_mean = decoder_mean(_h_decoded)
decoder = Model(decoder_input, _x_decoded_mean)

# #### deeper model
# decoder_input = Input(shape=(latent_dim,))
# _h_decoded = decoder_h1(decoder_input)
# _h_decoded = decoder_h2(_h_decoded)
# _h_decoded = decoder_h3(_h_decoded)
# _x_decoded_mean = decoder_mean(_h_decoded)
# decoder = Model(decoder_input, _x_decoded_mean)

In [None]:
# # encode the input data into the latent space
# encoded_data = encoder.predict(data_x)

# # decode the latent-space data to restore it to the original space
# decoded_data = decoder.predict(encoded_data)

### Latent space visualization

In [None]:
## project the inputs into the latent space with the encoder model
encoded_data = encoder.predict(data_x)

## embed the latent vectors in 2-D with t-SNE
tsne = TSNE(n_components=2, random_state=710674)
encoded_data_tsne = tsne.fit_transform(encoded_data)

## scatter plot of the embedding, coloured by class label
fig, ax = plt.subplots(figsize=(8, 6))
points = ax.scatter(encoded_data_tsne[:, 0], encoded_data_tsne[:, 1], c=label, cmap='viridis')
fig.colorbar(points, ax=ax)
ax.set_xlabel("t-SNE component 1")
ax.set_ylabel("t-SNE component 2")
ax.set_title("t-SNE visualization of the latent space")
plt.show()

### Synthetic data generation

In [None]:
### sampling randomly from latent space
# n_samples = 212  # number of synthetic dataset to generate (generate size for gen_B)
n_samples = 357  # number of synthetic dataset to generate (generate size for gen_M)

# Draw latent vectors from a standard normal; no seed is set here, so every run differs
z_samples = np.random.normal(size=(n_samples, latent_dim))

### generate synthetic data through the decoder
# synthetic_data_B = decoder.predict(z_samples)
synthetic_data_M = decoder.predict(z_samples)

# print("Generated Synthetic Data:")
# print(synthetic_data)

In [None]:
# synthetic_data_B.shape
synthetic_data_M.shape

In [None]:
# gen_B = synthetic_data_B.copy()
# gen_B = pd.DataFrame(gen_B, columns=data_x.columns)

#####################
gen_M = synthetic_data_M.copy()
gen_M = pd.DataFrame(gen_M, columns=data_x.columns)

In [None]:
gen_M

In [None]:
min(gen_M.shape[0], data_x.shape[0])

In [None]:
# Compare the marginal distribution of one feature in real vs. synthetic data.
# jensenshannon() expects two probability vectors over the same support, so
# the raw samples are first binned on a shared grid; passing the samples
# directly would treat row i of each frame as "the probability of outcome i".
n_compare = min(gen_M.shape[0], data_x.shape[0])
data_x_check = data_x.sample(n=n_compare)
gen_check = gen_M.sample(n=n_compare)

feature = 'smoothness_mean'
real_values = data_x_check[feature].to_numpy()
synth_values = gen_check[feature].to_numpy()
bin_edges = np.histogram_bin_edges(np.concatenate([real_values, synth_values]), bins=30)
real_hist = np.histogram(real_values, bins=bin_edges)[0]
synth_hist = np.histogram(synth_values, bins=bin_edges)[0]

# jensenshannon() normalises the histograms itself and returns the JS
# *distance* (square root of the divergence); square it to report the divergence.
js_divergence = jensenshannon(synth_hist, real_hist) ** 2
print(f'Jensen-Shannon Divergence: {js_divergence}')

### Overall VAE data synthesizing model

In [None]:
# data_syn = data_B.copy()
# data_syn = data_M.copy()

----------

-----------

## Latent Diffusion Model

### encoder, decoder networks

In [None]:
data = data_ori.copy()

In [None]:
data_B = data[data['diagnosis']=="B"]
data_M = data[data['diagnosis']=="M"]

In [None]:
# data_diffusion = data_ori.copy()
# data_diffusion = data_B.copy()
data_diffusion = data_M.copy()

In [None]:
data_diffusion.shape

In [None]:
# y holds the diagnosis labels and x holds the feature columns
# Named `drop_cols` rather than `list` so the built-in `list` type is not
# shadowed for the rest of the notebook session.
drop_cols = ['Unnamed: 32', 'id', 'diagnosis']
x = data_diffusion.drop(drop_cols, axis=1)
y = data_diffusion.diagnosis  # M or B

In [None]:
data_x = x.copy()

In [None]:
# Rescale every feature column to the [0, 1] range, then keep 5 decimal places
scaler = MinMaxScaler()
scaled_values = scaler.fit_transform(data_x)
data_x = pd.DataFrame(scaled_values, index=data_x.index, columns=data_x.columns).round(5)

In [None]:
### breast cancer wisconsin ####### BINARY CLASSIFICATION ##########
# Encode the diagnosis in a single pass (B -> 0, M -> 1). Chained replace()
# calls rely on pandas' implicit downcasting, which is deprecated; map()
# yields a clean integer Series.
label = y.map({'B': 0, 'M': 1})
if label.isna().any():
    # Fail early instead of handing unexpected labels to to_categorical()
    raise ValueError("diagnosis column contains values other than 'B' and 'M'")
label = label.astype(int)

data_y = to_categorical(label, 2) ## into the format of one-hot encoding

In [None]:
print("The size of x dataset is:", data_x.shape)
print("The size of y dataset is:", data_y.shape)

In [None]:
# Sampling function for the latent space
def sampling(repara):
    """Reparameterisation step: return mean + sigma * eps with eps ~ N(0, I).

    `repara` is the pair (z_mean, z_log_var); sigma is exp(0.5 * z_log_var).
    """
    mean, log_var = repara
    mean_shape = tf.shape(mean)
    noise = K.random_normal(shape=(mean_shape[0], mean_shape[1]))
    std_dev = tf.exp(0.5 * log_var)
    return mean + std_dev * noise

# Define encoder
def build_encoder(input_shape, latent_dim):
    """Build the encoder model.

    A Conv1D layer, a flatten step and a 128-unit dense layer feed two linear
    heads of width `latent_dim`; the model returns [z_mean, z_log_var].
    """
    encoder_in = Input(shape=input_shape)
    conv_out = Conv1D(32, 3, activation='relu', padding='same')(encoder_in)
    flat = Flatten()(conv_out)
    hidden = Dense(128, activation='relu')(flat)
    mean_head = Dense(latent_dim)(hidden)
    log_var_head = Dense(latent_dim)(hidden)
    return Model(encoder_in, [mean_head, log_var_head], name='encoder')

# Define decoder
def build_decoder(output_shape, latent_dim):
    """Build the decoder model.

    Maps a latent vector of width `latent_dim` through two dense layers,
    reshapes to `output_shape`, and applies a sigmoid Conv1D so the output
    has one channel with values in (0, 1).
    """
    latent_inputs = Input(shape=(latent_dim,))
    x = Dense(128, activation='relu')(latent_inputs)
    # np.prod returns a NumPy integer; cast to a built-in int because recent
    # Keras releases validate Dense `units` as a Python integer.
    flat_units = int(np.prod(output_shape))
    x = Dense(flat_units, activation='relu')(x)
    x = Reshape(output_shape)(x)
    outputs = Conv1D(1, 3, activation='sigmoid', padding='same')(x)
    return Model(latent_inputs, outputs, name='decoder')

In [None]:
input_shape = (data_x.shape[1], 1)  # Example input shape
latent_dim = 16

In [None]:
input_shape

In [None]:
## Define the VAE
# Build the two sub-models and wire them into one end-to-end graph.
encoder = build_encoder(input_shape, latent_dim)
decoder = build_decoder(input_shape, latent_dim)

inputs = Input(shape=input_shape)
z_mean, z_log_var = encoder(inputs)
# Reparameterised draw of z from the (z_mean, z_log_var) pair
z = Lambda(sampling, output_shape=(latent_dim,))([z_mean, z_log_var])
outputs = decoder(z)

vae = Model(inputs, outputs, name='vae')

# Define the VAE loss
# Reconstruction term: mean squared error between the input and its reconstruction.
reconstruction_loss = MeanSquaredError()(inputs, outputs)
# KL term; reduce_mean without an axis averages over every element
# (batch and latent dimensions alike).
kl_loss = -0.5 * tf.reduce_mean(
    z_log_var - tf.square(z_mean) - tf.exp(z_log_var) + 1)
vae_loss = reconstruction_loss + kl_loss
# The loss is attached to the model itself, so compile() is given no `loss` argument.
vae.add_loss(vae_loss)
vae.compile(optimizer='adam')

# vae.summary()

In [None]:
# encoder.summary()

In [None]:
# decoder.summary()

### model training

In [None]:
# Train the VAE
vae.fit(data_x, epochs=args.epochs, validation_split=0.2, batch_size=args.bs, verbose=2)

### extracting encoder, decoder

In [None]:
# # Extract the encoder and decoder models
# encoder_model = encoder
# decoder_model = decoder

In [None]:
# # Example of using the encoder to get the latent representation
# example_data = np.random.rand(1, 356, 1)  # Example single data point
# z_mean, z_log_var = encoder_model.predict(example_data)
# z_sampled = sampling([z_mean, z_log_var])

# print("Latent representation (z_mean):", z_mean)
# print("Latent representation (z_log_var):", z_log_var)
# print("Sampled latent vector:", z_sampled)

# # Example of using the decoder to generate data from the latent space
# synthetic_data = decoder_model.predict(z_sampled)

# print("Generated synthetic data:", synthetic_data)

### Synthetic data generation

In [None]:
## for synthetic data generation from latent diffusion's latent space
def latent_diffusion_data_generation(decoder, latent_dim, num_samples, seed=None):
    """Generate synthetic samples by decoding standard-normal latent vectors.

    Args:
        decoder: object exposing ``predict(array)``.
        latent_dim: width of each latent vector.
        num_samples: number of latent vectors (rows) to draw.
        seed: optional seed for a private NumPy generator, making the draw
            reproducible. When None (the default) the draw uses NumPy's
            global random state, exactly as before.

    Returns:
        Whatever ``decoder.predict`` returns for the
        ``(num_samples, latent_dim)`` array of latent vectors.
    """
    shape = (num_samples, latent_dim)
    if seed is None:
        # sample random vectors from a standard normal distribution
        random_vectors = np.random.normal(size=shape)
    else:
        random_vectors = np.random.default_rng(seed).normal(size=shape)
    # generating synthetic dataset with decoder
    return decoder.predict(random_vectors)

In [None]:
### sampling randomly from latent space
b_samples = 212  # number of synthetic dataset to generate (generate size for gen_B)
m_samples = 357  # number of synthetic dataset to generate (generate size for gen_M)

### generating synthetic dataset for each class
# synthetic_data_B = latent_diffusion_data_generation(decoder, latent_dim, b_samples)
synthetic_data_M = latent_diffusion_data_generation(decoder, latent_dim, m_samples)

In [None]:
## Reshape synthetic data for Sequential model input
# Ensure this matches the input shape of the Sequential model
# synthetic_data_B = synthetic_data_B.reshape((synthetic_data_B.shape[0]), (synthetic_data_B.shape[1]))
synthetic_data_M = synthetic_data_M.reshape((synthetic_data_M.shape[0]), (synthetic_data_M.shape[1]))

In [None]:
# synthetic_data_B only exists when the commented-out benign branch is enabled;
# preview the array this section actually generates.
synthetic_data_M

In [None]:
# gen_B = scaler.inverse_transform(synthetic_data_B)
# gen_B = pd.DataFrame(gen_B, columns=data_x.columns)

#####################
# The decoder ends in a sigmoid, so its samples live in the MinMax-scaled
# [0, 1] space the model was trained on. Map them back to the original feature
# units with the scaler fitted for this section; otherwise they are later
# concatenated with the unscaled original training rows.
gen_M = scaler.inverse_transform(synthetic_data_M)
gen_M = pd.DataFrame(gen_M, columns=data_x.columns)

---------

-----------

## DNN classification/prediction

### using original dataset only

In [None]:
x = data_x
y = data_y

In [None]:
# Split the x / y prepared for this section. ori_x and y_ori are only created
# further down in the notebook, so referencing them here fails on a
# top-to-bottom run.
x_trainset, x_test, y_trainset, y_test = train_test_split(x, y, test_size = 0.4, random_state = 710674)

In [None]:
x_train, x_vali, y_train, y_vali = train_test_split(x_trainset, y_trainset, test_size = 0.2, random_state = 710674)

### using VAE generated dataset only

In [None]:
# x = decoded_df
# y = y

### using original + generated dataset

> we generate a synthesized dataset using the VAE from above. \
> adopted original dataset: data_B, data_M \
> synthesized into: gen_B, gen_M \
> gen_control is not generated since "control group" has biggest number.

* Set target information into newly generated dataset

In [None]:
# Tag each generated frame with the class it was synthesized for.
# NOTE(review): every assignment to gen_B visible in this notebook is commented
# out, so this cell presumably needs a prior run with the benign branch
# enabled — confirm.
gen_B['diagnosis'] = "B"
gen_M['diagnosis'] = "M"

* Preparing original dataset

In [None]:
# errors='ignore' keeps the cell re-runnable: on a second execution the columns
# are already gone and a plain drop() would raise KeyError.
data_B = data_B.drop(['Unnamed: 32','id'], axis=1, errors='ignore')
data_M = data_M.drop(['Unnamed: 32','id'], axis=1, errors='ignore')

* Concat the original datasets into one original dataframe

In [None]:
ori_df_list = [data_B, data_M]
ori_df_concat = pd.concat(ori_df_list, ignore_index=True)

In [None]:
ori_df_concat.shape

In [None]:
ori_df_concat['diagnosis'].value_counts()

* Concat the generated datasets into one gen dataframe

In [None]:
gen_df_list = [gen_B, gen_M]
gen_df_concat = pd.concat(gen_df_list, ignore_index=True)

In [None]:
gen_df_concat['diagnosis'].value_counts()

* Preparing x and y data vectors

In [None]:
### Split the original and the generated frames into features and diagnosis labels
ori_y = ori_df_concat[["diagnosis"]]
ori_x = ori_df_concat.drop(columns=["diagnosis"])

gen_y = gen_df_concat[["diagnosis"]]
gen_x = gen_df_concat.drop(columns=["diagnosis"])

In [None]:
ori_x = ori_x.fillna(ori_x.mean())

In [None]:
# ori_y is a one-column DataFrame; encode its diagnosis column in a single
# pass (B -> 0, M -> 1) instead of chained replace() calls, which rely on
# pandas' deprecated implicit downcasting.
label = ori_y['diagnosis'].map({'B': 0, 'M': 1})
if label.isna().any():
    # Fail early instead of handing unexpected labels to to_categorical()
    raise ValueError("diagnosis column contains values other than 'B' and 'M'")
label = label.astype(int)

y_ori = to_categorical(label, 2) ## into the format of one-hot encoding

In [None]:
# gen_y is a one-column DataFrame; encode its diagnosis column in a single
# pass (B -> 0, M -> 1) instead of chained replace() calls, which rely on
# pandas' deprecated implicit downcasting.
label_ = gen_y['diagnosis'].map({'B': 0, 'M': 1})
if label_.isna().any():
    # Fail early instead of handing unexpected labels to to_categorical()
    raise ValueError("diagnosis column contains values other than 'B' and 'M'")
label_ = label_.astype(int)

y_gen = to_categorical(label_, 2) ## into the format of one-hot encoding

* Separating the test dataset only from original dataframe

In [None]:
x_trainset, x_test, y_trainset, y_test = train_test_split(ori_x, y_ori, test_size = 0.4, random_state = 710674)

* Then concat the generated dataset with training dataset

In [None]:
x_train_concat = pd.concat([x_trainset, gen_x], ignore_index=True)
y_train_concat = np.concatenate([y_trainset, y_gen])

* Then separating the validation dataset from concat dataframe

In [None]:
x_train, x_vali, y_train, y_vali = train_test_split(x_train_concat, y_train_concat, test_size = 0.2, random_state = 710674)

In [None]:
# ori_x[ori_x.isna().any(axis=1)].shape

In [None]:
# # fig, ax = plt.subplots()
# plt.figure(figsize = (10, 5))
# plt.xlabel('Generated MDD BP feature variables',fontsize=10)
# plt.ylabel('Feature value',fontsize=10)
# plt.boxplot(ori_x)
# plt.show()

***

### model training + test

In [None]:
class Args:
    """Configuration values for the DNN classification section."""

    # reproducibility
    seed = 710674

    # optimisation
    epochs = 150
    bs = 32          # batch size
    lr = 0.001       # learning rate
    momentum = 0.9

    # model dimensions
    encoding_dim = 16
    num_classes = 2


args = Args()

In [None]:
inputs = np.concatenate((x_train, x_vali), axis = 0)
targets = np.concatenate((y_train, y_vali), axis = 0)

In [None]:
fold_num = 1
split_num = 5
opt = keras.optimizers.SGD(learning_rate = args.lr, decay = 1e-6, momentum = args.momentum)
# Seed the shuffle with the configured seed so the fold assignment (and hence
# the per-fold scores) can be reproduced between runs.
kfold = KFold(n_splits = split_num, shuffle = True, random_state = args.seed)

In [None]:
############## CLASS WEIGHTS FOR BINARY CLASSIFICATION (classes 0 / 1) ###############
# class_weight = {0:1, 1:1.68}
class_weight = {0:1, 1:1}

In [None]:
acc_per_fold = []
loss_per_fold = []

In [None]:
x_train.shape

In [None]:
# K-fold cross-validation: train a fresh classifier on each training split and
# record loss / accuracy on the held-out split.
for train, test in kfold.split(inputs, targets):
    model = Sequential()
    model.add(Dense(32, input_dim = x_train.shape[1], activation = 'relu'))
    model.add(Dense(64, activation = 'relu'))
    model.add(BatchNormalization())
    model.add(Dropout(0.5)) #drop out
    model.add(Dense(64, activation = 'relu'))
    model.add(Dense(32, activation = 'relu'))
    model.add(Dense(args.num_classes, activation = 'softmax'))

    ## model compile
    # Build a new optimizer for every fold: an optimizer instance keeps state
    # (momentum buffers, step count) tied to the variables of the first model
    # it trained, so sharing one across folds either leaks state between
    # folds or fails outright on newer Keras versions.
    fold_opt = keras.optimizers.SGD(learning_rate = args.lr, decay = 1e-6, momentum = args.momentum)
    model.compile(loss = 'categorical_crossentropy', optimizer = fold_opt, metrics = ['accuracy'])

    print('----------------------------------------')
    print(f'Training or fold {fold_num} ... ')

    ## fit data to model
    history = model.fit(inputs[train], targets[train], batch_size = args.bs, epochs = args.epochs, verbose = 0, class_weight = class_weight)

    ## generate generalization metrics
    scores = model.evaluate(inputs[test], targets[test])
    print(f'Score for fold {fold_num}: {model.metrics_names[0]} of {scores[0]}; {model.metrics_names[1]} of {scores[1]*100}%')
    print("%s: %.2f%%" %(model.metrics_names[1], scores[1]*100))
    acc_per_fold.append(scores[1] * 100)
    loss_per_fold.append(scores[0])

    ## increasing fold number
    fold_num = fold_num + 1



## Summarizing the results
print('------------------------------------------------------------------------')
print('Score per fold')
for fold_idx, (fold_loss, fold_acc) in enumerate(zip(loss_per_fold, acc_per_fold), start=1):
    print('------------------------------------------------------------------------')
    print(f'>> Fold {fold_idx} - Loss: {fold_loss} - Accuracy: {fold_acc}%')
print('------------------------------------------------------------------------')
print('Average scores for all folds:')
print(f'>>> Accuracy: {np.mean(acc_per_fold)} (+- {np.std(acc_per_fold)})')
print(f'>>> Loss: {np.mean(loss_per_fold)}')
print('------------------------------------------------------------------------')

In [None]:
# Predict on the held-out test set with the model left over from the last fold
y_predict = model.predict(x_test)
y_predict = np.argmax(y_predict, axis = 1)
# Collapse the one-hot test labels only once: after the first run y_test is
# already 1-D, and argmax(axis=1) on it would raise when the cell is re-run.
if np.ndim(y_test) > 1:
    y_test = np.argmax(y_test, axis = 1)

# normalize='pred' scales each predicted-label column to sum to 1
result = confusion_matrix(y_test, y_predict, normalize = 'pred')
print(result)

In [None]:
figure = plt.figure(figsize=(6, 4))
sns.heatmap(result, annot=True,cmap=plt.cm.Blues)
plt.tight_layout()
plt.ylabel('True label')
plt.xlabel('Predicted label')
plt.show()

In [None]:
# Summary scores on the held-out test set.
# NOTE(review): precision, recall and f1 each use a different averaging mode
# (macro / micro / weighted), so they are not directly comparable, and
# micro-averaged recall coincides with accuracy for single-label data per the
# scikit-learn documentation — confirm this mix is intended.
accuracy = metrics.accuracy_score(y_test, y_predict)
precision = metrics.precision_score(y_test, y_predict, average = 'macro')
recall = metrics.recall_score(y_test, y_predict, average = 'micro')
f1 = metrics.f1_score(y_test, y_predict, average = 'weighted')

print("=============================================")
print("The overall accuracy is:", round(accuracy, 4))
print("The precision score is:", round(precision, 4))
print("The recall score is:", round(recall, 4))
print("The f1 score is:", round(f1, 4))
print("=============================================")

***

***

## Using VAE/AE as a dimensionality reduction

In [None]:
x = data_x
y = y

In [None]:
class Args:
    """Configuration values for the autoencoder dimensionality-reduction section."""

    # reproducibility
    seed = 710674
    verbose = 'store_true'

    # classifier training
    epochs = 200
    bs = 32          # batch size
    lr = 0.001       # learning rate
    momentum = 0.9
    num_classes = 2

    # autoencoder training
    enc_epochs = 50
    enc_bs = 16
    encoding_dim = 16


args = Args()

### Autoencoder for dim reduction

In [None]:
# input dataset layer
input_layer = Input(shape=(x.shape[1],))

# encoder layers
encoder = Dense(64, activation='relu')(input_layer)
encoder = Dense(32, activation='relu')(encoder)
encoder_out = Dense(args.encoding_dim, activation='relu')(encoder)

# decoder layers
decoder = Dense(32, activation='relu')(encoder_out)
decoder = Dense(64, activation='relu')(decoder)
decoder_out = Dense(x.shape[1], activation='sigmoid')(decoder)

In [None]:
# AE model
autoencoder = Model(inputs=input_layer, outputs=decoder_out)

# Encoder model (convert input dataset into latent space)
encoder_model = Model(inputs=input_layer, outputs=encoder_out)

# Decoder model (recover latent space/vector into original dataset format)
# Re-apply the autoencoder's last three layers to a standalone latent input so
# the decoder shares its weights with the autoencoder.
encoded_input = Input(shape=(args.encoding_dim,))
decoder_layer = autoencoder.layers[-3](encoded_input)
decoder_layer = autoencoder.layers[-2](decoder_layer)
decoder_out = autoencoder.layers[-1](decoder_layer)
# The model must end at the final sigmoid layer (decoder_out). Ending at
# decoder_layer returned the 64-unit hidden activations instead of data in the
# original feature format.
decoder_model = Model(inputs=encoded_input, outputs=decoder_out)

In [None]:
# Model compile
autoencoder.compile(optimizer='adam', loss='mse')

# Checking each model
autoencoder.summary()
# encoder_model.summary()
# decoder_model.summary()

In [None]:
## Model training
autoencoder.fit(x, x, epochs = args.enc_epochs, batch_size = args.enc_bs, shuffle=True, validation_split=0.2, verbose=2)

In [None]:
encoded_data = encoder_model.predict(x)
decoded_data = decoder_model.predict(encoded_data)

In [None]:
encoded_data.shape

In [None]:
decoded_data.shape

In [None]:
y.shape

In [None]:
decoded_data

In [None]:
class Args:
    """Configuration values for the classifier trained on autoencoder output."""

    # reproducibility
    seed = 710674
    verbose = 'store_true'

    # optimisation
    epochs = 200
    bs = 32          # batch size
    lr = 0.001       # learning rate
    momentum = 0.9

    # model dimensions
    num_classes = 2


args = Args()

# np.random.seed(args.seed)
# random.seed(args.seed)
# torch.manual_seed(args.seed)

In [None]:
x_trainset, x_test, y_trainset, y_test = train_test_split(decoded_data, y, test_size = 0.1, random_state = 710674)
x_train, x_vali, y_train, y_vali = train_test_split(x_trainset, y_trainset, test_size = 0.2, random_state = 710674)

In [None]:
inputs = np.concatenate((x_train, x_vali), axis = 0)
targets = np.concatenate((y_train, y_vali), axis = 0)

In [None]:
fold_num = 1
split_num = 5
opt = keras.optimizers.SGD(learning_rate = args.lr, decay = 1e-6, momentum = args.momentum)
# Seed the shuffle with the configured seed so the fold assignment (and hence
# the per-fold scores) can be reproduced between runs.
kfold = KFold(n_splits = split_num, shuffle = True, random_state = args.seed)
# kfold = StratifiedKFold(n_splits = split_num, shuffle = True)

In [None]:
############## CLASS WEIGHTS FOR BINARY CLASSIFICATION (classes 0 / 1) ###############
class_weight = {0:1, 1: 1.68}

In [None]:
acc_per_fold = []
loss_per_fold = []

In [None]:
# K-fold cross-validation: train a fresh classifier on each training split and
# record loss / accuracy on the held-out split.
for train, test in kfold.split(inputs, targets):
    model = Sequential()
    model.add(Dense(128, input_dim = x_train.shape[1], activation = 'relu'))
    model.add(Dense(64, activation = 'relu'))
    model.add(Dropout(0.5)) #drop out
    model.add(Dense(32, activation = 'relu'))
    model.add(Dense(args.num_classes, activation = 'softmax'))

    ## model compile
    # Build a new optimizer for every fold: an optimizer instance keeps state
    # (momentum buffers, step count) tied to the variables of the first model
    # it trained, so sharing one across folds either leaks state between
    # folds or fails outright on newer Keras versions.
    fold_opt = keras.optimizers.SGD(learning_rate = args.lr, decay = 1e-6, momentum = args.momentum)
    model.compile(loss = 'categorical_crossentropy', optimizer = fold_opt, metrics = ['accuracy'])

    print('----------------------------------------')
    print(f'Training or fold {fold_num} ... ')

    ## fit data to model
    history = model.fit(inputs[train], targets[train], batch_size = args.bs, epochs = args.epochs, verbose = 0, class_weight = class_weight)

    ## generate generalization metrics
    scores = model.evaluate(inputs[test], targets[test])
    print(f'Score for fold {fold_num}: {model.metrics_names[0]} of {scores[0]}; {model.metrics_names[1]} of {scores[1]*100}%')
    print("%s: %.2f%%" %(model.metrics_names[1], scores[1]*100))
    acc_per_fold.append(scores[1] * 100)
    loss_per_fold.append(scores[0])

    ## increasing fold number
    fold_num = fold_num + 1



## Summarizing the results
print('------------------------------------------------------------------------')
print('Score per fold')
for fold_idx, (fold_loss, fold_acc) in enumerate(zip(loss_per_fold, acc_per_fold), start=1):
    print('------------------------------------------------------------------------')
    print(f'>> Fold {fold_idx} - Loss: {fold_loss} - Accuracy: {fold_acc}%')
print('------------------------------------------------------------------------')
print('Average scores for all folds:')
print(f'>>> Accuracy: {np.mean(acc_per_fold)} (+- {np.std(acc_per_fold)})')
print(f'>>> Loss: {np.mean(loss_per_fold)}')
print('------------------------------------------------------------------------')

In [None]:
# Predict on the held-out test set with the model left over from the last fold
y_predict = model.predict(x_test)
y_predict = np.argmax(y_predict, axis = 1)
# Collapse the one-hot test labels only once: after the first run y_test is
# already 1-D, and argmax(axis=1) on it would raise when the cell is re-run.
if np.ndim(y_test) > 1:
    y_test = np.argmax(y_test, axis = 1)

# normalize='pred' scales each predicted-label column to sum to 1
result = confusion_matrix(y_test, y_predict, normalize = 'pred')
print(result)

In [None]:
figure = plt.figure(figsize=(6, 4))
sns.heatmap(result, annot=True,cmap=plt.cm.Blues)
plt.tight_layout()
plt.ylabel('True label')
plt.xlabel('Predicted label')
plt.show()

In [None]:
# Summary scores on the held-out test set.
# NOTE(review): precision, recall and f1 each use a different averaging mode
# (macro / micro / weighted) — confirm this mix is intended.
accuracy = metrics.accuracy_score(y_test, y_predict)
precision = metrics.precision_score(y_test, y_predict, average = 'macro')
recall = metrics.recall_score(y_test, y_predict, average = 'micro')
f1 = metrics.f1_score(y_test, y_predict, average = 'weighted')
# y_test holds binary class indices, so roc_auc_score needs a 1-D score for
# the positive class (column 1 of the softmax output). Passing the full
# two-column probability matrix for a binary target raises a ValueError.
positive_scores = model.predict(x_test, verbose=0)[:, 1]
auc = roc_auc_score(y_test, positive_scores)

print("=============================================")
print("The overall accuracy is:", round(accuracy, 4))
print("The precision score is:", round(precision, 4))
print("The recall score is:", round(recall, 4))
print("The f1 score is:", round(f1, 4))
print("The AUC score is:", round(auc, 4))
print("=============================================")