# Unsupervised learning for health records

* Ideation of methods for handling health records

In [None]:
import os
import numpy as np
import pandas as pd
import tensorflow as tf

In [None]:
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Dense, Layer
from tensorflow.keras.utils import to_categorical

from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler, LabelEncoder
from sklearn.mixture import GaussianMixture
from sklearn.metrics import adjusted_rand_score, silhouette_score

## Dataset check

In [None]:
# Load the raw records from the CSV file and report the table dimensions.
csv_path = 'C:/Users/user/Downloads/GNN_test.csv'
data_ori = pd.read_csv(csv_path)
print("The shape of the original dataset is:", data_ori.shape)

In [None]:
# Display the column labels of the loaded dataset.
data_ori.columns

In [None]:
# Convert empty or whitespace-only cells to missing values (pd.NA) so that
# they are picked up by the null checks and dropna() below.
data_ori = data_ori.replace(r'^\s*$', pd.NA, regex=True)

In [None]:
# Report how many missing values each column contains.
missing_per_column = data_ori.isna().sum()
print(missing_per_column)

In [None]:
# Discard every row that has at least one missing value.
data_ori = data_ori.dropna(axis=0, how='any')

In [None]:
# Display the (rows, columns) shape after rows with missing values were dropped.
data_ori.shape

In [None]:
# Preview the first rows of the cleaned dataset.
data_ori.head()

In [None]:
# Count how many rows fall under each distinct value of the 'Group' column.
data_ori['Group'].value_counts()

In [None]:
# Work on a copy so data_ori stays untouched, and remove the two columns that
# are not used in the unsupervised experiments.
data_unsup = data_ori.copy().drop(columns=['E_No', 'HAMD_total_V1'])
# Shift the group codes down by one.
data_unsup['Group'] = data_unsup['Group'].sub(1)
# Alternative kept for reference: drop the label as well.
# data_unsup = data_unsup.drop(['E_No', 'Group'], axis=1)

In [None]:
# Preview the first rows of the table prepared for the unsupervised methods.
data_unsup.head()

In [None]:
# Separate the 'Group' column (kept as a one-column DataFrame) from the
# remaining columns.
data_y = data_unsup[['Group']]
data_x = data_unsup.drop(columns=['Group'])

In [None]:
# x = data_x.copy()

# scaler = MinMaxScaler() #set the scaler (between 0 and 1)
# # scaler = RobustScaler()
# x[:] = scaler.fit_transform(x[:])
# x = x.round(decimals=6)

In [None]:
# y = data_y.copy()
# # y = to_categorical(y, 3)

In [None]:
# Split off the group variable.
group_column = 'Group'
# Encode the Group values as integer class ids; used as reference labels
# when scoring the clusterings.
label_encoder = LabelEncoder()
true_labels = label_encoder.fit_transform(data_unsup[group_column])
# Everything except the group column is treated as feature data.
features = data_unsup.drop(group_column, axis=1)

## Unsupervised approaches

### K-means

In [None]:
# Limit OpenMP to a single thread before using KMeans.
# NOTE(review): numpy, tensorflow and sklearn are already imported earlier in
# this notebook, so this assignment may come too late to affect their thread
# pools — consider moving it above those imports; confirm.
os.environ["OMP_NUM_THREADS"] = '1'
from sklearn.cluster import KMeans

In [None]:
# Apply K-means clustering to the feature columns only (the 'Group' label is
# excluded). `features` is used because no variable named `x` is defined: the
# cell that would have created it is commented out.
# NOTE(review): `features` is unscaled; the commented-out MinMaxScaler cell
# suggests scaling may have been intended — confirm.
kmeans = KMeans(n_clusters=3, random_state=710674)
kmeans_labels = kmeans.fit_predict(features)

In [None]:
# Evaluate the K-means clustering.
# ARI compares the cluster assignment with the known group labels.
ari_kmeans = adjusted_rand_score(true_labels, kmeans_labels)
# The silhouette is computed on the feature columns only: data_unsup still
# contains the 'Group' label, which must not take part in the distances.
silhouette_kmeans = silhouette_score(features, kmeans_labels)

print(f"K-means ARI: {ari_kmeans}")
print(f"K-means Silhouette Score: {silhouette_kmeans}")

### Gaussian Mixture (GMM)

In [None]:
# Fit a 3-component Gaussian Mixture Model on the feature columns only.
# data_unsup still contains the 'Group' column, so fitting on it would leak
# the very label the clustering is compared against below.
gmm = GaussianMixture(n_components=3, random_state=710674)
gmm_labels = gmm.fit_predict(features)

# Evaluate the clustering: ARI against the known groups, silhouette on the
# same feature matrix the model was fitted on.
ari_gmm = adjusted_rand_score(true_labels, gmm_labels)
silhouette_gmm = silhouette_score(features, gmm_labels)

print(f"GMM ARI: {ari_gmm}")
print(f"GMM Silhouette Score: {silhouette_gmm}")

### Autoencoder

In [None]:
# Prepare the autoencoder input. No variable named `data` is defined earlier
# in the notebook, so build it here from the feature columns (label excluded).
# Every feature is scaled to [0, 1] because the decoder ends in a sigmoid and
# therefore can only reconstruct values inside that range.
data = MinMaxScaler().fit_transform(features).astype('float32')

# Build the autoencoder model.
input_dim = data.shape[1]
encoding_dim = 3  # size of the low-dimensional bottleneck

input_layer = Input(shape=(input_dim,))
encoded = Dense(encoding_dim, activation='relu')(input_layer)
decoded = Dense(input_dim, activation='sigmoid')(encoded)

autoencoder = Model(input_layer, decoded)
autoencoder.compile(optimizer='adam', loss='mse')

# Train the autoencoder to reconstruct its own input.
autoencoder.fit(data, data, epochs=50, batch_size=10, shuffle=True, verbose=2)

# Project the data into the reduced space using the encoder half only.
encoder = Model(input_layer, encoded)
encoded_data = encoder.predict(data)

# Cluster the encoded representation with K-means.
kmeans_encoded = KMeans(n_clusters=3, random_state=42)
kmeans_encoded_labels = kmeans_encoded.fit_predict(encoded_data)

# Evaluate the clustering: ARI against the known groups, silhouette in the
# encoded space the clusters were found in.
ari_autoencoder = adjusted_rand_score(true_labels, kmeans_encoded_labels)
silhouette_autoencoder = silhouette_score(encoded_data, kmeans_encoded_labels)

print(f"Autoencoder K-means ARI: {ari_autoencoder}")
print(f"Autoencoder K-means Silhouette Score: {silhouette_autoencoder}")