# Data Augmentation Box

Project for Data Augmentation System

## Data Augmentation Order

STEP 1 - Domain Data Preparation
1. Domain data labeling check
2. Dimensionality Reduction
3. Regression analysis


STEP 2 - Data Augmentation
01. Domain data check
02. Public Data Supplement
03. Data filtering (1st)
04. Dimensionality Reduction
05. Label Spreading (semi-supervised learning based)
06. Data Filtering (2nd)
07. Regression analysis
08. Data Filtering (3rd)
09. Data Augmentation
10. Model Generation

- - -

In [None]:
import os
import time
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import sklearn.metrics as metrics

In [None]:
from sklearn.model_selection import train_test_split, ParameterGrid
from sklearn.preprocessing import MinMaxScaler
from sklearn import decomposition
from sklearn.manifold import TSNE
from sklearn.cluster import KMeans
from sklearn.linear_model import LogisticRegression
from sklearn.semi_supervised import LabelPropagation, LabelSpreading
from sklearn.metrics import roc_auc_score, accuracy_score
from sklearn.feature_extraction.text import CountVectorizer
from tensorflow.keras.utils import to_categorical
from tensorflow import keras

scaler = MinMaxScaler()  # one shared MinMaxScaler instance; later cells call fit_transform on it for each frame

In [None]:
from scipy import stats
from scipy.stats import ttest_ind
from sklearn.model_selection import train_test_split
from keras.models import Sequential
from keras.layers import Dense , Activation, Dropout, BatchNormalization
from keras.optimizers import Adam ,RMSprop
from keras import  backend as K
from keras.optimizers import SGD
# from tensorflow.keras import utils as np_utils
# from tensorflow.keras.metrics import binary_focal_crossentropy
from sklearn import decomposition, metrics
from sklearn.linear_model import LogisticRegression
from tensorflow.keras.utils import to_categorical
from tensorflow import keras
# from torch.utils.data import TensorDataset, DataLoader, Dataset
from sklearn.preprocessing import MinMaxScaler, RobustScaler, Normalizer

# from tqdm import tqdm
# from torch.autograd import Variable
# from torch.utils.data import Dataset, DataLoader
from sklearn.metrics import cohen_kappa_score,f1_score, confusion_matrix
from sklearn.model_selection import KFold, train_test_split
from keras.callbacks import Callback
# from pytorch_tabnet.tab_model import TabNetClassifier

## 01. Domain Data Check

* SMC dataset - depression research on 100 subjects

In [None]:
### Load the domain HRV table (comma-separated, which is the pandas default)
domain_csv_path = 'E:/RESEARCH/Datasets/HRV/HRV_REV_all.csv'
domain_ori = pd.read_csv(domain_csv_path)

In [None]:
### data shape, variables check
print("The shape of the domain dataset is:",domain_ori.shape)
# print(domain.columns)
domain_ori.head()

* HAMD 점수에 따라서 새롭게 IndexH 라고 라벨링용 변수 만들어주자

In [None]:
### Derive the IndexH label from the HAMD score
# HAMD <= 7 -> "normal", 7 < HAMD <= 16 -> "mild", HAMD > 16 -> "severe".
# Rows with a missing HAMD fall into no bin and keep NaN as their label.
hamd_edges = [-np.inf, 7, 16, np.inf]
hamd_names = ["normal", "mild", "severe"]
domain_ori['IndexH'] = pd.cut(domain_ori['HAMD'], bins=hamd_edges, labels=hamd_names).astype(object)
domain_y = domain_ori.loc[:, 'IndexH']

In [None]:
domain_ori['IndexH'].value_counts()

* 그리고 안쓸 변수들은 제거해주자. (HRV 관련 변수만 쓸 것임)

In [None]:
### Drop the columns that the following cells do not use
unused_columns = [
    'sub', 'age', 'gender', 'VISIT', 'disorder',
    'HAMD', 'HAMA', 'PDSS', 'ASI', 'APPQ',
    'PSWQ', 'SPI', 'PSS', 'BIS', 'SSI',
]
domain = domain_ori.drop(columns=unused_columns)

In [None]:
### check the domain data columns again
print(domain.columns)
print(domain.shape)

- - -

* Domain data variable selection for the right task
> baseline, stress, rest phase로 나눠진 데이터를 각각 쪼개주는 것.

In [None]:
# Select seven HRV feature columns per phase frame, without and with the IndexH label.
# NOTE(review): domain_b2 and domain_b3 (and their *_index variants) select exactly the
# same 'b1*' columns as domain_b1, so the three frames are identical copies. Presumably
# 'b2*' / 'b3*' columns were intended — confirm against the CSV header before relying on them.
domain_b1 = domain.loc[:, ['b1RMSSD', 'b1HR', 'b1PNN50', 'b1VLF', 'b1LF', 'b1HF', 'b1LF/HF']]
domain_b2 = domain.loc[:, ['b1RMSSD', 'b1HR', 'b1PNN50', 'b1VLF', 'b1LF', 'b1HF', 'b1LF/HF']]
domain_b3 = domain.loc[:, ['b1RMSSD', 'b1HR', 'b1PNN50', 'b1VLF', 'b1LF', 'b1HF', 'b1LF/HF']]
domain_b1_index = domain.loc[:, ['b1RMSSD', 'b1HR', 'b1PNN50', 'b1VLF', 'b1LF', 'b1HF', 'b1LF/HF', 'IndexH']]
domain_b2_index = domain.loc[:, ['b1RMSSD', 'b1HR', 'b1PNN50', 'b1VLF', 'b1LF', 'b1HF', 'b1LF/HF', 'IndexH']]
domain_b3_index = domain.loc[:, ['b1RMSSD', 'b1HR', 'b1PNN50', 'b1VLF', 'b1LF', 'b1HF', 'b1LF/HF', 'IndexH']]

In [None]:
# Give each baseline frame the phase-independent feature names
# (the same names the public dataset columns are selected by later).
for baseline_frame in (domain_b1, domain_b2, domain_b3):
    baseline_frame.columns = ['RMSSD', 'HR', 'PNN50', 'VLF', 'LF', 'HF', 'LF_HF']

* domain_s 는 stress phase에 있는 애들

In [None]:
# Stress-phase feature columns, first without and then with the IndexH label.
stress_columns = ['sRMSSD', 'sHR', 'sPNN50', 'sVLF', 'sLF', 'sHF', 'sLF/HF']
domain_s = domain.loc[:, stress_columns]
domain_s_index = domain.loc[:, stress_columns + ['IndexH']]

In [None]:
domain_s.columns = ['RMSSD', 'HR', 'PNN50', 'VLF', 'LF', 'HF', 'LF_HF']

In [None]:
### Min-max scaling: the shared scaler is refitted on each frame in turn,
### so every frame is rescaled against its own column minima and maxima.
for phase_frame in (domain_b1, domain_b2, domain_b3, domain_s):
    phase_frame[:] = scaler.fit_transform(phase_frame)

In [None]:
# domain_b1['Index'] = domain_b1_index['IndexH']
# domain_b2['Index'] = domain_b2_index['IndexH']
# domain_b3['Index'] = domain_b3_index['IndexH']
# domain_s['Index'] = domain_s_index['IndexH']

Later you can select the dataset that you want to analyze. 

ex) if you want to augment the stress phase dataset, choose domain_s

- - -

--------

## 02. Public Data Supplement

* Using public dataset (SWELL-HRV) for augmentation

In [None]:
### In our medical case, we adopt the HRV dataset from the SWELL HRV research.
### Public data must be used with care; it should only supplement the training data.
public_csv_path = 'E:/RESEARCH/Datasets/HRV/HRV_Public/SWELL_hrv/data/final/train.csv'
public = pd.read_csv(public_csv_path)

In [None]:
### data shape, variables check
print("The shape of the public SWELL dataset is:",public.shape)
# print(public.columns)
public.head()

- - -

* preprocess our data to fit into domain data
> 실제 사용하는 domain(삼성병원)데이터는 3phase를 가지지만 public에서는 baseline이랑 stress를 나눠본다

In [None]:
### Split the public rows by the value of their 'condition' column
condition = public['condition']
public_b = public[condition == 'no stress']
public_s1 = public[condition == 'interruption']
public_s2 = public[condition == 'time pressure']

* checking the number of baseline and stress phase dataset

In [None]:
### check the number of each phase dataset
print(public_b.shape)
print(public_s1.shape)
print(public_s2.shape)

In [None]:
### Keep only the feature columns shared with the domain data
common_features = ['RMSSD', 'HR', 'PNN50', 'VLF', 'LF', 'HF', 'LF_HF']
public = public.loc[:, common_features]
public_b = public_b.loc[:, common_features]
public_s1 = public_s1.loc[:, common_features]
public_s2 = public_s2.loc[:, common_features]

* 마찬가지로 scaler 적용해서 standardization 적용

In [None]:
### Min-max scale each public subset; the shared scaler is refitted per subset
for public_frame in (public_b, public_s1, public_s2):
    public_frame[:] = scaler.fit_transform(public_frame)

In [None]:
### Round every value to five decimal places
public_b, public_s1, public_s2 = (
    frame.round(5) for frame in (public_b, public_s1, public_s2)
)

In [None]:
# Separate frames that will additionally carry an IndexH column.
index_features = ['RMSSD', 'HR', 'PNN50', 'VLF', 'LF', 'HF', 'LF_HF']
public_b_index = public_b.loc[:, index_features]
public_s1_index = public_s1.loc[:, index_features]
public_s2_index = public_s2.loc[:, index_features]

In [None]:
## Fill IndexH with 0 on every public row as a placeholder for a later predicted value
for placeholder_frame in (public_b_index, public_s1_index, public_s2_index):
    placeholder_frame['IndexH'] = 0

- - -

## 03. Data Filtering

### 3-1) Data Merging

In [None]:
### First select the data phase (maybe not necessary for some dataset)
### Then, check the number of data in each domain and public dataset
### The frames used here are domain_s (stress phase) and public_s1 ('interruption' condition)

print("Shape of the domain dataset for the training is", domain_s.shape)
print("Shape of the public dataset for the training is", public_s1.shape)

In [None]:
### Select how many rows to take from each dataset.
# Both draws are seeded (same value used as random_state elsewhere in this notebook)
# so the merged training set is identical from run to run.
SAMPLE_SEED = 710674
domain_resized = domain_s.sample(frac=1, random_state=SAMPLE_SEED)  # every domain row, in shuffled order
public_resized = public_s1.sample(n=920, random_state=SAMPLE_SEED)  # 920 public rows; raises if fewer exist
print(domain_resized.shape)
print(public_resized.shape)

In [None]:
# public_resized.head()

In [None]:
# domain_resized.head()

* training이라는 이름으로 stress phase에서의 두 데이터를 합치자

In [None]:
training = pd.concat((domain_resized, public_resized))

In [None]:
### check the finalized first augmented dataset size/shape
print("Shape of the firstly augmented dataset for the training is", training.shape)

In [None]:
training.head()

## 04. Dimensionality Reduction

* 현재 domain이랑 public에서 사용되는 공용 변수는 7개.('RMSSD', 'HR', 'PNN50', 'VLF', 'LF', 'HF', 'LF_HF')
* 군집화하기 위해서 차원축소를 해도 각 데이터의 설명력이 떨어지지 않는 지 확인해보자.

* 먼저 Domain dataset

In [None]:
### 3-component PCA on the stress-phase domain features.
### The IndexH label is attached to the projected frame in a later cell.
dom_pca_3 = decomposition.PCA(n_components=3)
# fit_transform fits and projects in one pass, so no separate fit() call is needed
dom_pca_3_result = dom_pca_3.fit_transform(domain_s)
dom_pca_3_df = pd.DataFrame(dom_pca_3_result, columns = ['PCA0', 'PCA1', 'PCA2'])
dom_3 = dom_pca_3.explained_variance_ratio_.sum()*100 # explained ratio, in percent

### Check how much variance the reduced dimensions retain
print('Explained variation per principal component: {}'.format(dom_pca_3.explained_variance_ratio_))
print('Cumulative variance explained by 3 principal components: {:.2%}'.format(np.sum(dom_pca_3.explained_variance_ratio_)))

In [None]:
# dom_pca_3_df

In [None]:
# domain_s
# domain_s_index

In [None]:
## get the target info from domain_s dataset
dom_pca_3_df['target'] = domain_s_index['IndexH'] ## HAMD를 사용해서 새로 만든 IndexH 라벨에 따른 시각화 비교.
# dom_pca_3_df['target'] = domain_ori['disorder'] ## 만약에 HAMD기반해서 새로 만든 라벨 말고 오리지널 disorder를 쓴다면..?

In [None]:
domain_s_index

In [None]:
# Set up a 3D axes for the projected points
fig = plt.figure(figsize=(8,5))
ax = fig.add_subplot(111, projection='3d')
plt.title('PCA 3 result from Domain Dataset', fontsize=11)

## Attach the IndexH label of each row to the projection
dom_pca_3_df['target'] = domain_s_index['IndexH']

## One sub-frame per class
class_of_row = dom_pca_3_df['target']
dom_pca_0 = dom_pca_3_df[class_of_row == 'normal']
dom_pca_1 = dom_pca_3_df[class_of_row == 'mild']
dom_pca_2 = dom_pca_3_df[class_of_row == 'severe']

## Draw the classes in the order normal, mild, severe
for class_points, class_colour in ((dom_pca_0, 'orange'), (dom_pca_1, 'red'), (dom_pca_2, 'green')):
    ax.scatter(class_points['PCA0'], class_points['PCA1'], class_points['PCA2'],
               color = class_colour, alpha = 0.7)

* What if we reduce to 2 dimensions instead?

In [None]:
### 2-component PCA on the stress-phase domain features.
### The IndexH label is attached to the projected frame in the next cell.
dom_pca_2 = decomposition.PCA(n_components=2)
# fit_transform fits and projects in one pass, so no separate fit() call is needed
dom_pca_2_result = dom_pca_2.fit_transform(domain_s)
dom_pca_2_df = pd.DataFrame(dom_pca_2_result, columns = ['PCA0', 'PCA1'])
dom_2 = dom_pca_2.explained_variance_ratio_.sum()*100 # explained ratio, in percent

### Check how much variance the reduced dimensions retain
print('Explained variation per principal component: {}'.format(dom_pca_2.explained_variance_ratio_))
print('Cumulative variance explained by 2 principal components: {:.2%}'.format(np.sum(dom_pca_2.explained_variance_ratio_)))

In [None]:
# Attach the IndexH class label to the 2-component projection
dom_pca_2_df['target'] = domain_s_index['IndexH']

## Split by class using this frame's own target column
## (the mask was previously taken from the 3-component frame).
target_2d = dom_pca_2_df['target']
dom_pca_0 = dom_pca_2_df[target_2d == 'normal']
dom_pca_1 = dom_pca_2_df[target_2d == 'mild']
# NOTE: as before, this rebinds dom_pca_2 from the PCA estimator to the 'severe' sub-frame
dom_pca_2 = dom_pca_2_df[target_2d == 'severe']

# One scatter per class; the labels give plt.legend() something to show
plt.scatter(dom_pca_0['PCA0'], dom_pca_0['PCA1'], color = 'orange', alpha = 0.7, label = 'normal')
plt.scatter(dom_pca_1['PCA0'], dom_pca_1['PCA1'], color = 'red', alpha = 0.7, label = 'mild')
plt.scatter(dom_pca_2['PCA0'], dom_pca_2['PCA1'], color = 'green', alpha = 0.7, label = 'severe')

plt.xlabel('component 0')
plt.ylabel('component 1')
plt.legend()
plt.show()

In [None]:
### 3-component PCA on the shuffled domain frame (domain_resized)
dom_pca_3 = decomposition.PCA(n_components=3)
dom_pca_3_result = dom_pca_3.fit_transform(domain_resized)
dom_ratio_3 = dom_pca_3.explained_variance_ratio_
dom_3 = dom_ratio_3.sum()*100 # explained ratio, in percent

### Report how much variance the three components retain
print('Explained variation per principal component: {}'.format(dom_ratio_3))
print('Cumulative variance explained by 3 principal components: {:.2%}'.format(np.sum(dom_ratio_3)))

In [None]:
# dom_pca_3_result
dom_pca_3_result.shape ##reduced dimension

In [None]:
# domain_resized.sort_index()

In [None]:
RDATA_reduced = pd.DataFrame(dom_pca_3_result)

* 그림으로 그려서 확인

In [None]:
dom_result3 = pd.DataFrame(dom_pca_3.transform(domain_resized), columns = ['PCA%i' % i for i in range(3)], index = domain_resized.index)
# dom_result3

In [None]:
# Plot initialisation
fig = plt.figure(figsize=(8,5))
ax = fig.add_subplot(111, projection='3d')
plt.title('PCA 3 result from Domain Dataset', fontsize=11, fontweight='bold')
ax.scatter(dom_result3['PCA0'], dom_result3['PCA1'], dom_result3['PCA2'])
# plt.savefig('pca_result.png')

* Silhouette score를 이용한 분석으로 몇개의 cluster로 나누는 것이 합리적인지 보자

In [None]:
# Candidate numbers of clusters
parameters = [2, 3, 4, 5, 6]

# One grid entry per candidate, each of the form {'n_clusters': k}
parameter_grid = ParameterGrid({'n_clusters': parameters})
best_score = -1
best_grid = None
# Fixed random_state and explicit n_init so the scores do not change between runs
kmeans_model = KMeans(n_init=10, random_state=710674)
silhouette_scores = []

# Score every candidate with the silhouette coefficient
for p in parameter_grid:
    kmeans_model.set_params(**p)        # apply the current n_clusters
    kmeans_model.fit(domain_resized)    # cluster the domain rows
    ss = metrics.silhouette_score(domain_resized, kmeans_model.labels_)
    silhouette_scores.append(ss)
    print('Parameter:', p, 'Score', ss)
    # remember the best-scoring candidate
    if ss > best_score:
        best_score = ss
        best_grid = p

# The winner was tracked but never reported before
print('Best parameter:', best_grid, 'Score', best_score)

# Bar chart of the silhouette score per candidate
plt.bar(range(len(silhouette_scores)), silhouette_scores, align='center', color='#849ef7', width=0.5)
plt.xticks(range(len(silhouette_scores)), parameters)
plt.title('Domain Dataset silhouette score')
plt.xlabel('Number of Clusters')
plt.show()

* 다음으로 Public dataset

In [None]:
# public_resized

In [None]:
### 3-component PCA on the sampled public frame (public_resized)
pub_pca_3 = decomposition.PCA(n_components=3)
pub_pca_3_result = pub_pca_3.fit_transform(public_resized)
pub_3 = pub_pca_3.explained_variance_ratio_.sum()*100 # explained ratio, in percent

### Report how much variance the three components retain
print('Explained variation per principal component: {}'.format(pub_pca_3.explained_variance_ratio_))
# The message now states 3 components, matching n_components above
print('Cumulative variance explained by 3 principal components: {:.2%}'.format(np.sum(pub_pca_3.explained_variance_ratio_)))

* 마찬가지로 그림으로 그려서 확인

In [None]:
pub_result3 = pd.DataFrame(pub_pca_3.transform(public_resized), columns = ['PCA%i' % i for i in range(3)], index = public_resized.index)

In [None]:
# Plot initialisation
fig = plt.figure(figsize=(8,5))
ax = fig.add_subplot(111, projection='3d')
plt.title('PCA 3 result from Public Dataset', fontsize=11, fontweight='bold')
ax.scatter(pub_result3['PCA0'], pub_result3['PCA1'], pub_result3['PCA2'])
# plt.savefig('pca_result.png')

* Silhouette score to check optimal cluster count

In [None]:
# Candidate numbers of clusters
parameters = [2, 3, 4, 5, 6]

# One grid entry per candidate, each of the form {'n_clusters': k}
parameter_grid = ParameterGrid({'n_clusters': parameters})
best_score = -1
best_grid = None
# Fixed random_state and explicit n_init so the scores do not change between runs
kmeans_model = KMeans(n_init=10, random_state=710674)
silhouette_scores = []

# Score every candidate with the silhouette coefficient
for p in parameter_grid:
    kmeans_model.set_params(**p)        # apply the current n_clusters
    kmeans_model.fit(public_resized)    # cluster the public rows
    ss = metrics.silhouette_score(public_resized, kmeans_model.labels_)
    silhouette_scores.append(ss)
    print('Parameter:', p, 'Score', ss)
    # remember the best-scoring candidate
    if ss > best_score:
        best_score = ss
        best_grid = p

# The winner was tracked but never reported before
print('Best parameter:', best_grid, 'Score', best_score)

# Bar chart of the silhouette score per candidate
plt.bar(range(len(silhouette_scores)), silhouette_scores, align='center', color='#849ef7', width=0.5)
plt.xticks(range(len(silhouette_scores)), parameters)
plt.title('Public Dataset silhouette score')
plt.xlabel('Number of Clusters')
plt.show()

* 마지막으로 합쳐진 데이터셋에대한 그림

In [None]:
training

In [None]:
### 3-component PCA on the merged (domain + public) training frame
concat_pca_3 = decomposition.PCA(n_components=3)
concat_pca_3_result = concat_pca_3.fit_transform(training)
concat_ratio_3 = concat_pca_3.explained_variance_ratio_
concat_3 = concat_ratio_3.sum()*100

### Report how much variance the three components retain
print('Explained variation per principal component: {}'.format(concat_ratio_3))
print('Cumulative variance explained by 3 principal components: {:.2%}'.format(np.sum(concat_ratio_3)))

In [None]:
concat_pca_3_result = pd.DataFrame(concat_pca_3.transform(training), columns = ['PCA%i' % i for i in range(3)], index = training.index)

In [None]:
# Plot initialisation
fig = plt.figure(figsize=(8,5))
ax = fig.add_subplot(111, projection='3d')
plt.title('PCA 3 result from Concatenated Dataset', fontsize=11, fontweight='bold')
ax.scatter(concat_pca_3_result['PCA0'], concat_pca_3_result['PCA1'], concat_pca_3_result['PCA2'])
# plt.savefig('pca_result.png')

## 05. Data Clustering

## 06. Unlabeled data labeling

* 여기서 RDATA는 Real dataset이고 PDATA는 augmentation을 위한 public dataset

In [None]:
RDATA = domain_s  # real (domain) features; this is the same object as domain_s, not a copy
# Seeded draw so the unlabeled public subset is the same on every run
PDATA = public_s1.sample(n=1000, random_state=710674)
label = domain_y  # IndexH labels of the domain rows

-----

* Practice

In [None]:
# Three consecutive, unshuffled partitions: first 10 %, next 40 %, last 50 % of the rows.
# Plain .iloc slices are used instead of np.split, which handles pandas objects through
# the deprecated swapaxes path.
r_cut1, r_cut2 = int(.1*len(RDATA)), int(.5*len(RDATA))
R1, R2, R3 = RDATA.iloc[:r_cut1], RDATA.iloc[r_cut1:r_cut2], RDATA.iloc[r_cut2:]

# The semi-supervised estimators used below mark unlabeled rows with -1, and the next
# cell writes -1 into the Y2 part. Integer class codes keep that label vector
# homogeneous (string classes mixed with -1 cannot be sorted into a class list).
label_codes = label.map({'normal': 0, 'mild': 1, 'severe': 2})
y_cut1, y_cut2 = int(.1*len(label)), int(.5*len(label))
Y1, Y2, Y3 = label_codes.iloc[:y_cut1], label_codes.iloc[y_cut1:y_cut2], label_codes.iloc[y_cut2:]

In [None]:
R3.shape

In [None]:
# Labels for the semi-supervised fit: Y1 is kept, every Y2 row is flagged as unlabeled (-1).
unlabeled_marks = Y2.apply(lambda _: -1)
Y12 = np.concatenate((Y1, unlabeled_marks))
# Feature rows in the matching order (R1 followed by R2).
R12 = np.concatenate((R1, R2))

In [None]:
R12.shape

In [None]:
index = ['Algorithm', 'ROC AUC']
results = pd.DataFrame(columns=index)

In [None]:
# Baseline: logistic regression trained on the labeled part only (R1, Y1), scored on R3.
logreg = LogisticRegression(random_state=710674, class_weight='balanced')
logreg.fit(R1, Y1)
logreg_auc = roc_auc_score(Y3, logreg.predict_proba(R3), multi_class='ovr')
# DataFrame.append was removed in pandas 2.0; add the row at the next integer position instead
results.loc[len(results)] = ['Multiple Logistic Regression', logreg_auc]
results

In [None]:
def label_prop_test(kernel, params_list, X_train, X_test, y_train, y_test):
    """Sweep one LabelPropagation hyper-parameter and plot the ROC AUC per value.

    For every value in ``params_list`` a LabelPropagation model is fitted on
    ``(X_train, y_train)`` and scored with one-vs-rest ROC AUC on
    ``(X_test, y_test)``. The scores are plotted against the values and the
    best-scoring value is printed.

    Args:
        kernel: 'rbf' (values are used as ``gamma``) or 'knn' (values are used
            as ``n_neighbors``).
        params_list: values to try for the kernel's hyper-parameter.
        X_train, y_train: data to fit on.
        X_test, y_test: data to score on.

    Raises:
        ValueError: if ``kernel`` is neither 'rbf' nor 'knn'.
    """
    swept_param = {'rbf': 'gamma', 'knn': 'n_neighbors'}.get(kernel)
    if swept_param is None:
        raise ValueError("kernel must be 'rbf' or 'knn', got {!r}".format(kernel))

    roc_scores = []
    for value in params_list:
        # Only the parameter that belongs to the chosen kernel is passed; the other one
        # keeps its library default rather than a 0 placeholder (n_neighbors=0 is not a
        # valid neighbour count).
        lp = LabelPropagation(kernel=kernel, max_iter=100000, tol=0.0001, **{swept_param: value})
        lp.fit(X_train, y_train)
        roc_scores.append(roc_auc_score(y_test, lp.predict_proba(X_test), multi_class='ovr'))

    # A single figure; the earlier extra plt.figure() call only produced an empty canvas
    plt.figure(figsize=(16,8))
    plt.plot(params_list, roc_scores)
    plt.title('Label Propagation ROC AUC with ' + kernel + ' kernel')
    plt.show()
    print('Best metrics value is at {}'.format(params_list[np.argmax(roc_scores)]))

In [None]:
gammas = [9e-6, 1e-5, 2e-5, 3e-5, 4e-5, 5e-5, 6e-5, 7e-5, 8e-5, 9e-5]
label_prop_test('rbf', gammas, R12, R3, Y12, Y3)

In [None]:
ns = np.arange(50,60)
label_prop_test('knn', ns, R12, R3, Y12, Y3)

In [None]:
# Label propagation with the RBF kernel, scored on R3
lp_rbf = LabelPropagation(kernel='rbf', gamma=9e-6, max_iter=100000, tol=0.0001)
lp_rbf.fit(R12, Y12)
lp_rbf_auc = roc_auc_score(Y3, lp_rbf.predict_proba(R3), multi_class='ovr')
# DataFrame.append was removed in pandas 2.0; add the row at the next integer position instead
results.loc[len(results)] = ['Label Propagation RBF', lp_rbf_auc]

# Label propagation with the kNN kernel, scored on R3
lp_knn = LabelPropagation(kernel='knn', n_neighbors=52, max_iter=100000, tol=0.0001)
lp_knn.fit(R12, Y12)
lp_knn_auc = roc_auc_score(Y3, lp_knn.predict_proba(R3), multi_class='ovr')
results.loc[len(results)] = ['Label Propagation KNN', lp_knn_auc]

In [None]:
results

In [None]:
def labels_spread_test(kernel, hyperparam, alphas, X_train, X_test, y_train, y_test):
    """Sweep LabelSpreading's ``alpha`` and plot the ROC AUC per value.

    For every value in ``alphas`` a LabelSpreading model is fitted on
    ``(X_train, y_train)`` and scored with one-vs-rest ROC AUC on
    ``(X_test, y_test)``. The scores are plotted against ``alphas`` and the
    best-scoring alpha is printed.

    Args:
        kernel: 'rbf' (``hyperparam`` is used as ``gamma``) or 'knn'
            (``hyperparam`` is used as ``n_neighbors``).
        hyperparam: fixed value for the kernel's hyper-parameter.
        alphas: alpha values to try.
        X_train, y_train: data to fit on.
        X_test, y_test: data to score on.

    Raises:
        ValueError: if ``kernel`` is neither 'rbf' nor 'knn'.
    """
    kernel_param = {'rbf': 'gamma', 'knn': 'n_neighbors'}.get(kernel)
    if kernel_param is None:
        raise ValueError("kernel must be 'rbf' or 'knn', got {!r}".format(kernel))

    roc_scores = []
    for alpha in alphas:
        # Only the parameter that belongs to the chosen kernel is passed; the other one
        # keeps its library default rather than a 0 placeholder (n_neighbors=0 is not a
        # valid neighbour count).
        ls = LabelSpreading(kernel=kernel, alpha=alpha, max_iter=1000, tol=0.001,
                            **{kernel_param: hyperparam})
        ls.fit(X_train, y_train)
        roc_scores.append(roc_auc_score(y_test, ls.predict_proba(X_test), multi_class = 'ovr'))

    # A single figure; the earlier extra plt.figure() call only produced an empty canvas
    plt.figure(figsize=(16,8))
    plt.plot(alphas, roc_scores)
    plt.title('Label Spreading ROC AUC with ' + kernel + ' kernel')
    plt.show()
    print('Best metrics value is at {}'.format(alphas[np.argmax(roc_scores)]))

In [None]:
alphas = [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]  
labels_spread_test('rbf', 1e-5, alphas, R12, R3, Y12, Y3)

In [None]:
alphas = [0.01, 0.02, 0.03, 0.04, 0.05, 0.06, 0.07, 0.08, 0.09]  
labels_spread_test('knn', 51, alphas, R12, R3, Y12, Y3)

In [None]:
# Label spreading with the RBF kernel, scored on R3
ls_rbf = LabelSpreading(kernel='rbf', gamma=9e-6, alpha=0.1, max_iter=1000, tol=0.001)
ls_rbf.fit(R12, Y12)
ls_rbf_auc = roc_auc_score(Y3, ls_rbf.predict_proba(R3), multi_class='ovr')
# DataFrame.append was removed in pandas 2.0; add the row at the next integer position instead
results.loc[len(results)] = ['Label Spreading RBF', ls_rbf_auc]
# Label spreading with the kNN kernel, scored on R3
ls_knn = LabelSpreading(kernel='knn', n_neighbors=53, alpha=0.09, max_iter=1000, tol=0.001)
ls_knn.fit(R12, Y12)
ls_knn_auc = roc_auc_score(Y3, ls_knn.predict_proba(R3), multi_class='ovr')
results.loc[len(results)] = ['Label Spreading KNN', ls_knn_auc]

In [None]:
results

----------

In [None]:
RDATA
# PDATA

* 일단 PDATA는 unlabeled data 상태이기에 -1로 라벨값 만들어주고.

In [None]:
PDATA['y'] = -1

In [None]:
PDATA.info()

* Regression 돌리기 위해서 test, train 나눠보자

In [None]:
# Labeled datapoints and following labels.
train_x, test_x, train_y, test_y = train_test_split(RDATA, label, test_size = 0.2, random_state = 710674)

In [None]:
print("The shape of training dataset x is:", train_x.shape)
print("The shape of test dataset x is:", test_x.shape)

In [None]:
# Unlabeled datapoints and following labels.
train_x2 = PDATA.loc[:,['RMSSD', 'HR', 'PNN50', 'VLF', 'LF', 'HF', 'LF_HF']]
train_y2 = PDATA['y']

In [None]:
print("The shape of public training dataset x is:", train_x2.shape)
# train_y2 is the label vector of the public rows, not a test set
print("The shape of public training labels y is:", train_y2.shape)

In [None]:
# Concatenate
x = np.concatenate((train_x, train_x2))
y = np.concatenate((train_y, train_y2))

In [None]:
print("The shape of Total training dataset x is:", x.shape)
# y is the label vector that goes with x, not a test set
print("The shape of Total training labels y is:", y.shape)

### Multiple Logistic Regression

* Logistic regression 돌려서 변수간 연관성 및 함수를 확인한다

In [None]:
index = ['Analysis Method', 'ROC AUC']
results = pd.DataFrame(columns = index) ## result 라고 데이터프레임 하나 만들어놓고.

In [None]:
# Multinomial logistic regression on the labeled domain split, scored on the held-out part
logreg = LogisticRegression(solver = 'newton-cg', random_state = 710674, C = 1.0, max_iter = 20000)
logreg.fit(train_x, train_y)
logreg_auc = roc_auc_score(test_y, logreg.predict_proba(test_x), multi_class='ovr')
# DataFrame.append was removed in pandas 2.0; add the row at the next integer position instead
results.loc[len(results)] = ['Multiple Logistic Regression', logreg_auc]

results

In [None]:
# logreg.predict_proba(test_x)

In [None]:
y_pred = logreg.predict(test_x)
acc_score = accuracy_score(test_y, y_pred)

In [None]:
acc_score

* 각 라벨별 변수에 대한 계수(coefficient)를 확인

In [None]:
logreg.coef_

### Label propagation process

* Label propagation (generating a probabilistic transition matrix for unlabeled datapoints)

In [None]:
RDATA = domain_s  # real (domain) features; this is the same object as domain_s, not a copy
# Fresh seeded draw of the public rows (this replaces the earlier PDATA, including its 'y' column)
PDATA = public_s1.sample(n=1000, random_state=710674)
label = domain_y  # IndexH labels of the domain rows

In [None]:
PDATA

## 07. Performance comparison (using DNN)

In [None]:
class Args:
    """Run settings for the DNN comparison, kept as plain class attributes."""

    # reproducibility
    seed = 710674

    # problem size
    num_classes = 3

    # optimisation
    epochs = 80
    bs = 64          # batch size passed to model.fit
    lr = 1e-4        # learning rate passed to the optimizer
    momentum = 0.9

    # not read by any cell visible here
    verbose = 'store_true'


args = Args()

# np.random.seed(args.seed)
# random.seed(args.seed)
# torch.manual_seed(args.seed)

* domain_b1, domain_b2, domain_b3, domain_s

In [None]:
# domain_s

In [None]:
# domain_s_index

In [None]:
data_y = domain_s_index.loc[:,['IndexH']]

In [None]:
# Encode the class names as integers: normal -> 0, mild -> 1, severe -> 2
class_codes = {'normal': 0, 'mild': 1, 'severe': 2}
label = data_y.replace(class_codes)

In [None]:
x = domain_s
y = to_categorical((label), 3) ## into the format of one-hot encoding

In [None]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.2, random_state = 710674)

In [None]:
print("The size of training dataset is:", x_train.shape[0])
print("The size of test dataset is:", x_test.shape[0])

In [None]:
# Re-join the train and test splits so the k-fold loop can draw its folds from all rows.
# NOTE(review): a later cell scores the last fold's model on x_test, whose rows are part
# of `inputs` here, so that evaluation is not on unseen data. Confirm this is intended.
inputs = np.concatenate((x_train, x_test), axis = 0)
targets = np.concatenate((y_train, y_test), axis = 0)

In [None]:
# x_test

* Applying 5-fold cross validation

In [None]:
fold_num = 1
split_num = 5
# The former `decay=1e-5` keyword (lr / (1 + decay * step)) is written as an explicit
# inverse-time schedule, because current Keras optimizers no longer accept `decay`.
lr_schedule = keras.optimizers.schedules.InverseTimeDecay(
    initial_learning_rate = args.lr, decay_steps = 1, decay_rate = 1e-5)
opt = keras.optimizers.SGD(learning_rate = lr_schedule, momentum = args.momentum)
# Seeded shuffle so the fold assignment is the same on every run
kfold = KFold(n_splits = split_num, shuffle = True, random_state = args.seed)

In [None]:
label.value_counts()

In [None]:
class_weight = {1:1.8, 2: 2.5 , 0:1.2}
# class_weight = {1:1.0, 2:1.5, 0:1.0}

In [None]:
acc_per_fold = []
loss_per_fold = []

In [None]:
def _build_fold_model(input_dim, num_classes):
    """Return a fresh, uncompiled dense classifier for one cross-validation fold."""
    net = Sequential()
    net.add(Dense(32, input_dim = input_dim, activation = 'relu'))
    net.add(Dense(64, activation = 'relu'))
    net.add(BatchNormalization())
    net.add(Activation('relu'))
    net.add(Dense(64, activation = 'relu'))
    net.add(BatchNormalization())
    net.add(Activation('relu'))
    net.add(Dense(32, activation = 'relu'))
    net.add(Dense(num_classes, activation = 'softmax'))
    return net


for train, test in kfold.split(inputs, targets):
    model = _build_fold_model(x_train.shape[1], args.num_classes)

    ## Each fold gets its own optimizer, cloned from the configuration of `opt`.
    ## An optimizer instance keeps internal state (step counter, momentum buffers), so
    ## sharing one across freshly built models carries that state from fold to fold;
    ## newer Keras releases may also refuse to reuse it with different variables.
    fold_opt = type(opt).from_config(opt.get_config())

    ## model compile
    model.compile(loss = 'categorical_crossentropy', optimizer = fold_opt, metrics = ['accuracy'])

    print('----------------------------------------')
    print(f'Training for fold {fold_num} ... ')

    ## fit on this fold's training rows
    history = model.fit(inputs[train], targets[train], batch_size = args.bs, epochs = args.epochs,
                        verbose = 0, class_weight = class_weight)

    ## score on this fold's held-out rows; scores[0] is the loss, scores[1] the accuracy
    scores = model.evaluate(inputs[test], targets[test])
    print(f'Score for fold {fold_num}: {model.metrics_names[0]} of {scores[0]}; {model.metrics_names[1]} of {scores[1]*100}%')
    print("%s: %.2f%%" %(model.metrics_names[1], scores[1]*100))
    acc_per_fold.append(scores[1] * 100)
    loss_per_fold.append(scores[0])

    ## increasing fold number
    fold_num = fold_num + 1


## Summarizing the results
print('------------------------------------------------------------------------')
print('Score per fold')
for fold_idx, (fold_loss, fold_acc) in enumerate(zip(loss_per_fold, acc_per_fold), start=1):
    print('------------------------------------------------------------------------')
    print(f'>> Fold {fold_idx} - Loss: {fold_loss} - Accuracy: {fold_acc}%')
print('------------------------------------------------------------------------')
print('Average scores for all folds:')
print(f'>>> Accuracy: {np.mean(acc_per_fold)} (+- {np.std(acc_per_fold)})')
print(f'>>> Loss: {np.mean(loss_per_fold)}')
print('------------------------------------------------------------------------')

In [None]:
# for train, test in kfold.split(inputs, targets):
#     model = Sequential()
#     model.add(Dense(32, input_dim = x_train.shape[1], activation = 'relu'))
#     model.add(Dense(64, activation = 'relu'))
#     model.add(Dense(64, activation = 'relu'))
#     model.add(Dense(64, activation = 'relu'))
#     model.add(Dense(32, activation = 'relu'))
#     model.add(Dense(args.num_classes, activation = 'softmax'))
    
#     ## model compile
#     model.compile(loss = 'categorical_crossentropy', optimizer = opt, metrics = ['accuracy'])
    
#     print('----------------------------------------')
#     print(f'Training or fold {fold_num} ... ')
    
#     ## fit data to model
#     history = model.fit(inputs[train], targets[train], batch_size = args.bs, epochs = args.epochs, verbose = 0, class_weight = class_weight)
    
#     ## generate generalization metrics
#     scores = model.evaluate(inputs[test], targets[test])
#     print(f'Score for fold {fold_num}: {model.metrics_names[0]} of {scores[0]}; {model.metrics_names[1]} of {scores[1]*100}%')
#     print("%s: %.2f%%" %(model.metrics_names[1], scores[1]*100))
#     acc_per_fold.append(scores[1] * 100)
#     loss_per_fold.append(scores[0])
    
#     ## increasing fold number
#     fold_num = fold_num + 1
    
    
    
# ## Summarizing the results
# print('------------------------------------------------------------------------')
# print('Score per fold')
# for i in range(0, len(acc_per_fold)):
#     print('------------------------------------------------------------------------')
#     print(f'>> Fold {i+1} - Loss: {loss_per_fold[i]} - Accuracy: {acc_per_fold[i]}%')
# print('------------------------------------------------------------------------')
# print('Average scores for all folds:')
# print(f'>>> Accuracy: {np.mean(acc_per_fold)} (+- {np.std(acc_per_fold)})')
# print(f'>>> Loss: {np.mean(loss_per_fold)}')
# print('------------------------------------------------------------------------')

In [None]:
# Predict with the model left over from the last CV fold and take the most likely class.
# NOTE(review): x_test rows are also part of `inputs`, which the k-fold loop trains on,
# so this is not a clean hold-out estimate.
y_predict = np.argmax(model.predict(x_test), axis = 1)
# y_test starts out one-hot encoded. Collapse it to class indices only while it is still
# 2-D, so that re-running this cell does not fail on the already converted vector.
if np.ndim(y_test) > 1:
    y_test = np.argmax(y_test, axis = 1)

# Confusion matrix normalised over the predicted-label columns
result = confusion_matrix(y_test, y_predict, normalize = 'pred')
print(result)

In [None]:
print(y_predict.shape)
print(y_test.shape)

In [None]:
# np.argmax(y_test, axis=1)

In [None]:
figure = plt.figure(figsize=(6, 4))
sns.heatmap(result, annot=True,cmap=plt.cm.Blues)
plt.tight_layout()
plt.ylabel('True label')
plt.xlabel('Predicted label')
plt.show()

In [None]:
# Summary metrics for the class-index vectors y_test / y_predict.
# NOTE(review): the three scores use different averaging modes (macro / micro / weighted),
# so they are not directly comparable; micro-averaged recall on single-label multiclass
# data equals the accuracy. Confirm the mix is intended.
accuracy = metrics.accuracy_score(y_test, y_predict)
precision = metrics.precision_score(y_test, y_predict, average = 'macro')
recall = metrics.recall_score(y_test, y_predict, average = 'micro')
f1 = metrics.f1_score(y_test, y_predict, average = 'weighted')

# Print each score rounded to four decimals
print("=============================================")
print("The overall accuracy is:", round(accuracy, 4))
print("The precision score is:", round(precision, 4))
print("The recall score is:", round(recall, 4))
print("The f1 score is:", round(f1, 4))
print("=============================================")