<a href="https://colab.research.google.com/github/alicewoo0925/ECG-Apnoea-Detection/blob/main/4thmodel/cross_validation_4th.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Set up

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import plotly
import plotly.express as px
from sklearn.preprocessing import StandardScaler,RobustScaler,MinMaxScaler
from imblearn.over_sampling import SMOTE
from sklearn.metrics import classification_report

# Import dataset

In [None]:
# Column names handed to read_csv through `names=` for the two input files.
feature_cols = [
    "RRseg_mean",
    "log(RRseg_std)",
    "log(SDSD)",
    "RMSSD",
    "RRseg_triI",
]
result_cols = ["A", "N"]

# Features and targets live in separate CSV files with matching row order
# (they are joined purely by position below).
X_train_df = pd.read_csv("X_train.csv", names=feature_cols)
T_train_df = pd.read_csv("T_train.csv", names=result_cols)

# Put features and targets side by side and keep only "A" as the label column.
df = pd.concat([X_train_df, T_train_df], axis=1).drop(columns="N")

In [None]:
# Preview the combined frame: the five feature columns followed by the label column "A".
df

Unnamed: 0,RRseg_mean,log(RRseg_std),log(SDSD),RMSSD,RRseg_triI,A
0,1.03070,-3.2674,-3.4358,0.031926,5.5556,0
1,0.93625,-1.5161,-1.4311,0.237140,3.8462,0
2,0.99066,-2.1771,-2.3040,0.099071,5.5556,0
3,1.03140,-2.7058,-2.8820,0.055551,4.5455,0
4,0.95794,-1.7164,-1.8649,0.153660,7.1429,0
...,...,...,...,...,...,...
22934,0.80387,-2.8809,-2.3893,0.091075,5.0000,0
22935,0.74925,-2.8511,-2.5267,0.079412,6.2500,0
22936,0.78632,-2.9993,-2.5127,0.080507,3.1250,0
22937,0.77346,-3.0244,-2.4681,0.084192,2.5000,0


Check whether any rows contain NaN values.

In [None]:
# Show every row that has at least one missing value; an empty frame means no NaN is present.
df.loc[df.isna().any(axis=1)]

Unnamed: 0,RRseg_mean,log(RRseg_std),log(SDSD),RMSSD,RRseg_triI,A


Check whether the target classes are balanced.

In [None]:
# Plot how many rows fall into each value of the label column 'A'
# to see whether the two classes are balanced.
fig = px.histogram(df,x='A', title = 'Total Count of Each Target')
fig.show()

# Split and preprocess

In [None]:
def scale_dataset(dataframe, oversample=False, scaler=None):
  """Scale the feature columns of a frame and optionally oversample it with SMOTE.

  The last column of ``dataframe`` is used as the target; all preceding
  columns are used as features.

  Args:
    dataframe: pandas DataFrame whose final column holds the labels.
    oversample: when True, the scaled features and labels are resampled with
      ``SMOTE(k_neighbors=8, random_state=42)``.
    scaler: optional, already-fitted scaler exposing ``transform`` (for
      example a ``MinMaxScaler`` fitted on the training split). When given it
      is applied without refitting, so validation/test data are mapped with
      the same statistics as the training data. When omitted, a new
      ``MinMaxScaler`` is fitted on ``dataframe`` itself, which is the
      original behaviour of this function.

  Returns:
    A tuple ``(data, X, y)`` where ``X`` is the scaled (and possibly
    resampled) feature array, ``y`` the matching label array, and ``data``
    is ``X`` with ``y`` appended as the last column.
  """
  X = dataframe[dataframe.columns[:-1]].values
  y = dataframe[dataframe.columns[-1]].values

  if scaler is None:
    # Backward-compatible default: fit a fresh scaler on the data passed in.
    X = MinMaxScaler().fit_transform(X)
  else:
    # Reuse the caller's fitted scaler so every split shares one mapping.
    X = scaler.transform(X)

  if oversample:
    smote = SMOTE(k_neighbors=8, random_state=42)
    X, y = smote.fit_resample(X, y)

  # Labels are reshaped to a column so they can be stacked next to the features.
  data = np.hstack((X, np.reshape(y, (-1, 1))))

  return data, X, y

In [None]:
# Hold-out split: shuffle once with a fixed seed, then cut 80 % / 10 % / 10 %.
# The seed makes the split reproducible after a kernel restart; positional
# slicing avoids calling np.split on a DataFrame.
# NOTE(review): np.split on a DataFrame is believed to hit a deprecated pandas
# code path in recent releases - confirm against the pandas version in use.
shuffled = df.sample(frac=1, random_state=42)
train_end = int(0.8 * len(df))
valid_end = int(0.9 * len(df))
train = shuffled.iloc[:train_end]
valid = shuffled.iloc[train_end:valid_end]
test = shuffled.iloc[valid_end:]

In [None]:
# Only the training split is oversampled; validation and test keep their class ratio.
# Each name on the left is rebound from a DataFrame to the stacked NumPy array that
# scale_dataset returns, so this cell cannot be re-run without re-running the split cell.
train, X_train, y_train = scale_dataset(train, oversample=True)
valid, X_valid, y_valid = scale_dataset(valid, oversample=False)
test, X_test, y_test = scale_dataset(test, oversample=False)

In [None]:
from sklearn.model_selection import train_test_split

# Features are every column except the last; the last column is the label.
X = df[df.columns[:-1]].values
y = df[df.columns[-1]].values

# Split BEFORE fitting anything, so held-out rows never influence the scaler
# or the synthetic samples. Stratifying keeps the class ratio equal in both parts.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.20, random_state=4, stratify=y
)

# Fit the scaler on the training rows only and apply the same mapping to the test rows.
scaler = MinMaxScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Oversample the training set only: the test set must contain real samples in
# their original class proportions for the metrics to be meaningful.
smote = SMOTE(random_state=42)
X_train, y_train = smote.fit_resample(X_train, y_train)

In [None]:
# Plot how many training labels fall into each class, to verify the
# class balance of y_train before the model is fitted.
fig = px.histogram(y_train, title = 'Total Count of Each Target')
fig.show()

# Train and Test

In [None]:
from sklearn.ensemble import RandomForestClassifier

In [None]:
# A fixed random_state makes the fitted forest (and therefore every metric
# reported below) reproducible across runs; fit() returns the estimator itself.
rf_model = RandomForestClassifier(random_state=42).fit(X_train, y_train)

In [None]:
# Predict class labels for the held-out features.
# NOTE(review): the classification_report call below is commented out, so any
# report displayed under this cell was not produced by the code as written -
# re-enable the line and re-run to refresh it.
y_pred = rf_model.predict(X_test)
#print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.82      0.52      0.63      1671
           1       0.33      0.68      0.45       595

    accuracy                           0.56      2266
   macro avg       0.58      0.60      0.54      2266
weighted avg       0.69      0.56      0.59      2266



In [None]:
# Same prediction call as the previous cell; refreshes y_pred before the metrics are computed.
y_pred = rf_model.predict(X_test)

In [None]:
def print_result(y_test, y_pred, y_score=None):
  """Print and return a set of binary-classification metrics.

  Args:
    y_test: true labels.
    y_pred: predicted class labels (hard decisions).
    y_score: optional continuous scores for the positive class, e.g.
      ``model.predict_proba(X)[:, 1]``. When given, AUROC is computed from
      these scores. When omitted, AUROC falls back to ``y_pred`` as before,
      which evaluates the ROC curve at a single operating point only.

  Returns:
    Tuple ``(sensitivity, specificity, accuracy, PPV, NPV, f1, kappa, AUROC)``.
  """
  from sklearn.metrics import confusion_matrix, accuracy_score, cohen_kappa_score, roc_auc_score, f1_score

  # Unpacking into four counts requires a 2x2 matrix, i.e. exactly two classes.
  tn, fp, fn, tp = confusion_matrix(y_test, y_pred).ravel()

  sensitivity = tp/(tp+fn)   # true-positive rate (recall of the positive class)
  specificity = tn/(tn+fp)   # true-negative rate
  accuracy = accuracy_score(y_test, y_pred)
  PPV = tp/(tp+fp)           # positive predictive value (precision)
  NPV = tn/(tn+fn)           # negative predictive value
  kappa = cohen_kappa_score(y_test, y_pred)
  # Prefer continuous scores for the ROC area when the caller supplies them.
  AUROC = roc_auc_score(y_test, y_pred if y_score is None else y_score)
  f1 = f1_score(y_test, y_pred)

  print("Sensitivity : %5.3f, Specificity: %5.3f, Accuracy: %5.3f, PPV: %5.3f, NPV: %5.3f, f1: %5.3f, Cohen's Kappa: %5.3f, AUROC: %5.3f"
        % (sensitivity, specificity, accuracy, PPV, NPV, f1, kappa, AUROC))

  return sensitivity, specificity, accuracy, PPV, NPV, f1, kappa, AUROC

In [None]:
# Print the metric summary for the test predictions and keep the eight values for later use.
sensitivity, specificity, accuracy, PPV, NPV, f1, kappa, AUROC = print_result(y_test, y_pred)

Sensitivity : 0.837, Specificity: 0.780, Accuracy: 0.808, PPV: 0.790, NPV: 0.829, f1: 0.813, Cohen's Kappa: 0.617, AUROC: 0.809


**TODO:** Please run this notebook again using the 4th data set.