<a href="https://colab.research.google.com/github/alicewoo0925/ECG-Apnoea-Detection/blob/main/3rdmodel/combined_features.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Set up

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler,RobustScaler,MinMaxScaler
from imblearn.over_sampling import SMOTE
from sklearn.metrics import classification_report

# Import dataset

In [None]:
# Column names are supplied explicitly through `names=` for both CSV files.
feature_cols = [
    "RRseg_mean", "log(RRseg_std)", "log(SDSD)", "RMSSD", "RRseg_triI",
    "VLF", "LF", "HF", "TP",
]
result_cols = ["A", "N"]

X_train_df = pd.read_csv("X_train.csv", names=feature_cols)
T_train_df = pd.read_csv("T_train.csv", names=result_cols)

# Single frame: the nine feature columns followed by the label column 'A'.
# 'N' is dropped (presumably the complement of 'A' - confirm against the data source).
df = pd.concat([X_train_df, T_train_df], axis=1).drop(columns="N")

In [None]:
df

Unnamed: 0,RRseg_mean,log(RRseg_std),log(SDSD),RMSSD,RRseg_triI,VLF,LF,HF,TP,A
0,1.03070,-3.2674,-3.4358,0.031926,5.5556,1.3772,4.2213,4.4522,10.051,0
1,0.93625,-1.5161,-1.4311,0.237140,3.8462,35.7060,33.3910,99.9740,169.070,0
2,0.99066,-2.1771,-2.3040,0.099071,5.5556,36.4880,29.2790,77.1060,142.870,0
3,1.03140,-2.7058,-2.8820,0.055551,4.5455,28.5570,25.6450,60.5140,114.720,0
4,0.95794,-1.7164,-1.8649,0.153660,7.1429,39.0160,39.8810,58.2810,137.180,0
...,...,...,...,...,...,...,...,...,...,...
22934,0.80387,-2.8809,-2.3893,0.091075,5.0000,2.5314,2.5364,6.5306,11.598,0
22935,0.74925,-2.8511,-2.5267,0.079412,6.2500,2.5464,2.5411,6.5436,11.631,0
22936,0.78632,-2.9993,-2.5127,0.080507,3.1250,2.5485,2.5410,6.5574,11.647,0
22937,0.77346,-3.0244,-2.4681,0.084192,2.5000,2.5164,2.5402,6.5713,11.628,0


Check whether any rows contain NaN values.

In [None]:
# Show only the rows that have a NaN in at least one column
# (an empty frame means there is no missing data).
df.loc[df.isna().any(axis="columns")]

Unnamed: 0,RRseg_mean,log(RRseg_std),log(SDSD),RMSSD,RRseg_triI,VLF,LF,HF,TP,A


Check whether the two target classes are balanced.

In [None]:
import plotly
import plotly.express as px

In [None]:
# Class-balance check on the full dataset: one bar per distinct value of the
# label column 'A', bar height = number of rows with that value.
fig = px.histogram(df,x='A', title = 'Total Count of Each Target')
fig.show()

# Split into train and test & rescale

In [None]:
def scale_dataset(dataframe, oversample=False, scaler=None, fit=True):
  """Split a frame into scaled features and labels, optionally oversampling.

  The last column of `dataframe` is treated as the label; every other column
  is a feature.

  Args:
    dataframe: pandas DataFrame whose final column holds the class label.
    oversample: if True, rebalance the classes after scaling with
      SMOTE(k_neighbors=3, random_state=42).
    scaler: optional scaler object exposing `fit_transform` / `transform`.
      When None (default) a fresh MinMaxScaler is created and fitted on this
      frame, which is the original behaviour.
    fit: only consulted when `scaler` is given. True fits the scaler on this
      frame; False applies the already-fitted scaler unchanged, so that
      validation/test data can be scaled with the training-set statistics.

  Returns:
    (data, X, y): `X` is the scaled feature matrix, `y` the label vector and
    `data` is `X` with `y` appended as the last column.
  """
  X = dataframe[dataframe.columns[:-1]].values
  y = dataframe[dataframe.columns[-1]].values

  if scaler is None:
    X = MinMaxScaler().fit_transform(X)
  elif fit:
    X = scaler.fit_transform(X)
  else:
    # Reuse statistics learned elsewhere (typically on the training split).
    X = scaler.transform(X)

  if oversample:
    X, y = SMOTE(k_neighbors=3, random_state=42).fit_resample(X, y)

  data = np.hstack((X, np.reshape(y, (-1, 1))))

  return data, X, y

In [None]:
#train, valid, test = np.split(df.sample(frac=1), [int(0.8*len(df)), int(0.9*len(df))]) #Hold-out

In [None]:
# Hold-out split: shuffle once with a fixed seed, then 80% train / 10% valid / 10% test.
# The cells below need X_train, y_train, X_valid, y_valid, X_test and y_test,
# so this cell must run on a fresh kernel.
shuffled_df = df.sample(frac=1, random_state=42)
train_end, valid_end = int(0.8*len(shuffled_df)), int(0.9*len(shuffled_df))
train = shuffled_df.iloc[:train_end]
valid = shuffled_df.iloc[train_end:valid_end]
test = shuffled_df.iloc[valid_end:]

# Only the training split is oversampled; validation and test keep their
# original class distribution.
train, X_train, y_train = scale_dataset(train, oversample=True)
valid, X_valid, y_valid = scale_dataset(valid, oversample=False)
test, X_test, y_test = scale_dataset(test, oversample=False)

In [None]:
len(y_train)

27232

In [None]:
len(y_valid)

2294

In [None]:
len(y_test)

2294

Check whether the training labels are balanced.

In [None]:
# Class-balance check on the training labels only (y_train).
# NOTE(review): presumably plotted after SMOTE oversampling, since the recorded
# training size is larger than 80% of the dataset - confirm.
fig = px.histogram(y_train, title = 'Total Count of Each Target')
fig.show()

# Train and predict - 1

*   27232 train
*   2294 valid
*   2294 test



## kNN

In [None]:
from sklearn.neighbors import KNeighborsClassifier

In [None]:
# k-nearest-neighbours classifier using the 5 closest training samples.
knn_model = KNeighborsClassifier(n_neighbors=5)
# Last expression of the cell: fits the model and displays its repr.
knn_model.fit(X_train, y_train)

KNeighborsClassifier()

In [None]:
# Predict labels for the held-out test features (y_pred is reused by every model section).
y_pred = knn_model.predict(X_test)

In [None]:
# Per-class precision / recall / F1 of the kNN predictions on the test split.
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.87      0.67      0.76      1709
           1       0.42      0.69      0.52       585

    accuracy                           0.68      2294
   macro avg       0.64      0.68      0.64      2294
weighted avg       0.75      0.68      0.70      2294



## Naive Bayes

In [None]:
from sklearn.naive_bayes import GaussianNB

In [None]:
# Gaussian Naive Bayes with default settings; fit() returns the fitted estimator.
nb_model = GaussianNB().fit(X_train, y_train)

In [None]:
# Evaluate the Naive Bayes model on the test split.
y_pred = nb_model.predict(X_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.82      0.44      0.57      1709
           1       0.30      0.72      0.43       585

    accuracy                           0.51      2294
   macro avg       0.56      0.58      0.50      2294
weighted avg       0.69      0.51      0.53      2294



## Logistic Regression

In [None]:
from sklearn.linear_model import LogisticRegression

In [None]:
# Logistic regression with default settings; fit() returns the fitted estimator.
lg_model = LogisticRegression().fit(X_train, y_train)

In [None]:
# Evaluate the logistic-regression model on the test split.
y_pred = lg_model.predict(X_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.86      0.58      0.69      1709
           1       0.37      0.72      0.49       585

    accuracy                           0.62      2294
   macro avg       0.62      0.65      0.59      2294
weighted avg       0.74      0.62      0.64      2294



## Linear Regression

In [None]:
from sklearn.linear_model import LinearRegression

In [None]:
# Ordinary least-squares regression fitted directly on the class labels.
# Its predictions are continuous scores, not class labels, so they must be
# thresholded before being passed to classification metrics.
ln_model = LinearRegression()
# Last expression of the cell: fits the model and displays its repr.
ln_model.fit(X_train, y_train)

LinearRegression()

In [None]:
# Evaluate the linear-regression model (ln_model, not the logistic lg_model).
# LinearRegression.predict returns continuous scores, which
# classification_report cannot compare with 0/1 labels, so the scores are
# thresholded at 0.5 to obtain class predictions.
y_pred = (ln_model.predict(X_test) >= 0.5).astype(int)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.86      0.58      0.69      1709
           1       0.37      0.72      0.49       585

    accuracy                           0.62      2294
   macro avg       0.62      0.65      0.59      2294
weighted avg       0.74      0.62      0.64      2294



## SVM

In [None]:
from sklearn.svm import SVC

In [None]:
# Support-vector classifier with default settings; fit() returns the fitted estimator.
svm_model = SVC().fit(X_train, y_train)

In [None]:
# Evaluate the SVM on the test split.
y_pred = svm_model.predict(X_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.87      0.65      0.74      1709
           1       0.41      0.70      0.52       585

    accuracy                           0.67      2294
   macro avg       0.64      0.68      0.63      2294
weighted avg       0.75      0.67      0.69      2294



## Random Forest

In [None]:
from sklearn.ensemble import RandomForestClassifier

In [None]:
# Random forest with default settings (no random_state, so results vary between runs).
rf_model = RandomForestClassifier().fit(X_train, y_train)

In [None]:
# Experiment label for the report recorded below: smote random_state = 42
# (k_neighbors not stated - presumably the library default; confirm).
# The code is identical to the next two cells; only the upstream SMOTE settings differed.
y_pred = rf_model.predict(X_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.80      0.93      0.86      1698
           1       0.63      0.33      0.43       596

    accuracy                           0.78      2294
   macro avg       0.72      0.63      0.65      2294
weighted avg       0.76      0.78      0.75      2294



In [None]:
# Experiment label for the report recorded below: smote k_neighbors = 3
# (random_state not stated - presumably unset; confirm).
y_pred = rf_model.predict(X_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.76      0.93      0.84      1678
           1       0.52      0.21      0.30       616

    accuracy                           0.74      2294
   macro avg       0.64      0.57      0.57      2294
weighted avg       0.70      0.74      0.69      2294



In [None]:
# Experiment label for the report recorded below: smote k_neighbors = 3, random_state = 42
# (the settings currently hard-coded in scale_dataset).
y_pred = rf_model.predict(X_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.80      0.92      0.86      1699
           1       0.61      0.36      0.46       595

    accuracy                           0.77      2294
   macro avg       0.71      0.64      0.66      2294
weighted avg       0.75      0.77      0.75      2294



## Neural Net

In [None]:
import tensorflow as tf

In [None]:
def train_model(X_train, y_train, num_nodes, dropout_prob, lr, batch_size, epochs):
  """Build, compile and train a two-hidden-layer binary classifier.

  Args:
    X_train: 2-D feature array; its second dimension sets the input width.
    y_train: binary labels aligned with the rows of `X_train`.
    num_nodes: number of units in each of the two hidden Dense layers.
    dropout_prob: dropout rate applied after each hidden layer.
    lr: learning rate passed to the Adam optimizer.
    batch_size: mini-batch size used by `fit`.
    epochs: number of training epochs.

  Returns:
    (nn_model, history): the trained Keras model and the History object
    returned by `fit`.
  """
  # Take the input width from the data rather than hard-coding it, so the
  # network matches whichever feature subset is being trained on.
  n_features = X_train.shape[1]

  nn_model = tf.keras.Sequential([
      tf.keras.layers.Dense(num_nodes, activation='relu', input_shape=(n_features,)),
      tf.keras.layers.Dropout(dropout_prob),
      tf.keras.layers.Dense(num_nodes, activation='relu'),
      tf.keras.layers.Dropout(dropout_prob),
      tf.keras.layers.Dense(1, activation='sigmoid')
  ])

  nn_model.compile(optimizer=tf.keras.optimizers.Adam(lr), loss='binary_crossentropy',
                  metrics=['accuracy','Precision','Recall'])
  # verbose=0 keeps per-epoch logs out of the notebook output.
  history = nn_model.fit(
    X_train, y_train, epochs=epochs, batch_size=batch_size, validation_split=0.2, verbose=0
  )

  return nn_model, history

In [None]:
# Baseline network: 9 input features -> two 16-unit ReLU layers -> sigmoid output.
nn_model = tf.keras.Sequential([
    tf.keras.layers.Dense(units=16, activation='relu', input_shape=(9,)),
    tf.keras.layers.Dense(units=16, activation='relu'),
    tf.keras.layers.Dense(units=1, activation='sigmoid'),
])

nn_model.compile(
    optimizer=tf.keras.optimizers.Adam(learning_rate=0.001),
    loss='binary_crossentropy',
    metrics=['accuracy','Precision','Recall'],
)

In [None]:
# Train for 100 epochs with mini-batches of 32, holding out 20% of the
# training arrays for validation.
# NOTE(review): Keras takes validation_split from the tail of the arrays; if
# X_train came from SMOTE (which presumably appends synthetic rows at the end -
# confirm), the validation slice may be dominated by synthetic samples.
# Consider validation_data=(X_valid, y_valid) instead.
history = nn_model.fit(
    X_train, y_train,
    epochs=100,
    batch_size=32,
    validation_split=0.2
)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78

# Feature importance

In [None]:
from sklearn.ensemble import RandomForestClassifier

In [None]:
# Fit a small random forest on the whole dataset to rank the features.
# Forest settings: 9 trees (n_estimators=9), each limited to depth 10, fixed seed.
# All columns except the last are features; the last column is the label.
X = df[df.columns[:-1]].values
y = df[df.columns[-1]].values

# Min-max scale the features, fitting on all rows of df.
scaler = MinMaxScaler()
X = scaler.fit_transform(X)

selector = RandomForestClassifier(n_estimators=9,max_depth=10,random_state=0)
# Last expression of the cell: fits the forest and displays its repr.
selector.fit(X,y)

RandomForestClassifier(max_depth=10, n_estimators=9, random_state=0)

In [None]:
# Importance score of each feature from the fitted forest, in the same
# column order as the feature matrix it was fitted on.
feature_selection = selector.feature_importances_

In [None]:
feature_cols = [
    "RRseg_mean", "log(RRseg_std)", "log(SDSD)", "RMSSD", "RRseg_triI",
    "VLF", "LF", "HF", "TP",
]
# Label each importance with its feature name and order from most to least important.
importance_series = pd.Series(feature_selection, index=feature_cols).sort_values(ascending=False)

In [None]:
# Bar chart of the feature importances, already sorted in descending order.
fig = px.bar(importance_series, title= 'Feature importance using Random Forest')
fig.show()

# Train and predict - 2
Use only the five most important features:
* HF
* VLF
* log(RRseg_std)
* LF
* TP

In [None]:
# Keep only the five highest-ranked feature columns (importance_series is sorted descending).
df_top5 = df.loc[:,importance_series.index[0:5]]
df_top5

Unnamed: 0,HF,VLF,log(RRseg_std),LF,TP
0,4.4522,1.3772,-3.2674,4.2213,10.051
1,99.9740,35.7060,-1.5161,33.3910,169.070
2,77.1060,36.4880,-2.1771,29.2790,142.870
3,60.5140,28.5570,-2.7058,25.6450,114.720
4,58.2810,39.0160,-1.7164,39.8810,137.180
...,...,...,...,...,...
22934,6.5306,2.5314,-2.8809,2.5364,11.598
22935,6.5436,2.5464,-2.8511,2.5411,11.631
22936,6.5574,2.5485,-2.9993,2.5410,11.647
22937,6.5713,2.5164,-3.0244,2.5402,11.628


In [None]:
# Append the label as the last column, matching the "features first, label last"
# layout that the splitting code relies on.
df_top5['A'] = df['A']
df_top5

Unnamed: 0,HF,VLF,log(RRseg_std),LF,TP,A
0,4.4522,1.3772,-3.2674,4.2213,10.051,0
1,99.9740,35.7060,-1.5161,33.3910,169.070,0
2,77.1060,36.4880,-2.1771,29.2790,142.870,0
3,60.5140,28.5570,-2.7058,25.6450,114.720,0
4,58.2810,39.0160,-1.7164,39.8810,137.180,0
...,...,...,...,...,...,...
22934,6.5306,2.5314,-2.8809,2.5364,11.598,0
22935,6.5436,2.5464,-2.8511,2.5411,11.631,0
22936,6.5574,2.5485,-2.9993,2.5410,11.647,0
22937,6.5713,2.5164,-3.0244,2.5402,11.628,0


In [None]:
from sklearn.model_selection import train_test_split

# This section evaluates the top-5 feature subset, so the features must come
# from df_top5 (five feature columns + label 'A' last), not from the full df.
X = df_top5[df_top5.columns[:-1]].values
y = df_top5[df_top5.columns[-1]].values

# No scaling or oversampling is applied here; 80/20 split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=4)

In [None]:
#train, valid, test = np.split(df_top5.sample(frac=1), [int(0.8*len(df_top5)), int(0.9*len(df_top5))]) #Hold-out

In [None]:
#train, X_train, y_train = scale_dataset(train, oversample=True)
#valid, X_valid, y_valid = scale_dataset(valid, oversample=False)
#test, X_test, y_test = scale_dataset(test, oversample=False)

In [None]:
# Random forest (default settings) for the "Train and predict - 2" experiment.
rf_model2 = RandomForestClassifier().fit(X_train, y_train)

In [None]:
# Predict on the held-out 20% split.
# NOTE(review): the original label here was "MinMax scaler", but the split cell
# for this section does not scale the features - the label looks stale; confirm.
y_pred = rf_model2.predict(X_test)


In [None]:
def print_result(y_test, y_pred):
  """Compute, print and return summary metrics for a binary classifier.

  Args:
    y_test: ground-truth labels.
    y_pred: predicted hard labels (not probabilities), so the reported AUROC
      is computed from label predictions.

  Returns:
    Tuple (sensitivity, specificity, accuracy, PPV, NPV, f1, kappa, AUROC).
  """
  from sklearn.metrics import (accuracy_score, cohen_kappa_score, confusion_matrix,
                               f1_score, roc_auc_score)

  # The flattened binary confusion matrix unpacks as tn, fp, fn, tp.
  true_neg, false_pos, false_neg, true_pos = confusion_matrix(y_test, y_pred).ravel()

  metrics = (
      true_pos / (true_pos + false_neg),      # sensitivity
      true_neg / (true_neg + false_pos),      # specificity
      accuracy_score(y_test, y_pred),         # accuracy
      true_pos / (true_pos + false_pos),      # PPV
      true_neg / (true_neg + false_neg),      # NPV
      f1_score(y_test, y_pred),               # f1
      cohen_kappa_score(y_test, y_pred),      # Cohen's kappa
      roc_auc_score(y_test, y_pred),          # AUROC
  )

  print("Sensitivity : %5.3f, Specificity: %5.3f, Accuracy: %5.3f, PPV: %5.3f, NPV: %5.3f, f1: %5.3f, Cohen's Kappa: %5.3f, AUROC: %5.3f"
        % metrics)

  return metrics


In [None]:
# Print and keep the metric values for the current y_test / y_pred pair.
sensitivity, specificity, accuracy, PPV, NPV, f1, kappa, AUROC = print_result(y_test, y_pred)

Sensitivity : 0.630, Specificity: 0.951, Accuracy: 0.868, PPV: 0.819, NPV: 0.881, f1: 0.712, Cohen's Kappa: 0.629, AUROC: 0.791


 precision    recall  f1-score   support

           0       0.76      0.90      0.82      1712
           1       0.33      0.15      0.20       582

    accuracy                           0.71      2294
   macro avg       0.54      0.52      0.51      2294
weighted avg       0.65      0.71      0.66      2294


# Train and predict - 3
Use only the three most important features.

In [None]:
# Three highest-ranked feature columns, with the label 'A' appended as the last column.
df_top3 = df.loc[:, importance_series.index[:3]].assign(A=df['A'])
df_top3

Unnamed: 0,HF,VLF,log(RRseg_std),A
0,4.4522,1.3772,-3.2674,0
1,99.9740,35.7060,-1.5161,0
2,77.1060,36.4880,-2.1771,0
3,60.5140,28.5570,-2.7058,0
4,58.2810,39.0160,-1.7164,0
...,...,...,...,...
22934,6.5306,2.5314,-2.8809,0
22935,6.5436,2.5464,-2.8511,0
22936,6.5574,2.5485,-2.9993,0
22937,6.5713,2.5164,-3.0244,0


In [None]:
# Hold-out split: shuffle once with a fixed seed so the split is reproducible,
# then slice 80% train / 10% valid / 10% test by position.
shuffled_top3 = df_top3.sample(frac=1, random_state=42)
train_end, valid_end = int(0.8*len(df_top3)), int(0.9*len(df_top3))
train = shuffled_top3.iloc[:train_end]
valid = shuffled_top3.iloc[train_end:valid_end]
test = shuffled_top3.iloc[valid_end:]

In [None]:
# Scale every split; only the training split is oversampled.
# NOTE(review): scale_dataset fits a new MinMaxScaler on whichever frame it
# receives, so train, valid and test are each scaled with their own min/max
# rather than with the training-set statistics.
train, X_train, y_train = scale_dataset(train, oversample=True)
valid, X_valid, y_valid = scale_dataset(valid, oversample=False)
test, X_test, y_test = scale_dataset(test, oversample=False)

In [None]:
# Fit a fresh forest for the top-3 experiment. The new estimator itself is
# fitted (not the earlier rf_model), so the full-feature model is left intact.
rf_model3 = RandomForestClassifier()
rf_model3 = rf_model3.fit(X_train, y_train)

In [None]:
# Evaluate the top-3-feature random forest on the test split.
y_pred = rf_model3.predict(X_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.76      0.79      0.77      1704
           1       0.31      0.27      0.29       590

    accuracy                           0.66      2294
   macro avg       0.53      0.53      0.53      2294
weighted avg       0.64      0.66      0.65      2294

