<a href="https://colab.research.google.com/github/alicewoo0925/ECG-Apnoea-Detection/blob/main/5thmodel/meta_classifier_training_set_only.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Set up

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler,RobustScaler,MinMaxScaler
from imblearn.over_sampling import SMOTE
from sklearn.metrics import classification_report
import plotly
import plotly.express as px

# Import dataset

In [None]:
# The CSV files carry no header row, so column names are supplied here:
# five feature columns and two label columns.
feature_cols = [
    "RRseg_mean",
    "log(RRseg_std)",
    "log(SDSD)",
    "RMSSD",
    "RRseg_triI",
]
result_cols = ["A", "N"]

X_train_df = pd.read_csv("X_train.csv", header=None, names=feature_cols)
T_train_df = pd.read_csv("T_train.csv", header=None, names=result_cols)

# Place features and labels side by side and keep only the "A" label column.
df = pd.concat([X_train_df, T_train_df], axis=1).drop(columns="N")

In [None]:
# Display the combined frame (feature columns followed by the "A" label).
df

Unnamed: 0,RRseg_mean,log(RRseg_std),log(SDSD),RMSSD,RRseg_triI,A
0,1.01390,-2.2262,-2.2082,0.108880,5.0000,0
1,0.94845,-1.3447,-1.2345,0.288230,6.2500,0
2,0.98738,-1.5966,-1.4929,0.223310,7.1429,0
3,1.02000,-3.0473,-3.3568,0.034641,8.3333,0
4,0.97656,-1.3232,-1.2073,0.296360,8.3333,0
...,...,...,...,...,...,...
22648,0.80480,-3.8387,-4.5533,0.010462,2.1739,0
22649,0.74838,-3.3904,-4.5298,0.010733,3.8462,0
22650,0.78724,-4.2226,-4.6701,0.009310,2.0833,0
22651,0.77346,-4.6699,-4.7238,0.008827,1.4706,0


Check whether any row contains a NaN value.

In [None]:
# List every row holding at least one missing value; an empty result means
# the dataset has no NaN.
df.loc[df.isna().any(axis="columns")]

Unnamed: 0,RRseg_mean,log(RRseg_std),log(SDSD),RMSSD,RRseg_triI,A


Check whether the classes are balanced in the full dataset.

In [None]:
# Count how often each value of the label column "A" occurs.
fig = px.histogram(
    data_frame=df,
    x="A",
    title="Total Count of Each Target",
)
fig.show()

# Split into train and test & rescale

In [None]:
def scale_dataset(dataframe, oversample=False, scaler=None, fit=True):
  """Scale the features of `dataframe` and optionally oversample with SMOTE.

  The last column of `dataframe` is taken as the label; every column before
  it is taken as a feature.

  Args:
    dataframe: pandas DataFrame laid out as [feature columns..., label].
    oversample: when True, the scaled data is resampled with
      SMOTE(k_neighbors=8, random_state=42).
    scaler: optional scaler exposing `fit_transform` / `transform`. When
      omitted, a new MinMaxScaler is created (the original behaviour).
    fit: when True (default) the scaler is fitted on this data. Pass
      `fit=False` together with a scaler that was already fitted on the
      training split to apply the training statistics to a validation or
      test split instead of re-deriving them from that split.

  Returns:
    A tuple `(data, X, y)`: `X` is the scaled (and possibly resampled)
    feature matrix, `y` the matching label vector, and `data` is `X` with
    `y` appended as the last column.

  Raises:
    ValueError: if `fit=False` is requested without supplying a scaler.
  """
  X = dataframe[dataframe.columns[:-1]].values
  y = dataframe[dataframe.columns[-1]].values

  if scaler is None:
    if not fit:
      raise ValueError("fit=False requires an already fitted scaler")
    scaler = MinMaxScaler()

  # Fit only when asked to; otherwise reuse the statistics the scaler already holds.
  X = scaler.fit_transform(X) if fit else scaler.transform(X)

  if oversample:
    smote = SMOTE(k_neighbors=8, random_state=42)
    X, y = smote.fit_resample(X, y)

  # Re-attach the labels as the last column of a single 2-D array.
  data = np.hstack((X, np.reshape(y, (-1, 1))))

  return data, X, y

In [None]:
# Hold-out split: shuffle once with a fixed seed so the split is reproducible,
# then cut into 80% train, 10% validation and 10% test. Plain iloc slices are
# used so the pieces are guaranteed to stay DataFrames.
shuffled = df.sample(frac=1, random_state=42)
train_end, valid_end = int(0.8*len(df)), int(0.9*len(df))
train = shuffled.iloc[:train_end]
valid = shuffled.iloc[train_end:valid_end]
test = shuffled.iloc[valid_end:]

In [None]:
# Scale each split; only the training split is oversampled.
train, X_train, y_train = scale_dataset(dataframe=train, oversample=True)
valid, X_valid, y_valid = scale_dataset(dataframe=valid)
test, X_test, y_test = scale_dataset(dataframe=test)

In [None]:
# Training-set size after scale_dataset(..., oversample=True).
len(y_train)

26832

In [None]:
# Validation-set size (not oversampled).
len(y_valid)

2265

In [None]:
# Test-set size (not oversampled).
len(y_test)

2266

Check whether the training labels are balanced after oversampling.

In [None]:
# Count how often each label occurs in the training labels.
fig = px.histogram(
    data_frame=y_train,
    title="Total Count of Each Target",
)
fig.show()

# All time domain features


In [None]:
# Hold-out split: shuffle once with a fixed seed so the split is reproducible,
# then cut into 80% train, 10% validation and 10% test.
shuffled = df.sample(frac=1, random_state=42)
train_end, valid_end = int(0.8*len(df)), int(0.9*len(df))
train = shuffled.iloc[:train_end]
valid = shuffled.iloc[train_end:valid_end]
test = shuffled.iloc[valid_end:]

In [None]:
# Scale each split; only the training split is oversampled.
train, X_train, y_train = scale_dataset(dataframe=train, oversample=True)
valid, X_valid, y_valid = scale_dataset(dataframe=valid)
test, X_test, y_test = scale_dataset(dataframe=test)

In [None]:
# Use the train_test_split pipeline in the next cell instead of the hold-out split above.

In [None]:
from sklearn.model_selection import train_test_split

# Raw features (every column but the last) and labels (last column).
X = df[df.columns[:-1]].values
y = df[df.columns[-1]].values

# Split BEFORE scaling and oversampling so the test set holds only real,
# untouched samples. stratify=y keeps the class ratio equal in both parts.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.20, random_state=4, stratify=y
)

# Fit the scaler on the training part only, then apply it to the test part.
scaler = MinMaxScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Oversample the training part only; synthetic samples must never reach the
# test set, otherwise the reported metrics are optimistic.
smote = SMOTE(random_state=42)
X_train, y_train = smote.fit_resample(X_train, y_train)

In [None]:
from sklearn.svm import LinearSVC
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import StackingClassifier
from sklearn.ensemble import RandomForestClassifier

In [None]:
# Base learners of the stacked ensemble. Both are seeded so that re-running
# the notebook yields the same fitted model.
estimators = [
    ('rf', RandomForestClassifier(random_state=42)),
    ('svr', LinearSVC(random_state=42)),
]

# A logistic regression combines the base learners' outputs.
classifier = StackingClassifier(
    estimators=estimators, final_estimator=LogisticRegression()
)

In [None]:
# Train the stacked ensemble on the training split (fit returns the estimator itself).
classifier = classifier.fit(X_train, y_train)

In [None]:
# Hard label predictions for the held-out test split.
y_pred = classifier.predict(X_test)

In [None]:
def print_result (y_test, y_pred, y_score=None):
  """Print and return binary-classification metrics for 0/1 labels.

  Args:
    y_test: true labels, encoded as 0 (negative) and 1 (positive).
    y_pred: predicted hard labels, same length as `y_test`.
    y_score: optional continuous scores for the positive class (for example
      from `predict_proba` or `decision_function`). When given, AUROC is
      computed from these scores. When omitted, AUROC is computed from the
      hard predictions as before, which equals
      (sensitivity + specificity) / 2.

  Returns:
    Tuple `(sensitivity, specificity, accuracy, PPV, NPV, f1, kappa, AUROC)`.
    A ratio whose denominator is zero is reported as NaN.
  """
  from sklearn.metrics import confusion_matrix, accuracy_score, cohen_kappa_score, roc_auc_score, f1_score

  # Passing labels explicitly keeps the matrix 2x2 even when one class is
  # missing from both inputs, so the four-way unpacking cannot fail.
  tn, fp, fn, tp = confusion_matrix(y_test, y_pred, labels=[0, 1]).ravel()

  def _ratio(numerator, denominator):
    # Undefined ratios become NaN without emitting a divide warning.
    return numerator/denominator if denominator else float('nan')

  sensitivity = _ratio(tp, tp+fn)
  specificity = _ratio(tn, tn+fp)
  accuracy = accuracy_score(y_test, y_pred)
  PPV = _ratio(tp, tp+fp)
  NPV = _ratio(tn, tn+fn)
  kappa = cohen_kappa_score(y_test, y_pred)
  AUROC = roc_auc_score(y_test, y_pred if y_score is None else y_score)
  f1 = f1_score(y_test, y_pred)

  print("Sensitivity : %5.3f, Specificity: %5.3f, Accuracy: %5.3f, PPV: %5.3f, NPV: %5.3f, f1: %5.3f, Cohen's Kappa: %5.3f, AUROC: %5.3f"
        % (sensitivity, specificity, accuracy, PPV, NPV, f1, kappa, AUROC))

  return sensitivity, specificity, accuracy, PPV, NPV, f1, kappa, AUROC

In [None]:
# Print the metrics for the test split and keep each one in its own variable.
sensitivity, specificity, accuracy, PPV, NPV, f1, kappa, AUROC = print_result(y_test, y_pred)

Sensitivity : 0.821, Specificity: 0.796, Accuracy: 0.808, PPV: 0.798, NPV: 0.818, f1: 0.809, Cohen's Kappa: 0.616, AUROC: 0.808


In [None]:
# Per-class precision/recall/F1 of the stacked classifier using all
# time-domain features.
y_pred = classifier.predict(X_test)
print(classification_report(y_true=y_test, y_pred=y_pred))

              precision    recall  f1-score   support

           0       0.82      0.58      0.68      1702
           1       0.32      0.61      0.42       564

    accuracy                           0.59      2266
   macro avg       0.57      0.59      0.55      2266
weighted avg       0.69      0.59      0.61      2266



# Feature importance - not used in the submission

In [None]:
from sklearn.ensemble import RandomForestClassifier

In [None]:
# Fit a small random forest on the min-max scaled features so that its
# feature importances can be read afterwards.
# Forest settings: 9 trees, maximum depth 10, fixed seed.
X = df.iloc[:, :-1].to_numpy()
y = df.iloc[:, -1].to_numpy()

scaler = MinMaxScaler()
X = scaler.fit_transform(X)

selector = RandomForestClassifier(random_state=0, max_depth=10, n_estimators=9)
selector.fit(X, y)

RandomForestClassifier(max_depth=10, n_estimators=9, random_state=0)

In [None]:
# Read the per-feature importances from the fitted forest
# (one value per feature column, in column order).
feature_selection = selector.feature_importances_

In [None]:
# Label each importance with the column it was computed from. The names are
# taken from `df` itself (every column except the trailing label) so the index
# always has exactly as many entries as the selector saw features; a
# hard-coded list of a different length makes pd.Series raise a ValueError.
feature_cols = list(df.columns[:-1])
importance_series = pd.Series(feature_selection, index=feature_cols)
importance_series = importance_series.sort_values(ascending=False) # descending order

In [None]:
# Bar chart of the importances, most important feature first.
fig = px.bar(
    data_frame=importance_series,
    title='Feature importance using Random Forest',
)
fig.show()

# All feature set

In [None]:
# Keep only the five columns with the highest importance.
df_top5 = df.loc[:, importance_series.head(5).index]
df_top5

Unnamed: 0,HF,VLF,log(RRseg_std),LF,TP
0,4.4522,1.3772,-3.2674,4.2213,10.051
1,99.9740,35.7060,-1.5161,33.3910,169.070
2,77.1060,36.4880,-2.1771,29.2790,142.870
3,60.5140,28.5570,-2.7058,25.6450,114.720
4,58.2810,39.0160,-1.7164,39.8810,137.180
...,...,...,...,...,...
22934,6.5306,2.5314,-2.8809,2.5364,11.598
22935,6.5436,2.5464,-2.8511,2.5411,11.631
22936,6.5574,2.5485,-2.9993,2.5410,11.647
22937,6.5713,2.5164,-3.0244,2.5402,11.628


In [None]:
# Attach the label column by building a new frame. Assigning a column in
# place on a frame that was selected from `df` can raise pandas'
# SettingWithCopyWarning; `assign` avoids this and is safe to re-run.
df_top5 = df_top5.assign(A=df['A'])
df_top5

Unnamed: 0,HF,VLF,log(RRseg_std),LF,TP,A
0,4.4522,1.3772,-3.2674,4.2213,10.051,0
1,99.9740,35.7060,-1.5161,33.3910,169.070,0
2,77.1060,36.4880,-2.1771,29.2790,142.870,0
3,60.5140,28.5570,-2.7058,25.6450,114.720,0
4,58.2810,39.0160,-1.7164,39.8810,137.180,0
...,...,...,...,...,...,...
22934,6.5306,2.5314,-2.8809,2.5364,11.598,0
22935,6.5436,2.5464,-2.8511,2.5411,11.631,0
22936,6.5574,2.5485,-2.9993,2.5410,11.647,0
22937,6.5713,2.5164,-3.0244,2.5402,11.628,0


In [None]:
# Hold-out split of the top-5 frame: shuffle once with a fixed seed so the
# split is reproducible, then cut into 80% train, 10% validation, 10% test.
shuffled = df_top5.sample(frac=1, random_state=42)
train_end, valid_end = int(0.8*len(df_top5)), int(0.9*len(df_top5))
train = shuffled.iloc[:train_end]
valid = shuffled.iloc[train_end:valid_end]
test = shuffled.iloc[valid_end:]

In [None]:
# Scale each split; only the training split is oversampled.
train, X_train, y_train = scale_dataset(dataframe=train, oversample=True)
valid, X_valid, y_valid = scale_dataset(dataframe=valid)
test, X_test, y_test = scale_dataset(dataframe=test)

In [None]:
# Base learners of the second stacked ensemble. Both are seeded so that
# re-running the notebook yields the same fitted model.
estimators = [
    ('rf', RandomForestClassifier(random_state=42)),
    ('svr', LinearSVC(random_state=42)),
]

# A logistic regression combines the base learners' outputs.
classifier2 = StackingClassifier(
    estimators=estimators, final_estimator=LogisticRegression()
)

In [None]:
# Fit the newly built ensemble itself. Calling `classifier.fit` here would
# retrain the first model and merely alias it as `classifier2`.
classifier2 = classifier2.fit(X_train, y_train)

In [None]:
# Per-class precision/recall/F1 of the second stacked classifier on the test split.
y_pred = classifier2.predict(X_test)
print(classification_report(y_true=y_test, y_pred=y_pred))

              precision    recall  f1-score   support

           0       0.79      0.91      0.84      1694
           1       0.55      0.29      0.38       600

    accuracy                           0.75      2294
   macro avg       0.67      0.60      0.61      2294
weighted avg       0.72      0.75      0.72      2294



In [None]:
# Same report, for the run with SMOTE k_neighbors = 8.
y_pred = classifier2.predict(X_test)
print(classification_report(y_true=y_test, y_pred=y_pred))

              precision    recall  f1-score   support

           0       0.79      0.85      0.82      1687
           1       0.46      0.36      0.40       607

    accuracy                           0.72      2294
   macro avg       0.62      0.60      0.61      2294
weighted avg       0.70      0.72      0.71      2294



# Hyperparameter tuning - not used in the submission
No tuning is used, because it made no substantial difference to performance.

In [None]:
from sklearn.model_selection import RandomizedSearchCV

In [None]:
# Candidate values for the random-forest hyperparameter search.

# Tree depth: 5, 10, ..., 55, plus None for unlimited depth.
rf_max_depth = [*range(5, 60, 5), None]

rf_min_samples_leaf = [1, 2, 4]
# Minimum samples to split a node: every integer from 2 to 10.
rf_min_samples_split = list(range(2, 11))

# Number of trees: 100, 200, ..., 1000.
rf_n_estimators = list(range(100, 1100, 100))

rf_grid = dict(
    n_estimators=rf_n_estimators,
    max_depth=rf_max_depth,
    min_samples_split=rf_min_samples_split,
    min_samples_leaf=rf_min_samples_leaf,
)

In [None]:
# Randomized search over rf_grid: 10 sampled parameter sets, each scored with
# 3-fold cross-validation, run on all available cores.
rf_base = RandomForestClassifier()

rf_random = RandomizedSearchCV(
    estimator=rf_base,
    param_distributions=rf_grid,
    n_iter=10,
    cv=3,
    verbose=2,
    random_state=42,
    n_jobs=-1,
)

rf_random.fit(X_train, y_train)

# Best parameter combination found by the search.
rf_random.best_params_

Fitting 3 folds for each of 10 candidates, totalling 30 fits



A worker stopped while some jobs were given to the executor. This can be caused by a too short worker timeout or by a memory leak.



{'n_estimators': 1000,
 'min_samples_split': 2,
 'min_samples_leaf': 1,
 'max_depth': 45}

## time domain only & random forest only

In [None]:
# `df_time` is not defined by any earlier cell, so build it here from the
# time-domain feature columns plus the label; otherwise this cell raises a
# NameError on a fresh kernel.
time_cols = ["RRseg_mean","log(RRseg_std)","log(SDSD)","RMSSD","RRseg_triI"]
df_time = df[time_cols + ["A"]]

# Hold-out split: shuffle once with a fixed seed so the split is reproducible,
# then cut into 80% train, 10% validation and 10% test.
shuffled = df_time.sample(frac=1, random_state=42)
train_end, valid_end = int(0.8*len(df_time)), int(0.9*len(df_time))
train = shuffled.iloc[:train_end]
valid = shuffled.iloc[train_end:valid_end]
test = shuffled.iloc[valid_end:]

# Scale each split; only the training split is oversampled.
train, X_train, y_train = scale_dataset(train, oversample=True)
valid, X_valid, y_valid = scale_dataset(valid, oversample=False)
test, X_test, y_test = scale_dataset(test, oversample=False)

In [None]:
# Random forest with the parameter values reported by the randomized search,
# trained on the current training split.
rf_model = RandomForestClassifier(
    n_estimators=1000,
    max_depth=45,
    min_samples_split=2,
    min_samples_leaf=1,
).fit(X_train, y_train)

In [None]:
# Per-class precision/recall/F1 of the tuned forest on the test split.
y_pred = rf_model.predict(X_test)
print(classification_report(y_true=y_test, y_pred=y_pred))

              precision    recall  f1-score   support

           0       0.82      0.71      0.76      1685
           1       0.41      0.56      0.47       609

    accuracy                           0.67      2294
   macro avg       0.61      0.63      0.61      2294
weighted avg       0.71      0.67      0.68      2294



In [None]:
# Baseline for comparison: a random forest with default settings on the
# same split.
rf_model = RandomForestClassifier().fit(X_train, y_train)
y_pred = rf_model.predict(X_test)
print(classification_report(y_true=y_test, y_pred=y_pred))

              precision    recall  f1-score   support

           0       0.81      0.67      0.73      1668
           1       0.39      0.58      0.47       626

    accuracy                           0.64      2294
   macro avg       0.60      0.62      0.60      2294
weighted avg       0.70      0.64      0.66      2294



## all features & random forest only

In [None]:
# Hold-out split: shuffle once with a fixed seed so the split is reproducible,
# then cut into 80% train, 10% validation and 10% test.
shuffled = df.sample(frac=1, random_state=42)
train_end, valid_end = int(0.8*len(df)), int(0.9*len(df))
train = shuffled.iloc[:train_end]
valid = shuffled.iloc[train_end:valid_end]
test = shuffled.iloc[valid_end:]

# Scale each split; only the training split is oversampled.
train, X_train, y_train = scale_dataset(train, oversample=True)
valid, X_valid, y_valid = scale_dataset(valid, oversample=False)
test, X_test, y_test = scale_dataset(test, oversample=False)

In [None]:
# Random forest with the parameter values reported by the randomized search,
# trained on the current training split.
rf_model = RandomForestClassifier(
    n_estimators=1000,
    max_depth=45,
    min_samples_split=2,
    min_samples_leaf=1,
).fit(X_train, y_train)

In [None]:
# Per-class precision/recall/F1 of the tuned forest on the test split.
y_pred = rf_model.predict(X_test)
print(classification_report(y_true=y_test, y_pred=y_pred))

              precision    recall  f1-score   support

           0       0.77      0.89      0.83      1669
           1       0.52      0.31      0.38       625

    accuracy                           0.73      2294
   macro avg       0.65      0.60      0.61      2294
weighted avg       0.71      0.73      0.71      2294



## top 5 & random forest only

In [None]:
# Hold-out split of the top-5 frame: shuffle once with a fixed seed so the
# split is reproducible, then cut into 80% train, 10% validation, 10% test.
shuffled = df_top5.sample(frac=1, random_state=42)
train_end, valid_end = int(0.8*len(df_top5)), int(0.9*len(df_top5))
train = shuffled.iloc[:train_end]
valid = shuffled.iloc[train_end:valid_end]
test = shuffled.iloc[valid_end:]

# Scale each split; only the training split is oversampled.
train, X_train, y_train = scale_dataset(train, oversample=True)
valid, X_valid, y_valid = scale_dataset(valid, oversample=False)
test, X_test, y_test = scale_dataset(test, oversample=False)

In [None]:
# Random forest with the parameter values reported by the randomized search,
# trained on the current training split.
rf_model = RandomForestClassifier(
    n_estimators=1000,
    max_depth=45,
    min_samples_split=2,
    min_samples_leaf=1,
).fit(X_train, y_train)

In [None]:
# Per-class precision/recall/F1 of the tuned forest on the test split.
y_pred = rf_model.predict(X_test)
print(classification_report(y_true=y_test, y_pred=y_pred))

              precision    recall  f1-score   support

           0       0.74      0.89      0.81      1678
           1       0.30      0.13      0.18       616

    accuracy                           0.69      2294
   macro avg       0.52      0.51      0.49      2294
weighted avg       0.62      0.69      0.64      2294

