# `AML — Task 2:` Heart rhythm classification from raw ECG signals
---

In [1]:
import numpy as np
import pandas as pd
import biosppy.signals.ecg as ecg
import biosppy.signals.tools as tools
from biosppy.plotting import plot_ecg
import matplotlib.pyplot as plt
from tqdm.notebook import tqdm
tqdm.pandas()

In [2]:
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import f1_score
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer, KNNImputer
from sklearn.ensemble import RandomForestClassifier, VotingClassifier

---
## Dataset import and export to `csv`

In [3]:
def load_from_csv(extension="", drop_id = True):
    X_train = pd.read_csv('data/X_train' + extension + '.csv')
    y_train = pd.read_csv('data/y_train' + extension + '.csv')
    X_test  = pd.read_csv('data/X_test' + extension + '.csv')
    
    if drop_id:
        X_train = X_train.drop(columns=['id'])
        y_train = y_train.drop(columns=['id'])
        X_test  = X_test.drop(columns=['id'])
     
    return X_train, y_train, X_test

In [4]:
def export_to_csv(X_train, y_train, X_test, extension="_cleaned"):
    X_train.to_csv('data/X_train' + extension + '.csv', index=False)
    y_train.to_csv('data/y_train' + extension + '.csv', index=False)
    X_test.to_csv('data/X_test' + extension + '.csv', index=False)

## Submission export to `csv`

In [5]:
def create_submission(sub_id, pred, basepath='submissions/task2-sub'):
    result = pred.copy().rename(columns={0: 'y'})
    result['id'] = range(0, len(result))
    result = result[['id', 'y']]
    result.to_csv(basepath + str(sub_id) + '.csv', index=False)

---
## Feature extraction

In [6]:
def extract_features(time_series: pd.Series, sampling_rate=300.0) -> np.array:
    # Drop nan values in the time series
    no_nans = time_series.dropna()
    
    # Extract main features
    ts, filtered, rpeaks, _, templates, _, heart_rate = ecg.ecg(no_nans, sampling_rate, show=False)
    # If one of these assert fails, let's code the use of another segmenter using the filtered ecg
    assert len(rpeaks) > 1, 'ECG cannot have a single R peak'
    assert len(templates) > 1, 'ECG cannot have a single heartbeat'
    
    # Extract R peak amplitudes
    rpeaks_amplitudes = [filtered[rpeak] for rpeak in rpeaks]
    # Extract mean R peak amplitude
    rpeaks_mean = np.mean(rpeaks_amplitudes)
    # Extract std of R peak amplitudes
    rpeaks_std = np.std(rpeaks_amplitudes)
    
    #TODO: maybe change durations to seconds?
    # Extract RR durations
    rr_durations = [r2 - r1 for r1, r2 in zip(rpeaks, rpeaks[1:])]
    # Extract mean RR duration
    rr_durations_mean = np.mean(rr_durations)
    # Extract std of RR durations
    rr_durations_std = np.std(rr_durations)
    
    #TODO: check correlation between rrdurations and heart rate
    #RESULT: corr of mean ~0.7, corr of std ~0.6
    if len(heart_rate) == 0:
        #Temp fix
        heart_rate = rr_durations
    # Extract mean heart rate
    heart_rate_mean = np.mean(heart_rate)
    # Extract std of mean heart
    heart_rate_std = np.std(heart_rate)
    
    #TODO: Extract SNR ratio (http://www.cinc.org/archives/2011/pdf/0609.pdf)
    # For now, we use this
    snr = np.quantile(np.std(templates, axis=0), 0.35)
    
    window_size = 50
    # Extract S peaks
    speaks = [rpeak + np.argmin(filtered[rpeak:rpeak+window_size]) for rpeak in rpeaks]
    # Extract S peak amplitudes
    speaks_amplitudes = [filtered[speak] for speak in speaks]
    # Extract mean S peak amplitude
    speaks_mean = np.mean(speaks_amplitudes)
    # Extract std of S peak amplitudes
    speaks_std = np.std(speaks_amplitudes)
    
    # Extract Q peaks
    qpeaks = [rpeak - window_size + np.argmin(filtered[rpeak-window_size:rpeak]) for rpeak in rpeaks]
    # Extract Q peak amplitudes
    qpeaks_amplitudes = [filtered[qpeak] for qpeak in qpeaks]
    # Extract mean Q peak amplitude
    qpeaks_mean = np.mean(qpeaks_amplitudes)
    # Extract std of Q peak amplitudes
    qpeaks_std = np.std(qpeaks_amplitudes)
    
    # Extract QRS durations
    qrs_durations = [speak - qpeak for qpeak, speak in zip(qpeaks, speaks)]
    # Extract mean QRS duration
    qrs_durations_mean = np.mean(qrs_durations)
    # Extract std of QRS durations
    qrs_durations_std = np.std(qrs_durations)
    
    # Use this to go from index differences to seconds
    index_to_time = ts[-1] / len(filtered)
    # Extract pNN28 (https://www.ncbi.nlm.nih.gov/pmc/articles/PMC1767394/)
    pNN28 = (np.array(rr_durations) * index_to_time > 0.028).sum() / len(rr_durations)
    
    # Return extracted features
    return pd.Series([rpeaks_mean, 
                      rpeaks_std,
                      rr_durations_mean, 
                      rr_durations_std, 
                      heart_rate_mean, 
                      heart_rate_std, 
                      snr,
                      speaks_mean,
                      speaks_std,
                      qpeaks_mean,
                      qpeaks_std,
                      qrs_durations_mean,
                      qrs_durations_std,
                      pNN28,])

## Data standardization

In [7]:
def standardize_data(X_train, X_test):
    # Do the scaling, saving the scaler to use it for X_test too. No need imputation, just ignore Nan values
    scaler = StandardScaler().fit(X_train)
    X_train_scaled = scaler.transform(X_train)
    X_test_scaled = scaler.transform(X_test)
    return (X_train_scaled, X_test_scaled)

---
## Model defintion

In [8]:
def best_svc(X_train, y_train):
    svc = SVC()
    gs_svc_params = {
        "kernel": ["rbf", "poly", "sigmoid"],
        "C": np.logspace(0, 1, 2),
        "class_weight": ["balanced", None]
    }
    gs_svc = GridSearchCV(svc, gs_svc_params, cv=5, verbose=3, scoring='f1_micro', error_score='raise')
    gs_svc.fit(X_train, y_train)
    
    print(f"The best validation score obtained is {gs_svc.best_score_:.5f} with\n\t{gs_svc.best_params_}")
    
    return gs_svc

In [9]:
def best_random_forest(X_train, y_train):
    random_forest = RandomForestClassifier()
    gs_forest_params = {
     "n_estimators": np.arange(100, 400, 100),
     "max_depth": [None], #np.arange(2, 8, 1),
     "min_samples_split": [2], #np.arange(2, 8, 1),
     "min_samples_leaf": [1], #np.arange(1, 9, 2),
     "class_weight": ["balanced", None],
     "random_state": [0], 
    }
    
    gs_forest = GridSearchCV(random_forest, gs_forest_params, cv=5, verbose=3, scoring='f1_micro', error_score='raise')
    gs_forest.fit(X_train, y_train)

    print(f"The best validation score obtained is {gs_forest.best_score_:.5f} with\n\t{gs_forest.best_params_}")
    
    return gs_forest

In [20]:
def best_ensemble(models, X_train, y_train):
    ensemble = VotingClassifier(estimators=[(str(i), model) for i, model in enumerate(models)])
    gs_ensemble_params = {
     "voting": ["hard", "soft"]
    }
    
    gs_ensemble = GridSearchCV(ensemble, gs_ensemble_params, cv=5,
                               verbose=3, scoring='f1_micro', error_score='raise')
    gs_ensemble.fit(X_train, y_train)

    print(f"The best validation score obtained is {gs_ensemble.best_score_:.5f} with\n\t{gs_ensemble.best_params_}")
    
    return gs_ensemble

---
## Main Pipeline

In [12]:
X_train_raw, y_train_raw, X_test_raw = load_from_csv()

In [None]:
# Maybe slower, but progress visible so good for debug
# X_train = pd.DataFrame()
# for i, s in X_train_raw.iterrows():
#     print(f"Extracting row #{i+1:4} of {X_train_raw.shape[0]}", end='\r')
#     X_train = X_train.append(extract_features(s), ignore_index=True)
# print("Done!")

In [13]:
X_train = X_train_raw.progress_apply(extract_features, axis=1)

In [14]:
X_test = X_test_raw.progress_apply(extract_features, axis=1)

In [15]:
print(f"X_train has {X_train.isna().sum().sum()} null values.")
print(f"X_test has {X_test.isna().sum().sum()} null values.")

X_train has 0 null values.
X_test has 0 null values.


In [16]:
X_train, X_test = standardize_data(X_train, X_test)

In [17]:
gs_svc = best_svc(X_train, np.array(y_train_raw).ravel())

Fitting 5 folds for each of 12 candidates, totalling 60 fits
[CV 1/5] END C=1.0, class_weight=balanced, kernel=rbf;, score=0.706 total time=   1.2s
[CV 2/5] END C=1.0, class_weight=balanced, kernel=rbf;, score=0.658 total time=   1.1s
[CV 3/5] END C=1.0, class_weight=balanced, kernel=rbf;, score=0.693 total time=   1.1s
[CV 4/5] END C=1.0, class_weight=balanced, kernel=rbf;, score=0.683 total time=   1.1s
[CV 5/5] END C=1.0, class_weight=balanced, kernel=rbf;, score=0.691 total time=   1.1s
[CV 1/5] END C=1.0, class_weight=balanced, kernel=poly;, score=0.682 total time=   1.0s
[CV 2/5] END C=1.0, class_weight=balanced, kernel=poly;, score=0.684 total time=   1.1s
[CV 3/5] END C=1.0, class_weight=balanced, kernel=poly;, score=0.721 total time=   1.1s
[CV 4/5] END C=1.0, class_weight=balanced, kernel=poly;, score=0.710 total time=   1.0s
[CV 5/5] END C=1.0, class_weight=balanced, kernel=poly;, score=0.697 total time=   1.1s
[CV 1/5] END C=1.0, class_weight=balanced, kernel=sigmoid;, scor

In [18]:
gs_random_forest = best_random_forest(X_train, np.array(y_train_raw).ravel())

Fitting 5 folds for each of 6 candidates, totalling 30 fits
[CV 1/5] END class_weight=balanced, max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=100, random_state=0;, score=0.747 total time=   1.1s
[CV 2/5] END class_weight=balanced, max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=100, random_state=0;, score=0.741 total time=   1.2s
[CV 3/5] END class_weight=balanced, max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=100, random_state=0;, score=0.770 total time=   1.1s
[CV 4/5] END class_weight=balanced, max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=100, random_state=0;, score=0.754 total time=   1.1s
[CV 5/5] END class_weight=balanced, max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=100, random_state=0;, score=0.752 total time=   1.0s
[CV 1/5] END class_weight=balanced, max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=200, random_state=0;, score=0.749 total

In [22]:
ensemble = best_ensemble([SVC(probability=True, **gs_svc.best_params_), RandomForestClassifier(**gs_random_forest.best_params_)], 
                         X_train, np.array(y_train_raw).ravel())

Fitting 5 folds for each of 2 candidates, totalling 10 fits
[CV 1/5] END .......................voting=hard;, score=0.761 total time=   6.3s
[CV 2/5] END .......................voting=hard;, score=0.731 total time=   6.6s
[CV 3/5] END .......................voting=hard;, score=0.753 total time=   6.6s
[CV 4/5] END .......................voting=hard;, score=0.745 total time=   6.5s
[CV 5/5] END .......................voting=hard;, score=0.748 total time=   5.9s
[CV 1/5] END .......................voting=soft;, score=0.773 total time=   6.2s
[CV 2/5] END .......................voting=soft;, score=0.757 total time=   6.5s
[CV 3/5] END .......................voting=soft;, score=0.774 total time=   6.3s
[CV 4/5] END .......................voting=soft;, score=0.766 total time=   5.9s
[CV 5/5] END .......................voting=soft;, score=0.762 total time=   5.8s
The best validation score obtained is 0.76666 with
	{'voting': 'soft'}


In [23]:
model = ensemble
sub_id = 8
prediction = pd.DataFrame(model.predict(X_test))

In [24]:
create_submission(sub_id, prediction)

**Solutions must be submitted on the [project website](https://aml.ise.inf.ethz.ch/task2/).**