# `AML — Task 2:` Heart rhythm classification from raw ECG signals
---

In [None]:
# For TQDM :
#! python3.6 -m pip install ipywidgets
#! python3.6 -m pip install --upgrade jupyter
#! jupyter nbextension enable --py widgetsnbextension

In [1]:
import numpy as np
import pandas as pd
import biosppy.signals.ecg as ecg
import biosppy.signals.tools as tools
from biosppy.plotting import plot_ecg
import matplotlib.pyplot as plt
from IPython.display import clear_output
from tqdm.notebook import tqdm
tqdm.pandas()

In [2]:
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV
from sklearn.feature_selection import SelectFpr, SelectFdr, SelectFwe, f_classif, chi2, mutual_info_classif
from sklearn.metrics import f1_score
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer, KNNImputer
from sklearn.linear_model import RidgeClassifier, LogisticRegression
from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier,\
                             AdaBoostClassifier, VotingClassifier, ExtraTreesClassifier

---
## Dataset import and export to `csv`

In [3]:
def load_from_csv(extension="", drop_id = True):
    X_train = pd.read_csv('data/X_train' + extension + '.csv')
    y_train = pd.read_csv('data/y_train' + extension + '.csv')
    X_test  = pd.read_csv('data/X_test' + extension + '.csv')
    
    if drop_id:
        X_train = X_train.drop(columns=['id'])
        y_train = y_train.drop(columns=['id'])
        X_test  = X_test.drop(columns=['id'])
     
    return X_train, y_train, X_test

In [4]:
def export_to_csv(X_train, y_train, X_test, extension="_cleaned"):
    X_train.to_csv('data/X_train' + extension + '.csv', index=False)
    y_train.to_csv('data/y_train' + extension + '.csv', index=False)
    X_test.to_csv('data/X_test' + extension + '.csv', index=False)

## Submission export to `csv`

In [5]:
def create_submission(sub_id, pred, basepath='submissions/task2-sub'):
    result = pred.copy().rename(columns={0: 'y'})
    result['id'] = range(0, len(result))
    result = result[['id', 'y']]
    result.to_csv(basepath + str(sub_id) + '.csv', index=False)

---
## Data processing

### Helpers

In [6]:
def extract_stats_from(features_series: list) -> pd.Series:
    # pd.describe().drop(count) returns 7 interesting features (min, max, std...)
    fs = [pd.Series(s).describe().drop(["count"]) for s in features_series]
    return pd.concat(fs, ignore_index=True)

In [7]:
def extract_qpeaks_features(filtered: np.array, rpeaks: np.array, window_size=50):
    qpeaks = [rpeak - window_size + np.argmin(filtered[rpeak-window_size:rpeak]) for rpeak in rpeaks]
    qpeaks_amplitudes = [filtered[qpeak] for qpeak in qpeaks]
    return qpeaks, qpeaks_amplitudes

In [8]:
def extract_speaks_features(filtered: np.array, rpeaks: np.array, window_size=50):
    speaks = [rpeak + np.argmin(filtered[rpeak:rpeak+window_size]) for rpeak in rpeaks]
    speaks_amplitudes = [filtered[speak] for speak in speaks]
    return speaks, speaks_amplitudes

### Feature extraction

In [9]:
def extract_features(time_series: pd.Series, sampling_rate=300) -> pd.Series:
    # Drop nan values in the time series
    no_nans = time_series.dropna()
    
    # Extract main features from ECG
    ts, filtered, rpeaks, _, templates, _, heart_rate = ecg.ecg(no_nans, sampling_rate, show=False)
    assert len(rpeaks) > 1, 'ECG cannot have a single R peak'
    assert len(templates) > 1, 'ECG cannot have a single heartbeat'
    
    # Extract Q,R,S peak features
    rpeaks_amplitude = [filtered[rpeak] for rpeak in rpeaks]
    qpeaks, qpeaks_amplitude = extract_qpeaks_features(filtered, rpeaks)
    speaks, speaks_amplitude = extract_speaks_features(filtered, rpeaks)
    
    # Extract RR, QRS durations features
    rr_durations = [r2 - r1 for r1, r2 in zip(rpeaks, rpeaks[1:])]
    qrs_durations = [speak - qpeak for qpeak, speak in zip(qpeaks, speaks)]
    
    # Extract QRS direction features
    qrs_direction = [q + r + s for q, r, s in zip(qpeaks, rpeaks, speaks)]
    
    # Extract Q/R ratio features
    qr_ratio = [q / r for q, r in zip(qpeaks, rpeaks)]
    
    #TODO: Extract SNR ratio (http://www.cinc.org/archives/2011/pdf/0609.pdf)
    snr = np.quantile(np.std(templates, axis=0), 0.35)
    
    # Use this to go from index differences to seconds
    index_to_time = ts[-1] / len(filtered)
    # Extract pNN28 (https://www.ncbi.nlm.nih.gov/pmc/articles/PMC1767394/)
    pNN28 = (np.array(rr_durations) * index_to_time > 0.028).sum() / len(rr_durations)
    
    ecg_features = extract_stats_from([
        rpeaks_amplitude, qpeaks_amplitude, speaks_amplitude,
        rr_durations, qrs_durations, qrs_direction, qr_ratio
    ])
    
    return ecg_features.append(pd.Series([snr, pNN28]), ignore_index=True)

---
## Data standardization

In [10]:
def standardize_data(X_train, X_test):
    scaler = StandardScaler().fit(X_train)
    X_train_scaled = scaler.transform(X_train)
    X_test_scaled = scaler.transform(X_test)
    return pd.DataFrame(X_train_scaled), pd.DataFrame(X_test_scaled)

---
## Feature Selection

In [11]:
def remove_constant_features(X_train, X_test, verbose=True):
    non_constant_features_mask = X_train.apply(pd.Series.nunique) != 1
    X_train_selected_features = X_train.loc[:, non_constant_features_mask]
    X_test_selected_features = X_test.loc[:, non_constant_features_mask]
    
    if verbose:
        print(f"{X_train.shape[1]-X_train_selected_features.shape[1]} features removed because of constant values ({100*(X_train.shape[1]-X_train_selected_features.shape[1])/X_train.shape[1]:.2f}%).")
    
    return X_train_selected_features, X_test_selected_features

In [12]:
def remove_too_correlated_features(X_train, X_test, threshold=0.98, verbose=True):
    X_train_corr_ = X_train.corr()

    X_train_too_correlated = (X_train_corr_.mask(
        np.tril(np.ones([len(X_train_corr_)]*2, dtype=bool))).abs() > threshold).any()
    
    X_train_selected_features = X_train.loc[:, (~X_train_too_correlated)]
    X_test_selected_features = X_test.loc[:, (~X_train_too_correlated)]
    
    if verbose:
        print(f"{X_train.shape[1]-X_train_selected_features.shape[1]} features removed because of correlation with another feature > {threshold} ({100*(X_train.shape[1]-X_train_selected_features.shape[1])/X_train.shape[1]:.2f}%).")

    return X_train_selected_features, X_test_selected_features

In [38]:
def remove_useless_features(X_train, y_train, X_test, selector_type, stat, verbose=1):
    # See https://stats.stackexchange.com/questions/328358/fpr-fdr-and-fwe-for-feature-selection
    # and https://scikit-learn.org/stable/modules/feature_selection.html#univariate-feature-selection
    
    assert selector_type in ["fpr", "fdr", "fwe"], "Unrecognised selector type"
    assert stat in [f_classif, chi2, mutual_info_classif], "Unrecognised stat"
    selector = None
    if selector_type == "fpr":
        selector = SelectFpr(stat)
    elif selector_type == "fdr":
        selector = SelectFdr(stat)
    elif selector_type == "fwe":
        selector = SelectFwe(stat)
    
    selector.fit(X_train, y_train)
    X_train_selected_features = pd.DataFrame(selector.transform(X_train))
    X_test_selected_features = pd.DataFrame(selector.transform(X_test))
    
    if verbose:
        print(f"{X_train.shape[1]-X_train_selected_features.shape[1]} features removed because of complicated p-value stuff we don't understand ({100*(X_train.shape[1]-X_train_selected_features.shape[1])/X_train.shape[1]:.2f}%).")
        
    return X_train_selected_features, X_test_selected_features

---
## Model definition

In [14]:
def grid_search(model, params, X_train, y_train, clear=False):
    gs = GridSearchCV(model, params, cv=5, verbose=3, scoring='f1_micro', error_score='raise')
    gs.fit(X_train, y_train)
    
    if clear:
        clear_output(wait=True)
    print(f"{type(gs.best_estimator_).__name__} best validation score is {gs.best_score_:.5f} +- {gs.cv_results_['std_test_score'][gs.best_index_]:.5f},\nobtained with {gs.best_params_}")
    
    return gs

---
## Main Pipeline

### Load dataset

In [15]:
X_train_raw, y_train_raw, X_test_raw = load_from_csv()

### Extract features

In [16]:
X_train = X_train_raw.progress_apply(extract_features, axis=1)

  0%|          | 0/5117 [00:00<?, ?it/s]

In [17]:
X_test = X_test_raw.progress_apply(extract_features, axis=1)

  0%|          | 0/3411 [00:00<?, ?it/s]

In [18]:
print(f"X_train has {X_train.isna().sum().sum()} null values.")
print(f"X_test has {X_test.isna().sum().sum()} null values.")

X_train has 0 null values.
X_test has 0 null values.


In [19]:
y_train = np.array(y_train_raw).ravel()

### Standardize data

In [20]:
X_train, X_test = standardize_data(X_train, X_test)

### Select features

In [21]:
X_train_1, X_test_1 = remove_constant_features(X_train, X_test)

0 features removed because of constant values (0.00%).


In [23]:
X_train_2, X_test_2 = remove_too_correlated_features(X_train_1, X_test_1, threshold=0.98)

8 features removed because of correlation with another feature > 0.98 (15.69%).


In [41]:
X_train_3, X_test_3 = remove_useless_features(X_train_2, y_train, X_test_2, selector_type="fpr", stat=f_classif)

2 features removed because of complicated p-value stuff we don't understand (4.65%).


### Train models

In [54]:
gs_svc = grid_search(SVC(),
                     {
                         "kernel": ["rbf", "poly"], #["rbf", "poly", "sigmoid"],
                         "C": [1, 10, 50, 100],
                         "class_weight": ["balanced", None],
                         "random_state": [0],
                     },
                     X_train_3,
                     y_train,
                     clear=True)

SVC best validation score is 0.76256 +- 0.00560,
obtained with {'C': 10, 'class_weight': None, 'kernel': 'rbf', 'random_state': 0}


In [55]:
gs_random_forest = grid_search(RandomForestClassifier(),
                               {
                                   "n_estimators": np.arange(100, 300, 200),
                                   "max_depth": [None], #np.arange(2, 8, 1),
                                   "min_samples_split": [2, 4], #np.arange(2, 8, 1),
                                   "min_samples_leaf": [1, 4], #np.arange(1, 9, 2),
                                   "class_weight": ["balanced", None],
                                   "random_state": [0], 
                               },
                               X_train_3,
                               y_train,
                               clear=True)

RandomForestClassifier best validation score is 0.78132 +- 0.01226,
obtained with {'class_weight': 'balanced', 'max_depth': None, 'min_samples_leaf': 1, 'min_samples_split': 4, 'n_estimators': 100, 'random_state': 0}


In [56]:
gs_gbc = grid_search(GradientBoostingClassifier(),
                     {
                         "loss": ["deviance"],
                         "learning_rate": [0.1, 1],
                         "n_estimators": [50, 200],
                         "subsample": [0.7, 1],
                         "criterion": ["squared_error"], #["friedman_mse", "mse"],
                         "min_samples_split": [4], #[2, 4],
                         "min_samples_leaf": [3], #[1, 3],
                         "n_iter_no_change": [None],
                         "tol": [1e-4],
                     },
                     X_train_3, 
                     y_train,
                     clear=True)

GradientBoostingClassifier best validation score is 0.78953 +- 0.00390,
obtained with {'criterion': 'squared_error', 'learning_rate': 0.1, 'loss': 'deviance', 'min_samples_leaf': 3, 'min_samples_split': 4, 'n_estimators': 200, 'n_iter_no_change': None, 'subsample': 0.7, 'tol': 0.0001}


In [None]:
# No predict_proba
gs_knn = grid_search(KNeighborsClassifier(),
                     {
                         "n_neighbors": np.arange(2, 10, 1),
                         "weights": ["uniform", "distance"],
                     },
                     X_train, 
                     y_train,
                     clear=True)

In [None]:
# Takes 1min to train a single instance, but scores of ~0.7...
gs_gp = grid_search(GaussianProcessClassifier(),
                    {
                        "kernel": [None],
                        "multi_class": ["one_vs_rest", "one_vs_one"],
                        "random_state": [0],
                    },
                    X_train, 
                    y_train,
                    clear=True)

In [57]:
gs_ab = grid_search(AdaBoostClassifier(),
                    {
                        "base_estimator": [None],
                        "n_estimators": [50, 300],
                        "learning_rate": [0.1, 1.0],
                        "random_state": [0],
                    },
                    X_train_3, 
                    y_train,
                    clear=True)

AdaBoostClassifier best validation score is 0.72210 +- 0.00570,
obtained with {'base_estimator': None, 'learning_rate': 0.1, 'n_estimators': 300, 'random_state': 0}


In [58]:
gs_etc = grid_search(ExtraTreesClassifier(),
                    {
                        "n_estimators": [50, 100, 500],
                        "criterion": ["gini", "entropy"],
                        "bootstrap": [True, False],
                        "class_weight": ["balanced", "balanced_subsample"],
                        "random_state": [0],
                    },
                    X_train_3, 
                    y_train,
                    clear=True)

ExtraTreesClassifier best validation score is 0.77565 +- 0.00596,
obtained with {'bootstrap': False, 'class_weight': 'balanced', 'criterion': 'gini', 'n_estimators': 500, 'random_state': 0}


In [None]:
# No predict_proba
gs_ridge = grid_search(RidgeClassifier(),
                       {
                           "alpha": [0.1, 1, 5],
                           "class_weight": ["balanced", None],
                           "random_state": [0],
                       },
                       X_train,
                       y_train,
                       clear=True)

In [59]:
gs_logreg = grid_search(LogisticRegression(),
                        {
                            "penalty": ["l2"],
                            "C": [0.1, 1, 10],
                            "class_weight": ["balanced", None],
                            "random_state": [0],
                            "max_iter": [1000]
                        },
                        X_train_3,
                        y_train,
                        clear=True)

LogisticRegression best validation score is 0.69064 +- 0.00282,
obtained with {'C': 10, 'class_weight': None, 'max_iter': 1000, 'penalty': 'l2', 'random_state': 0}


In [60]:
gs_mlp = grid_search(MLPClassifier(),
                     {
                         "hidden_layer_sizes": [(100,), (40, 40, 30, 10), (40, 40), (100, 30, 10)],
                         "alpha": [0.0001],
                         "solver": ["adam"],
                         "early_stopping": [True],
                         "random_state": [0],
                     },
                     X_train_3,
                     y_train,
                     clear=True)

MLPClassifier best validation score is 0.74458 +- 0.00432,
obtained with {'alpha': 0.0001, 'early_stopping': True, 'hidden_layer_sizes': (100, 30, 10), 'random_state': 0, 'solver': 'adam'}


In [61]:
# Only kept models with a predict proba function for soft voting + > 0.75 val score
gs_ensemble = grid_search(VotingClassifier([('svc', SVC(probability=True, **gs_svc.best_params_)),
                                            ('rf', RandomForestClassifier(**gs_random_forest.best_params_)),
                                            ('gbc', GradientBoostingClassifier(**gs_gbc.best_params_)),
                                            ('etc', ExtraTreesClassifier(**gs_etc.best_params_))]),
                          {
                              "voting": ["soft"], #["hard", "soft"],
                          },
                          X_train_3, 
                          y_train,
                          clear=True)

VotingClassifier best validation score is 0.79324 +- 0.00911,
obtained with {'voting': 'soft'}


---
## Generate new submission

In [63]:
model = gs_ensemble
sub_id = 14
prediction = pd.DataFrame(model.predict(X_test_3))

In [64]:
create_submission(sub_id, prediction)

**Solutions must be submitted on the [project website](https://aml.ise.inf.ethz.ch/task2/).**