# `AML — Task 2:` Heart rhythm classification from raw ECG signals
---

In [1]:
import numpy as np
import pandas as pd
import biosppy.signals.ecg as ecg
import biosppy.signals.tools as tools
import matplotlib.pyplot as plt

In [2]:
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import f1_score
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer, KNNImputer

---
## Dataset import and export to `csv`

In [4]:
def load_from_csv(extension="", drop_id = True):
    """Load the training features, training labels and test features from `data/`.

    Reads `data/X_train<extension>.csv`, `data/y_train<extension>.csv` and
    `data/X_test<extension>.csv`, in that order.

    Parameters
    ----------
    extension : str
        Suffix inserted between the base file name and `.csv`
        (e.g. `"_cleaned"`, the default suffix used by `export_to_csv`).
    drop_id : bool
        If True, remove the `id` column from every frame that has one.

    Returns
    -------
    tuple of pd.DataFrame
        `(X_train, y_train, X_test)`.
    """
    frames = [
        pd.read_csv('data/' + name + extension + '.csv')
        for name in ('X_train', 'y_train', 'X_test')
    ]

    if drop_id:
        # errors='ignore': frames exported after a previous drop no longer carry
        # an `id` column, and re-loading them must not fail with a KeyError.
        frames = [frame.drop(columns=['id'], errors='ignore') for frame in frames]

    X_train, y_train, X_test = frames
    return X_train, y_train, X_test

In [5]:
def export_to_csv(X_train, y_train, X_test, extension="_cleaned"):
    """Write the three data sets to `data/<name><extension>.csv`.

    The files are written in the order X_train, y_train, X_test and the
    DataFrame index is not stored (`index=False`).
    """
    named_frames = (
        ('X_train', X_train),
        ('y_train', y_train),
        ('X_test', X_test),
    )
    for name, frame in named_frames:
        target = 'data/' + name + extension + '.csv'
        frame.to_csv(target, index=False)

## Submission export to `csv`

In [6]:
def create_submission(sub_id, pred, basepath='submissions/task2-sub'):
    """Write predictions to `<basepath><sub_id>.csv` with columns `id, y`.

    `pred` is a DataFrame whose prediction column is labelled `0`; that
    column is renamed to `y` and a running `id` (0 .. len(pred) - 1) is
    added. The input frame itself is left untouched.
    """
    n_rows = len(pred)
    submission = (
        pred
        .rename(columns={0: 'y'})
        .assign(id=range(0, n_rows))
        .loc[:, ['id', 'y']]
    )
    target = basepath + str(sub_id) + '.csv'
    submission.to_csv(target, index=False)

---
## Feature extraction

In [30]:
def extract_features(time_series: pd.Series, sampling_rate=300.0) -> np.array:
    """Summarise one ECG recording as a 7-element feature vector.

    NaN entries of `time_series` are dropped before the signal is handed to
    `ecg.ecg`. The returned array holds, in this order:

    0. mean of the filtered signal at the R peaks
    1. std of the filtered signal at the R peaks
    2. mean distance between consecutive R peaks
    3. std of the distance between consecutive R peaks
    4. mean heart rate
    5. std of the heart rate
    6. 0.35-quantile of the per-position std across templates
       (stand-in for a signal-to-noise estimate)

    Raises `AssertionError` when no more than one template is found.
    """
    signal = time_series.dropna()

    # Only the 2nd, 3rd, 5th and 7th entries of the 7-tuple are used
    ecg_output = ecg.ecg(signal, sampling_rate, show=False)
    _, filtered, rpeaks, _, templates, _, heart_rate = ecg_output
    # If this assert fails, let's code the use of another segmenter using the filtered ecg
    assert len(templates) > 1, 'ECG cannot have a single R peak'

    # Filtered-signal amplitude at every R peak position
    peak_amplitudes = np.take(filtered, rpeaks)

    #TODO: maybe change durations to seconds?
    # Differences of consecutive R peak positions (index units, not seconds)
    rr_intervals = np.diff(rpeaks)

    #TODO: Extract SNR ratio (http://www.cinc.org/archives/2011/pdf/0609.pdf)
    # For now, a low quantile of the template spread is used instead
    template_spread = np.std(templates, axis=0)

    # Not implemented yet: S peaks, Q peaks, QRS durations

    features = [
        np.mean(peak_amplitudes),
        np.std(peak_amplitudes),
        np.mean(rr_intervals),
        np.std(rr_intervals),
        np.mean(heart_rate),
        np.std(heart_rate),
        np.quantile(template_spread, 0.35),
    ]
    return np.array(features)

## Data standardization

In [32]:
def standardize_data(X_train, X_test):
    """Scale both feature sets with a `StandardScaler` fitted on `X_train` only.

    The scaler is fitted once on the training data and then reused for the
    test data, so both are transformed with the training statistics. No
    imputation is performed here; per the original note, NaN values are
    simply left to the scaler.

    Returns the tuple `(X_train_scaled, X_test_scaled)`.
    """
    scaler = StandardScaler()
    scaler.fit(X_train)
    return tuple(scaler.transform(part) for part in (X_train, X_test))

---
## Model definition

In [33]:
def best_svc(X_train, y_train):
    """Grid-search an `SVC` with 5-fold cross-validation and return the fitted search.

    Parameters
    ----------
    X_train : array-like of shape (n_samples, n_features)
        Training features.
    y_train : array-like
        Training labels; a single-column DataFrame is accepted and flattened.

    Returns
    -------
    GridSearchCV
        The fitted search object (kernel, C and class_weight are tuned).
    """
    svc = SVC()
    gs_svc_params = {
        'kernel': ['rbf', 'poly', 'sigmoid'],
        'C': np.logspace(0, 1, 2),
        'class_weight': ['balanced', None]
    }
    # `scoring` expects a scorer name or a callable taking (estimator, X, y);
    # the bare metric `f1_score(y_true, y_pred)` does not satisfy that contract.
    # Micro-averaged F1 works for binary and multi-class labels alike.
    # NOTE(review): confirm micro averaging matches the task's evaluation metric.
    gs_svc = GridSearchCV(svc, gs_svc_params, cv=5, verbose=3, scoring='f1_micro', error_score='raise')
    # Flatten the labels so a (n, 1) column frame is passed as a 1-D vector
    gs_svc.fit(X_train, np.ravel(y_train))

    # Report the winning combination, not the whole search grid
    print(f"The best validation score obtained is {gs_svc.best_score_:.5f} with\n\t{gs_svc.best_params_}")

    return gs_svc

## Main Pipeline

In [8]:
# Load the raw data with the default arguments (no file suffix, `id` column dropped)
X_train_raw, y_train_raw, X_test_raw = load_from_csv()

In [None]:
# One feature vector per recording (row). result_type='expand' spreads the
# returned array over columns, giving a 2-D feature table instead of a
# Series whose elements are arrays.
X_train = X_train_raw.apply(extract_features, axis=1, result_type='expand')

In [None]:
# Same feature extraction for the test recordings; result_type='expand'
# yields a 2-D feature table rather than a Series of arrays.
X_test = X_test_raw.apply(extract_features, axis=1, result_type='expand')

In [38]:
# `y_train` is never defined in this notebook; use the loaded labels,
# flattened to a 1-D vector, so the cell runs on a fresh kernel.
svc = best_svc(X_train, y_train_raw.values.ravel())

Fitting 5 folds for each of 12 candidates, totalling 60 fits
[CV 1/5] END C=1.0, class_weight=balanced, kernel=rbf;, score=0.642 total time=   1.3s
[CV 2/5] END C=1.0, class_weight=balanced, kernel=rbf;, score=0.638 total time=   1.3s
[CV 3/5] END C=1.0, class_weight=balanced, kernel=rbf;, score=0.653 total time=   1.2s
[CV 4/5] END C=1.0, class_weight=balanced, kernel=rbf;, score=0.674 total time=   1.2s
[CV 5/5] END C=1.0, class_weight=balanced, kernel=rbf;, score=0.636 total time=   1.2s
[CV 1/5] END C=1.0, class_weight=None, kernel=rbf;, score=0.703 total time=   0.9s
[CV 2/5] END C=1.0, class_weight=None, kernel=rbf;, score=0.698 total time=   0.9s
[CV 3/5] END C=1.0, class_weight=None, kernel=rbf;, score=0.680 total time=   0.9s
[CV 4/5] END C=1.0, class_weight=None, kernel=rbf;, score=0.718 total time=   0.9s
[CV 5/5] END C=1.0, class_weight=None, kernel=rbf;, score=0.691 total time=   0.9s
[CV 1/5] END C=3.9810717055349722, class_weight=balanced, kernel=rbf;, score=0.671 total 

In [39]:
# Use the search object returned by best_svc as the final model
model = svc
# Number appended to the submission file name
sub_id = 3
# Wrap the predictions in a DataFrame; its single column is labelled 0,
# which is the label create_submission renames to 'y'
prediction = pd.DataFrame(model.predict(X_test))

In [40]:
# Write the predictions to the default base path followed by sub_id and '.csv'
create_submission(sub_id, prediction)

**Solutions must be submitted on the [project website](https://aml.ise.inf.ethz.ch/task2/).**