In [1]:
import numpy as np
import pandas as pd 
import os
from sktime.datasets import load_from_arff_to_dataframe
from pyts.utils import windowed_view
from scipy.signal import find_peaks
from scipy.stats import skew, kurtosis
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from xgboost import XGBClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import VarianceThreshold
from sklearn.metrics import accuracy_score, classification_report, precision_recall_fscore_support

# Feature extraction

In [None]:
def feature_extraction(data, apply_fft=True):

    def aux_extract(data):
        features = []
        features.append(data.mean(1))
        features.append(data.std(1))
        features.append(np.abs(data - data.mean(1)[:, np.newaxis, :]).mean(1))
        features.append(data.min(1))
        features.append(data.max(1))
        features.append(data.max(1) - data.min(1))

        features.append(np.median(data, axis=1))
        features.append(np.median(np.abs(data - np.median(data,1)[:, np.newaxis, :]),1))
        features.append(np.subtract(*np.percentile(data,[75, 25],1)))
        features.append(np.count_nonzero(data < 0, 1))
        features.append(np.count_nonzero(data >= 0, 1))
        features.append(np.count_nonzero(data > data.mean(1)[:, np.newaxis, :], 1))

        features.append(np.apply_along_axis(lambda x: len(find_peaks(x)[0]), 1, data))
        features.append(skew(data,axis=1))
        features.append(kurtosis(data, axis=1))
        features.append(np.mean(data**2,axis=1))
        features.append(np.mean(np.sqrt(np.sum(data**2,axis=-1)),axis=1)[:, np.newaxis])
        features.append(np.sum(np.mean(np.abs(data),axis=1),axis=1)[:, np.newaxis])

        features = np.concatenate(features,axis=1)

        return features
    
    seq_len = data.shape[1]
    fft_data = np.abs(np.fft.fft(data,axis=1))[:, 1:(seq_len//2+1), :]

    result = aux_extract(data)
    if apply_fft:
        result = np.concatenate([result, aux_extract(fft_data)],axis=1)

    return result

In [None]:
def feature_extraction_window(data, window_size=10, window_step=3, apply_fft=True):

    data = np.stack(
        [
            windowed_view(data[:,:,idx], window_size, window_step)
            for idx in range(data.shape[-1])
        ],
        axis=-1
    )

    def aux_extract(data):
        n_samples = data.shape[0]

        features = []

        features.append(data.mean(2).reshape(n_samples,-1))
        features.append(data.std(2).reshape(n_samples,-1))
        features.append(np.abs(data - data.mean(2)[:, :, np.newaxis, :]).mean(2).reshape(n_samples,-1))
        features.append(data.min(2).reshape(n_samples,-1))
        features.append(data.max(2).reshape(n_samples,-1))
        features.append(data.max(2).reshape(n_samples,-1) - data.min(2).reshape(n_samples,-1))

        features.append(np.median(data, axis=2).reshape(n_samples,-1))
        features.append(np.median(np.abs(data - np.median(data,2)[:, :, np.newaxis, :]),2).reshape(n_samples,-1))
        features.append(np.subtract(*np.percentile(data,[75, 25],2)).reshape(n_samples,-1))
        features.append(np.count_nonzero(data < 0, 2).reshape(n_samples,-1))
        features.append(np.count_nonzero(data >= 0, 2).reshape(n_samples,-1))
        features.append(np.count_nonzero(data > data.mean(2)[:, :, np.newaxis, :], 2).reshape(n_samples,-1))

        features.append(np.apply_along_axis(lambda x: len(find_peaks(x)[0]), 2, data).reshape(n_samples,-1))
        # features.append(skew(data,axis=2).reshape(n_samples,-1))
        # features.append(kurtosis(data, axis=2).reshape(n_samples,-1))
        features.append(np.mean(data**2,axis=2).reshape(n_samples,-1))
        features.append(np.mean(np.sqrt(np.sum(data**2,axis=-1)),axis=2)[:, :, np.newaxis].reshape(n_samples,-1))
        features.append(np.sum(np.mean(np.abs(data),axis=2),axis=2)[:, :, np.newaxis].reshape(n_samples,-1))

        features = np.concatenate(features,axis=1)

        return features
    
    seq_len = data.shape[2]
    fft_data = np.abs(np.fft.fft(data, axis=2))[:, :, 1:(seq_len//2+1), :]
    result = aux_extract(data)
    if apply_fft:
        result = np.concatenate([result, aux_extract(fft_data)],axis=1)
    return result

# Read data + convert numpy

In [163]:
DATASET = ["RacketSports", "MITBIH", "PTBDB"]
DATA_PATH = "data"
LABEL_COL = 187

In [3]:
def dataframe2numpy(X):
    N = len(X)
    S = len(X.iloc[0][0])
    H = len(X.columns)
    return np.stack(X.values.reshape(-1)).reshape(N,S,H)

In [162]:
if DATASET == "RacketSports":

    X_train, y_train = load_from_arff_to_dataframe(
        os.path.join(DATA_PATH, "RacketSports/RacketSports_TRAIN.arff")
    )

    X_test, y_test = load_from_arff_to_dataframe(
        os.path.join(DATA_PATH, "RacketSports/RacketSports_TEST.arff")
    )

    rs_train = dataframe2numpy(X_train)
    rs_test = dataframe2numpy(X_test)

    label2id = {el:i for i, el in enumerate(list(np.unique(y_train)))}

    target_train = pd.Series(y_train).apply(lambda x:label2id[x]).values
    target_test = pd.Series(y_test).apply(lambda x:label2id[x]).values

    train_features = feature_extraction_window(rs_train)
    test_features = feature_extraction_window(rs_test)

elif DATASET == "MITBIH":

    mit_bih_train = pd.read_csv(os.path.join("data","ECG","mitbih_train.csv"),header=None)
    target_train = mit_bih_train[LABEL_COL].copy().values
    mit_bih_train.drop(LABEL_COL,axis=1,inplace=True)

    mit_bih_test = pd.read_csv(os.path.join("data","ECG","mitbih_test.csv"),header=None)
    target_test = mit_bih_test[LABEL_COL].copy().values
    mit_bih_test.drop(LABEL_COL,axis=1,inplace=True)

    mitbih_train = mit_bih_train.values
    mitbih_test = mit_bih_test.values

    train_features = feature_extraction_window(mitbih_train[:,:,np.newaxis],20,8)
    test_features = feature_extraction_window(mitbih_test[:,:,np.newaxis],20,8)

elif DATASET == "PTBDB":
    
    abnormal = pd.read_csv(os.path.join("data","ECG","ptbdb_abnormal.csv"),header=None)
    normal = pd.read_csv(os.path.join("data","ECG","ptbdb_normal.csv"),header=None)

    train_abn, test_abn = train_test_split(abnormal, test_size=0.2, random_state=42)
    train_nor, test_nor = train_test_split(normal, test_size=0.2, random_state=42)

    ptbdb_train = pd.concat([train_abn, train_nor]).sample(frac=1, random_state=42).reset_index(drop=True)
    ptbdb_test = pd.concat([test_abn, test_nor]).sample(frac=1, random_state=42).reset_index(drop=True)

    target_train = ptbdb_train[LABEL_COL].copy()
    ptbdb_train.drop(LABEL_COL,axis=1,inplace=True)

    target_test = ptbdb_test[LABEL_COL].copy()
    ptbdb_test.drop(LABEL_COL,axis=1,inplace=True)

    ptbdb_train = ptbdb_train.values
    ptbdb_test = ptbdb_test.values

    train_features = feature_extraction_window(ptbdb_train[:,:,np.newaxis],20,8)
    test_features = feature_extraction_window(ptbdb_test[:,:,np.newaxis],20,8)
    


# Feature selection & standardization

In [69]:
selecter = VarianceThreshold(2)
scaler = StandardScaler()

In [70]:
selecter = selecter.fit(train_features)
selected_train = selecter.transform(train_features)
selected_test = selecter.transform(test_features)

In [71]:
scaler = scaler.fit(selected_train)
scaled_train = scaler.transform(selected_train)
scaled_test = scaler.transform(selected_test)

In [72]:
scaled_train.shape

(151, 878)

# Hyperparameter search

In [73]:
MODEL_NAME = "xgb" # ["svm", "random_forest", "xgb"]
MODEL = None
MODEL_PARAMS = None

In [74]:
if MODEL_NAME == "svm":
    MODEL = SVC()
    MODEL_PARAMS = {
        "kernel":["linear","poly","rbf"],
        "C":[1.0,5.0,10.0,20.0]
    }

elif MODEL_NAME == "random_forest":
    MODEL = RandomForestClassifier()
    MODEL_PARAMS = {
        "n_estimators":range(40,301,20),
        "max_depth":[3, 5, 8, 12],
        "max_samples":[0.4, 0.7, 1.0]
    }

elif MODEL_NAME == "xgb":
    MODEL = XGBClassifier()
    MODEL_PARAMS = {
        "n_estimators":range(40,101,20),
        "max_depth":[3, 5, 8],
        "learning_rate":[1e-3, 1e-2, 1e-1]
    }

else:
    print("Untested model")

In [75]:
clf = GridSearchCV(MODEL, MODEL_PARAMS,n_jobs=-1)

In [76]:
clf = clf.fit(scaled_train, target_train)

In [77]:
clf.best_params_

{'learning_rate': 0.1, 'max_depth': 5, 'n_estimators': 100}

# Evaluation

In [None]:
results_df = []

for i, params in enumerate(pd.DataFrame(clf.cv_results_)["params"]):
    if MODEL_NAME == "svm":
        curr_model = SVC(**params)
    elif MODEL_NAME == "random_forest":
        curr_model = RandomForestClassifier(**params)
    elif MODEL_NAME == "xgb":
        curr_model = XGBClassifier(**params)
    
    curr_model = curr_model.fit(scaled_train, target_train)

    y_true = target_test
    y_pred = curr_model.predict(scaled_test)


    row = [params]
    
    accuracy = accuracy_score(y_true, y_pred)
    row.append(accuracy)
    res = list(map(np.mean, precision_recall_fscore_support(y_true, y_pred)))
    res2 = list(map(np.std, precision_recall_fscore_support(y_true, y_pred)))
    
    mean_std = zip(res[:-1], res2[:-1])
    for mean_std_tuple in mean_std:
        row.extend(list(mean_std_tuple))

    results_df.append(row)
    print(i)
    if i == 5:
        break


In [98]:
columns = [
    "Hyperparameters",
    "Accuracy",
    "Mean precision",
    "Std precision",
    "Mean recall",
    "Std recall",
    "Mean f1",
    "Std f1"
]

pd.DataFrame(results_df,columns=columns).round(3)

Unnamed: 0,Hyperparameters,Mean precision,Std precision,Mean recall,Std recall,Mean f1,Std f1
0,"{'learning_rate': 0.001, 'max_depth': 3, 'n_es...",0.503,0.108,0.491,0.061,0.495,0.08
1,"{'learning_rate': 0.001, 'max_depth': 3, 'n_es...",0.503,0.108,0.491,0.061,0.495,0.08
2,"{'learning_rate': 0.001, 'max_depth': 3, 'n_es...",0.515,0.114,0.506,0.08,0.51,0.095
3,"{'learning_rate': 0.001, 'max_depth': 3, 'n_es...",0.512,0.105,0.506,0.08,0.509,0.091
4,"{'learning_rate': 0.001, 'max_depth': 5, 'n_es...",0.543,0.107,0.522,0.101,0.526,0.093
5,"{'learning_rate': 0.001, 'max_depth': 5, 'n_es...",0.532,0.094,0.507,0.077,0.513,0.067


In [160]:
res = classification_report(target_test, clf.predict(scaled_test))
print(res)

              precision    recall  f1-score   support

         0.0       0.95      0.87      0.91       810
         1.0       0.95      0.98      0.97      2102

    accuracy                           0.95      2912
   macro avg       0.95      0.92      0.94      2912
weighted avg       0.95      0.95      0.95      2912

