In [26]:
import pandas as pd
import numpy as np

import os
from pathlib import Path

import plotly
import plotly.graph_objs as go

from tsfresh import extract_features, select_features
from tsfresh.feature_extraction.settings import MinimalFCParameters, from_columns
from tsfresh.utilities.dataframe_functions import impute

from sklearn.model_selection import train_test_split
from sklearn.metrics import balanced_accuracy_score

from catboost import CatBoostClassifier

In [2]:
# project root: the current working directory (an absolute path);
# data files are resolved relative to it
path = Path.cwd()

In [3]:
# read the data: one record per row, indexed by the record name
# NOTE(review): index_col=[-2] picks the index column by a negative position
# (presumably the second-to-last CSV column holds the record name) — confirm
# this is intended and supported by the installed pandas version
path_to_data = os.path.join(path, 'data', 'physionet2017.csv')
df = pd.read_csv(path_to_data, index_col=[-2])
df.head()


Unnamed: 0_level_0,0,1,2,3,4,5,6,7,8,9,...,1991,1992,1993,1994,1995,1996,1997,1998,1999,label
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
A00/A00001,0.035032,0.037155,0.044586,0.063694,0.076433,0.085987,0.089172,0.083864,0.072187,0.061571,...,0.02017,0.008493,0.0,-0.007431,-0.012739,-0.015924,-0.019108,-0.023355,-0.022293,0
A00/A00002,-0.035288,-0.032573,-0.030945,-0.029859,-0.031488,-0.034202,-0.037459,-0.040717,-0.043974,-0.047231,...,-0.002714,-0.001629,-0.001086,-0.000543,-0.000543,0.0,0.0,0.000543,0.001086,0
A00/A00003,-0.303922,-0.261438,-0.222222,-0.19281,-0.176471,-0.163399,-0.147059,-0.130719,-0.117647,-0.107843,...,-0.339869,-0.346405,-0.339869,-0.323529,-0.297386,-0.264706,-0.20915,-0.117647,-0.065359,0
A00/A00004,0.109467,0.117604,0.128698,0.142012,0.153107,0.161982,0.170118,0.176036,0.181213,0.184911,...,0.846154,0.780325,0.640533,0.467456,0.298077,0.16568,0.085799,0.012574,0.013314,1
A00/A00005,-0.019856,-0.017148,-0.01444,-0.011733,-0.009928,-0.008123,-0.006318,-0.004513,-0.00361,-0.001805,...,-0.347473,-0.306859,-0.26083,-0.214801,-0.168773,-0.124549,-0.083935,-0.051444,0.0,1


In [4]:
# shape, dtypes and memory footprint of the loaded frame
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 8528 entries, A00/A00001 to A08/A08528
Columns: 2001 entries, 0 to label
dtypes: float64(2000), int64(1)
memory usage: 130.3+ MB


In [5]:
# class distribution: number of records per label value
df['label'].value_counts()

0    5076
2    2415
1     758
3     279
Name: label, dtype: int64

In [6]:
# human-readable class names keyed by the integer label value
info_dict = {0: 'Normal', 1: 'AF', 2: 'Other', 3: 'Noise'}

In [None]:
# NOTE: this cell is disabled (fully commented out). When enabled it applies
# `visual` to every row of df and writes one HTML plot per record into the
# 'figures' directory, titled with the record's class name.
# # data visualization
# def visual(row):
#     name = row.name.replace('/', '_')
#     row = row.values
#     fig = go.Figure()
#     fig.add_trace(go.Scatter(y=row[:-1]))
#     fig.update_layout(title={
#                             'text': info_dict[row[-1]],
#                             'font_size': 24,
#                             'y':0.9,
#                             'x':0.5})
#     fig.write_html(os.path.join('figures', f'{name}.html'))

# df.apply(visual, axis=1)

In [8]:
# labels visualization: plot one example signal per class
# The loop variable is deliberately NOT called `label`: that name is used for
# the target Series elsewhere in this notebook, and re-running this cell would
# otherwise silently replace the Series with a scalar.
for class_id in df['label'].unique():
    class_rows = df[df['label'] == class_id]
    # second record of the class; the last column is the label, so it is cut off
    example_signal = class_rows.iloc[1, :-1]
    fig = go.Figure()
    fig.add_trace(go.Scatter(y=example_signal))
    fig.update_layout(title=info_dict[class_id])
    fig.show()

In [9]:
# check NaNs: total count of missing values over the whole frame
df.isna().sum().sum()

0

In [10]:
# select features and target
# `columns` takes a single label or a list-like; a plain string is used here
# instead of a set literal, which is unordered and not a documented input
features_df = df.drop(columns='label')
label = df['label']
features_df.head()

Unnamed: 0_level_0,0,1,2,3,4,5,6,7,8,9,...,1990,1991,1992,1993,1994,1995,1996,1997,1998,1999
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
A00/A00001,0.035032,0.037155,0.044586,0.063694,0.076433,0.085987,0.089172,0.083864,0.072187,0.061571,...,0.038217,0.02017,0.008493,0.0,-0.007431,-0.012739,-0.015924,-0.019108,-0.023355,-0.022293
A00/A00002,-0.035288,-0.032573,-0.030945,-0.029859,-0.031488,-0.034202,-0.037459,-0.040717,-0.043974,-0.047231,...,-0.0038,-0.002714,-0.001629,-0.001086,-0.000543,-0.000543,0.0,0.0,0.000543,0.001086
A00/A00003,-0.303922,-0.261438,-0.222222,-0.19281,-0.176471,-0.163399,-0.147059,-0.130719,-0.117647,-0.107843,...,-0.330065,-0.339869,-0.346405,-0.339869,-0.323529,-0.297386,-0.264706,-0.20915,-0.117647,-0.065359
A00/A00004,0.109467,0.117604,0.128698,0.142012,0.153107,0.161982,0.170118,0.176036,0.181213,0.184911,...,0.844675,0.846154,0.780325,0.640533,0.467456,0.298077,0.16568,0.085799,0.012574,0.013314
A00/A00005,-0.019856,-0.017148,-0.01444,-0.011733,-0.009928,-0.008123,-0.006318,-0.004513,-0.00361,-0.001805,...,-0.381769,-0.347473,-0.306859,-0.26083,-0.214801,-0.168773,-0.124549,-0.083935,-0.051444,0.0


In [11]:
# split off the hold-out validation data (10 % of the records)
# random_state makes the split reproducible across kernel restarts;
# stratify keeps the imbalanced class proportions equal in both parts
X, X_val, y, y_val = train_test_split(
    features_df,
    label,
    test_size=0.1,
    shuffle=True,
    random_state=42,
    stratify=label,
)

In [14]:
# persist the hold-out set; the target directory is created first so the
# cell also works on a fresh checkout where 'validation' does not exist yet
validation_dir = 'validation'
os.makedirs(validation_dir, exist_ok=True)
X_val.to_csv(os.path.join(validation_dir, 'data.csv'))
y_val.to_csv(os.path.join(validation_dir, 'target.csv'))

## Applying the tsfresh library to analyze the data

In [12]:
# long-format data: one row per sample value, where column 0 holds the value
# and column 1 holds the name of the record the value belongs to
n_samples_per_record = features_df.shape[1]
signal_values = X.values.flatten()
record_ids = X.index.repeat(n_samples_per_record)
data_long = pd.DataFrame({0: signal_values, 1: record_ids})
data_long.head()

Unnamed: 0,0,1
0,-0.041031,A05/A05368
1,-0.041031,A05/A05368
2,-0.041985,A05/A05368
3,-0.044847,A05/A05368
4,-0.04771,A05/A05368


In [15]:
# calculate features: tsfresh's minimal feature set, grouped by the record
# name in column 1, imputed, and finally ordered by record name
extracted_features = extract_features(
    data_long,
    column_id=1,
    default_fc_parameters=MinimalFCParameters(),
    impute_function=impute,
)
extracted_features = extracted_features.sort_index()
extracted_features.head()

Feature Extraction: 100%|██████████| 40/40 [00:03<00:00, 11.07it/s]


Unnamed: 0,0__sum_values,0__median,0__mean,0__length,0__standard_deviation,0__variance,0__root_mean_square,0__maximum,0__absolute_maximum,0__minimum
A00/A00001,54.674098,-0.001062,0.027337,2000.0,0.161908,0.026214,0.164199,1.0,1.0,-0.16879
A00/A00002,17.956569,-0.005429,0.008978,2000.0,0.097265,0.00946,0.097678,1.0,1.0,-0.282302
A00/A00003,-37.196078,0.086601,-0.018598,2000.0,0.433558,0.187972,0.433957,1.0,2.578431,-2.578431
A00/A00004,55.476331,-0.018121,0.027738,2000.0,0.156104,0.024369,0.15855,1.0,1.0,-0.10355
A00/A00005,42.030686,-0.018051,0.021015,2000.0,0.168624,0.028434,0.169929,1.0,1.0,-0.443141


In [16]:
# split to the test data
# Align the target with the feature rows explicitly (by record name) instead
# of relying on two independent sorts happening to produce the same order.
# Re-running this cell gives the same result.
y = y.loc[extracted_features.index]
assert extracted_features.index.equals(y.index), 'features and target are misaligned'
# random_state makes the split reproducible; stratify preserves the class
# proportions of the imbalanced target in both parts
X_train, X_test, y_train, y_test = train_test_split(
    extracted_features,
    y,
    test_size=0.3,
    shuffle=True,
    random_state=42,
    stratify=y,
)

In [17]:
# select important features: the selection sees the training part only
selected_train = select_features(X=X_train, y=y_train)
X_train = selected_train
X_train.head()

Unnamed: 0,0__minimum,0__median,0__root_mean_square,0__absolute_maximum,0__standard_deviation,0__variance,0__sum_values,0__mean
A01/A01026,-0.311745,-0.020307,0.158488,1.0,0.15757,0.024828,34.061471,0.017031
A03/A03211,-0.608345,-0.04576,0.226977,1.0,0.226897,0.051482,12.079408,0.00604
A00/A00972,-0.38574,-0.032907,0.176849,1.0,0.17638,0.03111,25.744973,0.012872
A01/A01876,-0.757962,-0.133758,0.356975,1.0,0.356555,0.127131,-34.633758,-0.017317
A07/A07691,-1.036446,-0.04328,0.242947,1.036446,0.24236,0.058738,-33.758542,-0.016879


In [27]:
# tsfresh settings dict describing the selected features.
# Built from X_train (already reduced by select_features): at this point of a
# top-to-bottom run X_test still holds every extracted feature.
from_columns(X_train)

{'0': {'minimum': None,
  'median': None,
  'root_mean_square': None,
  'absolute_maximum': None,
  'standard_deviation': None,
  'variance': None,
  'sum_values': None,
  'mean': None}}

In [23]:
# names of the features kept in the training data
X_train.columns

Index(['0__minimum', '0__median', '0__root_mean_square', '0__absolute_maximum',
       '0__standard_deviation', '0__variance', '0__sum_values', '0__mean'],
      dtype='object')

In [18]:
# restrict the test data to the columns present in the training data
selected_columns = X_train.columns
X_test = X_test.loc[:, selected_columns]
X_test

Unnamed: 0,0__minimum,0__median,0__root_mean_square,0__absolute_maximum,0__standard_deviation,0__variance,0__sum_values,0__mean
A01/A01808,-0.662712,-0.017373,0.224498,1.000000,0.223109,0.049777,49.878814,0.024939
A06/A06944,-1.007764,-0.034161,0.284732,1.007764,0.280524,0.078694,-97.532609,-0.048766
A07/A07845,-2.084249,0.007326,0.357960,2.084249,0.356789,0.127298,-57.882784,-0.028941
A01/A01227,-0.326121,-0.041731,0.151107,1.000000,0.150902,0.022771,15.748068,0.007874
A07/A07792,-0.258838,-0.010101,0.137078,1.000000,0.136395,0.018604,27.335859,0.013668
...,...,...,...,...,...,...,...,...
A03/A03246,-0.719101,-0.031835,0.198472,1.000000,0.198274,0.039313,17.737828,0.008869
A05/A05690,-7.435897,0.136752,1.090808,7.435897,1.079313,1.164916,-315.880342,-0.157940
A00/A00436,-0.298182,-0.007273,0.134967,1.000000,0.134343,0.018048,25.916364,0.012958
A06/A06493,-0.264875,-0.018234,0.173913,1.000000,0.172790,0.029856,39.474088,0.019737


### CatBoost

In [19]:
# init, train and predict model
# random_seed pins CatBoost's internal randomness so the score below is
# reproducible across kernel restarts
model = CatBoostClassifier(iterations=100, verbose=False, random_seed=42)
model.fit(X_train, y_train)
# flatten the predictions so the metric always receives a 1-D label vector
# (NOTE(review): CatBoost appears to return an (n, 1) array for multi-class
# models — np.ravel is a no-op if the result is already 1-D)
y_pred = np.ravel(model.predict(X_test))
balanced_accuracy_score(y_test, y_pred)

0.30696457254222004

In [22]:
# export the trained model; the format must be given explicitly, otherwise
# save_model writes its default format regardless of the '.onnx' file extension
model.save_model('CB_0.3.onnx', format='onnx')