In [1]:
# import the necessary modules
import pandas as pd
import numpy as np
import os
# wfdb (Waveform Database): a library of tools for reading, writing, and processing WFDB signals and annotations.
import wfdb 

%matplotlib inline
import matplotlib as mlp
import matplotlib.pyplot as plt

# Larger tick and axis label font sizes for every figure in this notebook.
mlp.rc("xtick",labelsize=12)
mlp.rc("ytick",labelsize=12)
mlp.rc("axes",labelsize=14)

# NOTE(review): absolute, machine-specific paths -- adjust them when running on another machine.
MAIN_PATH = "/home/moon/gitWorkspace/MachineLearning/ECG-Arrythmia"
DATA_PATH = "/home/moon/gitWorkspace/MachineLearning/ECG-Arrythmia/mit-database"
FIG_PATH="/home/moon/gitWorkspace/MachineLearning/ECG-Arrythmia"
FIGURE_PATH = os.path.join(FIG_PATH,"saved_figures")
# Create the directory that save_fig() writes to. Creating the relative path
# "saved_figures" would depend on the directory the kernel was started in and
# could leave FIGURE_PATH missing.
os.makedirs(FIGURE_PATH,exist_ok=True)
# The wfdb calls in this notebook pass bare record names (e.g. "100"), so the
# working directory is switched to the database folder.
os.chdir(DATA_PATH)

#a function to save plotted figures
def save_fig(fig_id, tight_layout=True, fig_extension="png", resolution=300):
    """Save the current matplotlib figure into FIGURE_PATH.

    Parameters
    ----------
    fig_id : str
        Base file name without extension; also echoed in the progress message.
    tight_layout : bool, default True
        When true, ``plt.tight_layout()`` is called before saving.
    fig_extension : str, default "png"
        File extension; also passed to ``plt.savefig`` as ``format``.
    resolution : int, default 300
        Passed to ``plt.savefig`` as ``dpi``.
    """
    file_name = ".".join((fig_id, fig_extension))
    destination = os.path.join(FIGURE_PATH, file_name)
    print("Saving figure", fig_id)
    if tight_layout:
        plt.tight_layout()
    plt.savefig(destination, format=fig_extension, dpi=resolution)


In [2]:
# Read record "100" as an example; the record name is resolved relative to the
# current working directory. The result is indexed as sample[0][:, k] in the
# next cell, i.e. its first element is the signal array.
sample = wfdb.rdsamp("100")


In [3]:
# Pull the first two columns of the example record's signal array.
# NOTE(review): the names suggest column 0 is lead MLII and column 1 is lead V5
# -- confirm against the record header.
sample_MLII, sample_V5 = sample[0][:, 0], sample[0][:, 1]


 ## Importing and Preparing Data

In [4]:
# The RECORDS file lists the names of all records (one per patient recording).
records_path = os.path.join(DATA_PATH, "RECORDS")
records = np.loadtxt(records_path, dtype=str)
print(records)
print(len(records))

['100' '101' '102' '103' '104' '105' '106' '107' '108' '109' '111' '112'
 '113' '114' '115' '116' '117' '118' '119' '121' '122' '123' '124' '200'
 '201' '202' '203' '205' '207' '208' '209' '210' '212' '213' '214' '215'
 '217' '219' '220' '221' '222' '223' '228' '230' '231' '232' '233' '234']
48


In [5]:
def read_data():
    """Load the physical signal array of every record listed in ``records``.

    Returns
    -------
    list
        The ``p_signal`` attribute of each record read with ``wfdb.rdrecord``,
        in the same order as ``records``.
    """
    return [wfdb.rdrecord(record_name).p_signal for record_name in records]

In [6]:
# Load the signals of all records into memory (one array per record).
dataset = read_data()


## Valid and Invalid beats 

In [7]:
# Annotation symbols that are NOT used as beat labels in this notebook.
# NOTE(review): this list is not referenced by any code visible in this
# notebook -- confirm whether it is still needed.
invalid_beat = [
    "[", "!", "]", "x", "(", ")", "p", "t", 
    "u", "`", "'", "^", "|", "~", "+", "s", 
    "T", "*", "D", "=",'"', "@", "B", "a", "J", "S",
    "r", "F", "e", "j", "n", "f", "Q", "?"
]

# Annotation symbols kept as class labels; classify() accepts only these.
label_beats=[
    "A", "L", "/", "V", "R", "E", "N"
]

## Beat Segmentation

In [8]:
def classify(symbol):
    """Return 1 when ``symbol`` is one of ``label_beats``, otherwise 0."""
    return 1 if symbol in label_beats else 0
    
def segment(signal_MLII, beat_loc, window=180):
    """Cut a window of samples centred on a beat location.

    Parameters
    ----------
    signal_MLII : sequence
        The signal to cut from (anything that supports slicing).
    beat_loc : int
        Index of the beat around which the window is taken.
    window : int, default 180
        Number of samples taken on each side of ``beat_loc``; a full segment
        therefore holds ``2 * window`` samples.

    Returns
    -------
    sequence
        ``signal_MLII[beat_loc - window : beat_loc + window]`` restricted to the
        valid index range. When the window does not fit entirely inside the
        signal, the result is shorter than ``2 * window``; callers that need
        fixed-length segments should check the length.
    """
    # Clamp at 0: a negative bound would be interpreted by Python slicing as
    # "counted from the end" and could return samples from the wrong region.
    start = max(beat_loc - window, 0)
    stop = max(beat_loc + window, 0)
    return signal_MLII[start:stop]

# Build the beat-level dataset: for every annotated position of every record,
# cut a window from column 0 of the record's signal and keep it (together with
# its annotation symbol) when the symbol is a label and the window is complete.
all_signals=[]
all_labels=[]
for record in records:
    temp=wfdb.rdrecord(record)
    annot_temp=wfdb.rdann(record, 'atr')
    ann_sample, ann_symbol = annot_temp.sample, annot_temp.symbol
    fs=temp.fs
    signal=temp.p_signal
    signal_MLII=signal[:,0]
    for i, i_sample in enumerate(ann_sample):
        label_dec=classify(ann_symbol[i])
        segmentation=segment(signal_MLII, i_sample)
        # 360 = 2 * the 180-sample half-window used by segment(); shorter
        # windows occur at the edges of a recording and are skipped.
        if label_dec!=1 or len(segmentation)!=360:
            continue
        all_signals.append(segmentation)
        all_labels.append(ann_symbol[i])


In [9]:
# np.vstack stacks the per-beat segments row-wise, giving one row per beat.
all_signals_fin=np.vstack(all_signals)

In [10]:
# Shape is (number of kept beats, samples per segment).
print(all_signals_fin.shape)

(107141, 360)


## Visualizing Segmented Data

In [11]:
# Wrap the label list in a Series so pandas helpers such as value_counts() can be used.
labels_fin=pd.Series(all_labels)

## Check distribution of types of data

In [12]:
# Number of kept beats per label symbol; used to judge the class imbalance.
labels_fin.value_counts()

N    75011
L     8071
R     7255
V     7129
/     7023
A     2546
E      106
dtype: int64

## Resampling

In [13]:
# Labels as a NumPy array so they can be indexed with boolean masks.
labels_array=np.array(all_labels)
# Sanity check: every segment row must have exactly one label. (A bare
# len(...) expression in the middle of a cell is evaluated but never shown.)
assert len(labels_array)==len(all_signals_fin)

In [14]:
# Split the segment matrix into one array per label symbol; df_0 .. df_6 follow
# the symbol order '/', 'A', 'E', 'L', 'N', 'R', 'V'.
df_0, df_1, df_2, df_3, df_4, df_5, df_6 = (
    all_signals_fin[labels_fin==symbol]
    for symbol in ('/', 'A', 'E', 'L', 'N', 'R', 'V')
)
# Labels belonging to the rows of df_0, printed as a quick consistency check.
y_df0=labels_array[labels_fin=='/']
print(len(df_0))
print(y_df0)
len(y_df0)

7023
['/' '/' '/' ... '/' '/' '/']


7023

In [15]:
from sklearn.utils import resample

In [16]:
# Draw 20000 segments per class with replacement so that all classes end up
# the same size (small classes are oversampled, large ones undersampled).
# NOTE(review): this resampling happens before the train/test split further
# down, so copies of the same segment can land in both sets -- consider
# splitting first and resampling only the training part.
resample_opts = dict(replace=True, n_samples=20000, random_state=42)
df0_sampled=resample(df_0, **resample_opts)
df1_sampled=resample(df_1, **resample_opts)
df2_sampled=resample(df_2, **resample_opts)
df3_sampled=resample(df_3, **resample_opts)
df4_sampled=resample(df_4, **resample_opts)
df5_sampled=resample(df_5, **resample_opts)
df6_sampled=resample(df_6, **resample_opts)


In [17]:
# One label list per resampled class, each as long as its segment array.
y_0, y_1, y_2, y_3, y_4, y_5, y_6 = (
    [symbol]*len(part)
    for symbol, part in zip(
        ('/', 'A', 'E', 'L', 'N', 'R', 'V'),
        (df0_sampled, df1_sampled, df2_sampled, df3_sampled,
         df4_sampled, df5_sampled, df6_sampled),
    )
)
    

In [18]:
# Flatten the resampled class arrays into one list of segments, class by class.
X_final=[
    row
    for part in (df0_sampled, df1_sampled, df2_sampled, df3_sampled,
                 df4_sampled, df5_sampled, df6_sampled)
    for row in part
]

In [19]:
# Concatenate the per-class label lists in the same class order as X_final.
y_final=[*y_0, *y_1, *y_2, *y_3, *y_4, *y_5, *y_6]

## Standardization of the data

In [20]:
from sklearn.preprocessing import StandardScaler
# Scaler that is fitted to and applied on the resampled segments in the next cell.
scaler=StandardScaler()

In [21]:
# NOTE(review): the scaler is fitted on all resampled data before the
# train/test split further down, so test samples influence the scaling
# statistics -- consider fitting on the training split only.
scaled=scaler.fit_transform(X_final)

In [22]:
def check_strat(y):
    """Print the absolute and relative frequency of every label in ``y``.

    Parameters
    ----------
    y : array-like
        Labels to summarise; loaded into a one-column DataFrame named "Labels".

    The printed table has a "Counts" column and a "Percent" column
    (percentages rounded to two decimals). Nothing is returned.
    """
    labels = pd.DataFrame(y, columns=["Labels"])["Labels"]
    print("Value distribution:\n")
    counts = labels.value_counts()
    shares = labels.value_counts(normalize=True).mul(100).round(2)
    summary = pd.concat([counts, shares], axis=1, keys=["Counts", "Percent"])
    print(summary)

## Splitting Data into Train and Test Set

In [23]:
from sklearn.model_selection import StratifiedShuffleSplit
# Stratified shuffle split that keeps the class proportions in both parts.
# test_size=0.1 spells out the value scikit-learn falls back to when neither
# test_size nor train_size is given; random_state is fixed so that the split is
# reproducible across runs (the resampling above is already seeded with 42).
strad=StratifiedShuffleSplit(test_size=0.1, random_state=42)

In [24]:
# Iterator over (train indices, test indices) pairs; y_final drives the stratification.
assin_strad=strad.split(scaled, y_final)

In [25]:
# Only the first split produced by the iterator is used.
train_index, test_index=next(assin_strad)

In [26]:
# Rows of the scaled data selected for training.
train_data_scaled=scaled[train_index]

In [27]:
from sklearn.preprocessing import LabelEncoder
# Encoder that turns the label symbols into integer class ids.
lab=LabelEncoder()

In [28]:
# Integer-encode all labels; the learned symbols are available as lab.classes_.
labels_final=lab.fit_transform(y_final)

In [29]:
# Show the label symbols known to the encoder.
lab.classes_

array(['/', 'A', 'E', 'L', 'N', 'R', 'V'], dtype='<U1')

In [30]:
# Encoded labels matching the rows of train_data_scaled.
train_label=labels_final[train_index]

In [31]:
# Rows of the scaled data held out for testing.
test_data_scaled=scaled[test_index]

In [32]:
# Encoded labels matching the rows of test_data_scaled.
test_labels=labels_final[test_index]

In [33]:
# Verify that the test split keeps the classes balanced.
check_strat(test_labels)

Value distribution:

   Counts  Percent
0    2000    14.29
1    2000    14.29
2    2000    14.29
3    2000    14.29
4    2000    14.29
5    2000    14.29
6    2000    14.29


In [34]:
# Verify that the training split keeps the classes balanced.
check_strat(train_label)

Value distribution:

   Counts  Percent
0   18000    14.29
1   18000    14.29
2   18000    14.29
3   18000    14.29
4   18000    14.29
5   18000    14.29
6   18000    14.29
