In [37]:
import wfdb
import csv
import pandas as pd
import numpy as np
import os

In [38]:
import os
import csv
import pandas as pd
import wfdb  # Make sure wfdb is installed

output_dir = "data_creation"
os.makedirs(output_dir, exist_ok=True)

# Patient numbers
patient_numbers = [
    "100", "101", "102", "103", "104", "105", "106", "107", "108", "109",
    "111", "112", "113", "114", "115", "116", "117", "118", "119", "121",
    "122", "123", "124", "200", "201", "202", "203", "205", "207", "208",
    "209", "210", "212", "213", "214", "215", "217", "219", "220", "221",
    "222", "223", "228", "230", "231", "232", "233", "234"
]

# N = normal 
# S = supra-ventricular premature
# V = ventricular escape
# F = fusion of ventricular and normal
# Q = unclassified heartbeats
symbol_to_category = {
    'N': 'N', '.': 'N', 'L': 'N', 'R': 'N', 'e': 'N', 'j': 'N',
    'a': 'S', 'A': 'S', 'J': 'S', 'S': 'S',
    'V': 'V', 'E': 'V',
    'F': 'F',
    '/': 'Q', 'f': 'Q', 'Q': 'Q'
}

for patient_number in patient_numbers:
    try:
        # ECG data
        path_to_record = f"mit-database/{patient_number}"
        patient_record = wfdb.rdrecord(path_to_record)
        leads = patient_record.sig_name
        ecg_data = patient_record.p_signal

        # ECG CSV
        ecg_filename = f"{output_dir}/{patient_number}_ECG.csv"
        with open(ecg_filename, "w", newline='') as outfile:
            out_csv = csv.writer(outfile)
            out_csv.writerow(leads)
            for row in ecg_data:
                out_csv.writerow(row)

        # Annotations data
        annotation = wfdb.rdann(path_to_record, 'atr')
        symbols = annotation.symbol
        annotations = annotation.sample

        # Filter out symbols not in symbol_to_category
        filtered_symbols_annotations = [(sym, ann) for sym, ann in zip(symbols, annotations) if sym in symbol_to_category]
        categories = [symbol_to_category[sym] for sym, ann in filtered_symbols_annotations]
        annotations_filtered = [ann for sym, ann in filtered_symbols_annotations]

        df_annotations = pd.DataFrame({'Category': categories, 'Annotation': annotations_filtered})

        # Annotations CSV
        annotations_filename = f"{output_dir}/{patient_number}_Annotations.csv"
        df_annotations.to_csv(annotations_filename, index=False)

    except Exception as e:
        print(f"Failed to process: {patient_number}: {e}")

print("Done")

Done


In [39]:
import os
import pandas as pd
import numpy as np


def process_patient_data(patient_number, data_creation_dir="data_creation"):
    ecg_file_path = os.path.join(data_creation_dir, f"{patient_number}_ECG.csv")
    annotations_file_path = os.path.join(data_creation_dir, f"{patient_number}_Annotations.csv")
    
    patient_X = []
    patient_Y = []
    
    try:
        ecg_df = pd.read_csv(ecg_file_path)
        annotations_df = pd.read_csv(annotations_file_path)
    except FileNotFoundError:
        print(f"Files for patient {patient_number} not found. Skipping...")
        return [], []
    
    first_column_name = ecg_df.columns[0]
    second_column_name = ecg_df.columns[1] if len(ecg_df.columns) > 1 else None

    sampling_rate = 360  # Hz
    total_window_size_seconds = 2  # Total window size in seconds
    total_window_size_samples = total_window_size_seconds * sampling_rate

    for _, row in annotations_df.iterrows():
        annotation_point = row['Annotation']
        category = row['Category']
        
        # Randomly determine the window split around the annotation point
        before_seconds = np.random.uniform(0, total_window_size_seconds)
        after_seconds = total_window_size_seconds - before_seconds
        before_samples = int(before_seconds * sampling_rate)
        after_samples = int(after_seconds * sampling_rate)
        
        start_point = max(0, annotation_point - before_samples)
        end_point = start_point + total_window_size_samples  # Ensure the window is of the exact expected size
        
        # Adjust the end point if it exceeds the length of the data
        if end_point > len(ecg_df):
            end_point = len(ecg_df)
            start_point = max(0, end_point - total_window_size_samples)  # Adjust start point accordingly
        
        # Process data from the first column
        window_data_first_column = ecg_df.iloc[start_point:end_point][first_column_name].to_numpy()
        
        patient_X.append(window_data_first_column)
        patient_Y.append(category)
        
        # If there's a second column, process it and add as a new entry
        if second_column_name:
            window_data_second_column = ecg_df.iloc[start_point:end_point][second_column_name].to_numpy()
            
            patient_X.append(window_data_second_column)
            patient_Y.append(category)  # Repeat category for the new entry
    
    return patient_X, patient_Y



# Initialize lists to hold the entire dataset
all_X = []
all_Y = []

data_creation_dir = "data_creation"

# Process each patient
for patient_number in patient_numbers:
    patient_X, patient_Y = process_patient_data(patient_number, data_creation_dir)
    all_X.extend(patient_X)
    all_Y.extend(patient_Y)

X = np.array(all_X)
Y = np.array(all_Y)

In [40]:
X

array([[-0.145, -0.145, -0.145, ..., -0.4  , -0.415, -0.425],
       [-0.065, -0.065, -0.065, ..., -0.28 , -0.29 , -0.29 ],
       [-0.35 , -0.35 , -0.345, ..., -0.31 , -0.305, -0.305],
       ...,
       [ 0.09 ,  0.09 ,  0.095, ...,  0.06 ,  0.065,  0.075],
       [-0.355, -0.35 , -0.335, ..., -0.395, -0.38 ,  0.   ],
       [ 0.03 ,  0.04 ,  0.045, ...,  0.075,  0.08 ,  0.   ]])

In [41]:
Y

array(['N', 'N', 'N', ..., 'N', 'N', 'N'], dtype='<U1')

In [42]:
df_x = pd.DataFrame(X)
df_x

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,710,711,712,713,714,715,716,717,718,719
0,-0.145,-0.145,-0.145,-0.145,-0.145,-0.145,-0.145,-0.145,-0.120,-0.135,...,-0.400,-0.400,-0.395,-0.415,-0.405,-0.390,-0.400,-0.400,-0.415,-0.425
1,-0.065,-0.065,-0.065,-0.065,-0.065,-0.065,-0.065,-0.065,-0.080,-0.080,...,-0.265,-0.270,-0.280,-0.285,-0.280,-0.255,-0.270,-0.280,-0.290,-0.290
2,-0.350,-0.350,-0.345,-0.335,-0.335,-0.335,-0.350,-0.355,-0.355,-0.345,...,-0.360,-0.345,-0.355,-0.345,-0.345,-0.345,-0.340,-0.310,-0.305,-0.305
3,-0.235,-0.245,-0.240,-0.240,-0.240,-0.245,-0.260,-0.260,-0.265,-0.265,...,-0.215,-0.210,-0.225,-0.240,-0.235,-0.240,-0.225,-0.215,-0.220,-0.220
4,-0.350,-0.355,-0.365,-0.375,-0.380,-0.370,-0.365,-0.365,-0.380,-0.385,...,-0.365,-0.345,-0.330,-0.315,-0.305,-0.305,-0.320,-0.310,-0.310,-0.305
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
218983,0.085,0.100,0.115,0.125,0.120,0.105,0.105,0.105,0.105,0.090,...,0.045,0.045,0.070,0.065,0.060,0.040,0.030,0.040,0.050,0.060
218984,-0.255,-0.250,-0.260,-0.250,-0.265,-0.280,-0.270,-0.270,-0.265,-0.250,...,-0.305,-0.310,-0.320,-0.310,-0.315,-0.320,-0.320,-0.315,-0.305,-0.295
218985,0.090,0.090,0.095,0.080,0.080,0.065,0.065,0.080,0.095,0.100,...,0.080,0.095,0.085,0.090,0.080,0.070,0.065,0.060,0.065,0.075
218986,-0.355,-0.350,-0.335,-0.285,-0.210,-0.080,0.120,0.335,0.620,0.910,...,-0.350,-0.350,-0.350,-0.365,-0.390,-0.390,-0.390,-0.395,-0.380,0.000


In [43]:
from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler()
df_x = scaler.fit_transform(df_x)
df_x = pd.DataFrame(df_x)


In [44]:
import pywt

def madev(d, axis=None):
    'Mean absolute deviation of a signal'
    return np.mean(np.absolute(d - np.mean(d, axis)), axis)

def wavelet_denoising(x, wavelet='sym4', level=1):
    coeff = pywt.wavedec(x, wavelet, mode="per")
    sigma = (1/0.6745) * madev(coeff[-level])
    uthresh = sigma * np.sqrt(2 * np.log(len(x)))
    coeff[1:] = (pywt.threshold(i, value=uthresh, mode='hard') for i in coeff[1:])
    return pywt.waverec(coeff, wavelet, mode='per')


df_x = wavelet_denoising(df_x, wavelet='sym4', level=2)
df_x = pd.DataFrame(df_x)

In [45]:
df_y = pd.DataFrame(Y, columns=['ColumnName'])
print(df_y)

       ColumnName
0               N
1               N
2               N
3               N
4               N
...           ...
218983          N
218984          N
218985          N
218986          N
218987          N

[218988 rows x 1 columns]


In [46]:
value_counts_y = df_y['ColumnName'].value_counts()
print(value_counts_y)

ColumnName
N    181262
Q     16086
V     14472
S      5562
F      1606
Name: count, dtype: int64


In [47]:
df_fusionné = pd.concat([df_y, df_x], axis=1)
df_fusionné

Unnamed: 0,ColumnName,0,1,2,3,4,5,6,7,8,...,710,711,712,713,714,715,716,717,718,719
0,N,0.474612,0.476679,0.478512,0.480266,0.482392,0.484453,0.486573,0.488440,0.488981,...,0.457734,0.457379,0.458380,0.460001,0.461891,0.464033,0.466055,0.468095,0.470291,0.472508
1,N,0.487613,0.488369,0.488991,0.489403,0.489775,0.490112,0.490249,0.490371,0.490951,...,0.470220,0.469527,0.470491,0.472223,0.474069,0.476189,0.478572,0.480984,0.483697,0.486295
2,N,0.466043,0.466057,0.466094,0.466145,0.466172,0.466202,0.466234,0.466276,0.466371,...,0.466705,0.466653,0.466589,0.466517,0.466450,0.466382,0.466299,0.466219,0.466133,0.466057
3,N,0.477884,0.477537,0.477119,0.476647,0.476231,0.475811,0.475372,0.474918,0.474398,...,0.477588,0.477528,0.477576,0.477667,0.477724,0.477779,0.477877,0.477953,0.478055,0.478107
4,N,0.465664,0.465429,0.465074,0.464649,0.464261,0.463815,0.463621,0.463377,0.462513,...,0.463029,0.463015,0.463281,0.463655,0.464002,0.464360,0.464735,0.465070,0.465436,0.465728
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
218983,N,0.509255,0.509583,0.509838,0.510016,0.510188,0.510331,0.510416,0.510469,0.510534,...,0.501333,0.502050,0.502870,0.503733,0.504556,0.505374,0.506248,0.507098,0.507966,0.508770
218984,N,0.472329,0.472408,0.472515,0.472645,0.472758,0.472878,0.473008,0.473150,0.473329,...,0.472887,0.472850,0.472785,0.472708,0.472642,0.472578,0.472499,0.472428,0.472352,0.472294
218985,N,0.508852,0.508851,0.508864,0.508886,0.508885,0.508880,0.508877,0.508876,0.508894,...,0.509337,0.509335,0.509336,0.509324,0.509306,0.509275,0.509197,0.509106,0.508993,0.508883
218986,N,0.462353,0.466682,0.464064,0.468946,0.480229,0.496959,0.513037,0.531042,0.558766,...,0.466077,0.467140,0.465838,0.464159,0.460622,0.457609,0.459825,0.461559,0.474049,0.494654


In [49]:
df_fusionné.to_csv('df_fusionné.csv', index=False)