In [1]:
import wfdb
import csv
import pandas as pd
# pour les calculs numériques
import numpy as np
import os

In [2]:
import os
# pour la lecture et l'écriture de fichiers CSV
import csv
# pour la manipulation de données
import pandas as pd
# pour la manipulation des fichiers ECG
import wfdb  # Make sure wfdb is installed

# répertoire de sortie pour stocker les données traitées
output_dir = "data_creation"
os.makedirs(output_dir, exist_ok=True)

# Patient numbers
patient_numbers = [
    "100", "101", "102", "103", "104", "105", "106", "107", "108", "109",
    "111", "112", "113", "114", "115", "116", "117", "118", "119", "121",
    "122", "123", "124", "200", "201", "202", "203", "205", "207", "208",
    "209", "210", "212", "213", "214", "215", "217", "219", "220", "221",
    "222", "223", "228", "230", "231", "232", "233", "234"
]

# pour mapper les symboles d'annotation ECG à des catégories
# N = normal 
# S = supra-ventricular premature
# V = ventricular escape
# F = fusion of ventricular and normal
# Q = unclassified heartbeats
symbol_to_category = {
    'N': 'N', '.': 'N', 'L': 'N', 'R': 'N', 'e': 'N', 'j': 'N',
    'a': 'S', 'A': 'S', 'J': 'S', 'S': 'S',
    'V': 'V', 'E': 'V',
    'F': 'F',
    '/': 'Q', 'f': 'Q', 'Q': 'Q'
}

# Les annotations ECG du patient sont lues et filtrées pour ne garder que celles qui 
# correspondent aux symboles définis dans symbol_to_category
# Ces données sont ensuite sauvegardées dans des fichiers CSV distincts dans le répertoire de sortie
for patient_number in patient_numbers:
    try:
        # ECG data
        path_to_record = f"mit-database/{patient_number}"
        patient_record = wfdb.rdrecord(path_to_record)
        leads = patient_record.sig_name
        ecg_data = patient_record.p_signal

        # ECG CSV
        ecg_filename = f"{output_dir}/{patient_number}_ECG.csv"
        with open(ecg_filename, "w", newline='') as outfile:
            out_csv = csv.writer(outfile)
            out_csv.writerow(leads)
            for row in ecg_data:
                out_csv.writerow(row)

        # Annotations data
        annotation = wfdb.rdann(path_to_record, 'atr')
        symbols = annotation.symbol
        annotations = annotation.sample

        # Filter out symbols not in symbol_to_category
        filtered_symbols_annotations = [(sym, ann) for sym, ann in zip(symbols, annotations) if sym in symbol_to_category]
        categories = [symbol_to_category[sym] for sym, ann in filtered_symbols_annotations]
        annotations_filtered = [ann for sym, ann in filtered_symbols_annotations]

        df_annotations = pd.DataFrame({'Category': categories, 'Annotation': annotations_filtered})

        # Annotations CSV
        annotations_filename = f"{output_dir}/{patient_number}_Annotations.csv"
        df_annotations.to_csv(annotations_filename, index=False)

    except Exception as e:
        print(f"Failed to process: {patient_number}: {e}")

print("Done")

Done


In [3]:
# pour traiter les données ECG et annotations pour un patient donné
def process_patient_data(patient_number, data_creation_dir="data_creation"):

    ecg_file_path = os.path.join(data_creation_dir, f"{patient_number}_ECG.csv")
    annotations_file_path = os.path.join(data_creation_dir, f"{patient_number}_Annotations.csv")
    
    patient_X = []
    patient_Y = []
    
    try:
        ecg_df = pd.read_csv(ecg_file_path)
        annotations_df = pd.read_csv(annotations_file_path)
    except FileNotFoundError:
        print(f"Files for patient {patient_number} not found. Skipping...")
        return [], []
    
    first_column_name = ecg_df.columns[0]

    sampling_rate = 360  # Hz
    window_size_seconds = 3  # Seconds before and after annotation
    window_size_samples = window_size_seconds * sampling_rate

    # une fenêtre temporelle autour de chaque annotation est extraite des données ECG.
    for _, row in annotations_df.iterrows():
        annotation_point = row['Annotation']
        category = row['Category']
        
        start_point = max(0, annotation_point - window_size_samples)
        end_point = min(len(ecg_df), annotation_point + window_size_samples)
        
        window_data = ecg_df.iloc[start_point:end_point][first_column_name].to_numpy()
        if len(window_data) < window_size_samples * 2:
            window_data = np.pad(window_data, (0, window_size_samples * 2 - len(window_data)), 'constant')
        
        # Les fenêtres temporelles et les catégories correspondantes sont ajoutées aux listes patient_X et patient_Y
        patient_X.append(window_data)
        patient_Y.append(category)
    
    return patient_X, patient_Y

# Initialize lists to hold the entire dataset
all_X = []
all_Y = []

data_creation_dir = "data_creation"

# données traitées grâce à la fonction 'process_patient_data' puis stockées dans les listes all_X et all_Y
# Process each patient
for patient_number in patient_numbers:
    patient_X, patient_Y = process_patient_data(patient_number, data_creation_dir)
    all_X.extend(patient_X)
    all_Y.extend(patient_Y)

# conversion en tableaux numpy
X = np.array(all_X)
Y = np.array(all_Y)

In [4]:
X

array([[-0.145, -0.145, -0.145, ...,  0.   ,  0.   ,  0.   ],
       [-0.145, -0.145, -0.145, ...,  0.   ,  0.   ,  0.   ],
       [-0.145, -0.145, -0.145, ...,  0.   ,  0.   ,  0.   ],
       ...,
       [-0.29 , -0.3  , -0.295, ...,  0.   ,  0.   ,  0.   ],
       [-0.29 , -0.29 , -0.28 , ...,  0.   ,  0.   ,  0.   ],
       [-0.215, -0.22 , -0.225, ...,  0.   ,  0.   ,  0.   ]])

In [5]:
Y

array(['N', 'N', 'N', ..., 'N', 'N', 'N'], dtype='<U1')

In [6]:
# Convertir X en DataFrame
df_x = pd.DataFrame(X)
df_x

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,2150,2151,2152,2153,2154,2155,2156,2157,2158,2159
0,-0.145,-0.145,-0.145,-0.145,-0.145,-0.145,-0.145,-0.145,-0.120,-0.135,...,0.000,0.000,0.000,0.000,0.00,0.000,0.00,0.00,0.00,0.000
1,-0.145,-0.145,-0.145,-0.145,-0.145,-0.145,-0.145,-0.145,-0.120,-0.135,...,0.000,0.000,0.000,0.000,0.00,0.000,0.00,0.00,0.00,0.000
2,-0.145,-0.145,-0.145,-0.145,-0.145,-0.145,-0.145,-0.145,-0.120,-0.135,...,0.000,0.000,0.000,0.000,0.00,0.000,0.00,0.00,0.00,0.000
3,-0.145,-0.145,-0.145,-0.145,-0.145,-0.145,-0.145,-0.145,-0.120,-0.135,...,0.000,0.000,0.000,0.000,0.00,0.000,0.00,0.00,0.00,0.000
4,-0.340,-0.335,-0.330,-0.350,-0.350,-0.345,-0.335,-0.335,-0.335,-0.350,...,-0.365,-0.375,-0.370,-0.365,-0.36,-0.355,-0.36,-0.36,-0.35,-0.340
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
109489,-0.230,-0.215,-0.200,-0.205,-0.210,-0.225,-0.215,-0.225,-0.225,-0.240,...,-0.295,-0.280,-0.275,-0.275,-0.27,-0.280,-0.28,-0.27,-0.27,-0.275
109490,-0.250,-0.245,-0.260,-0.260,-0.275,-0.260,-0.275,-0.275,-0.275,-0.280,...,0.000,0.000,0.000,0.000,0.00,0.000,0.00,0.00,0.00,0.000
109491,-0.290,-0.300,-0.295,-0.290,-0.290,-0.295,-0.310,-0.320,-0.310,-0.300,...,0.000,0.000,0.000,0.000,0.00,0.000,0.00,0.00,0.00,0.000
109492,-0.290,-0.290,-0.280,-0.295,-0.300,-0.295,-0.285,-0.265,-0.245,-0.250,...,0.000,0.000,0.000,0.000,0.00,0.000,0.00,0.00,0.00,0.000


In [7]:
# Convertir en DataFrame avec qu'une colonne 'ColumnName'
df_y = pd.DataFrame(Y, columns=['ColumnName'])
print(df_y)

       ColumnName
0               N
1               N
2               N
3               N
4               N
...           ...
109489          N
109490          N
109491          N
109492          N
109493          N

[109494 rows x 1 columns]


In [8]:
# Compte le nombre de répétitions dans chaque catégories
value_counts_y = df_y['ColumnName'].value_counts()
print(value_counts_y)

ColumnName
N    90631
Q     8043
V     7236
S     2781
F      803
Name: count, dtype: int64


In [9]:
# Créer un DataFrame où les données X et Y sont placées sur un seul axe à la suite
df_fusionné = pd.concat([df_y, df_x], axis=1)
df_fusionné

Unnamed: 0,ColumnName,0,1,2,3,4,5,6,7,8,...,2150,2151,2152,2153,2154,2155,2156,2157,2158,2159
0,N,-0.145,-0.145,-0.145,-0.145,-0.145,-0.145,-0.145,-0.145,-0.120,...,0.000,0.000,0.000,0.000,0.00,0.000,0.00,0.00,0.00,0.000
1,N,-0.145,-0.145,-0.145,-0.145,-0.145,-0.145,-0.145,-0.145,-0.120,...,0.000,0.000,0.000,0.000,0.00,0.000,0.00,0.00,0.00,0.000
2,N,-0.145,-0.145,-0.145,-0.145,-0.145,-0.145,-0.145,-0.145,-0.120,...,0.000,0.000,0.000,0.000,0.00,0.000,0.00,0.00,0.00,0.000
3,N,-0.145,-0.145,-0.145,-0.145,-0.145,-0.145,-0.145,-0.145,-0.120,...,0.000,0.000,0.000,0.000,0.00,0.000,0.00,0.00,0.00,0.000
4,N,-0.340,-0.335,-0.330,-0.350,-0.350,-0.345,-0.335,-0.335,-0.335,...,-0.365,-0.375,-0.370,-0.365,-0.36,-0.355,-0.36,-0.36,-0.35,-0.340
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
109489,N,-0.230,-0.215,-0.200,-0.205,-0.210,-0.225,-0.215,-0.225,-0.225,...,-0.295,-0.280,-0.275,-0.275,-0.27,-0.280,-0.28,-0.27,-0.27,-0.275
109490,N,-0.250,-0.245,-0.260,-0.260,-0.275,-0.260,-0.275,-0.275,-0.275,...,0.000,0.000,0.000,0.000,0.00,0.000,0.00,0.00,0.00,0.000
109491,N,-0.290,-0.300,-0.295,-0.290,-0.290,-0.295,-0.310,-0.320,-0.310,...,0.000,0.000,0.000,0.000,0.00,0.000,0.00,0.00,0.00,0.000
109492,N,-0.290,-0.290,-0.280,-0.295,-0.300,-0.295,-0.285,-0.265,-0.245,...,0.000,0.000,0.000,0.000,0.00,0.000,0.00,0.00,0.00,0.000


In [10]:
# df_fusionné.to_csv('df_fusionné.csv', index=False)

In [11]:
# Applique une fonction anonyme à la première colonne du DataFrame fusionné pour binariser les étiquettes
# Dans ce cas, si la valeur de l'étiquette est différente de 'N', elle est remplacée par 0, sinon elle est remplacée par 1
df_fusionné_binaire = df_fusionné.copy()
df_fusionné_binaire.iloc[:, 0] = df_fusionné_binaire.iloc[:, 0].apply(lambda x: 0 if x != 'N' else 1)

MemoryError: Unable to allocate 1.76 GiB for an array with shape (2160, 109494) and data type float64

In [None]:
df_fusionné_binaire

Unnamed: 0,ColumnName,0,1,2,3,4,5,6,7,8,...,2150,2151,2152,2153,2154,2155,2156,2157,2158,2159
0,1,-0.145,-0.145,-0.145,-0.145,-0.145,-0.145,-0.145,-0.145,-0.120,...,0.000,0.000,0.000,0.000,0.00,0.000,0.00,0.00,0.00,0.000
1,1,-0.145,-0.145,-0.145,-0.145,-0.145,-0.145,-0.145,-0.145,-0.120,...,0.000,0.000,0.000,0.000,0.00,0.000,0.00,0.00,0.00,0.000
2,1,-0.145,-0.145,-0.145,-0.145,-0.145,-0.145,-0.145,-0.145,-0.120,...,0.000,0.000,0.000,0.000,0.00,0.000,0.00,0.00,0.00,0.000
3,1,-0.145,-0.145,-0.145,-0.145,-0.145,-0.145,-0.145,-0.145,-0.120,...,0.000,0.000,0.000,0.000,0.00,0.000,0.00,0.00,0.00,0.000
4,1,-0.340,-0.335,-0.330,-0.350,-0.350,-0.345,-0.335,-0.335,-0.335,...,-0.365,-0.375,-0.370,-0.365,-0.36,-0.355,-0.36,-0.36,-0.35,-0.340
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
109489,1,-0.230,-0.215,-0.200,-0.205,-0.210,-0.225,-0.215,-0.225,-0.225,...,-0.295,-0.280,-0.275,-0.275,-0.27,-0.280,-0.28,-0.27,-0.27,-0.275
109490,1,-0.250,-0.245,-0.260,-0.260,-0.275,-0.260,-0.275,-0.275,-0.275,...,0.000,0.000,0.000,0.000,0.00,0.000,0.00,0.00,0.00,0.000
109491,1,-0.290,-0.300,-0.295,-0.290,-0.290,-0.295,-0.310,-0.320,-0.310,...,0.000,0.000,0.000,0.000,0.00,0.000,0.00,0.00,0.00,0.000
109492,1,-0.290,-0.290,-0.280,-0.295,-0.300,-0.295,-0.285,-0.265,-0.245,...,0.000,0.000,0.000,0.000,0.00,0.000,0.00,0.00,0.00,0.000
