In [None]:
# Mount Google Drive inside the Colab runtime so files stored there can be read.
from google.colab import drive

drive_mount_point = '/content/drive/'
drive.mount(drive_mount_point)

Mounted at /content/drive/


In [None]:
pip install wfdb

Collecting wfdb
  Downloading wfdb-4.1.2-py3-none-any.whl (159 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/160.0 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[90m╺[0m[90m━[0m [32m153.6/160.0 kB[0m [31m5.4 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m160.0/160.0 kB[0m [31m4.2 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: wfdb
Successfully installed wfdb-4.1.2


In [None]:
import wfdb
import csv
import pandas as pd
# pour les calculs numériques
import numpy as np
import os

In [None]:
import os
# pour la lecture et l'écriture de fichiers CSV
import csv
# pour la manipulation de données
import pandas as pd
# pour la manipulation des fichiers ECG
import wfdb  # Make sure wfdb is installed

# Directory that receives the per-record CSV exports (created if missing).
output_dir = "data_creation"
os.makedirs(output_dir, exist_ok=True)

# Identifiers of the records to export (one recording per patient number).
patient_numbers = (
    "100 101 102 103 104 105 106 107 108 109 "
    "111 112 113 114 115 116 117 118 119 121 "
    "122 123 124 200 201 202 203 205 207 208 "
    "209 210 212 213 214 215 217 219 220 221 "
    "222 223 228 230 231 232 233 234"
).split()

# Annotation symbols grouped by the category they are mapped to.
# Legend kept from the original notebook:
#   N = normal
#   S = supra-ventricular premature
#   V = ventricular escape
#   F = fusion of ventricular and normal
#   Q = unclassified heartbeats
_symbols_by_category = {
    'N': ('N', '.', 'L', 'R', 'e', 'j'),
    'S': ('a', 'A', 'J', 'S'),
    'V': ('V', 'E'),
    'F': ('F',),
    'Q': ('/', 'f', 'Q'),
}

# Flat lookup: annotation symbol -> category letter.
symbol_to_category = {
    symbol: category
    for category, symbols in _symbols_by_category.items()
    for symbol in symbols
}

# For every record: dump the signal samples to "<id>_ECG.csv", then dump the
# annotations whose symbol appears in symbol_to_category to
# "<id>_Annotations.csv". A failure on one record is reported and the loop
# moves on to the next record.
for patient_number in patient_numbers:
    try:
        path_to_record = f"/content/drive/MyDrive/Projet_E4/mit-database/{patient_number}"

        # --- signal samples -------------------------------------------------
        patient_record = wfdb.rdrecord(path_to_record)
        ecg_filename = f"{output_dir}/{patient_number}_ECG.csv"
        with open(ecg_filename, "w", newline='') as outfile:
            out_csv = csv.writer(outfile)
            out_csv.writerow(patient_record.sig_name)   # header: one column per lead
            out_csv.writerows(patient_record.p_signal)  # one CSV row per sample

        # --- annotations ----------------------------------------------------
        annotation = wfdb.rdann(path_to_record, 'atr')
        categories = []
        annotations_filtered = []
        for sym, sample_index in zip(annotation.symbol, annotation.sample):
            if sym not in symbol_to_category:
                continue  # symbol outside the mapping: dropped
            categories.append(symbol_to_category[sym])
            annotations_filtered.append(sample_index)

        df_annotations = pd.DataFrame(
            {'Category': categories, 'Annotation': annotations_filtered}
        )
        annotations_filename = f"{output_dir}/{patient_number}_Annotations.csv"
        df_annotations.to_csv(annotations_filename, index=False)

    except Exception as e:
        print(f"Failed to process: {patient_number}: {e}")

print("Done")

Done


In [None]:
import os
import pandas as pd
import numpy as np


def process_patient_data(patient_number, data_creation_dir="data_creation",
                         sampling_rate=360, window_seconds=1, rng=None):
    """Cut fixed-length ECG windows around every annotation of one patient.

    Reads ``<patient_number>_ECG.csv`` and ``<patient_number>_Annotations.csv``
    from ``data_creation_dir``. For each annotation a window of
    ``window_seconds * sampling_rate`` samples is extracted; the annotation is
    placed at a random position inside the window. Windows that would run past
    either end of the recording are shifted back inside it. One window is
    produced for the first CSV column and, when present, one for the second
    column, each paired with the annotation's category.

    Args:
        patient_number: Identifier used to build the two CSV file names.
        data_creation_dir: Directory containing the CSV files.
        sampling_rate: Samples per second of the recording (default 360).
        window_seconds: Window length in seconds (default 1).
        rng: Optional ``numpy.random.Generator`` used to draw the window
            offsets. When ``None`` the global NumPy random state is used,
            which is the historical behaviour of this function.

    Returns:
        ``(patient_X, patient_Y)``: a list of 1-D NumPy arrays (the windows)
        and the list of matching category labels. Both lists are empty when
        either CSV file is missing.
    """
    ecg_file_path = os.path.join(data_creation_dir, f"{patient_number}_ECG.csv")
    annotations_file_path = os.path.join(data_creation_dir, f"{patient_number}_Annotations.csv")

    try:
        ecg_df = pd.read_csv(ecg_file_path)
        annotations_df = pd.read_csv(annotations_file_path)
    except FileNotFoundError:
        print(f"Files for patient {patient_number} not found. Skipping...")
        return [], []

    # Extract the (at most two) leads once as NumPy arrays so that each window
    # is a cheap array slice instead of a DataFrame slice per annotation.
    lead_count = min(2, ecg_df.shape[1])
    lead_signals = [ecg_df.iloc[:, position].to_numpy() for position in range(lead_count)]

    total_samples = len(ecg_df)
    window_samples = int(round(window_seconds * sampling_rate))
    draw_uniform = np.random.uniform if rng is None else rng.uniform

    patient_X = []
    patient_Y = []

    annotation_points = annotations_df['Annotation'].tolist()
    categories = annotations_df['Category'].tolist()

    for annotation_point, category in zip(annotation_points, categories):
        # Random share of the window placed before the annotation point.
        before_samples = int(draw_uniform(0, window_seconds) * sampling_rate)

        start_point = max(0, int(annotation_point) - before_samples)
        end_point = start_point + window_samples

        # Shift the window back inside the recording if it overruns the end.
        if end_point > total_samples:
            end_point = total_samples
            start_point = max(0, end_point - window_samples)

        # Same window for every lead; the category is repeated for each one.
        for signal in lead_signals:
            patient_X.append(signal[start_point:end_point])
            patient_Y.append(category)

    return patient_X, patient_Y



# Accumulators for the windows and labels of every patient.
all_X = []
all_Y = []

data_creation_dir = "data_creation"

# process_patient_data draws a random window offset per annotation from
# NumPy's global random state. Seeding it here makes the extracted dataset
# identical on every fresh run of the notebook.
np.random.seed(42)

# Extract the windows of each patient and append them to the global lists.
for patient_number in patient_numbers:
    patient_X, patient_Y = process_patient_data(patient_number, data_creation_dir)
    all_X.extend(patient_X)
    all_Y.extend(patient_Y)

# Convert the accumulated lists to NumPy arrays.
X = np.array(all_X)
Y = np.array(all_Y)

In [None]:
X  # 2-D array: one row per extracted window (one per lead and annotation), one column per sample of the window

array([[-0.145, -0.145, -0.145, ..., -0.445, -0.475, -0.51 ],
       [-0.065, -0.065, -0.065, ..., -0.36 , -0.355, -0.305],
       [-0.225, -0.23 , -0.235, ..., -0.395, -0.39 , -0.395],
       ...,
       [ 0.08 ,  0.08 ,  0.065, ..., -0.06 , -0.135, -0.135],
       [-0.23 , -0.255, -0.235, ..., -0.385, -0.4  , -0.415],
       [ 0.06 ,  0.045,  0.045, ..., -0.03 , -0.03 , -0.035]])

In [None]:
# 1-D array holding the category label of each row of X.
Y

array(['N', 'N', 'N', ..., 'N', 'N', 'N'], dtype='<U1')

In [None]:
# Shapes of the feature and label arrays.
for array in (X, Y):
    print(array.shape)

# Element types of the same arrays.
for array in (X, Y):
    print(array.dtype)

(218988, 360)
(218988,)
float64
<U1


In [None]:
# Wrap X in a DataFrame (one row per window, one column per sample) and display it.
df_x = pd.DataFrame(X)
df_x

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,350,351,352,353,354,355,356,357,358,359
0,-0.145,-0.145,-0.145,-0.145,-0.145,-0.145,-0.145,-0.145,-0.120,-0.135,...,-0.325,-0.330,-0.350,-0.365,-0.360,-0.380,-0.425,-0.445,-0.475,-0.510
1,-0.065,-0.065,-0.065,-0.065,-0.065,-0.065,-0.065,-0.065,-0.080,-0.080,...,-0.235,-0.250,-0.270,-0.280,-0.290,-0.300,-0.330,-0.360,-0.355,-0.305
2,-0.225,-0.230,-0.235,-0.240,-0.235,-0.220,-0.210,-0.205,-0.245,-0.285,...,0.520,0.050,-0.320,-0.500,-0.505,-0.445,-0.415,-0.395,-0.390,-0.395
3,-0.180,-0.185,-0.185,-0.175,-0.175,-0.175,-0.190,-0.205,-0.200,-0.205,...,-0.250,-0.270,-0.240,-0.240,-0.250,-0.255,-0.245,-0.245,-0.245,-0.255
4,-0.330,-0.335,-0.330,-0.320,-0.345,-0.355,-0.340,-0.330,-0.325,-0.330,...,-0.405,-0.420,-0.420,-0.435,-0.420,-0.410,-0.410,-0.405,-0.425,-0.430
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
218983,0.025,0.040,0.125,0.230,0.330,0.440,0.540,0.610,0.645,0.610,...,-0.090,-0.080,-0.060,-0.065,-0.060,-0.050,-0.060,-0.055,-0.065,-0.055
218984,-0.250,-0.265,-0.280,-0.270,-0.270,-0.265,-0.250,-0.260,-0.260,-0.250,...,1.670,1.535,1.280,0.945,0.590,0.260,0.005,-0.180,-0.285,-0.330
218985,0.080,0.080,0.065,0.065,0.080,0.095,0.100,0.100,0.110,0.110,...,0.570,0.600,0.580,0.490,0.345,0.185,0.040,-0.060,-0.135,-0.135
218986,-0.230,-0.255,-0.235,-0.225,-0.230,-0.230,-0.240,-0.245,-0.235,-0.240,...,-0.390,-0.385,-0.390,-0.395,-0.380,-0.385,-0.370,-0.385,-0.400,-0.415


In [None]:
# Put the labels in a single-column DataFrame; the column is named 'ColumnName'.
df_y = pd.DataFrame({'ColumnName': Y})
print(df_y)

       ColumnName
0               N
1               N
2               N
3               N
4               N
...           ...
218983          N
218984          N
218985          N
218986          N
218987          N

[218988 rows x 1 columns]


In [None]:
# Number of windows per category.
label_series = df_y['ColumnName']
value_counts_y = label_series.value_counts()
print(value_counts_y)

N    181262
Q     16086
V     14472
S      5562
F      1606
Name: ColumnName, dtype: int64


In [None]:
# Place the label column and the sample columns side by side in one frame
# (labels first, then the samples of each window).
df_fusionné = pd.concat(
    (df_y, df_x),
    axis="columns",
)
df_fusionné

Unnamed: 0,ColumnName,0,1,2,3,4,5,6,7,8,...,350,351,352,353,354,355,356,357,358,359
0,N,-0.145,-0.145,-0.145,-0.145,-0.145,-0.145,-0.145,-0.145,-0.120,...,-0.325,-0.330,-0.350,-0.365,-0.360,-0.380,-0.425,-0.445,-0.475,-0.510
1,N,-0.065,-0.065,-0.065,-0.065,-0.065,-0.065,-0.065,-0.065,-0.080,...,-0.235,-0.250,-0.270,-0.280,-0.290,-0.300,-0.330,-0.360,-0.355,-0.305
2,N,-0.225,-0.230,-0.235,-0.240,-0.235,-0.220,-0.210,-0.205,-0.245,...,0.520,0.050,-0.320,-0.500,-0.505,-0.445,-0.415,-0.395,-0.390,-0.395
3,N,-0.180,-0.185,-0.185,-0.175,-0.175,-0.175,-0.190,-0.205,-0.200,...,-0.250,-0.270,-0.240,-0.240,-0.250,-0.255,-0.245,-0.245,-0.245,-0.255
4,N,-0.330,-0.335,-0.330,-0.320,-0.345,-0.355,-0.340,-0.330,-0.325,...,-0.405,-0.420,-0.420,-0.435,-0.420,-0.410,-0.410,-0.405,-0.425,-0.430
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
218983,N,0.025,0.040,0.125,0.230,0.330,0.440,0.540,0.610,0.645,...,-0.090,-0.080,-0.060,-0.065,-0.060,-0.050,-0.060,-0.055,-0.065,-0.055
218984,N,-0.250,-0.265,-0.280,-0.270,-0.270,-0.265,-0.250,-0.260,-0.260,...,1.670,1.535,1.280,0.945,0.590,0.260,0.005,-0.180,-0.285,-0.330
218985,N,0.080,0.080,0.065,0.065,0.080,0.095,0.100,0.100,0.110,...,0.570,0.600,0.580,0.490,0.345,0.185,0.040,-0.060,-0.135,-0.135
218986,N,-0.230,-0.255,-0.235,-0.225,-0.230,-0.230,-0.240,-0.245,-0.235,...,-0.390,-0.385,-0.390,-0.395,-0.380,-0.385,-0.370,-0.385,-0.400,-0.415


In [None]:
# df_fusionné.to_csv('df_fusionné.csv', index=False)

In [None]:
# Binarise the label column (first column) of the merged frame:
# 1 for normal beats ('N'), 0 for every other category.
# The comparison is vectorised and the result is assigned by column label
# rather than through `.iloc[:, 0] = ...`; that positional assignment emitted
# a warning when the cell was run (presumably pandas' notice that its
# semantics are changing — TODO confirm against the installed pandas version).
df_fusionné_binaire = df_fusionné.copy()
label_column = df_fusionné_binaire.columns[0]
df_fusionné_binaire[label_column] = (df_fusionné_binaire[label_column] == 'N').astype(int)

  df_fusionné_binaire.iloc[:, 0] = df_fusionné_binaire.iloc[:, 0].apply(lambda x: 0 if x != 'N' else 1)


In [None]:
# Display the merged frame after its label column was binarised (1 = 'N', 0 = any other category).
df_fusionné_binaire

Unnamed: 0,ColumnName,0,1,2,3,4,5,6,7,8,...,350,351,352,353,354,355,356,357,358,359
0,1,-0.145,-0.145,-0.145,-0.145,-0.145,-0.145,-0.145,-0.145,-0.120,...,-0.325,-0.330,-0.350,-0.365,-0.360,-0.380,-0.425,-0.445,-0.475,-0.510
1,1,-0.065,-0.065,-0.065,-0.065,-0.065,-0.065,-0.065,-0.065,-0.080,...,-0.235,-0.250,-0.270,-0.280,-0.290,-0.300,-0.330,-0.360,-0.355,-0.305
2,1,-0.225,-0.230,-0.235,-0.240,-0.235,-0.220,-0.210,-0.205,-0.245,...,0.520,0.050,-0.320,-0.500,-0.505,-0.445,-0.415,-0.395,-0.390,-0.395
3,1,-0.180,-0.185,-0.185,-0.175,-0.175,-0.175,-0.190,-0.205,-0.200,...,-0.250,-0.270,-0.240,-0.240,-0.250,-0.255,-0.245,-0.245,-0.245,-0.255
4,1,-0.330,-0.335,-0.330,-0.320,-0.345,-0.355,-0.340,-0.330,-0.325,...,-0.405,-0.420,-0.420,-0.435,-0.420,-0.410,-0.410,-0.405,-0.425,-0.430
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
218983,1,0.025,0.040,0.125,0.230,0.330,0.440,0.540,0.610,0.645,...,-0.090,-0.080,-0.060,-0.065,-0.060,-0.050,-0.060,-0.055,-0.065,-0.055
218984,1,-0.250,-0.265,-0.280,-0.270,-0.270,-0.265,-0.250,-0.260,-0.260,...,1.670,1.535,1.280,0.945,0.590,0.260,0.005,-0.180,-0.285,-0.330
218985,1,0.080,0.080,0.065,0.065,0.080,0.095,0.100,0.100,0.110,...,0.570,0.600,0.580,0.490,0.345,0.185,0.040,-0.060,-0.135,-0.135
218986,1,-0.230,-0.255,-0.235,-0.225,-0.230,-0.230,-0.240,-0.245,-0.235,...,-0.390,-0.385,-0.390,-0.395,-0.380,-0.385,-0.370,-0.385,-0.400,-0.415


In [None]:
# Keep only the labels of the abnormal windows (every category except 'N').
is_abnormal = df_y["ColumnName"] != "N"
Y_anormal = df_y[is_abnormal]
Y_anormal

Unnamed: 0,ColumnName
14,S
15,S
460,S
461,S
516,S
...,...
216643,V
217442,V
217443,V
218702,V


In [None]:
# on met Y en binaire avec 1 quand l'ECG est normal, et 0 sinon
#Y_binaire = np.where(Y != 'N', 0, 1)
#Y_binaire

In [None]:
from sklearn.preprocessing import OneHotEncoder, StandardScaler

# Drop from X the rows whose label is 'N', so that X keeps exactly the rows
# that Y_anormal keeps.
indices_a_supprimer = np.flatnonzero(Y == 'N')
X_anormal = np.delete(X, indices_a_supprimer, axis=0)

# Standardise the inputs column by column; neural networks are sensitive to
# the scale of their inputs.
# NOTE(review): the scaler is fitted on all abnormal windows before the
# train/test split done later in the notebook, so the statistics of the test
# rows take part in the scaling — consider fitting on the training rows only.
scaler = StandardScaler().fit(X_anormal)
X_normalized = scaler.transform(X_anormal)
print(X_normalized)

# One-hot encode the abnormal labels: each row gets a 1 in the column of its
# class and 0 elsewhere.
labels_as_column = Y_anormal.to_numpy().reshape(-1, 1)
encoder = OneHotEncoder()
Y_encode = encoder.fit_transform(labels_as_column).toarray()
print(Y_encode)

# Background: one-hot encoding turns a categorical column (e.g. a "gender"
# column holding "female"/"male") into one 0/1 column per value. It is used
# for categorical data, whereas text is usually handled with word embeddings.
# A binary 0/1 label would not need encoding because it is already numeric.

[[-0.38857681 -0.37888352 -0.34100534 ... -0.31151111 -0.32698706
  -0.31831522]
 [-0.04854594 -0.02161169  0.0147725  ... -0.0862012  -0.08427897
  -0.09042571]
 [-0.44524862 -0.43529486 -0.45335624 ... -0.33565146 -0.31889679
  -0.31831522]
 ...
 [ 0.15925071  0.11941667  0.11776082 ...  0.31613791  0.34450533
   0.36535332]
 [-0.3413503  -0.33187407 -0.32228019 ... -0.35174502 -0.37552868
  -0.41598216]
 [-0.02021003 -0.02161169 -0.0226778  ...  0.81503842  0.82183124
   0.83741018]]
[[0. 0. 1. 0.]
 [0. 0. 1. 0.]
 [0. 0. 1. 0.]
 ...
 [0. 0. 0. 1.]
 [0. 0. 0. 1.]
 [0. 0. 0. 1.]]


In [None]:
# Shapes of the standardised inputs and of the one-hot labels, one per line.
print(X_normalized.shape, Y_encode.shape, sep="\n")

(37726, 360)
(37726, 4)


In [None]:
from sklearn.model_selection import train_test_split

split_seed = 42

# First split: keep a random 20% of the data and discard the other 80%.
X_reduit, _, Y_reduit, _ = train_test_split(
    X_normalized,
    Y_encode,
    test_size=0.8,
    random_state=split_seed,
)

# Second split: divide the retained subset into training (80%) and test (20%) parts.
X_train, X_test, Y_train, Y_test = train_test_split(
    X_reduit,
    Y_reduit,
    test_size=0.2,
    random_state=split_seed,
)

In [None]:
# Shapes of the training inputs and training labels, one per line.
print(X_train.shape, Y_train.shape, sep="\n")

# Rows: one per training window (6036 in the recorded run).
# Input columns: 360, the number of samples in each window.
# Output columns: 4, one per abnormal class (a 1 in the column of the class).

(6036, 360)
(6036, 4)


In [None]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Conv1D, MaxPooling1D, Flatten

# Number of samples per window; the network input is (window_length, 1).
window_length = X_train.shape[1]

# Build the model layer by layer.
model = Sequential()

# Convolution layer. The number of filters sets how many distinct patterns the
# layer can extract; the kernel size is the width of the sliding window used
# to compute the convolution over the input.
model.add(Conv1D(filters=64, kernel_size=3, activation='relu', input_shape=(window_length, 1)))

# Pooling layer paired with the convolution to reduce dimensionality.
model.add(MaxPooling1D(pool_size=2))

# Flatten the feature maps into a vector so dense layers can be attached.
model.add(Flatten())

# First fully connected layer. The author notes that 100 units crashed, hence
# 32; the unit count drives how complex a relation the model can learn, and a
# power of two is the author's preferred choice.
model.add(Dense(32, activation='relu'))

# Output layer: 4 units because there are 4 output classes; softmax yields the
# class probabilities. ReLU is used in the hidden layers for non-linearity.
model.add(Dense(4, activation='softmax'))

In [None]:
# Compile the model.
model.compile(
    optimizer='adam',
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)

# Loss: categorical cross-entropy, because the target is a one-hot encoding
# over 4 classes; binary cross-entropy would be the choice for a binary
# target (e.g. normal vs abnormal).
# Optimizer: updates the model weights during training to minimise the loss;
# adam and sgd are common choices.
# Metrics: used to evaluate the model; accuracy is tracked here.

In [None]:
# Train the model.
# NOTE(review): the test split is passed as validation data, so the validation
# metrics shown during training are computed on the same rows that the notebook
# later uses for its predictions.
model.fit(X_train, Y_train, epochs=2, batch_size=10, validation_data=(X_test, Y_test)) # author's note: the usual values (10 and 32) crashed — presumably epochs=10 and batch_size=32, TODO confirm

Epoch 1/2
Epoch 2/2


<keras.src.callbacks.History at 0x7ae4883aca30>

In [None]:
from sklearn.metrics import confusion_matrix
import numpy as np

# Class probabilities predicted for the test rows.
Y_pred = model.predict(X_test)

# Turn probabilities / one-hot rows into class indices (position of the maximum).
predicted_labels = Y_pred.argmax(axis=1)
true_labels = Y_test.argmax(axis=1)

# Confusion matrix: rows are the true classes, columns the predicted classes.
conf_matrix = confusion_matrix(y_true=true_labels, y_pred=predicted_labels)

print("Matrice de confusion :", conf_matrix, sep="\n")


Matrice de confusion :
[[  1   3   2  62]
 [  0 525  26  86]
 [  0  42 104  63]
 [  4  89  39 463]]
