In [77]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [78]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import keras
from keras import layers
import os
import matplotlib.pyplot as plt
from imblearn.under_sampling import RandomUnderSampler
from tabulate import tabulate
from itertools import islice
from scipy.signal import resample
from sklearn.preprocessing import StandardScaler
import pickle

In [79]:
data_path = '/content/drive/MyDrive/MovePort/'
Modified_Df_PATH = '/content/drive/MyDrive/MovePortModif/'
participants = sorted([p for p in os.listdir(data_path) if p.isdigit()], key=int)
print("Participants disponibles :", participants)

Participants disponibles : ['1', '2', '3', '4', '5', '6', '7', '8', '9', '10', '11', '12', '13', '14', '15', '16', '17', '18', '19', '20', '21', '22', '23', '24', '25']


In [80]:
# Fonction pour créer un itérable par lot
def batch_iterable(iterable, size):
    iterator = iter(iterable)
    while batch := list(islice(iterator, size)):
        yield batch

In [81]:
# Fonction pour traiter les fichiers d'un participant
def process_participant_files(participant_path, activities):
    processed_data = {}

    for activity in activities:
        activity_path = os.path.join(participant_path, activity)
        files = [f for f in os.listdir(activity_path) if f.endswith('.csv') and not f.startswith('.')]  # Filtrer uniquement les fichiers CSV
        # Filtrer les fichiers spécifiques avec le format exact
        emg_files = [f for f in files if f.startswith('emg_1') and f.endswith('.csv')]
        imu_files = [f for f in files if f.startswith('imu_1') and f.endswith('.csv')]
        ips_files = [f for f in files if f.startswith('ips_1') and f.endswith('.csv')]
        print(f"  Activity: {activity}")
        """print(f"    EMG Files: {emg_files}")
        print(f"    IMU Files: {imu_files}")
        print(f"    IPS Files: {ips_files}")"""
        processed_data[activity] = {}
        # Traiter chaque groupe de fichiers (EMG, IMU, IPS)
        for file_group, file_list in [("EMG", emg_files), ("IMU", imu_files), ("IPS", ips_files)]:
            for file in file_list:
                file_path = os.path.join(activity_path, file)
                #print(f"      {file_group} File: {file}")
                try:
                    df = pd.read_csv(file_path)
                    df_transposed = df.transpose()  # Transposer pour correspondre aux dimensions
                    processed_data[activity][file_group] = df_transposed
                except Exception as e:
                    print(f"Impossible de lire ou de transposer le fichier {file}: {e}")
    return processed_data

In [82]:
# Fonction pour traiter les données par lots
def process_data_in_batches(data_path, participants_list, batch_size=5):
    all_processed_data = {}

    for batch_number, participant_batch in enumerate(batch_iterable(participants_list, batch_size), start=1):
        print(f"Traitement du lot {batch_number} : {participant_batch}")

        for participant in participant_batch:
            participant_path = os.path.join(data_path, str(participant))
            print(f"Participant: {participant}")

            # Filtrer les activités
            activities = [a for a in os.listdir(participant_path) if not a.startswith('.') and os.path.isdir(os.path.join(participant_path, a))]

            processed_data = process_participant_files(participant_path, activities)
            all_processed_data[participant] = processed_data
            print(f"  Données traitées pour le participant {participant}:")
            print(processed_data)

        print(f"Fin du traitement du lot {batch_number}\n")

    return all_processed_data

In [83]:
# aligner les fréquences
"""def resample_data(df, target_rate=100):
    original_rate = len(df) / (df.index[-1] - df.index[0])
    target_len = int(len(df) * target_rate / original_rate)
    resampled_data = resample(df, target_len)
    return pd.DataFrame(resampled_data, columns=df.columns)"""

'def resample_data(df, target_rate=100):\n    original_rate = len(df) / (df.index[-1] - df.index[0])\n    target_len = int(len(df) * target_rate / original_rate)\n    resampled_data = resample(df, target_len)\n    return pd.DataFrame(resampled_data, columns=df.columns)'

In [84]:
"""def resample_modalities(processed_data, target_rate=100):
    resampled_data = {}

    for participant, activities in processed_data.items():
        resampled_data[participant] = {}
        for activity, modalities in activities.items():
            resampled_data[participant][activity] = {}
            for modality, df in modalities.items():
                resampled_data[participant][activity][modality] = resample_data(df, target_rate)

    return resampled_data"""

'def resample_modalities(processed_data, target_rate=100):\n    resampled_data = {}\n\n    for participant, activities in processed_data.items():\n        resampled_data[participant] = {}\n        for activity, modalities in activities.items():\n            resampled_data[participant][activity] = {}\n            for modality, df in modalities.items():\n                resampled_data[participant][activity][modality] = resample_data(df, target_rate)\n\n    return resampled_data'

In [130]:
# Nettoyage des données
"""def clean_missing_values(processed_data, threshold=0.5):
    cleaned_data = {}
    for participant, activities in processed_data.items():
        cleaned_data[participant] = {}
        for activity, modalities in activities.items():
            cleaned_data[participant][activity] = {}
            for modality, df in modalities.items():
                # Identifier et traiter uniquement les colonnes numériques
                numeric_cols = df.select_dtypes(include=[np.number])
                non_numeric_cols = df.select_dtypes(exclude=[np.number])
                # Calculer le pourcentage de valeurs manquantes pour les colonnes numériques
                missing_percentage = numeric_cols.isna().mean()
                # Supprimer les colonnes numériques avec trop de NaN
                numeric_cols_cleaned = numeric_cols.loc[:, missing_percentage < threshold]
                # Remplir les valeurs manquantes dans les colonnes numériques restantes avec la médiane
                numeric_cols_cleaned = numeric_cols_cleaned.apply(lambda col: col.fillna(col.median()), axis=0)
                # Réassembler : conserver les colonnes non numériques inchangées
                df_cleaned = pd.concat([non_numeric_cols, numeric_cols_cleaned], axis=1)
                cleaned_data[participant][activity][modality] = df_cleaned

    return cleaned_data"""

In [155]:
# Nettoyage des données
def clean_missing_values(processed_data, threshold=0.51):
    cleaned_data = {}

    for participant, activities in processed_data.items():
        cleaned_data[participant] = {}
        for activity, modalities in activities.items():
            cleaned_data[participant][activity] = {}
            for modality, df in modalities.items():
                # Étape 1 : Supprimer les lignes contenant >= threshold% de zéros
                zero_threshold = (df == 0).mean(axis=1)  # Calcul du pourcentage de zéros par ligne
                df = df.loc[zero_threshold < threshold]  # Conserver les lignes où le pourcentage de zéros est inférieur au seuil
                # Étape 2 : Identifier et traiter uniquement les colonnes numériques
                numeric_cols = df.select_dtypes(include=[np.number])
                non_numeric_cols = df.select_dtypes(exclude=[np.number])
                # Étape 3 : Calculer le pourcentage de valeurs manquantes pour les colonnes numériques
                missing_percentage = numeric_cols.isna().mean()
                # Étape 4 : Supprimer les colonnes numériques avec trop de NaN
                numeric_cols_cleaned = numeric_cols.loc[:, missing_percentage < threshold]
                # Étape 5 : Remplir les valeurs manquantes dans les colonnes numériques restantes avec la médiane
                numeric_cols_cleaned = numeric_cols_cleaned.apply(lambda col: col.fillna(col.median()), axis=0)
                # Étape 6 : Réassembler : conserver les colonnes non numériques inchangées
                df_cleaned = pd.concat([non_numeric_cols, numeric_cols_cleaned], axis=1)
                cleaned_data[participant][activity][modality] = df_cleaned

    return cleaned_data


In [86]:
##  Normalisation des données
import pandas as pd
from sklearn.preprocessing import StandardScaler

def normalize_data(cleaned_data):
    normalized_data = {}
    scaler = StandardScaler()

    for participant, activities in cleaned_data.items():
        normalized_data[participant] = {}
        for activity, modalities in activities.items():
            normalized_data[participant][activity] = {}
            for modality, df in modalities.items():
                # Vérifier si la première ligne contient des chaînes de caractères (en-têtes)
                if isinstance(df.iloc[0, 0], str):
                    headers = df.columns
                    df_values = df.iloc[1:].astype(float)
                    # Conserver les en-têtes en réappliquant après normalisation
                    df_normalized_values = pd.DataFrame(scaler.fit_transform(df_values), columns=headers, index=df.index[1:])
                    df_normalized = pd.concat([df.iloc[:1], df_normalized_values])
                else:
                    df_normalized = pd.DataFrame(scaler.fit_transform(df), columns=df.columns, index=df.index)

                normalized_data[participant][activity][modality] = df_normalized

    return normalized_data

In [87]:
##  Normalisation des données
"""def normalize_data(df):
    scaler = StandardScaler()
    return pd.DataFrame(scaler.fit_transform(df), columns=df.columns)

def normalize_modalities(processed_data):
    normalized_data = {}

    for participant, activities in processed_data.items():
        normalized_data[participant] = {}
        for activity, modalities in activities.items():
            normalized_data[participant][activity] = {}
            for modality, df in modalities.items():
                normalized_data[participant][activity][modality] = normalize_data(df)

    return normalized_data"""


'def normalize_data(df):\n    scaler = StandardScaler()\n    return pd.DataFrame(scaler.fit_transform(df), columns=df.columns)\n\ndef normalize_modalities(processed_data):\n    normalized_data = {}\n\n    for participant, activities in processed_data.items():\n        normalized_data[participant] = {}\n        for activity, modalities in activities.items():\n            normalized_data[participant][activity] = {}\n            for modality, df in modalities.items():\n                normalized_data[participant][activity][modality] = normalize_data(df)\n\n    return normalized_data'

In [88]:
## Segmentation des données en fenêtres temporelles
def segment_data(df, window_size=100, step_size=50):
    segments = []
    num_frames = df.shape[0]

    for start in range(0, num_frames - window_size + 1, step_size):
        end = start + window_size
        segment = df.iloc[start:end].values  # Segmenter le dataframe
        segments.append(segment)

    return np.array(segments)

def segment_modalities(processed_data, window_size=100, step_size=50):
    segmented_data = {}

    for participant, activities in processed_data.items():
        segmented_data[participant] = {}
        for activity, modalities in activities.items():
            segmented_data[participant][activity] = {}

            for modality, df in modalities.items():
                segmented_data[participant][activity][modality] = segment_data(df, window_size, step_size)

    return segmented_data


In [89]:
## Reshaping des données IPS en matrices 3D
def reshape_ips_data(df):
    reshaped_data = df.values.reshape(-1, 31, 11)
    return reshaped_data

# Fonction pour segmenter et reshaper les données
def segment_and_reshape_modalities(processed_data, window_size=100, step_size=50):
    segmented_and_reshaped_data = {}

    for participant, activities in processed_data.items():
        segmented_and_reshaped_data[participant] = {}
        for activity, modalities in activities.items():
            segmented_and_reshaped_data[participant][activity] = {}
            for modality, df in modalities.items():
                # Segmenter les données pour chaque modalité
                segmented_data = segment_data(df, window_size, step_size)
                # Appliquer le reshaping uniquement pour les données IPS
                if modality == "IPS":
                    reshaped_data = reshape_ips_data(df)
                    segmented_and_reshaped_data[participant][activity][modality] = reshaped_data
                else:
                    # Laisser les données EMG et IMU sous forme de segments 2D
                    segmented_and_reshaped_data[participant][activity][modality] = segmented_data

    return segmented_and_reshaped_data

In [90]:
# Fusion des données multimodales (EMG, IMU, IPS)

def fuse_modalities(emg_data, imu_data, ips_data):
    # Fusionner les données en un seul vecteur pour chaque segment
    fused_data = np.concatenate([emg_data, imu_data, ips_data], axis=-1)
    return fused_data

def fuse_modalities_for_all_data(processed_data):
    fused_data = {}

    for participant, activities in processed_data.items():
        fused_data[participant] = {}
        for activity, modalities in activities.items():
            emg_data = modalities["EMG"]
            imu_data = modalities["IMU"]
            ips_data = modalities["IPS"]
            # Fusionner les données
            fused_segments = fuse_modalities(emg_data, imu_data, ips_data)
            fused_data[participant][activity] = {"fused_segments": fused_segments, "labels": None}  # Placeholder pour les labels

    return fused_data

In [91]:
###  Sauvegarde des segments et des labels
def save_data(fused_data, filename="processed_data.pkl"):
    with open(filename, 'wb') as f:
        pickle.dump(fused_data, f)

def save_segments_and_labels(fused_data, output_path):
    for participant, activities in fused_data.items():
        for activity, data in activities.items():
            segments = data["fused_segments"]
            labels = data["labels"]  # Vous devez ajouter des labels à ce point

            # Sauvegarder les segments et labels pour chaque activité
            segments_filename = os.path.join(output_path, f"segments_{participant}_{activity}.csv")
            labels_filename = os.path.join(output_path, f"labels_{participant}_{activity}.csv")

            # Sauvegarder les segments et les labels en fichiers CSV
            pd.DataFrame(segments).to_csv(segments_filename, index=False)
            pd.DataFrame(labels).to_csv(labels_filename, index=False)

In [92]:
# Liste des participants à traiter
participants_to_process = [1, 2]

In [107]:
# 1 : Charger et prétraiter les données
processed_data = process_data_in_batches(data_path, participants_to_process, batch_size=5)


Traitement du lot 1 : [1, 2]
Participant: 1
  Activity: forward
  Activity: still
  Activity: back
  Activity: halfsquat
  Données traitées pour le participant 1:
{'forward': {'EMG':                   0         1          2         3         4         5   \
Unnamed: 0    R_Vlat      R_RF       R_ST      R_TA    L_Vlat      L_RF   
0          -4.431152  3.323364   7.653809 -4.733276  3.927612  2.920532   
1          -9.970093  0.100708  31.723022 -4.733276   0.50354  3.726196   
2          -6.546021  2.920532  40.484619 -4.733276  2.215576 -0.704956   
3          -1.208496  0.302124  47.232056 -4.733276 -4.632568  0.402832   
...              ...       ...        ...       ...       ...       ...   
120155     -7.351685 -3.826905 -61.029053 -1.611328   2.01416 -1.309204   
120156     -7.452393  4.833984 -55.892944   1.00708 -4.733276 -2.920532   
120157     -4.934692  3.121948   -56.5979  0.402832  4.330444 -2.719116   
120158      -3.52478 -0.201416 -68.582153  0.604248  -3.52478 -3.22

In [134]:
# Étape 2 : Rééchantillonner les données pour aligner les fréquences
#resampled_data = resample_modalities(processed_data, target_rate=100)
type(processed_data)
# Si 'forward' contient plusieurs modalités (EMG, IMU, IPS)
print('################### FORWARD')
print("Shape of IPS data:", processed_data[1]['forward']['IPS'].shape)
print("Shape of EMG data:", processed_data[1]['forward']['EMG'].shape)
print("Shape of EMG data:", processed_data[1]['forward']['IMU'].shape)
print('################### BACK')
print("Shape of IPS data:", processed_data[1]['back']['IPS'].shape)
print("Shape of EMG data:", processed_data[1]['back']['EMG'].shape)
print("Shape of EMG data:", processed_data[1]['back']['IMU'].shape)
print('################### HALFSQUAT')
print("Shape of IPS data:", processed_data[1]['halfsquat']['IPS'].shape)
print("Shape of EMG data:", processed_data[1]['halfsquat']['EMG'].shape)
print("Shape of EMG data:", processed_data[1]['halfsquat']['IMU'].shape)
print('################### STILL')
print("Shape of IPS data:", processed_data[1]['still']['IPS'].shape)
print("Shape of EMG data:", processed_data[1]['still']['EMG'].shape)
print("Shape of EMG data:", processed_data[1]['still']['IMU'].shape)


################### FORWARD
Shape of IPS data: (3605, 682)
Shape of EMG data: (120161, 16)
Shape of EMG data: (6009, 54)
################### BACK
Shape of IPS data: (3974, 682)
Shape of EMG data: (132441, 16)
Shape of EMG data: (6623, 54)
################### HALFSQUAT
Shape of IPS data: (3583, 682)
Shape of EMG data: (119401, 16)
Shape of EMG data: (5971, 54)
################### STILL
Shape of IPS data: (3949, 682)
Shape of EMG data: (131621, 16)
Shape of EMG data: (6582, 54)


In [156]:
# Étape 3 : Nettoyer les valeurs manquantes
# Nettoyage des données avec un seuil de 50% pour supprimer les colonnes trop vides
cleaned_data = clean_missing_values(processed_data, threshold=0.51)
# Si 'forward' contient plusieurs modalités (EMG, IMU, IPS)

In [157]:
print('################### FORWARD')
print("Shape of IPS data:", cleaned_data[1]['forward']['IPS'].shape)
print("Shape of EMG data:", cleaned_data[1]['forward']['EMG'].shape)
print("Shape of EMG data:", cleaned_data[1]['forward']['IMU'].shape)
print('################### BACK')
print("Shape of IPS data:", cleaned_data[1]['back']['IPS'].shape)
print("Shape of EMG data:", cleaned_data[1]['back']['EMG'].shape)
print("Shape of EMG data:", cleaned_data[1]['back']['IMU'].shape)
print('################### HALFSQUAT')
print("Shape of IPS data:", cleaned_data[1]['halfsquat']['IPS'].shape)
print("Shape of EMG data:", cleaned_data[1]['halfsquat']['EMG'].shape)
print("Shape of EMG data:", cleaned_data[1]['halfsquat']['IMU'].shape)
print('################### STILL')
print("Shape of IPS data:", cleaned_data[1]['still']['IPS'].shape)
print("Shape of EMG data:", cleaned_data[1]['still']['EMG'].shape)
print("Shape of EMG data:", cleaned_data[1]['still']['IMU'].shape)

################### FORWARD
Shape of IPS data: (3605, 682)
Shape of EMG data: (120161, 16)
Shape of EMG data: (6007, 54)
################### BACK
Shape of IPS data: (1232, 682)
Shape of EMG data: (132441, 16)
Shape of EMG data: (6623, 54)
################### HALFSQUAT
Shape of IPS data: (3553, 682)
Shape of EMG data: (119401, 16)
Shape of EMG data: (5969, 54)
################### STILL
Shape of IPS data: (3949, 682)
Shape of EMG data: (131621, 16)
Shape of EMG data: (6580, 54)


In [158]:
# Étape 4 : Normaliser les données
normalized_data = normalize_data(cleaned_data)

In [159]:
# Participant à vérifier
participant_id = 1

# Parcourir les activités et modalités pour ce participant
for activity, modalities in normalized_data[participant_id].items():
    print(f"Activité : {activity}")

    for modality, df in modalities.items():
        print(f"  Modalité : {modality}, Taille des données : {df.shape}")

        # Afficher un aperçu des 5 premières lignes de la modalité
        print(f"    Aperçu des 5 premières lignes de {modality} :")
        print(df.head())  # Afficher les premières lignes du DataFrame
        print("-" * 50)  # Ligne de séparation pour rendre la sortie plus lisible

Activité : forward
  Modalité : EMG, Taille des données : (120161, 16)
    Aperçu des 5 premières lignes de EMG :
                  0         1         2         3         4         5   \
Unnamed: 0    R_Vlat      R_RF      R_ST      R_TA    L_Vlat      L_RF   
0          -0.699717  1.588466  0.228831 -0.174003  1.204777   1.25631   
1          -1.901176  0.701886  0.750127 -0.174003  0.406394  1.460689   
2          -1.158456  1.477643  0.939888 -0.174003  0.805585  0.336604   
3          -0.000686  0.757297  1.086025 -0.174003 -0.791181  0.617625   

                  6         7         8         9         10        11  \
Unnamed: 0      L_ST      L_TA      R_MG      R_LG     R_SOL      R_IL   
0           1.058997 -0.015547  0.031426  0.086216  0.101264 -1.216718   
1           1.249201 -0.155308 -0.647623  0.453086  0.540769 -1.216718   
2           1.374871  0.155979 -0.140485 -0.966541  0.120099 -1.216718   
3           1.575264  0.429149  0.117381 -0.535868  0.471704 -1.216718 

In [97]:
# Étape 5 : Segmenter les données en fenêtres temporelles
segmented_data = segment_modalities(normalized_data, window_size=100, step_size=50)

In [98]:
# Étape 6 : Reshaper les données IPS en matrices
reshaped_data = segment_and_reshape_modalities(segmented_data)

AttributeError: 'numpy.ndarray' object has no attribute 'iloc'

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Pour les analyses statistiques
from scipy.signal import resample
from scipy.stats import describe