# Preprocessing

In [2]:
import os
import pandas as pd
import numpy as np
from scipy import signal
from datetime import datetime, timedelta

In [5]:
# Subject folders scanned when the caller does not pass an explicit list.
_DEFAULT_INDIVIDUALS = (
    'S2', 'S3', 'S4', 'S5', 'S6', 'S7', 'S8', 'S9', 'S10',
    'S11', 'S13', 'S14', 'S15', 'S16', 'S17',
)
_SMOOTHING_WINDOW = 3    # Samples in the rolling-mean window applied to both signals
_TRANSITION_LABEL = 999  # Placeholder label for samples outside every quest interval
_N_CONDITIONS = 5        # quest.csv columns 1..5 are used as labelled intervals


def _time_axis(start, n_samples, sampling_rate):
    """Return ``n_samples`` timestamps spaced ``1 / sampling_rate`` seconds apart, starting at ``start``."""
    return start + pd.to_timedelta(np.arange(n_samples) / sampling_rate, unit='s')


def _process_individual(individual_path, subject_id, sampling_rate_original, sampling_rate_target):
    """Build the labelled HR/temperature table for one subject folder.

    Returns ``None`` when ``E4_Data/HR.csv``, ``E4_Data/TEMP.csv`` or
    ``quest.csv`` is missing (which also covers a missing subject folder),
    otherwise a DataFrame with columns ``ID``, ``Time``, ``hr``, ``temp``
    and ``labels``.
    """
    hr_path = os.path.join(individual_path, 'E4_Data', 'HR.csv')
    temp_path = os.path.join(individual_path, 'E4_Data', 'TEMP.csv')
    quest_path = os.path.join(individual_path, 'quest.csv')

    # Check every input up front so an incomplete subject is skipped as a
    # whole instead of failing half-way through on quest.csv.
    if not all(os.path.isfile(path) for path in (hr_path, temp_path, quest_path)):
        return None

    # HR signal: the first two rows of the file are skipped
    # (presumably a start-timestamp row and a sample-rate row -- TODO confirm).
    hr_raw = pd.read_csv(hr_path, names=['hr'], skiprows=2)['hr']
    hr_smoothed = hr_raw.rolling(_SMOOTHING_WINDOW).mean().to_numpy()

    # Temp signal: resampled from the original to the target rate, then smoothed.
    temp_raw = pd.read_csv(temp_path, names=['temp'], skiprows=2)['temp'].to_numpy()
    n_resampled = int(len(temp_raw) * sampling_rate_target / sampling_rate_original)
    temp_resampled = signal.resample(temp_raw, n_resampled)
    temp_smoothed = pd.Series(temp_resampled).rolling(_SMOOTHING_WINDOW).mean().to_numpy()

    # Both signals get a time axis starting at the same arbitrary origin; it is
    # only used as a merge key and to derive elapsed minutes.
    # NOTE(review): this assumes both recordings start at the same instant -- confirm.
    merge_origin = pd.to_datetime('00:00', format='%M:%S')
    hr_frame = pd.DataFrame({
        'ID': subject_id,
        'Time': _time_axis(merge_origin, len(hr_smoothed), sampling_rate_target),
        'hr': hr_smoothed,
    })
    temp_frame = pd.DataFrame({
        'ID': subject_id,
        'Time': _time_axis(merge_origin, len(temp_smoothed), sampling_rate_target),
        'temp': temp_smoothed,
    })

    # Outer merge keeps samples present in only one of the two signals.
    final_df = pd.merge(hr_frame, temp_frame, on=['ID', 'Time'], how='outer')

    # Forward- then backward-fill: covers the rolling-window warm-up NaNs and
    # any tail where one signal is longer than the other.
    final_df = final_df.ffill().bfill()

    # Whole minutes elapsed since the origin. Derived from total seconds so the
    # value keeps growing past 24 h instead of wrapping like hour*60 + minute.
    elapsed_min = (final_df['Time'] - merge_origin).dt.total_seconds() // 60

    # The label column is object-typed from the start so string labels can be
    # written into it without an int -> object upcast (deprecated by pandas).
    final_df['labels'] = pd.Series(_TRANSITION_LABEL, index=final_df.index, dtype=object)

    quest = pd.read_csv(quest_path, delimiter=';')
    labels = quest.columns
    # Row 0 holds each interval's start, row 1 its end; the column header is
    # the label. Later columns overwrite earlier ones where intervals overlap.
    # NOTE(review): the values are compared as whole elapsed minutes; if
    # quest.csv stores them in another encoding (e.g. minutes.seconds), the
    # interval boundaries are only approximate -- confirm.
    for col in range(1, _N_CONDITIONS + 1):
        interval_start = quest.iloc[0, col]
        interval_end = quest.iloc[1, col]
        in_interval = (elapsed_min >= interval_start) & (elapsed_min <= interval_end)
        final_df.loc[in_interval, 'labels'] = labels[col]

    # Replace the merge key with the timestamps that are written to the output.
    output_origin = pd.to_datetime('2023-07-13 00:00', format='%Y-%m-%d %M:%S')
    final_df['Time'] = _time_axis(output_origin, len(final_df), sampling_rate_target)

    return final_df


def process_data(folder_path, individuals=None):
    """Merge per-subject HR and temperature recordings into one labelled CSV.

    For every subject folder under ``folder_path`` the function reads
    ``E4_Data/HR.csv`` and ``E4_Data/TEMP.csv``, resamples the temperature
    signal from 4 to 1 samples per second, smooths both signals with a
    3-sample rolling mean, merges them on a shared time axis and labels each
    sample from the intervals in the subject's ``quest.csv``. The
    concatenated result is written to ``<folder_path>/final_df.csv``.

    Parameters
    ----------
    folder_path : str
        Directory containing one sub-folder per subject.
    individuals : sequence of str, optional
        Subject folder names to process. Defaults to S2-S17 without S12.
        A subject's ``ID`` is its position in this sequence, so skipped
        subjects leave gaps in the IDs.

    Returns
    -------
    None
        The result is written to disk and the output path is printed.

    Raises
    ------
    ValueError
        If no subject folder contains all three required files.
    """
    if individuals is None:
        individuals = _DEFAULT_INDIVIDUALS
    sampling_rate_original = 4  # Original sampling rate
    sampling_rate_target = 1  # Target sampling rate

    dfs = []
    for idx, individual in enumerate(individuals):
        subject_df = _process_individual(
            os.path.join(folder_path, individual),
            idx,
            sampling_rate_original,
            sampling_rate_target,
        )
        if subject_df is None:
            continue  # Folder or one of its required files is missing
        dfs.append(subject_df)

    if not dfs:
        # pd.concat([]) would fail with an opaque "No objects to concatenate".
        raise ValueError(
            f"No subject folder with HR.csv, TEMP.csv and quest.csv found under {folder_path}"
        )

    # Save the merged dataframe to a CSV file
    merged_df = pd.concat(dfs, ignore_index=True)
    output_file_path = os.path.join(folder_path, 'final_df.csv')
    merged_df.to_csv(output_file_path, index=False)
    print(f"Merged dataset saved as {output_file_path}")

In [6]:
# Root folder that process_data scans for the per-subject directories.
# NOTE(review): absolute, machine-specific path -- point it at the local
# copy of the dataset before re-running this cell.
folder_path = 'C:/Users/aless/OneDrive - Università degli Studi di Catania/tesi/dataset'
# Builds the merged table and writes final_df.csv inside folder_path.
process_data(folder_path)

Merged dataset saved as C:/Users/aless/OneDrive - Università degli Studi di Catania/tesi/dataset\final_df.csv
