# 1. Original Dataset (1min)

## EEG

In [None]:
import pandas as pd
import numpy as np
from datetime import datetime, timedelta

class EEGProcessor:
     
    # time_interval : grouping interval in seconds, shared with the Fitbit data (e.g. 10 secs)
    # remove_time_in_group : threshold in seconds used when handling error values and partial intervals inside each time_interval group (e.g. 7 secs)
    
    def __init__(self, file_path, time_interval, remove_time_in_group):
        self.time_interval = time_interval
        self.remove_time_in_group = remove_time_in_group
        self.time_interval_str = f'{time_interval}S'
        self.EEG_report = pd.read_csv(file_path)

    # List to dataframe (ex. brain waves)
    def parse_raw_data(self, dataframe, col_name):
        col_str = dataframe.iloc[0][col_name]
        col_str = col_str.strip('[]')
        col_list = [float(val) for val in col_str.split(',')]  # cause list is divided by comma
        col_data = pd.DataFrame({col_name: col_list})
        return col_data

    # Experiment time calculating function
    def time_difference(self, dataframe, start_time_col, finish_time_col):
        start_time = datetime.strptime(dataframe.iloc[0][start_time_col], '%Y-%m-%d %H:%M:%S')
        finish_time = datetime.strptime(dataframe.iloc[0][finish_time_col], '%Y-%m-%d %0H:%M:%S')

        # time difference between two datatime objects
        time_difference = (finish_time - start_time).total_seconds()
        return time_difference
    
    # Comparing the experimental initial recognition error period and delete the part to be deleted
    def count_initial_same_values(self, series):
        initial_value = series.iloc[0]
        count = 0
        for value in series:
            if value == initial_value:
                count += 1
            else:
                break
        return count
    
    # Processing of values that are not exactly divided into front and back
    def process_start_time_trash_sec(self, start_time):
        # Plus 1 min and delete second in input time
        rounded_time = start_time + timedelta(minutes=1) - timedelta(seconds=start_time.second)
        time_difference = (rounded_time - start_time).total_seconds()

        remainder = time_difference % float(self.time_interval)

        # use the seconds over remove_time_in_group seconds
        if self.remove_time_in_group <= remainder:
            return False
        
        else:
            return remainder
        
    # Processing of values that are not exactly divided into front and back
    def process_finish_time_trash_sec(self, finish_time):
        # delete second in input time
        rounded_time = finish_time - timedelta(seconds=finish_time.second)
        time_difference = (finish_time - rounded_time).total_seconds()

        remainder = time_difference % float(self.time_interval)

        if self.remove_time_in_group <= remainder:
            return False
        
        else:
            return remainder

    # Rounding time to nearest time which can divided by time interval
    def nearest_time_rounding(self, time):
        seconds = time.second
        # For example, find nearest value in 0, 10, 20, 30, 40, 50sec
        time_points = [time_point for time_point in range(0,60, self.time_interval)]
        nearest = min(time_points, key=lambda time_point: abs(time_point - seconds))
        
        if nearest == time_points[-1] and seconds >= (time_points[-1] + self.remove_time_in_group):
            rounded_time = time.replace(second=0, microsecond=0) + timedelta(minutes=1)
        else:
            rounded_time = time.replace(second=nearest, microsecond=0)

        return rounded_time

    # Make the same end time
    def align_end_time(self, dataframe_1, dataframe_2):
        if dataframe_1.index[-1] > dataframe_2.index[-1]:
            dataframe_1 = dataframe_1[dataframe_1.index <= dataframe_2.index[-1]]

        elif dataframe_1.index[-1] < dataframe_2.index[-1]:
            dataframe_2 = dataframe_2[dataframe_2.index <= dataframe_1.index[-1]]

        else: 
            pass # if two dataframe's endtime is same

        return dataframe_1, dataframe_2

    # Adjust start time and end time processing
    # start time processing -> process_type : 0 , finish time processing -> process_type : -1
    # start time processing -> process_start_time_trash_sec func , finish time processing -> process_finish_time_trash_sec func    
    def adjust_time_index(self, process_type, dataframe, func):
        remainder = func(dataframe.index[process_type])
        
        # the last data only shows one original data, so processing this problem
        one_sec = timedelta(seconds=1)

        if remainder == False:
            # change time to nearest (Start time processing)
            if process_type == 0 :
                time = self.nearest_time_rounding(dataframe.index[process_type])
                new_index = dataframe.index.tolist()
                new_index[process_type] = time
                dataframe.index = new_index
            
            # change time to nearest (Finish time processing)
            else:
                time = self.nearest_time_rounding(dataframe.index[process_type]) - one_sec
                new_index = dataframe.index.tolist()
                new_index[process_type] = time
                dataframe.index = new_index
        
        # if remainder is under remove_time_in_group, just remove
        else:
            cutting_time = timedelta(seconds=remainder)
            # Start time processing
            if process_type == 0:
                dataframe = dataframe[dataframe.index >= dataframe.index[process_type] + cutting_time]
            
            # Finish time processing
            # make the seconds like 9, 19, 29...
            else:
                dataframe = dataframe[dataframe.index <= dataframe.index[-1] - cutting_time - one_sec]

        return dataframe
    
    # Removing error values in group (brain waves and attention score)
    def check_invalid_values(self, group):
        # find error data length in brain wave
        alpha_invalid_series = group['α_wave_raw_data'].diff().eq(0)
        alpha_invalid_timestamps = group.index[alpha_invalid_series].tolist()

        # find error data length in attention_raw_data
        attention_invalid_series = group['attention_raw_data'] == 0
        attention_invalid_timestamps = group.index[attention_invalid_series].tolist()

        # check whether the length of error data is over remove_time_in_group second
        def has_long_invalid_duration(invalid_timestamps):
            if not invalid_timestamps:
                return False
            for invalid_time in range(1, len(invalid_timestamps)):
                if (invalid_timestamps[invalid_time] - invalid_timestamps[invalid_time-1]).seconds > self.remove_time_in_group:
                    return True
            return False

        alpha_invalid = has_long_invalid_duration(alpha_invalid_timestamps)
        attention_invalid = has_long_invalid_duration(attention_invalid_timestamps)

        if alpha_invalid or attention_invalid:
            return group.mean()
#             return pd.Series([np.nan] * group.shape[1], index=group.columns)

        else:
            return group.mean()
            # calculate average except error value
#             valid_conditions = (
#                 (group['α_wave_raw_data'].diff() != 0) & 
#                 (group['β_wave_raw_data'].diff() != 0) & 
#                 (group['θ_wave_raw_data'].diff() != 0) & 
#                 (group['δ_wave_raw_data'].diff() != 0) & 
#                 (group['γ_wave_raw_data'].diff() != 0) & 
#                 (group['attention_raw_data'] != 0)
#             )
#             return group[valid_conditions].mean()

    # Removing error values in group (hr)
    def check_invalid_values_other(self, group):
        # find error data length in hr
        hr_invalid_series = group['hr_raw_data'] == 0
        hr_invalid_timestamps = group.index[hr_invalid_series].tolist()

        # check whether the length of error data is over remove_time_in_group second
        def has_long_invalid_duration(invalid_timestamps):
            if not invalid_timestamps:
                return False
            for invalid_time in range(1, len(invalid_timestamps)):
                if (invalid_timestamps[invalid_time] - invalid_timestamps[invalid_time-1]).seconds > self.remove_time_in_group:
                    return True
            return False

        hr_invalid = has_long_invalid_duration(hr_invalid_timestamps)

        if hr_invalid:
            return group.mean()
#             return pd.Series([np.nan] * group.shape[1], index=group.columns)

        else:
            # calculate average except error value
#             group = group[(group['hr_raw_data'] != 0)]
            return group.mean()
    
    # Process EEG data
    def process_eeg_data(self, experiment_id):
        """Build the per-interval EEG feature table for one experiment.

        Args:
            experiment_id: Index label of the experiment row in ``self.EEG_report``.

        Returns:
            ``None`` when ``experiment_id`` is not an index label of the
            report. Otherwise a DataFrame indexed by the floored group time
            with the columns ``alpha_wave``, ``beta_wave``, ``theta_wave``,
            ``delta_wave``, ``gamma_wave``, ``attention``, ``hrv``, ``hr``,
            ``coherence`` and ``SP ratio`` (beta mean divided by theta mean).
        """
        if experiment_id not in self.EEG_report.index:
            return None

        # single-row frame holding only the requested experiment
        EEG_report_sample = self.EEG_report.loc[[experiment_id],:]

        # parse every list-valued column into its own one-column dataframe
        cols = ['α_wave_raw_data', 'β_wave_raw_data', 'θ_wave_raw_data', 'δ_wave_raw_data', 'γ_wave_raw_data', 'attention_raw_data', 'hrv_raw_data', 'hr_raw_data', 'coherence_flag_raw_data']
        parsed_dfs = [self.parse_raw_data(EEG_report_sample, col) for col in cols]

        # Two sampling intervals are derived: experiment duration divided by the
        # number of alpha samples, and by the number of hrv samples.
        # NOTE(review): time_difference is called twice with identical arguments.
        interval_sec = self.time_difference(EEG_report_sample, 'meditation_start_time', 'meditation_finish_time') / len(parsed_dfs[0])
        interval_sec_other = self.time_difference(EEG_report_sample, 'meditation_start_time', 'meditation_finish_time') / len(parsed_dfs[6])

        # first frame: the five waves plus attention; second frame: hrv, hr, coherence flag
        merged_df = parsed_dfs[0].join(parsed_dfs[1:6])
        merged_df_other = parsed_dfs[6].join(parsed_dfs[7:])

        # experiment start time
        start_time = datetime.strptime(EEG_report_sample.iloc[0]['meditation_start_time'], '%Y-%m-%d %H:%M:%S')

        # replace the positional index with timestamps: start_time + position * interval
        # (each interval is rounded to 2 decimals before it becomes a timedelta)
        interval_sec, interval_sec_other = timedelta(seconds=round(interval_sec,2)), timedelta(seconds=round(interval_sec_other,2))
        merged_df['time'] = [start_time + time * interval_sec for time in range(len(merged_df))]
        merged_df_other['time'] = [start_time + time * interval_sec_other for time in range(len(merged_df_other))]
        merged_df, merged_df_other = merged_df.set_index('time'), merged_df_other.set_index('time')
        
        # comparing the initial experiment error time (disabled in this variant)
#         counts = [self.count_initial_same_values(merged_df[col]) for col in cols[:6]] + [self.count_initial_same_values(merged_df_other['hr_raw_data'])]
#         initial_error_times = [counts[error] * interval_sec.total_seconds() if error != 6 else counts[error] * interval_sec_other.total_seconds() for error in range(7)]
#         initial_error_time = timedelta(seconds=max(initial_error_times))

        # dataset start time: with the initial-error trimming disabled this is
        # the experiment start; the strict ">" drops the sample stamped exactly
        # at start_time, then timestamps are rounded to whole seconds
#         real_start_time = start_time + initial_error_time
        real_start_time = start_time
        merged_df, merged_df_other = merged_df[merged_df.index > real_start_time], merged_df_other[merged_df_other.index > real_start_time]
        merged_df.index, merged_df_other.index = merged_df.index.round('S'), merged_df_other.index.round('S')

        # make the experiment end time same
        merged_df, merged_df_other = self.align_end_time(merged_df, merged_df_other)

        # start time process -> i : 0 , finish time process -> i : -1
        # start time process -> process_start_time_trash_sec func , finish time process -> process_finish_time_trash_sec func
        merged_df = self.adjust_time_index(0, merged_df, self.process_start_time_trash_sec)
        merged_df_other = self.adjust_time_index(0, merged_df_other, self.process_start_time_trash_sec)
        merged_df = self.adjust_time_index(-1, merged_df, self.process_finish_time_trash_sec)
        merged_df_other = self.adjust_time_index(-1, merged_df_other, self.process_finish_time_trash_sec)

        # group the samples by their timestamp floored to time_interval
        grouped = merged_df.groupby(merged_df.index.floor(self.time_interval_str))
        grouped_other = merged_df_other.groupby(merged_df_other.index.floor(self.time_interval_str))

        # one aggregated row per group
        result = grouped.apply(self.check_invalid_values)
        result_other = grouped_other.apply(self.check_invalid_values_other)

        # final EEG dataset (both aggregates merged on the group time) including β/θ SP ratio
        EEG_data_per_time_interval = result.merge(result_other, left_index=True, right_index=True)
        EEG_data_per_time_interval['β/θ SP'] = EEG_data_per_time_interval['β_wave_raw_data'] / EEG_data_per_time_interval['θ_wave_raw_data']
        
        # switch to ASCII column names for the exported dataset
        EEG_data_per_time_interval = EEG_data_per_time_interval.rename(columns={
            'α_wave_raw_data':'alpha_wave',
            'β_wave_raw_data':'beta_wave',
            'θ_wave_raw_data':'theta_wave',
            'δ_wave_raw_data':'delta_wave',
            'γ_wave_raw_data':'gamma_wave',
            'attention_raw_data' : 'attention',
            'hrv_raw_data' : 'hrv',
            'hr_raw_data' : 'hr',
            'coherence_flag_raw_data' : 'coherence',
            'β/θ SP' : 'SP ratio'
        })

        return EEG_data_per_time_interval

## Fitbit

In [None]:
import datetime as dt
import matplotlib.pyplot as plt
import glob

class FitbitProcessor:
    '''
    time_interval : grouping interval in seconds, shared with the EEG dataset (e.g. 10 secs).
    BM_sum_minutes : look-back window, in minutes, for the accumulated Body Movement feature (e.g. 180 = 3 hours).
    The raw Body Movement feature was not meaningful because the experiments were usually conducted while sitting on a chair.
    So a new body movement feature was designed: the Body Movement value accumulated over the preceding period,
    where the length of that period is given by BM_sum_minutes.
    '''
    def __init__(self, folder_path, time_interval, BM_sum_minutes):
        self.folder_path = folder_path
        self.BM_sum_minutes = BM_sum_minutes
        self.BM_sum_minutes_str = f'{BM_sum_minutes}T'
        self.time_interval = time_interval
        self.time_interval_str = f'{time_interval}S'
        folder_patterns = [
            "Active Zone Minutes (AZM)/*",
            "Sleep Score/*",
            "Stress Journal/CEDA*",
            "Temperature/Wrist Temperature - *"
        ]
        self.things_path = [glob.glob(f"{folder_path}/{pattern}") for pattern in folder_patterns]
        self.things_path = [item for sublist in self.things_path for item in sublist]
        self.wt_count = len(glob.glob(f"{folder_path}/Temperature/Wrist Temperature - *"))
        self.azm_count = len(glob.glob(f"{folder_path}/Active Zone Minutes (AZM)/*"))
        self.sleep_count = len(glob.glob(f"{folder_path}/Sleep Score/*"))
        self.eda_count = len(glob.glob(f"{folder_path}/Stress Journal/CEDA*"))
        # original fitbit dataset's time interval is 1 min
        self.original_interval = 60
        self.num_timestamps = self.original_interval // self.time_interval
        self.half_point = self.num_timestamps // 2
    
    def read_filtered_csv(self, path, columns):
        name = pd.read_csv(path)
        name = name[columns]
        name[columns[0]] = pd.to_datetime(name[columns[0]])
        return name
    
    def round_seconds(self, obj):
        if obj.second % self.time_interval == 0:
            return obj
        else:
            return obj - timedelta(seconds=obj.second % self.time_interval)
    
    def round_zero(self, datetime_obj):
        datetime_obj = datetime_obj.replace(second=0)
        return datetime_obj
    
    # Process whole fitbit data
    def process_fitbit_data(self):
        AZM_col = ['date_time', 'total_minutes']
        sleep_col = ['timestamp', 'deep_sleep_in_minutes']
        stress_col = ['timestamp', 'eda_level_real']
        temp_col = ['recorded_time', 'temperature']
        
        # merge all features
        things_col = [AZM_col] * self.azm_count + [sleep_col] * self.sleep_count + [stress_col] * self.eda_count + [temp_col] * self.wt_count
        things = [self.read_filtered_csv(path, col) for path, col in zip(self.things_path, things_col)]

        # if there's no wrist temperature
        if self.wt_count == 0:
            # if there's no eda data
            # there was no eda in two subjects' fitbit data
            if self.eda_count == 0:
                azm = self.process_azm(things[:self.azm_count])
                sleep = self.process_sleep(things[self.azm_count:self.azm_count+self.sleep_count])
                Min_Time, Max_Time = self.find_time_bounds([azm, sleep])
                
                # make final dataframe
                df = self.create_final_df([azm, sleep], Min_Time, Max_Time)
                df['timestamp'] = pd.to_datetime(df['timestamp'])
                df = df.set_index('timestamp')
                df = df.assign(temperature=np.nan)
                df = df.assign(eda=np.nan)
                
                return df
            
            else:
                azm = self.process_azm(things[:self.azm_count])
                sleep = self.process_sleep(things[self.azm_count:self.azm_count+self.sleep_count])
                eda = self.process_eda(things[self.azm_count+self.sleep_count:self.azm_count+self.sleep_count+self.eda_count])
                Min_Time, Max_Time = self.find_time_bounds([azm, sleep, eda])

                df = self.create_final_df([azm, sleep, eda], Min_Time, Max_Time)
                df['timestamp'] = pd.to_datetime(df['timestamp'])
                df = df.set_index('timestamp')
                df = df.assign(temperature=np.nan)
                return df
            
        # if there's no Active Zone Minutes data
        elif self.azm_count == 0:
            sleep = self.process_sleep(things[self.azm_count])
            eda = self.process_eda(things[self.azm_count+self.sleep_count:self.azm_count+self.sleep_count+self.eda_count])
            temp = self.process_temperature(things[self.azm_count+self.sleep_count+self.eda_count:])
            Min_Time, Max_Time = self.find_time_bounds([sleep, eda, temp])

            df = self.create_final_df([sleep, eda, temp], Min_Time, Max_Time)
            df['timestamp'] = pd.to_datetime(df['timestamp'])
            df = df.set_index('timestamp')
            df = df.assign(BM=np.nan)

            return df
        
        # if there's no sleep data
        elif self.sleep_count == 0:
            azm = self.process_azm(things[:self.azm_count])
            eda = self.process_eda(things[self.azm_count+self.sleep_count:self.azm_count+self.sleep_count+self.eda_count])
            temp = self.process_temperature(things[self.azm_count+self.sleep_count+self.eda_count:])
            Min_Time, Max_Time = self.find_time_bounds([azm, eda, temp])

            df = self.create_final_df([azm, eda, temp], Min_Time, Max_Time)
            df['timestamp'] = pd.to_datetime(df['timestamp'])
            df = df.set_index('timestamp')
            df = df.assign(sleep=np.nan)

            return df            
        
        # if there's no eda data
        elif self.eda_count == 0:
            azm = self.process_azm(things[:self.azm_count])
            sleep = self.process_sleep(things[self.azm_count:self.azm_count+self.sleep_count])
            temp = self.process_temperature(things[self.azm_count+self.sleep_count:])
            Min_Time, Max_Time = self.find_time_bounds([azm, sleep, temp])

            df = self.create_final_df([azm, sleep, temp], Min_Time, Max_Time)
            df['timestamp'] = pd.to_datetime(df['timestamp'])
            df = df.set_index('timestamp')
            df = df.assign(eda=np.nan)

            return df
            
        # if there's no error in data file
        else:
            azm = self.process_azm(things[:self.azm_count])
            sleep = self.process_sleep(things[self.azm_count:self.azm_count+self.sleep_count])
            eda = self.process_eda(things[self.azm_count+self.sleep_count:self.azm_count+self.sleep_count+self.eda_count])
            temp = self.process_temperature(things[self.azm_count+self.sleep_count+self.eda_count:])
            Min_Time, Max_Time = self.find_time_bounds([azm, sleep, eda, temp])

            df = self.create_final_df([azm, sleep, eda, temp], Min_Time, Max_Time)
            df['timestamp'] = pd.to_datetime(df['timestamp'])
            df = df.set_index('timestamp')

            return df
    
    def process_azm(self, azm):

        azm = pd.concat(azm, axis=0)
        azm = azm.rename(columns={'date_time':'timestamp', 'total_minutes':'BM'})
        time_list = []
        body_movement = []

        for time in azm['timestamp']:
            for _ in range(self.num_timestamps):
                time_list.append(time)
                
        for bm in azm['BM']:
            for _ in range(self.num_timestamps):
                body_movement.append(bm)

        azm_list = {'timestamp': time_list, 'BM': body_movement}
        azm_final = pd.DataFrame(azm_list)

#         azm_final['timestamp'] = pd.to_datetime(azm_final['timestamp'])
#         azm_final.set_index('timestamp', inplace=True)
#         azm_final = azm_final.resample(self.time_interval_str).asfreq().fillna(0)
#         azm_final['new_BM'] = azm_final['BM'].rolling(self.BM_sum_minutes_str, closed='right').sum()
#         azm_final = azm_final.drop(['BM'], axis=1)
#         azm_final = azm_final.rename(columns={'new_BM':'BM'})
#         azm_final = azm_final.astype({'BM':'int'})
#         azm_final.reset_index(inplace=True)
        
        return azm_final
    
    # Process sleep data (Deep sleep in minutes)
    def process_sleep(self, sleep):
        sleep = pd.concat(sleep, axis=0)
        sleep['timestamp'] = [
                self.round_zero(datetime.strptime(str(time)[0:19], '%Y-%m-%d %H:%M:%S')) for time in sleep['timestamp']
            ]
            
        sleep = sleep.rename(columns={'deep_sleep_in_minutes':'sleep'})
        return sleep
    
    # Process eda data
    def process_eda(self, eda):
        eda = pd.concat(eda, axis=0)
        eda['timestamp'] = [
            self.round_seconds(
                datetime.strptime(str(time)[0:19], '%Y-%m-%d %H:%M:%S') - dt.timedelta(hours=4)
            ) for time in eda['timestamp']]

        # EDA Bilinear Interpolation
        time_list = []
        eda_list = []
        
        for time in range(1, len(eda['timestamp']) - 1):
            start_timestamp = eda.iloc[time,0]
            for num in range(self.num_timestamps):
                new_timestamp = start_timestamp + timedelta(seconds = (num * self.time_interval))
                time_list.append(new_timestamp)

                value = eda.iloc[time,1]
                eda_list.append(round(value, 2))

        eda_list = {'timestamp': time_list, 'eda': eda_list}
        eda_final = pd.DataFrame(eda_list)

        return eda_final
    
    # Process temperature data
    def process_temperature(self, temperature):
        #Temperature
        temp = pd.concat(temperature, axis=0)
        temp = temp.rename(columns={'recorded_time':'timestamp'})

        # Temperature Bilinear Interpolation
        time_list = []
        temp_list = []

        for time in range(1, len(temp['timestamp']) - 1):
            start_timestamp = temp.iloc[time,0]
            for num in range(self.num_timestamps):
                new_timestamp = start_timestamp + timedelta(seconds = (num * self.time_interval))
                time_list.append(new_timestamp)

                value = temp.iloc[time,1]
                temp_list.append(round(value, 6))

        temp_list = {'timestamp': time_list, 'temperature': temp_list}
        temp_final = pd.DataFrame(temp_list)

        return temp_final
    
    # find minimum and maximum time of whole feature
    def find_time_bounds(self, dataframes):
        min_times = []
        max_times = []
        
        for df in dataframes:
            if not df.empty:
                min_times.append(df['timestamp'].min())
                max_times.append(df['timestamp'].max())
                
        if not min_times or not max_times:
            Min_Time = pd.Timestamp.now(tz='UTC')
            Max_Time = pd.Timestamp.now(tz='UTC')
        else:
            Min_Time = min(min_times)
            Max_Time = max(max_times)

        return Min_Time, Max_Time

    # create dataframe from Min_time to Max_time
    def create_final_df(self, datasets, Min_Time, Max_Time):
        fitbit = pd.date_range(start=Min_Time, end=Max_Time, freq=self.time_interval_str, name='timestamp')
        fitbit = pd.DataFrame(fitbit)

        for dataset in datasets:
            fitbit = pd.merge(fitbit, dataset, how='outer', on='timestamp')
            
        fitbit['BM'] = fitbit['BM'].fillna(0)
        fitbit['sleep'] = fitbit['sleep'].fillna(method='ffill')

        return fitbit

### Merging EEG and Fitbit dataset

In [None]:
class DataMerger(EEGProcessor, FitbitProcessor):
    """Combine the per-interval EEG features with the Fitbit features.

    Both parent initialisers are invoked explicitly with the shared
    ``time_interval`` so the two datasets use the same time grid.
    """

    def __init__(self, eeg_filepath, fitbit_folderpath, time_interval=60, eeg_remove_time_in_group=45, BM_sum_minutes=180):
        """Initialise both processors.

        Args:
            eeg_filepath: CSV file with the EEG experiment report.
            fitbit_folderpath: Root folder of the Fitbit export.
            time_interval: Grouping interval in seconds for both datasets.
            eeg_remove_time_in_group: Threshold in seconds forwarded to
                ``EEGProcessor`` as ``remove_time_in_group``.
            BM_sum_minutes: Look-back window forwarded to ``FitbitProcessor``.
        """
        # Initialize by calling parent class constructor
        EEGProcessor.__init__(self, eeg_filepath, time_interval, eeg_remove_time_in_group)
        FitbitProcessor.__init__(self, fitbit_folderpath, time_interval, BM_sum_minutes)

        self.eeg_filepath = eeg_filepath

    # Merge EEG and Fitbit data
    def merge_data(self, first_experiment_id=3):
        """Process every experiment and left-join the Fitbit features onto it.

        Args:
            first_experiment_id: First experiment id that is processed. The
                default of 3 reproduces the previously hard-coded start.
                NOTE(review): presumably the earlier rows are trial runs — confirm.

        Returns:
            The EEG rows with the Fitbit columns joined on the timestamp
            index, or ``None`` when no experiment produced data or the Fitbit
            frame is empty.
        """
        # The report was already loaded by EEGProcessor.__init__, so there is
        # no need to read the CSV a second time just to learn its length.
        experiment_count = len(self.EEG_report)

        # For all experiments in the eeg data csv file
        result_dfs = []
        for exp_id in range(first_experiment_id, experiment_count):
            processed_data = self.process_eeg_data(exp_id)
            if processed_data is not None:
                result_dfs.append(processed_data)

        combined_eeg = None
        if result_dfs:
            combined_eeg = pd.concat(result_dfs)
            combined_eeg.index = pd.to_datetime(combined_eeg.index)

        # processing Fitbit data
        fitbit_data = self.process_fitbit_data()
        fitbit_data.index = pd.to_datetime(fitbit_data.index)

        # merging two dataframes
        if combined_eeg is None or fitbit_data.empty:
            return None
        return combined_eeg.merge(fitbit_data, left_index=True, right_index=True, how='left')

### JM

In [None]:
# JM dataset. Positional arguments: eeg_filepath, fitbit_folderpath.
# time_interval, eeg_remove_time_in_group and BM_sum_minutes are left at the DataMerger defaults.
merger = DataMerger(r"C:\Users\ballj\OneDrive\바탕 화면\EEG_jm.csv",
                    r"C:\Users\ballj\OneDrive\바탕 화면\Fitbit_JM")

final_jm = merger.merge_data()

In [None]:
# Preview the first 50 merged rows.
final_jm.head(50)

### YH

In [None]:
# YH dataset. Positional arguments: eeg_filepath, fitbit_folderpath.
# time_interval, eeg_remove_time_in_group and BM_sum_minutes are left at the DataMerger defaults.
merger = DataMerger(r"C:\Users\ballj\OneDrive\바탕 화면\EEG_yh.csv",
                    r"C:\Users\ballj\OneDrive\바탕 화면\Fitbit_YH")

final_yh = merger.merge_data()

In [None]:
# Preview the last 50 merged rows.
final_yh.tail(50)

### SJ

In [None]:
# SJ dataset. Positional arguments: eeg_filepath, fitbit_folderpath.
# time_interval, eeg_remove_time_in_group and BM_sum_minutes are left at the DataMerger defaults.
merger = DataMerger(r"C:\Users\ballj\OneDrive\바탕 화면\EEG_sj.csv",
                    r"C:\Users\ballj\OneDrive\바탕 화면\Fitbit_SJ")


final_sj = merger.merge_data()

In [None]:
# Display the merged frame.
final_sj

### SA - no CEDA

In [None]:
# SA dataset (no CEDA files in this export). Positional arguments: eeg_filepath, fitbit_folderpath.
# time_interval, eeg_remove_time_in_group and BM_sum_minutes are left at the DataMerger defaults.
merger = DataMerger(r"C:\Users\ballj\OneDrive\바탕 화면\EEG_sa.csv",
                    r"C:\Users\ballj\OneDrive\바탕 화면\Fitbit_SA")


final_sa = merger.merge_data()

In [None]:
# Display the merged frame.
final_sa

### BS

In [None]:
# BS dataset. Positional arguments: eeg_filepath, fitbit_folderpath.
# time_interval, eeg_remove_time_in_group and BM_sum_minutes are left at the DataMerger defaults.
merger = DataMerger(r"C:\Users\ballj\OneDrive\바탕 화면\EEG_bs.csv",
                    r"C:\Users\ballj\OneDrive\바탕 화면\Fitbit_BS")


final_bs = merger.merge_data()

In [None]:
# Display the merged frame.
final_bs

### MJ - no CEDA

In [None]:
# MJ dataset (no CEDA files in this export). Positional arguments: eeg_filepath, fitbit_folderpath.
# time_interval, eeg_remove_time_in_group and BM_sum_minutes are left at the DataMerger defaults.
merger = DataMerger(r"C:\Users\ballj\OneDrive\바탕 화면\EEG_mj.csv",
                    r"C:\Users\ballj\OneDrive\바탕 화면\Fitbit_MJ")


final_mj = merger.merge_data()

In [None]:
# Display the merged frame.
final_mj

### Concat EEG

In [None]:
# Stack the six merged frames and order the rows by their timestamp index.
dataset = pd.concat([final_jm, final_sj, final_bs, final_yh, final_mj, final_sa])
dataset = dataset.sort_index()
dataset

In [None]:
# Write the combined dataset to disk.
dataset.to_csv(r'C:\Users\ballj\OneDrive\바탕 화면\1.1m_grouped_mean_not_removing_error_value_in_original_data_no_bm_process.csv')

# 2. Augmentation (10sec group)

In [6]:
import pandas as pd
import numpy as np
from datetime import datetime, timedelta

class EEGProcessor:
     
    # time_interval : Unified seconds with Fitbit data (ex. 10secs) 
    # remove_time_in_group : the criteria of processing error values in each time_interval group (ex. 7secs)
    
    def __init__(self, file_path, time_interval, remove_time_in_group):
        self.time_interval = time_interval
        self.remove_time_in_group = remove_time_in_group
        self.time_interval_str = f'{time_interval}S'
        self.EEG_report = pd.read_csv(file_path)

    # List to dataframe (ex. brain waves)
    def parse_raw_data(self, dataframe, col_name):
        col_str = dataframe.iloc[0][col_name]
        col_str = col_str.strip('[]')
        col_list = [float(val) for val in col_str.split(',')]  # cause list is divided by comma
        col_data = pd.DataFrame({col_name: col_list})
        return col_data

    # Experiment time calculating function
    def time_difference(self, dataframe, start_time_col, finish_time_col):
        start_time = datetime.strptime(dataframe.iloc[0][start_time_col], '%Y-%m-%d %H:%M:%S')
        finish_time = datetime.strptime(dataframe.iloc[0][finish_time_col], '%Y-%m-%d %0H:%M:%S')

        # time difference between two datatime objects
        time_difference = (finish_time - start_time).total_seconds()
        return time_difference
    
    # Comparing the experimental initial recognition error period and delete the part to be deleted
    def count_initial_same_values(self, series):
        initial_value = series.iloc[0]
        count = 0
        for value in series:
            if value == initial_value:
                count += 1
            else:
                break
        return count
    
    # Processing of values that are not exactly divided into front and back
    def process_start_time_trash_sec(self, start_time):
        # Plus 1 min and delete second in input time
        rounded_time = start_time + timedelta(minutes=1) - timedelta(seconds=start_time.second)
        time_difference = (rounded_time - start_time).total_seconds()

        remainder = time_difference % float(self.time_interval)

        # use the seconds over remove_time_in_group seconds
        if self.remove_time_in_group <= remainder:
            return False
        
        else:
            return remainder
        
    # Processing of values that are not exactly divided into front and back
    def process_finish_time_trash_sec(self, finish_time):
        # delete second in input time
        rounded_time = finish_time - timedelta(seconds=finish_time.second)
        time_difference = (finish_time - rounded_time).total_seconds()

        remainder = time_difference % float(self.time_interval)

        if self.remove_time_in_group <= remainder:
            return False
        
        else:
            return remainder

    # Rounding time to nearest time which can divided by time interval
    def nearest_time_rounding(self, time):
        seconds = time.second
        # For example, find nearest value in 0, 10, 20, 30, 40, 50sec
        time_points = [time_point for time_point in range(0,60, self.time_interval)]
        nearest = min(time_points, key=lambda time_point: abs(time_point - seconds))
        
        if nearest == time_points[-1] and seconds >= (time_points[-1] + self.remove_time_in_group):
            rounded_time = time.replace(second=0, microsecond=0) + timedelta(minutes=1)
        else:
            rounded_time = time.replace(second=nearest, microsecond=0)

        return rounded_time

    # Make the same end time
    def align_end_time(self, dataframe_1, dataframe_2):
        if dataframe_1.index[-1] > dataframe_2.index[-1]:
            dataframe_1 = dataframe_1[dataframe_1.index <= dataframe_2.index[-1]]

        elif dataframe_1.index[-1] < dataframe_2.index[-1]:
            dataframe_2 = dataframe_2[dataframe_2.index <= dataframe_1.index[-1]]

        else: 
            pass # if two dataframe's endtime is same

        return dataframe_1, dataframe_2

    # Adjust start time and end time processing
    # start time processing -> process_type : 0 , finish time processing -> process_type : -1
    # start time processing -> process_start_time_trash_sec func , finish time processing -> process_finish_time_trash_sec func    
    def adjust_time_index(self, process_type, dataframe, func):
        """Align the first or last timestamp of ``dataframe`` with the grouping grid.

        Args:
            process_type: Position of the index entry to adjust; callers in this
                class pass 0 for the start and -1 for the finish.
            dataframe: Frame whose index entries support ``+``/``-`` with
                ``timedelta``.
            func: Callable applied to ``dataframe.index[process_type]``. Its
                result is either equal to ``False`` or a number of seconds to
                cut away.

        Returns:
            The adjusted frame. In the ``remainder == False`` branch the index
            of the passed-in frame is reassigned in place and that same object
            is returned; otherwise a filtered selection is returned.
        """
        remainder = func(dataframe.index[process_type])
        
        # the last data only shows one original data, so processing this problem
        one_sec = timedelta(seconds=1)

        # NOTE(review): `== False` is also true for a remainder of 0 / 0.0, so a
        # timestamp that is already aligned takes this branch too - confirm that
        # this is intended.
        if remainder == False:
            # change time to nearest (Start time processing)
            if process_type == 0 :
                time = self.nearest_time_rounding(dataframe.index[process_type])
                # Only the single entry at `process_type` is relabelled; all other
                # index entries are kept as they are.
                new_index = dataframe.index.tolist()
                new_index[process_type] = time
                dataframe.index = new_index
            
            # change time to nearest (Finish time processing)
            else:
                # One second before the rounded time, so the last label falls on
                # the final second of a window rather than on the next window.
                time = self.nearest_time_rounding(dataframe.index[process_type]) - one_sec
                new_index = dataframe.index.tolist()
                new_index[process_type] = time
                dataframe.index = new_index
        
        # if remainder is under remove_time_in_group, just remove
        else:
            cutting_time = timedelta(seconds=remainder)
            # Start time processing
            if process_type == 0:
                # Drop the rows that lie in the first `remainder` seconds.
                dataframe = dataframe[dataframe.index >= dataframe.index[process_type] + cutting_time]
            
            # Finish time processing
            # make the seconds like 9, 19, 29...
            else:
                dataframe = dataframe[dataframe.index <= dataframe.index[-1] - cutting_time - one_sec]

        return dataframe
    
    # Removing error values in group (brain waves and attention score)
    def check_invalid_values(self, group):
        # find error data length in brain wave
        alpha_invalid_series = group['α_wave_raw_data'].diff().eq(0)
        alpha_invalid_timestamps = group.index[alpha_invalid_series].tolist()

        # find error data length in attention_raw_data
        attention_invalid_series = group['attention_raw_data'] == 0
        attention_invalid_timestamps = group.index[attention_invalid_series].tolist()

        # check whether the length of error data is over remove_time_in_group second
        def has_long_invalid_duration(invalid_timestamps):
            if not invalid_timestamps:
                return False
            for invalid_time in range(1, len(invalid_timestamps)):
                if (invalid_timestamps[invalid_time] - invalid_timestamps[invalid_time-1]).seconds > self.remove_time_in_group:
                    return True
            return False

        alpha_invalid = has_long_invalid_duration(alpha_invalid_timestamps)
        attention_invalid = has_long_invalid_duration(attention_invalid_timestamps)

        if alpha_invalid or attention_invalid:
            return group.mean()
#             return pd.Series([np.nan] * group.shape[1], index=group.columns)

        else:
            return group.mean()
            # calculate average except error value
#             valid_conditions = (
#                 (group['α_wave_raw_data'].diff() != 0) & 
#                 (group['β_wave_raw_data'].diff() != 0) & 
#                 (group['θ_wave_raw_data'].diff() != 0) & 
#                 (group['δ_wave_raw_data'].diff() != 0) & 
#                 (group['γ_wave_raw_data'].diff() != 0) & 
#                 (group['attention_raw_data'] != 0)
#             )
#             return group[valid_conditions].mean()

    # Removing error values in group (hr)
    def check_invalid_values_other(self, group):
        # find error data length in hr
        hr_invalid_series = group['hr_raw_data'] == 0
        hr_invalid_timestamps = group.index[hr_invalid_series].tolist()

        # check whether the length of error data is over remove_time_in_group second
        def has_long_invalid_duration(invalid_timestamps):
            if not invalid_timestamps:
                return False
            for invalid_time in range(1, len(invalid_timestamps)):
                if (invalid_timestamps[invalid_time] - invalid_timestamps[invalid_time-1]).seconds > self.remove_time_in_group:
                    return True
            return False

        hr_invalid = has_long_invalid_duration(hr_invalid_timestamps)

        if hr_invalid:
            return group.mean()
#             return pd.Series([np.nan] * group.shape[1], index=group.columns)

        else:
            # calculate average except error value
#             group = group[(group['hr_raw_data'] != 0)]
            return group.mean()
    
    # Process EEG data
    def process_eeg_data(self, experiment_id):
        if experiment_id not in self.EEG_report.index:
            return None

        # all experiments in one df
        EEG_report_sample = self.EEG_report.loc[[experiment_id],:]

        # one dataframe for one column
        cols = ['α_wave_raw_data', 'β_wave_raw_data', 'θ_wave_raw_data', 'δ_wave_raw_data', 'γ_wave_raw_data', 'attention_raw_data', 'hrv_raw_data', 'hr_raw_data', 'coherence_flag_raw_data']
        parsed_dfs = [self.parse_raw_data(EEG_report_sample, col) for col in cols]

        # calculate two interval second because there's two type of time interval in EEG data
        interval_sec = self.time_difference(EEG_report_sample, 'meditation_start_time', 'meditation_finish_time') / len(parsed_dfs[0])
        interval_sec_other = self.time_difference(EEG_report_sample, 'meditation_start_time', 'meditation_finish_time') / len(parsed_dfs[6])

        # make two merged dataframe
        merged_df = parsed_dfs[0].join(parsed_dfs[1:6])
        merged_df_other = parsed_dfs[6].join(parsed_dfs[7:])

        # experiment start time
        start_time = datetime.strptime(EEG_report_sample.iloc[0]['meditation_start_time'], '%Y-%m-%d %H:%M:%S')

        # change index to time index based on interval second
        interval_sec, interval_sec_other = timedelta(seconds=round(interval_sec,2)), timedelta(seconds=round(interval_sec_other,2))
        merged_df['time'] = [start_time + time * interval_sec for time in range(len(merged_df))]
        merged_df_other['time'] = [start_time + time * interval_sec_other for time in range(len(merged_df_other))]
        merged_df, merged_df_other = merged_df.set_index('time'), merged_df_other.set_index('time')
        
        # comparing the inital experiment error time
#         counts = [self.count_initial_same_values(merged_df[col]) for col in cols[:6]] + [self.count_initial_same_values(merged_df_other['hr_raw_data'])]
#         initial_error_times = [counts[error] * interval_sec.total_seconds() if error != 6 else counts[error] * interval_sec_other.total_seconds() for error in range(7)]
#         initial_error_time = timedelta(seconds=max(initial_error_times))

        # dataset start time
#         real_start_time = start_time + initial_error_time
        real_start_time = start_time
        merged_df, merged_df_other = merged_df[merged_df.index > real_start_time], merged_df_other[merged_df_other.index > real_start_time]
        merged_df.index, merged_df_other.index = merged_df.index.round('S'), merged_df_other.index.round('S')

        # make the experiment end time same
        merged_df, merged_df_other = self.align_end_time(merged_df, merged_df_other)

        # start time process -> i : 0 , finish time process -> i : -1
        # start time process -> process_start_time_trash_sec func , finish time process -> process_finish_time_trash_sec func
        merged_df = self.adjust_time_index(0, merged_df, self.process_start_time_trash_sec)
        merged_df_other = self.adjust_time_index(0, merged_df_other, self.process_start_time_trash_sec)
        merged_df = self.adjust_time_index(-1, merged_df, self.process_finish_time_trash_sec)
        merged_df_other = self.adjust_time_index(-1, merged_df_other, self.process_finish_time_trash_sec)

        # grouping
        grouped = merged_df.groupby(merged_df.index.floor(self.time_interval_str))
        grouped_other = merged_df_other.groupby(merged_df_other.index.floor(self.time_interval_str))

        result = grouped.apply(self.check_invalid_values)
        result_other = grouped_other.apply(self.check_invalid_values_other)

        # final EEG dataset including β/θ SP ratio
        EEG_data_per_time_interval = result.merge(result_other, left_index=True, right_index=True)
        EEG_data_per_time_interval['β/θ SP'] = EEG_data_per_time_interval['β_wave_raw_data'] / EEG_data_per_time_interval['θ_wave_raw_data']
        
        EEG_data_per_time_interval = EEG_data_per_time_interval.rename(columns={
            'α_wave_raw_data':'alpha_wave',
            'β_wave_raw_data':'beta_wave',
            'θ_wave_raw_data':'theta_wave',
            'δ_wave_raw_data':'delta_wave',
            'γ_wave_raw_data':'gamma_wave',
            'attention_raw_data' : 'attention',
            'hrv_raw_data' : 'hrv',
            'hr_raw_data' : 'hr',
            'coherence_flag_raw_data' : 'coherence',
            'β/θ SP' : 'SP ratio'
        })

        return EEG_data_per_time_interval

In [7]:
import datetime as dt
import matplotlib.pyplot as plt
import glob

class FitbitProcessor:
    '''
    time_interval : Unified seconds with EEG dataset (ex.10secs) 
    BM_sum_minutes
    Body Movement feature was meaningless because the experiment was conducted usually while sitting on the chair.
    So, created a new body movement feature as accumulated Body Movement value from previous time.
    And, the previous time is the BM_sum_minutes variable. (ex. 3 hours)
    '''
    def __init__(self, folder_path, time_interval, BM_sum_minutes):
        self.folder_path = folder_path
        self.BM_sum_minutes = BM_sum_minutes
        self.BM_sum_minutes_str = f'{BM_sum_minutes}T'
        self.time_interval = time_interval
        self.time_interval_str = f'{time_interval}S'
        folder_patterns = [
            "Active Zone Minutes (AZM)/*",
            "Sleep Score/*",
            "Stress Journal/CEDA*",
            "Temperature/Wrist Temperature - *"
        ]
        self.things_path = [glob.glob(f"{folder_path}/{pattern}") for pattern in folder_patterns]
        self.things_path = [item for sublist in self.things_path for item in sublist]
        self.wt_count = len(glob.glob(f"{folder_path}/Temperature/Wrist Temperature - *"))
        self.azm_count = len(glob.glob(f"{folder_path}/Active Zone Minutes (AZM)/*"))
        self.sleep_count = len(glob.glob(f"{folder_path}/Sleep Score/*"))
        self.eda_count = len(glob.glob(f"{folder_path}/Stress Journal/CEDA*"))
        # original fitbit dataset's time interval is 1 min
        self.original_interval = 60
        self.num_timestamps = self.original_interval // self.time_interval
        self.half_point = self.num_timestamps // 2
    
    def read_filtered_csv(self, path, columns):
        name = pd.read_csv(path)
        name = name[columns]
        name[columns[0]] = pd.to_datetime(name[columns[0]])
        return name
    
    def round_seconds(self, obj):
        if obj.second % self.time_interval == 0:
            return obj
        else:
            return obj - timedelta(seconds=obj.second % self.time_interval)
    
    def round_zero(self, datetime_obj):
        datetime_obj = datetime_obj.replace(second=0)
        return datetime_obj
    
    # Process whole fitbit data
    def process_fitbit_data(self):
        AZM_col = ['date_time', 'total_minutes']
        sleep_col = ['timestamp', 'deep_sleep_in_minutes']
        stress_col = ['timestamp', 'eda_level_real']
        temp_col = ['recorded_time', 'temperature']
        
        # merge all features
        things_col = [AZM_col] * self.azm_count + [sleep_col] * self.sleep_count + [stress_col] * self.eda_count + [temp_col] * self.wt_count
        things = [self.read_filtered_csv(path, col) for path, col in zip(self.things_path, things_col)]

        # if there's no wrist temperature
        if self.wt_count == 0:
            # if there's no eda data
            # there was no eda in two subjects' fitbit data
            if self.eda_count == 0:
                azm = self.process_azm(things[:self.azm_count])
                sleep = self.process_sleep(things[self.azm_count:self.azm_count+self.sleep_count])
                Min_Time, Max_Time = self.find_time_bounds([azm, sleep])
                
                # make final dataframe
                df = self.create_final_df([azm, sleep], Min_Time, Max_Time)
                df['timestamp'] = pd.to_datetime(df['timestamp'])
                df = df.set_index('timestamp')
                df = df.assign(temperature=np.nan)
                df = df.assign(eda=np.nan)
                
                return df
            
            else:
                azm = self.process_azm(things[:self.azm_count])
                sleep = self.process_sleep(things[self.azm_count:self.azm_count+self.sleep_count])
                eda = self.process_eda(things[self.azm_count+self.sleep_count:self.azm_count+self.sleep_count+self.eda_count])
                Min_Time, Max_Time = self.find_time_bounds([azm, sleep, eda])

                df = self.create_final_df([azm, sleep, eda], Min_Time, Max_Time)
                df['timestamp'] = pd.to_datetime(df['timestamp'])
                df = df.set_index('timestamp')
                df = df.assign(temperature=np.nan)
                return df
            
        # if there's no Active Zone Minutes data
        elif self.azm_count == 0:
            sleep = self.process_sleep(things[self.azm_count])
            eda = self.process_eda(things[self.azm_count+self.sleep_count:self.azm_count+self.sleep_count+self.eda_count])
            temp = self.process_temperature(things[self.azm_count+self.sleep_count+self.eda_count:])
            Min_Time, Max_Time = self.find_time_bounds([sleep, eda, temp])

            df = self.create_final_df([sleep, eda, temp], Min_Time, Max_Time)
            df['timestamp'] = pd.to_datetime(df['timestamp'])
            df = df.set_index('timestamp')
            df = df.assign(BM=np.nan)

            return df
        
        # if there's no sleep data
        elif self.sleep_count == 0:
            azm = self.process_azm(things[:self.azm_count])
            eda = self.process_eda(things[self.azm_count+self.sleep_count:self.azm_count+self.sleep_count+self.eda_count])
            temp = self.process_temperature(things[self.azm_count+self.sleep_count+self.eda_count:])
            Min_Time, Max_Time = self.find_time_bounds([azm, eda, temp])

            df = self.create_final_df([azm, eda, temp], Min_Time, Max_Time)
            df['timestamp'] = pd.to_datetime(df['timestamp'])
            df = df.set_index('timestamp')
            df = df.assign(sleep=np.nan)

            return df            
        
        # if there's no eda data
        elif self.eda_count == 0:
            azm = self.process_azm(things[:self.azm_count])
            sleep = self.process_sleep(things[self.azm_count:self.azm_count+self.sleep_count])
            temp = self.process_temperature(things[self.azm_count+self.sleep_count:])
            Min_Time, Max_Time = self.find_time_bounds([azm, sleep, temp])

            df = self.create_final_df([azm, sleep, temp], Min_Time, Max_Time)
            df['timestamp'] = pd.to_datetime(df['timestamp'])
            df = df.set_index('timestamp')
            df = df.assign(eda=np.nan)

            return df
            
        # if there's no error in data file
        else:
            azm = self.process_azm(things[:self.azm_count])
            sleep = self.process_sleep(things[self.azm_count:self.azm_count+self.sleep_count])
            eda = self.process_eda(things[self.azm_count+self.sleep_count:self.azm_count+self.sleep_count+self.eda_count])
            temp = self.process_temperature(things[self.azm_count+self.sleep_count+self.eda_count:])
            Min_Time, Max_Time = self.find_time_bounds([azm, sleep, eda, temp])

            df = self.create_final_df([azm, sleep, eda, temp], Min_Time, Max_Time)
            df['timestamp'] = pd.to_datetime(df['timestamp'])
            df = df.set_index('timestamp')

            return df
    
    # Process Active Zone Minutes data
    def process_azm(self, azm):

        azm = pd.concat(azm, axis=0)
        azm = azm.rename(columns={'date_time':'timestamp', 'total_minutes':'BM'})
        time_list = []
        body_movement = []

        for time in azm['timestamp']:
            start_timestamp = time - timedelta(seconds=(self.half_point * self.time_interval))

            for number in range(self.num_timestamps):
                new_timestamp = start_timestamp + timedelta(seconds = (number*self.time_interval))
                time_list.append(new_timestamp)

        for bm in azm['BM']:
            for _ in range(self.num_timestamps):
                body_movement.append(bm)

        azm_list = {'timestamp': time_list, 'BM': body_movement}
        azm_final = pd.DataFrame(azm_list)

#         azm_final['timestamp'] = pd.to_datetime(azm_final['timestamp'])
#         azm_final.set_index('timestamp', inplace=True)
#         azm_final = azm_final.resample(self.time_interval_str).asfreq().fillna(0)
#         azm_final['new_BM'] = azm_final['BM'].rolling(self.BM_sum_minutes_str, closed='right').sum()
#         azm_final = azm_final.drop(['BM'], axis=1)
#         azm_final = azm_final.rename(columns={'new_BM':'BM'})
#         azm_final = azm_final.astype({'BM':'int'})
#         azm_final.reset_index(inplace=True)
        
        return azm_final
    
    # Process sleep data (Deep sleep in minutes)
    def process_sleep(self, sleep):
        sleep = pd.concat(sleep, axis=0)
        sleep['timestamp'] = [
                self.round_zero(datetime.strptime(str(time)[0:19], '%Y-%m-%d %H:%M:%S')) for time in sleep['timestamp']
            ]
            
        sleep = sleep.rename(columns={'deep_sleep_in_minutes':'sleep'})
        return sleep
    
    # Process eda data
    def process_eda(self, eda):
        eda = pd.concat(eda, axis=0)
        eda['timestamp'] = [
            self.round_seconds(
                datetime.strptime(str(time)[0:19], '%Y-%m-%d %H:%M:%S') - dt.timedelta(hours=4)
            ) for time in eda['timestamp']]

        # EDA Bilinear Interpolation
        time_list = []
        eda_list = []

        for time in range(1, len(eda['timestamp']) - 1):
            start_timestamp = eda.iloc[time,0] - timedelta(seconds = self.half_point * self.time_interval)

            for num in range(self.num_timestamps):
                new_timestamp = start_timestamp + timedelta(seconds = (num * self.time_interval))
                time_list.append(new_timestamp)

                if num < self.half_point:
                    weight = (self.half_point - num) / self.num_timestamps
                    value = eda.iloc[time, 1] - ((eda.iloc[time, 1] - eda.iloc[time - 1, 1]) * weight)

                elif num == self.half_point:
                    value = eda.iloc[time,1]

                else:
                    weight = (num - self.half_point) / self.num_timestamps
                    value = eda.iloc[time, 1] + ((eda.iloc[time + 1, 1] - eda.iloc[time, 1]) * weight)

                eda_list.append(round(value, 2))

        eda_list = {'timestamp': time_list, 'eda': eda_list}
        eda_final = pd.DataFrame(eda_list)

        return eda_final
    
    # Process temperature data
    def process_temperature(self, temperature):
        #Temperature
        temp = pd.concat(temperature, axis=0)
        temp = temp.rename(columns={'recorded_time':'timestamp'})

        # Temperature Bilinear Interpolation
        time_list = []
        temp_list = []

        for time in range(1, len(temp['timestamp']) - 1):
            if self.half_point % 2 != 0:
                start_timestamp = temp.iloc[time,0] - timedelta(seconds = self.half_point * self.time_interval)
                for num in range(self.num_timestamps):
                    new_timestamp = start_timestamp + timedelta(seconds = (num * self.time_interval))
                    time_list.append(new_timestamp)

                    if num < self.half_point:
                        weight = (self.half_point - num) / self.num_timestamps 
                        value = temp.iloc[time, 1] - ((temp.iloc[time, 1] - temp.iloc[time - 1, 1]) * weight)

                    elif num == self.half_point:
                        value = temp.iloc[time,1]

                    else:
                        weight = (num - self.half_point) / self.num_timestamps
                        value = temp.iloc[time, 1] + ((temp.iloc[time + 1, 1] - temp.iloc[time, 1]) * weight)

                    temp_list.append(round(value, 6))

        temp_list = {'timestamp': time_list, 'temperature': temp_list}
        temp_final = pd.DataFrame(temp_list)

        return temp_final
    
    # find minimum and maximum time of whole feature
    def find_time_bounds(self, dataframes):
        min_times = []
        max_times = []
        
        for df in dataframes:
            if not df.empty:
                min_times.append(df['timestamp'].min())
                max_times.append(df['timestamp'].max())
                
        if not min_times or not max_times:
            Min_Time = pd.Timestamp.now(tz='UTC')
            Max_Time = pd.Timestamp.now(tz='UTC')
        else:
            Min_Time = min(min_times)
            Max_Time = max(max_times)

        return Min_Time, Max_Time

    # create dataframe from Min_time to Max_time
    def create_final_df(self, datasets, Min_Time, Max_Time):
        fitbit = pd.date_range(start=Min_Time, end=Max_Time, freq=self.time_interval_str, name='timestamp')
        fitbit = pd.DataFrame(fitbit)

        for dataset in datasets:
            fitbit = pd.merge(fitbit, dataset, how='outer', on='timestamp')
            
        fitbit['BM'] = fitbit['BM'].fillna(0)
        fitbit['sleep'] = fitbit['sleep'].fillna(method='ffill')

        return fitbit

In [14]:
class DataMerger(EEGProcessor, FitbitProcessor):
    """Combine one subject's EEG experiments with that subject's Fitbit data."""

    def __init__(self, eeg_filepath, fitbit_folderpath, time_interval=10, eeg_remove_time_in_group=7, BM_sum_minutes=180):
        """Initialise both parent processors with a shared ``time_interval``.

        Args:
            eeg_filepath: Path of the EEG report CSV.
            fitbit_folderpath: Root folder of the subject's Fitbit export.
            time_interval: Grouping window in seconds used by both processors.
            eeg_remove_time_in_group: Threshold in seconds passed to
                ``EEGProcessor`` as ``remove_time_in_group``.
            BM_sum_minutes: Look-back length in minutes passed to
                ``FitbitProcessor``.
        """
        # Initialize by calling parent class constructor
        EEGProcessor.__init__(self, eeg_filepath, time_interval, eeg_remove_time_in_group)
        FitbitProcessor.__init__(self, fitbit_folderpath, time_interval, BM_sum_minutes)

        self.eeg_filepath = eeg_filepath

    # Merge EEG and Fitbit data
    def merge_data(self, first_experiment_id=3):
        """Process every experiment and left-join the Fitbit features onto it.

        Args:
            first_experiment_id: First row label of the EEG report to process;
                rows before it are skipped. Defaults to 3.

        Returns:
            The EEG frame with the Fitbit columns joined on the timestamp index,
            or ``None`` when no experiment produced data or the Fitbit frame is
            empty.
        """
        # The report was already loaded by EEGProcessor.__init__, so its length
        # is taken from there instead of reading the CSV a second time.
        result_dfs = []

        # For all experiments in the eeg data csv file
        for exp_id in range(first_experiment_id, len(self.EEG_report)):
            processed_data = self.process_eeg_data(exp_id)
            if processed_data is not None:
                result_dfs.append(processed_data)

        # Explicit None marks "no EEG data"; no need to probe locals().
        combined_eeg = None
        if result_dfs:
            combined_eeg = pd.concat(result_dfs)
            combined_eeg.index = pd.to_datetime(combined_eeg.index)

        # processing Fitbit data
        fitbit_data = self.process_fitbit_data()
        fitbit_data.index = pd.to_datetime(fitbit_data.index)

        # merging two dataframes
        if combined_eeg is None or fitbit_data.empty:
            return None

        # Left join: every EEG window is kept, Fitbit values are attached where
        # the timestamps match.
        return combined_eeg.merge(fitbit_data, left_index=True, right_index=True, how='left')

In [15]:
# Build the merged EEG + Fitbit frame for each of the six subjects. Every call
# passes only (eeg_filepath, fitbit_folderpath), so time_interval,
# eeg_remove_time_in_group and BM_sum_minutes keep DataMerger's defaults.
# `merger` is rebound for each subject; only the `final_*` frames are kept.

# Subject JM
merger = DataMerger(r"C:\Users\ballj\OneDrive\바탕 화면\EEG_jm.csv",
                    r"C:\Users\ballj\OneDrive\바탕 화면\Fitbit_JM")

final_jm = merger.merge_data()

# Subject YH
merger = DataMerger(r"C:\Users\ballj\OneDrive\바탕 화면\EEG_yh.csv",
                    r"C:\Users\ballj\OneDrive\바탕 화면\Fitbit_YH")

final_yh = merger.merge_data()

# Subject SJ
merger = DataMerger(r"C:\Users\ballj\OneDrive\바탕 화면\EEG_sj.csv",
                    r"C:\Users\ballj\OneDrive\바탕 화면\Fitbit_SJ")


final_sj = merger.merge_data()

# Subject SA
merger = DataMerger(r"C:\Users\ballj\OneDrive\바탕 화면\EEG_sa.csv",
                    r"C:\Users\ballj\OneDrive\바탕 화면\Fitbit_SA")


final_sa = merger.merge_data()

# Subject BS
merger = DataMerger(r"C:\Users\ballj\OneDrive\바탕 화면\EEG_bs.csv",
                    r"C:\Users\ballj\OneDrive\바탕 화면\Fitbit_BS")


final_bs = merger.merge_data()

# Subject MJ
merger = DataMerger(r"C:\Users\ballj\OneDrive\바탕 화면\EEG_mj.csv",
                    r"C:\Users\ballj\OneDrive\바탕 화면\Fitbit_MJ")


final_mj = merger.merge_data()

In [16]:
# Stack the six per-subject frames, order the rows by timestamp, and drop every
# row that has a NaN in any column (this includes rows from subjects whose
# missing Fitbit sources were filled with NaN columns).
dataset = pd.concat([final_jm, final_sj, final_bs, final_yh, final_mj, final_sa])
dataset = dataset.sort_index()
dataset = dataset.dropna()
dataset

Unnamed: 0,alpha_wave,beta_wave,theta_wave,delta_wave,gamma_wave,attention,hrv,hr,coherence,SP ratio,BM,sleep,eda,temperature
2023-10-16 10:39:20,98.366769,101.628072,96.312422,91.826384,94.410009,36.937500,0.000000,25.357143,0.0,1.055192,78.0,95.0,5.14,-1.821261
2023-10-16 10:39:40,93.265881,99.750884,92.076881,85.074334,91.760712,64.531250,0.000000,84.310345,0.0,1.083343,78.0,95.0,5.09,-1.771261
2023-10-16 10:40:00,92.458164,99.227658,90.810442,83.178958,91.300818,72.636364,0.000000,83.750000,0.0,1.092690,78.0,95.0,5.03,-1.721261
2023-10-16 10:40:20,93.191097,98.719353,92.557253,85.024866,90.182428,65.500000,0.000000,76.172414,0.0,1.066576,78.0,95.0,4.89,-1.711261
2023-10-16 10:40:40,92.830909,96.476331,91.692616,83.868441,87.118172,51.156250,32.892857,78.750000,0.0,1.052171,78.0,95.0,4.74,-1.701261
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2023-12-04 13:57:20,96.462818,101.984994,96.968858,89.793724,92.941361,81.242424,37.896552,82.413793,0.0,1.051729,9.0,25.0,6.91,-2.583520
2023-12-04 13:57:40,94.894634,100.428706,95.300203,87.970316,91.439456,75.062500,30.500000,77.600000,0.0,1.053814,9.0,25.0,6.85,-2.563520
2023-12-04 13:58:00,94.042113,98.627097,94.294506,87.400819,90.092603,67.531250,26.833333,77.966667,0.0,1.045947,9.0,25.0,6.78,-2.543520
2023-12-04 13:58:20,94.600222,100.409416,95.208922,88.280319,91.589297,79.218750,23.966667,78.466667,0.0,1.054622,9.0,25.0,6.77,-2.520187


In [None]:
# Write the combined frame (index included) to a CSV on the local desktop.
dataset.to_csv(r'C:\Users\ballj\OneDrive\바탕 화면\2.csv')

# 3. Error value preprocessing (10sec group)

In [None]:
import pandas as pd
import numpy as np
from datetime import datetime, timedelta

class EEGProcessor:
    """Turn raw EEG session reports into per-interval feature averages.

    Every row of the report CSV is one experiment. The ``*_raw_data`` columns
    hold a whole session as a bracketed, comma-separated string, and
    ``meditation_start_time`` / ``meditation_finish_time`` bound the session.

    time_interval : Unified seconds with Fitbit data (ex. 10secs)
    remove_time_in_group : the criteria of processing error values in each
        time_interval group (ex. 7secs)
    """

    # Layout of the start/finish timestamps stored in the report.
    _TIME_FORMAT = '%Y-%m-%d %H:%M:%S'
    # Brain-wave channels; a value identical to the previous sample is
    # treated as an error reading by check_invalid_values.
    _BRAIN_WAVE_COLUMNS = [
        'α_wave_raw_data',
        'β_wave_raw_data',
        'θ_wave_raw_data',
        'δ_wave_raw_data',
        'γ_wave_raw_data',
    ]
    # Columns that share the first sampling grid (brain waves + attention).
    _WAVE_COLUMNS = _BRAIN_WAVE_COLUMNS + ['attention_raw_data']
    # Columns that share the second sampling grid.
    _OTHER_COLUMNS = ['hrv_raw_data', 'hr_raw_data', 'coherence_flag_raw_data']

    def __init__(self, file_path, time_interval, remove_time_in_group):
        self.time_interval = time_interval
        self.remove_time_in_group = remove_time_in_group
        # Lowercase 's' is the seconds alias; the uppercase spelling is
        # deprecated in recent pandas releases.
        self.time_interval_str = f'{time_interval}s'
        self.EEG_report = pd.read_csv(file_path)

    # List to dataframe (ex. brain waves)
    def parse_raw_data(self, dataframe, col_name):
        """Parse the bracketed list string in the first row of ``col_name``.

        Returns a one-column DataFrame (named ``col_name``) of floats.
        """
        col_str = dataframe.iloc[0][col_name].strip('[]')
        col_list = [float(val) for val in col_str.split(',')]  # list is comma separated
        return pd.DataFrame({col_name: col_list})

    # Experiment time calculating function
    def time_difference(self, dataframe, start_time_col, finish_time_col):
        """Return the seconds between the two timestamp columns of the first row."""
        first_row = dataframe.iloc[0]
        start_time = datetime.strptime(first_row[start_time_col], self._TIME_FORMAT)
        # The finish time used to be parsed with '%0H', which is not a
        # documented directive and is rejected by strptime on many interpreters.
        finish_time = datetime.strptime(first_row[finish_time_col], self._TIME_FORMAT)
        return (finish_time - start_time).total_seconds()

    # Comparing the experimental initial recognition error period and delete the part to be deleted
    def count_initial_same_values(self, series):
        """Count how many leading samples equal the very first sample.

        An empty series has no leading run, so 0 is returned.
        """
        if len(series) == 0:
            return 0

        initial_value = series.iloc[0]
        count = 0
        for value in series:
            if value != initial_value:
                break
            count += 1
        return count

    def _usable_remainder(self, time_difference):
        """Reduce a second count modulo ``time_interval``.

        Returns ``False`` when the leftover is at least
        ``remove_time_in_group`` seconds (enough data to keep), otherwise the
        leftover itself (seconds to cut away).
        """
        remainder = time_difference % float(self.time_interval)
        if self.remove_time_in_group <= remainder:
            return False
        return remainder

    # Processing of values that are not exactly divided into front and back
    def process_start_time_trash_sec(self, start_time):
        """Classify the partial group at the start of a recording.

        Uses the seconds from ``start_time`` up to the next full minute.
        Returns ``False`` (keep) or the number of seconds to drop.
        """
        # Plus 1 min and delete second in input time
        rounded_time = start_time + timedelta(minutes=1) - timedelta(seconds=start_time.second)
        return self._usable_remainder((rounded_time - start_time).total_seconds())

    # Processing of values that are not exactly divided into front and back
    def process_finish_time_trash_sec(self, finish_time):
        """Classify the partial group at the end of a recording.

        Uses the seconds elapsed since the last full minute.
        Returns ``False`` (keep) or the number of seconds to drop.
        """
        # delete second in input time
        rounded_time = finish_time - timedelta(seconds=finish_time.second)
        return self._usable_remainder((finish_time - rounded_time).total_seconds())

    # Rounding time to nearest time which can divided by time interval
    def nearest_time_rounding(self, time):
        """Snap ``time`` to the nearest multiple of ``time_interval`` seconds."""
        seconds = time.second
        # For example, find nearest value in 0, 10, 20, 30, 40, 50sec
        time_points = list(range(0, 60, self.time_interval))
        nearest = min(time_points, key=lambda time_point: abs(time_point - seconds))
        last_point = time_points[-1]

        # Late enough past the last grid point of the minute: roll over to
        # the start of the next minute instead of snapping backwards.
        if nearest == last_point and seconds >= last_point + self.remove_time_in_group:
            return time.replace(second=0, microsecond=0) + timedelta(minutes=1)
        return time.replace(second=nearest, microsecond=0)

    # Make the same end time
    def align_end_time(self, dataframe_1, dataframe_2):
        """Trim whichever frame ends later so both end no later than the earlier end."""
        end_1, end_2 = dataframe_1.index[-1], dataframe_2.index[-1]
        if end_1 > end_2:
            dataframe_1 = dataframe_1[dataframe_1.index <= end_2]
        elif end_1 < end_2:
            dataframe_2 = dataframe_2[dataframe_2.index <= end_1]
        return dataframe_1, dataframe_2

    # Adjust start time and end time processing
    # start time processing -> process_type : 0 , finish time processing -> process_type : -1
    # start time processing -> process_start_time_trash_sec func , finish time processing -> process_finish_time_trash_sec func
    def adjust_time_index(self, process_type, dataframe, func):
        """Clean up the partial group at one end of ``dataframe``.

        ``func`` classifies the edge timestamp. When it says "keep", the edge
        row is relabelled onto the interval grid; otherwise the rows of the
        partial group are dropped. A new frame is returned; the input frame
        is left untouched.
        """
        edge_time = dataframe.index[process_type]
        remainder = func(edge_time)

        # the last data only shows one original data, so processing this problem
        one_sec = timedelta(seconds=1)

        # ``False`` is the "keep" sentinel. A remainder of exactly 0 has always
        # compared equal to ``False`` and taken this branch too; that behaviour
        # is preserved but now stated explicitly.
        if remainder is False or remainder == 0:
            rounded_time = self.nearest_time_rounding(edge_time)
            if process_type != 0:
                # finish side: make the seconds like 9, 19, 29...
                rounded_time = rounded_time - one_sec
            new_index = dataframe.index.tolist()
            new_index[process_type] = rounded_time
            dataframe = dataframe.copy()
            dataframe.index = new_index
            return dataframe

        # if remainder is under remove_time_in_group, just remove
        cutting_time = timedelta(seconds=remainder)
        if process_type == 0:
            return dataframe[dataframe.index >= edge_time + cutting_time]
        # make the seconds like 9, 19, 29...
        return dataframe[dataframe.index <= dataframe.index[-1] - cutting_time - one_sec]

    def _has_long_invalid_duration(self, invalid_timestamps):
        """Tell whether two consecutive flagged timestamps are far apart.

        Returns True when any pair of neighbouring entries is more than
        ``remove_time_in_group`` seconds apart.

        NOTE(review): this measures the gap between flagged samples, not the
        length of an uninterrupted invalid run — confirm that is the intent.
        """
        return any(
            (later - earlier).seconds > self.remove_time_in_group
            for earlier, later in zip(invalid_timestamps, invalid_timestamps[1:])
        )

    # Removing error values in group (brain waves and attention score)
    def check_invalid_values(self, group):
        """Average one interval group of brain-wave/attention samples.

        A brain-wave sample equal to the previous one, or an attention value
        of 0, counts as an error. Returns an all-NaN Series when the alpha or
        attention errors trip ``_has_long_invalid_duration``; otherwise the
        column means over the rows that are error-free in every column.
        """
        repeated = group[self._BRAIN_WAVE_COLUMNS].diff().eq(0)
        zero_attention = group['attention_raw_data'].eq(0)

        alpha_invalid = self._has_long_invalid_duration(
            group.index[repeated['α_wave_raw_data'].to_numpy()].tolist()
        )
        attention_invalid = self._has_long_invalid_duration(
            group.index[zero_attention.to_numpy()].tolist()
        )

        # make error values to missing values
        if alpha_invalid or attention_invalid:
            return pd.Series(np.nan, index=group.columns)

        # calculate average except error value
        valid_rows = ~(repeated.any(axis=1) | zero_attention)
        return group[valid_rows].mean()

    # Removing error values in group (hr)
    def check_invalid_values_other(self, group):
        """Average one interval group of hrv/hr/coherence samples.

        A heart rate of 0 counts as an error. Returns an all-NaN Series when
        those errors trip ``_has_long_invalid_duration``; otherwise the column
        means over the rows with a non-zero heart rate.
        """
        zero_hr = group['hr_raw_data'].eq(0)

        # make error values to missing values
        if self._has_long_invalid_duration(group.index[zero_hr.to_numpy()].tolist()):
            return pd.Series(np.nan, index=group.columns)

        # calculate average except error value
        return group[~zero_hr].mean()

    # Process EEG data
    def process_eeg_data(self, experiment_id):
        """Build the per-interval feature table for one experiment.

        Returns None when ``experiment_id`` is not an index label of the
        report. Otherwise returns a DataFrame indexed by interval start with
        the columns alpha_wave, beta_wave, theta_wave, delta_wave, gamma_wave,
        attention, hrv, hr, coherence and 'SP ratio' (beta / theta).
        """
        if experiment_id not in self.EEG_report.index:
            return None

        # all experiments in one df
        EEG_report_sample = self.EEG_report.loc[[experiment_id], :]

        # one dataframe for one column
        wave_frames = [self.parse_raw_data(EEG_report_sample, col) for col in self._WAVE_COLUMNS]
        other_frames = [self.parse_raw_data(EEG_report_sample, col) for col in self._OTHER_COLUMNS]

        # calculate two interval second because there's two type of time interval in EEG data
        duration = self.time_difference(
            EEG_report_sample, 'meditation_start_time', 'meditation_finish_time'
        )
        interval_sec = timedelta(seconds=round(duration / len(wave_frames[0]), 2))
        interval_sec_other = timedelta(seconds=round(duration / len(other_frames[0]), 2))

        # make two merged dataframe
        merged_df = wave_frames[0].join(wave_frames[1:])
        merged_df_other = other_frames[0].join(other_frames[1:])

        # experiment start time
        start_time = datetime.strptime(
            EEG_report_sample.iloc[0]['meditation_start_time'], self._TIME_FORMAT
        )

        # change index to time index based on interval second
        merged_df['time'] = [start_time + step * interval_sec for step in range(len(merged_df))]
        merged_df_other['time'] = [
            start_time + step * interval_sec_other for step in range(len(merged_df_other))
        ]
        merged_df = merged_df.set_index('time')
        merged_df_other = merged_df_other.set_index('time')

        # comparing the initial experiment error time: the longest leading run
        # of a repeated first value decides how much of the start is discarded
        initial_error_times = [
            self.count_initial_same_values(merged_df[col]) * interval_sec.total_seconds()
            for col in self._WAVE_COLUMNS
        ]
        initial_error_times.append(
            self.count_initial_same_values(merged_df_other['hr_raw_data'])
            * interval_sec_other.total_seconds()
        )
        initial_error_time = timedelta(seconds=max(initial_error_times))

        # dataset start time
        real_start_time = start_time + initial_error_time
        merged_df = merged_df[merged_df.index > real_start_time]
        merged_df_other = merged_df_other[merged_df_other.index > real_start_time]
        merged_df.index = merged_df.index.round('s')
        merged_df_other.index = merged_df_other.index.round('s')

        # make the experiment end time same
        merged_df, merged_df_other = self.align_end_time(merged_df, merged_df_other)

        # start time process -> i : 0 , finish time process -> i : -1
        merged_df = self.adjust_time_index(0, merged_df, self.process_start_time_trash_sec)
        merged_df_other = self.adjust_time_index(0, merged_df_other, self.process_start_time_trash_sec)
        merged_df = self.adjust_time_index(-1, merged_df, self.process_finish_time_trash_sec)
        merged_df_other = self.adjust_time_index(-1, merged_df_other, self.process_finish_time_trash_sec)

        # grouping
        grouped = merged_df.groupby(merged_df.index.floor(self.time_interval_str))
        grouped_other = merged_df_other.groupby(merged_df_other.index.floor(self.time_interval_str))

        result = grouped.apply(self.check_invalid_values)
        result_other = grouped_other.apply(self.check_invalid_values_other)

        # final EEG dataset including β/θ SP ratio
        EEG_data_per_time_interval = result.merge(result_other, left_index=True, right_index=True)
        EEG_data_per_time_interval['β/θ SP'] = (
            EEG_data_per_time_interval['β_wave_raw_data']
            / EEG_data_per_time_interval['θ_wave_raw_data']
        )

        return EEG_data_per_time_interval.rename(columns={
            'α_wave_raw_data': 'alpha_wave',
            'β_wave_raw_data': 'beta_wave',
            'θ_wave_raw_data': 'theta_wave',
            'δ_wave_raw_data': 'delta_wave',
            'γ_wave_raw_data': 'gamma_wave',
            'attention_raw_data': 'attention',
            'hrv_raw_data': 'hrv',
            'hr_raw_data': 'hr',
            'coherence_flag_raw_data': 'coherence',
            'β/θ SP': 'SP ratio',
        })

In [None]:
import datetime as dt
import matplotlib.pyplot as plt
import glob

class FitbitProcessor:
    '''
    Resample the per-minute Fitbit exports onto the EEG time grid.

    time_interval : Unified seconds with EEG dataset (ex.10secs)
    BM_sum_minutes
    The Body Movement feature was meaningless because the experiment was
    usually conducted while sitting on a chair. So a new body movement feature
    was created as the Body Movement value accumulated over the preceding
    period, whose length is the BM_sum_minutes variable (ex. 3 hours).
    (In this version the accumulation step inside process_azm is disabled.)
    '''
    def __init__(self, folder_path, time_interval, BM_sum_minutes):
        self.folder_path = folder_path
        self.BM_sum_minutes = BM_sum_minutes
        # 'min' / 's' are the current pandas aliases; the older 'T' / 'S'
        # spellings are deprecated.
        self.BM_sum_minutes_str = f'{BM_sum_minutes}min'
        self.time_interval = time_interval
        self.time_interval_str = f'{time_interval}s'
        folder_patterns = [
            "Active Zone Minutes (AZM)/*",
            "Sleep Score/*",
            "Stress Journal/CEDA*",
            "Temperature/Wrist Temperature - *"
        ]
        # Glob each pattern exactly once so the path list and the per-feature
        # counts can never disagree. glob returns matches in arbitrary order,
        # so sort them to make the concatenation order deterministic
        # (presumably file names sort chronologically — verify for new exports).
        matches = [sorted(glob.glob(f"{folder_path}/{pattern}")) for pattern in folder_patterns]
        azm_paths, sleep_paths, eda_paths, wt_paths = matches
        self.things_path = [path for paths in matches for path in paths]
        self.wt_count = len(wt_paths)
        self.azm_count = len(azm_paths)
        self.sleep_count = len(sleep_paths)
        self.eda_count = len(eda_paths)
        # original fitbit dataset's time interval is 1 min
        self.original_interval = 60
        self.num_timestamps = self.original_interval // self.time_interval
        self.half_point = self.num_timestamps // 2

    def read_filtered_csv(self, path, columns):
        """Read ``path`` and keep only ``columns`` (in that order).

        The first listed column is converted to datetime.
        """
        # .copy() so the datetime assignment below writes to an independent
        # frame instead of a slice of the full CSV.
        frame = pd.read_csv(path)[columns].copy()
        frame[columns[0]] = pd.to_datetime(frame[columns[0]])
        return frame

    def round_seconds(self, obj):
        """Floor ``obj`` to a multiple of ``time_interval`` seconds within its minute."""
        if obj.second % self.time_interval == 0:
            return obj
        return obj - timedelta(seconds=obj.second % self.time_interval)

    def round_zero(self, datetime_obj):
        """Return ``datetime_obj`` with its seconds set to 0."""
        return datetime_obj.replace(second=0)

    # Process whole fitbit data
    def process_fitbit_data(self):
        """Load every available Fitbit feature and merge them on one time grid.

        Returns a DataFrame indexed by 'timestamp'. Features with data come
        first in the order BM, sleep, eda, temperature; any feature without
        files is appended as an all-NaN column.
        """
        AZM_col = ['date_time', 'total_minutes']
        sleep_col = ['timestamp', 'deep_sleep_in_minutes']
        stress_col = ['timestamp', 'eda_level_real']
        temp_col = ['recorded_time', 'temperature']

        # things_path is ordered AZM, sleep, EDA, temperature (see __init__)
        things_col = (
            [AZM_col] * self.azm_count
            + [sleep_col] * self.sleep_count
            + [stress_col] * self.eda_count
            + [temp_col] * self.wt_count
        )
        things = [self.read_filtered_csv(path, col) for path, col in zip(self.things_path, things_col)]

        sleep_start = self.azm_count
        eda_start = sleep_start + self.sleep_count
        temp_start = eda_start + self.eda_count

        # (output column, raw frames, processing method) per feature
        features = [
            ('BM', things[:sleep_start], self.process_azm),
            ('sleep', things[sleep_start:eda_start], self.process_sleep),
            ('eda', things[eda_start:temp_start], self.process_eda),
            ('temperature', things[temp_start:], self.process_temperature),
        ]
        processed = [process(frames) for _, frames, process in features if frames]
        missing = {column for column, frames, _ in features if not frames}

        Min_Time, Max_Time = self.find_time_bounds(processed)

        # make final dataframe
        df = self.create_final_df(processed, Min_Time, Max_Time)
        df['timestamp'] = pd.to_datetime(df['timestamp'])
        df = df.set_index('timestamp')

        # Features without any file become NaN columns; this order reproduces
        # the column layout the individual special cases used to produce.
        for column in ('BM', 'sleep', 'temperature', 'eda'):
            if column in missing:
                df = df.assign(**{column: np.nan})

        return df

    # Process Active Zone Minutes data
    def process_azm(self, azm):
        """Repeat each per-minute AZM value over the interval grid around it.

        Every source row yields ``num_timestamps`` rows that start
        ``half_point`` intervals before the original timestamp and all carry
        the original value as 'BM'.
        """
        azm = pd.concat(azm, axis=0)
        azm = azm.rename(columns={'date_time': 'timestamp', 'total_minutes': 'BM'})

        lead = timedelta(seconds=self.half_point * self.time_interval)
        time_list = [
            (time - lead) + timedelta(seconds=number * self.time_interval)
            for time in azm['timestamp']
            for number in range(self.num_timestamps)
        ]
        body_movement = [bm for bm in azm['BM'] for _ in range(self.num_timestamps)]

        azm_final = pd.DataFrame({'timestamp': time_list, 'BM': body_movement})

        # Accumulated body-movement feature, intentionally disabled here:
        # azm_final['timestamp'] = pd.to_datetime(azm_final['timestamp'])
        # azm_final.set_index('timestamp', inplace=True)
        # azm_final = azm_final.resample(self.time_interval_str).asfreq().fillna(0)
        # azm_final['new_BM'] = azm_final['BM'].rolling(self.BM_sum_minutes_str, closed='right').sum()
        # azm_final = azm_final.drop(['BM'], axis=1)
        # azm_final = azm_final.rename(columns={'new_BM':'BM'})
        # azm_final = azm_final.astype({'BM':'int'})
        # azm_final.reset_index(inplace=True)

        return azm_final

    # Process sleep data (Deep sleep in minutes)
    def process_sleep(self, sleep):
        """Concatenate the sleep frames, zero the seconds, and name the value 'sleep'."""
        sleep = pd.concat(sleep, axis=0)
        # Only the first 19 characters (date and time) are parsed; anything
        # after that in the string form is ignored.
        sleep['timestamp'] = [
            self.round_zero(datetime.strptime(str(time)[0:19], '%Y-%m-%d %H:%M:%S'))
            for time in sleep['timestamp']
        ]
        return sleep.rename(columns={'deep_sleep_in_minutes': 'sleep'})

    def _interpolate_to_interval(self, frame, value_name, decimals):
        """Spread per-minute samples over the interval grid by linear blending.

        ``frame`` holds timestamps in column 0 and values in column 1. Each
        interior row (the first and last rows only serve as neighbours)
        yields ``num_timestamps`` rows starting ``half_point`` intervals
        before its timestamp: earlier points blend towards the previous
        sample, later points towards the next one. Values are rounded to
        ``decimals`` places.
        """
        lead = timedelta(seconds=self.half_point * self.time_interval)
        time_list = []
        value_list = []

        for row in range(1, len(frame) - 1):
            previous = frame.iloc[row - 1, 1]
            current = frame.iloc[row, 1]
            following = frame.iloc[row + 1, 1]
            start_timestamp = frame.iloc[row, 0] - lead

            for num in range(self.num_timestamps):
                time_list.append(start_timestamp + timedelta(seconds=num * self.time_interval))

                if num < self.half_point:
                    weight = (self.half_point - num) / self.num_timestamps
                    value = current - ((current - previous) * weight)
                elif num == self.half_point:
                    value = current
                else:
                    weight = (num - self.half_point) / self.num_timestamps
                    value = current + ((following - current) * weight)

                value_list.append(round(value, decimals))

        return pd.DataFrame({'timestamp': time_list, value_name: value_list})

    # Process eda data
    def process_eda(self, eda):
        """Shift, floor and interpolate the EDA samples onto the interval grid."""
        eda = pd.concat(eda, axis=0)
        # NOTE(review): the fixed 4-hour shift looks like a time-zone
        # correction for this export — confirm before reusing elsewhere.
        eda['timestamp'] = [
            self.round_seconds(
                datetime.strptime(str(time)[0:19], '%Y-%m-%d %H:%M:%S') - timedelta(hours=4)
            ) for time in eda['timestamp']]

        return self._interpolate_to_interval(eda, 'eda', 2)

    # Process temperature data
    def process_temperature(self, temperature):
        """Interpolate the wrist-temperature samples onto the interval grid."""
        temp = pd.concat(temperature, axis=0)
        temp = temp.rename(columns={'recorded_time': 'timestamp'})

        # The interpolation used to run only for an odd half_point, which
        # silently produced no temperature rows for intervals such as 12 s or
        # 15 s; it now runs for every interval, like the EDA path.
        return self._interpolate_to_interval(temp, 'temperature', 6)

    # find minimum and maximum time of whole feature
    def find_time_bounds(self, dataframes):
        """Return the earliest and latest 'timestamp' over the non-empty frames.

        When every frame is empty (or none is given) both bounds are the
        current UTC time.
        """
        populated = [df for df in dataframes if not df.empty]
        if not populated:
            return pd.Timestamp.now(tz='UTC'), pd.Timestamp.now(tz='UTC')

        Min_Time = min(df['timestamp'].min() for df in populated)
        Max_Time = max(df['timestamp'].max() for df in populated)
        return Min_Time, Max_Time

    # create dataframe from Min_time to Max_time
    def create_final_df(self, datasets, Min_Time, Max_Time):
        """Outer-merge every dataset onto a regular grid from Min_Time to Max_Time.

        Missing 'BM' values become 0 and 'sleep' is forward-filled; each step
        is skipped when that column is not part of ``datasets``.
        """
        fitbit = pd.date_range(start=Min_Time, end=Max_Time, freq=self.time_interval_str, name='timestamp')
        fitbit = pd.DataFrame(fitbit)

        for dataset in datasets:
            fitbit = pd.merge(fitbit, dataset, how='outer', on='timestamp')

        if 'BM' in fitbit.columns:
            fitbit['BM'] = fitbit['BM'].fillna(0)
        if 'sleep' in fitbit.columns:
            # .ffill() replaces the deprecated fillna(method='ffill')
            fitbit['sleep'] = fitbit['sleep'].ffill()

        return fitbit

In [None]:
class DataMerger(EEGProcessor, FitbitProcessor):
    '''
    Combine the per-interval EEG features with the resampled Fitbit features.

    We've tested some hyperparameters, and "time_interval=10, eeg_remove_time_in_group=7, BM_sum_minutes=180" have shown the best R-squared score.

    [Tested hyperparameters]
    1. Time interval : 10, 12, 15, 20sec & eeg_remove_time_in_group : 7, 9, 12, 16sec (about 70~80% of the time interval)
    R-squared score was best when we split the dataset into 10 seconds group.

    2. BM (Body Movement) sum minutes : 1h, 1h 30m, 2h, 2h 30m, 3h
    R-squared score was best when we set up the BM (Body Movement) sum minutes as 3 hours.
    '''

    # First experiment id that is processed; ids below it are skipped.
    # NOTE(review): presumably the first rows are trial runs — confirm.
    _FIRST_EXPERIMENT_ID = 3

    def __init__(self, eeg_filepath, fitbit_folderpath, time_interval=10, eeg_remove_time_in_group=7, BM_sum_minutes=180):
        # Initialize by calling parent class constructor.
        # Each parent takes different arguments, so they are called explicitly.
        EEGProcessor.__init__(self, eeg_filepath, time_interval, eeg_remove_time_in_group)
        FitbitProcessor.__init__(self, fitbit_folderpath, time_interval, BM_sum_minutes)

        self.eeg_filepath = eeg_filepath

    # Merge EEG and Fitbit data
    def merge_data(self):
        """Left-join the Fitbit features onto the EEG intervals.

        Returns the merged DataFrame, or None when no experiment produced EEG
        data or the Fitbit frame is empty.
        """
        # processing EEG data
        # EEGProcessor.__init__ already loaded the report from eeg_filepath,
        # so its length is taken from there instead of reading the file again.
        result_dfs = []

        # For all experiments in the eeg data csv file
        for exp_id in range(self._FIRST_EXPERIMENT_ID, len(self.EEG_report)):
            processed_data = self.process_eeg_data(exp_id)
            if processed_data is not None:
                result_dfs.append(processed_data)

        combined_eeg = None
        if result_dfs:
            combined_eeg = pd.concat(result_dfs)
            combined_eeg.index = pd.to_datetime(combined_eeg.index)

        # processing Fitbit data
        fitbit_data = self.process_fitbit_data()
        fitbit_data.index = pd.to_datetime(fitbit_data.index)

        # merging two dataframes
        if combined_eeg is None or fitbit_data.empty:
            return None
        return combined_eeg.merge(fitbit_data, left_index=True, right_index=True, how='left')

In [None]:
# One DataMerger per recording set. Only eeg_filepath and fitbit_folderpath
# are passed; time_interval, eeg_remove_time_in_group and BM_sum_minutes keep
# the DataMerger defaults (10, 7 and 180).
# `merger` is rebound for every set; only the final_* frames are kept.

# "jm" recordings
merger = DataMerger(r"C:\Users\ballj\OneDrive\바탕 화면\EEG_jm.csv",
                    r"C:\Users\ballj\OneDrive\바탕 화면\Fitbit_JM")

final_jm = merger.merge_data()

# "yh" recordings
merger = DataMerger(r"C:\Users\ballj\OneDrive\바탕 화면\EEG_yh.csv",
                    r"C:\Users\ballj\OneDrive\바탕 화면\Fitbit_YH")

final_yh = merger.merge_data()

# "sj" recordings
merger = DataMerger(r"C:\Users\ballj\OneDrive\바탕 화면\EEG_sj.csv",
                    r"C:\Users\ballj\OneDrive\바탕 화면\Fitbit_SJ")


final_sj = merger.merge_data()

# "sa" recordings
merger = DataMerger(r"C:\Users\ballj\OneDrive\바탕 화면\EEG_sa.csv",
                    r"C:\Users\ballj\OneDrive\바탕 화면\Fitbit_SA")


final_sa = merger.merge_data()

# "bs" recordings
merger = DataMerger(r"C:\Users\ballj\OneDrive\바탕 화면\EEG_bs.csv",
                    r"C:\Users\ballj\OneDrive\바탕 화면\Fitbit_BS")


final_bs = merger.merge_data()

# "mj" recordings
merger = DataMerger(r"C:\Users\ballj\OneDrive\바탕 화면\EEG_mj.csv",
                    r"C:\Users\ballj\OneDrive\바탕 화면\Fitbit_MJ")


final_mj = merger.merge_data()

In [None]:
# Stack every recording set's merged frame and order the rows by timestamp.
# Rows with missing values are kept at this stage.
dataset = (
    pd.concat([final_jm, final_sj, final_bs, final_yh, final_mj, final_sa])
    .sort_index()
)
dataset

In [None]:
# Persist the combined dataset as CSV at a hard-coded local path.
# DataFrame.to_csv writes the (timestamp) index as the first column by default.
dataset.to_csv(r'C:\Users\ballj\OneDrive\바탕 화면\3.csv')

# 4. Feature Engineering (10sec group) - Same as Automation Code for Dataset

In [None]:
import pandas as pd
import numpy as np
from datetime import datetime, timedelta

class EEGProcessor:
     
    # time_interval : width of one aggregation group in seconds, shared with the Fitbit data (e.g. 10 s)
    # remove_time_in_group : threshold in seconds used when handling error values inside each time_interval group (e.g. 7 s)
    
    def __init__(self, file_path, time_interval, remove_time_in_group):
        self.time_interval = time_interval
        self.remove_time_in_group = remove_time_in_group
        self.time_interval_str = f'{time_interval}S'
        self.EEG_report = pd.read_csv(file_path)

    # List to dataframe (ex. brain waves)
    def parse_raw_data(self, dataframe, col_name):
        col_str = dataframe.iloc[0][col_name]
        col_str = col_str.strip('[]')
        col_list = [float(val) for val in col_str.split(',')]  # cause list is divided by comma
        col_data = pd.DataFrame({col_name: col_list})
        return col_data

    # Experiment time calculating function
    def time_difference(self, dataframe, start_time_col, finish_time_col):
        start_time = datetime.strptime(dataframe.iloc[0][start_time_col], '%Y-%m-%d %H:%M:%S')
        finish_time = datetime.strptime(dataframe.iloc[0][finish_time_col], '%Y-%m-%d %0H:%M:%S')

        # time difference between two datatime objects
        time_difference = (finish_time - start_time).total_seconds()
        return time_difference
    
    # Comparing the experimental initial recognition error period and delete the part to be deleted
    def count_initial_same_values(self, series):
        initial_value = series.iloc[0]
        count = 0
        for value in series:
            if value == initial_value:
                count += 1
            else:
                break
        return count
    
    # Processing of values that are not exactly divided into front and back
    def process_start_time_trash_sec(self, start_time):
        # Plus 1 min and delete second in input time
        rounded_time = start_time + timedelta(minutes=1) - timedelta(seconds=start_time.second)
        time_difference = (rounded_time - start_time).total_seconds()

        remainder = time_difference % float(self.time_interval)

        # use the seconds over remove_time_in_group seconds
        if self.remove_time_in_group <= remainder:
            return False
        
        else:
            return remainder
        
    # Processing of values that are not exactly divided into front and back
    def process_finish_time_trash_sec(self, finish_time):
        # delete second in input time
        rounded_time = finish_time - timedelta(seconds=finish_time.second)
        time_difference = (finish_time - rounded_time).total_seconds()

        remainder = time_difference % float(self.time_interval)

        if self.remove_time_in_group <= remainder:
            return False
        
        else:
            return remainder

    # Rounding time to nearest time which can divided by time interval
    def nearest_time_rounding(self, time):
        seconds = time.second
        # For example, find nearest value in 0, 10, 20, 30, 40, 50sec
        time_points = [time_point for time_point in range(0,60, self.time_interval)]
        nearest = min(time_points, key=lambda time_point: abs(time_point - seconds))
        
        if nearest == time_points[-1] and seconds >= (time_points[-1] + self.remove_time_in_group):
            rounded_time = time.replace(second=0, microsecond=0) + timedelta(minutes=1)
        else:
            rounded_time = time.replace(second=nearest, microsecond=0)

        return rounded_time

    # Make the same end time
    def align_end_time(self, dataframe_1, dataframe_2):
        if dataframe_1.index[-1] > dataframe_2.index[-1]:
            dataframe_1 = dataframe_1[dataframe_1.index <= dataframe_2.index[-1]]

        elif dataframe_1.index[-1] < dataframe_2.index[-1]:
            dataframe_2 = dataframe_2[dataframe_2.index <= dataframe_1.index[-1]]

        else: 
            pass # if two dataframe's endtime is same

        return dataframe_1, dataframe_2

    # Adjust start time and end time processing
    # start time processing -> process_type : 0 , finish time processing -> process_type : -1
    # start time processing -> process_start_time_trash_sec func , finish time processing -> process_finish_time_trash_sec func    
    def adjust_time_index(self, process_type, dataframe, func):
        remainder = func(dataframe.index[process_type])
        
        # the last data only shows one original data, so processing this problem
        one_sec = timedelta(seconds=1)

        if remainder == False:
            # change time to nearest (Start time processing)
            if process_type == 0 :
                time = self.nearest_time_rounding(dataframe.index[process_type])
                new_index = dataframe.index.tolist()
                new_index[process_type] = time
                dataframe.index = new_index
            
            # change time to nearest (Finish time processing)
            else:
                time = self.nearest_time_rounding(dataframe.index[process_type]) - one_sec
                new_index = dataframe.index.tolist()
                new_index[process_type] = time
                dataframe.index = new_index
        
        # if remainder is under remove_time_in_group, just remove
        else:
            cutting_time = timedelta(seconds=remainder)
            # Start time processing
            if process_type == 0:
                dataframe = dataframe[dataframe.index >= dataframe.index[process_type] + cutting_time]
            
            # Finish time processing
            # make the seconds like 9, 19, 29...
            else:
                dataframe = dataframe[dataframe.index <= dataframe.index[-1] - cutting_time - one_sec]

        return dataframe
    
    # Removing error values in group (brain waves and attention score)
    def check_invalid_values(self, group):
        # find error data length in brain wave
        alpha_invalid_series = group['α_wave_raw_data'].diff().eq(0)
        alpha_invalid_timestamps = group.index[alpha_invalid_series].tolist()

        # find error data length in attention_raw_data
        attention_invalid_series = group['attention_raw_data'] == 0
        attention_invalid_timestamps = group.index[attention_invalid_series].tolist()

        # check whether the length of error data is over remove_time_in_group second
        def has_long_invalid_duration(invalid_timestamps):
            if not invalid_timestamps:
                return False
            for invalid_time in range(1, len(invalid_timestamps)):
                if (invalid_timestamps[invalid_time] - invalid_timestamps[invalid_time-1]).seconds > self.remove_time_in_group:
                    return True
            return False

        alpha_invalid = has_long_invalid_duration(alpha_invalid_timestamps)
        attention_invalid = has_long_invalid_duration(attention_invalid_timestamps)
        
        # make error values to missing values
        if alpha_invalid or attention_invalid:
            return pd.Series([np.nan] * group.shape[1], index=group.columns)

        else:
            # calculate average except error value
            valid_conditions = (
                (group['α_wave_raw_data'].diff() != 0) & 
                (group['β_wave_raw_data'].diff() != 0) & 
                (group['θ_wave_raw_data'].diff() != 0) & 
                (group['δ_wave_raw_data'].diff() != 0) & 
                (group['γ_wave_raw_data'].diff() != 0) & 
                (group['attention_raw_data'] != 0)
            )
            return group[valid_conditions].mean()

    # Removing error values in group (hr)
    def check_invalid_values_other(self, group):
        # find error data length in hr
        hr_invalid_series = group['hr_raw_data'] == 0
        hr_invalid_timestamps = group.index[hr_invalid_series].tolist()

        # check whether the length of error data is over remove_time_in_group second
        def has_long_invalid_duration(invalid_timestamps):
            if not invalid_timestamps:
                return False
            for invalid_time in range(1, len(invalid_timestamps)):
                if (invalid_timestamps[invalid_time] - invalid_timestamps[invalid_time-1]).seconds > self.remove_time_in_group:
                    return True
            return False

        hr_invalid = has_long_invalid_duration(hr_invalid_timestamps)

        # make error values to missing values
        if hr_invalid:
            return pd.Series([np.nan] * group.shape[1], index=group.columns)

        else:
            # calculate average except error value
            group = group[(group['hr_raw_data'] != 0)]
            return group.mean()
    
    # Process EEG data
    def process_eeg_data(self, experiment_id):
        """Build the per-``time_interval`` EEG feature table of one experiment.

        Args:
            experiment_id: Index label of the experiment row in ``self.EEG_report``.

        Returns:
            ``None`` when ``experiment_id`` is not an index label of the report.
            Otherwise a DataFrame indexed by the floored group time with the
            columns alpha_wave, beta_wave, theta_wave, delta_wave, gamma_wave,
            attention, hrv, hr, coherence and ``SP ratio`` (beta / theta).
        """
        if experiment_id not in self.EEG_report.index:
            return None

        # keep only this experiment's row (the double brackets keep it a DataFrame)
        EEG_report_sample = self.EEG_report.loc[[experiment_id],:]

        # one single-column dataframe per raw-data column
        cols = ['α_wave_raw_data', 'β_wave_raw_data', 'θ_wave_raw_data', 'δ_wave_raw_data', 'γ_wave_raw_data', 'attention_raw_data', 'hrv_raw_data', 'hr_raw_data', 'coherence_flag_raw_data']
        parsed_dfs = [self.parse_raw_data(EEG_report_sample, col) for col in cols]

        # two sampling intervals are derived: experiment duration divided by the
        # number of samples of column 0 (waves/attention) and of column 6 (hrv/hr/coherence)
        # NOTE(review): time_difference is evaluated twice with identical arguments
        interval_sec = self.time_difference(EEG_report_sample, 'meditation_start_time', 'meditation_finish_time') / len(parsed_dfs[0])
        interval_sec_other = self.time_difference(EEG_report_sample, 'meditation_start_time', 'meditation_finish_time') / len(parsed_dfs[6])

        # join the columns into two frames: waves + attention, and hrv + hr + coherence
        merged_df = parsed_dfs[0].join(parsed_dfs[1:6])
        merged_df_other = parsed_dfs[6].join(parsed_dfs[7:])

        # experiment start time
        start_time = datetime.strptime(EEG_report_sample.iloc[0]['meditation_start_time'], '%Y-%m-%d %H:%M:%S')

        # replace the positional index by timestamps spaced by the (rounded to 2 decimals) interval
        interval_sec, interval_sec_other = timedelta(seconds=round(interval_sec,2)), timedelta(seconds=round(interval_sec_other,2))
        merged_df['time'] = [start_time + time * interval_sec for time in range(len(merged_df))]
        merged_df_other['time'] = [start_time + time * interval_sec_other for time in range(len(merged_df_other))]
        merged_df, merged_df_other = merged_df.set_index('time'), merged_df_other.set_index('time')

        # length of the initial run of unchanged values for the six wave/attention
        # columns and for hr, converted to seconds; the longest one is treated as
        # the initial error period
        counts = [self.count_initial_same_values(merged_df[col]) for col in cols[:6]] + [self.count_initial_same_values(merged_df_other['hr_raw_data'])]
        initial_error_times = [counts[error] * interval_sec.total_seconds() if error != 6 else counts[error] * interval_sec_other.total_seconds() for error in range(7)]
        initial_error_time = timedelta(seconds=max(initial_error_times))

        # dataset start time: keep only samples strictly after the initial error period,
        # then round the timestamps to whole seconds
        # NOTE(review): the 'S' frequency alias is deprecated since pandas 2.2 ('s' is the replacement)
        real_start_time = start_time + initial_error_time
        merged_df, merged_df_other = merged_df[merged_df.index > real_start_time], merged_df_other[merged_df_other.index > real_start_time]
        merged_df.index, merged_df_other.index = merged_df.index.round('S'), merged_df_other.index.round('S')

        # make both frames end at the same time
        merged_df, merged_df_other = self.align_end_time(merged_df, merged_df_other)

        # start time process -> process_type 0 with process_start_time_trash_sec
        # finish time process -> process_type -1 with process_finish_time_trash_sec
        merged_df = self.adjust_time_index(0, merged_df, self.process_start_time_trash_sec)
        merged_df_other = self.adjust_time_index(0, merged_df_other, self.process_start_time_trash_sec)
        merged_df = self.adjust_time_index(-1, merged_df, self.process_finish_time_trash_sec)
        merged_df_other = self.adjust_time_index(-1, merged_df_other, self.process_finish_time_trash_sec)

        # group the samples by their timestamp floored to time_interval
        grouped = merged_df.groupby(merged_df.index.floor(self.time_interval_str))
        grouped_other = merged_df_other.groupby(merged_df_other.index.floor(self.time_interval_str))

        # one aggregated row (or an all-NaN row) per group
        result = grouped.apply(self.check_invalid_values)
        result_other = grouped_other.apply(self.check_invalid_values_other)

        # final EEG dataset (inner merge on the group time) including the β/θ SP ratio
        EEG_data_per_time_interval = result.merge(result_other, left_index=True, right_index=True)
        EEG_data_per_time_interval['β/θ SP'] = EEG_data_per_time_interval['β_wave_raw_data'] / EEG_data_per_time_interval['θ_wave_raw_data']

        # switch to ASCII column names
        EEG_data_per_time_interval = EEG_data_per_time_interval.rename(columns={
            'α_wave_raw_data':'alpha_wave',
            'β_wave_raw_data':'beta_wave',
            'θ_wave_raw_data':'theta_wave',
            'δ_wave_raw_data':'delta_wave',
            'γ_wave_raw_data':'gamma_wave',
            'attention_raw_data' : 'attention',
            'hrv_raw_data' : 'hrv',
            'hr_raw_data' : 'hr',
            'coherence_flag_raw_data' : 'coherence',
            'β/θ SP' : 'SP ratio'
        })

        return EEG_data_per_time_interval

In [None]:
import datetime as dt
import matplotlib.pyplot as plt
import glob

class FitbitProcessor:
    '''
    time_interval : width of one aggregation group in seconds, shared with the EEG dataset (e.g. 10 s).
    BM_sum_minutes : look-back window, in minutes, over which Body Movement is accumulated (e.g. 180 = 3 hours).
    The raw Body Movement feature carried little information because subjects were usually
    seated during the experiment, so it is replaced by the Body Movement accumulated over
    the preceding BM_sum_minutes.
    '''
    def __init__(self, folder_path, time_interval, BM_sum_minutes):
        self.folder_path = folder_path
        self.BM_sum_minutes = BM_sum_minutes
        self.BM_sum_minutes_str = f'{BM_sum_minutes}T'
        self.time_interval = time_interval
        self.time_interval_str = f'{time_interval}S'
        folder_patterns = [
            "Active Zone Minutes (AZM)/*",
            "Sleep Score/*",
            "Stress Journal/CEDA*",
            "Temperature/Wrist Temperature - *"
        ]
        self.things_path = [glob.glob(f"{folder_path}/{pattern}") for pattern in folder_patterns]
        self.things_path = [item for sublist in self.things_path for item in sublist]
        self.wt_count = len(glob.glob(f"{folder_path}/Temperature/Wrist Temperature - *"))
        self.azm_count = len(glob.glob(f"{folder_path}/Active Zone Minutes (AZM)/*"))
        self.sleep_count = len(glob.glob(f"{folder_path}/Sleep Score/*"))
        self.eda_count = len(glob.glob(f"{folder_path}/Stress Journal/CEDA*"))
        # original fitbit dataset's time interval is 1 min
        self.original_interval = 60
        self.num_timestamps = self.original_interval // self.time_interval
        self.half_point = self.num_timestamps // 2
    
    def read_filtered_csv(self, path, columns):
        name = pd.read_csv(path)
        name = name[columns]
        name[columns[0]] = pd.to_datetime(name[columns[0]])
        return name
    
    def round_seconds(self, obj):
        if obj.second % self.time_interval == 0:
            return obj
        else:
            return obj - timedelta(seconds=obj.second % self.time_interval)
    
    def round_zero(self, datetime_obj):
        datetime_obj = datetime_obj.replace(second=0)
        return datetime_obj
    
    # Process whole fitbit data
    def process_fitbit_data(self):
        AZM_col = ['date_time', 'total_minutes']
        sleep_col = ['timestamp', 'deep_sleep_in_minutes']
        stress_col = ['timestamp', 'eda_level_real']
        temp_col = ['recorded_time', 'temperature']
        
        # merge all features
        things_col = [AZM_col] * self.azm_count + [sleep_col] * self.sleep_count + [stress_col] * self.eda_count + [temp_col] * self.wt_count
        things = [self.read_filtered_csv(path, col) for path, col in zip(self.things_path, things_col)]

        # if there's no wrist temperature
        if self.wt_count == 0:
            # if there's no eda data
            # there was no eda in two subjects' fitbit data
            if self.eda_count == 0:
                azm = self.process_azm(things[:self.azm_count])
                sleep = self.process_sleep(things[self.azm_count:self.azm_count+self.sleep_count])
                Min_Time, Max_Time = self.find_time_bounds([azm, sleep])
                
                # make final dataframe
                df = self.create_final_df([azm, sleep], Min_Time, Max_Time)
                df['timestamp'] = pd.to_datetime(df['timestamp'])
                df = df.set_index('timestamp')
                df = df.assign(temperature=np.nan)
                df = df.assign(eda=np.nan)
                
                return df
            
            else:
                azm = self.process_azm(things[:self.azm_count])
                sleep = self.process_sleep(things[self.azm_count:self.azm_count+self.sleep_count])
                eda = self.process_eda(things[self.azm_count+self.sleep_count:self.azm_count+self.sleep_count+self.eda_count])
                Min_Time, Max_Time = self.find_time_bounds([azm, sleep, eda])

                df = self.create_final_df([azm, sleep, eda], Min_Time, Max_Time)
                df['timestamp'] = pd.to_datetime(df['timestamp'])
                df = df.set_index('timestamp')
                df = df.assign(temperature=np.nan)
                return df
            
        # if there's no Active Zone Minutes data
        elif self.azm_count == 0:
            sleep = self.process_sleep(things[self.azm_count])
            eda = self.process_eda(things[self.azm_count+self.sleep_count:self.azm_count+self.sleep_count+self.eda_count])
            temp = self.process_temperature(things[self.azm_count+self.sleep_count+self.eda_count:])
            Min_Time, Max_Time = self.find_time_bounds([sleep, eda, temp])

            df = self.create_final_df([sleep, eda, temp], Min_Time, Max_Time)
            df['timestamp'] = pd.to_datetime(df['timestamp'])
            df = df.set_index('timestamp')
            df = df.assign(BM=np.nan)

            return df
        
        # if there's no sleep data
        elif self.sleep_count == 0:
            azm = self.process_azm(things[:self.azm_count])
            eda = self.process_eda(things[self.azm_count+self.sleep_count:self.azm_count+self.sleep_count+self.eda_count])
            temp = self.process_temperature(things[self.azm_count+self.sleep_count+self.eda_count:])
            Min_Time, Max_Time = self.find_time_bounds([azm, eda, temp])

            df = self.create_final_df([azm, eda, temp], Min_Time, Max_Time)
            df['timestamp'] = pd.to_datetime(df['timestamp'])
            df = df.set_index('timestamp')
            df = df.assign(sleep=np.nan)

            return df            
        
        # if there's no eda data
        elif self.eda_count == 0:
            azm = self.process_azm(things[:self.azm_count])
            sleep = self.process_sleep(things[self.azm_count:self.azm_count+self.sleep_count])
            temp = self.process_temperature(things[self.azm_count+self.sleep_count:])
            Min_Time, Max_Time = self.find_time_bounds([azm, sleep, temp])

            df = self.create_final_df([azm, sleep, temp], Min_Time, Max_Time)
            df['timestamp'] = pd.to_datetime(df['timestamp'])
            df = df.set_index('timestamp')
            df = df.assign(eda=np.nan)

            return df
            
        # if there's no error in data file
        else:
            azm = self.process_azm(things[:self.azm_count])
            sleep = self.process_sleep(things[self.azm_count:self.azm_count+self.sleep_count])
            eda = self.process_eda(things[self.azm_count+self.sleep_count:self.azm_count+self.sleep_count+self.eda_count])
            temp = self.process_temperature(things[self.azm_count+self.sleep_count+self.eda_count:])
            Min_Time, Max_Time = self.find_time_bounds([azm, sleep, eda, temp])

            df = self.create_final_df([azm, sleep, eda, temp], Min_Time, Max_Time)
            df['timestamp'] = pd.to_datetime(df['timestamp'])
            df = df.set_index('timestamp')

            return df
    
    # Process Active Zone Minutes data
    def process_azm(self, azm):

        azm = pd.concat(azm, axis=0)
        azm = azm.rename(columns={'date_time':'timestamp', 'total_minutes':'BM'})
        time_list = []
        body_movement = []

        for time in azm['timestamp']:
            start_timestamp = time - timedelta(seconds=(self.half_point * self.time_interval))

            for number in range(self.num_timestamps):
                new_timestamp = start_timestamp + timedelta(seconds = (number*self.time_interval))
                time_list.append(new_timestamp)

        for bm in azm['BM']:
            for _ in range(self.num_timestamps):
                body_movement.append(bm)

        azm_list = {'timestamp': time_list, 'BM': body_movement}
        azm_final = pd.DataFrame(azm_list)

        azm_final['timestamp'] = pd.to_datetime(azm_final['timestamp'])
        azm_final.set_index('timestamp', inplace=True)
        azm_final = azm_final.resample(self.time_interval_str).asfreq().fillna(0)
        azm_final['new_BM'] = azm_final['BM'].rolling(self.BM_sum_minutes_str, closed='right').sum()
        azm_final = azm_final.drop(['BM'], axis=1)
        azm_final = azm_final.rename(columns={'new_BM':'BM'})
        azm_final = azm_final.astype({'BM':'int'})
        azm_final.reset_index(inplace=True)
        
        return azm_final
    
    # Process sleep data (Deep sleep in minutes)
    def process_sleep(self, sleep):
        sleep = pd.concat(sleep, axis=0)
        sleep['timestamp'] = [
                self.round_zero(datetime.strptime(str(time)[0:19], '%Y-%m-%d %H:%M:%S')) for time in sleep['timestamp']
            ]
            
        sleep = sleep.rename(columns={'deep_sleep_in_minutes':'sleep'})
        return sleep
    
    # Process eda data
    def process_eda(self, eda):
        eda = pd.concat(eda, axis=0)
        eda['timestamp'] = [
            self.round_seconds(
                datetime.strptime(str(time)[0:19], '%Y-%m-%d %H:%M:%S') - dt.timedelta(hours=4)
            ) for time in eda['timestamp']]

        # EDA Bilinear Interpolation
        time_list = []
        eda_list = []

        for time in range(1, len(eda['timestamp']) - 1):
            start_timestamp = eda.iloc[time,0] - timedelta(seconds = self.half_point * self.time_interval)

            for num in range(self.num_timestamps):
                new_timestamp = start_timestamp + timedelta(seconds = (num * self.time_interval))
                time_list.append(new_timestamp)

                if num < self.half_point:
                    weight = (self.half_point - num) / self.num_timestamps
                    value = eda.iloc[time, 1] - ((eda.iloc[time, 1] - eda.iloc[time - 1, 1]) * weight)

                elif num == self.half_point:
                    value = eda.iloc[time,1]

                else:
                    weight = (num - self.half_point) / self.num_timestamps
                    value = eda.iloc[time, 1] + ((eda.iloc[time + 1, 1] - eda.iloc[time, 1]) * weight)

                eda_list.append(round(value, 2))

        eda_list = {'timestamp': time_list, 'eda': eda_list}
        eda_final = pd.DataFrame(eda_list)

        return eda_final
    
    # Process temperature data
    def process_temperature(self, temperature):
        #Temperature
        temp = pd.concat(temperature, axis=0)
        temp = temp.rename(columns={'recorded_time':'timestamp'})

        # Temperature Bilinear Interpolation
        time_list = []
        temp_list = []

        for time in range(1, len(temp['timestamp']) - 1):
            if self.half_point % 2 != 0:
                start_timestamp = temp.iloc[time,0] - timedelta(seconds = self.half_point * self.time_interval)
                for num in range(self.num_timestamps):
                    new_timestamp = start_timestamp + timedelta(seconds = (num * self.time_interval))
                    time_list.append(new_timestamp)

                    if num < self.half_point:
                        weight = (self.half_point - num) / self.num_timestamps 
                        value = temp.iloc[time, 1] - ((temp.iloc[time, 1] - temp.iloc[time - 1, 1]) * weight)

                    elif num == self.half_point:
                        value = temp.iloc[time,1]

                    else:
                        weight = (num - self.half_point) / self.num_timestamps
                        value = temp.iloc[time, 1] + ((temp.iloc[time + 1, 1] - temp.iloc[time, 1]) * weight)

                    temp_list.append(round(value, 6))

        temp_list = {'timestamp': time_list, 'temperature': temp_list}
        temp_final = pd.DataFrame(temp_list)

        return temp_final
    
    # find minimum and maximum time of whole feature
    def find_time_bounds(self, dataframes):
        min_times = []
        max_times = []
        
        for df in dataframes:
            if not df.empty:
                min_times.append(df['timestamp'].min())
                max_times.append(df['timestamp'].max())
                
        if not min_times or not max_times:
            Min_Time = pd.Timestamp.now(tz='UTC')
            Max_Time = pd.Timestamp.now(tz='UTC')
        else:
            Min_Time = min(min_times)
            Max_Time = max(max_times)

        return Min_Time, Max_Time

    # create dataframe from Min_time to Max_time
    def create_final_df(self, datasets, Min_Time, Max_Time):
        fitbit = pd.date_range(start=Min_Time, end=Max_Time, freq=self.time_interval_str, name='timestamp')
        fitbit = pd.DataFrame(fitbit)

        for dataset in datasets:
            fitbit = pd.merge(fitbit, dataset, how='outer', on='timestamp')
            
        fitbit['BM'] = fitbit['BM'].fillna(0)
        fitbit['sleep'] = fitbit['sleep'].fillna(method='ffill')

        return fitbit

In [None]:
class DataMerger(EEGProcessor, FitbitProcessor):
    '''
    Merge the per-interval EEG features of one subject with that subject's Fitbit features.

    We've tested some hyperparameters, and "time_interval=10, eeg_remove_time_in_group=7, BM_sum_minutes=180" have shown the best R-squared score.

    [Tested hyperparameters]
    1. Time interval : 10, 12, 15, 20sec & eeg_remove_time_in_group : 7, 9, 12, 16sec (70~80% of the time interval)
    R-squared score was best when we split the dataset into 10 seconds group.

    2. BM (Body Movement) sum minutes : 1h, 1h 30m, 2h, 2h 30m, 3h
    R-squared score was best when we set up the BM (Body Movement) sum minutes as 3 hours.
    '''

    def __init__(self, eeg_filepath, fitbit_folderpath, time_interval=10, eeg_remove_time_in_group=7, BM_sum_minutes=180):
        # The two parent constructors take different arguments, so each one
        # is called explicitly.
        EEGProcessor.__init__(self, eeg_filepath, time_interval, eeg_remove_time_in_group)
        FitbitProcessor.__init__(self, fitbit_folderpath, time_interval, BM_sum_minutes)

        self.eeg_filepath = eeg_filepath

    # Merge EEG and Fitbit data
    def merge_data(self, first_experiment_id=3):
        """Process every experiment and left-join the Fitbit features onto it.

        Args:
            first_experiment_id: First experiment id handed to
                ``process_eeg_data``; ids run up to the number of rows in the
                EEG report. Defaults to 3, the value that was hard-coded here.
                NOTE(review): why ids 0-2 are skipped is not visible in this
                file - confirm.

        Returns:
            The EEG frame left-merged with the Fitbit frame on the time
            index, or ``None`` when no experiment produced data or the Fitbit
            frame is empty.
        """
        # The report is already loaded by EEGProcessor.__init__, so the CSV
        # does not need to be read a second time just to count its rows.
        result_dfs = []
        for exp_id in range(first_experiment_id, len(self.EEG_report)):
            processed_data = self.process_eeg_data(exp_id)
            if processed_data is not None:
                result_dfs.append(processed_data)

        if not result_dfs:
            return None

        combined_eeg = pd.concat(result_dfs)
        combined_eeg.index = pd.to_datetime(combined_eeg.index)

        # processing Fitbit data
        fitbit_data = self.process_fitbit_data()
        fitbit_data.index = pd.to_datetime(fitbit_data.index)
        if fitbit_data.empty:
            return None

        # keep every EEG row; Fitbit columns are NaN where no timestamp matches
        return combined_eeg.merge(fitbit_data, left_index=True, right_index=True, how='left')

In [None]:
# One (EEG report csv, Fitbit export folder) pair per subject, in processing order.
# DataMerger receives only these two positional arguments, so time_interval,
# eeg_remove_time_in_group and BM_sum_minutes keep their default values.
subject_inputs = [
    (r"C:\Users\ballj\OneDrive\바탕 화면\EEG_jm.csv",
     r"C:\Users\ballj\OneDrive\바탕 화면\Fitbit_JM"),
    (r"C:\Users\ballj\OneDrive\바탕 화면\EEG_yh.csv",
     r"C:\Users\ballj\OneDrive\바탕 화면\Fitbit_YH"),
    (r"C:\Users\ballj\OneDrive\바탕 화면\EEG_sj.csv",
     r"C:\Users\ballj\OneDrive\바탕 화면\Fitbit_SJ"),
    (r"C:\Users\ballj\OneDrive\바탕 화면\EEG_sa.csv",
     r"C:\Users\ballj\OneDrive\바탕 화면\Fitbit_SA"),
    (r"C:\Users\ballj\OneDrive\바탕 화면\EEG_bs.csv",
     r"C:\Users\ballj\OneDrive\바탕 화면\Fitbit_BS"),
    (r"C:\Users\ballj\OneDrive\바탕 화면\EEG_mj.csv",
     r"C:\Users\ballj\OneDrive\바탕 화면\Fitbit_MJ"),
]

merged_subjects = []
for eeg_csv, fitbit_folder in subject_inputs:
    merger = DataMerger(eeg_csv, fitbit_folder)
    merged_subjects.append(merger.merge_data())

# Same names, bound to the same subjects, as the later cells expect.
final_jm, final_yh, final_sj, final_sa, final_bs, final_mj = merged_subjects

In [None]:
# Stack every subject's merged frame, then order the rows by their time index.
subject_frames = [final_jm, final_sj, final_bs, final_yh, final_mj, final_sa]
dataset = pd.concat(subject_frames).sort_index()
dataset

In [None]:
# Write the feature-engineered dataset to a hard-coded Windows path, using to_csv's default options.
dataset.to_csv(r'C:\Users\ballj\OneDrive\바탕 화면\4.Feature_Engineering.csv')

# 1~4 + 5.Feature Scaling + 6.Imputation

In [None]:
import pandas as pd
import numpy as np
from datetime import datetime, timedelta

class EEGProcessor:
     
    # time_interval : Unified seconds with Fitbit data (ex. 10secs) 
    # remove_time_in_group : the criteria of processing error values in each time_interval group (ex. 7secs)
    
    def __init__(self, file_path, time_interval, remove_time_in_group):
        self.time_interval = time_interval
        self.remove_time_in_group = remove_time_in_group
        self.time_interval_str = f'{time_interval}S'
        self.EEG_report = pd.read_csv(file_path)

    # List to dataframe (ex. brain waves)
    def parse_raw_data(self, dataframe, col_name):
        """Expand the bracketed, comma-separated list stored in one cell.

        The text is taken from the first row of ``dataframe`` in column
        ``col_name``; surrounding ``[`` / ``]`` characters are stripped and
        every comma-separated item is converted to ``float``.

        Returns:
            A single-column DataFrame named ``col_name``, one row per item.
        """
        raw_text = dataframe.iloc[0][col_name].strip('[]')
        numbers = list(map(float, raw_text.split(',')))
        return pd.DataFrame({col_name: numbers})

    # Experiment time calculating function
    def time_difference(self, dataframe, start_time_col, finish_time_col):
        """Return the experiment duration in seconds.

        Both timestamps are read from the first row of ``dataframe`` and must
        be text formatted as ``YYYY-MM-DD HH:MM:SS``.

        Args:
            dataframe: Frame whose first row holds the two timestamps.
            start_time_col: Name of the column with the start time.
            finish_time_col: Name of the column with the finish time.

        Returns:
            ``finish - start`` as a float number of seconds.

        Raises:
            ValueError: If either cell does not match the expected format.
        """
        # The finish time was previously parsed with '%0H'. That is not a
        # documented strptime directive and is rejected as a bad directive by
        # strptime implementations that do not accept flag characters, so both
        # columns now share the standard '%H' format.
        time_format = '%Y-%m-%d %H:%M:%S'
        first_row = dataframe.iloc[0]
        start_time = datetime.strptime(first_row[start_time_col], time_format)
        finish_time = datetime.strptime(first_row[finish_time_col], time_format)

        return (finish_time - start_time).total_seconds()
    
    # Comparing the experimental initial recognition error period and delete the part to be deleted
    def count_initial_same_values(self, series):
        """Count how many leading entries of ``series`` equal its first entry.

        Iteration stops at the first value that does not compare equal to
        ``series.iloc[0]``; if no such value exists the full length is returned.
        """
        first_value = series.iloc[0]
        first_change = (
            position
            for position, value in enumerate(series)
            if not (value == first_value)
        )
        return next(first_change, len(series))
    
    # Processing of values that are not exactly divided into front and back
    def process_start_time_trash_sec(self, start_time):
        # Plus 1 min and delete second in input time
        rounded_time = start_time + timedelta(minutes=1) - timedelta(seconds=start_time.second)
        time_difference = (rounded_time - start_time).total_seconds()

        remainder = time_difference % float(self.time_interval)

        # use the seconds over remove_time_in_group seconds
        if self.remove_time_in_group <= remainder:
            return False
        
        else:
            return remainder
        
    # Processing of values that are not exactly divided into front and back
    def process_finish_time_trash_sec(self, finish_time):
        # delete second in input time
        rounded_time = finish_time - timedelta(seconds=finish_time.second)
        time_difference = (finish_time - rounded_time).total_seconds()

        remainder = time_difference % float(self.time_interval)

        if self.remove_time_in_group <= remainder:
            return False
        
        else:
            return remainder

    # Rounding time to nearest time which can divided by time interval
    def nearest_time_rounding(self, time):
        seconds = time.second
        # For example, find nearest value in 0, 10, 20, 30, 40, 50sec
        time_points = [time_point for time_point in range(0,60, self.time_interval)]
        nearest = min(time_points, key=lambda time_point: abs(time_point - seconds))
        
        if nearest == time_points[-1] and seconds >= (time_points[-1] + self.remove_time_in_group):
            rounded_time = time.replace(second=0, microsecond=0) + timedelta(minutes=1)
        else:
            rounded_time = time.replace(second=nearest, microsecond=0)

        return rounded_time

    # Make the same end time
    def align_end_time(self, dataframe_1, dataframe_2):
        """Trim the frame that ends later so it stops at the other's last index.

        Returns:
            ``(dataframe_1, dataframe_2)``; when both already end on the same
            index value they are returned unchanged.
        """
        end_1 = dataframe_1.index[-1]
        end_2 = dataframe_2.index[-1]

        if end_1 > end_2:
            dataframe_1 = dataframe_1[dataframe_1.index <= end_2]
        elif end_1 < end_2:
            dataframe_2 = dataframe_2[dataframe_2.index <= end_1]

        return dataframe_1, dataframe_2

    # Adjust start time and end time processing
    # start time processing -> process_type : 0 , finish time processing -> process_type : -1
    # start time processing -> process_start_time_trash_sec func , finish time processing -> process_finish_time_trash_sec func    
    def adjust_time_index(self, process_type, dataframe, func):
        remainder = func(dataframe.index[process_type])
        
        # the last data only shows one original data, so processing this problem
        one_sec = timedelta(seconds=1)

        if remainder == False:
            # change time to nearest (Start time processing)
            if process_type == 0 :
                time = self.nearest_time_rounding(dataframe.index[process_type])
                new_index = dataframe.index.tolist()
                new_index[process_type] = time
                dataframe.index = new_index
            
            # change time to nearest (Finish time processing)
            else:
                time = self.nearest_time_rounding(dataframe.index[process_type]) - one_sec
                new_index = dataframe.index.tolist()
                new_index[process_type] = time
                dataframe.index = new_index
        
        # if remainder is under remove_time_in_group, just remove
        else:
            cutting_time = timedelta(seconds=remainder)
            # Start time processing
            if process_type == 0:
                dataframe = dataframe[dataframe.index >= dataframe.index[process_type] + cutting_time]
            
            # Finish time processing
            # make the seconds like 9, 19, 29...
            else:
                dataframe = dataframe[dataframe.index <= dataframe.index[-1] - cutting_time - one_sec]

        return dataframe
    
    # Removing error values in group (brain waves and attention score)
    def check_invalid_values(self, group):
        # find error data length in brain wave
        alpha_invalid_series = group['α_wave_raw_data'].diff().eq(0)
        alpha_invalid_timestamps = group.index[alpha_invalid_series].tolist()

        # find error data length in attention_raw_data
        attention_invalid_series = group['attention_raw_data'] == 0
        attention_invalid_timestamps = group.index[attention_invalid_series].tolist()

        # check whether the length of error data is over remove_time_in_group second
        def has_long_invalid_duration(invalid_timestamps):
            if not invalid_timestamps:
                return False
            for invalid_time in range(1, len(invalid_timestamps)):
                if (invalid_timestamps[invalid_time] - invalid_timestamps[invalid_time-1]).seconds > self.remove_time_in_group:
                    return True
            return False

        alpha_invalid = has_long_invalid_duration(alpha_invalid_timestamps)
        attention_invalid = has_long_invalid_duration(attention_invalid_timestamps)
        
        # make error values to missing values
        if alpha_invalid or attention_invalid:
            return pd.Series([np.nan] * group.shape[1], index=group.columns)

        else:
            # calculate average except error value
            valid_conditions = (
                (group['α_wave_raw_data'].diff() != 0) & 
                (group['β_wave_raw_data'].diff() != 0) & 
                (group['θ_wave_raw_data'].diff() != 0) & 
                (group['δ_wave_raw_data'].diff() != 0) & 
                (group['γ_wave_raw_data'].diff() != 0) & 
                (group['attention_raw_data'] != 0)
            )
            return group[valid_conditions].mean()

    # Removing error values in group (hr)
    def check_invalid_values_other(self, group):
        # find error data length in hr
        hr_invalid_series = group['hr_raw_data'] == 0
        hr_invalid_timestamps = group.index[hr_invalid_series].tolist()

        # check whether the length of error data is over remove_time_in_group second
        def has_long_invalid_duration(invalid_timestamps):
            if not invalid_timestamps:
                return False
            for invalid_time in range(1, len(invalid_timestamps)):
                if (invalid_timestamps[invalid_time] - invalid_timestamps[invalid_time-1]).seconds > self.remove_time_in_group:
                    return True
            return False

        hr_invalid = has_long_invalid_duration(hr_invalid_timestamps)

        # make error values to missing values
        if hr_invalid:
            return pd.Series([np.nan] * group.shape[1], index=group.columns)

        else:
            # calculate average except error value
            group = group[(group['hr_raw_data'] != 0)]
            return group.mean()
    
    # Process EEG data
    def process_eeg_data(self, experiment_id):
        """Turn one row of the EEG report into per-interval averaged features.

        Args:
            experiment_id: Index label of the experiment row in ``self.EEG_report``.

        Returns:
            ``None`` when ``experiment_id`` is not in the report's index.
            Otherwise a DataFrame indexed by the floored group time with the
            columns alpha_wave, beta_wave, theta_wave, delta_wave, gamma_wave,
            attention, hrv, hr, coherence and 'SP ratio' (beta / theta).
        """
        if experiment_id not in self.EEG_report.index:
            return None

        # keep the selected experiment as a one-row dataframe
        EEG_report_sample = self.EEG_report.loc[[experiment_id],:]

        # one dataframe per raw-data column (each cell holds a bracketed list)
        cols = ['α_wave_raw_data', 'β_wave_raw_data', 'θ_wave_raw_data', 'δ_wave_raw_data', 'γ_wave_raw_data', 'attention_raw_data', 'hrv_raw_data', 'hr_raw_data', 'coherence_flag_raw_data']
        parsed_dfs = [self.parse_raw_data(EEG_report_sample, col) for col in cols]

        # two sampling intervals are derived: experiment duration divided by the
        # number of samples of the first column (α wave) and of the seventh column (hrv)
        interval_sec = self.time_difference(EEG_report_sample, 'meditation_start_time', 'meditation_finish_time') / len(parsed_dfs[0])
        interval_sec_other = self.time_difference(EEG_report_sample, 'meditation_start_time', 'meditation_finish_time') / len(parsed_dfs[6])

        # first frame: the five waves + attention; second frame: hrv, hr, coherence flag
        merged_df = parsed_dfs[0].join(parsed_dfs[1:6])
        merged_df_other = parsed_dfs[6].join(parsed_dfs[7:])

        # experiment start time
        start_time = datetime.strptime(EEG_report_sample.iloc[0]['meditation_start_time'], '%Y-%m-%d %H:%M:%S')

        # replace the positional index by timestamps: start_time + row number * interval
        # (the interval is rounded to two decimals before it becomes a timedelta)
        interval_sec, interval_sec_other = timedelta(seconds=round(interval_sec,2)), timedelta(seconds=round(interval_sec_other,2))
        merged_df['time'] = [start_time + time * interval_sec for time in range(len(merged_df))]
        merged_df_other['time'] = [start_time + time * interval_sec_other for time in range(len(merged_df_other))]
        merged_df, merged_df_other = merged_df.set_index('time'), merged_df_other.set_index('time')
        
        # initial error period: for the six wave/attention columns and for hr, count how many
        # leading samples repeat the first value, convert each count to seconds with the
        # matching interval (entry 6 is hr and uses the second interval) and keep the longest
        counts = [self.count_initial_same_values(merged_df[col]) for col in cols[:6]] + [self.count_initial_same_values(merged_df_other['hr_raw_data'])]
        initial_error_times = [counts[error] * interval_sec.total_seconds() if error != 6 else counts[error] * interval_sec_other.total_seconds() for error in range(7)]
        initial_error_time = timedelta(seconds=max(initial_error_times))

        # dataset start time: only rows strictly after the initial error period are kept,
        # then the index is rounded to whole seconds
        real_start_time = start_time + initial_error_time
        merged_df, merged_df_other = merged_df[merged_df.index > real_start_time], merged_df_other[merged_df_other.index > real_start_time]
        merged_df.index, merged_df_other.index = merged_df.index.round('S'), merged_df_other.index.round('S')

        # make the experiment end time the same in both frames
        merged_df, merged_df_other = self.align_end_time(merged_df, merged_df_other)

        # start time process -> process_type 0 with process_start_time_trash_sec,
        # finish time process -> process_type -1 with process_finish_time_trash_sec
        merged_df = self.adjust_time_index(0, merged_df, self.process_start_time_trash_sec)
        merged_df_other = self.adjust_time_index(0, merged_df_other, self.process_start_time_trash_sec)
        merged_df = self.adjust_time_index(-1, merged_df, self.process_finish_time_trash_sec)
        merged_df_other = self.adjust_time_index(-1, merged_df_other, self.process_finish_time_trash_sec)

        # grouping: rows are grouped by their index floored to time_interval_str
        grouped = merged_df.groupby(merged_df.index.floor(self.time_interval_str))
        grouped_other = merged_df_other.groupby(merged_df_other.index.floor(self.time_interval_str))

        # one averaged (or all-NaN) row per group
        result = grouped.apply(self.check_invalid_values)
        result_other = grouped_other.apply(self.check_invalid_values_other)

        # final EEG dataset: both results merged on their index, plus the β/θ ratio
        EEG_data_per_time_interval = result.merge(result_other, left_index=True, right_index=True)
        EEG_data_per_time_interval['β/θ SP'] = EEG_data_per_time_interval['β_wave_raw_data'] / EEG_data_per_time_interval['θ_wave_raw_data']
        
        # rename the raw column names to the feature names used downstream
        EEG_data_per_time_interval = EEG_data_per_time_interval.rename(columns={
            'α_wave_raw_data':'alpha_wave',
            'β_wave_raw_data':'beta_wave',
            'θ_wave_raw_data':'theta_wave',
            'δ_wave_raw_data':'delta_wave',
            'γ_wave_raw_data':'gamma_wave',
            'attention_raw_data' : 'attention',
            'hrv_raw_data' : 'hrv',
            'hr_raw_data' : 'hr',
            'coherence_flag_raw_data' : 'coherence',
            'β/θ SP' : 'SP ratio'
        })

        return EEG_data_per_time_interval

In [None]:
import datetime as dt
import matplotlib.pyplot as plt
import glob

class FitbitProcessor:
    '''
    time_interval : Unified seconds with EEG dataset (ex.10secs) 
    BM_sum_minutes
    Body Movement feature was meaningless because the experiment was conducted usually while sitting on the chair.
    So, created a new body movement feature as accumulated Body Movement value from previous time.
    And, the previous time is the BM_sum_minutes variable. (ex. 3 hours)
    '''
    def __init__(self, folder_path, time_interval, BM_sum_minutes):
        self.folder_path = folder_path
        self.BM_sum_minutes = BM_sum_minutes
        self.BM_sum_minutes_str = f'{BM_sum_minutes}T'
        self.time_interval = time_interval
        self.time_interval_str = f'{time_interval}S'
        folder_patterns = [
            "Active Zone Minutes (AZM)/*",
            "Sleep Score/*",
            "Stress Journal/CEDA*",
            "Temperature/Wrist Temperature - *"
        ]
        self.things_path = [glob.glob(f"{folder_path}/{pattern}") for pattern in folder_patterns]
        self.things_path = [item for sublist in self.things_path for item in sublist]
        self.wt_count = len(glob.glob(f"{folder_path}/Temperature/Wrist Temperature - *"))
        self.azm_count = len(glob.glob(f"{folder_path}/Active Zone Minutes (AZM)/*"))
        self.sleep_count = len(glob.glob(f"{folder_path}/Sleep Score/*"))
        self.eda_count = len(glob.glob(f"{folder_path}/Stress Journal/CEDA*"))
        # original fitbit dataset's time interval is 1 min
        self.original_interval = 60
        self.num_timestamps = self.original_interval // self.time_interval
        self.half_point = self.num_timestamps // 2
    
    def read_filtered_csv(self, path, columns):
        """Load a CSV, keep only ``columns`` and parse the first one as datetimes.

        Args:
            path: CSV source handed to ``pd.read_csv``.
            columns: Column names to keep, in output order; ``columns[0]`` is
                converted with ``pd.to_datetime``.
        """
        time_column = columns[0]
        selected = pd.read_csv(path)[columns]
        selected[time_column] = pd.to_datetime(selected[time_column])
        return selected
    
    def round_seconds(self, obj):
        if obj.second % self.time_interval == 0:
            return obj
        else:
            return obj - timedelta(seconds=obj.second % self.time_interval)
    
    def round_zero(self, datetime_obj):
        """Return ``datetime_obj`` with its seconds field set to zero."""
        return datetime_obj.replace(second=0)
    
    # Process whole fitbit data
    def process_fitbit_data(self):
        AZM_col = ['date_time', 'total_minutes']
        sleep_col = ['timestamp', 'deep_sleep_in_minutes']
        stress_col = ['timestamp', 'eda_level_real']
        temp_col = ['recorded_time', 'temperature']
        
        # merge all features
        things_col = [AZM_col] * self.azm_count + [sleep_col] * self.sleep_count + [stress_col] * self.eda_count + [temp_col] * self.wt_count
        things = [self.read_filtered_csv(path, col) for path, col in zip(self.things_path, things_col)]

        # if there's no wrist temperature
        if self.wt_count == 0:
            # if there's no eda data
            # there was no eda in two subjects' fitbit data
            if self.eda_count == 0:
                azm = self.process_azm(things[:self.azm_count])
                sleep = self.process_sleep(things[self.azm_count:self.azm_count+self.sleep_count])
                Min_Time, Max_Time = self.find_time_bounds([azm, sleep])
                
                # make final dataframe
                df = self.create_final_df([azm, sleep], Min_Time, Max_Time)
                df['timestamp'] = pd.to_datetime(df['timestamp'])
                df = df.set_index('timestamp')
                df = df.assign(temperature=np.nan)
                df = df.assign(eda=np.nan)
                
                return df
            
            else:
                azm = self.process_azm(things[:self.azm_count])
                sleep = self.process_sleep(things[self.azm_count:self.azm_count+self.sleep_count])
                eda = self.process_eda(things[self.azm_count+self.sleep_count:self.azm_count+self.sleep_count+self.eda_count])
                Min_Time, Max_Time = self.find_time_bounds([azm, sleep, eda])

                df = self.create_final_df([azm, sleep, eda], Min_Time, Max_Time)
                df['timestamp'] = pd.to_datetime(df['timestamp'])
                df = df.set_index('timestamp')
                df = df.assign(temperature=np.nan)
                return df
            
        # if there's no Active Zone Minutes data
        elif self.azm_count == 0:
            sleep = self.process_sleep(things[self.azm_count])
            eda = self.process_eda(things[self.azm_count+self.sleep_count:self.azm_count+self.sleep_count+self.eda_count])
            temp = self.process_temperature(things[self.azm_count+self.sleep_count+self.eda_count:])
            Min_Time, Max_Time = self.find_time_bounds([sleep, eda, temp])

            df = self.create_final_df([sleep, eda, temp], Min_Time, Max_Time)
            df['timestamp'] = pd.to_datetime(df['timestamp'])
            df = df.set_index('timestamp')
            df = df.assign(BM=np.nan)

            return df
        
        # if there's no sleep data
        elif self.sleep_count == 0:
            azm = self.process_azm(things[:self.azm_count])
            eda = self.process_eda(things[self.azm_count+self.sleep_count:self.azm_count+self.sleep_count+self.eda_count])
            temp = self.process_temperature(things[self.azm_count+self.sleep_count+self.eda_count:])
            Min_Time, Max_Time = self.find_time_bounds([azm, eda, temp])

            df = self.create_final_df([azm, eda, temp], Min_Time, Max_Time)
            df['timestamp'] = pd.to_datetime(df['timestamp'])
            df = df.set_index('timestamp')
            df = df.assign(sleep=np.nan)

            return df            
        
        # if there's no eda data
        elif self.eda_count == 0:
            azm = self.process_azm(things[:self.azm_count])
            sleep = self.process_sleep(things[self.azm_count:self.azm_count+self.sleep_count])
            temp = self.process_temperature(things[self.azm_count+self.sleep_count:])
            Min_Time, Max_Time = self.find_time_bounds([azm, sleep, temp])

            df = self.create_final_df([azm, sleep, temp], Min_Time, Max_Time)
            df['timestamp'] = pd.to_datetime(df['timestamp'])
            df = df.set_index('timestamp')
            df = df.assign(eda=np.nan)

            return df
            
        # if there's no error in data file
        else:
            azm = self.process_azm(things[:self.azm_count])
            sleep = self.process_sleep(things[self.azm_count:self.azm_count+self.sleep_count])
            eda = self.process_eda(things[self.azm_count+self.sleep_count:self.azm_count+self.sleep_count+self.eda_count])
            temp = self.process_temperature(things[self.azm_count+self.sleep_count+self.eda_count:])
            Min_Time, Max_Time = self.find_time_bounds([azm, sleep, eda, temp])

            df = self.create_final_df([azm, sleep, eda, temp], Min_Time, Max_Time)
            df['timestamp'] = pd.to_datetime(df['timestamp'])
            df = df.set_index('timestamp')

            return df
    
    # Process Active Zone Minutes data
    def process_azm(self, azm):

        azm = pd.concat(azm, axis=0)
        azm = azm.rename(columns={'date_time':'timestamp', 'total_minutes':'BM'})
        time_list = []
        body_movement = []

        for time in azm['timestamp']:
            start_timestamp = time - timedelta(seconds=(self.half_point * self.time_interval))

            for number in range(self.num_timestamps):
                new_timestamp = start_timestamp + timedelta(seconds = (number*self.time_interval))
                time_list.append(new_timestamp)

        for bm in azm['BM']:
            for _ in range(self.num_timestamps):
                body_movement.append(bm)

        azm_list = {'timestamp': time_list, 'BM': body_movement}
        azm_final = pd.DataFrame(azm_list)

        azm_final['timestamp'] = pd.to_datetime(azm_final['timestamp'])
        azm_final.set_index('timestamp', inplace=True)
        azm_final = azm_final.resample(self.time_interval_str).asfreq().fillna(0)
        azm_final['new_BM'] = azm_final['BM'].rolling(self.BM_sum_minutes_str, closed='right').sum()
        azm_final = azm_final.drop(['BM'], axis=1)
        azm_final = azm_final.rename(columns={'new_BM':'BM'})
        azm_final = azm_final.astype({'BM':'int'})
        azm_final.reset_index(inplace=True)
        
        return azm_final
    
    # Process sleep data (Deep sleep in minutes)
    def process_sleep(self, sleep):
        sleep = pd.concat(sleep, axis=0)
        sleep['timestamp'] = [
                self.round_zero(datetime.strptime(str(time)[0:19], '%Y-%m-%d %H:%M:%S')) for time in sleep['timestamp']
            ]
            
        sleep = sleep.rename(columns={'deep_sleep_in_minutes':'sleep'})
        return sleep
    
    # Process eda data
    def process_eda(self, eda):
        eda = pd.concat(eda, axis=0)
        eda['timestamp'] = [
            self.round_seconds(
                datetime.strptime(str(time)[0:19], '%Y-%m-%d %H:%M:%S') - dt.timedelta(hours=4)
            ) for time in eda['timestamp']]

        # EDA Bilinear Interpolation
        time_list = []
        eda_list = []

        for time in range(1, len(eda['timestamp']) - 1):
            start_timestamp = eda.iloc[time,0] - timedelta(seconds = self.half_point * self.time_interval)

            for num in range(self.num_timestamps):
                new_timestamp = start_timestamp + timedelta(seconds = (num * self.time_interval))
                time_list.append(new_timestamp)

                if num < self.half_point:
                    weight = (self.half_point - num) / self.num_timestamps
                    value = eda.iloc[time, 1] - ((eda.iloc[time, 1] - eda.iloc[time - 1, 1]) * weight)

                elif num == self.half_point:
                    value = eda.iloc[time,1]

                else:
                    weight = (num - self.half_point) / self.num_timestamps
                    value = eda.iloc[time, 1] + ((eda.iloc[time + 1, 1] - eda.iloc[time, 1]) * weight)

                eda_list.append(round(value, 2))

        eda_list = {'timestamp': time_list, 'eda': eda_list}
        eda_final = pd.DataFrame(eda_list)

        return eda_final
    
    # Process temperature data
    def process_temperature(self, temperature):
        #Temperature
        temp = pd.concat(temperature, axis=0)
        temp = temp.rename(columns={'recorded_time':'timestamp'})

        # Temperature Bilinear Interpolation
        time_list = []
        temp_list = []

        for time in range(1, len(temp['timestamp']) - 1):
            if self.half_point % 2 != 0:
                start_timestamp = temp.iloc[time,0] - timedelta(seconds = self.half_point * self.time_interval)
                for num in range(self.num_timestamps):
                    new_timestamp = start_timestamp + timedelta(seconds = (num * self.time_interval))
                    time_list.append(new_timestamp)

                    if num < self.half_point:
                        weight = (self.half_point - num) / self.num_timestamps 
                        value = temp.iloc[time, 1] - ((temp.iloc[time, 1] - temp.iloc[time - 1, 1]) * weight)

                    elif num == self.half_point:
                        value = temp.iloc[time,1]

                    else:
                        weight = (num - self.half_point) / self.num_timestamps
                        value = temp.iloc[time, 1] + ((temp.iloc[time + 1, 1] - temp.iloc[time, 1]) * weight)

                    temp_list.append(round(value, 6))

        temp_list = {'timestamp': time_list, 'temperature': temp_list}
        temp_final = pd.DataFrame(temp_list)

        return temp_final
    
    # find minimum and maximum time of whole feature
    def find_time_bounds(self, dataframes):
        """Return the earliest and latest 'timestamp' over all non-empty frames.

        Empty frames are ignored. When every frame is empty (or none is
        given), both bounds are set to the current UTC time.

        Returns:
            ``(Min_Time, Max_Time)``.
        """
        stamp_columns = [frame['timestamp'] for frame in dataframes if not frame.empty]

        if stamp_columns:
            Min_Time = min(column.min() for column in stamp_columns)
            Max_Time = max(column.max() for column in stamp_columns)
        else:
            Min_Time = pd.Timestamp.now(tz='UTC')
            Max_Time = pd.Timestamp.now(tz='UTC')

        return Min_Time, Max_Time

    # create dataframe from Min_time to Max_time
    def create_final_df(self, datasets, Min_Time, Max_Time):
        fitbit = pd.date_range(start=Min_Time, end=Max_Time, freq=self.time_interval_str, name='timestamp')
        fitbit = pd.DataFrame(fitbit)

        for dataset in datasets:
            fitbit = pd.merge(fitbit, dataset, how='outer', on='timestamp')
            
        fitbit['BM'] = fitbit['BM'].fillna(0)
        fitbit['sleep'] = fitbit['sleep'].fillna(method='ffill')

        return fitbit

In [None]:
class DataMerger(EEGProcessor, FitbitProcessor):
    '''
    Combine the per-interval EEG features with the Fitbit features of one subject.

    We've tested some hyperparameters, and "time_interval=10, eeg_remove_time_in_group=7, BM_sum_minutes=180" have shown the best R-squared score.
    
    [Tested hyperparameters]
    1. Time interval : 10, 12, 15, 20sec & eeg_remove_time_in_group : 7, 9, 12, 16sec (70~80% proportion of time interval)
    R-squared score was best when we split the dataset into 10 seconds group.
    
    2. BM (Body Movement) sum minutes : 1h, 1h 30m, 2h, 2h 30m, 3h
    R-squared score was best when we set up the BM (Body Movement) sum minutes as 3 hours.     
    '''
    
    def __init__(self, eeg_filepath, fitbit_folderpath, time_interval=10, eeg_remove_time_in_group=7, BM_sum_minutes=180):
        """Initialise both parent processors with a shared ``time_interval``.

        Args:
            eeg_filepath: Path of the EEG report CSV.
            fitbit_folderpath: Root folder of the subject's Fitbit export.
            time_interval: Group width in seconds used by both processors.
            eeg_remove_time_in_group: Error threshold in seconds for the EEG groups.
            BM_sum_minutes: Look-back window in minutes for the body-movement sum.
        """
        # Each parent takes different arguments, so both constructors are
        # called explicitly.
        EEGProcessor.__init__(self, eeg_filepath, time_interval, eeg_remove_time_in_group)
        FitbitProcessor.__init__(self, fitbit_folderpath, time_interval, BM_sum_minutes)
        
        self.eeg_filepath = eeg_filepath

    # Merge EEG and Fitbit data
    def merge_data(self):
        """Process every experiment and left-join the Fitbit features onto it.

        Returns:
            The EEG rows with the Fitbit columns joined on the time index, or
            ``None`` when no experiment produced data or the Fitbit frame is
            empty.
        """
        # EEGProcessor.__init__ already loaded the CSV into self.EEG_report, so
        # its length is used instead of reading the file a second time.
        experiment_count = len(self.EEG_report)

        # Experiments are addressed by index label, beginning with label 3.
        # NOTE(review): presumably the first three rows are trial sessions that
        # should be skipped; confirm.
        result_dfs = []
        for exp_id in range(3, experiment_count):
            processed_data = self.process_eeg_data(exp_id)
            if processed_data is not None:
                result_dfs.append(processed_data)

        # Explicit None marker replaces the former "'combined_eeg' in locals()" test.
        combined_eeg = None
        if result_dfs:
            combined_eeg = pd.concat(result_dfs)
            combined_eeg.index = pd.to_datetime(combined_eeg.index)
        
        # processing Fitbit data
        fitbit_data = self.process_fitbit_data()
        fitbit_data.index = pd.to_datetime(fitbit_data.index)

        # merging two dataframes
        if combined_eeg is None or fitbit_data.empty:
            return None
        return combined_eeg.merge(fitbit_data, left_index=True, right_index=True, how='left')

In [None]:
# Subject JM. Positional arguments are (eeg_filepath, fitbit_folderpath);
# time_interval, eeg_remove_time_in_group and BM_sum_minutes keep the
# DataMerger defaults (10, 7, 180).
merger = DataMerger(r"C:\Users\ballj\OneDrive\바탕 화면\EEG_jm.csv",
                    r"C:\Users\ballj\OneDrive\바탕 화면\Fitbit_JM")

final_jm = merger.merge_data()

# Subject YH (same arguments and defaults as above).
merger = DataMerger(r"C:\Users\ballj\OneDrive\바탕 화면\EEG_yh.csv",
                    r"C:\Users\ballj\OneDrive\바탕 화면\Fitbit_YH")

final_yh = merger.merge_data()

# Subject SJ (same arguments and defaults as above).
merger = DataMerger(r"C:\Users\ballj\OneDrive\바탕 화면\EEG_sj.csv",
                    r"C:\Users\ballj\OneDrive\바탕 화면\Fitbit_SJ")


final_sj = merger.merge_data()

# Subject SA (same arguments and defaults as above).
merger = DataMerger(r"C:\Users\ballj\OneDrive\바탕 화면\EEG_sa.csv",
                    r"C:\Users\ballj\OneDrive\바탕 화면\Fitbit_SA")


final_sa = merger.merge_data()

# Subject BS (same arguments and defaults as above).
merger = DataMerger(r"C:\Users\ballj\OneDrive\바탕 화면\EEG_bs.csv",
                    r"C:\Users\ballj\OneDrive\바탕 화면\Fitbit_BS")


final_bs = merger.merge_data()

# Subject MJ (same arguments and defaults as above). `merger` keeps referring
# to this last instance after the cell has run.
merger = DataMerger(r"C:\Users\ballj\OneDrive\바탕 화면\EEG_mj.csv",
                    r"C:\Users\ballj\OneDrive\바탕 화면\Fitbit_MJ")


final_mj = merger.merge_data()

In [None]:
# Stack the six per-subject frames and order the rows by their index.
dataset = pd.concat(
    [final_jm, final_sj, final_bs, final_yh, final_mj, final_sa]
).sort_index()
dataset

In [None]:
# NOTE: this binds a second name to the same DataFrame object; no copy is made,
# so in-place changes made through `dataset_original` are visible through
# `dataset` as well.
dataset_original = dataset

In [None]:
from sklearn.preprocessing import StandardScaler, MinMaxScaler

# Keep an untouched copy of 'coherence'; it is written back at the end of the cell.
coherence_original = dataset_original['coherence'].copy()
# Temperature -> StandardScaler, 'SP ratio' -> its own MinMaxScaler,
# every other column except 'coherence' -> MinMaxScaler
features_to_minmax = dataset_original.drop(['temperature', 'SP ratio', 'coherence'], axis=1)
feature_to_standardize = dataset_original[['temperature']]
feature_to_minmax_0_1 = dataset_original[['SP ratio']]

# NOTE(review): (0, 1) is also MinMaxScaler's documented default range, so both
# min-max scalers appear to be configured identically; confirm whether a
# different range was intended for one of them.
minmax_scaler = MinMaxScaler()
minmax_scaler_0_1 = MinMaxScaler(feature_range=(0, 1))
standard_scaler = StandardScaler()

# Each scaler is fitted on, and applied to, the full dataset.
features_to_minmax_scaled = minmax_scaler.fit_transform(features_to_minmax)
feature_to_standardize_scaled = standard_scaler.fit_transform(feature_to_standardize)
feature_to_minmax_0_1_scaled = minmax_scaler_0_1.fit_transform(feature_to_minmax_0_1)

# convert the scaled arrays back to dataframes that share the dataset's index
features_to_minmax_scaled_df = pd.DataFrame(features_to_minmax_scaled, 
                                            index=dataset_original.index, 
                                            columns=features_to_minmax.columns)
feature_to_standardize_scaled_df = pd.DataFrame(feature_to_standardize_scaled, 
                                                index=dataset_original.index, 
                                                columns=['temperature'])
feature_to_minmax_0_1_scaled_df = pd.DataFrame(feature_to_minmax_0_1_scaled, 
                                                index=dataset_original.index, 
                                                columns=['SP ratio'])

# Overwrite the dataset in place with the scaled values.
dataset_original.update(features_to_minmax_scaled_df)
dataset_original.update(feature_to_standardize_scaled_df)
dataset_original.update(feature_to_minmax_0_1_scaled_df)

# Restore the saved 'coherence' column.
dataset_original['coherence'] = coherence_original

In [None]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.impute import SimpleImputer

# Temperature Imputation (Random Forest Imputation)
# Rows whose only missing value is 'temperature'.
temperature_only_missing = (
    dataset_original['temperature'].isna()
    & dataset_original.drop(columns='temperature').notna().all(axis=1)
)
# reset_index(drop=True) discards the old index whatever its name is; the
# previous reset_index() + drop(['index']) pair only worked for an unnamed index.
temperature_na1 = dataset_original[temperature_only_missing].reset_index(drop=True)

# Rows with no missing value at all.
full_dataset1 = dataset_original.dropna().reset_index(drop=True)

dataset_temp = pd.concat([full_dataset1, temperature_na1])

In [None]:
# Impute missing values using Random Forest
regressor = RandomForestRegressor(n_estimators=100, random_state=0)

# Split the dataset into two parts: with and without missing 'temperature' values
temperature_missing = dataset_temp['temperature'].isnull()
dataset_with_temperature = dataset_temp.dropna(subset=['temperature'])
dataset_without_temperature = dataset_temp[temperature_missing]

# Train the model on the rows with no missing temperature values
regressor.fit(dataset_with_temperature.drop('temperature', axis=1), dataset_with_temperature['temperature'])

if dataset_without_temperature.empty:
    # Nothing to impute; calling predict() on zero rows would raise.
    predicted_temperatures = np.empty(0)
else:
    # Predict the missing temperature values
    predicted_temperatures = regressor.predict(dataset_without_temperature.drop('temperature', axis=1))

    # Fill in the missing values in the working dataframe
    dataset_temp.loc[temperature_missing, 'temperature'] = predicted_temperatures

dataset_temp

In [None]:
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.impute import SimpleImputer

# EDA Imputation (Random Forest Imputation)
# Rows whose only missing value is 'eda'.
eda_only_missing = (
    dataset_original['eda'].isna()
    & dataset_original.drop(columns='eda').notna().all(axis=1)
)
# reset_index(drop=True) discards the old index whatever its name is; the
# previous reset_index() + drop(['index']) pair only worked for an unnamed index.
eda_na = dataset_original[eda_only_missing].reset_index(drop=True)

dataset_eda = pd.concat([full_dataset1, eda_na])

# Impute missing values using Random Forest
regressor = RandomForestRegressor(n_estimators=100, random_state=0)

# Split the dataset into two parts: with and without missing 'eda' values
eda_missing = dataset_eda['eda'].isnull()
dataset_with_eda = dataset_eda.dropna(subset=['eda'])
dataset_without_eda = dataset_eda[eda_missing]

# Train the model on the rows with no missing eda values
regressor.fit(dataset_with_eda.drop('eda', axis=1), dataset_with_eda['eda'])

if dataset_without_eda.empty:
    # Nothing to impute; calling predict() on zero rows would raise.
    predicted_edas = np.empty(0)
else:
    # Predict the missing eda values
    predicted_edas = regressor.predict(dataset_without_eda.drop('eda', axis=1))

    # Fill in the missing values in the working dataframe
    dataset_eda.loc[eda_missing, 'eda'] = predicted_edas

dataset_eda

In [None]:
# Stack the two imputed frames, then collapse rows that appear more than once
# (both frames were built on top of full_dataset1).
imputed_frames = [dataset_temp, dataset_eda]
merged_df = pd.concat(imputed_frames).drop_duplicates()
merged_df

In [None]:
# Save the scaled-then-imputed dataset to the desktop.
scaled_imputed_csv = r'C:\Users\ballj\OneDrive\바탕 화면\1~4+5.Feature_Scaling+6.Imputation.csv'
merged_df.to_csv(scaled_imputed_csv)

# 1~4 + 6.Imputation + 5.Feature Scaling

In [None]:
import pandas as pd
import numpy as np
from datetime import datetime, timedelta

class EEGProcessor:
     
    # time_interval : length in seconds of each aggregation window, unified with the Fitbit data (e.g. 10 secs)
    # remove_time_in_group : threshold in seconds used when handling error values inside each time_interval group (e.g. 7 secs)
    
    def __init__(self, file_path, time_interval, remove_time_in_group):
        self.time_interval = time_interval
        self.remove_time_in_group = remove_time_in_group
        self.time_interval_str = f'{time_interval}S'
        self.EEG_report = pd.read_csv(file_path)

    # List to dataframe (ex. brain waves)
    def parse_raw_data(self, dataframe, col_name):
        col_str = dataframe.iloc[0][col_name]
        col_str = col_str.strip('[]')
        col_list = [float(val) for val in col_str.split(',')]  # cause list is divided by comma
        col_data = pd.DataFrame({col_name: col_list})
        return col_data

    # Experiment time calculating function
    def time_difference(self, dataframe, start_time_col, finish_time_col):
        start_time = datetime.strptime(dataframe.iloc[0][start_time_col], '%Y-%m-%d %H:%M:%S')
        finish_time = datetime.strptime(dataframe.iloc[0][finish_time_col], '%Y-%m-%d %0H:%M:%S')

        # time difference between two datatime objects
        time_difference = (finish_time - start_time).total_seconds()
        return time_difference
    
    # Comparing the experimental initial recognition error period and delete the part to be deleted
    def count_initial_same_values(self, series):
        initial_value = series.iloc[0]
        count = 0
        for value in series:
            if value == initial_value:
                count += 1
            else:
                break
        return count
    
    # Processing of values that are not exactly divided into front and back
    def process_start_time_trash_sec(self, start_time):
        # Plus 1 min and delete second in input time
        rounded_time = start_time + timedelta(minutes=1) - timedelta(seconds=start_time.second)
        time_difference = (rounded_time - start_time).total_seconds()

        remainder = time_difference % float(self.time_interval)

        # use the seconds over remove_time_in_group seconds
        if self.remove_time_in_group <= remainder:
            return False
        
        else:
            return remainder
        
    # Processing of values that are not exactly divided into front and back
    def process_finish_time_trash_sec(self, finish_time):
        # delete second in input time
        rounded_time = finish_time - timedelta(seconds=finish_time.second)
        time_difference = (finish_time - rounded_time).total_seconds()

        remainder = time_difference % float(self.time_interval)

        if self.remove_time_in_group <= remainder:
            return False
        
        else:
            return remainder

    # Rounding time to nearest time which can divided by time interval
    def nearest_time_rounding(self, time):
        seconds = time.second
        # For example, find nearest value in 0, 10, 20, 30, 40, 50sec
        time_points = [time_point for time_point in range(0,60, self.time_interval)]
        nearest = min(time_points, key=lambda time_point: abs(time_point - seconds))
        
        if nearest == time_points[-1] and seconds >= (time_points[-1] + self.remove_time_in_group):
            rounded_time = time.replace(second=0, microsecond=0) + timedelta(minutes=1)
        else:
            rounded_time = time.replace(second=nearest, microsecond=0)

        return rounded_time

    # Make the same end time
    def align_end_time(self, dataframe_1, dataframe_2):
        if dataframe_1.index[-1] > dataframe_2.index[-1]:
            dataframe_1 = dataframe_1[dataframe_1.index <= dataframe_2.index[-1]]

        elif dataframe_1.index[-1] < dataframe_2.index[-1]:
            dataframe_2 = dataframe_2[dataframe_2.index <= dataframe_1.index[-1]]

        else: 
            pass # if two dataframe's endtime is same

        return dataframe_1, dataframe_2

    # Adjust start time and end time processing
    # start time processing -> process_type : 0 , finish time processing -> process_type : -1
    # start time processing -> process_start_time_trash_sec func , finish time processing -> process_finish_time_trash_sec func    
    def adjust_time_index(self, process_type, dataframe, func):
        """Tidy the first or last timestamp of `dataframe` around a window boundary.

        Args:
            process_type: 0 to process the first index entry, -1 to process
                the last one.
            dataframe: Frame whose index holds the sample timestamps.
            func: Callable applied to the selected timestamp; it returns
                either False or a number of seconds (process_eeg_data passes
                process_start_time_trash_sec / process_finish_time_trash_sec).

        Returns:
            The adjusted frame. In the relabelling branch the index of the
            frame that was passed in is replaced in place; in the trimming
            branch a filtered frame is returned.
        """
        remainder = func(dataframe.index[process_type])
        
        # One second is subtracted on the finish side so the frame ends one
        # second before a boundary (seconds like 9, 19, 29...).
        one_sec = timedelta(seconds=1)

        # NOTE(review): `== False` is also true when func returns 0.0 (a
        # timestamp already on a boundary), so that case takes this branch
        # too — confirm this is intended.
        if remainder == False:
            # Relabel the first timestamp with its rounded value (start time processing)
            if process_type == 0 :
                time = self.nearest_time_rounding(dataframe.index[process_type])
                new_index = dataframe.index.tolist()
                new_index[process_type] = time
                dataframe.index = new_index
            
            # Relabel the last timestamp with its rounded value minus one second (finish time processing)
            else:
                time = self.nearest_time_rounding(dataframe.index[process_type]) - one_sec
                new_index = dataframe.index.tolist()
                new_index[process_type] = time
                dataframe.index = new_index
        
        # func returned a number of seconds: trim that many seconds of rows
        else:
            cutting_time = timedelta(seconds=remainder)
            # Start time processing: keep rows at or after first timestamp + remainder
            if process_type == 0:
                dataframe = dataframe[dataframe.index >= dataframe.index[process_type] + cutting_time]
            
            # Finish time processing: keep rows at or before last timestamp - remainder - 1 s
            # (makes the final seconds look like 9, 19, 29...)
            else:
                dataframe = dataframe[dataframe.index <= dataframe.index[-1] - cutting_time - one_sec]

        return dataframe
    
    # Removing error values in group (brain waves and attention score)
    def check_invalid_values(self, group):
        # find error data length in brain wave
        alpha_invalid_series = group['α_wave_raw_data'].diff().eq(0)
        alpha_invalid_timestamps = group.index[alpha_invalid_series].tolist()

        # find error data length in attention_raw_data
        attention_invalid_series = group['attention_raw_data'] == 0
        attention_invalid_timestamps = group.index[attention_invalid_series].tolist()

        # check whether the length of error data is over remove_time_in_group second
        def has_long_invalid_duration(invalid_timestamps):
            if not invalid_timestamps:
                return False
            for invalid_time in range(1, len(invalid_timestamps)):
                if (invalid_timestamps[invalid_time] - invalid_timestamps[invalid_time-1]).seconds > self.remove_time_in_group:
                    return True
            return False

        alpha_invalid = has_long_invalid_duration(alpha_invalid_timestamps)
        attention_invalid = has_long_invalid_duration(attention_invalid_timestamps)
        
        # make error values to missing values
        if alpha_invalid or attention_invalid:
            return pd.Series([np.nan] * group.shape[1], index=group.columns)

        else:
            # calculate average except error value
            valid_conditions = (
                (group['α_wave_raw_data'].diff() != 0) & 
                (group['β_wave_raw_data'].diff() != 0) & 
                (group['θ_wave_raw_data'].diff() != 0) & 
                (group['δ_wave_raw_data'].diff() != 0) & 
                (group['γ_wave_raw_data'].diff() != 0) & 
                (group['attention_raw_data'] != 0)
            )
            return group[valid_conditions].mean()

    # Removing error values in group (hr)
    def check_invalid_values_other(self, group):
        # find error data length in hr
        hr_invalid_series = group['hr_raw_data'] == 0
        hr_invalid_timestamps = group.index[hr_invalid_series].tolist()

        # check whether the length of error data is over remove_time_in_group second
        def has_long_invalid_duration(invalid_timestamps):
            if not invalid_timestamps:
                return False
            for invalid_time in range(1, len(invalid_timestamps)):
                if (invalid_timestamps[invalid_time] - invalid_timestamps[invalid_time-1]).seconds > self.remove_time_in_group:
                    return True
            return False

        hr_invalid = has_long_invalid_duration(hr_invalid_timestamps)

        # make error values to missing values
        if hr_invalid:
            return pd.Series([np.nan] * group.shape[1], index=group.columns)

        else:
            # calculate average except error value
            group = group[(group['hr_raw_data'] != 0)]
            return group.mean()
    
    # Process EEG data
    def process_eeg_data(self, experiment_id):
        """Turn one row of the EEG report into a frame of per-window averages.

        Args:
            experiment_id: Index label of the experiment row in
                self.EEG_report.

        Returns:
            None when `experiment_id` is not an index label of the report.
            Otherwise a DataFrame indexed by window start time with the
            columns alpha_wave, beta_wave, theta_wave, delta_wave,
            gamma_wave, attention, hrv, hr, coherence and 'SP ratio'
            (beta divided by theta).
        """
        if experiment_id not in self.EEG_report.index:
            return None

        # keep the selected experiment as a one-row dataframe
        EEG_report_sample = self.EEG_report.loc[[experiment_id],:]

        # one dataframe for one column
        cols = ['α_wave_raw_data', 'β_wave_raw_data', 'θ_wave_raw_data', 'δ_wave_raw_data', 'γ_wave_raw_data', 'attention_raw_data', 'hrv_raw_data', 'hr_raw_data', 'coherence_flag_raw_data']
        parsed_dfs = [self.parse_raw_data(EEG_report_sample, col) for col in cols]

        # two sampling intervals are derived: experiment duration divided by the
        # number of alpha samples, and by the number of hrv samples
        interval_sec = self.time_difference(EEG_report_sample, 'meditation_start_time', 'meditation_finish_time') / len(parsed_dfs[0])
        interval_sec_other = self.time_difference(EEG_report_sample, 'meditation_start_time', 'meditation_finish_time') / len(parsed_dfs[6])

        # make two merged dataframes: the first six columns (waves + attention)
        # and the last three (hrv, hr, coherence flag)
        merged_df = parsed_dfs[0].join(parsed_dfs[1:6])
        merged_df_other = parsed_dfs[6].join(parsed_dfs[7:])

        # experiment start time
        start_time = datetime.strptime(EEG_report_sample.iloc[0]['meditation_start_time'], '%Y-%m-%d %H:%M:%S')

        # change index to time index: start_time + position * interval
        # NOTE(review): each interval is rounded to 2 decimals before being
        # multiplied by the sample position, so the rounding error grows with
        # the position — confirm this is acceptable.
        interval_sec, interval_sec_other = timedelta(seconds=round(interval_sec,2)), timedelta(seconds=round(interval_sec_other,2))
        merged_df['time'] = [start_time + time * interval_sec for time in range(len(merged_df))]
        merged_df_other['time'] = [start_time + time * interval_sec_other for time in range(len(merged_df_other))]
        merged_df, merged_df_other = merged_df.set_index('time'), merged_df_other.set_index('time')
        
        # initial error period: for the six wave/attention columns and for hr, count
        # the leading samples equal to the first sample, convert each count to
        # seconds with the matching interval, and keep the longest
        counts = [self.count_initial_same_values(merged_df[col]) for col in cols[:6]] + [self.count_initial_same_values(merged_df_other['hr_raw_data'])]
        initial_error_times = [counts[error] * interval_sec.total_seconds() if error != 6 else counts[error] * interval_sec_other.total_seconds() for error in range(7)]
        initial_error_time = timedelta(seconds=max(initial_error_times))

        # dataset start time: keep only rows strictly after it, then round the
        # timestamps to whole seconds
        real_start_time = start_time + initial_error_time
        merged_df, merged_df_other = merged_df[merged_df.index > real_start_time], merged_df_other[merged_df_other.index > real_start_time]
        merged_df.index, merged_df_other.index = merged_df.index.round('S'), merged_df_other.index.round('S')

        # make the experiment end time same
        merged_df, merged_df_other = self.align_end_time(merged_df, merged_df_other)

        # start time process -> i : 0 , finish time process -> i : -1
        # start time process -> process_start_time_trash_sec func , finish time process -> process_finish_time_trash_sec func
        merged_df = self.adjust_time_index(0, merged_df, self.process_start_time_trash_sec)
        merged_df_other = self.adjust_time_index(0, merged_df_other, self.process_start_time_trash_sec)
        merged_df = self.adjust_time_index(-1, merged_df, self.process_finish_time_trash_sec)
        merged_df_other = self.adjust_time_index(-1, merged_df_other, self.process_finish_time_trash_sec)

        # grouping: floor every timestamp to the start of its time_interval window
        grouped = merged_df.groupby(merged_df.index.floor(self.time_interval_str))
        grouped_other = merged_df_other.groupby(merged_df_other.index.floor(self.time_interval_str))

        # one aggregated row per window (NaN row when the window is rejected)
        result = grouped.apply(self.check_invalid_values)
        result_other = grouped_other.apply(self.check_invalid_values_other)

        # final EEG dataset including β/θ SP ratio; the index-on-index merge uses
        # the default join, so only windows present in both results are kept
        EEG_data_per_time_interval = result.merge(result_other, left_index=True, right_index=True)
        EEG_data_per_time_interval['β/θ SP'] = EEG_data_per_time_interval['β_wave_raw_data'] / EEG_data_per_time_interval['θ_wave_raw_data']
        
        # shorter column names for the final dataset
        EEG_data_per_time_interval = EEG_data_per_time_interval.rename(columns={
            'α_wave_raw_data':'alpha_wave',
            'β_wave_raw_data':'beta_wave',
            'θ_wave_raw_data':'theta_wave',
            'δ_wave_raw_data':'delta_wave',
            'γ_wave_raw_data':'gamma_wave',
            'attention_raw_data' : 'attention',
            'hrv_raw_data' : 'hrv',
            'hr_raw_data' : 'hr',
            'coherence_flag_raw_data' : 'coherence',
            'β/θ SP' : 'SP ratio'
        })

        return EEG_data_per_time_interval

In [None]:
import datetime as dt
import matplotlib.pyplot as plt
import glob

class FitbitProcessor:
    '''
    time_interval : Length in seconds of each sample window, unified with the EEG dataset (e.g. 10 secs).
    BM_sum_minutes : Length in minutes of the look-back window over which Body Movement is accumulated (e.g. 180 for 3 hours).
    The raw Body Movement feature was not meaningful because the experiments were usually conducted while sitting on a chair.
    A new Body Movement feature was therefore created by accumulating the Body Movement values
    over the preceding BM_sum_minutes minutes.
    '''
    def __init__(self, folder_path, time_interval, BM_sum_minutes):
        self.folder_path = folder_path
        self.BM_sum_minutes = BM_sum_minutes
        self.BM_sum_minutes_str = f'{BM_sum_minutes}T'
        self.time_interval = time_interval
        self.time_interval_str = f'{time_interval}S'
        folder_patterns = [
            "Active Zone Minutes (AZM)/*",
            "Sleep Score/*",
            "Stress Journal/CEDA*",
            "Temperature/Wrist Temperature - *"
        ]
        self.things_path = [glob.glob(f"{folder_path}/{pattern}") for pattern in folder_patterns]
        self.things_path = [item for sublist in self.things_path for item in sublist]
        self.wt_count = len(glob.glob(f"{folder_path}/Temperature/Wrist Temperature - *"))
        self.azm_count = len(glob.glob(f"{folder_path}/Active Zone Minutes (AZM)/*"))
        self.sleep_count = len(glob.glob(f"{folder_path}/Sleep Score/*"))
        self.eda_count = len(glob.glob(f"{folder_path}/Stress Journal/CEDA*"))
        # original fitbit dataset's time interval is 1 min
        self.original_interval = 60
        self.num_timestamps = self.original_interval // self.time_interval
        self.half_point = self.num_timestamps // 2
    
    def read_filtered_csv(self, path, columns):
        name = pd.read_csv(path)
        name = name[columns]
        name[columns[0]] = pd.to_datetime(name[columns[0]])
        return name
    
    def round_seconds(self, obj):
        if obj.second % self.time_interval == 0:
            return obj
        else:
            return obj - timedelta(seconds=obj.second % self.time_interval)
    
    def round_zero(self, datetime_obj):
        datetime_obj = datetime_obj.replace(second=0)
        return datetime_obj
    
    # Process whole fitbit data
    def process_fitbit_data(self):
        AZM_col = ['date_time', 'total_minutes']
        sleep_col = ['timestamp', 'deep_sleep_in_minutes']
        stress_col = ['timestamp', 'eda_level_real']
        temp_col = ['recorded_time', 'temperature']
        
        # merge all features
        things_col = [AZM_col] * self.azm_count + [sleep_col] * self.sleep_count + [stress_col] * self.eda_count + [temp_col] * self.wt_count
        things = [self.read_filtered_csv(path, col) for path, col in zip(self.things_path, things_col)]

        # if there's no wrist temperature
        if self.wt_count == 0:
            # if there's no eda data
            # there was no eda in two subjects' fitbit data
            if self.eda_count == 0:
                azm = self.process_azm(things[:self.azm_count])
                sleep = self.process_sleep(things[self.azm_count:self.azm_count+self.sleep_count])
                Min_Time, Max_Time = self.find_time_bounds([azm, sleep])
                
                # make final dataframe
                df = self.create_final_df([azm, sleep], Min_Time, Max_Time)
                df['timestamp'] = pd.to_datetime(df['timestamp'])
                df = df.set_index('timestamp')
                df = df.assign(temperature=np.nan)
                df = df.assign(eda=np.nan)
                
                return df
            
            else:
                azm = self.process_azm(things[:self.azm_count])
                sleep = self.process_sleep(things[self.azm_count:self.azm_count+self.sleep_count])
                eda = self.process_eda(things[self.azm_count+self.sleep_count:self.azm_count+self.sleep_count+self.eda_count])
                Min_Time, Max_Time = self.find_time_bounds([azm, sleep, eda])

                df = self.create_final_df([azm, sleep, eda], Min_Time, Max_Time)
                df['timestamp'] = pd.to_datetime(df['timestamp'])
                df = df.set_index('timestamp')
                df = df.assign(temperature=np.nan)
                return df
            
        # if there's no Active Zone Minutes data
        elif self.azm_count == 0:
            sleep = self.process_sleep(things[self.azm_count])
            eda = self.process_eda(things[self.azm_count+self.sleep_count:self.azm_count+self.sleep_count+self.eda_count])
            temp = self.process_temperature(things[self.azm_count+self.sleep_count+self.eda_count:])
            Min_Time, Max_Time = self.find_time_bounds([sleep, eda, temp])

            df = self.create_final_df([sleep, eda, temp], Min_Time, Max_Time)
            df['timestamp'] = pd.to_datetime(df['timestamp'])
            df = df.set_index('timestamp')
            df = df.assign(BM=np.nan)

            return df
        
        # if there's no sleep data
        elif self.sleep_count == 0:
            azm = self.process_azm(things[:self.azm_count])
            eda = self.process_eda(things[self.azm_count+self.sleep_count:self.azm_count+self.sleep_count+self.eda_count])
            temp = self.process_temperature(things[self.azm_count+self.sleep_count+self.eda_count:])
            Min_Time, Max_Time = self.find_time_bounds([azm, eda, temp])

            df = self.create_final_df([azm, eda, temp], Min_Time, Max_Time)
            df['timestamp'] = pd.to_datetime(df['timestamp'])
            df = df.set_index('timestamp')
            df = df.assign(sleep=np.nan)

            return df            
        
        # if there's no eda data
        elif self.eda_count == 0:
            azm = self.process_azm(things[:self.azm_count])
            sleep = self.process_sleep(things[self.azm_count:self.azm_count+self.sleep_count])
            temp = self.process_temperature(things[self.azm_count+self.sleep_count:])
            Min_Time, Max_Time = self.find_time_bounds([azm, sleep, temp])

            df = self.create_final_df([azm, sleep, temp], Min_Time, Max_Time)
            df['timestamp'] = pd.to_datetime(df['timestamp'])
            df = df.set_index('timestamp')
            df = df.assign(eda=np.nan)

            return df
            
        # if there's no error in data file
        else:
            azm = self.process_azm(things[:self.azm_count])
            sleep = self.process_sleep(things[self.azm_count:self.azm_count+self.sleep_count])
            eda = self.process_eda(things[self.azm_count+self.sleep_count:self.azm_count+self.sleep_count+self.eda_count])
            temp = self.process_temperature(things[self.azm_count+self.sleep_count+self.eda_count:])
            Min_Time, Max_Time = self.find_time_bounds([azm, sleep, eda, temp])

            df = self.create_final_df([azm, sleep, eda, temp], Min_Time, Max_Time)
            df['timestamp'] = pd.to_datetime(df['timestamp'])
            df = df.set_index('timestamp')

            return df
    
    # Process Active Zone Minutes data
    def process_azm(self, azm):

        azm = pd.concat(azm, axis=0)
        azm = azm.rename(columns={'date_time':'timestamp', 'total_minutes':'BM'})
        time_list = []
        body_movement = []

        for time in azm['timestamp']:
            start_timestamp = time - timedelta(seconds=(self.half_point * self.time_interval))

            for number in range(self.num_timestamps):
                new_timestamp = start_timestamp + timedelta(seconds = (number*self.time_interval))
                time_list.append(new_timestamp)

        for bm in azm['BM']:
            for _ in range(self.num_timestamps):
                body_movement.append(bm)

        azm_list = {'timestamp': time_list, 'BM': body_movement}
        azm_final = pd.DataFrame(azm_list)

        azm_final['timestamp'] = pd.to_datetime(azm_final['timestamp'])
        azm_final.set_index('timestamp', inplace=True)
        azm_final = azm_final.resample(self.time_interval_str).asfreq().fillna(0)
        azm_final['new_BM'] = azm_final['BM'].rolling(self.BM_sum_minutes_str, closed='right').sum()
        azm_final = azm_final.drop(['BM'], axis=1)
        azm_final = azm_final.rename(columns={'new_BM':'BM'})
        azm_final = azm_final.astype({'BM':'int'})
        azm_final.reset_index(inplace=True)
        
        return azm_final
    
    # Process sleep data (Deep sleep in minutes)
    def process_sleep(self, sleep):
        sleep = pd.concat(sleep, axis=0)
        sleep['timestamp'] = [
                self.round_zero(datetime.strptime(str(time)[0:19], '%Y-%m-%d %H:%M:%S')) for time in sleep['timestamp']
            ]
            
        sleep = sleep.rename(columns={'deep_sleep_in_minutes':'sleep'})
        return sleep
    
    # Process eda data
    def process_eda(self, eda):
        eda = pd.concat(eda, axis=0)
        eda['timestamp'] = [
            self.round_seconds(
                datetime.strptime(str(time)[0:19], '%Y-%m-%d %H:%M:%S') - dt.timedelta(hours=4)
            ) for time in eda['timestamp']]

        # EDA Bilinear Interpolation
        time_list = []
        eda_list = []

        for time in range(1, len(eda['timestamp']) - 1):
            start_timestamp = eda.iloc[time,0] - timedelta(seconds = self.half_point * self.time_interval)

            for num in range(self.num_timestamps):
                new_timestamp = start_timestamp + timedelta(seconds = (num * self.time_interval))
                time_list.append(new_timestamp)

                if num < self.half_point:
                    weight = (self.half_point - num) / self.num_timestamps
                    value = eda.iloc[time, 1] - ((eda.iloc[time, 1] - eda.iloc[time - 1, 1]) * weight)

                elif num == self.half_point:
                    value = eda.iloc[time,1]

                else:
                    weight = (num - self.half_point) / self.num_timestamps
                    value = eda.iloc[time, 1] + ((eda.iloc[time + 1, 1] - eda.iloc[time, 1]) * weight)

                eda_list.append(round(value, 2))

        eda_list = {'timestamp': time_list, 'eda': eda_list}
        eda_final = pd.DataFrame(eda_list)

        return eda_final
    
    # Process temperature data
    def process_temperature(self, temperature):
        #Temperature
        temp = pd.concat(temperature, axis=0)
        temp = temp.rename(columns={'recorded_time':'timestamp'})

        # Temperature Bilinear Interpolation
        time_list = []
        temp_list = []

        for time in range(1, len(temp['timestamp']) - 1):
            if self.half_point % 2 != 0:
                start_timestamp = temp.iloc[time,0] - timedelta(seconds = self.half_point * self.time_interval)
                for num in range(self.num_timestamps):
                    new_timestamp = start_timestamp + timedelta(seconds = (num * self.time_interval))
                    time_list.append(new_timestamp)

                    if num < self.half_point:
                        weight = (self.half_point - num) / self.num_timestamps 
                        value = temp.iloc[time, 1] - ((temp.iloc[time, 1] - temp.iloc[time - 1, 1]) * weight)

                    elif num == self.half_point:
                        value = temp.iloc[time,1]

                    else:
                        weight = (num - self.half_point) / self.num_timestamps
                        value = temp.iloc[time, 1] + ((temp.iloc[time + 1, 1] - temp.iloc[time, 1]) * weight)

                    temp_list.append(round(value, 6))

        temp_list = {'timestamp': time_list, 'temperature': temp_list}
        temp_final = pd.DataFrame(temp_list)

        return temp_final
    
    # find minimum and maximum time of whole feature
    def find_time_bounds(self, dataframes):
        min_times = []
        max_times = []
        
        for df in dataframes:
            if not df.empty:
                min_times.append(df['timestamp'].min())
                max_times.append(df['timestamp'].max())
                
        if not min_times or not max_times:
            Min_Time = pd.Timestamp.now(tz='UTC')
            Max_Time = pd.Timestamp.now(tz='UTC')
        else:
            Min_Time = min(min_times)
            Max_Time = max(max_times)

        return Min_Time, Max_Time

    # create dataframe from Min_time to Max_time
    def create_final_df(self, datasets, Min_Time, Max_Time):
        fitbit = pd.date_range(start=Min_Time, end=Max_Time, freq=self.time_interval_str, name='timestamp')
        fitbit = pd.DataFrame(fitbit)

        for dataset in datasets:
            fitbit = pd.merge(fitbit, dataset, how='outer', on='timestamp')
            
        fitbit['BM'] = fitbit['BM'].fillna(0)
        fitbit['sleep'] = fitbit['sleep'].fillna(method='ffill')

        return fitbit

In [None]:
class DataMerger(EEGProcessor, FitbitProcessor):
    '''
    We've tested some hyperparameters, and "time_interval=10, eeg_remove_time_in_group=7, BM_sum_minutes=180" have shown the best R-squared score.
    
    [Tested hyperparameters]
    1. Time interval : 10, 12, 15, 20sec & eeg_remove_time_in_group : 7, 9, 12, 16sec (70~80% of the time interval)
    R-squared score was best when we split the dataset into 10 seconds group.
    
    2. BM (Body Movement) sum minutes : 1h, 1h 30m, 2h, 2h 30m, 3h
    R-squared score was best when we set up the BM (Body Movement) sum minutes as 3 hours.     
    '''
    
    def __init__(self, eeg_filepath, fitbit_folderpath, time_interval=10, eeg_remove_time_in_group=7, BM_sum_minutes=180):
        """Initialise both parent processors with a shared time_interval.

        Args:
            eeg_filepath: Path of the EEG report CSV.
            fitbit_folderpath: Root folder of the Fitbit export.
            time_interval: Window length in seconds used by both processors.
            eeg_remove_time_in_group: EEG error-handling threshold in seconds.
            BM_sum_minutes: Body-movement accumulation window in minutes.
        """
        self.eeg_filepath = eeg_filepath

        # Each parent constructor is called explicitly with its own arguments.
        EEGProcessor.__init__(
            self,
            file_path=eeg_filepath,
            time_interval=time_interval,
            remove_time_in_group=eeg_remove_time_in_group,
        )
        FitbitProcessor.__init__(
            self,
            folder_path=fitbit_folderpath,
            time_interval=time_interval,
            BM_sum_minutes=BM_sum_minutes,
        )

    # Merge EEG and Fitbit data
    def merge_data(self):
        """Process every experiment and left-join the Fitbit features onto the EEG windows.

        Returns:
            The EEG frame with the Fitbit columns joined on the time index,
            or None when no experiment produced data or the Fitbit frame is
            empty.
        """
        # The report is read here only to learn how many rows it has.
        eeg_data = pd.read_csv(self.eeg_filepath)

        # Experiments are visited from position 3 onwards
        # (NOTE(review): the reason for skipping 0-2 is not visible here).
        candidates = (self.process_eeg_data(exp_id) for exp_id in range(3, len(eeg_data)))
        result_dfs = [frame for frame in candidates if frame is not None]

        combined_eeg = None
        if result_dfs:
            combined_eeg = pd.concat(result_dfs)
            combined_eeg.index = pd.to_datetime(combined_eeg.index)

        # processing Fitbit data
        fitbit_data = self.process_fitbit_data()
        fitbit_data.index = pd.to_datetime(fitbit_data.index)

        # merging two dataframes
        if combined_eeg is None or fitbit_data.empty:
            return None
        return combined_eeg.merge(fitbit_data, left_index=True, right_index=True, how='left')

In [None]:
# (eeg_filepath, fitbit_folderpath) per subject, in processing order.
# time_interval, eeg_remove_time_in_group and BM_sum_minutes keep their defaults.
subject_sources = [
    (r"C:\Users\ballj\OneDrive\바탕 화면\EEG_jm.csv",
     r"C:\Users\ballj\OneDrive\바탕 화면\Fitbit_JM"),
    (r"C:\Users\ballj\OneDrive\바탕 화면\EEG_yh.csv",
     r"C:\Users\ballj\OneDrive\바탕 화면\Fitbit_YH"),
    (r"C:\Users\ballj\OneDrive\바탕 화면\EEG_sj.csv",
     r"C:\Users\ballj\OneDrive\바탕 화면\Fitbit_SJ"),
    (r"C:\Users\ballj\OneDrive\바탕 화면\EEG_sa.csv",
     r"C:\Users\ballj\OneDrive\바탕 화면\Fitbit_SA"),
    (r"C:\Users\ballj\OneDrive\바탕 화면\EEG_bs.csv",
     r"C:\Users\ballj\OneDrive\바탕 화면\Fitbit_BS"),
    (r"C:\Users\ballj\OneDrive\바탕 화면\EEG_mj.csv",
     r"C:\Users\ballj\OneDrive\바탕 화면\Fitbit_MJ"),
]

# Build one merger per subject and collect its merged EEG + Fitbit frame.
subject_results = []
for eeg_filepath, fitbit_folderpath in subject_sources:
    merger = DataMerger(eeg_filepath, fitbit_folderpath)
    subject_results.append(merger.merge_data())

final_jm, final_yh, final_sj, final_sa, final_bs, final_mj = subject_results

In [None]:
# Stack every subject's merged frame and order the rows by their index.
subject_frames = [final_jm, final_sj, final_bs, final_yh, final_mj, final_sa]
dataset = pd.concat(subject_frames).sort_index()
dataset

In [None]:
# Alias, not a copy: dataset_original and dataset refer to the same DataFrame
# object, so an in-place change made through one name is visible through the other.
dataset_original = dataset

In [None]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.impute import SimpleImputer

# Temperature Imputation (Random Forest Imputation)
# Rows where 'temperature' is the only missing value.
only_temperature_missing = (
    dataset_original['temperature'].isna()
    & dataset_original.drop(columns='temperature').notna().all(axis=1)
)

# reset_index(drop=True) discards the time index whatever its name is.
# The former reset_index() + drop(['index']) only worked for an unnamed index
# and raised KeyError as soon as the index carried a name.
temperature_na1 = dataset_original[only_temperature_missing].reset_index(drop=True)

# Fully observed rows: these are the training data for the imputers.
full_dataset1 = dataset_original.dropna().reset_index(drop=True)

dataset_temp = pd.concat([full_dataset1, temperature_na1])

In [None]:
# Random-forest imputation of 'temperature': fit on the rows that have a
# temperature, then predict it for the rows that do not.
temperature_missing = dataset_temp['temperature'].isnull()
dataset_with_temperature = dataset_temp[~temperature_missing]
dataset_without_temperature = dataset_temp[temperature_missing]

regressor = RandomForestRegressor(n_estimators=100, random_state=0)
regressor.fit(
    dataset_with_temperature.drop(columns='temperature'),
    dataset_with_temperature['temperature'],
)

predicted_temperatures = regressor.predict(
    dataset_without_temperature.drop(columns='temperature')
)

# The mask was computed before any value was written, so it selects exactly
# the rows that were just predicted, in the same order.
dataset_temp.loc[temperature_missing, 'temperature'] = predicted_temperatures

dataset_temp

In [None]:
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.impute import SimpleImputer

# EDA Imputation (Random Forest Imputation)
# Rows where 'eda' is the only missing value. reset_index(drop=True) discards
# the time index whatever its name is; the former reset_index() +
# drop(['index']) raised KeyError as soon as the index carried a name.
only_eda_missing = (
    dataset_original['eda'].isna()
    & dataset_original.drop(columns='eda').notna().all(axis=1)
)
eda_na = dataset_original[only_eda_missing].reset_index(drop=True)

dataset_eda = pd.concat([full_dataset1, eda_na])

# Impute missing values using Random Forest
regressor = RandomForestRegressor(n_estimators=100, random_state=0)

# Split the dataset into two parts: with and without missing 'eda' values
dataset_with_eda = dataset_eda.dropna(subset=['eda'])
dataset_without_eda = dataset_eda[dataset_eda['eda'].isnull()]

# Train the model on the rows with no missing eda values
regressor.fit(dataset_with_eda.drop('eda', axis=1), dataset_with_eda['eda'])

# Predict the missing eda values
predicted_edas = regressor.predict(dataset_without_eda.drop('eda', axis=1))

# Fill in the missing values in dataset_eda
dataset_eda.loc[dataset_eda['eda'].isnull(), 'eda'] = predicted_edas
dataset_eda

In [None]:
# Union of the two imputed frames with exact duplicate rows removed.
merged_df = pd.concat([dataset_temp, dataset_eda]).drop_duplicates()
merged_df

In [None]:
from sklearn.preprocessing import StandardScaler, MinMaxScaler

# Keep the unscaled target aside; it is written back after the updates below.
coherence_original = merged_df['coherence'].copy()

# Scaling plan:
#   temperature              -> StandardScaler
#   SP ratio                 -> MinMaxScaler with an explicit (0, 1) range
#   every other feature col  -> MinMaxScaler with default settings
features_to_minmax = merged_df.drop(columns=['temperature', 'SP ratio', 'coherence'])
feature_to_standardize = merged_df[['temperature']]
feature_to_minmax_0_1 = merged_df[['SP ratio']]

minmax_scaler = MinMaxScaler()
minmax_scaler_0_1 = MinMaxScaler(feature_range=(0, 1))
standard_scaler = StandardScaler()

features_to_minmax_scaled = minmax_scaler.fit_transform(features_to_minmax)
feature_to_standardize_scaled = standard_scaler.fit_transform(feature_to_standardize)
feature_to_minmax_0_1_scaled = minmax_scaler_0_1.fit_transform(feature_to_minmax_0_1)

# Wrap the scaled arrays in frames that share merged_df's index and column names.
features_to_minmax_scaled_df = pd.DataFrame(
    features_to_minmax_scaled, index=merged_df.index, columns=features_to_minmax.columns
)
feature_to_standardize_scaled_df = pd.DataFrame(
    feature_to_standardize_scaled, index=merged_df.index, columns=['temperature']
)
feature_to_minmax_0_1_scaled_df = pd.DataFrame(
    feature_to_minmax_0_1_scaled, index=merged_df.index, columns=['SP ratio']
)

# Overwrite the original feature values in place with their scaled versions.
for scaled_df in (
    features_to_minmax_scaled_df,
    feature_to_standardize_scaled_df,
    feature_to_minmax_0_1_scaled_df,
):
    merged_df.update(scaled_df)

merged_df['coherence'] = coherence_original

In [None]:
# Write the imputed and scaled table to a CSV file on the local desktop.
merged_df.to_csv(r'C:\Users\ballj\OneDrive\바탕 화면\1~4+6.Imputation+5.Feature_Scaling.csv')

# 1~4 + 6.Imputation + 5.Feature Scaling (SP ratio 0-1)

In [None]:
import pandas as pd
import numpy as np
from datetime import datetime, timedelta

class EEGProcessor:
     
    # time_interval : Unified seconds with Fitbit data (ex. 10secs) 
    # remove_time_in_group : the criteria of processing error values in each time_interval group (ex. 7secs)
    
    def __init__(self, file_path, time_interval, remove_time_in_group):
        self.time_interval = time_interval
        self.remove_time_in_group = remove_time_in_group
        self.time_interval_str = f'{time_interval}S'
        self.EEG_report = pd.read_csv(file_path)

    # List to dataframe (ex. brain waves)
    def parse_raw_data(self, dataframe, col_name):
        col_str = dataframe.iloc[0][col_name]
        col_str = col_str.strip('[]')
        col_list = [float(val) for val in col_str.split(',')]  # cause list is divided by comma
        col_data = pd.DataFrame({col_name: col_list})
        return col_data

    # Experiment time calculating function
    def time_difference(self, dataframe, start_time_col, finish_time_col):
        start_time = datetime.strptime(dataframe.iloc[0][start_time_col], '%Y-%m-%d %H:%M:%S')
        finish_time = datetime.strptime(dataframe.iloc[0][finish_time_col], '%Y-%m-%d %0H:%M:%S')

        # time difference between two datatime objects
        time_difference = (finish_time - start_time).total_seconds()
        return time_difference
    
    # Comparing the experimental initial recognition error period and delete the part to be deleted
    def count_initial_same_values(self, series):
        initial_value = series.iloc[0]
        count = 0
        for value in series:
            if value == initial_value:
                count += 1
            else:
                break
        return count
    
    # Processing of values that are not exactly divided into front and back
    def process_start_time_trash_sec(self, start_time):
        # Plus 1 min and delete second in input time
        rounded_time = start_time + timedelta(minutes=1) - timedelta(seconds=start_time.second)
        time_difference = (rounded_time - start_time).total_seconds()

        remainder = time_difference % float(self.time_interval)

        # use the seconds over remove_time_in_group seconds
        if self.remove_time_in_group <= remainder:
            return False
        
        else:
            return remainder
        
    # Processing of values that are not exactly divided into front and back
    def process_finish_time_trash_sec(self, finish_time):
        # delete second in input time
        rounded_time = finish_time - timedelta(seconds=finish_time.second)
        time_difference = (finish_time - rounded_time).total_seconds()

        remainder = time_difference % float(self.time_interval)

        if self.remove_time_in_group <= remainder:
            return False
        
        else:
            return remainder

    # Rounding time to nearest time which can divided by time interval
    def nearest_time_rounding(self, time):
        seconds = time.second
        # For example, find nearest value in 0, 10, 20, 30, 40, 50sec
        time_points = [time_point for time_point in range(0,60, self.time_interval)]
        nearest = min(time_points, key=lambda time_point: abs(time_point - seconds))
        
        if nearest == time_points[-1] and seconds >= (time_points[-1] + self.remove_time_in_group):
            rounded_time = time.replace(second=0, microsecond=0) + timedelta(minutes=1)
        else:
            rounded_time = time.replace(second=nearest, microsecond=0)

        return rounded_time

    # Make the same end time
    def align_end_time(self, dataframe_1, dataframe_2):
        if dataframe_1.index[-1] > dataframe_2.index[-1]:
            dataframe_1 = dataframe_1[dataframe_1.index <= dataframe_2.index[-1]]

        elif dataframe_1.index[-1] < dataframe_2.index[-1]:
            dataframe_2 = dataframe_2[dataframe_2.index <= dataframe_1.index[-1]]

        else: 
            pass # if two dataframe's endtime is same

        return dataframe_1, dataframe_2

    # Adjust start time and end time processing
    # start time processing -> process_type : 0 , finish time processing -> process_type : -1
    # start time processing -> process_start_time_trash_sec func , finish time processing -> process_finish_time_trash_sec func    
    def adjust_time_index(self, process_type, dataframe, func):
        remainder = func(dataframe.index[process_type])
        
        # the last data only shows one original data, so processing this problem
        one_sec = timedelta(seconds=1)

        if remainder == False:
            # change time to nearest (Start time processing)
            if process_type == 0 :
                time = self.nearest_time_rounding(dataframe.index[process_type])
                new_index = dataframe.index.tolist()
                new_index[process_type] = time
                dataframe.index = new_index
            
            # change time to nearest (Finish time processing)
            else:
                time = self.nearest_time_rounding(dataframe.index[process_type]) - one_sec
                new_index = dataframe.index.tolist()
                new_index[process_type] = time
                dataframe.index = new_index
        
        # if remainder is under remove_time_in_group, just remove
        else:
            cutting_time = timedelta(seconds=remainder)
            # Start time processing
            if process_type == 0:
                dataframe = dataframe[dataframe.index >= dataframe.index[process_type] + cutting_time]
            
            # Finish time processing
            # make the seconds like 9, 19, 29...
            else:
                dataframe = dataframe[dataframe.index <= dataframe.index[-1] - cutting_time - one_sec]

        return dataframe
    
    # Removing error values in group (brain waves and attention score)
    def check_invalid_values(self, group):
        # find error data length in brain wave
        alpha_invalid_series = group['α_wave_raw_data'].diff().eq(0)
        alpha_invalid_timestamps = group.index[alpha_invalid_series].tolist()

        # find error data length in attention_raw_data
        attention_invalid_series = group['attention_raw_data'] == 0
        attention_invalid_timestamps = group.index[attention_invalid_series].tolist()

        # check whether the length of error data is over remove_time_in_group second
        def has_long_invalid_duration(invalid_timestamps):
            if not invalid_timestamps:
                return False
            for invalid_time in range(1, len(invalid_timestamps)):
                if (invalid_timestamps[invalid_time] - invalid_timestamps[invalid_time-1]).seconds > self.remove_time_in_group:
                    return True
            return False

        alpha_invalid = has_long_invalid_duration(alpha_invalid_timestamps)
        attention_invalid = has_long_invalid_duration(attention_invalid_timestamps)
        
        # make error values to missing values
        if alpha_invalid or attention_invalid:
            return pd.Series([np.nan] * group.shape[1], index=group.columns)

        else:
            # calculate average except error value
            valid_conditions = (
                (group['α_wave_raw_data'].diff() != 0) & 
                (group['β_wave_raw_data'].diff() != 0) & 
                (group['θ_wave_raw_data'].diff() != 0) & 
                (group['δ_wave_raw_data'].diff() != 0) & 
                (group['γ_wave_raw_data'].diff() != 0) & 
                (group['attention_raw_data'] != 0)
            )
            return group[valid_conditions].mean()

    # Removing error values in group (hr)
    def check_invalid_values_other(self, group):
        # find error data length in hr
        hr_invalid_series = group['hr_raw_data'] == 0
        hr_invalid_timestamps = group.index[hr_invalid_series].tolist()

        # check whether the length of error data is over remove_time_in_group second
        def has_long_invalid_duration(invalid_timestamps):
            if not invalid_timestamps:
                return False
            for invalid_time in range(1, len(invalid_timestamps)):
                if (invalid_timestamps[invalid_time] - invalid_timestamps[invalid_time-1]).seconds > self.remove_time_in_group:
                    return True
            return False

        hr_invalid = has_long_invalid_duration(hr_invalid_timestamps)

        # make error values to missing values
        if hr_invalid:
            return pd.Series([np.nan] * group.shape[1], index=group.columns)

        else:
            # calculate average except error value
            group = group[(group['hr_raw_data'] != 0)]
            return group.mean()
    
    # Process EEG data
    def process_eeg_data(self, experiment_id):
        """Build the per-interval EEG feature table for one experiment.

        Args:
            experiment_id: index label of the experiment's row in
                ``self.EEG_report``.

        Returns:
            ``None`` when ``experiment_id`` is not in the report index.
            Otherwise a DataFrame indexed by the floored group time with the
            columns alpha_wave, beta_wave, theta_wave, delta_wave, gamma_wave,
            attention, hrv, hr, coherence and 'SP ratio' (beta / theta).
        """
        if experiment_id not in self.EEG_report.index:
            return None

        # one-row frame for this experiment (the double brackets keep it a DataFrame)
        EEG_report_sample = self.EEG_report.loc[[experiment_id],:]

        # one dataframe for one column
        cols = ['α_wave_raw_data', 'β_wave_raw_data', 'θ_wave_raw_data', 'δ_wave_raw_data', 'γ_wave_raw_data', 'attention_raw_data', 'hrv_raw_data', 'hr_raw_data', 'coherence_flag_raw_data']
        parsed_dfs = [self.parse_raw_data(EEG_report_sample, col) for col in cols]

        # calculate two interval seconds because there are two sampling rates in the EEG data:
        # duration / number of samples, once for the wave columns and once for the hrv/hr/coherence columns
        interval_sec = self.time_difference(EEG_report_sample, 'meditation_start_time', 'meditation_finish_time') / len(parsed_dfs[0])
        interval_sec_other = self.time_difference(EEG_report_sample, 'meditation_start_time', 'meditation_finish_time') / len(parsed_dfs[6])

        # make two merged dataframes: waves + attention, and hrv + hr + coherence flag
        merged_df = parsed_dfs[0].join(parsed_dfs[1:6])
        merged_df_other = parsed_dfs[6].join(parsed_dfs[7:])

        # experiment start time
        start_time = datetime.strptime(EEG_report_sample.iloc[0]['meditation_start_time'], '%Y-%m-%d %H:%M:%S')

        # change index to time index based on interval second (rounded to 2 decimals)
        interval_sec, interval_sec_other = timedelta(seconds=round(interval_sec,2)), timedelta(seconds=round(interval_sec_other,2))
        merged_df['time'] = [start_time + time * interval_sec for time in range(len(merged_df))]
        merged_df_other['time'] = [start_time + time * interval_sec_other for time in range(len(merged_df_other))]
        merged_df, merged_df_other = merged_df.set_index('time'), merged_df_other.set_index('time')
        
        # comparing the initial experiment error time:
        # count the leading constant run of each wave/attention column and of hr,
        # convert each count to seconds and keep the longest one
        counts = [self.count_initial_same_values(merged_df[col]) for col in cols[:6]] + [self.count_initial_same_values(merged_df_other['hr_raw_data'])]
        initial_error_times = [counts[error] * interval_sec.total_seconds() if error != 6 else counts[error] * interval_sec_other.total_seconds() for error in range(7)]
        initial_error_time = timedelta(seconds=max(initial_error_times))

        # dataset start time: rows at or before it are dropped, then the index is rounded to whole seconds
        real_start_time = start_time + initial_error_time
        merged_df, merged_df_other = merged_df[merged_df.index > real_start_time], merged_df_other[merged_df_other.index > real_start_time]
        merged_df.index, merged_df_other.index = merged_df.index.round('S'), merged_df_other.index.round('S')

        # make the experiment end time same
        merged_df, merged_df_other = self.align_end_time(merged_df, merged_df_other)

        # start time process -> i : 0 , finish time process -> i : -1
        # start time process -> process_start_time_trash_sec func , finish time process -> process_finish_time_trash_sec func
        merged_df = self.adjust_time_index(0, merged_df, self.process_start_time_trash_sec)
        merged_df_other = self.adjust_time_index(0, merged_df_other, self.process_start_time_trash_sec)
        merged_df = self.adjust_time_index(-1, merged_df, self.process_finish_time_trash_sec)
        merged_df_other = self.adjust_time_index(-1, merged_df_other, self.process_finish_time_trash_sec)

        # grouping: floor every timestamp to the start of its time_interval group
        grouped = merged_df.groupby(merged_df.index.floor(self.time_interval_str))
        grouped_other = merged_df_other.groupby(merged_df_other.index.floor(self.time_interval_str))

        # one aggregated row per group (all NaN when the group is rejected)
        result = grouped.apply(self.check_invalid_values)
        result_other = grouped_other.apply(self.check_invalid_values_other)

        # final EEG dataset including β/θ SP ratio
        EEG_data_per_time_interval = result.merge(result_other, left_index=True, right_index=True)
        EEG_data_per_time_interval['β/θ SP'] = EEG_data_per_time_interval['β_wave_raw_data'] / EEG_data_per_time_interval['θ_wave_raw_data']
        
        # rename the raw column names to the short feature names used downstream
        EEG_data_per_time_interval = EEG_data_per_time_interval.rename(columns={
            'α_wave_raw_data':'alpha_wave',
            'β_wave_raw_data':'beta_wave',
            'θ_wave_raw_data':'theta_wave',
            'δ_wave_raw_data':'delta_wave',
            'γ_wave_raw_data':'gamma_wave',
            'attention_raw_data' : 'attention',
            'hrv_raw_data' : 'hrv',
            'hr_raw_data' : 'hr',
            'coherence_flag_raw_data' : 'coherence',
            'β/θ SP' : 'SP ratio'
        })

        return EEG_data_per_time_interval

In [None]:
import datetime as dt
import matplotlib.pyplot as plt
import glob

class FitbitProcessor:
    '''
    time_interval : Unified seconds with EEG dataset (ex.10secs) 
    BM_sum_minutes
    Body Movement feature was meaningless because the experiment was conducted usually while sitting on the chair.
    So, created a new body movement feature as accumulated Body Movement value from previous time.
    And, the previous time is the BM_sum_minutes variable. (ex. 3 hours)
    '''
    def __init__(self, folder_path, time_interval, BM_sum_minutes):
        self.folder_path = folder_path
        self.BM_sum_minutes = BM_sum_minutes
        self.BM_sum_minutes_str = f'{BM_sum_minutes}T'
        self.time_interval = time_interval
        self.time_interval_str = f'{time_interval}S'
        folder_patterns = [
            "Active Zone Minutes (AZM)/*",
            "Sleep Score/*",
            "Stress Journal/CEDA*",
            "Temperature/Wrist Temperature - *"
        ]
        self.things_path = [glob.glob(f"{folder_path}/{pattern}") for pattern in folder_patterns]
        self.things_path = [item for sublist in self.things_path for item in sublist]
        self.wt_count = len(glob.glob(f"{folder_path}/Temperature/Wrist Temperature - *"))
        self.azm_count = len(glob.glob(f"{folder_path}/Active Zone Minutes (AZM)/*"))
        self.sleep_count = len(glob.glob(f"{folder_path}/Sleep Score/*"))
        self.eda_count = len(glob.glob(f"{folder_path}/Stress Journal/CEDA*"))
        # original fitbit dataset's time interval is 1 min
        self.original_interval = 60
        self.num_timestamps = self.original_interval // self.time_interval
        self.half_point = self.num_timestamps // 2
    
    def read_filtered_csv(self, path, columns):
        name = pd.read_csv(path)
        name = name[columns]
        name[columns[0]] = pd.to_datetime(name[columns[0]])
        return name
    
    def round_seconds(self, obj):
        if obj.second % self.time_interval == 0:
            return obj
        else:
            return obj - timedelta(seconds=obj.second % self.time_interval)
    
    def round_zero(self, datetime_obj):
        datetime_obj = datetime_obj.replace(second=0)
        return datetime_obj
    
    # Process whole fitbit data
    def process_fitbit_data(self):
        AZM_col = ['date_time', 'total_minutes']
        sleep_col = ['timestamp', 'deep_sleep_in_minutes']
        stress_col = ['timestamp', 'eda_level_real']
        temp_col = ['recorded_time', 'temperature']
        
        # merge all features
        things_col = [AZM_col] * self.azm_count + [sleep_col] * self.sleep_count + [stress_col] * self.eda_count + [temp_col] * self.wt_count
        things = [self.read_filtered_csv(path, col) for path, col in zip(self.things_path, things_col)]

        # if there's no wrist temperature
        if self.wt_count == 0:
            # if there's no eda data
            # there was no eda in two subjects' fitbit data
            if self.eda_count == 0:
                azm = self.process_azm(things[:self.azm_count])
                sleep = self.process_sleep(things[self.azm_count:self.azm_count+self.sleep_count])
                Min_Time, Max_Time = self.find_time_bounds([azm, sleep])
                
                # make final dataframe
                df = self.create_final_df([azm, sleep], Min_Time, Max_Time)
                df['timestamp'] = pd.to_datetime(df['timestamp'])
                df = df.set_index('timestamp')
                df = df.assign(temperature=np.nan)
                df = df.assign(eda=np.nan)
                
                return df
            
            else:
                azm = self.process_azm(things[:self.azm_count])
                sleep = self.process_sleep(things[self.azm_count:self.azm_count+self.sleep_count])
                eda = self.process_eda(things[self.azm_count+self.sleep_count:self.azm_count+self.sleep_count+self.eda_count])
                Min_Time, Max_Time = self.find_time_bounds([azm, sleep, eda])

                df = self.create_final_df([azm, sleep, eda], Min_Time, Max_Time)
                df['timestamp'] = pd.to_datetime(df['timestamp'])
                df = df.set_index('timestamp')
                df = df.assign(temperature=np.nan)
                return df
            
        # if there's no Active Zone Minutes data
        elif self.azm_count == 0:
            sleep = self.process_sleep(things[self.azm_count])
            eda = self.process_eda(things[self.azm_count+self.sleep_count:self.azm_count+self.sleep_count+self.eda_count])
            temp = self.process_temperature(things[self.azm_count+self.sleep_count+self.eda_count:])
            Min_Time, Max_Time = self.find_time_bounds([sleep, eda, temp])

            df = self.create_final_df([sleep, eda, temp], Min_Time, Max_Time)
            df['timestamp'] = pd.to_datetime(df['timestamp'])
            df = df.set_index('timestamp')
            df = df.assign(BM=np.nan)

            return df
        
        # if there's no sleep data
        elif self.sleep_count == 0:
            azm = self.process_azm(things[:self.azm_count])
            eda = self.process_eda(things[self.azm_count+self.sleep_count:self.azm_count+self.sleep_count+self.eda_count])
            temp = self.process_temperature(things[self.azm_count+self.sleep_count+self.eda_count:])
            Min_Time, Max_Time = self.find_time_bounds([azm, eda, temp])

            df = self.create_final_df([azm, eda, temp], Min_Time, Max_Time)
            df['timestamp'] = pd.to_datetime(df['timestamp'])
            df = df.set_index('timestamp')
            df = df.assign(sleep=np.nan)

            return df            
        
        # if there's no eda data
        elif self.eda_count == 0:
            azm = self.process_azm(things[:self.azm_count])
            sleep = self.process_sleep(things[self.azm_count:self.azm_count+self.sleep_count])
            temp = self.process_temperature(things[self.azm_count+self.sleep_count:])
            Min_Time, Max_Time = self.find_time_bounds([azm, sleep, temp])

            df = self.create_final_df([azm, sleep, temp], Min_Time, Max_Time)
            df['timestamp'] = pd.to_datetime(df['timestamp'])
            df = df.set_index('timestamp')
            df = df.assign(eda=np.nan)

            return df
            
        # if there's no error in data file
        else:
            azm = self.process_azm(things[:self.azm_count])
            sleep = self.process_sleep(things[self.azm_count:self.azm_count+self.sleep_count])
            eda = self.process_eda(things[self.azm_count+self.sleep_count:self.azm_count+self.sleep_count+self.eda_count])
            temp = self.process_temperature(things[self.azm_count+self.sleep_count+self.eda_count:])
            Min_Time, Max_Time = self.find_time_bounds([azm, sleep, eda, temp])

            df = self.create_final_df([azm, sleep, eda, temp], Min_Time, Max_Time)
            df['timestamp'] = pd.to_datetime(df['timestamp'])
            df = df.set_index('timestamp')

            return df
    
    # Process Active Zone Minutes data
    def process_azm(self, azm):

        azm = pd.concat(azm, axis=0)
        azm = azm.rename(columns={'date_time':'timestamp', 'total_minutes':'BM'})
        time_list = []
        body_movement = []

        for time in azm['timestamp']:
            start_timestamp = time - timedelta(seconds=(self.half_point * self.time_interval))

            for number in range(self.num_timestamps):
                new_timestamp = start_timestamp + timedelta(seconds = (number*self.time_interval))
                time_list.append(new_timestamp)

        for bm in azm['BM']:
            for _ in range(self.num_timestamps):
                body_movement.append(bm)

        azm_list = {'timestamp': time_list, 'BM': body_movement}
        azm_final = pd.DataFrame(azm_list)

        azm_final['timestamp'] = pd.to_datetime(azm_final['timestamp'])
        azm_final.set_index('timestamp', inplace=True)
        azm_final = azm_final.resample(self.time_interval_str).asfreq().fillna(0)
        azm_final['new_BM'] = azm_final['BM'].rolling(self.BM_sum_minutes_str, closed='right').sum()
        azm_final = azm_final.drop(['BM'], axis=1)
        azm_final = azm_final.rename(columns={'new_BM':'BM'})
        azm_final = azm_final.astype({'BM':'int'})
        azm_final.reset_index(inplace=True)
        
        return azm_final
    
    # Process sleep data (Deep sleep in minutes)
    def process_sleep(self, sleep):
        sleep = pd.concat(sleep, axis=0)
        sleep['timestamp'] = [
                self.round_zero(datetime.strptime(str(time)[0:19], '%Y-%m-%d %H:%M:%S')) for time in sleep['timestamp']
            ]
            
        sleep = sleep.rename(columns={'deep_sleep_in_minutes':'sleep'})
        return sleep
    
    # Process eda data
    def process_eda(self, eda):
        eda = pd.concat(eda, axis=0)
        eda['timestamp'] = [
            self.round_seconds(
                datetime.strptime(str(time)[0:19], '%Y-%m-%d %H:%M:%S') - dt.timedelta(hours=4)
            ) for time in eda['timestamp']]

        # EDA Bilinear Interpolation
        time_list = []
        eda_list = []

        for time in range(1, len(eda['timestamp']) - 1):
            start_timestamp = eda.iloc[time,0] - timedelta(seconds = self.half_point * self.time_interval)

            for num in range(self.num_timestamps):
                new_timestamp = start_timestamp + timedelta(seconds = (num * self.time_interval))
                time_list.append(new_timestamp)

                if num < self.half_point:
                    weight = (self.half_point - num) / self.num_timestamps
                    value = eda.iloc[time, 1] - ((eda.iloc[time, 1] - eda.iloc[time - 1, 1]) * weight)

                elif num == self.half_point:
                    value = eda.iloc[time,1]

                else:
                    weight = (num - self.half_point) / self.num_timestamps
                    value = eda.iloc[time, 1] + ((eda.iloc[time + 1, 1] - eda.iloc[time, 1]) * weight)

                eda_list.append(round(value, 2))

        eda_list = {'timestamp': time_list, 'eda': eda_list}
        eda_final = pd.DataFrame(eda_list)

        return eda_final
    
    # Process temperature data
    def process_temperature(self, temperature):
        #Temperature
        temp = pd.concat(temperature, axis=0)
        temp = temp.rename(columns={'recorded_time':'timestamp'})

        # Temperature Bilinear Interpolation
        time_list = []
        temp_list = []

        for time in range(1, len(temp['timestamp']) - 1):
            if self.half_point % 2 != 0:
                start_timestamp = temp.iloc[time,0] - timedelta(seconds = self.half_point * self.time_interval)
                for num in range(self.num_timestamps):
                    new_timestamp = start_timestamp + timedelta(seconds = (num * self.time_interval))
                    time_list.append(new_timestamp)

                    if num < self.half_point:
                        weight = (self.half_point - num) / self.num_timestamps 
                        value = temp.iloc[time, 1] - ((temp.iloc[time, 1] - temp.iloc[time - 1, 1]) * weight)

                    elif num == self.half_point:
                        value = temp.iloc[time,1]

                    else:
                        weight = (num - self.half_point) / self.num_timestamps
                        value = temp.iloc[time, 1] + ((temp.iloc[time + 1, 1] - temp.iloc[time, 1]) * weight)

                    temp_list.append(round(value, 6))

        temp_list = {'timestamp': time_list, 'temperature': temp_list}
        temp_final = pd.DataFrame(temp_list)

        return temp_final
    
    # find minimum and maximum time of whole feature
    def find_time_bounds(self, dataframes):
        min_times = []
        max_times = []
        
        for df in dataframes:
            if not df.empty:
                min_times.append(df['timestamp'].min())
                max_times.append(df['timestamp'].max())
                
        if not min_times or not max_times:
            Min_Time = pd.Timestamp.now(tz='UTC')
            Max_Time = pd.Timestamp.now(tz='UTC')
        else:
            Min_Time = min(min_times)
            Max_Time = max(max_times)

        return Min_Time, Max_Time

    # create dataframe from Min_time to Max_time
    def create_final_df(self, datasets, Min_Time, Max_Time):
        fitbit = pd.date_range(start=Min_Time, end=Max_Time, freq=self.time_interval_str, name='timestamp')
        fitbit = pd.DataFrame(fitbit)

        for dataset in datasets:
            fitbit = pd.merge(fitbit, dataset, how='outer', on='timestamp')
            
        fitbit['BM'] = fitbit['BM'].fillna(0)
        fitbit['sleep'] = fitbit['sleep'].fillna(method='ffill')

        return fitbit

In [None]:
class DataMerger(EEGProcessor, FitbitProcessor):
    '''
    Combine the processed EEG experiments with the processed Fitbit data.

    We've tested some hyperparameters, and "time_interval=10, eeg_remove_time_in_group=7, BM_sum_minutes=180" have shown the best R-squared score.
    
    [Tested hyperparameters]
    1. Time interval : 10, 12, 15, 20sec & eeg_remove_time_in_group : 7, 9, 12, 16sec (70~80% proportion of time interval)
    R-squared score was best when we split the dataset into 10 seconds group.
    
    2. BM (Body Movement) sum minutes : 1h, 1h 30m, 2h, 2h 30m, 3h
    R-squared score was best when we set up the BM (Body Movement) sum minutes as 3 hours.     
    '''
    
    def __init__(self, eeg_filepath, fitbit_folderpath, time_interval=10, eeg_remove_time_in_group=7, BM_sum_minutes=180):
        """Initialise both parent processors.

        Args:
            eeg_filepath: path of the EEG CSV file (forwarded to EEGProcessor).
            fitbit_folderpath: Fitbit folder (forwarded to FitbitProcessor).
            time_interval: interval length in seconds shared by both processors.
            eeg_remove_time_in_group: forwarded to EEGProcessor as
                ``remove_time_in_group``.
            BM_sum_minutes: forwarded to FitbitProcessor.
        """
        # The two parents take different arguments, so each constructor is
        # called explicitly instead of going through super().
        EEGProcessor.__init__(self, eeg_filepath, time_interval, eeg_remove_time_in_group)
        FitbitProcessor.__init__(self, fitbit_folderpath, time_interval, BM_sum_minutes)
        
        self.eeg_filepath = eeg_filepath

    # Merge EEG and Fitbit data
    def merge_data(self):
        """Left-join the Fitbit features onto the combined EEG data by index.

        Returns:
            The merged DataFrame, or None when no experiment produced EEG
            data or the Fitbit data is empty.
        """
        # processing EEG data
        eeg_data = pd.read_csv(self.eeg_filepath)
        result_dfs = []
        
        # NOTE(review): the loop starts at row 3, so rows 0-2 of the CSV are
        # never processed -- confirm this skip is intentional.
        for exp_id in range(3, len(eeg_data)):
            processed_data = self.process_eeg_data(exp_id)
            if processed_data is not None:
                result_dfs.append(processed_data)
                
        # Explicit sentinel instead of probing locals() for the name later.
        combined_eeg = None
        if result_dfs:
            combined_eeg = pd.concat(result_dfs)
            combined_eeg.index = pd.to_datetime(combined_eeg.index)
        
        # processing Fitbit data
        fitbit_data = self.process_fitbit_data()
        fitbit_data.index = pd.to_datetime(fitbit_data.index)

        # merging two dataframes
        if combined_eeg is None or fitbit_data.empty:
            return None
        return combined_eeg.merge(fitbit_data, left_index=True, right_index=True, how='left')

In [None]:
# One (eeg_filepath, fitbit_folderpath) pair per participant. The remaining
# DataMerger arguments (time_interval, eeg_remove_time_in_group,
# BM_sum_minutes) are left at their defaults.
subject_sources = [
    (r"C:\Users\ballj\OneDrive\바탕 화면\EEG_jm.csv",
     r"C:\Users\ballj\OneDrive\바탕 화면\Fitbit_JM"),
    (r"C:\Users\ballj\OneDrive\바탕 화면\EEG_yh.csv",
     r"C:\Users\ballj\OneDrive\바탕 화면\Fitbit_YH"),
    (r"C:\Users\ballj\OneDrive\바탕 화면\EEG_sj.csv",
     r"C:\Users\ballj\OneDrive\바탕 화면\Fitbit_SJ"),
    (r"C:\Users\ballj\OneDrive\바탕 화면\EEG_sa.csv",
     r"C:\Users\ballj\OneDrive\바탕 화면\Fitbit_SA"),
    (r"C:\Users\ballj\OneDrive\바탕 화면\EEG_bs.csv",
     r"C:\Users\ballj\OneDrive\바탕 화면\Fitbit_BS"),
    (r"C:\Users\ballj\OneDrive\바탕 화면\EEG_mj.csv",
     r"C:\Users\ballj\OneDrive\바탕 화면\Fitbit_MJ"),
]

# Process the participants in the listed order; `merger` ends up bound to
# the last participant's DataMerger.
subject_frames = []
for eeg_csv, fitbit_dir in subject_sources:
    merger = DataMerger(eeg_csv, fitbit_dir)
    subject_frames.append(merger.merge_data())

final_jm, final_yh, final_sj, final_sa, final_bs, final_mj = subject_frames

In [None]:
# Stack every participant's merged table and order the rows by index.
per_subject = [final_jm, final_sj, final_bs, final_yh, final_mj, final_sa]
dataset = pd.concat(per_subject).sort_index()
dataset

In [None]:
# Second name for the same DataFrame object (plain assignment, not a copy).
dataset_original = dataset

In [None]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.impute import SimpleImputer

# Temperature Imputation (Random Forest Imputation)
# Rows whose ONLY missing value is 'temperature'.
only_temperature_missing = (
    dataset_original['temperature'].isna()
    & dataset_original.drop(columns='temperature').notna().all(axis=1)
)
# drop=True discards the old index directly instead of turning it into an
# 'index' column that is dropped again (which fails when the index is named).
temperature_na1 = dataset_original[only_temperature_missing].reset_index(drop=True)

# Rows without any missing value.
full_dataset1 = dataset_original.dropna().reset_index(drop=True)

# Both parts carry their own 0..n-1 index, so labels repeat after the concat.
dataset_temp = pd.concat([full_dataset1, temperature_na1])

In [None]:
# Impute missing values using Random Forest
regressor = RandomForestRegressor(n_estimators=100, random_state=0)

# Split the dataset into two parts: with and without missing 'temperature' values
missing_temperature = dataset_temp['temperature'].isnull()
dataset_with_temperature = dataset_temp[~missing_temperature]
dataset_without_temperature = dataset_temp[missing_temperature]

# Train the model on the rows with no missing temperature values
regressor.fit(dataset_with_temperature.drop('temperature', axis=1), dataset_with_temperature['temperature'])

if dataset_without_temperature.empty:
    # Nothing to impute: predict() rejects an input with zero rows, so skip it.
    predicted_temperatures = np.empty(0)
else:
    # Predict the missing temperature values
    predicted_temperatures = regressor.predict(dataset_without_temperature.drop('temperature', axis=1))

    # Fill in the missing values; the predictions follow the mask's row order
    dataset_temp.loc[missing_temperature, 'temperature'] = predicted_temperatures

dataset_temp

In [None]:
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.impute import SimpleImputer

# EDA Imputation (Random Forest Imputation)
# Rows whose ONLY missing value is 'eda'.
only_eda_missing = (
    dataset_original['eda'].isna()
    & dataset_original.drop(columns='eda').notna().all(axis=1)
)
# drop=True discards the old index directly instead of turning it into an
# 'index' column that is dropped again (which fails when the index is named).
eda_na = dataset_original[only_eda_missing].reset_index(drop=True)

dataset_eda = pd.concat([full_dataset1, eda_na])

# Impute missing values using Random Forest
regressor = RandomForestRegressor(n_estimators=100, random_state=0)

# Split the dataset into two parts: with and without missing 'eda' values
missing_eda = dataset_eda['eda'].isnull()
dataset_with_eda = dataset_eda[~missing_eda]
dataset_without_eda = dataset_eda[missing_eda]

# Train the model on the rows with no missing eda values
regressor.fit(dataset_with_eda.drop('eda', axis=1), dataset_with_eda['eda'])

if dataset_without_eda.empty:
    # Nothing to impute: predict() rejects an input with zero rows, so skip it.
    predicted_edas = np.empty(0)
else:
    # Predict the missing eda values
    predicted_edas = regressor.predict(dataset_without_eda.drop('eda', axis=1))

    # Fill in the missing values; the predictions follow the mask's row order
    dataset_eda.loc[missing_eda, 'eda'] = predicted_edas
dataset_eda

In [None]:
# Stack the two imputed tables and keep a single copy of rows that are
# identical in every column.
imputed_tables = [dataset_temp, dataset_eda]
merged_df = pd.concat(imputed_tables).drop_duplicates()
merged_df

In [None]:
from sklearn.preprocessing import StandardScaler, MinMaxScaler

# Keep copies of the two target columns; no scaler below touches them.
attention_original = merged_df['attention'].copy()
coherence_original = merged_df['coherence'].copy()
# Temp - Standard Scaler & Else - Min-Max Scaling
features_to_minmax = merged_df.drop(['temperature', 'SP ratio', 'coherence', 'attention'], axis=1)
feature_to_standardize = merged_df[['temperature']]
feature_to_minmax_0_1 = merged_df[['SP ratio']]

minmax_scaler = MinMaxScaler()
minmax_scaler_0_1 = MinMaxScaler(feature_range=(0, 1))
standard_scaler = StandardScaler()

features_to_minmax_scaled = minmax_scaler.fit_transform(features_to_minmax)
feature_to_standardize_scaled = standard_scaler.fit_transform(feature_to_standardize)
feature_to_minmax_0_1_scaled = minmax_scaler_0_1.fit_transform(feature_to_minmax_0_1)

# convert to dataframe (kept so the scaled pieces stay available by name)
features_to_minmax_scaled_df = pd.DataFrame(features_to_minmax_scaled, 
                                            index=merged_df.index, 
                                            columns=features_to_minmax.columns)
feature_to_standardize_scaled_df = pd.DataFrame(feature_to_standardize_scaled, 
                                                index=merged_df.index, 
                                                columns=['temperature'])
feature_to_minmax_0_1_scaled_df = pd.DataFrame(feature_to_minmax_0_1_scaled, 
                                                index=merged_df.index, 
                                                columns=['SP ratio'])

# Write the scaled values back by position. The scaled arrays have exactly
# one row per row of merged_df, so no index alignment is needed; this avoids
# DataFrame.update, which aligns on index labels and is fragile when labels
# repeat, as they do when merged_df is built by concatenating re-indexed frames.
merged_df[list(features_to_minmax.columns)] = features_to_minmax_scaled
merged_df['temperature'] = feature_to_standardize_scaled[:, 0]
merged_df['SP ratio'] = feature_to_minmax_0_1_scaled[:, 0]

merged_df['coherence'] = coherence_original
merged_df['attention'] = attention_original

In [None]:
# Display merged_df for inspection (bare expression as the cell's last line).
merged_df

In [None]:
# Print the minimum and then the maximum of each column, one value per line.
for column in ('attention', 'SP ratio'):
    values = merged_df[column]
    print(values.min())
    print(values.max())

In [None]:
from sklearn.preprocessing import MinMaxScaler
# Rescale 'SP ratio' onto the value range spanned by 'attention' and store
# the result in a new 'SP_ratio_scaled' column.
attention_low = merged_df['attention'].min()
attention_high = merged_df['attention'].max()
scaler = MinMaxScaler(feature_range=(attention_low, attention_high))
merged_df['SP_ratio_scaled'] = scaler.fit_transform(merged_df[['SP ratio']])

merged_df

In [None]:
# Swap the old 'SP ratio' column for the rescaled one; the rescaled column
# keeps its own position and only takes over the name.
merged_df = (
    merged_df
    .drop(columns=['SP ratio'])
    .rename(columns={'SP_ratio_scaled': 'SP ratio'})
)
merged_df

In [None]:
# Write merged_df to a CSV file. The index is written too, since to_csv
# keeps it unless index=False is passed.
# NOTE(review): the index labels look like leftover positional numbers at this
# point -- confirm whether that column is wanted in the output.
merged_df.to_csv(r'C:\Users\ballj\OneDrive\바탕 화면\Final_dataset_after_postprocessing.csv')