In [None]:
import numpy as np
import pandas as pd
import os
import neurokit2 as nk
import cvxEDA.src.cvxEDA as cvxEDA
import scipy.io
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm import tqdm
import plotly.express as px
import plotly.figure_factory as ff
from sklearn.preprocessing import MinMaxScaler
from sklearn.manifold import TSNE
import scipy.stats as stats
import scipy.signal as signal
from scipy.interpolate import interp1d
from scipy.integrate import trapz
import warnings
# NOTE(review): this silences every warning for the rest of the notebook
# (including pandas chained-assignment and NumPy runtime warnings) — consider
# narrowing the filter to specific categories.
warnings.filterwarnings('ignore')

# Sampling frequency handed to the signal-processing calls as `sampling_rate`.
# Presumably in Hz — TODO confirm against the recording setup.
sampling_freq = 125

In [None]:
# Load the PPG table. Later cells read its 'CMA', 'Video ID', 'Participant ID',
# 'Gender', 'BPM', 'IBI' and 'PPG' columns.
raw_ppg = pd.read_csv("../Data_files/PPG.csv")
raw_ppg

In [None]:
# Keep only the rows whose 'CMA' value is one of the five labels listed below.
# `.copy()` makes `final_df` an independent frame, so adding columns to it later
# is not a chained assignment on a slice of `raw_ppg` (the resulting
# SettingWithCopyWarning is hidden by the global warnings filter).
final_df = raw_ppg[raw_ppg['CMA'].isin(['HVLA', 'LVLA', 'LVHA', 'HVHA', 'Baseline'])].copy()
final_df['CMA'].unique()

In [None]:
# Parse the integer that follows the last 'V' in each 'Video ID' string.
# `assign` builds a new frame instead of writing a column into what may be a
# slice of `raw_ppg`, which avoids the chained-assignment hazard.
final_df = final_df.assign(
    Video_ID_number=[int(part.split('V')[-1]) for part in final_df['Video ID']]
)
final_df.to_csv("Stimuli_PPG.csv", index=False)

# Only the last expression of a cell is displayed, so the preview goes last.
final_df['Video_ID_number']

In [None]:
# Inspect the filtered frame (it now carries the 'Video_ID_number' column).
final_df

In [None]:
# Number of distinct 'Participant ID' values, shown as a one-element shape tuple.
final_df['Participant ID'].unique().shape

In [None]:
def label_prep(merged_df, vads_path="VADS.csv"):
    """Attach self-report ratings and derived class labels to a feature table.

    Parameters
    ----------
    merged_df : pd.DataFrame
        Must contain 'Participant ID', 'Video_ID_number' and 'CMA'.
        The frame is modified in place and also returned.
    vads_path : str, optional
        Location of the ratings CSV. It must provide 'Participant ID',
        'Video ID', 'Valence', 'Arousal', 'Dominance' and 'significance'.
        Defaults to "VADS.csv" in the working directory.

    Returns
    -------
    pd.DataFrame
        `merged_df` with the rating columns plus 'arousal_category',
        'valence_category', 'taskwiselabel' and 'three_class_label'.

    Raises
    ------
    ValueError
        From the integer cast when a row has no Arousal/Valence rating or the
        rating falls outside [1, 5].
    """
    vads = pd.read_csv(vads_path) #getting label csv
    print("reading vad")

    # Adding the rating columns to the original data: one vectorised lookup on
    # (participant, video) instead of scanning `vads` once per row.
    rating_cols = ['Valence', 'Arousal', 'Dominance', 'significance']
    lookup = (
        vads.drop_duplicates(subset=['Participant ID', 'Video ID'], keep='first')
            .set_index(['Participant ID', 'Video ID'])[rating_cols]
    )
    keys = pd.MultiIndex.from_frame(merged_df[['Participant ID', 'Video_ID_number']])
    found = lookup.index.get_indexer(keys) >= 0
    matched = lookup.reindex(keys)
    matched.index = merged_df.index
    for col in rating_cols:
        if col in merged_df.columns:
            # Rows without a rating keep whatever value they already had.
            merged_df[col] = matched[col].where(found, merged_df[col])
        else:
            merged_df[col] = matched[col]

    print("binning...")
    # Two bins over the rating scale: [1, 3] -> 0 (low), (3, 5] -> 1 (high)
    bins = [1, 3, 5]
    labels = [0, 1]

    # Use the cut function to categorize the 'Arousal' and 'Valence' columns
    merged_df['arousal_category'] = pd.cut(merged_df['Arousal'], bins=bins, labels=labels, include_lowest=True)
    merged_df['valence_category'] = pd.cut(merged_df['Valence'], bins=bins, labels=labels, include_lowest=True)

    # Convert the categorical result to plain integers (fails on missing values)
    merged_df['arousal_category'] = merged_df['arousal_category'].astype(int)
    merged_df['valence_category'] = merged_df['valence_category'].astype(int)

    print("mapping")
    # Binary task label: the two 'HV*' conditions map to 1, everything else to 0
    mapping = {
    'Baseline': 0,
    'LVLA': 0,
    'LVHA': 0,
    'HVHA': 1,
    'HVLA': 1
    }

    # Apply the mapping to the 'CMA' column
    merged_df['taskwiselabel'] = merged_df['CMA'].map(mapping)

    # Three-class label: LVHA -> 0, HVLA -> 2, all remaining conditions -> 1
    three_class_mapping = {
    'Baseline': 1,
    'LVLA': 1,
    'LVHA': 0,
    'HVHA': 1,
    'HVLA': 2
    }

    merged_df['three_class_label'] = merged_df['CMA'].map(three_class_mapping)
    return merged_df

# ppg_data_with_labels = label_prep(final_df)
# ppg_data_with_labels.to_csv("PPG_data_with_labels.csv")

In [None]:
# Split the recordings into one frame per (participant, video) pair, keyed by
# "<Participant ID>_<Video ID>". Pairs that never occur are skipped: storing an
# empty frame would make the feature-extraction loop fail when it reads the
# first row of each segment.
df_list = {}
video_ids = final_df['Video ID'].unique()
for pi, df_pi in final_df.groupby('Participant ID', sort=False):
    for vi in video_ids:
        df_vi = df_pi[df_pi['Video ID'] == vi]
        if df_vi.empty:
            continue
        df_list[str(pi) + '_' + vi] = df_vi

In [None]:
# Keys are the "<Participant ID>_<Video ID>" tags built in the previous cell.
df_list.keys()

In [None]:
def ppg_stat(array):
    """Summarise the amplitude distribution of a signal.

    Input: any array-like (numpy array, Series, list).
    Output: a dict of scalar statistics keyed '<name>_ppg' — extrema, mean,
    standard deviation, kurtosis, skewness and the 5/25/50/75/95 % quantiles.
    """
    samples = np.array(array)
    q05, q1, median, q3, q95 = np.quantile(samples, [0.05, 0.25, 0.5, 0.75, 0.95])

    summary = {}
    summary['max_ppg'] = samples.max()
    summary['min_ppg'] = samples.min()
    summary['mean_ppg'] = samples.mean()
    summary['sd_ppg'] = samples.std()
    summary['ku_ppg'] = stats.kurtosis(samples)
    summary['sk_ppg'] = stats.skew(samples)
    summary['median_ppg'] = median
    summary['q1_ppg'] = q1
    summary['q3_ppg'] = q3
    summary['q05_ppg'] = q05
    summary['q95_ppg'] = q95
    return summary

def ppg_peaks(array, sampling_rate=125):
    """Locate pulse peaks in a signal.

    Input: any array-like (numpy array, Series, list).
    Output: numpy array with the sample indices of the detected peaks.

    A peak must be at least `sampling_rate / 3` samples away from its
    neighbours and at least half the 99th percentile of the signal high
    (the percentile is used instead of the maximum to ignore outliers).
    """
    samples = np.array(array)

    min_spacing = sampling_rate / 3
    min_height = np.quantile(samples, 0.99) / 2

    peak_idx, _props = signal.find_peaks(samples, height=min_height, distance=min_spacing)
    return peak_idx

def ppg_time(array, sampling_rate=125):
    """Time-domain heart-rate and HRV features from a PPG signal.

    Input: any array-like (numpy array, Series, list) and its sampling rate.
    Output: a dict of scalar features (see the keys of the return statement).

    Raises ValueError when fewer than two peaks are detected, because no
    peak-to-peak interval can be formed.
    """
    x = np.array(array)
    r_peaks = ppg_peaks(x, sampling_rate=sampling_rate)
    if len(r_peaks) < 2:
        raise ValueError("ppg_time needs at least two detected peaks")

    # Peak-to-peak intervals are measured in samples and converted to ms.
    # Heart rate is 60 / (interval in seconds).
    rri = 1000 * np.diff(r_peaks) / sampling_rate
    drri = np.diff(rri)  # differences between successive intervals
    hr = 1000 * 60 / rri

    n_intervals = rri.shape[0]
    n_diffs = drri.shape[0]

    meanHR = hr.mean()
    minHR = hr.min()
    maxHR = hr.max()
    sdHR = hr.std()
    # NOTE(review): despite its name this is the HR range (max - min), not a mode.
    modeHR = maxHR - minHR
    nNN = n_intervals / (x.shape[0] / sampling_rate / 60)  # intervals per minute
    meanNN = rri.mean()
    SDSD = drri.std()
    # SDNN is the standard deviation of the intervals; CVNN is SDNN normalised
    # by the mean interval (the two were previously swapped).
    SDNN = rri.std()
    CVNN = SDNN / meanNN
    # pNNxx is the share of successive differences above xx ms, so the
    # denominator is the number of differences, not the per-minute rate.
    if n_diffs:
        pNN50 = np.sum(np.abs(drri) > 50) / n_diffs * 100
        pNN20 = np.sum(np.abs(drri) > 20) / n_diffs * 100
    else:
        pNN50 = pNN20 = np.nan
    RMSSD = np.sqrt(np.mean(drri**2))
    medianNN = np.quantile(rri, 0.5)
    q20NN = np.quantile(rri, 0.2)
    q80NN = np.quantile(rri, 0.8)
    minNN = rri.min()
    maxNN = rri.max()

    # HRV triangular index: total number of intervals divided by the height of
    # the interval histogram (10 ms bins spanning mean +/- 3 SD). The height is
    # the largest bin COUNT, not the position of that bin.
    bins = np.arange(meanNN - 3*SDNN, meanNN + 3*SDNN, 10)
    triHRV = np.nan
    if bins.size >= 2:
        counts, _ = np.histogram(rri, bins=bins)
        if counts.max() > 0:
            triHRV = n_intervals / counts.max()

    return {
        'meanHR': meanHR,
        'minHR': minHR,
        'maxHR': maxHR,
        'sdHR': sdHR,
        'modeHR': modeHR,
        'nNN': nNN,
        'meanNN': meanNN,
        'SDSD': SDSD,
        'CVNN': CVNN,
        'SDNN': SDNN,
        'pNN50': pNN50,
        'pNN20': pNN20,
        'RMSSD': RMSSD,
        'medianNN': medianNN,
        'q20NN': q20NN,
        'q80NN': q80NN,
        'minNN': minNN,
        'maxNN': maxNN,
        'triHRV': triHRV
    }

def interpolate_Rpeaks(peaks, sampling_rate=125, upsample_rate=4):
    """Resample the peak-to-peak interval series onto an even time grid.

    Input: peak positions in samples (array-like).
    Output: (interpolated intervals in ms, time grid in s).

    Each interval is time-stamped by the running sum of the intervals, shifted
    so the first stamp is 0. A cubic interpolant through those points is then
    evaluated every 1/upsample_rate seconds, starting at t = 1 s and stopping
    before the last stamp.
    """
    intervals_ms = 1000 * np.diff(peaks) / sampling_rate
    stamps_s = np.cumsum(intervals_ms) / 1000
    stamps_s = stamps_s - stamps_s[0]

    cubic = interp1d(stamps_s, intervals_ms, kind='cubic')

    grid = np.arange(1, stamps_s.max(), 1/upsample_rate)
    return cubic(grid), grid

def ppg_freq(array, sampling_rate=1000, upsample_rate=4, freqband_limits=(.0, .0033,.04,.15,.4, .5)):
    """Frequency-domain HRV features from a PPG signal.

    Input: any array-like (numpy array, Series, list).
    Output: a dict with the power of the ULF/VLF/LF/HF/VHF bands, their total,
    the LF/HF ratio, the relative LF and HF power (%) and the peak frequency
    of the LF and HF bands.

    NOTE: `sampling_rate` defaults to 1000 here while the other helpers in
    this file default to 125 — pass the real rate explicitly.
    """
    # Spectral estimation needs evenly sampled data, so the interval series is
    # interpolated first; the spectral density is then estimated with Welch's
    # method.
    x = np.array(array)
    r_peaks = ppg_peaks(x, sampling_rate=sampling_rate)
    # Forward the sampling rate: the interpolation helper would otherwise fall
    # back to its own default and mis-scale the intervals.
    rri, _ = interpolate_Rpeaks(r_peaks, sampling_rate=sampling_rate, upsample_rate=upsample_rate)
    freq, power = signal.welch(x=rri, fs=upsample_rate)

    def _band_mask(low, high):
        # Frequencies in the half-open band [low, high)
        return (freq >= low) & (freq < high)

    def _band_power(mask):
        # Trapezoidal integral of the spectral density over one band
        return trapz(power[mask], freq[mask])

    def _peak_frequency(mask):
        # Frequency with maximal power; NaN when the band holds no frequency bin
        # (short recordings), where argmax would raise.
        if not mask.any():
            return np.nan
        return freq[mask][np.argmax(power[mask])]

    lim_ulf = _band_mask(freqband_limits[0], freqband_limits[1])
    lim_vlf = _band_mask(freqband_limits[1], freqband_limits[2])
    lim_lf = _band_mask(freqband_limits[2], freqband_limits[3])
    lim_hf = _band_mask(freqband_limits[3], freqband_limits[4])
    lim_vhf = _band_mask(freqband_limits[4], freqband_limits[5])

    ulf = _band_power(lim_ulf)
    vlf = _band_power(lim_vlf)
    lf = _band_power(lim_lf)
    hf = _band_power(lim_hf)
    vhf = _band_power(lim_vhf)
    totalpower = ulf + vlf + lf + hf + vhf
    # A zero HF (or LF + HF) power yields inf/NaN here rather than an exception.
    lfhf = lf / hf
    rlf = lf / (lf + hf) * 100
    rhf = hf / (lf + hf) * 100
    peaklf = _peak_frequency(lim_lf)
    peakhf = _peak_frequency(lim_hf)

    return {
    'totalpower': totalpower,
    'LF': lf,
    'HF': hf,
    'ULF': ulf,
    'VLF': vlf,
    'VHF': vhf,
    'LF/HF': lfhf,
    'rLF': rlf,
    'rHF': rhf,
    'peakLF': peaklf,
    'peakHF': peakhf
}

def apEntropy(array, m=2, r=None):
    """Approximate entropy of a series.

    Input: any array-like (numpy array, Series, list).
    Output: a float.

    m : length of the compared runs of data (window). Default 2.
    r : tolerance; two windows match when their largest element-wise
        distance is <= r. Default 0.2 * standard deviation of the series.

    Raises ValueError when the series has m samples or fewer, since no window
    of length m + 1 exists.
    """
    x = np.array(array)
    N = len(x)
    if N <= m:
        raise ValueError("apEntropy needs more than m samples")
    # Only fall back to the default when no tolerance was supplied (the former
    # `r == r` replaced every user-supplied tolerance by True).
    if r is None:
        r = 0.2 * x.std()

    # For a window length k, the windows z(i) = x(i), ..., x(i+k-1) are formed
    # for i = 1 .. N-k+1. C(i) is the fraction of windows within distance r of
    # z(i) (Chebyshev distance), and phi_k = mean(log C(i)).
    # ApEn = |phi_{m+1} - phi_m|.
    def _phi(k):
        n_windows = N - k + 1
        z = np.array([x[i:i + k] for i in range(n_windows)])
        C = [np.sum(np.max(np.abs(z - zi), axis=1) <= r) / n_windows for zi in z]
        return np.sum(np.log(C)) / n_windows

    apEn = abs(_phi(m + 1) - _phi(m))
    return apEn
    
    
def sampEntropy(array, m=2, r=None):
    """Sample entropy of a series.

    Input: any array-like (numpy array, Series, list).
    Output: a float, -log(A / B).

    m : embedding dimension. Default 2.
    r : tolerance; two templates match when their largest element-wise
        distance is <= r. Default 0.2 * standard deviation of the series.

    B counts matches between templates of length m, A between templates of
    length m + 1 (self-matches excluded). The result is inf/NaN when A or B
    is zero.
    """
    x = np.array(array)
    N = len(x)
    # Only fall back to the default when no tolerance was supplied (the former
    # `r == r` replaced every user-supplied tolerance by True).
    if r is None:
        r = 0.2 * x.std()

    # Length-m templates: the first N-m are compared against all N-m+1.
    probes_m = np.array([x[i : i + m] for i in range(N - m)])
    templates_m = np.array([x[i : i + m] for i in range(N - m + 1)])
    B = np.sum([np.sum(np.abs(probe - templates_m).max(axis=1) <= r) - 1 for probe in probes_m])

    # Length-(m+1) templates, compared among themselves.
    templates_m1 = np.array([x[i : i + m + 1] for i in range(N - m)])
    A = np.sum([np.sum(np.abs(probe - templates_m1).max(axis=1) <= r) - 1 for probe in templates_m1])

    return -np.log(A / B)
    
    
def ppg_nonlinear(array, sampling_rate=125, m=2, r=None):
    """Non-linear HRV features from a PPG signal.

    Input: any array-like (numpy array, Series, list).
    Output: a dict with 'SD1', 'SD2', 'SD1SD2', 'apEn' and 'sampEn'.

    m, r : window length and tolerance forwarded to both entropy estimators.

    In the Poincare plot each interval is plotted against the previous one.
    SD1 is the spread orthogonal to the identity line (ellipse width) and SD2
    the spread along it (ellipse length).
    """
    x = np.array(array)
    r_peaks = ppg_peaks(x, sampling_rate=sampling_rate)
    # Intervals in ms, the same unit the time-domain helper in this file uses.
    rri = 1000 * np.diff(r_peaks) / sampling_rate
    rr1 = rri[:-1]
    rr2 = rri[1:]
    SD1 = np.std(rr2 - rr1) / np.sqrt(2)
    SD2 = np.std(rr2 + rr1) / np.sqrt(2)
    SD1SD2 = SD1/SD2
    # Entropy is computed on the interval series: the peak positions are a
    # monotonically increasing index sequence and carry no variability
    # information. The caller's m and r are honoured instead of being dropped.
    apEn = apEntropy(rri, m=m, r=r)
    sampEn = sampEntropy(rri, m=m, r=r)

    return {
    'SD1': SD1,
    'SD2': SD2,
    'SD1SD2': SD1SD2,
    'apEn': apEn,
    'sampEn': sampEn
}
    

In [None]:
sampling_freq = 125
out_df = pd.DataFrame()
all_participants_stats = []

# Columns whose first value describes the whole (participant, video) segment.
meta_columns = ['Participant ID', 'Video ID', 'Gender', 'CMA', 'Video_ID_number', 'BPM', 'IBI']

for tag, segment in tqdm(df_list.items()):
    # A pair without any samples has no metadata row and no signal to analyse.
    if segment.empty:
        continue

    row_dict = {col: segment[col].iloc[0] for col in meta_columns}

    ppgdata = segment['PPG'].to_numpy()

    # `ppg_process` cleans the raw signal itself using the sampling rate it is
    # given. The former separate `nk.ppg_clean(ppgdata)` call ran with the
    # library's default rate instead of `sampling_freq`, and its output was
    # then filtered a second time.
    ppg_signals, info = nk.ppg_process(ppgdata, sampling_rate=sampling_freq)
    analyze_df = nk.ppg_analyze(ppg_signals, sampling_rate=sampling_freq)
    row_dict.update(analyze_df.iloc[0].to_dict())

    # One output row per segment: metadata followed by the computed features
    all_participants_stats.append(row_dict)

df_all_stats = pd.DataFrame(all_participants_stats)
df_all_stats


In [None]:
# Treat infinite feature values as missing, then list every column that has
# at least one missing value.
df_all_stats = df_all_stats.replace([np.inf, -np.inf], np.nan)
has_missing = df_all_stats.isna().any(axis=0)
columns_with_nan = df_all_stats.columns[has_missing].tolist()
columns_with_nan

In [None]:
columns_to_drop = ['HRV_SDANN1',
 'HRV_SDNNI1',
 'HRV_SDANN2',
 'HRV_SDNNI2',
 'HRV_SDANN5',
 'HRV_SDNNI5',
 'HRV_ULF',
 'HRV_VLF',
 'HRV_DFA_alpha2',
 'HRV_MFDFA_alpha2_Width',
 'HRV_MFDFA_alpha2_Peak',
 'HRV_MFDFA_alpha2_Mean',
 'HRV_MFDFA_alpha2_Max',
 'HRV_MFDFA_alpha2_Delta',
 'HRV_MFDFA_alpha2_Asymmetry',
 'HRV_MFDFA_alpha2_Fluctuation',
 'HRV_MFDFA_alpha2_Increment', 'HRV_SampEn']

# errors='ignore' keeps the cell re-runnable: on a second run the columns are
# already gone, and a feature that was never produced is simply skipped.
df_all_stats = df_all_stats.drop(columns=columns_to_drop, errors='ignore')

# Columns that still contain missing values after the drop
columns_with_nan = df_all_stats.columns[df_all_stats.isna().any()].tolist()
columns_with_nan

In [None]:
# Inspect the remaining column names.
df_all_stats.columns

In [None]:
# fet_ppg = df_all_stats[['BPM',
#        'IBI', 'max_ppg', 'min_ppg', 'mean_ppg', 'sd_ppg', 'ku_ppg', 'sk_ppg',
#        'median_ppg', 'q1_ppg', 'q3_ppg', 'q05_ppg', 'q95_ppg', 'meanHR',
#        'minHR', 'maxHR', 'sdHR', 'modeHR', 'nNN', 'meanNN', 'SDSD', 'CVNN',
#        'SDNN', 'pNN50', 'pNN20', 'RMSSD', 'medianNN', 'q20NN', 'q80NN',
#        'minNN', 'maxNN', 'triHRV', 'totalpower', 'LF', 'HF', 'ULF', 'VLF',
#        'VHF', 'LF/HF', 'rLF', 'rHF', 'peakLF', 'peakHF', 'SD1', 'SD2',
#        'SD1SD2', 'apEn', 'sampEn']]

# Numeric feature columns fed to scaling and correlation analysis.
fet_ppg = df_all_stats.loc[:, [
    'BPM', 'IBI', 'PPG_Rate_Mean',
    'HRV_MeanNN', 'HRV_SDNN', 'HRV_RMSSD', 'HRV_SDSD', 'HRV_CVNN', 'HRV_CVSD',
    'HRV_MedianNN', 'HRV_MadNN', 'HRV_MCVNN', 'HRV_IQRNN', 'HRV_SDRMSSD',
    'HRV_Prc20NN', 'HRV_Prc80NN', 'HRV_pNN50', 'HRV_pNN20',
    'HRV_MinNN', 'HRV_MaxNN', 'HRV_HTI', 'HRV_TINN',
    'HRV_LF', 'HRV_HF', 'HRV_VHF', 'HRV_TP', 'HRV_LFHF',
    'HRV_LFn', 'HRV_HFn', 'HRV_LnHF',
    'HRV_SD1', 'HRV_SD2', 'HRV_SD1SD2', 'HRV_S',
    'HRV_CSI', 'HRV_CVI', 'HRV_CSI_Modified',
    'HRV_PIP', 'HRV_IALS', 'HRV_PSS', 'HRV_PAS',
    'HRV_GI', 'HRV_SI', 'HRV_AI', 'HRV_PI',
    'HRV_C1d', 'HRV_C1a', 'HRV_SD1d', 'HRV_SD1a',
    'HRV_C2d', 'HRV_C2a', 'HRV_SD2d', 'HRV_SD2a',
    'HRV_Cd', 'HRV_Ca', 'HRV_SDNNd', 'HRV_SDNNa',
    'HRV_DFA_alpha1',
    'HRV_MFDFA_alpha1_Width', 'HRV_MFDFA_alpha1_Peak', 'HRV_MFDFA_alpha1_Mean',
    'HRV_MFDFA_alpha1_Max', 'HRV_MFDFA_alpha1_Delta',
    'HRV_MFDFA_alpha1_Asymmetry', 'HRV_MFDFA_alpha1_Fluctuation',
    'HRV_MFDFA_alpha1_Increment',
    'HRV_ApEn', 'HRV_ShanEn', 'HRV_FuzzyEn', 'HRV_MSEn', 'HRV_CMSEn', 'HRV_RCMSEn',
    'HRV_CD', 'HRV_HFD', 'HRV_KFD', 'HRV_LZC',
]]

# Identifier / condition columns kept aside and re-attached after scaling.
ppg_label = df_all_stats.loc[:, ['Participant ID', 'Video ID', 'Gender', 'CMA', 'Video_ID_number']]

In [None]:
# Min-max scale every feature column. The scaled array is wrapped directly in
# a frame that reuses the original columns and index, instead of first
# allocating an empty (object-dtype) frame and filling it afterwards, so the
# result is numeric by construction.
scaler_eda = MinMaxScaler()
fet_ppg_scaled = pd.DataFrame(
    scaler_eda.fit_transform(fet_ppg),
    columns=fet_ppg.columns,
    index=fet_ppg.index,
)
fet_ppg_scaled

In [None]:
# Calculate the correlation matrix
correlation_matrix = fet_ppg_scaled.corr()

# Identify redundant features greedily: walk the columns in order and keep a
# feature only if it is not strongly correlated with one that is already kept.
# This retains one representative per correlated group (marking every member
# of a correlated pair as redundant would discard the whole group). The
# absolute value is used so that strong negative correlation also counts.
corr_threshold = 0.90
abs_correlation = correlation_matrix.abs()
selected_features = []
redundant_features = []
for feature in correlation_matrix.columns:
    if (abs_correlation.loc[feature, selected_features] > corr_threshold).any():
        redundant_features.append(feature)
    else:
        selected_features.append(feature)

print(redundant_features)

# Create a subset of the correlation matrix for selected features
reduced_correlation_matrix = correlation_matrix.loc[selected_features, selected_features]

# Create a correlation heatmap using Seaborn
plt.figure(figsize=(10, 10))  # Adjust the figure size as needed
sns.set(font_scale=1)
sns.heatmap(reduced_correlation_matrix, annot=True, cmap='viridis', cbar=True, square=True,
            fmt=".2f", linewidths=0.5)
plt.title('Correlation Heatmap for PPG Features')
plt.show()

print(reduced_correlation_matrix.columns)

In [None]:
# ppg_final_fet = fet_ppg_scaled[['BPM', 'IBI', 'max_ppg', 'min_ppg', 'mean_ppg', 'sk_ppg', 'median_ppg',
#        'meanHR', 'minHR', 'maxHR', 'sdHR', 'modeHR', 'nNN', 'SDNN', 'pNN50',
#        'pNN20', 'medianNN', 'q20NN', 'q80NN', 'minNN', 'maxNN', 'triHRV', 'LF',
#        'HF', 'ULF', 'VHF', 'LF/HF', 'rLF', 'rHF', 'peakLF', 'peakHF', 'SD1SD2',
#        'apEn']]

# Hand-picked subset of the scaled features used for labelling and t-SNE.
ppg_final_fet = fet_ppg_scaled.loc[:, [
    'BPM', 'IBI', 'PPG_Rate_Mean',
    'HRV_MedianNN', 'HRV_Prc20NN', 'HRV_MinNN', 'HRV_HTI', 'HRV_TINN',
    'HRV_LF', 'HRV_VHF', 'HRV_LFn', 'HRV_HFn', 'HRV_LnHF',
    'HRV_SD1SD2', 'HRV_CVI', 'HRV_PSS', 'HRV_PAS', 'HRV_PI',
    'HRV_C1d', 'HRV_C1a',
    'HRV_DFA_alpha1',
    'HRV_MFDFA_alpha1_Width', 'HRV_MFDFA_alpha1_Peak', 'HRV_MFDFA_alpha1_Mean',
    'HRV_MFDFA_alpha1_Max', 'HRV_MFDFA_alpha1_Delta', 'HRV_MFDFA_alpha1_Asymmetry',
    'HRV_ApEn', 'HRV_ShanEn', 'HRV_FuzzyEn', 'HRV_MSEn', 'HRV_CMSEn', 'HRV_RCMSEn',
    'HRV_CD', 'HRV_HFD', 'HRV_KFD', 'HRV_LZC',
]]
ppg_final_fet

In [None]:
# Put the selected scaled features and the identifier/condition columns side
# by side; the two frames are aligned on their index.
merged_df = pd.concat([ppg_final_fet, ppg_label], axis=1)
merged_df

In [None]:
# Load the ratings table; the next cell reads its 'Participant ID',
# 'Video ID', 'Valence', 'Arousal', 'Dominance' and 'significance' columns.
vads = pd.read_csv("../Data_files/VADS.csv")
vads

In [None]:
# Copy the ratings of the first matching (participant, video) row of `vads`
# into each row of `merged_df`; rows without a match are left untouched.
rating_columns = ('Valence', 'Arousal', 'Dominance', 'significance')

for row_label, record in tqdm(merged_df.iterrows()):
    same_participant = vads['Participant ID'] == record['Participant ID']
    same_video = vads['Video ID'] == record['Video_ID_number']
    hits = vads[same_participant & same_video]

    if hits.empty:
        continue

    for rating in rating_columns:
        merged_df.at[row_label, rating] = hits[rating].iloc[0]

merged_df

In [None]:
# Two bins over the rating scale: [1, 3] -> 0 (low), (3, 5] -> 1 (high)
bins = [1, 3, 5]
labels = [0, 1]

# Target column -> rating column it is derived from
category_sources = {'arousal_category': 'Arousal', 'valence_category': 'Valence'}

# First bin both ratings ...
for target, source in category_sources.items():
    merged_df[target] = pd.cut(merged_df[source], bins=bins, labels=labels, include_lowest=True)

# ... then turn the categorical results into plain integers
for target in category_sources:
    merged_df[target] = merged_df[target].astype(int)

merged_df

In [None]:
# Binary encoding of the condition: 'Baseline', 'LVLA' and 'HVLA' -> 0,
# 'LVHA' and 'HVHA' -> 1
mapping = {
    'Baseline': 0,
    'LVLA': 0,
    'HVLA': 0,
    'LVHA': 1,
    'HVHA': 1,
}
merged_df['CMA_numeric'] = merged_df['CMA'].map(mapping)

# 0 for 'Baseline' and 'HVLA', 1 for every other condition
is_baseline_or_hvla = merged_df['CMA'].isin(['Baseline', 'HVLA'])
merged_df['task_valence'] = (~is_baseline_or_hvla).astype('int64')
merged_df

In [None]:
def plot_tsne(final_fet, valence_col, arousal_col, stress_col, valence, arousal, task_valence):
    """Embed the features with t-SNE and draw one scatter plot per label set.

    final_fet : feature matrix passed to `TSNE.fit_transform`.
    valence_col, arousal_col, stress_col : one label per row of `final_fet`,
        used to colour the first, second and third plot respectively.
    valence, arousal, task_valence : names shown in the title of the first,
        second and third plot respectively.

    The embedding is computed once (2 components, random_state=42) and shared
    by the three figures. Returns None; the figures are shown as a side effect.
    """
    # Apply t-SNE to reduce dimensions
    tsne = TSNE(n_components=2, random_state=42)
    tsne_results = tsne.fit_transform(final_fet)

    # Create a DataFrame with t-SNE results
    tsne_df = pd.DataFrame(tsne_results, columns=['t-SNE1', 't-SNE2'])

    colourings = (
        (valence_col, valence),
        (arousal_col, arousal),
        (stress_col, task_valence),
    )
    for label_values, label_name in colourings:
        # Labels are taken by position. Assigning a Series into `tsne_df` would
        # align on index labels and yield NaN wherever the caller's index
        # differs from the embedding's 0..n-1 index.
        tsne_df['Label'] = pd.Series(label_values).to_numpy()

        # Plotting the t-SNE results with colors based on labels
        plt.figure(figsize=(8, 6))
        scatter = plt.scatter(tsne_df['t-SNE1'], tsne_df['t-SNE2'], c=tsne_df['Label'], cmap='viridis')
        plt.title(f't-SNE Plot of Features Colored by {label_name}')
        plt.xlabel('t-SNE1')
        plt.ylabel('t-SNE2')
        plt.colorbar(scatter, label='Label')
        plt.show()

In [None]:
# Run the full labelling helper on the merged table. `label_prep` writes its
# columns into `merged_df` itself and returns that same frame.
# NOTE(review): by default `label_prep` reads "VADS.csv" from the working
# directory, whereas this notebook loaded "../Data_files/VADS.csv" earlier —
# confirm both point at the same ratings.
PPG_data_with_labels = label_prep(merged_df)
# NOTE(review): written without index=False, so the row index becomes an extra
# CSV column (unlike the "Stimuli_PPG.csv" export above).
PPG_data_with_labels.to_csv("../Data_files/PPG_labels.csv")
PPG_data_with_labels

In [None]:
# t-SNE of the selected features, coloured in turn by the valence category,
# the arousal category and the 'task_valence' column.
# NOTE(review): the third plot is titled 'CMA' although it is coloured by
# 'task_valence' — confirm which label is intended.
plot_tsne(ppg_final_fet, merged_df['valence_category'], merged_df['arousal_category'], merged_df['task_valence'], 'valence', 'arousal', 'CMA')