In [24]:
import numpy as np
import pandas as pd
import tensorflow as tf

import glob

In [41]:
def num_labels(df):
    """
    Encode the string sleep-stage labels in ``df['label']`` as integer codes.

    Mapping: 'W'->0, '1'->1, '2'->2, '3'->3, '4'->4, 'e'->5, 'R'->6, '?'->7.

    Parameters
    ----------
    df : pd.DataFrame
        Frame with a ``label`` column.

    Returns
    -------
    pd.DataFrame
        A new frame with the ``label`` column replaced by integer codes; the
        input frame is left untouched. Labels that are not in the mapping
        become NaN. If the column is already numeric it is assumed to be
        encoded and is returned unchanged (as a copy), so calling this
        function twice is harmless.
    """
    d_labels = {'W': 0, '1': 1, '2': 2, '3': 3, '4': 4, 'e': 5, 'R': 6, '?': 7}

    # The mapping keys are strings, so mapping an already-encoded (numeric)
    # column would turn every label into NaN. Pass such a column through.
    if pd.api.types.is_numeric_dtype(df['label']):
        return df.copy()

    # assign() builds a new frame instead of writing into the caller's one.
    return df.assign(label=df['label'].map(d_labels))


def gather_signals_by_class(df_raw, num_signals=20):
    """
    Keep, for every class, only the rows that fill complete chunks of
    `num_signals` rows.

    Rows are grouped by the ``label`` column. Within each group the rows keep
    their original order, and the trailing rows that do not fill a whole chunk
    are dropped. Groups are concatenated in the key order produced by
    ``groupby`` (sorted labels). Note that a chunk consists of consecutive rows
    *of the group*, which are not necessarily adjacent rows of `df_raw`.

    Parameters
    ----------
    df_raw : pd.DataFrame
        Frame with a ``label`` column.
    num_signals : int
        Chunk length; must be >= 1.

    Returns
    -------
    pd.DataFrame
        Rows of `df_raw` (original index preserved) whose count per label is a
        multiple of `num_signals`. Empty, with the same columns, when no label
        has at least `num_signals` rows.

    Raises
    ------
    ValueError
        If `num_signals` is smaller than 1.
    """
    if num_signals < 1:
        raise ValueError("num_signals must be a positive integer")

    kept = []
    for _, group in df_raw.groupby('label'):
        # The complete chunks of a group are exactly its leading
        # (len // num_signals) * num_signals rows, so one slice per group
        # replaces one tiny DataFrame per chunk.
        n_rows = (len(group) // num_signals) * num_signals
        if n_rows:
            kept.append(group.iloc[:n_rows])

    if not kept:
        # pd.concat([]) raises; hand back an empty frame with the same schema.
        return df_raw.iloc[0:0].copy()

    return pd.concat(kept)

def get_dataset(paths_csv: list = None, shuffle: bool = False, batch_size: int = 32,
                over_samp: bool = False, return_data: bool = False, num_signals: int = 20,
                seed: int = None):
    """
    Build a ``tf.data.Dataset`` of fixed-length sensor windows from CSV files.

    All files are read (``;``-separated, first row is the header) and
    concatenated, labels are encoded with ``num_labels``, and the rows are cut
    into windows of `num_signals` rows per class with
    ``gather_signals_by_class``. Because windows are formed per class on the
    concatenated frame, a window may contain rows from more than one file.

    Parameters
    ----------
    paths_csv : list
        Paths of the CSV files to load. Must contain at least one path.
    shuffle : bool
        If True, the windows (not the rows inside a window) are permuted once
        before the dataset is built. Without shuffling the windows come out
        sorted by class.
    batch_size : int or None
        Batch size; ``None`` leaves the dataset unbatched.
    over_samp : bool
        Accepted for compatibility; over-sampling is not implemented and the
        flag currently has no effect.
    return_data : bool
        If True, also return the one-hot label array (in dataset order).
    num_signals : int
        Number of rows per window.
    seed : int or None
        Seed for the shuffle permutation; ``None`` gives a different order on
        every call.

    Returns
    -------
    tf.data.Dataset, or (tf.data.Dataset, np.ndarray) when `return_data` is True
        Elements are ``(window, one_hot_label)`` with window shape
        ``(num_signals, 7)`` and label shape ``(8,)``.

    Raises
    ------
    ValueError
        If `paths_csv` is None or empty.
    """
    if not paths_csv:
        raise ValueError("paths_csv must be a non-empty list of CSV file paths")

    # Read everything first and concatenate once; growing a DataFrame with
    # pd.concat inside the loop re-copies all previous rows on every file.
    frames = [pd.read_csv(path, delimiter=';', header=0) for path in paths_csv]
    df_raw = pd.concat(frames, ignore_index=True)

    df_raw = num_labels(df=df_raw)

    if over_samp:
        # TODO: over-sampling is not implemented yet; the flag is a no-op.
        pass

    # Organize data by class, keeping only complete sets of num_signals rows
    data_processed = gather_signals_by_class(df_raw, num_signals=num_signals)

    # Extracting sensor data and labels
    sensor_columns = ['sensor_1', 'sensor_2', 'sensor_3', 'sensor_4',
                      'sensor_5', 'sensor_6', 'sensor_7']
    sensor_data = data_processed[sensor_columns].values
    # Every window holds rows of a single class, so its first row's label
    # is the label of the whole window.
    labels = data_processed['label'].values[::num_signals]

    # Reshape data to have 'num_signals' rows per item
    sensor_data_reshaped = sensor_data.reshape(-1, num_signals, sensor_data.shape[1])

    # Fix the one-hot width to the 8 codes produced by num_labels (0..7);
    # otherwise the width would be max(label) + 1 and could differ between
    # datasets that happen to lack the highest classes.
    num_classes = 8
    labels_one_hot = tf.keras.utils.to_categorical(labels, num_classes=num_classes)

    if shuffle:
        # Permute whole windows so the grouping inside each window is kept
        # and the returned labels stay aligned with the dataset order.
        order = np.random.default_rng(seed).permutation(len(sensor_data_reshaped))
        sensor_data_reshaped = sensor_data_reshaped[order]
        labels_one_hot = labels_one_hot[order]

    # Create TensorFlow Dataset
    dataset = tf.data.Dataset.from_tensor_slices((sensor_data_reshaped, labels_one_hot))

    if batch_size is not None:
        dataset = dataset.batch(batch_size)

    dataset = dataset.prefetch(tf.data.AUTOTUNE)

    if return_data:
        return dataset, labels_one_hot
    return dataset

In [42]:
# glob.glob returns matches in a filesystem-dependent order; sorting makes the
# concatenation order (and therefore the window contents) reproducible.
tf_dataset = get_dataset(paths_csv=sorted(glob.glob('sleep-cassette-csv/*.csv')), shuffle=True)

In [43]:
# Display the dataset repr to check the element_spec (batched window and label shapes).
tf_dataset

<_PrefetchDataset element_spec=(TensorSpec(shape=(None, 20, 7), dtype=tf.float64, name=None), TensorSpec(shape=(None, 8), dtype=tf.float32, name=None))>

In [6]:
# Load a single recording (semicolon-separated, header on the first row) for inspection.
df_raw = pd.read_csv(
    'sleep-cassette-csv/70-SC-EEG.csv',
    sep=';',
    header=0,
)
df_raw

Unnamed: 0,sensor_1,sensor_2,sensor_3,sensor_4,sensor_5,sensor_6,sensor_7,label,user_id,user_night,sex,patient_age
0,10.740444,-0.310818,36.859228,210.0,1.598,13.241,963.0,W,70,2,Male,89
1,-7.094222,-2.257045,-14.270159,235.0,1.410,13.162,939.0,W,70,2,Male,89
2,3.295111,4.211404,8.536400,206.0,1.232,13.182,952.0,W,70,2,Male,89
3,6.348444,-10.786471,-2.642545,222.0,1.398,13.211,972.0,W,70,2,Male,89
4,8.938667,2.175726,5.646027,190.0,1.604,13.143,939.0,W,70,2,Male,89
...,...,...,...,...,...,...,...,...,...,...,...,...
78583,0.216889,4.824908,-13.637299,141.0,0.764,13.213,992.0,W,70,2,Male,89
78584,-8.981333,-4.098462,5.135961,150.0,1.638,13.136,955.0,W,70,2,Male,89
78585,26.871111,-4.197851,26.417045,137.0,2.398,13.195,968.0,W,70,2,Male,89
78586,1.592000,-2.081758,-12.824972,131.0,1.712,13.192,965.0,W,70,2,Male,89


In [8]:
# Window length: number of consecutive rows (per label) that form one sample.
num_signals = 20
grouped = df_raw.groupby('label')

# For every label, collect each complete run of `num_signals` rows; the
# range stops early enough that a short trailing remainder is never produced.
grouped_data = [
    rows.iloc[start:start + num_signals]
    for _, rows in grouped
    for start in range(0, len(rows) - num_signals + 1, num_signals)
]

In [11]:
# Stack the collected chunks into one frame; rows keep their original index
# and appear in the order the chunks were collected (label by label).
df_agr = pd.concat(grouped_data)
df_agr

Unnamed: 0,sensor_1,sensor_2,sensor_3,sensor_4,sensor_5,sensor_6,sensor_7,label,user_id,user_night,sex,patient_age
10829,9.947556,-1.916410,5.140684,154.0,1.298,13.191,951.0,1,70,2,Male,89
10830,-3.767111,1.077021,1.759136,135.0,1.338,13.172,939.0,1,70,2,Male,89
10831,2.742222,0.336117,-0.857314,125.0,1.332,13.212,962.0,1,70,2,Male,89
10832,6.363556,-1.808889,-3.218730,160.0,1.346,13.164,958.0,1,70,2,Male,89
10833,0.557333,-0.558388,1.825255,144.0,1.420,13.141,945.0,1,70,2,Male,89
...,...,...,...,...,...,...,...,...,...,...,...,...
78568,3.879111,-0.823126,-34.153284,150.0,1.398,13.136,962.0,W,70,2,Male,89
78569,-16.284444,-2.771160,13.131717,130.0,3.682,13.117,955.0,W,70,2,Male,89
78570,23.981333,5.258608,7.634339,153.0,-0.946,13.196,980.0,W,70,2,Male,89
78571,11.715556,-1.105934,5.310706,121.0,0.740,13.245,989.0,W,70,2,Male,89


In [12]:
# Feature matrix: the seven sensor columns as a plain 2-D array.
sensor_data = df_agr.loc[
    :, ['sensor_1', 'sensor_2', 'sensor_3', 'sensor_4', 'sensor_5', 'sensor_6', 'sensor_7']
].to_numpy()
# One label per chunk: take the label of the first row of every `num_signals` rows.
labels = df_agr['label'].to_numpy()[::num_signals]

In [15]:
# Number of chunk labels, i.e. one entry per `num_signals` rows of df_agr.
len(labels)

3927

In [17]:
# Fold the flat (rows, sensors) matrix into (chunks, num_signals, sensors).
# This only works when the row count is a multiple of num_signals, which holds
# here because only complete chunks were collected.
sensor_data_reshaped = sensor_data.reshape(-1, num_signals, sensor_data.shape[1])
# One-hot encoding is left disabled in this cell.
# NOTE(review): the labels in this frame appear to be the raw strings
# ('W', '1', ...), so they would presumably need num_labels() first — confirm.
# labels_one_hot = tf.keras.utils.to_categorical(labels)

In [21]:
# Sanity check: length of the second chunk along its first axis (expected to equal num_signals).
len(sensor_data_reshaped[1])

20

In [35]:
# Load every CSV first and concatenate once: growing a DataFrame with
# pd.concat inside the loop re-copies all previously loaded rows per file.
# The paths are sorted because glob.glob's order depends on the filesystem.
csv_frames = [
    pd.read_csv(path, delimiter=';', header=0)
    for path in sorted(glob.glob('sleep-cassette-csv/*.csv'))
]

# pd.concat raises on an empty list; keep the empty-frame result in that case.
data_raw = pd.concat(csv_frames, ignore_index=True) if csv_frames else pd.DataFrame()

In [36]:
# Preview the concatenated frame built from all CSV files.
data_raw

Unnamed: 0,sensor_1,sensor_2,sensor_3,sensor_4,sensor_5,sensor_6,sensor_7,label,user_id,user_night,sex,patient_age
0,1.781040,1.059883,-9.367766,-627.0,4.024,37.506115,977.0,W,0,2,Female,33
1,3.469407,1.734300,-15.902869,619.0,3.982,37.488241,979.0,W,0,2,Female,33
2,-13.997751,0.857656,36.609463,-520.0,4.114,37.507996,997.0,W,0,2,Female,33
3,-3.853297,1.313158,-24.589011,-403.0,4.154,37.504233,1015.0,W,0,2,Female,33
4,34.658359,6.828264,6.485287,475.0,4.060,37.504233,1016.0,W,0,2,Female,33
...,...,...,...,...,...,...,...,...,...,...,...,...
6170554,0.207111,2.643062,-130.244115,51.0,3.198,14.100710,888.0,W,82,2,Female,56
6170555,-0.831111,1.885011,-5.584733,30.0,3.062,14.045777,820.0,W,82,2,Female,56
6170556,-3.521778,1.148808,58.984276,55.0,3.080,14.116732,875.0,W,82,2,Female,56
6170557,-8.376000,-5.030545,13.712701,53.0,3.050,14.110323,858.0,W,82,2,Female,56


In [37]:
# Replace the string sleep-stage labels with their integer codes.
# Note: this rebinds `data_raw`, so from here on the name refers to the
# encoded frame rather than the frame as loaded from disk.
data_raw = num_labels(data_raw)
data_raw

Unnamed: 0,sensor_1,sensor_2,sensor_3,sensor_4,sensor_5,sensor_6,sensor_7,label,user_id,user_night,sex,patient_age
0,1.781040,1.059883,-9.367766,-627.0,4.024,37.506115,977.0,0,0,2,Female,33
1,3.469407,1.734300,-15.902869,619.0,3.982,37.488241,979.0,0,0,2,Female,33
2,-13.997751,0.857656,36.609463,-520.0,4.114,37.507996,997.0,0,0,2,Female,33
3,-3.853297,1.313158,-24.589011,-403.0,4.154,37.504233,1015.0,0,0,2,Female,33
4,34.658359,6.828264,6.485287,475.0,4.060,37.504233,1016.0,0,0,2,Female,33
...,...,...,...,...,...,...,...,...,...,...,...,...
6170554,0.207111,2.643062,-130.244115,51.0,3.198,14.100710,888.0,0,82,2,Female,56
6170555,-0.831111,1.885011,-5.584733,30.0,3.062,14.045777,820.0,0,82,2,Female,56
6170556,-3.521778,1.148808,58.984276,55.0,3.080,14.116732,875.0,0,82,2,Female,56
6170557,-8.376000,-5.030545,13.712701,53.0,3.050,14.110323,858.0,0,82,2,Female,56


In [39]:
# Rows per encoded class, most frequent first (codes as assigned by num_labels, e.g. 0 = 'W').
data_raw['label'].value_counts()

0    4229754
2     994814
6     399804
1     317965
3     132318
4      65515
7      28066
5       2323
Name: label, dtype: int64