In [4]:
import mne
import os
import numpy as np
import pandas as pd
from tqdm import tqdm
import random

# Seed both Python's `random` module and NumPy's global RNG with the same value.
random_seed = 1
random.seed(random_seed)
np.random.seed(random_seed)

In [2]:
# Maps a run code ("R01".."R14") to a {raw annotation -> readable label} dict.
# Runs R03..R14 repeat the same four task layouts in a fixed cycle, so they
# are generated from that cycle instead of being spelled out one by one.
_TASK_CYCLE = (
    {"T0": "rest", "T1": "real_left_fist", "T2": "real_right_fist"},
    {"T0": "rest", "T1": "imagine_left_fist", "T2": "imagine_right_fist"},
    {"T0": "rest", "T1": "real_both_fist", "T2": "real_both_feet"},
    {"T0": "rest", "T1": "imagine_both_fist", "T2": "imagine_both_feet"},
)

labels_remap = {
    # The first two runs are baselines with a single annotation code.
    "R01": {"T0": "eyes_open"},
    "R02": {"T0": "eyes_closed"},
    # dict(...) gives every run its own independent mapping object.
    **{
        f"R{run_no:02d}": dict(_TASK_CYCLE[(run_no - 3) % len(_TASK_CYCLE)])
        for run_no in range(3, 15)
    },
}


In [3]:
# Input directory holding one sub-directory per subject, and the output
# directory the converted CSV files are written to.
dataset_path = "dataset/physionet.org/files/eegmmidb/1.0.0/"
save_path = "dataset/physionet.org_csv"

# makedirs(exist_ok=True) replaces the isdir-check + mkdir pair: it also
# creates missing parent directories and is safe when the cell is re-run.
os.makedirs(save_path, exist_ok=True)

# Subject identifiers "S001" .. "S109".
subjects_idc = [f"S{i:03d}" for i in range(1, 110)]
# Recorded in 160 times per second
freq = 160
# Time between two consecutive samples at `freq` samples per second.
timestep = pd.to_timedelta(f"{1 / freq} seconds")

In [4]:
# Convert length annotations into timestep annotations
def process_annots(annot_df, step=None):
    """Expand interval annotations into one labelled row per time step.

    Parameters
    ----------
    annot_df : pandas.DataFrame
        Frame with exactly three columns, in this order: onset, duration
        (in seconds) and description. Each onset must support adding a
        pandas Timedelta and ordering comparisons.
    step : pandas.Timedelta, optional
        Spacing between generated timestamps. Defaults to the module-level
        ``timestep``.

    Returns
    -------
    pandas.DataFrame
        Columns ``timestamp`` and ``label``; for every annotation one row per
        step in the half-open interval [onset, onset + duration).

    Raises
    ------
    ValueError
        If ``step`` is not strictly positive (the expansion would never end).
    """
    if step is None:
        step = timestep
    if step <= pd.Timedelta(0):
        raise ValueError("step must be a positive time delta")

    timestamps = []
    labels = []
    for onset, duration, description in annot_df.values:
        stop_onset = onset + pd.to_timedelta(f"{duration} seconds")

        # `<` instead of `!=`: a duration that is not an exact multiple of
        # `step` (or is negative) would otherwise step past `stop_onset`
        # and loop forever.
        while onset < stop_onset:
            timestamps.append(onset)
            labels.append(description)
            onset = onset + step

    return pd.DataFrame({"timestamp": timestamps, "label": labels})

# Change original labels into interpretable labels
def change_labels(filename, label_df, remap=None):
    """Replace raw annotation codes with readable labels for one run file.

    Parameters
    ----------
    filename : str
        Run file name such as ``"S001R03.edf"``; the last three characters
        before the extension are used as the run code.
    label_df : pandas.DataFrame
        Frame with a ``description`` column. It is modified in place.
    remap : dict, optional
        Mapping of run code -> {raw label -> new label}. Defaults to the
        module-level ``labels_remap``.

    Returns
    -------
    pandas.DataFrame
        The same ``label_df`` object, with ``description`` remapped.

    Raises
    ------
    KeyError
        If the run code derived from ``filename`` is not a key of ``remap``.
    """
    if remap is None:
        remap = labels_remap
    # splitext drops the extension as a suffix. str.rstrip(".edf") strips any
    # trailing '.', 'e', 'd' or 'f' characters, which can eat into the stem.
    run_code = os.path.splitext(filename)[0][-3:]
    label_df["description"] = label_df["description"].replace(remap[run_code])
    return label_df


In [5]:
# For recording length differences between data_df and labels_df
diff_records = []

# Convert dataset into .csv and save.
# For every .edf run of every subject two files are written under
# save_path/<subject>/:
#   <run>.csv        - per-step labels joined column-wise with the signal data
#   <run>_labels.csv - the per-step labels alone
for subject_id in tqdm(subjects_idc):
    subject_in_dir = os.path.join(dataset_path, subject_id)
    subject_out_dir = os.path.join(save_path, subject_id)

    runs = sorted(i for i in os.listdir(subject_in_dir) if i.endswith(".edf"))

    # exist_ok=True replaces the exists-check + mkdir pair and keeps the cell
    # re-runnable; the directory only depends on the subject, so create it once.
    os.makedirs(subject_out_dir, exist_ok=True)

    for run in runs:
        run_path = os.path.join(subject_in_dir, run)
        # splitext removes the extension as a suffix; str.rstrip(".edf") would
        # strip any trailing '.', 'e', 'd', 'f' characters instead.
        run_stem = os.path.splitext(run)[0]

        raw = mne.io.read_raw_edf(run_path, verbose=False)
        raw_data = raw.to_data_frame()
        raw_data = raw_data.drop(columns=["time"])
        original_data_length = raw_data.shape[0]

        raw_labels = raw.annotations.to_data_frame()
        raw_labels = change_labels(run, raw_labels)
        raw_labels = process_annots(raw_labels)

        # Record every run whose number of data rows differs from its number
        # of expanded label rows, together with the distinct values found in
        # the data rows that have no matching label row.
        if raw_data.shape[0] != raw_labels.shape[0]:
            len_diff = raw_data.shape[0] - raw_labels.shape[0]

            unique = raw_data.iloc[raw_labels.shape[0]:].values
            unique = np.unique(unique)

            diff_records.append({"filename": run, "length_diff": len_diff, "diff_value_unique": unique})

        # Join labels and data side by side on the row index; rows present in
        # only one of the two frames contain NaN and are dropped.
        raw_data = pd.concat([raw_labels, raw_data], axis=1)
        raw_data = raw_data.dropna()

        assert raw_data.shape[0] == raw_labels.shape[0] or raw_data.shape[0] == original_data_length

        raw_data.to_csv(os.path.join(subject_out_dir, run_stem + ".csv"), index=False)
        raw_labels.to_csv(os.path.join(subject_out_dir, run_stem + "_labels.csv"), index=False)

diff_df = pd.DataFrame(diff_records)
diff_df.to_csv(os.path.join(save_path, "data_label_diff.csv"))

# Note 1: subject S100 raised errors during conversion.
# We are ignoring subjects
# S088, S089, S092 and S100 anyway.

# Note 2: when raw_data.shape[0] != raw_labels.shape[0], the leftover rows are [0.] when the diff is positive (more data than labels) or [] when it is negative (more labels than data).
# S088, S092 and S100 have more labels than data.


  raw = mne.io.read_raw_edf(run_path, verbose=False)
  raw = mne.io.read_raw_edf(run_path, verbose=False)
  raw = mne.io.read_raw_edf(run_path, verbose=False)
  raw = mne.io.read_raw_edf(run_path, verbose=False)
  raw = mne.io.read_raw_edf(run_path, verbose=False)
  raw = mne.io.read_raw_edf(run_path, verbose=False)
  raw = mne.io.read_raw_edf(run_path, verbose=False)
  raw = mne.io.read_raw_edf(run_path, verbose=False)
  raw = mne.io.read_raw_edf(run_path, verbose=False)
  raw = mne.io.read_raw_edf(run_path, verbose=False)
  raw = mne.io.read_raw_edf(run_path, verbose=False)
  raw = mne.io.read_raw_edf(run_path, verbose=False)
100%|██████████| 109/109 [19:35<00:00, 10.79s/it]


In [6]:
# Concat data of each subject

# For getting columns names only (used when a subject has no run files).
temp = pd.read_csv(os.path.join(save_path, "S001", "S001R01.csv"))

# Save .csv files as one .csv per subject
for subject_id in tqdm(subjects_idc):
    subject_dir = os.path.join(save_path, subject_id)
    # Keep only the per-run data files "<subject>R<nn>.csv"; this skips the
    # "<run>_labels.csv" files and a "<subject>.csv" left by an earlier run.
    runs = sorted(
        name
        for name in os.listdir(subject_dir)
        if name.startswith(subject_id + "R")
        and name.endswith(".csv")
        and not name.endswith("_labels.csv")
    )

    # Read everything first and concatenate once; concatenating inside the
    # loop re-copies the accumulated frame on every iteration.
    run_frames = [pd.read_csv(os.path.join(subject_dir, run)) for run in runs]
    if run_frames:
        full_data = pd.concat(run_frames, axis=0)
    else:
        full_data = pd.DataFrame([], columns=temp.columns)

    # Each run keeps its own row index; expose it as "original_index".
    full_data = full_data.reset_index()
    full_data = full_data.rename(columns={'index': 'original_index'})
    # Build the output path from subject_id instead of slicing the last
    # run_path, which is stale or undefined when a subject has no runs.
    full_data.to_csv(os.path.join(subject_dir, subject_id + ".csv"), index=False)

100%|██████████| 109/109 [18:17<00:00, 10.07s/it]


In [8]:
# Display the frame left in `full_data` by the last iteration of the loop above.
full_data

Unnamed: 0,original_index,timestamp,label,Fc5.,Fc3.,Fc1.,Fcz.,Fc2.,Fc4.,Fc6.,...,P8..,Po7.,Po3.,Poz.,Po4.,Po8.,O1..,Oz..,O2..,Iz..
0,0,2009-08-12 16:15:00.000000,eyes_open,108.0,125.0,166.0,20.0,16.0,52.0,63.0,...,4.0,27.0,-24.0,14.0,-4.0,2.0,13.0,94.0,26.0,-12.0
1,1,2009-08-12 16:15:00.006250,eyes_open,98.0,135.0,172.0,29.0,22.0,100.0,67.0,...,-8.0,40.0,-17.0,12.0,-7.0,-3.0,14.0,91.0,32.0,-14.0
2,2,2009-08-12 16:15:00.012500,eyes_open,78.0,138.0,140.0,30.0,21.0,80.0,76.0,...,-7.0,55.0,-5.0,20.0,-2.0,2.0,25.0,99.0,46.0,-2.0
3,3,2009-08-12 16:15:00.018750,eyes_open,72.0,146.0,127.0,33.0,22.0,61.0,77.0,...,-2.0,56.0,-2.0,23.0,-2.0,-1.0,22.0,100.0,40.0,-4.0
4,4,2009-08-12 16:15:00.025000,eyes_open,89.0,145.0,130.0,38.0,26.0,-5.0,84.0,...,0.0,51.0,-7.0,15.0,-13.0,-8.0,10.0,88.0,21.0,-13.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
255307,19675,2009-08-12 16:17:02.968750,imagine_both_feet,-3.0,69.0,86.0,21.0,28.0,3.0,-53.0,...,39.0,25.0,70.0,30.0,66.0,30.0,24.0,10.0,19.0,78.0
255308,19676,2009-08-12 16:17:02.975000,imagine_both_feet,-2.0,50.0,88.0,17.0,23.0,-8.0,-59.0,...,25.0,10.0,58.0,29.0,59.0,17.0,11.0,5.0,12.0,63.0
255309,19677,2009-08-12 16:17:02.981250,imagine_both_feet,-2.0,32.0,72.0,10.0,9.0,-11.0,-67.0,...,14.0,-3.0,46.0,25.0,53.0,10.0,4.0,2.0,6.0,49.0
255310,19678,2009-08-12 16:17:02.987500,imagine_both_feet,1.0,10.0,64.0,-6.0,-3.0,-13.0,-77.0,...,24.0,4.0,58.0,42.0,67.0,25.0,16.0,14.0,23.0,53.0


# Create training data

In [1]:
# Subjects left out of the training data.
exclusions = ["S088", "S089", "S092", "S100"]
# All subject ids "S001".."S109" in order, minus the excluded ones.
subjects_idc = [
    subject
    for subject in (f"S{number:03d}" for number in range(1, 110))
    if subject not in exclusions
]

In [5]:
# Hold out 10 subjects for testing; the rest are used for training.
# A dedicated generator seeded with random_seed makes the split depend only on
# the seed, not on how often the global `random` state was used before this
# cell ran. It yields the same draw as calling random.seed(random_seed)
# immediately followed by random.sample(...).
split_rng = random.Random(random_seed)
test_idc = split_rng.sample(subjects_idc, 10)
held_out = set(test_idc)
train_idc = [i for i in subjects_idc if i not in held_out]

In [6]:
# Show the training subjects followed by how many there are, one per line.
print(train_idc, len(train_idc), sep="\n")

['S001', 'S002', 'S003', 'S004', 'S005', 'S006', 'S007', 'S008', 'S010', 'S011', 'S012', 'S013', 'S014', 'S015', 'S017', 'S019', 'S020', 'S021', 'S022', 'S023', 'S024', 'S025', 'S026', 'S027', 'S028', 'S029', 'S030', 'S031', 'S032', 'S034', 'S035', 'S036', 'S037', 'S038', 'S039', 'S040', 'S041', 'S042', 'S043', 'S044', 'S045', 'S046', 'S047', 'S048', 'S049', 'S050', 'S051', 'S052', 'S053', 'S054', 'S055', 'S056', 'S057', 'S059', 'S060', 'S062', 'S063', 'S065', 'S066', 'S067', 'S068', 'S069', 'S070', 'S071', 'S072', 'S074', 'S075', 'S076', 'S077', 'S078', 'S079', 'S080', 'S081', 'S082', 'S083', 'S084', 'S085', 'S086', 'S087', 'S090', 'S091', 'S093', 'S094', 'S095', 'S096', 'S097', 'S098', 'S099', 'S101', 'S103', 'S104', 'S105', 'S106', 'S108', 'S109']
95


In [7]:
# Show the subjects held out for testing.
print(test_idc)

['S018', 'S073', 'S107', 'S102', 'S009', 'S033', 'S016', 'S064', 'S058', 'S061']


In [8]:
# NOTE(review): presumably the output file for the cross-subject dataset
# assembled in the (currently commented-out) cell below — confirm before use.
filename = "cross_subject_data_1.pickle"

In [1]:
# X_train = np.empty((0, 64))
# y_train = np.empty((0,))
# X_test = np.empty((0, 64))
# y_test = np.empty((0,))

# for subject_id in tqdm(train_idc):
#     df = pd.read_csv(f"dataset/physionet.org_csv/{subject_id}/{subject_id}.csv")
#     X_train = np.vstack((X_train, df.iloc[:, 3:].values))
#     y_train = np.hstack((y_train, df["label"].values))

# for subject_id in tqdm(test_idc):
#     df = pd.read_csv(f"dataset/physionet.org_csv/{subject_id}/{subject_id}.csv")
#     X_test = np.vstack((X_test, df.iloc[:, 3:].values))
#     y_test = np.hstack((y_test, df["label"].values))

# cross_subject_data = {"train_x": X_train, "train_y": y_train, "test_x": X_test, "test_y": y_test}