In [50]:
# Import necessary packages and functions
import numpy as np
import pandas as pd 
import re
from utility_funs import get_seizure_sequence

In [53]:
# Build the seizure DataFrame from the summary txt files
# (the helper is pointed at the "data" location)
DATA_DIR = "data"
df = get_seizure_sequence(DATA_DIR)

In [54]:
# Derive a "patient" column from file_name (e.g. "chb11_82.edf" -> "chb11"):
#   1. split at the underscore and keep the left part
#   2. keep only "chb" plus the digits that follow, which discards possible
#      letter suffixes such as "a", "b", and "c"
# expand=False makes str.extract return a Series instead of a one-column
# DataFrame, so the assignment below is a plain Series-to-column assignment.
# Rows whose left part contains no "chb<digits>" match get NaN.
df["patient"] = (
    df["file_name"]
    .str.split("_")
    .str[0]
    .str.extract(r"(chb\d+)", expand=False)
)
df.head()

Unnamed: 0,file_name,number_of_seizures,seizure_start,seizure_end,patient
0,chb11_82.edf,1,298.0,320.0,chb11
1,chb11_92.edf,1,2695.0,2727.0,chb11
2,chb11_99.edf,1,1454.0,2206.0,chb11
3,chb16_10.edf,1,2290.0,2299.0,chb16
4,chb16_11.edf,1,1120.0,1129.0,chb16


In [59]:
# Total number_of_seizures per patient.
# The column is selected explicitly before aggregating: the first positional
# argument of GroupBy.sum() is `numeric_only`, not a column name, so
# sum("number_of_seizures") only worked because a non-empty string is truthy,
# and it summed every numeric column (which then had to be dropped again).
# as_index=False keeps "patient" as a regular column with a fresh 0..n-1 index.
# NOTE(review): assumes each seizure is counted exactly once across the rows
# of df -- confirm against what get_seizure_sequence returns for files that
# contain more than one seizure.
patients = df.groupby("patient", as_index=False)["number_of_seizures"].sum()

In [58]:
# Keep only the patients whose seizure total is 10 or fewer
# FYI: Truong et al (https://arxiv.org/pdf/1806.08235.pdf) selected their patients slightly differently
filtered_df = patients.loc[patients["number_of_seizures"].le(10)]

Unnamed: 0,patient,number_of_seizures
0,chb01,7
1,chb02,6
2,chb03,7
3,chb04,6
4,chb05,5
6,chb07,3
7,chb08,5
8,chb09,6
10,chb11,3
16,chb17,3


In [None]:
# Collect the numeric part of every remaining patient id as an integer
# (the first run of digits in the id string)
patient_list = []
for patient_id in filtered_df["patient"]:
    number_match = re.search(r'\d+', patient_id)
    patient_list.append(int(number_match.group()))

In [45]:
# Display the list of selected patient numbers.
# NOTE(review): the saved output of this cell looks stale -- it lists 'chbNN'
# strings (and ids missing from the filtered table), whereas the cell that
# builds patient_list converts each id to an int. Re-run the notebook top to
# bottom to refresh it.
patient_list

['chb01',
 'chb02',
 'chb03',
 'chb04',
 'chb05',
 'chb07',
 'chb08',
 'chb09',
 'chb11',
 'chb17',
 'chb18',
 'chb19',
 'chb21',
 'chb22']