In [116]:
import numpy as np
from scipy.io import loadmat
import os
import copy
import tqdm
import pandas as pd
from sklearn.model_selection import train_test_split

In [37]:
def load_challenge_data(filename):
    x = loadmat(filename)
    data = np.asarray(x['val'], dtype=np.float64)
    new_file = filename.replace('.mat','.hea')
    input_header_file = os.path.join(new_file)
    with open(input_header_file,'r') as f:
        header_data=f.readlines()
    return data, header_data

def import_key_data(path):
    gender=[]
    age=[]
    labels=[]
    ecg_filenames=[]
    ecg_len = []
    for subdir, dirs, files in sorted(os.walk(path)):
        for filename in tqdm.tqdm(files):
            filepath = subdir + os.sep + filename
            if filepath.endswith(".mat"):
                data, header_data = load_challenge_data(filepath)
                if int(header_data[0].split(" ")[3])//int(header_data[0].split(" ")[2]) == 10:
                    labels.append(header_data[15][5:-1])
                    ecg_filenames.append(filepath)
                    gender.append(header_data[14][6:-1])
                    age.append(header_data[13][6:-1])
                    ecg_len.append(int(header_data[0].split(" ")[3])//int(header_data[0].split(" ")[2]))
    return gender, age, labels, ecg_len, ecg_filenames

def only_ten_sec(ecg_len, age, gender, filename, labels):
    idx = np.where(ecg_len == 10)[0]
    if len(idx) == 0 :
        return age, gender, filename, labels
    gender = gender[idx]
    age = age[idx]
    filename = filename[idx]
    labels = labels[idx]
    return age, gender, filename, labels
def clean_up_age_data(age):
    age[np.where(age == "60.")] = 60
    age = age.astype(int)
    return age

def clean_up_gender_data(gender):
    gender[np.where(gender == "Male")] = 0
    gender[np.where(gender == "male")] = 0
    gender[np.where(gender == "M")] = 0
    gender[np.where(gender == "Female")] = 1
    gender[np.where(gender == "female")] = 1
    gender[np.where(gender == "F")] = 1
    gender[np.where(gender == "NaN")] = 0 # only one nan
    np.unique(gender)
    gender = gender.astype(np.int32)
    return gender

In [38]:
gender, age, labels, ecg_len, ecg_filenames = import_key_data("./new_dataset")

0it [00:00, ?it/s]
100%|██████████| 20688/20688 [01:07<00:00, 306.78it/s]
100%|██████████| 6906/6906 [00:25<00:00, 269.79it/s]
100%|██████████| 1032/1032 [00:06<00:00, 161.27it/s]
100%|██████████| 148/148 [00:02<00:00, 64.31it/s]
100%|██████████| 13754/13754 [00:49<00:00, 276.45it/s]
100%|██████████| 43674/43674 [02:19<00:00, 312.41it/s]
100%|██████████| 20494/20494 [01:05<00:00, 312.02it/s]
100%|██████████| 69810/69810 [03:39<00:00, 317.50it/s]


In [97]:
age, gender,ecg_filenames, labels = only_ten_sec(ecg_len, age, gender, ecg_filenames, labels)

In [98]:
new_df = pd.DataFrame(data = zip(ecg_filenames,gender,age), columns = ['FILENAME','GENDER','AGE'])

In [99]:
new_df = new_df[new_df['AGE'] != 'NaN']
new_df = new_df[new_df['GENDER'] != 'NaN']
new_df = new_df[new_df['GENDER'] != 'Unknown']
new_df['AGE'] = new_df['AGE'].astype(int)
new_df = new_df[new_df['AGE'] >= 0]
new_df.reset_index(drop=True,inplace=True)

In [121]:
file_names = []
for name in tqdm.tqdm((new_df['FILENAME'].values)):
    file_name = name.split('\\')[2].split('.')[0]
    file_names.append(file_name)
    # nparray = load_challenge_data(name)[0]
    # if nparray.shape != (12, 5000) :
    #     nparray = nparray[:,:5000]
    #     nparray = nparray.reshape(-1)
    #     np.save(f'./dataset/numpy_joined/{file_name}',nparray)

100%|██████████| 81102/81102 [00:00<00:00, 2313884.29it/s]


In [122]:
new_df['FILENAME'] = file_names

In [123]:
new_df_adult = new_df[new_df['AGE']>=19]
new_df_child = new_df[new_df['AGE']<19]
new_df_adult.reset_index(drop=True, inplace=True)
new_df_child.reset_index(drop=True, inplace=True)

In [138]:
new_df_child_train, new_df_child_valid = train_test_split(new_df_child, test_size=0.5,random_state=42)
new_df_child_train.reset_index(drop=True, inplace=True)
new_df_child_valid.reset_index(drop=True, inplace=True)


In [139]:
new_df_adult.to_csv('./dataset/new_train_adult.csv',index=False)
new_df_child_train.to_csv('./dataset/new_train_child.csv',index=False)
new_df_child_valid.to_csv('./dataset/new_valid_child.csv',index=False)

# 기존 데이터 numpy 배열 순서 변경

## adult

In [2]:
train_adult = pd.read_csv('./dataset/ECG_adult_age_train.csv')

In [15]:
file_path = './dataset/ECG_adult_numpy_train/'
file_name = train_adult["FILENAME"].values
def return_array(file_name):
    full_array = np.load(file_path + file_name + ".npy")
    _12lead = [
        "I",
        "II",
        "III",
        "V1",
        "V2",
        "V3",
        "V4",
        "V5",
        "V6",
        "aVR",
        "aVL",
        "aVF",
    ]
    array_by_name = dict()
    for name, p_array in zip(_12lead, np.split(full_array, 12)):
        array_by_name[name] = p_array
    _12lead_re = [
        "I",
        "II",
        "III",
        "aVR",
        "aVL",
        "aVF",
        "V1",
        "V2",
        "V3",
        "V4",
        "V5",
        "V6",
    ]
    full_array = []
    for _array in _12lead_re:
        full_array += array_by_name[_array].tolist()
    full_array = np.array(full_array)
    return full_array, array_by_name

In [17]:
array_ , _ = return_array(file_name[1])

In [25]:
for name in file_name:
    array_, _ = return_array(name)
    print(name, array_)
    np.save(f'./dataset/numpy_joined/{name}',array_)

ecg_adult_0 [12. 12. 12. ... 70. 73. 74.]
ecg_adult_1 [-2. -2. -2. ...  0.  1.  1.]
ecg_adult_2 [-10. -10. -10. ...  78.  78.  78.]
ecg_adult_3 [  -6.   -6.   -6. ... -146. -146. -150.]
ecg_adult_4 [  2.   2.   2. ... -28. -28. -28.]
ecg_adult_5 [-8. -8. -8. ... 13. 15. 15.]
ecg_adult_6 [  4.   4.   4. ... -20. -23. -24.]
ecg_adult_7 [10. 10. 10. ...  4.  6.  4.]
ecg_adult_8 [ -6.  -6.  -6. ... -50. -50. -50.]
ecg_adult_9 [ -2.  -2.  -2. ... -52. -53. -54.]
ecg_adult_10 [-2. -2. -2. ...  0. -1.  1.]
ecg_adult_11 [114. 114. 114. ... -25. -25. -27.]
ecg_adult_12 [-4. -4. -4. ...  8.  9.  9.]
ecg_adult_13 [  1.   3.   3. ... -30. -30. -30.]
ecg_adult_14 [-10. -10. -10. ...  82.  88.  94.]
ecg_adult_15 [ 15.  15.  16. ... -14. -12. -14.]
ecg_adult_16 [ -8.  -8.  -8. ... -26. -26. -28.]
ecg_adult_17 [ -8.  -8.  -8. ... -56. -60. -64.]
ecg_adult_19 [0. 0. 0. ... 4. 8. 6.]
ecg_adult_20 [-12. -12. -12. ...  -2.  -2.  -2.]
ecg_adult_21 [ 13.  22.  25. ... 295. 292. 287.]
ecg_adult_22 [ -10.  -1

# child

In [26]:
train_child = pd.read_csv('./dataset/ECG_child_age_train.csv')

In [27]:
file_path = './dataset/ECG_child_numpy_train/'
file_name = train_child["FILENAME"].values
def return_array(file_name):
    full_array = np.load(file_path + file_name + ".npy")
    _12lead = [
        "I",
        "II",
        "III",
        "V1",
        "V2",
        "V3",
        "V4",
        "V5",
        "V6",
        "aVR",
        "aVL",
        "aVF",
    ]
    array_by_name = dict()
    for name, p_array in zip(_12lead, np.split(full_array, 12)):
        array_by_name[name] = p_array
    _12lead_re = [
        "I",
        "II",
        "III",
        "aVR",
        "aVL",
        "aVF",
        "V1",
        "V2",
        "V3",
        "V4",
        "V5",
        "V6",
    ]
    full_array = []
    for _array in _12lead_re:
        full_array += array_by_name[_array].tolist()
    full_array = np.array(full_array)
    return full_array, array_by_name

In [28]:
array_ , _ = return_array(file_name[1])

In [35]:
for name in tqdm.tqdm(file_name):
    array_, _ = return_array(name)
    np.save(f'./dataset/numpy_joined/{name}',array_)

100%|██████████| 8781/8781 [01:24<00:00, 103.95it/s]
