In [1]:
import os,re
import numpy as np
import scipy.io as sio
import hdf5storage as hdf5
import einops
import torch
from tqdm import tqdm
from sklearn import svm

In [2]:
def extend_normal(sample):
    """Min-max scale every entry of ``sample`` to the range [0, 1], in place.

    Each ``sample[i]`` is scaled independently using its own minimum and
    maximum, i.e. ``(x - min) / (max - min)``.

    :param sample: indexable collection (ndarray or list of arrays); it is
        modified in place. Note that if ``sample`` is an integer ndarray the
        scaled values are cast back to integers on assignment, so pass floats.
    :return: the same ``sample`` object, after scaling.
    """
    for i in range(len(sample)):
        features_min = np.min(sample[i])
        value_range = np.max(sample[i]) - features_min
        if value_range == 0:
            # A constant sample has no spread; plain min-max scaling would
            # compute 0/0 and fill the entry with NaN. Map it to zeros instead.
            sample[i] = np.zeros_like(sample[i], dtype=float)
        else:
            sample[i] = (sample[i] - features_min) / value_range
    return sample

In [3]:
# Raw strings keep the Windows backslashes literal; "\S", "\E" and "\s" are
# not valid escape sequences and trigger a warning in plain string literals.
raw_data_path = r"D:\SEED\ExtractedFeatures"
data_path = r"data\seed"
# Create the output folder if needed (no error when it already exists).
os.makedirs(data_path, exist_ok=True)
# label.mat stores one label per trial; take the first row as a 1-D array.
label = hdf5.loadmat(os.path.join(raw_data_path, "label.mat"))['label'][0]
print(label)
# Shift the labels by +1 so that -1/0/1 becomes 0/1/2.
label += np.ones(label.size, dtype=np.int16)
label

[ 1  0 -1 -1  0  1 -1  0  1  1  0 -1  0  1 -1]


array([2, 1, 0, 0, 1, 2, 0, 1, 2, 2, 1, 0, 1, 2, 0], dtype=int16)

In [4]:
def build_extracted_features_dataset(folder_path, feature_name, win, frequency_band=None):
    '''
    Turn the ExtractedFeatures files in ``folder_path`` into per-recording
    feature/label arrays, keeping the samples of different trials labelled
    separately.

    Every file named ``<digits>_<digits>.mat`` is loaded; for each of its 15
    trials the feature ``feature_name + str(trial)`` is read, moved to a
    time-first layout, and every time step receives the label of its trial.
    The trials of one file are concatenated along the first axis.

    ToDo: add channel selection instead of always using every channel.

    :param folder_path: path of the ExtractedFeatures folder; it must also
        contain ``label.mat``.
    :param feature_name: feature prefix such as 'de_LDS' or 'asm_LDS'. Per the
        original notes, e.g. 'de_LDS1' has shape 62 x 235 x 5 (channels x
        seconds x bands), one sample per second.
    :param win: window length. Currently unused; kept so existing callers keep
        working.
    :param frequency_band: optional band name, one of 'delta', 'theta',
        'alpha', 'beta', 'gamma'. When given, only that band is kept and each
        trial becomes (time, channels); otherwise all bands are kept and each
        trial becomes (time, channels, bands).
    :return: ``(feature_vector_dict, label_dict)``. Keys are the file names
        without extension (e.g. '10_20131130'); values are the concatenated
        features and the matching per-sample labels. Both dicts are empty if
        a FileNotFoundError occurs while listing or loading the feature files.
    '''
    # Band name -> index along the last axis of the feature array.
    band_map = {'delta': 0, 'theta': 1, 'alpha': 2, 'beta': 3, 'gamma': 4}
    label_path = os.path.join(folder_path, 'label.mat')
    labels = hdf5.loadmat(label_path, verify_compressed_data_integrity=False)['label'][0]
    # Shift -1/0/1 to 0/1/2. The size comes from `labels` itself so the
    # function does not depend on a notebook-global variable being defined.
    labels += np.ones(labels.size, dtype=np.int16)
    feature_vector_dict = {}
    label_dict = {}
    try:
        # fullmatch with an escaped dot: only names that are exactly
        # "<digits>_<digits>.mat" qualify (this also excludes label.mat).
        file_list = [
            file for file in os.listdir(folder_path)
            if os.path.isfile(os.path.join(folder_path, file))
            and re.fullmatch(r'\d+_\d+\.mat', file)
        ]
        # Order by the numeric subject id that precedes the underscore.
        file_list = sorted(file_list, key=lambda x: int(x.split('_')[0]))
        for file_name in tqdm(file_list, desc='Progress on features', ncols=100):
            # One file holds all features of the 15 trials of one recording.
            all_features_dict = hdf5.loadmat(os.path.join(folder_path, file_name), verify_compressed_data_integrity=False)
            subject_name = file_name.split('.')[0]  # e.g. 10_20131130
            feature_vector_trial_list = []
            label_trial_list = []
            for trial in range(1, 16):
                cur_feature = all_features_dict[feature_name + str(trial)]
                if frequency_band:
                    frequency_idx = band_map[frequency_band]
                    # Keep one band, then transpose so time is the first axis.
                    cur_feature = np.asarray(cur_feature[:, :, frequency_idx]).T
                else:
                    # Swap the first two axes so time is the first axis.
                    cur_feature = np.asarray(cur_feature.transpose(1, 0, 2))
                # One label per time step of this trial.
                cur_label = np.asarray([labels[trial - 1]] * cur_feature.shape[0])
                feature_vector_trial_list.append(cur_feature)
                label_trial_list.append(cur_label)
                assert cur_feature.shape[0] == cur_label.shape[0], f'Feature and label shape mismatch, feature shape: {cur_feature.shape}, label shape: {cur_label.shape}'
                assert (cur_label[0] == labels[trial-1])
            feature_vector_trial = np.concatenate(feature_vector_trial_list, axis=0)
            label_trial = np.concatenate(label_trial_list, axis=0)
            assert feature_vector_trial.shape[0] == label_trial.shape[0], f'Feature and label shape mismatch, feature shape: {feature_vector_trial.shape}, label shape: {label_trial.shape}'

            feature_vector_dict[subject_name] = feature_vector_trial
            label_dict[subject_name] = label_trial
    except FileNotFoundError as e:
        print('加载数据时出错: {}'.format(e))
    return feature_vector_dict, label_dict
# Load the 'de_LDS' features of every recording found under raw_data_path.
de_vector_dict, label_dict = build_extracted_features_dataset(
    folder_path=raw_data_path,
    feature_name='de_LDS',
    win=5,
)

Progress on features: 100%|█████████████████████████████████████████| 45/45 [00:22<00:00,  1.96it/s]


In [5]:
# Show, for each recording, the feature array shape next to the label array shape.
for key, session_features in de_vector_dict.items():
    print(key, session_features.shape, label_dict[key].shape)

1_20131027 (3394, 62, 5) (3394,)
1_20131030 (3394, 62, 5) (3394,)
1_20131107 (3394, 62, 5) (3394,)
2_20140404 (3394, 62, 5) (3394,)
2_20140413 (3394, 62, 5) (3394,)
2_20140419 (3394, 62, 5) (3394,)
3_20140603 (3394, 62, 5) (3394,)
3_20140611 (3394, 62, 5) (3394,)
3_20140629 (3394, 62, 5) (3394,)
4_20140621 (3394, 62, 5) (3394,)
4_20140702 (3394, 62, 5) (3394,)
4_20140705 (3394, 62, 5) (3394,)
5_20140411 (3394, 62, 5) (3394,)
5_20140418 (3394, 62, 5) (3394,)
5_20140506 (3394, 62, 5) (3394,)
6_20130712 (3394, 62, 5) (3394,)
6_20131016 (3394, 62, 5) (3394,)
6_20131113 (3394, 62, 5) (3394,)
7_20131027 (3394, 62, 5) (3394,)
7_20131030 (3394, 62, 5) (3394,)
7_20131106 (3394, 62, 5) (3394,)
8_20140511 (3394, 62, 5) (3394,)
8_20140514 (3394, 62, 5) (3394,)
8_20140521 (3394, 62, 5) (3394,)
9_20140620 (3394, 62, 5) (3394,)
9_20140627 (3394, 62, 5) (3394,)
9_20140704 (3394, 62, 5) (3394,)
10_20131130 (3394, 62, 5) (3394,)
10_20131204 (3394, 62, 5) (3394,)
10_20131211 (3394, 62, 5) (3394,)
11_2014

In [6]:
def split_with_sub(data_dic, label_dic, path):
    """Merge per-recording arrays into per-subject arrays and save them as .mat files.

    Keys of ``data_dic`` / ``label_dic`` are expected to start with the subject
    id followed by an underscore. For subject ids 1..15, the arrays of all
    keys with that id are concatenated along axis 0 (exactly three keys per
    subject are required), the features are passed through ``extend_normal``,
    and the result is stored under ``'sub_<id>'``.

    :param data_dic: mapping from recording name to feature array.
    :param label_dic: mapping from recording name to label array; must have
        the same keys as ``data_dic``.
    :param path: output folder; created if missing. ``data_dic.mat`` and
        ``label_dic.mat`` are written there.
    :return: ``(data_sub_dic, label_sub_dic)`` keyed by ``'sub_<id>'``.
    """
    if not os.path.exists(path):
        os.makedirs(path)
    assert data_dic.keys() == label_dic.keys(), 'Data and label keys mismatch'

    data_sub_dic, label_sub_dic = {}, {}

    for sub_id in tqdm(range(1, 16), desc='Progress on DE features', ncols=100):
        # Recordings whose numeric prefix matches the current subject, in dict order.
        session_names = [
            name for name in data_dic.keys()
            if int(name.split('_')[0]) == sub_id
        ]
        assert(len(session_names) == 3), 'Feature list length mismatch'

        stacked_features = np.concatenate([data_dic[name] for name in session_names], axis=0)
        feature_array = extend_normal(stacked_features)
        label_array = np.concatenate([label_dic[name] for name in session_names], axis=0)

        sub_key = f'sub_{sub_id}'
        data_sub_dic[sub_key] = feature_array
        label_sub_dic[sub_key] = label_array

    # Persist both dictionaries as .mat files.
    sio.savemat(os.path.join(path, 'data_dic.mat'), data_sub_dic)
    sio.savemat(os.path.join(path, 'label_dic.mat'), label_sub_dic)
    return data_sub_dic, label_sub_dic

# Group the per-recording arrays by subject, save them under data_path,
# then list the resulting keys and array shapes.
data_sub_dic, label_sub_dic = split_with_sub(
    data_dic=de_vector_dict,
    label_dic=label_dict,
    path=data_path,
)
print(data_sub_dic.keys())
for key, subject_features in data_sub_dic.items():
    print(key, subject_features.shape, label_sub_dic[key].shape)

Progress on DE features: 100%|██████████████████████████████████████| 15/15 [00:02<00:00,  6.86it/s]


dict_keys(['sub_1', 'sub_2', 'sub_3', 'sub_4', 'sub_5', 'sub_6', 'sub_7', 'sub_8', 'sub_9', 'sub_10', 'sub_11', 'sub_12', 'sub_13', 'sub_14', 'sub_15'])
sub_1 (10182, 62, 5) (10182,)
sub_2 (10182, 62, 5) (10182,)
sub_3 (10182, 62, 5) (10182,)
sub_4 (10182, 62, 5) (10182,)
sub_5 (10182, 62, 5) (10182,)
sub_6 (10182, 62, 5) (10182,)
sub_7 (10182, 62, 5) (10182,)
sub_8 (10182, 62, 5) (10182,)
sub_9 (10182, 62, 5) (10182,)
sub_10 (10182, 62, 5) (10182,)
sub_11 (10182, 62, 5) (10182,)
sub_12 (10182, 62, 5) (10182,)
sub_13 (10182, 62, 5) (10182,)
sub_14 (10182, 62, 5) (10182,)
sub_15 (10182, 62, 5) (10182,)


In [7]:
# Raw string keeps the backslash literal; "\s" is not a valid escape sequence.
save_path = r"data\seed"
# Reload the saved per-subject files to check what was written.
data_dic = hdf5.loadmat(os.path.join(save_path, 'data_dic.mat'))
label_dic = hdf5.loadmat(os.path.join(save_path, 'label_dic.mat'))
for key in data_dic.keys():
    # Skip keys ending in a double underscore; only the 'sub_*' entries are printed.
    if key.endswith('__'): continue
    print(key, data_dic[key].shape, label_dic[key].shape)

sub_1 (10182, 62, 5) (1, 10182)
sub_2 (10182, 62, 5) (1, 10182)
sub_3 (10182, 62, 5) (1, 10182)
sub_4 (10182, 62, 5) (1, 10182)
sub_5 (10182, 62, 5) (1, 10182)
sub_6 (10182, 62, 5) (1, 10182)
sub_7 (10182, 62, 5) (1, 10182)
sub_8 (10182, 62, 5) (1, 10182)
sub_9 (10182, 62, 5) (1, 10182)
sub_10 (10182, 62, 5) (1, 10182)
sub_11 (10182, 62, 5) (1, 10182)
sub_12 (10182, 62, 5) (1, 10182)
sub_13 (10182, 62, 5) (1, 10182)
sub_14 (10182, 62, 5) (1, 10182)
sub_15 (10182, 62, 5) (1, 10182)


In [8]:
# The reloaded feature and label files should expose the same set of keys.
set(data_dic) == set(label_dic)

True

In [9]:
# Distinct label values of one subject. The key is named explicitly instead of
# reusing the loop variable left over from an earlier cell.
set(label_dic['sub_15'][0])

{0, 1, 2}

In [10]:
# Show the properties of CUDA device 0; evaluates to None (no output) on
# machines without CUDA instead of raising.
torch.cuda.get_device_properties(0) if torch.cuda.is_available() else None

_CudaDeviceProperties(name='NVIDIA TITAN Xp', major=6, minor=1, total_memory=12287MB, multi_processor_count=30)