In [13]:
import os
import numpy as np
import pandas as pd
import glob
import random
import json

In [14]:
# Collect the patient names of every disease class found under the
# preprocessed data root.
data_path = '/mnt/ExtData/pahsos/Data/preprocessed'
cls = ['BCs', 'HBV', 'PA-HSOS']


def list_patient_names(class_dir):
    """Return the sorted base names of all entries directly inside ``class_dir``.

    Args:
        class_dir: Path of one disease-class folder (a trailing separator
            is accepted).

    Returns:
        A list of plain ``str`` names in ascending order. The list is empty
        when the folder has no visible entries or does not exist, because
        ``glob`` simply matches nothing in that case.
    """
    entries = glob.glob(os.path.join(class_dir, '*'))
    # All entries share the same parent, so sorting the base names gives the
    # same order as sorting the full paths.
    return sorted(os.path.basename(entry) for entry in entries)


# One listing per class, keyed by the class folder name.
patients_by_class = {
    disease: list_patient_names(os.path.join(data_path, disease))
    for disease in cls
}

pahsos_patient_list = patients_by_class['PA-HSOS']
hbv_patient_list = patients_by_class['HBV']
bcs_patient_list = patients_by_class['BCs']

# PA-HSOS is reported as the positive group; HBV and BCs together as negative.
print(f'pos patient count {len(pahsos_patient_list)}  neg patient count {len(hbv_patient_list)+len(bcs_patient_list)}')
print(f'pahsos : {len(pahsos_patient_list)}, HBV : {len(hbv_patient_list)}, BCs : {len(bcs_patient_list)}')

pos patient count 39  neg patient count 148
pahsos : 39, HBV : 97, BCs : 51


按照 8:2 的比例，将每个类别的患者分别划分为训练集和测试集

In [15]:
# Decide how many patients of each class go to the test set: the class size
# times test_ratio, rounded with Python's built-in round().
test_ratio = 0.2

test_pahsos_num = round(len(pahsos_patient_list) * test_ratio)
test_hbv_num = round(len(hbv_patient_list) * test_ratio)
test_bcs_num = round(len(bcs_patient_list) * test_ratio)

# Report test/total per class, grouped by label (1 = PA-HSOS, 0 = HBV and BCs).
print(f'label 1 : test/total : pahsos {test_pahsos_num}/{len(pahsos_patient_list)}')
print(f'label 0 : test/total : hbv {test_hbv_num}/{len(hbv_patient_list)}, bcs {test_bcs_num}/{len(bcs_patient_list)}')


label 1 : test/total : pahsos 8/39
label 0 : test/total : hbv 19/97, bcs 10/51


In [16]:
# Fix the seed so the sampled test indices are the same on every run.
random.seed(147)

# Draw, without replacement, the positions of the test patients inside each
# class list. All three draws consume the same seeded generator, so the
# order of the calls (PA-HSOS, then BCs, then HBV) is part of the result.
test_pahsos_index = random.sample(range(len(pahsos_patient_list)), test_pahsos_num)
test_bcs_index = random.sample(range(len(bcs_patient_list)), test_bcs_num)
test_hbv_index = random.sample(range(len(hbv_patient_list)), test_hbv_num)

print(f'test index : pahsos  {test_pahsos_index}, bcs {test_bcs_index}, hbv {test_hbv_index}')


test index : pahsos  [38, 9, 25, 28, 35, 11, 19, 26], bcs [29, 45, 6, 50, 22, 4, 26, 46, 43, 21], hbv [92, 81, 75, 25, 30, 24, 7, 85, 58, 16, 64, 9, 38, 95, 29, 96, 4, 23, 36]


In [18]:
# Build the per-patient train/test assignment from the sampled test indices
# and save it as a CSV table and a JSON file.

data_path = '/mnt/ExtData/pahsos/Data/preprocessed'
cls = ['BCs', 'HBV', 'PA-HSOS']
csv_save_path = '/mnt/ExtData/pahsos/Data'
json_save_path = '/mnt/ExtData/pahsos/Data'


def assign_split(disease, patient_names, test_indices, label):
    """Split the patients of one class into train and test records.

    Args:
        disease: Class name. It is stored as the ``type`` of every record
            and equals the name of the folder the data is read from.
        patient_names: Patient names, in the order ``test_indices`` refers to.
        test_indices: Positions in ``patient_names`` assigned to the test set.
        label: Label stored with every record of this class.

    Returns:
        A tuple ``(rows, train_records, test_records)``. ``rows`` holds one
        ``[disease, patient_name, label, 'train' or 'test']`` list per
        patient; the two record lists hold dicts with the keys ``type``,
        ``patient_name`` and ``label``.

    Raises:
        ValueError: If a test index does not address any patient, which
            means the folder listing no longer matches the sampled indices.
    """
    held_out = set(test_indices)
    stale = sorted(i for i in held_out if not 0 <= i < len(patient_names))
    if stale:
        raise ValueError(
            f'{disease}: test indices {stale} are out of range for '
            f'{len(patient_names)} patients'
        )

    rows, train_records, test_records = [], [], []
    for position, name in enumerate(patient_names):
        subset = 'test' if position in held_out else 'train'
        rows.append([disease, name, label, subset])
        record = {'type': disease, 'patient_name': name, 'label': label}
        (test_records if subset == 'test' else train_records).append(record)
    return rows, train_records, test_records


# Label and sampled test positions of every class (PA-HSOS is label 1).
split_spec = {
    'PA-HSOS': (1, test_pahsos_index),
    'HBV': (0, test_hbv_index),
    'BCs': (0, test_bcs_index),
}

total_list = []
train = []
test = []
split_sizes = {}

for disease in cls:
    label, test_indices = split_spec[disease]
    # Sorted listing of the class folder; the test indices are positions in
    # this sorted order.
    patient_names = sorted(
        os.path.basename(entry)
        for entry in glob.glob(os.path.join(data_path, disease, '*'))
    )
    rows, train_part, test_part = assign_split(disease, patient_names, test_indices, label)
    total_list.extend(rows)
    train.extend(train_part)
    test.extend(test_part)
    split_sizes[disease] = (len(train_part), len(test_part))

train_pahsos, test_pahsos = split_sizes['PA-HSOS']
train_hbv, test_hbv = split_sizes['HBV']
train_bcs, test_bcs = split_sizes['BCs']

df1 = pd.DataFrame(data=total_list, columns=['type', 'patient_name', 'label', 'train/test'])
df1.to_csv(f'{csv_save_path}/data_split.csv',index=False)

# Open with 'w' rather than 'a+': appending on every re-run would stack
# several JSON objects in one file, which json.load() cannot parse and which
# would no longer match the (overwritten) CSV.
with open(f'{json_save_path}/data_split.json','w') as f:
    json_info = {
        'train':train,
        'test':test
    }
    json.dump(json_info, f)
    f.write("\n")

print(f'train_pahsos : {train_pahsos}, train_hbv : {train_hbv}, train_bcs : {train_bcs}')
print(f'test_pahsos {test_pahsos}, test_hbv {test_hbv}, test_bcs {test_bcs}')

train_pahsos : 31, train_hbv : 78, train_bcs : 41
test_pahsos 8, test_hbv 19, test_bcs 10
