In [8]:
import pandas as pd
import numpy as np
import os, sys
import glob
import re
import hashlib
import pathlib
from tqdm import tqdm
#from sklearn.model_selection import train_test_split
from rxwgan.core import stratified_train_val_test_splits
import warnings
warnings.filterwarnings("ignore")

In [9]:
def expand_folder( path , extension):
    """List the files directly inside ``path`` whose name ends in ``.<extension>``.

    The lookup is a non-recursive glob on ``<path>/*.<extension>`` and the
    matches are returned as a list sorted in ascending string order.
    """
    pattern = path + '/*.' + extension
    return sorted(glob.glob(pattern))

In [10]:
def get_md5(path):
    """Return the hexadecimal MD5 digest of the file stored at ``path``.

    The file is opened in binary mode and fed to the hash in 1 MiB chunks, so
    memory use does not grow with the file size.

    Parameters
    ----------
    path : str or path-like
        Location of the file to hash.

    Returns
    -------
    str
        The 32-character lowercase hexadecimal digest.

    Raises
    ------
    OSError
        If the file cannot be opened or read.
    """
    digest = hashlib.md5()
    with open(path, 'rb') as stream:
        # iter(callable, sentinel) keeps reading until read() returns b'' (end of file).
        for chunk in iter(lambda: stream.read(1 << 20), b''):
            digest.update(chunk)
    return digest.hexdigest()


In [14]:
# Location of the raw Shenzhen data: one sub-folder for the clinical
# reports and one for the images.
base_data_raw_path = '/home/joao.pinto/public/brics_data/Shenzhen/raw'
clinical_path = '/'.join((base_data_raw_path, 'clinical'))
images_path = '/'.join((base_data_raw_path, 'images'))

In [15]:
def prepare_my_table( clinical_path , images_path ):
    """Build a table with one row per clinical report found in ``clinical_path``.

    Every ``<image_ID>.txt`` report is parsed and paired with the image
    ``<images_path>/<image_ID>.png``, whose MD5 digest is stored alongside.

    Parameters
    ----------
    clinical_path : str
        Folder containing the ``*.txt`` clinical reports.
    images_path : str
        Folder containing the matching ``*.png`` images.

    Returns
    -------
    pandas.DataFrame
        Columns ``target``, ``image_ID``, ``raw_image_path``,
        ``raw_image_md5``, ``age``, ``sex`` and ``comment``; rows follow the
        order returned by ``expand_folder`` (sorted report paths).

    Raises
    ------
    IndexError
        If a report file is empty (there is no first line to parse).
    ValueError
        If the first line of a report contains no digits to use as the age.

    Errors raised by ``get_md5`` when an image cannot be read also propagate.
    """

    d = {
      'target'   : [],
      'image_ID' : [],
      'raw_image_path'     : [],
      'raw_image_md5'      : [],
      'age'      : [],
      'sex'      : [],
      'comment'  : [],
    }

    def treat_string( lines ):
        # Glue the lines together without newlines/tabs, then squeeze runs of spaces.
        string = ''.join(s.replace('\n','').replace('\t','') for s in lines)
        return re.sub(' +', ' ', string)

    for path in expand_folder(clinical_path, 'txt'):

        with open(path,'r') as f:
            lines = f.readlines()

        # The first line carries sex and age; the remaining lines are the free-text comment.
        header = lines[0].lower()
        # 'male' is a substring of 'female', so a plain "'male' in header" test
        # would label every patient as male. Anything that is not clearly male
        # keeps the historical fallback value 'female'.
        sex = 'male' if ('male' in header and 'female' not in header) else 'female'
        # Age is every digit found on the first line, read as a single integer.
        age = int(re.sub(r'\D', '', lines[0]))

        filename = os.path.basename(path)
        image_id = os.path.splitext(filename)[0]
        # get TB by file name (_1.txt is PTB or _0.txt is NTB)
        target = 1 if image_id.endswith('_1') else 0

        # Swap only the extension so a 'txt' elsewhere in the name is left untouched.
        image_path = images_path + '/' + image_id + '.png'

        d['target'].append(target)
        d['age'].append(age)
        d['sex'].append(sex)
        d['raw_image_path'].append(image_path)
        d['raw_image_md5'].append(get_md5(image_path))
        d['comment'].append(treat_string(lines[1:]))
        d['image_ID'].append(image_id)

    return pd.DataFrame(d)


# Parse every clinical report into a table and tag all rows with the
# name of the dataset they come from.
df = prepare_my_table(clinical_path, images_path).assign(run='Shenzhen')


In [16]:
# Preview the first rows of the parsed table as a quick sanity check.
df.head()


Unnamed: 0,target,image_ID,raw_image_path,raw_image_md5,age,sex,comment,run
0,0,CHNCXR_0001_0,/home/joao.pinto/public/brics_data/Shenzhen/ra...,313e3db7e5f03c88d08a4485c364f370,45,male,normal,Shenzhen
1,0,CHNCXR_0002_0,/home/joao.pinto/public/brics_data/Shenzhen/ra...,3d5fcc07713143d414802fcc9cb86b2e,63,male,normal,Shenzhen
2,0,CHNCXR_0003_0,/home/joao.pinto/public/brics_data/Shenzhen/ra...,404cfbaf320875f617a810b7c075a813,48,male,normal,Shenzhen
3,0,CHNCXR_0004_0,/home/joao.pinto/public/brics_data/Shenzhen/ra...,0e7d2065cbbc08ca13fc2e8881e01096,58,male,normal,Shenzhen
4,0,CHNCXR_0005_0,/home/joao.pinto/public/brics_data/Shenzhen/ra...,d4acb116ed926f64a805447a65132e93,28,male,normal,Shenzhen


In [23]:
# Save the raw table in the current working directory. The DataFrame index is
# written as the first (unnamed) CSV column.
# Alternative destination kept for reference:
#   /home/jodafons/public/brics_data/Shenzhen/raw/Shenzhen_table_from_raw.csv
raw_table_csv = 'user.joao.pinto_Shenzhen_table_from_raw.csv'
df.to_csv(raw_table_csv)


# Split K-Folds

In [19]:

# Seed handed to the splitter (presumably to make the splits reproducible — confirm in rxwgan.core).
seed = 512
# NOTE(review): the second argument (10) looks like the number of bins/folds; the
# output printed by this call lists bins 0-9 and 9 inner iterations per test bin.
# The result is later indexed as splits[test][sort][0|1|2] — confirm the exact
# layout against stratified_train_val_test_splits.
splits = stratified_train_val_test_splits(df,10,seed)


bins selected for val: [1, 2, 3, 4, 5, 6, 7, 8, 9]


100%|██████████████████████████████████████████████| 9/9 [00:00<00:00, 2132.70it/s]


bins selected for val: [0, 2, 3, 4, 5, 6, 7, 8, 9]


100%|██████████████████████████████████████████████| 9/9 [00:00<00:00, 3767.71it/s]


bins selected for val: [0, 1, 3, 4, 5, 6, 7, 8, 9]


100%|██████████████████████████████████████████████| 9/9 [00:00<00:00, 4373.12it/s]


bins selected for val: [0, 1, 2, 4, 5, 6, 7, 8, 9]


100%|██████████████████████████████████████████████| 9/9 [00:00<00:00, 4393.47it/s]


bins selected for val: [0, 1, 2, 3, 5, 6, 7, 8, 9]


100%|██████████████████████████████████████████████| 9/9 [00:00<00:00, 4571.73it/s]


bins selected for val: [0, 1, 2, 3, 4, 6, 7, 8, 9]


100%|██████████████████████████████████████████████| 9/9 [00:00<00:00, 4566.20it/s]


bins selected for val: [0, 1, 2, 3, 4, 5, 7, 8, 9]


100%|██████████████████████████████████████████████| 9/9 [00:00<00:00, 4675.35it/s]


bins selected for val: [0, 1, 2, 3, 4, 5, 6, 8, 9]


100%|██████████████████████████████████████████████| 9/9 [00:00<00:00, 4698.04it/s]


bins selected for val: [0, 1, 2, 3, 4, 5, 6, 7, 9]


100%|██████████████████████████████████████████████| 9/9 [00:00<00:00, 4712.11it/s]


bins selected for val: [0, 1, 2, 3, 4, 5, 6, 7, 8]


100%|██████████████████████████████████████████████| 9/9 [00:00<00:00, 4694.53it/s]


In [20]:
# Flatten all (test, sort) splits into one long table: the rows selected by
# each index list are copied out of `df` and tagged with the split ids and
# the partition they belong to.
n_tests, n_sorts = 10, 9  # number of test bins and of sorts per test bin read from `splits`
partitions = ('train', 'val', 'test')  # position of each index list inside splits[test][sort]

tagged_frames = []
for test in range(n_tests):
    for sort in range(n_sorts):
        for position, dataset in enumerate(partitions):
            indices = splits[test][sort][position]
            # .assign returns a new frame, so the slice taken from `df` is never
            # written to (avoids pandas' SettingWithCopyWarning).
            tagged_frames.append(
                df.iloc[indices].assign(test=test, sort=sort, dataset=dataset)
            )

# One concat at the end instead of re-copying a growing frame on every iteration.
df_splitted = pd.concat(tagged_frames)
df_splitted['type'] = 'real'

In [22]:
df_splitted.to_csv('user.joao.pinto_Shenzhen_table_from_raw_splitted.csv')
