# 01: Shenzhen Data Preparation

From: jodafons 

This notebook builds the final spreadsheet that lists, for every image, its path, its MD5 hash and the associated clinical readings.

In [1]:
import pandas as pd
import numpy as np
import os, sys
import glob
import re
import hashlib
import pathlib
from sklearn.model_selection import train_test_split

In [11]:
def expand_folder( path , extension):
    """Return the sorted paths of the files in `path` whose name ends in `.extension`.

    Only direct children of `path` are matched (the glob pattern is
    `path/*.extension`); an empty list is returned when nothing matches.
    """
    pattern = '{}/*.{}'.format(path, extension)
    return sorted(glob.glob(pattern))

In [3]:
def get_md5(path):
    """Return the hexadecimal MD5 digest of the bytes stored in the file at `path`."""
    digest = hashlib.md5()
    # The whole file is read in one go and fed to the hash.
    with open(path, 'rb') as stream:
        digest.update(stream.read())
    return digest.hexdigest()

## Create my v1 table

In [4]:
# Root folder of the raw Shenzhen data; the clinical readings and the images
# are looked up in its 'clinical' and 'images' sub-folders.
base_data_raw_path = '/home/jodafons/public/brics_data/Shenzhen/raw'
clinical_path, images_path = (
    base_data_raw_path + '/' + sub_folder for sub_folder in ('clinical', 'images')
)


In [13]:
def prepare_my_table( clinical_path , images_path ):
    """Build a table with one row per clinical reading found in `clinical_path`.

    Every file returned by `expand_folder(clinical_path, 'txt')` is parsed:
    the first line supplies the sex and the age, the remaining lines form the
    free-text comment, and the file name supplies both the image ID and the
    label (a name ending in `_1.txt` gives target 1, anything else target 0).
    The matching image is looked up at `images_path/<image_ID>.png` and hashed
    with `get_md5`.

    Parameters
    ----------
    clinical_path : str
        Folder holding the `*.txt` clinical readings.
    images_path : str
        Folder holding the `*.png` images named after the readings.

    Returns
    -------
    pandas.DataFrame
        Columns, in order: target, image_ID, raw_image_path, raw_image_md5,
        age, sex, comment. The frame has these columns even when no clinical
        file is found.

    Raises
    ------
    ValueError
        If a clinical file is empty, or if its first line contains no digit
        to read the age from. Errors raised by `get_md5` (for instance a
        missing image) propagate unchanged.
    """
    columns = ['target', 'image_ID', 'raw_image_path', 'raw_image_md5',
               'age', 'sex', 'comment']

    def parse_sex( header ):
        # 'female' contains 'male', so the longer word must be tested first;
        # the comparison is case-insensitive so 'Male'/'Female' also match.
        lowered = header.lower()
        if 'female' in lowered:
            return 'female'
        if 'male' in lowered:
            return 'male'
        # Neither word present: leave the value missing instead of guessing,
        # so malformed headers stay visible in the table.
        return None

    def treat_string( lines ):
        # Collapse every run of whitespace (spaces, tabs, line breaks) into a
        # single space so that words on consecutive lines are not glued.
        return ' '.join(' '.join(lines).split())

    records = []
    for path in expand_folder(clinical_path, 'txt'):

        # Read the text first and close the file before hashing the image.
        with open(path, 'r') as f:
            lines = f.readlines()
        if not lines:
            raise ValueError('empty clinical file: ' + path)

        header = lines[0]
        filename = os.path.basename(path)
        image_id = os.path.splitext(filename)[0]
        image_path = images_path + '/' + image_id + '.png'

        records.append({
            # get TB by file name (_1.txt is PTB or _0.txt is NTB)
            'target'         : 1 if filename.endswith('_1.txt') else 0,
            'image_ID'       : image_id,
            'raw_image_path' : image_path,
            'raw_image_md5'  : get_md5(image_path),
            # All digits of the first line are concatenated into the age.
            # NOTE(review): assumes a single number expressed in years -- confirm.
            'age'            : int(re.sub(r'\D', '', header)),
            'sex'            : parse_sex(header),
            'comment'        : treat_string(lines[1:]),
        })

    return pd.DataFrame(records, columns=columns)


# Assemble the full table from the raw clinical readings and images.
df = prepare_my_table(clinical_path=clinical_path, images_path=images_path)
    

In [14]:
# Preview the first rows of the assembled table as a quick sanity check.
df.head()

Unnamed: 0,target,image_ID,raw_image_path,raw_image_md5,age,sex,comment
0,0,CHNCXR_0001_0,/home/jodafons/public/brics_data/Shenzhen/raw/...,313e3db7e5f03c88d08a4485c364f370,45,male,normal
1,0,CHNCXR_0002_0,/home/jodafons/public/brics_data/Shenzhen/raw/...,3d5fcc07713143d414802fcc9cb86b2e,63,male,normal
2,0,CHNCXR_0003_0,/home/jodafons/public/brics_data/Shenzhen/raw/...,404cfbaf320875f617a810b7c075a813,48,male,normal
3,0,CHNCXR_0004_0,/home/jodafons/public/brics_data/Shenzhen/raw/...,0e7d2065cbbc08ca13fc2e8881e01096,58,male,normal
4,0,CHNCXR_0005_0,/home/jodafons/public/brics_data/Shenzhen/raw/...,d4acb116ed926f64a805447a65132e93,28,male,normal


In [7]:
# Save the complete table (the row index is written as the first CSV column).
# NOTE(review): this writes under 'bric_data' while the raw data is read from
# 'brics_data' -- confirm that the different root folder is intentional.
df.to_csv(path_or_buf='/home/jodafons/public/bric_data/Shenzhen/raw/Shenzhen_table_from_raw.csv')

In [8]:
# Partition the table by label so each class can be split independently:
# target == 1 rows come from the *_1.txt readings, target == 0 from the others.
df_tb = df[df['target'] == 1]
df_notb = df[df['target'] == 0]

In [14]:
# Hold out 20% of the target == 1 rows as a test set; the fixed random_state
# makes the shuffled split repeatable.
train_df, test_df = train_test_split(df_tb, shuffle=True, random_state=512, test_size=0.2)
print(train_df.shape, test_df.shape, sep='\n')
# The file without 'test' in its name receives the training portion.
train_df.to_csv(path_or_buf='/home/jodafons/public/bric_data/Shenzhen/raw/Shenzhen_table_from_raw_tb_only.csv')
test_df.to_csv(path_or_buf='/home/jodafons/public/bric_data/Shenzhen/raw/Shenzhen_test_table_from_raw_tb_only.csv')

(268, 7)
(68, 7)


In [15]:
# Hold out 20% of the target == 0 rows as a test set; the fixed random_state
# makes the shuffled split repeatable.
# NOTE: train_df / test_df are reassigned here, replacing any earlier split
# stored under the same names.
train_df, test_df = train_test_split(df_notb, shuffle=True, random_state=512, test_size=0.2)
print(train_df.shape, test_df.shape, sep='\n')
# The file without 'test' in its name receives the training portion.
train_df.to_csv(path_or_buf='/home/jodafons/public/bric_data/Shenzhen/raw/Shenzhen_table_from_raw_notb_only.csv')
test_df.to_csv(path_or_buf='/home/jodafons/public/bric_data/Shenzhen/raw/Shenzhen_test_table_from_raw_notb_only.csv')

(260, 7)
(66, 7)
