# 01: Shenzhen Data Preparation

From: jodafons 

This notebook will be responsible to create a final spreadsheet with all images paths, hashes and clinical readings.

In [10]:
import pandas as pd
import numpy as np
import os, sys
import glob
import re
import hashlib
import pathlib

In [11]:
def expand_folder( path , extension):
    return glob.glob(path+'/*.'+extension)

In [12]:
def get_md5(path):
    return hashlib.md5(pathlib.Path(path).read_bytes()).hexdigest()

## Create my v1 table

In [13]:
# this is the location of the raw chenzen data
base_data_raw_path = '../data/Shenzhen/raw'
clinical_path = base_data_raw_path + '/clinical'
images_path = base_data_raw_path + '/images'

In [14]:
def prepare_my_table( clinical_path , images_path ):
    
    d = {
      'target'   : [],
      'image_ID' : [],
      'raw_image_path'     : [],
      'raw_image_md5'      : [],
      'age'      : [],
      'sex'      : [],
      'comment'  : [],
    }

    def treat_string( lines ):
        string = ''
        for s in lines:
            string+=s.replace('\n','').replace('\t','')
        return re.sub(' +', ' ', string)
    
    for idx, path in enumerate(expand_folder(clinical_path, 'txt')):
    
        with open(path,'r') as f:
        
            lines = f.readlines()
            sex = True if 'male' in lines[0] else False # 1 for male and 0 for female
            age = int(re.sub('\D', '', lines[0]))
            # get TB by file name (_1.txt is PTB or _0.txt is NTB)
            target = True if '_1.txt' in path else False
        
            filename = path.split('/')[-1]
            image_filename = filename.replace('txt','png')
            image_path = images_path+'/'+image_filename

            d['target'].append(target)
            d['age'].append(age)
            d['sex'].append(sex)
            d['raw_image_path'].append(image_path)
            d['raw_image_md5'].append(get_md5(image_path))
            d['comment'].append(treat_string(lines[1::]))
            d['image_ID'].append(idx)
   
            
    return pd.DataFrame(d)


df = prepare_my_table(clinical_path, images_path)
    

In [15]:
df.head()

Unnamed: 0,target,image_ID,raw_image_path,raw_image_md5,age,sex,comment
0,False,0,../data/Shenzhen/raw/images/CHNCXR_0099_0.png,14a18d4fc7465b66186efb5c9d0d9bea,26,True,normal
1,True,1,../data/Shenzhen/raw/images/CHNCXR_0635_1.png,3ba59bfea5469d26e9409d831bc52ab6,65,True,bilateral PTB
2,False,2,../data/Shenzhen/raw/images/CHNCXR_0320_0.png,68ebd9afd4b7670e78afb5147115af1d,38,True,normal
3,True,3,../data/Shenzhen/raw/images/CHNCXR_0571_1.png,39746a5dc737f0bcbfb8b91b0ecdc54f,36,True,Right PTB
4,True,4,../data/Shenzhen/raw/images/CHNCXR_0534_1.png,53a87376a697ae0e9d39b34056fe9f54,21,True,Right PTB


In [16]:
df.to_excel('Shenzhen_table_from_raw.xlsx')