In [1]:
import numpy as np
import pandas as pd
import os
from skimage import io,transform

# Root folder of the dataset. Defaults to the original hard-coded location but can
# be overridden with the NIH_XRAY_DIR environment variable, so the notebook can
# run on another machine without editing this cell.
folder = os.environ.get(
    'NIH_XRAY_DIR',
    r'C:\Users\arung\OneDrive\Desktop\COVID 19 Chest Xray\nih',
)

# Text files holding one source image path per line, one list per class.
trainNormalFileName = folder + r'\normaltrainfiles.txt'
trainOtherFileName = folder + r'\othertrainfiles.txt'
trainPneumoniaFileName = folder + r'\pneumoniatrainfiles.txt'

# Output roots; the later cells write into a per-class sub-folder of each.
trainFileDestination = folder + r'\train'
valFileDestination = folder + r'\val'

# Target number of images per class for each split. The split cells may exceed
# these slightly because they only stop when the patient ID changes.
TRAIN_NUM = 5000
VAL_NUM = 2000

# Side length, in pixels, of the square images produced by preprocess().
SIZE = 512


In [2]:
def preprocess(img):
    """Convert an image to a SIZE x SIZE, 3-plane uint8 array scaled to 0-255.

    Parameters
    ----------
    img : numpy.ndarray
        Either a 2-D image, or a multi-plane image of which only the first
        plane (``img[:, :, 0]``) is used.

    Returns
    -------
    numpy.ndarray
        ``uint8`` array of shape ``(SIZE, SIZE, 3)`` whose three planes are
        identical copies of the min-max normalized, resized input plane.
    """
    plane = img if len(img.shape) == 2 else img[:, :, 0]

    # Work on an explicit float32 copy so the in-place arithmetic below can
    # never modify the array the caller passed in.
    img_gray = np.array(plane, dtype=np.float32)

    # Min-max normalize to [0, 1].
    img_gray -= np.amin(img_gray)
    peak = np.amax(img_gray)
    if peak > 0:
        img_gray /= peak
    # else: the image is constant (all zeros after the shift); dividing would
    # produce NaNs, so it is left as an all-zero image.

    # Resize, then scale to 0-255 and replicate into three planes.
    img_gray = transform.resize(img_gray, (SIZE, SIZE))
    img_gray = np.uint8(255 * img_gray)
    return np.stack((img_gray, img_gray, img_gray), axis=2)


In [3]:

def read_file_list(listFileName):
    """Return the non-empty lines of the text file `listFileName` as a list.

    The file is closed even if reading fails. Empty lines are skipped, so a
    trailing newline yields no bogus entry, and a file WITHOUT a trailing
    newline does not lose its last entry.
    """
    with open(listFileName, 'r') as listFile:
        return [line for line in listFile.read().split('\n') if line]


normalFileList = read_file_list(trainNormalFileName)
otherFileList = read_file_list(trainOtherFileName)
pneumoniaFileList = read_file_list(trainPneumoniaFileName)

# Number of source images available per class.
print(len(normalFileList))
print(len(otherFileList))
print(len(pneumoniaFileList))

50500
35790
234


In [4]:
# Normal images
#
# Walks normalFileList in order. At least TRAIN_NUM images (if available) are
# preprocessed and saved under the training folder, then at least VAL_NUM of
# the following images under the validation folder. Each split only stops when
# the patient ID (file-name prefix before the first '_') changes, so one
# patient's images are not divided between training and validation.

trainNormalDir = trainFileDestination + r'\normal'
valNormalDir = valFileDestination + r'\normal'
# io.imsave needs the target folders to exist.
os.makedirs(trainNormalDir, exist_ok=True)
os.makedirs(valNormalDir, exist_ok=True)

print('Training...')
tcount = 0
for file in normalFileList:
    img = preprocess(io.imread(file))
    fileName = file.split('\\')[-1]
    saveFileName = trainNormalDir + '\\' + fileName
    io.imsave(saveFileName,img)
    tcount +=1
    print('\r',tcount,'of',TRAIN_NUM,end='')
    # The bounds check prevents an IndexError when the quota is reached on the
    # last entry of the list; in that case the loop simply ends on its own.
    if tcount >= TRAIN_NUM and tcount < len(normalFileList):
        patientID = fileName.split('_')[0]
        nextImagePatientID = (normalFileList[tcount].split('\\')[-1]).split('_')[0]
        if patientID != nextImagePatientID:
            break

print()
print('Validation...')

# Validation continues from the first image not used for training.
vcount = tcount
for index in range(vcount,len(normalFileList)):
    file = normalFileList[index]
    img = preprocess(io.imread(file))
    fileName = file.split('\\')[-1]
    saveFileName = valNormalDir + '\\' + fileName
    io.imsave(saveFileName,img)
    vcount +=1
    print('\r',vcount-tcount,'of',VAL_NUM,end='')
    # Same patient-boundary rule and end-of-list guard as the training loop.
    if vcount-tcount >= VAL_NUM and vcount < len(normalFileList):
        patientID = fileName.split('_')[0]
        nextImagePatientID = (normalFileList[vcount].split('\\')[-1]).split('_')[0]
        if patientID != nextImagePatientID:
            break

Training...
 5004 of 5000
Validation...
 2004 of 2000

In [5]:
# Other disease images
#
# Walks otherFileList in order. At least TRAIN_NUM images (if available) are
# preprocessed and saved under the training folder, then at least VAL_NUM of
# the following images under the validation folder. Each split only stops when
# the patient ID (file-name prefix before the first '_') changes, so one
# patient's images are not divided between training and validation.

trainOtherDir = trainFileDestination + r'\other'
valOtherDir = valFileDestination + r'\other'
# io.imsave needs the target folders to exist.
os.makedirs(trainOtherDir, exist_ok=True)
os.makedirs(valOtherDir, exist_ok=True)

print('Training...')
tcount = 0
for file in otherFileList:
    img = preprocess(io.imread(file))
    fileName = file.split('\\')[-1]
    saveFileName = trainOtherDir + '\\' + fileName
    io.imsave(saveFileName,img)
    tcount +=1
    print('\r',tcount,'of',TRAIN_NUM,end='')
    # The bounds check prevents an IndexError when the quota is reached on the
    # last entry of the list; in that case the loop simply ends on its own.
    if tcount >= TRAIN_NUM and tcount < len(otherFileList):
        patientID = fileName.split('_')[0]
        nextImagePatientID = (otherFileList[tcount].split('\\')[-1]).split('_')[0]
        if patientID != nextImagePatientID:
            break

print()
print('Validation...')

# Validation continues from the first image not used for training.
vcount = tcount
for index in range(vcount,len(otherFileList)):
    file = otherFileList[index]
    img = preprocess(io.imread(file))
    fileName = file.split('\\')[-1]
    saveFileName = valOtherDir + '\\' + fileName
    io.imsave(saveFileName,img)
    vcount +=1
    print('\r',vcount-tcount,'of',VAL_NUM,end='')
    # Same patient-boundary rule and end-of-list guard as the training loop.
    if vcount-tcount >= VAL_NUM and vcount < len(otherFileList):
        patientID = fileName.split('_')[0]
        nextImagePatientID = (otherFileList[vcount].split('\\')[-1]).split('_')[0]
        if patientID != nextImagePatientID:
            break


Training...
 5000 of 5000
Validation...
 2002 of 2000

In [6]:
# Pneumonia images
#
# Walks pneumoniaFileList in order. At least TRAIN_NUM images (if available)
# are preprocessed and saved under the training folder, then at least VAL_NUM
# of the following images under the validation folder. Each split only stops
# when the patient ID (file-name prefix before the first '_') changes, so one
# patient's images are not divided between training and validation.
#
# NOTE(review): when the list holds fewer than TRAIN_NUM entries, every image
# goes to training and the validation loop below has nothing left to process.
# Confirm whether an empty pneumonia validation set is intended.

trainPneumoniaDir = trainFileDestination + r'\pneumonia'
valPneumoniaDir = valFileDestination + r'\pneumonia'
# io.imsave needs the target folders to exist.
os.makedirs(trainPneumoniaDir, exist_ok=True)
os.makedirs(valPneumoniaDir, exist_ok=True)

print('Training...')
tcount = 0
for file in pneumoniaFileList:
    img = preprocess(io.imread(file))
    fileName = file.split('\\')[-1]
    saveFileName = trainPneumoniaDir + '\\' + fileName
    io.imsave(saveFileName,img)
    tcount +=1
    print('\r',tcount,'of',TRAIN_NUM,end='')
    # The bounds check prevents an IndexError when the quota is reached on the
    # last entry of the list; in that case the loop simply ends on its own.
    if tcount >= TRAIN_NUM and tcount < len(pneumoniaFileList):
        patientID = fileName.split('_')[0]
        nextImagePatientID = (pneumoniaFileList[tcount].split('\\')[-1]).split('_')[0]
        if patientID != nextImagePatientID:
            break

print()
print('Validation...')

# Validation continues from the first image not used for training.
vcount = tcount
for index in range(vcount,len(pneumoniaFileList)):
    file = pneumoniaFileList[index]
    img = preprocess(io.imread(file))
    fileName = file.split('\\')[-1]
    saveFileName = valPneumoniaDir + '\\' + fileName
    io.imsave(saveFileName,img)
    vcount +=1
    print('\r',vcount-tcount,'of',VAL_NUM,end='')
    # Same patient-boundary rule and end-of-list guard as the training loop.
    if vcount-tcount >= VAL_NUM and vcount < len(pneumoniaFileList):
        patientID = fileName.split('_')[0]
        nextImagePatientID = (pneumoniaFileList[vcount].split('\\')[-1]).split('_')[0]
        if patientID != nextImagePatientID:
            break


Training...
 234 of 5000
Validation...
