In [35]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from helpers import *
import glob
from PIL import Image

In [6]:
# Project root that holds CT_COVID/, CT_NonCOVID/ and the *-MetaInfo.csv files; every
# relative path used below is resolved against it. Set the CT_PROJECT_DIR environment
# variable to run the notebook on another machine; the original location stays the default.
PROJECT_DIR = os.environ.get('CT_PROJECT_DIR', '/Users/chloehe/Dropbox (Personal)/Final Project')
os.chdir(PROJECT_DIR)

In [7]:
# IPython automagic (%pwd): show the current working directory to confirm the chdir above.
pwd

'/Users/chloehe/Dropbox (Personal)/Final Project'

## Process the images

In [36]:
# List the entry names found in each class folder (relative to the working directory).
# NOTE(review): both names are rebound to lists of decoded image arrays by the loading
# cells further down; the file-name listings are what the debug cells near the end
# (which prepend the folder name to each entry) appear to expect — confirm.
covid_list = os.listdir('CT_COVID')
nocovid_list = os.listdir('CT_NonCOVID')

In [22]:
# checking: test=plt.imread('CT_COVID/2020.03.21.20040691-p18-67-5.png')

In [37]:
# Read every COVID CT scan (PNG) into a list of image arrays.
# glob.glob() returns paths in arbitrary, filesystem-dependent order, so the paths are
# sorted to make the row order of everything derived from this list reproducible.
# NOTE(review): later cells attach the metadata CSV rows to these images purely by
# position. Sorting makes the order deterministic, but it does not by itself prove
# the order matches the CSV rows — verify against the file-name column.
covid_list = [plt.imread(filename) for filename in sorted(glob.glob('CT_COVID/*.png'))]

In [38]:
# Save a PNG copy of every non-COVID JPEG next to the original, so the whole class
# can be loaded with a single '*.png' pattern in the next cell.
for filename in sorted(glob.glob('CT_NonCOVID/*.jpg')):
    # splitext removes only the final extension; split(".jpg")[0] would cut the
    # path at the first ".jpg" found anywhere in it.
    prefix = os.path.splitext(filename)[0]
    # The context manager closes the file handle that Image.open keeps open.
    with Image.open(filename) as im1:
        im1.save(prefix + '.png')

In [39]:
# Read every non-COVID CT scan (PNG, including the copies converted above) into a
# list of image arrays.
# plt.imread's second positional parameter is `format` (a file-type string), not a
# grayscale flag as in cv2.imread(path, 0). Passing 0 made matplotlib treat these
# files as a non-PNG format, so they could be decoded differently from the COVID
# scans, which are read with the default. Both classes now go through the same call.
# Paths are sorted because glob.glob() returns them in arbitrary order and later
# cells pair the rows with the metadata CSV by position.
nocovid_list = [plt.imread(filename) for filename in sorted(glob.glob('CT_NonCOVID/*.png'))]

The steps below:
1. Convert each image to greyscale if it is not already greyscale
2. Rotate every image to landscape orientation
3. Crop all images to the same dimensions -- the smallest dimensions found
4. Extract the gradient
5. Form the data frame

In [40]:
# Find the smallest height and width among the COVID scans; these are the common
# crop size used by the processing cells below.
# Sizes are taken from the list itself instead of a hard-coded count of 349, which
# raised IndexError with more images and left None entries (so min() failed) with fewer.
# Each image is measured in landscape orientation (short side = height, long side =
# width) because the processing cells rotate portrait images with np.rot90 before
# cropping; measuring the raw shapes could report a height larger than a rotated
# image actually has.
a = np.array([min(im.shape[:2]) for im in covid_list])   # landscape heights
b = np.array([max(im.shape[:2]) for im in covid_list])   # landscape widths

min_length = int(a.min())
min_width = int(b.min())
print("the minimum length for covid pictures is",min_length)
print("the minimum width for covid pictures is",min_width)

the minimum length for covid pictures is 61
the minimum width for covid pictures is 115


In [41]:
# Find the smallest height and width among the non-COVID scans.
# Sizes are taken from the list itself instead of a hard-coded count of 397, which
# raised IndexError with more images and left None entries (so min() failed) with fewer.
# Each image is measured in landscape orientation (short side = height, long side =
# width) because the processing cells rotate portrait images with np.rot90 before cropping.
# NOTE(review): the processing cells crop both classes with the COVID minima; these
# two values are only printed in the visible notebook.
c = np.array([min(im.shape[:2]) for im in nocovid_list])   # landscape heights
d = np.array([max(im.shape[:2]) for im in nocovid_list])   # landscape widths

min_length_non_covid = int(c.min())
min_width_non_covid = int(d.min())
print("the minimum length for non-covid pictures is",min_length_non_covid)
print("the minimum width for non-covid pictures is",min_width_non_covid)

the minimum length for non-covid pictures is 102
the minimum width for non-covid pictures is 137


In [42]:
# Turn every COVID scan into one feature vector (one row of covid_processed).
# rgb2gray, cropR, grad and hog come from the star-import of `helpers`; presumably
# they convert to greyscale, crop, compute gradients and build a HOG descriptor —
# confirm against helpers.py.
covid_processed = []
for image in covid_list:
    # Images with a third (channel) axis are reduced to greyscale; 2-D images pass through.
    grey = rgb2gray(image) if image.ndim == 3 else image
    # Rotate so that rows <= columns: cropping then discards less of the picture.
    n_rows, n_cols = grey.shape[0], grey.shape[1]
    if n_rows > n_cols:
        grey = np.rot90(grey)
    cropped = cropR(grey, min_length, min_width)
    gradients = grad(cropped, min_length - 2, min_width - 2, False)
    covid_processed.append(hog(gradients[0], gradients[1], 8, 8, 16))
covid_processed = np.array(covid_processed)
covid_processed.shape

(349, 1024)

In [20]:
# Turn every non-COVID scan into one feature vector (one row of nocovid_processed).
# The crop size is deliberately the COVID minimum (min_length, min_width), the same
# one used for the COVID scans, so both classes are cropped identically.
# rgb2gray, cropR, grad and hog come from the star-import of `helpers`; presumably
# they convert to greyscale, crop, compute gradients and build a HOG descriptor —
# confirm against helpers.py.
nocovid_processed = []
for image in nocovid_list:
    # Images with a third (channel) axis are reduced to greyscale; 2-D images pass through.
    grey = rgb2gray(image) if image.ndim == 3 else image
    # Rotate so that rows <= columns: cropping then discards less of the picture.
    n_rows, n_cols = grey.shape[0], grey.shape[1]
    if n_rows > n_cols:
        grey = np.rot90(grey)
    cropped = cropR(grey, min_length, min_width)
    gradients = grad(cropped, min_length - 2, min_width - 2, False)
    nocovid_processed.append(hog(gradients[0], gradients[1], 8, 8, 16))
nocovid_processed = np.array(nocovid_processed)
nocovid_processed.shape

(397, 1024)

In [None]:
# np.row_stack((covid_processed,nocovid_processed))

## Combine images with patient IDs

In [43]:
# Load the per-image metadata tables for the two classes (paths are relative to the
# working directory set at the top of the notebook).
covid_info = pd.read_csv('COVID-CT-MetaInfo.csv')
nocovid_info = pd.read_csv('NonCOVID-CT-MetaInfo.csv')

In [44]:
# Columns are picked by position (2, 3 and 4), so this depends on the CSV column
# order; they are later named patient_id, age and gender.
covid_data = covid_info.iloc[:,2:5]     # extract covid patient id & covariate information

In [45]:
# A single column picked by position (index 4), so the result is a Series, not a DataFrame.
nocovid_data = nocovid_info.iloc[:,4] # extract non-covid patient id

In [46]:
# Append the three metadata columns to the right of the COVID feature matrix.
# NOTE(review): rows are paired purely by position — this assumes the images were
# loaded in the same order as the CSV rows, which glob does not guarantee; verify.
covid_comp_data = np.column_stack((covid_processed,covid_data))
# covid_comp_data = pd.DataFrame(covid_comp_data)
# covid_comp_data.head()

In [47]:
# Non-COVID rows: features, patient id, then two 'Na' placeholder columns so the
# layout matches the COVID rows (only the patient id is taken from this metadata).
# dtype=object keeps the placeholders as plain Python strings. A default np.full of
# 'Na' is a fixed-width string array, and stacking it with purely numeric columns
# makes NumPy promote the whole block — features included — to strings.
# NOTE(review): rows are paired with the metadata purely by position; verify the
# image load order matches the CSV row order.
nocovid_comp_data = np.column_stack((
    nocovid_processed,
    nocovid_data,
    np.full((len(nocovid_data), 2), 'Na', dtype=object),
))
# nocovid_comp_data = pd.DataFrame(nocovid_comp_data)
# nocovid_comp_data.head()

In [48]:
# create label: +1 for every COVID row followed by -1 for every non-COVID row
rows_per_class = [covid_processed.shape[0], nocovid_processed.shape[0]]
Y = np.repeat([1, -1], rows_per_class)

In [53]:
# Stack the COVID rows on top of the non-COVID rows, append the label as the last
# column, and wrap the result in a DataFrame (columns are numbered until the rename below).
# np.vstack replaces np.row_stack, which is a deprecated alias of the same function.
all_rows = np.vstack((covid_comp_data, nocovid_comp_data))
mydata = pd.DataFrame(np.column_stack((all_rows, Y)))

In [55]:
# Give the four trailing columns (metadata + label) readable names; the feature
# columns keep their numeric labels.
size = mydata.shape[1]
trailing_names = ["patient_id", "age", "gender", "label"]
trailing_labels = mydata.columns[size-4:]
mydata.rename(columns=dict(zip(trailing_labels, trailing_names)), inplace=True)
mydata.shape

(746, 1028)

In [51]:
# Preview the first rows: numbered feature columns followed by patient_id, age, gender, label.
mydata.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,1018,1019,1020,1021,1022,1023,patient_id,age,gender,label
0,0.0287986,0.0383981,0.0479976,0.00959952,0.105595,0.115194,0.153592,0.0767962,0.0383981,0.00959952,...,0.0863957,0.124794,0.0863957,0.0479976,0.0383981,0.0383981,1,41,M,1
1,0.0863957,0.0575971,0.0767962,0.105595,0.0767962,0.0383981,0.0383981,0.0287986,0.0479976,0.019199,...,0.172791,0.0575971,0.019199,0.0671966,0.0671966,0.0671966,1,41,M,1
2,0.019199,0.019199,0.0575971,0.0383981,0.0383981,0.0479976,0.0671966,0.134393,0.134393,0.105595,...,0.0767962,0.0959952,0.287986,0.134393,0.0479976,0.0,1,41,M,1
3,0.0479976,0.0383981,0.0479976,0.00959952,0.0959952,0.0287986,0.0479976,0.115194,0.0287986,0.0767962,...,0.0575971,0.0,0.0479976,0.0287986,0.0,0.00959952,1,41,M,1
4,0.0863957,0.0767962,0.0959952,0.0863957,0.153592,0.0575971,0.143993,0.0575971,0.019199,0.0287986,...,0.00959952,0.00959952,0.0287986,0.0,0.0,0.0,2,50,M,1


**Keep all data**

In [26]:
# change directory to your own
# Writes every image row (all images of every patient) to the working directory.
# NOTE(review): a later cell rebinds `mydata` to the one-image-per-patient subset, so
# running this cell after that one would write the reduced table under this name.
mydata.to_csv('mydata_all.csv', index = False)

**Randomly keep only one image when a patient has multiple images**

In [52]:
# Randomly keep one row per (patient_id, label) group, so a patient with several
# images contributes a single observation.
# A seeded generator is shared by all groups: the draw differs from group to group
# but is identical on every run, which keeps mydata_unique.csv reproducible. The
# original unseeded sample() picked different images each time the cell ran.
# The discarded mid-cell mydata.head() was dropped; only the last expression displays.
rng = np.random.RandomState(42)
blocks = [data.sample(n=1, random_state=rng) for _, data in mydata.groupby(['patient_id', 'label'])]
mydata = pd.concat(blocks)
mydata.shape

(304, 1028)

In [29]:
# change directory to your own
mydata.to_csv('mydata_unique.csv', index = False)

In [2]:
## Possible Data Augmentation

In [30]:
from keras.preprocessing.image import ImageDataGenerator

# Keyword settings handed to ImageDataGenerator, kept in one mapping so they can be
# inspected or tweaked before the generator is built.
augmentation_options = dict(
    rescale=1./255,
    rotation_range=40,
    width_shift_range=0.2,
    height_shift_range=0.2,
    shear_range=0.2,
    zoom_range=0.2,
    horizontal_flip=True,
    fill_mode='nearest',
)
train_dataaug = ImageDataGenerator(**augmentation_options)

In [None]:
# Build a directory-backed batch generator from the augmentation settings above.
# NOTE(review): train_dir is not defined anywhere in the visible notebook, so this
# cell raises NameError as written (it also has no execution count).
# NOTE(review): target_size=(61,115) equals the minimum COVID image size printed
# earlier — presumably intentional; confirm.
train_generator=train_dataaug.flow_from_directory(
    train_dir, target_size=(61,115),batch_size=20,class_mode='categorical'
)

## The following cells are debugging code

In [27]:
# Debug import: a ValueError was raised while loading, and the suspicion was that
# some images were not originally PNG files. Try every file and record the failures.
# NOTE(review): this expects covid_list to hold file names (the os.listdir result),
# not the decoded arrays that the loading cell stores under the same name.
covid_pix = []
error_indicator = []   # 1-based position of each file that failed to load
error_pix = []         # name of each file that failed to load
for i, im in enumerate(covid_list, start=1):
    try:
        covid_pix.append(plt.imread('CT_COVID/' + im))
    except Exception:
        # Best-effort scan: remember the failure and keep going. Unlike a bare
        # `except:`, this still lets KeyboardInterrupt and SystemExit propagate.
        error_indicator.append(i)
        error_pix.append(im)

# One row per failed file: index = file name, value = its 1-based position.
error_pix_table = pd.DataFrame(np.array(error_indicator), np.array(error_pix))

In [24]:
# Same debug scan for the non-COVID folder (no import errors were observed there).
# The loop iterated an undefined name `noncovid`; it now uses nocovid_list, the
# directory listing that mirrors covid_list in the COVID debug cell.
# NOTE(review): this expects nocovid_list to hold file names (the os.listdir result),
# not the decoded arrays that the loading cell stores under the same name.
nocovid_pix = []
error_indicator = []   # 1-based position of each file that failed to load
error_pix = []         # name of each file that failed to load
for i, im in enumerate(nocovid_list, start=1):
    try:
        nocovid_pix.append(plt.imread('CT_NonCOVID/' + im))
    except Exception:
        # Best-effort scan: remember the failure and keep going. Unlike a bare
        # `except:`, this still lets KeyboardInterrupt and SystemExit propagate.
        error_indicator.append(i)
        error_pix.append(im)

In [29]:
# Print the array shape of every successfully loaded COVID image, one per line.
for loaded_image in covid_pix:
    print(loaded_image.shape)

(335, 580, 4)
(332, 566, 4)
(366, 612, 4)
(551, 725, 4)
(541, 720, 4)
(529, 724, 4)
(500, 727, 4)
(202, 287, 3)
(202, 287, 3)
(202, 287, 3)
(202, 287, 3)
(202, 287, 3)
(202, 287, 3)
(202, 287, 3)
(202, 287, 3)
(202, 287, 3)
(202, 287, 3)
(202, 287, 3)
(202, 287, 3)
(202, 287, 3)
(202, 287, 3)
(202, 287, 3)
(202, 287, 3)
(515, 826, 4)
(556, 840, 4)
(532, 818, 4)
(436, 570, 4)
(437, 563, 4)
(398, 516, 4)
(398, 556, 4)
(382, 517, 4)
(381, 538, 4)
(419, 550, 3)
(359, 500, 4)
(391, 512, 4)
(336, 506, 4)
(345, 516, 4)
(303, 516, 4)
(211, 337, 3)
(211, 337, 3)
(211, 337, 4)
(211, 337, 3)
(225, 430, 4)
(272, 436, 3)
(272, 436, 3)
(188, 300, 4)
(194, 305, 4)
(212, 310, 3)
(212, 310, 3)
(212, 310, 3)
(212, 310, 3)
(155, 247, 3)
(155, 247, 3)
(139, 229, 4)
(155, 247, 3)
(134, 255, 4)
(147, 236, 4)
(123, 263, 4)
(150, 246, 4)
(125, 251, 4)
(147, 217, 4)
(266, 401, 4)
(270, 397, 4)
(328, 415, 3)
(300, 412, 4)
(363, 603, 4)
(381, 608, 4)
(124, 198, 4)
(129, 196, 4)
(115, 195, 4)
(110, 167, 4)
(99, 1

In [26]:
# Debug: peek at both metadata selections and assign readable column names.
# NOTE(review): the cell ends with an assignment, so neither head() call is displayed.
# NOTE(review): the name lists (3 and 4 entries) do not match the selections made
# earlier (covid_data has 3 columns; nocovid_data is a single column) — this cell seems
# to date from an earlier version of those variables; confirm before running.
covid_data.head()
nocovid_data.head()
nocovid_data.columns = ['Index', 'Filename','Patient_id']
covid_data.columns = ['Filename', 'Patient_id','Age','Gender']

In [27]:
# Count the distinct patient ids in each class.
# NOTE(review): relies on the Patient_id column name assigned in the debug cell above.
number_of_covid = len(covid_data.Patient_id.unique())
number_of_nocovid = len(nocovid_data.Patient_id.unique())

In [17]:
# Debug: inspect entries 31-59 of the feature vector of the COVID image at index 15.
covid_processed[15][31:60]

array([0.        , 0.        , 0.11519424, 0.11519424, 0.        ,
       0.        , 0.        , 0.        , 0.03839808, 0.03839808,
       0.        , 0.03839808, 0.        , 0.        , 0.07679616,
       0.03839808, 0.        , 0.        , 0.07679616, 0.        ,
       0.        , 0.03839808, 0.03839808, 0.        , 0.        ,
       0.07679616, 0.        , 0.        , 0.        ])

In [37]:
# Debug: check the size of the COVID feature-plus-metadata matrix.
# NOTE(review): the saved output (349, 8194) is wider than the 1024 features + 3
# metadata columns produced by the pipeline above — presumably stale output from a
# run with different HOG settings.
covid_comp_data.shape

(349, 8194)

In [57]:
# Debug: display the assembled table.
# NOTE(review): the saved output has numbered columns up to 8195, unlike the
# 1028-column table built above — presumably stale output from an earlier run.
mydata

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,8186,8187,8188,8189,8190,8191,8192,8193,8194,8195
0,0,0,0,0,0.0383981,0,0,0,0.0383981,0.0767962,...,0.0767962,0.0767962,0,0.0383981,0,0.0383981,1,41,M,1
1,0.383981,0,0.0383981,0,0.0383981,0.0767962,0.0383981,0,0,0,...,0.0767962,0.0383981,0,0,0,0,1,41,M,1
2,0,0,0,0,0.0383981,0.153592,0.19199,0.115194,0.0767962,0.0383981,...,0,0,0,0,0,0,1,41,M,1
3,0.268787,0,0,0,0,0.0383981,0,0,0.345583,0,...,0,0.0383981,0,0,0,0,1,41,M,1
4,0,0,0,0,0.0383981,0,0,0,0,0.115194,...,0,0,0,0,0,0,2,50,M,1
5,0,0,0,0,0,0,0,0,0.80636,0,...,0.0383981,0.0383981,0,0.0383981,0,0,2,50,M,1
6,0,0.0767962,0.0767962,0,0.0383981,0.115194,0.153592,0,0.0383981,0,...,0,0.0383981,0.0383981,0.0767962,0.0383981,0.0767962,2,50,M,1
7,0,0,0.0383981,0,0.115194,0.0383981,0.0383981,0,0.0383981,0.0383981,...,0.0767962,0,0,0,0,0,2,50,M,1
8,0.115194,0.0767962,0.0383981,0,0.0383981,0,0.115194,0.0383981,0.0383981,0,...,0,0,0.0383981,0,0.19199,0.0767962,2,50,M,1
9,0.0383981,0.0383981,0,0,0,0,0,0,0.0383981,0,...,0.153592,0.19199,0.0383981,0.0767962,0.0767962,0,2,50,M,1
