In [1]:
import tensorflow
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, Activation, Flatten, Conv2D, MaxPooling2D
from sklearn.model_selection import KFold
import numpy as np
from numpy import asarray
import nibabel as nib
import pandas as pd
from PIL import Image
from matplotlib import image
from matplotlib import pyplot
import math

  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


In [2]:
# reading in our labels
# also has data on other details of the patience but we just want to look at an image and
# an output for right now
metadata = pd.read_csv('metadata.csv').to_numpy()

X = []
y = []

print(metadata.shape)

i=0
for row in metadata:
    # row[20] is the folder and row[21] is the filename
    filename = row[20] + '/' + row[21]
    
    if filename[-3:] == '.gz': # ignoring gunzipped files, not that many of them and dealing with it would vastly complicate things
        i+=1
    else:
        img = Image.open(filename)
        
        if filename[-3:] == 'png':
            img = img.convert('RGB') #converting png to jpg
            
        img = img.convert('LA')  # grayscaling
        img = img.resize((156,156)) # resizing to minimums so every image is the same size
        data = asarray(img) # transforming image to array of ints
        X.append(data)
        
        if row[4] == 'COVID-19':
            y.append((1,0))
        else:
            y.append((0,1))

(360, 28)


In [3]:
# building our model (finally)

inputShape = X[0].shape
BATCH_SIZE=32 # ended up not going with this because it took an extremely long time to train


model = Sequential()
model.add(Conv2D(64, 
                 kernel_size=(3,3), 
                 input_shape=(156,156,2), 
                 activation= 'relu',
                 data_format='channels_last'))

model.add(MaxPooling2D(pool_size=(2,2)))

model.add(Conv2D(64, kernel_size=(3,3), activation= 'relu'))
model.add(MaxPooling2D(pool_size=(2,2)))

model.add(Flatten())
model.add(Dense(64))

model.add(Dense(2))
model.add(Activation('softmax'))

model.compile(loss='binary_crossentropy',
             optimizer='adam',
             metrics=['accuracy'])

#model.fit(np.array(X), np.array(y), batch_size=BATCH_SIZE, validation_split=0.1, epochs=10)

Instructions for updating:
Colocations handled automatically by placer.


In [12]:
kf = KFold(n_splits=5)

#actual_outputs = [] I was going to do a mse of the end results but figured using the
#my_predictions = [] binary cross entropy I set up earlier would be simpler and more effective
result_loss = []
result_acc = []

i=1
for train, test in kf.split(X):
    X_train = np.array(X)[train]
    y_train = np.array(y)[train]
    X_test = np.array(X)[test]
    y_test = np.array(y)[test]
    
    model.fit(X_train, y_train, batch_size=BATCH_SIZE, validation_split=0.1, epochs=1)
    
    loss, acc = model.evaluate(X_test, y_test)
    result_loss.append(loss)
    result_acc.append(acc)
    
    #pred = model.predict(X_test)
    #actual_outputs.append(y_test)
    #my_predictions.append(pred)
    
    
    

Train on 243 samples, validate on 28 samples
Train on 243 samples, validate on 28 samples
Train on 243 samples, validate on 28 samples
Train on 243 samples, validate on 28 samples
Train on 244 samples, validate on 28 samples


In [16]:
# and here are our results
#print('losses: ' + result_loss)
print('avg loss: ' + str(sum(result_loss)/5))
#print('accuracies: ' + result_acc)
print('avg accuracy: ' + str(sum(result_acc)/5))

avg loss: 3.588155827406726
avg accuracy: 0.7761632978916169


In [None]:
# For this project I decided to build a machine learning model that learned how to 
# diagnose coronavirus from CT scans of people with and without the virus. It shook down to your
# typical image processing model, each sample being classified as either 0-not infected, or 
# 1-infected. I found a dataset on github (https://github.com/ieee8023/covid-chestxray-dataset)
# and used the metadata.csv file to prep the expected labels. There's additional information
# in there, but the amount of time it'd take to get useful results out of that info didn't
# make using it appealing

# For the model itself I used a convolutional neural network. It was essentially comprised of
# two conv2D + maxpooling2D layers. The convolitional layers slide a window over the matrix of
# pixels and records the data, and the maxpooling layers reduce the dimensionality of the images
# essentially by reducing the number of pixels we're looking at. I used cross-fold validation
# to get more bang for my buck because I was working with a relatively smallish dataset.

# the resulting average accuracy from the 5 folds was 77.62%, which all in all isn't terrible
# but isn't winning me any nobel peace prizes anytime soon. 