<h5>Importing libraries</h5>

In [1]:
#data handling
import pandas as pd
import numpy as np
import os 
from PIL import Image

# Machine learning (non deep learning)
import sklearn
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder 

# Deep learning
import tensorflow as tf
import keras
from keras import layers
from keras_preprocessing.image import ImageDataGenerator
from keras.models import Sequential
from keras.applications.vgg16 import VGG16, preprocess_input
from keras.layers import Conv2D, MaxPooling2D, Flatten, Dense, Dropout, BatchNormalization
from keras.callbacks import EarlyStopping 

# plotting
import seaborn as sns
import matplotlib.pyplot as plt

#misc
import time

In [2]:
tumor_labels = pd.read_csv("dataset/label.csv")
for i in range(len(tumor_labels)):
    if (tumor_labels.iloc[i]["label"] == "no_tumor"):
        tumor_labels.iloc[i]["label"] = "no_tumor"
    else: 
        tumor_labels.iloc[i]["label"] = "tumor"
tumor_labels

Unnamed: 0,file_name,label
0,IMAGE_0000.jpg,tumor
1,IMAGE_0001.jpg,no_tumor
2,IMAGE_0002.jpg,tumor
3,IMAGE_0003.jpg,tumor
4,IMAGE_0004.jpg,tumor
...,...,...
2995,IMAGE_2995.jpg,no_tumor
2996,IMAGE_2996.jpg,tumor
2997,IMAGE_2997.jpg,tumor
2998,IMAGE_2998.jpg,tumor


<h5>Read in images into numpy arrays and record type of tumor with one hot encoder</h5>

In [3]:
img_reshape_size = (32,32)

data = []
result = []

encoder = OneHotEncoder()
encoder.fit([[0], [1]]) 

# path to train tumor files
train_tumor_image_dir = "datagen/train/tumor/"
train_tumor_files = os.listdir(train_tumor_image_dir)

# path to train tumor files
train_no_tumor_image_dir = "datagen/train/no_tumor/"
train_no_tumor_files = os.listdir(train_no_tumor_image_dir)

# path to test tumor files
test_tumor_image_dir = "datagen/test/tumor/"
test_tumor_files = os.listdir(test_tumor_image_dir)

# path to test non tumor files
test_no_tumor_image_dir = "datagen/test/no_tumor/"
test_no_tumor_files = os.listdir(test_no_tumor_image_dir)


## train
for file in train_tumor_files:
    temp_file_path = "datagen/train/tumor/"+file
    img = Image.open(temp_file_path).convert('L')
    img = img.resize(img_reshape_size)
    img = np.array(img)
    data.append(np.array(img))
    result.append(encoder.transform([[1]]).toarray())
        

for file in train_no_tumor_files:
    temp_file_path = "datagen/train/no_tumor/"+file
    img = Image.open(temp_file_path).convert('L')
    img = img.resize(img_reshape_size)
    img = np.array(img)
    data.append(np.array(img))
    result.append(encoder.transform([[0]]).toarray())

    
## test

for file in test_tumor_files:
    temp_file_path = "datagen/test/tumor/"+file
    img = Image.open(temp_file_path).convert('L')
    img = img.resize(img_reshape_size)
    img = np.array(img)
    data.append(np.array(img))
    result.append(encoder.transform([[1]]).toarray())
        

for file in test_no_tumor_files:
    temp_file_path = "datagen/test/no_tumor/"+file
    img = Image.open(temp_file_path).convert('L')
    img = img.resize(img_reshape_size)
    img = np.array(img)
    data.append(np.array(img))
    result.append(encoder.transform([[0]]).toarray())
    

<h5>Reshape data and split between test and train data</h5>

In [4]:
data = np.array(data)
print(data.shape)

result = np.array(result)
result = result.reshape(3000,2)

x_train,x_test,y_train,y_test = train_test_split(data, result , test_size=0.2, shuffle=False, random_state=0)

(3000, 32, 32)


<h5>Build the CNN model</h5>

In [5]:
callback = tf.keras.callbacks.EarlyStopping(monitor='val_accuracy', patience=10)

model = Sequential()

model.add(Conv2D(32, kernel_size=(2, 2), input_shape=(32, 32, 1), padding = 'Same'))
model.add(Conv2D(32, kernel_size=(2, 2),  activation ='relu', padding = 'Same'))


model.add(BatchNormalization())
model.add(MaxPooling2D(pool_size=(2, 2)))
model.add(Dropout(0.25))

model.add(Conv2D(64, kernel_size = (2,2), activation ='relu', padding = 'Same'))
model.add(Conv2D(64, kernel_size = (2,2), activation ='relu', padding = 'Same'))

model.add(BatchNormalization())
model.add(MaxPooling2D(pool_size=(2,2), strides=(2,2)))
model.add(Dropout(0.25))

model.add(Flatten())

model.add(Dense(512, activation='relu'))
model.add(Dropout(0.5))
model.add(Dense(2, activation='softmax'))

model.compile(loss = "categorical_crossentropy", optimizer='Adamax',  metrics = ['accuracy'])
#print(model.summary())

<h5>Train the model and record time to train</h5>

In [6]:
bs = 30

start_time = time.time()
history = model.fit(x_train, y_train, epochs = 300, batch_size = bs, verbose=1, callbacks=[callback], validation_data=(x_test, y_test))
print("--- %s seconds ---" % (time.time() - start_time))

Epoch 1/300
Epoch 2/300
Epoch 3/300
Epoch 4/300
Epoch 5/300
Epoch 6/300
Epoch 7/300
Epoch 8/300
Epoch 9/300
Epoch 10/300
Epoch 11/300
Epoch 12/300
Epoch 13/300
Epoch 14/300
Epoch 15/300
Epoch 16/300
Epoch 17/300
Epoch 18/300
Epoch 19/300
Epoch 20/300
Epoch 21/300
Epoch 22/300
Epoch 23/300
Epoch 24/300
Epoch 25/300
Epoch 26/300
Epoch 27/300
Epoch 28/300
Epoch 29/300
Epoch 30/300
--- 95.08248782157898 seconds ---


<h6>Some basic results recording, just for keeping information to hand</h6>

30.509965896606445 seconds --- 30 epochs , 0.9683 acc , 16x16 Greyscale image <br>
86.94418954849243 seconds --- 30 epochs , 0.9783 acc , 32x32 Greyscale image <br>
169.6840739250183 seconds --- 60 epochs , 0.9667 acc , 32x32 Greyscale image <br>
335.7380225658417 seconds --- 30 epochs , 0.9700 acc , 64x64 Greyscale image <br>


<h5> Print model history </h5>

In [7]:
history.history

{'loss': [1.0884978771209717,
  0.5607299208641052,
  0.3880394697189331,
  0.3433830738067627,
  0.2898510694503784,
  0.29330453276634216,
  0.270906001329422,
  0.24593757092952728,
  0.2443082183599472,
  0.23600827157497406,
  0.21157734096050262,
  0.20560288429260254,
  0.20933684706687927,
  0.18460585176944733,
  0.1838633418083191,
  0.17545665800571442,
  0.18035338819026947,
  0.1512891799211502,
  0.1524948924779892,
  0.13926458358764648,
  0.14563655853271484,
  0.13732443749904633,
  0.1152794286608696,
  0.11817266047000885,
  0.11041253060102463,
  0.1072564423084259,
  0.09733934700489044,
  0.09979616850614548,
  0.08815982937812805,
  0.09131615608930588],
 'accuracy': [0.800000011920929,
  0.8308333158493042,
  0.85916668176651,
  0.8675000071525574,
  0.8812500238418579,
  0.8883333206176758,
  0.8879166841506958,
  0.8945833444595337,
  0.8912500143051147,
  0.9058333039283752,
  0.9175000190734863,
  0.9108333587646484,
  0.9179166555404663,
  0.930833339691162

<h5>Read in the validation data in the same way we read in the training and testing data</h5>

In [8]:
img_reshape_size = (32,32)

validation_data = []
validation_result = []

# path to non tumor files
no_tumor_image_dir = "test_dataset/test/split_data/binary/no_tumor"
no_tumor_files = os.listdir(no_tumor_image_dir)

# path to tumor files
tumor_image_dir = "test_dataset/test/split_data/binary/tumor"
tumor_files = os.listdir(tumor_image_dir)

test_encoder = OneHotEncoder()
test_encoder.fit([[0], [1]]) 

for file in no_tumor_files:
    temp_file_path = "test_dataset/test/split_data/binary/no_tumor/"+file
    #print("path is " +temp_file_path)
    img = Image.open(temp_file_path).convert('L') # not an RGB image so import as greyscale
    img = img.resize(img_reshape_size)
    img = np.array(img)
    validation_data.append(np.array(img))
    validation_result.append(test_encoder.transform([[0]]).toarray())

for file in tumor_files:
    temp_file_path = "test_dataset/test/split_data/binary/tumor/"+file
    img = Image.open(temp_file_path).convert('L')
    img = img.resize(img_reshape_size)
    img = np.array(img)
    validation_data.append(np.array(img))
    validation_result.append(test_encoder.transform([[1]]).toarray())   

<h5>reshape validation data</h5>

In [9]:
validation_data = np.array(validation_data)
validation_result = np.array(validation_result)
validation_result = validation_result.reshape(200,2)

<h5>Get and print scores</h5>

In [10]:
scores = model.evaluate(validation_data, validation_result, verbose=10)

In [11]:
print("test loss: " + str(scores[0]))
print("test accuracy: " + str(scores[1]))

test loss: 0.30010929703712463
test accuracy: 0.8949999809265137


In [160]:
acc_batch_array 

[[10, 0.07656975090503693, 0.9750000238418579],
 [30, 0.06190032511949539, 0.9800000190734863],
 [50, 0.04844776168465614, 0.9800000190734863],
 [70, 0.06202394515275955, 0.9649999737739563],
 [90, 0.05173724144697189, 0.9750000238418579],
 [110, 0.05443058907985687, 0.9850000143051147],
 [130, 0.04711779206991196, 0.9800000190734863],
 [150, 0.04425444081425667, 0.9850000143051147],
 [170, 0.046532295644283295, 0.9750000238418579],
 [190, 0.039801206439733505, 0.9900000095367432],
 [210, 0.05233008414506912, 0.9750000238418579],
 [230, 0.042826972901821136, 0.9850000143051147],
 [250, 0.05040019005537033, 0.9850000143051147],
 [270, 0.04743853583931923, 0.9800000190734863],
 [290, 0.04829291254281998, 0.9750000238418579],
 [310, 0.04591016843914986, 0.9850000143051147],
 [8, 0.056518469005823135, 0.9850000143051147],
 [8, 0.05424250662326813, 0.9800000190734863],
 [8, 0.08594222366809845, 0.9649999737739563],
 [40, 0.06024970859289169, 0.9800000190734863]]