# **Brain Tumor MRI Multiclass Classification**

In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [2]:
# Re-freezing everything except for the last layer of the pretrained CNN
def Unfreeze_Layers(pretrain, layer_list):
    
    '''pretrain: takes as an argument an imported, pretrained neural network
        layer_list: take a list with the layers that are to be unfrozen
    '''
    pretrain.trainable = True
    for layer in  pretrain.layers:
        if layer.name in layer_list:
            layer.trainable = True
        else:
            layer.trainable = False
        
    for layer in pretrain.layers:
        print(layer.name, layer.trainable)
    print(len(pretrain.trainable_weights))

In [3]:
def Freeze_Pretrained_Base(pretrain, network):
    
    '''pretrain: takes as an argument an imported, pretrained neural network
       network: take the name of an established neural network
    '''
    pretrain.trainable = False
    for layer in network.layers:
        print(layer.name, layer.trainable)
    print(len(network.trainable_weights))

In [4]:
def visualize_training_results(history):
    '''
    From https://machinelearningmastery.com/display-deep-learning-model-training-history-in-keras/
    
    Input: keras history object (output from trained model)
    '''
    fig, (ax1, ax2) = plt.subplots(2, sharex=True)
    fig.suptitle('Model Results')

    # summarize history for accuracy
    ax1.plot(history.history['acc'])
    ax1.plot(history.history['val_acc'])
    ax1.set_ylabel('Accuracy')
    ax1.legend(['train', 'test'], loc='upper left')
    # summarize history for loss
    ax2.plot(history.history['loss'])
    ax2.plot(history.history['val_loss'])
    ax2.set_ylabel('Loss')
    ax2.legend(['train', 'test'], loc='upper left')
    
    plt.xlabel('Epoch')
    plt.show()

In [5]:
def f1_score(model_eval):
    
    '''takes as an argument a model that has been evaluated with the ".evaluate" method'''
    
    return 'F1 Score:',(2* model_eval[4]*model_eval[3])/(model_eval[3]+model_eval[4])

In [6]:
# Set random state for numpy operations
from numpy.random import seed
seed(2)
# Set random state for tensorflow operations
from tensorflow.random import set_seed
set_seed(3)
# General imports
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from tensorflow import keras
from sklearn.model_selection import train_test_split
from tensorflow.keras.optimizers import Adam, SGD
from tensorflow.keras import layers
from tensorflow.keras import regularizers
from keras.regularizers import l2
from keras.callbacks import EarlyStopping, ModelCheckpoint, ReduceLROnPlateau
from keras.models import load_model
import seaborn as sns
from mlxtend.plotting import plot_decision_regions
from sklearn.metrics import multilabel_confusion_matrix, ConfusionMatrixDisplay
import cv2
import PIL

## **Data Augmentation and Resizing**

Because a total of 2,870 files (number of files in the training set) is a pretty small number of images to use for training a neural network, data augmentation is key, since it can help mimic the effect of having a larger number of images. The way I implemented these techiques was by using the ImageDataGenerator from Keras. The features I decided to tweak for augmentation were zoom range, rotation range, brightness range, and horizontal flipping. I decided to provided a range of different zoom values and rotation degrees because how much zoom and the angle of how the brain is positioned in an MRI image can vary a little, and so producing images with varying levels of zoom and rotation is a realistic way to mimic the effect of having more images. I decided to provide a range for brightness level because the dataset contains a variety of images with different levels of brightness and contrast, and so producing images with different levels of brightness is a realistic way to mimic the effect of having more images. I decided to flip some images along the horizontal axis, which translates to a left right flip, because regions of the brain are very symmetrical along the left/right axis. I did not include vertical flipping as part of data augmentation, because top/bottom parts of the brain are not symmetrical. I also decided not to shear any images, because shearing stretches and distorts regions of an image, and for brain scans it is very important to preserve the correct anatomical structure of the brain, as discussed in [this reresearch article](https://www.ncbi.nlm.nih.gov/pmc/articles/PMC6917660/).
I also used to ImageDataGenerator to resize the MRIs since they are all different sizes. Because a bigger image means greater model complexity (and therefore harder to fine tune), I decided on a relatively small image size of 150x150x3.

In [7]:
# Set up ImageDataGenerator
train_imagegen = keras.preprocessing.image.ImageDataGenerator(rescale=1./255,
                                   zoom_range=[0.6,1],
                                   rotation_range=10,
                                   brightness_range=([0.6, 1.2]),
                                   horizontal_flip=True,
                                   validation_split=0.06) # this will set aside a part of training set for validation data
test_imagegen = keras.preprocessing.image.ImageDataGenerator(rescale=1./255,
                                   zoom_range=[0.6,1],
                                   rotation_range=10,
                                   brightness_range=([0.6, 1.2]),
                                   horizontal_flip=True)
# Bring the data in
train_generator = train_imagegen.flow_from_directory(
                                    '../input/brain-tumor-classification-mri/Training',
                                    classes={'no_tumor': 0,
                                            'glioma_tumor':1,
                                            'meningioma_tumor':2,
                                            'pituitary_tumor':3},
                                    target_size=(150,150),
                                    batch_size=2700,# number of training images
                                    seed=42,
                                    class_mode='categorical',
                                    subset='training')

test_generator = test_imagegen.flow_from_directory(
                                    '../input/brain-tumor-classification-mri/Testing',
                                    classes={'no_tumor': 0,
                                            'glioma_tumor':1,
                                            'meningioma_tumor':2,
                                            'pituitary_tumor':3},
                                    target_size=(150,150),
                                    batch_size=394,# number of images
                                    seed=42,
                                    class_mode='categorical')

val_generator = train_imagegen.flow_from_directory(
                                    '../input/brain-tumor-classification-mri/Training',
                                    classes={'no_tumor': 0,
                                            'glioma_tumor':1,
                                            'meningioma_tumor':2,
                                            'pituitary_tumor':3},
                                    target_size=(150,150),
                                    batch_size=170,# number of images
                                    seed=42,
                                    class_mode='categorical',
                                    subset='validation')
# First run-throughs were not done with a random seed, so model analysis may be slightly different from what will be the 
# actual numbers after running models with the random seed.

In [8]:
# # Creating variables to contain image vectors and labels for the different training sets
train_img, train_lab = next(train_generator)
test_img, test_lab = next(test_generator)
val_img, val_lab = next(val_generator)

## **Using Pre-Trained VGG-19 Weights**

Since the VGG-19 pre-trained network yielded good results when approaching these images as a binary classification problem (tumor or no tumor) I decided to first try this network when using the images divided up into different classes (glioma tumor, meningioma tumor, pituitary tumor, no tumor).

In [9]:
from keras.applications.vgg19 import VGG19
cnn_vgg = VGG19(weights='imagenet',
               include_top=False,
               input_shape=(150,150,3))

Here I am implementing early stopping on the condition of monitoring testing loss with a patience of 12, so that if more than 12 epochs occur without any drops in testing loss, the model will stop training and the weights of the epoch with the lowest testing loss will be saved.

In [None]:
# Making early stop for model
pre_early = [EarlyStopping(monitor='val_loss', patience=12, restore_best_weights=True),
            ModelCheckpoint(filepath='pretrained_model.h5', monitor='val_loss',
                           save_best_only=True)]

In [None]:
cnn_vgg.summary()

I'm going to start out by adding two dense layers after flattening. I will use the 'relu' activation function for the second to last dense layer, which often gives the best results. It gives negative values a value of zero and any positive number can range from zero to infinity. It is a simple function which does not require much computational time, and can help models train faster. I will use the 'softmax activation function in the last layer since this is a multiclass classification problem.

In [None]:
# Build first model using pretrained VGG 19 as first layer, and then some dense layers on top
pretrained = keras.Sequential()
pretrained.add(cnn_vgg)
pretrained.add(layers.Flatten())
pretrained.add(layers.Dense(128, activation='relu'))
pretrained.add(layers.Dense(4, activation='softmax'))

In [None]:
# Will start by freezing all layers of pretrained network
Freeze_Pretrained_Base(cnn_vgg, pretrained)

In [None]:
# Compiling and fitting the model
pretrained.compile(loss='categorical_crossentropy',
                optimizer='adam',
                metrics=['categorical_accuracy','acc', 'Recall', 'Precision'])

pretrained_results = pretrained.fit(x=train_img, y=train_lab,
                                              batch_size = 32,
                                         steps_per_epoch=2700//32+1,# number of samples / batch size
                                         epochs=25,
                                        callbacks= pre_early,
                                         validation_data=(test_img, test_lab),
                                        validation_steps = 394//32+1)

In [None]:
visualize_training_results(pretrained_results)

In [None]:
# Evaluating pretrained model on testing images
pretrained_eval = pretrained.evaluate(test_img, test_lab)

In [None]:
f1_score(pretrained_eval)

### **Analysis**

Testing accuracy is 71%, recall is 71%, precision is 72%, and the f1 score is 71%. It is clear from the graphs that the model is overfitting; there is a big gap between training and testing loss and accuracy. These are modest results for the first transfer learning model, but next I want to investigate whether or not unfreezing outer layers helps improve performance.

## **Unfreezing an outer Layer of the Pretrained Network**

In [None]:
# Making early stop for model
b5c1_early = [EarlyStopping(monitor='val_loss', patience=12, restore_best_weights=True),
            ModelCheckpoint(filepath='b5_c1_model.h5', monitor='val_loss',
                           save_best_only=True)]

In [None]:
# Build first model using pretrained VGG 19 as first layer, and then some dense layers on top
b5_c1 = keras.Sequential()
b5_c1.add(cnn_vgg)
b5_c1.add(layers.Flatten())
b5_c1.add(layers.Dense(128, activation='relu'))
b5_c1.add(layers.Dense(4, activation='softmax'))

In [None]:
# freezing everything 
Freeze_Pretrained_Base(cnn_vgg, b5_c1)

In [None]:
# Unfreezing the last layer of the pretrained CNN
un_b5c1 = ['block5_conv1']
Unfreeze_Layers(cnn_vgg, un_b5c1)

In [None]:
b5_c1.compile(loss='categorical_crossentropy',
                optimizer='adam',
                metrics=['categorical_accuracy','acc', 'Recall', 'Precision'])

b5_c1_results = b5_c1.fit(x=train_img, y=train_lab,
                                         batch_size=32,
                                         steps_per_epoch=2700//32+1,# number of samples / batch size
                                         epochs=25,
                                        callbacks= b5c1_early,
                                         validation_data=(test_img, test_lab),
                                        validation_steps=394//32+1)

In [None]:
visualize_training_results(b5_c1_results)

In [None]:
b5_c1_eval = b5_c1.evaluate(test_img, test_lab)

In [None]:
f1_score(b5_c1_eval)

### **Analysis**

Results are very similar to hte last iteration; overfitting is still a problem, and all metrics are around 71%. Next I will see if adding dropout layers will help improve performance.

## **Adding Dropout layers to VGG-19 pretrained network (one layer unfrozen)**

In [None]:
# Making early stop for model
vgg_drop_early = [EarlyStopping(monitor='val_loss', patience=12, restore_best_weights=True),
            ModelCheckpoint(filepath='vgg_drop_model.h5', monitor='val_loss',
                           save_best_only=True)]

In [None]:
# Build first model using pretrained VGG 19 as first layer, and then some dense layers on top
vgg_drop = keras.Sequential()
vgg_drop.add(cnn_vgg)
vgg_drop.add(layers.Flatten())
vgg_drop.add(layers.Dropout(0.4))
vgg_drop.add(layers.Dense(128, activation='relu'))
vgg_drop.add(layers.Dropout(0.2))
vgg_drop.add(layers.Dense(4, activation='softmax'))

In [None]:
# Freeze all layers
Freeze_Pretrained_Base(cnn_vgg, vgg_drop)

In [None]:
# Unfreezing the last layer of the pretrained CNN
un_b5c1 = ['block5_conv1']
Unfreeze_Layers(cnn_vgg, un_b5c1)

In [None]:
vgg_drop.compile(loss='categorical_crossentropy',
                optimizer='adam',
                metrics=['categorical_accuracy','acc', 'Recall', 'Precision'])

vgg_drop_results = vgg_drop.fit(x=train_img, y=train_lab,
                                            batch_size=32,
                                         steps_per_epoch=2700//32+1,# number of samples / batch size
                                         epochs=25,
                                        callbacks= vgg_drop_early,
                                         validation_data=(test_img, test_lab),
                                           validation_steps=394//32+1)

In [None]:
visualize_training_results(vgg_drop_results)

In [None]:
vgg_drop_eval = vgg_drop.evaluate(test_img, test_lab)

In [None]:
f1_score(vgg_drop_eval)

### **Analysis**

Results are still about the same in this iteration; overfitting is still a problem, and all metrics are around 70%. In the next iteration I will see if unfreezing yet another layer will improve performance of model.

## **Unfreezing another layer of VGG Pretrained Network**

In [None]:
# Making early stop for model
b5_c1c2_early = [EarlyStopping(monitor='val_loss', patience=12, restore_best_weights=True),
            ModelCheckpoint(filepath='b5c1c2_model.h5', monitor='val_acc',
                           save_best_only=True)]

In [None]:
# Build first model using pretrained VGG 19 as first layer, and then some dense layers on top
b5_c1c2 = keras.Sequential()
b5_c1c2.add(cnn_vgg)
b5_c1c2.add(layers.Flatten())
b5_c1c2.add(layers.Dropout(0.4))
b5_c1c2.add(layers.Dense(128, activation='relu'))
b5_c1c2.add(layers.Dropout(0.2))
b5_c1c2.add(layers.Dense(4, activation='softmax'))

In [None]:
# Freeze all layers
Freeze_Pretrained_Base(cnn_vgg, b5_c1c2)

In [None]:
# Unfreezing the last layer of the pretrained CNN
un_b5c1c2 = ['block5_conv1', 'block5_conv2']
Unfreeze_Layers(cnn_vgg, un_b5c1c2)

In [None]:
b5_c1c2.compile(loss='categorical_crossentropy',
                optimizer='adam',
                metrics=['categorical_accuracy','acc', 'Recall', 'Precision'])

b5_c1c2_results = b5_c1c2.fit(x=train_img, y=train_lab,
                                            batch_size=32,
                                         steps_per_epoch=2700//32+1,# number of samples / batch size
                                         epochs=25,
                                        callbacks= vgg_drop_early,
                                         validation_data=(test_img, test_lab))
                                           #validation_steps=394//32+1)

In [None]:
visualize_training_results(b5_c1c2_results)

In [None]:
b5_c1c2_eval = b5_c1c2.evaluate(test_img, test_lab)

In [None]:
f1_score(b5_c1c2_eval)

### **Analysis**

In this iteration, overfitting is still a problem and results are about the same, but there has been a slight improvement. Testing accuracy is 72%, recall is 71%, precision is 73%, and f1 score is 73%. In the next iteration I will see if implementing learning rate reduction and increasing the number of epochs will improve model performance.

## **Implementing Learning Rate Reduction**

In [14]:
# Making early stop for model
red_c1c2_early = [EarlyStopping(monitor='val_loss', patience=20, restore_best_weights=True),
            ModelCheckpoint(filepath='red_b5c1c2_model.h5', monitor='val_loss', save_best_only=True),
            ReduceLROnPlateau(patience=12, verbose=1)]

In [15]:
# Build first model using pretrained VGG 19 as first layer, and then some dense layers on top
red_c1c2 = keras.Sequential()
red_c1c2.add(cnn_vgg)
red_c1c2.add(layers.Flatten())
red_c1c2.add(layers.Dropout(0.4))
red_c1c2.add(layers.Dense(128, activation='relu'))
red_c1c2.add(layers.Dropout(0.2))
red_c1c2.add(layers.Dense(4, activation='softmax'))

In [16]:
# Freeze all layers
Freeze_Pretrained_Base(cnn_vgg, red_c1c2)

In [17]:
# Unfreezing the last layer of the pretrained CNN
un_b5c1c2 = ['block5_conv1', 'block5_conv2']
Unfreeze_Layers(cnn_vgg, un_b5c1c2)

In [18]:
red_c1c2.compile(loss='categorical_crossentropy',
                optimizer='adam',
                metrics=['acc','categorical_accuracy','Recall', 'Precision'])

red_c1c2_results = red_c1c2.fit(x=train_img, y=train_lab,
                                            batch_size=32,
                                         steps_per_epoch=2700//32+1,# number of samples / batch size
                                         epochs=100,
                                        callbacks= red_c1c2_early,
                                         validation_data=(test_img, test_lab),
                                           validation_steps=394//32+1)

In [19]:
visualize_training_results(red_c1c2_results)

In [20]:
red_c1c2_eval = red_c1c2.evaluate(test_img, test_lab)

In [21]:
f1_score(red_c1c2_eval)

### **Analysis**

In this iteration, overfitting has improved slightly, and all metrics are better; testing accuracy is 77%, testing recall is 76%, precision is 78%, and the f1 score is 77%. It looks like implementing learning rate reduction and increasing number of epochs improved performance. In the next iteration I will see if batch normalization improves performance.

## **Adding Batch Normalization to VGG19 Pretrained Neural Network**

In [None]:
# Making early stop for model
vgg_batch_early = [EarlyStopping(monitor='val_loss', patience=20, restore_best_weights=True),
            ModelCheckpoint(filepath='vgg_batch_model.h5', monitor='val_loss', save_best_only=True),
            ReduceLROnPlateau(patience=12, verbose=1)]

In [None]:
# Build first model using pretrained VGG 19 as first layer, and then some dense layers on top
vgg_batch = keras.Sequential()
vgg_batch.add(cnn_vgg)
vgg_batch.add(layers.Flatten())
vgg_batch.add(layers.BatchNormalization())
vgg_batch.add(layers.Dropout(0.4))
vgg_batch.add(layers.Dense(128, activation='relu'))
vgg_batch.add(layers.BatchNormalization())
vgg_batch.add(layers.Dropout(0.2))
vgg_batch.add(layers.Dense(4, activation='softmax'))

In [None]:
# Re-freezing everything except for the last layer of the pretrained CNN
# Code structure from https://github.com/learn-co-curriculum/dsc-using-pretrained-networks-codealong
Freeze_Pretrained_Base(cnn_vgg, vgg_batch)

In [None]:
# Unfreezing the last layer of the pretrained CNN
un_b5c1c2 = ['block5_conv1', 'block5_conv2']
Unfreeze_Layers(cnn_vgg, un_b5c1c2)

In [None]:
vgg_batch.compile(loss='categorical_crossentropy',
                optimizer='adam',
                metrics=['categorical_accuracy','acc', 'Recall', 'Precision'])

vgg_batch_results = vgg_batch.fit(x=train_img, y=train_lab,
                                         steps_per_epoch=2700//32+1,
                                          batch_size=32,
                                         epochs=100,
                                        callbacks= vgg_batch_early,
                                         validation_data=(test_img, test_lab),
                                         validation_steps=394//32+1)

In [None]:
visualize_training_results(vgg_batch_results)

In [None]:
vgg_batch_eval = vgg_batch.evaluate(test_img, test_lab)
vgg_batch_eval

In [None]:
f1_score(vgg_batch_eval)

### **Analysis**

In this iteration, testing accuracy is 66%, recall is 63%, precision is 71%, and the f1 score is 67%. It is clear that adding batch normalization decreased model performance. In the next iteration, I will remove batch normalization and see if unfreezing more layers will improve model performance.

## **Unfreezing Four Layers of the VGG19 Pretrained Network**

In [None]:
# Making early stop for model
four_vgg_early = [EarlyStopping(monitor='val_loss', patience=20, restore_best_weights=True),
            ModelCheckpoint(filepath='vgg_four_model.h5', monitor='val_loss', save_best_only=True),
            ReduceLROnPlateau(patience=12, verbose=1)]

In [None]:
# Build first model using pretrained VGG 19 as first layer, and then some dense layers on top
four_vgg = keras.Sequential()
four_vgg.add(cnn_vgg)
four_vgg.add(layers.Dropout(0.4))
four_vgg.add(layers.Flatten())
four_vgg.add(layers.Dense(128, activation='relu'))
four_vgg.add(layers.Dropout(0.2))
four_vgg.add(layers.Dense(4, activation='softmax'))

In [None]:
# Re-freezing everything except for the last layer of the pretrained CNN
# Code structure from https://github.com/learn-co-curriculum/dsc-using-pretrained-networks-codealong
Freeze_Pretrained_Base(cnn_vgg, four_vgg)

In [None]:
cnn_vgg.summary()

In [None]:
un_four_vgg = ['block5_conv1', 'block5_conv2','block5_conv3', 'block5_conv4']

In [None]:
Unfreeze_Layers(cnn_vgg, un_four_vgg)

In [None]:
four_vgg.compile(loss='categorical_crossentropy',
                optimizer='adam',
                metrics=['categorical_accuracy','acc', 'Recall', 'Precision'])

four_vgg_results = four_vgg.fit(x=train_img, y=train_lab,
                              batch_size=32,
                              steps_per_epoch=2700//32+1,# number of samples / batch size
                              epochs=100,
                             callbacks= four_vgg_early,
                            validation_data= (test_img, test_lab),
                            validation_steps = 394//32+1)

In [None]:
visualize_training_results(four_vgg_results)

In [None]:
four_vgg_eval = four_vgg.evaluate(test_img, test_lab)

In [None]:
f1_score(four_vgg_eval)

### **Analysis**

It looks like unfreezing four layers was not beneficial to model performance; testing accuracy is 68%, recall is 62%, precision is 78%, and f1 score is 78%. In the next iteration, I will start by just unfreezing the block5_conv4 layer, since this is the outer most layer. The first layer I unfroze was block5_conv1 layer, which is actually not the outer most layer. Maybe keeping the weights of the pretrained network for every layer but the most outer layer will be beneficial to model performance. 

## **Unfreezing Block 5, conv 4 layer of VGG-19 Network**

In [None]:
# Making early stop for model
b5c4_vgg_early = [EarlyStopping(monitor='val_loss', patience=20, restore_best_weights=True),
            ModelCheckpoint(filepath='vgg_b5c4_model.h5', monitor='val_loss', save_best_only=True),
            ReduceLROnPlateau(patience=12, verbose=1)]

In [None]:
# Build first model using pretrained VGG 19 as first layer, and then some dense layers on top
b5c4_vgg = keras.Sequential()
b5c4_vgg.add(cnn_vgg)
b5c4_vgg.add(layers.Dropout(0.4))
b5c4_vgg.add(layers.Flatten())
b5c4_vgg.add(layers.Dense(128, activation='relu'))
b5c4_vgg.add(layers.Dropout(0.2))
b5c4_vgg.add(layers.Dense(4, activation='softmax'))

In [None]:
# Re-freezing everything except for the last layer of the pretrained CNN
# Code structure from https://github.com/learn-co-curriculum/dsc-using-pretrained-networks-codealong
Freeze_Pretrained_Base(cnn_vgg, b5c4_vgg)

In [None]:
un_b5c4 = ['block5_conv4']

In [None]:
Unfreeze_Layers(cnn_vgg, un_b5c4)

In [None]:
b5c4_vgg.compile(loss='categorical_crossentropy',
                optimizer='adam',
                metrics=['categorical_accuracy','acc', 'Recall', 'Precision'])

b5c4_vgg_results = b5c4_vgg.fit(x=train_img, y=train_lab,
                              batch_size=32,
                              steps_per_epoch=2700//32+1,# number of samples / batch size
                              epochs=100,
                             callbacks= four_vgg_early,
                            validation_data= (test_img, test_lab),
                            validation_steps = 394//32+1)

In [None]:
visualize_training_results(b5c4_vgg_results)

In [None]:
b5c4_vgg_eval = b5c4_vgg.evaluate(test_img, test_lab)

In [None]:
f1_score(b5c4_vgg_eval)

### **Analysis**

This iteration performed worse than the last; overfitting has increased, testing accuracy is 69%, recall is 68%, precision is 69%, and f1 score is 70%. It is obvious that unfreezing the outermost layer has not improved model performance.

## **Evaluating Best Model (red_c1c2) on the holdout set**

In [22]:
best_model = load_model('red_b5c1c2_model.h5')
# 'b5c1c2_model.h5'

In [23]:
best_model_eval = best_model.evaluate(val_img, val_lab)

In [24]:
f1_score(best_model_eval)

### **Analysis**

When the final model (red_b5c1c2_model) was evaluated on the hold out set (val_img and val_lab), loss was 0.56, accuracy was 78%, recall was 73%, precision was 85%, and the f1 score was 78%. These results are similar to when the model was evaluated on testing data, so this is good news. However for the sake of this task, it would have been better for recall to be higher, since it is worse to have a false negative ( meaning that a tumor was incorrectly diagnosed, either as having no tumor or the wrong tumor type) than a false positive (meaning that an MRI scan with no tumor was identified as containing one of the types of tumors).

In [25]:
# Predictions for Validation data
val_preds_raw = best_model.predict(val_img)
val_preds = (val_preds_raw > 0.5).astype('int32')

In [26]:
# Transforming one-hot encoded labels into a single numeric value for each label and converting labels to integers
val_lab_ints = val_lab.astype('int32')
fin_val_lab = np.argmax(val_lab_ints, axis=1)

# Separating each label type into a separate list
three_lab = [i for i in fin_val_lab if i ==3]
two_lab = [i for i in fin_val_lab if i==2]
one_lab = [i for i in fin_val_lab if i==1]
zero_lab = [i for i in fin_val_lab if i==0]

# Calculating the number of each type of tumor 
num_pituitary = len(three_lab)
num_meningioma = len(two_lab)
num_glioma = len(one_lab)
num_no_tumor = len(zero_lab)

In [27]:
# Printing the number of different tumor types
print('Number of Pituitary Tumors', num_pituitary)
print('Number of Meningioma Tumors', num_meningioma)
print('Number of Glioma Tumors', num_glioma)
print('Number of No Tumor Images', num_no_tumor)

#### Number of Pituitary Tumors: 49
#### Number of Meningioma Tumors: 49
#### Number of Glioma Tumors: 49
#### Number of No Tumor Images: 23

In [28]:
# Transforming one-hot encoded predictions into a single numeric value for each prediction
fin_val_preds = np.argmax(val_preds, axis=1)

In [31]:
# Adding correct predictions to a list
correct = []
for i in range(len(fin_val_preds)):
    if fin_val_preds[i] == fin_val_lab[i]:
        correct.append(fin_val_preds[i])

In [32]:
# Separating each prediction type into a separate list
g_correct = [i for i in correct if i == 1]
m_correct = [i for i in correct if i == 2]
p_correct = [i for i in correct if i == 3]
no_correct = [i for i in correct if i == 0]

# Getting the number of correct predictions for each tumor type
num_g_corr = len(g_correct)
num_m_corr = len(m_correct)
num_p_corr = len(p_correct)
num_no_corr = len(no_correct)

In [33]:
# Printing the number of different tumor types from predictions
print('Number of Correct Pituitary Tumors', num_p_corr)
print('Number of Correct Meningioma Tumors', num_m_corr)
print('Number of Correct Glioma Tumors', num_g_corr)
print('Number of Correct No Tumor Images', num_no_tumor)

#### Number of Correct Pituitary Tumors: 37
#### Number of Correct Meningioma Tumors: 18
#### Number of Correct Glioma Tumors: 40
#### Number of Correct No Tumor Images: 23

In [34]:
# Printing the percent of correct predictions for each tumor type
print('Percent Gliomas Correct:', str(np.round((num_g_corr/num_glioma)*100))+' %')
print('Percent Meningioma Correct:', str(np.round((num_m_corr/num_meningioma)*100))+' %')
print('Percent Pituitary Correct:', str(np.round((num_p_corr/num_pituitary)*100))+' %')
print('Percent No Tumor Correct:', str(np.round((num_no_corr/num_no_tumor)*100))+' %')

### **Percentage of Correct Predictions for each Tumor Type**

#### - Percent Gliomas Correct: 82.0 %
#### - Percent Meningioma Correct: 37.0 %
#### - Percent Pituitary Correct: 76.0 %
#### - Percent No Tumor Correct: 91.0 %

In [49]:
# Plotting double bar graph to illustrate actual vs. predicted brain tumor type
# Code structure for graph from https://www.geeksforgeeks.org/plotting-multiple-bar-charts-using-matplotlib-in-python/#_=_
labels = ['No Tumor', 'Pituitary', 'Meningioma', 'Glioma']
actual = [num_no_tumor, num_pituitary, num_meningioma, num_glioma]
predicted = [num_no_corr, num_p_corr, num_m_corr, num_g_corr]
x_axis = np.arange(len(labels))
plt.figure(figsize=(12,8))
plt.bar(x_axis - 0.2, actual, 0.4, label='Actual Labels')
plt.bar(x_axis + 0.2, predicted, 0.4, label = 'Predicted Labels')

plt.xticks(x_axis, labels, fontsize=14)
plt.xlabel("Brain Tumor Type", fontsize=14)
plt.ylabel("Number of MRIs", fontsize=14)
plt.title("Comparison of Actual vs. Predicted Brain Tumor Type", fontsize=16)
plt.legend()
plt.show()