# Data Wrangling and EDA (Brain Tumor Classification)

## Step 1: Importing Libraries

In [11]:
import tensorflow as tf
import numpy as np
import pandas as pd
from tensorflow import keras
from tensorflow.keras.preprocessing.image import ImageDataGenerator
from tensorflow.keras.preprocessing import image
import matplotlib.pyplot as plt
import time
import os, sys
import cv2
import PIL
import os.path
from PIL import Image, ImageOps
import scipy.ndimage as ndi
from skimage import color
from skimage.filters import gaussian
from tqdm import tqdm
from sklearn.utils import shuffle
from sklearn.model_selection import train_test_split
from keras.utils.np_utils import to_categorical
from tensorflow.keras.applications import MobileNet, MobileNetV2, VGG16
# from tensorflow.keras.applications.mobilenet import MobileNet
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.image as mpimgimport seaborn as sns
import cv2
from sklearn.model_selection import train_test_split
from keras.models import Sequential
from keras.callbacks import EarlyStopping
from keras.layers import Dropout, Dense, BatchNormalization, Flatten
from keras.preprocessing.image import ImageDataGenerator
from keras.callbacks import EarlyStopping, ReduceLROnPlateau
from tqdm import tqdm
from sklearn.utils import shuffle

In [2]:
path = "../BrainMRI"

In [3]:
list(os.listdir(path))

['Testing', 'Training']

In [4]:
list(os.listdir(path + "/Training"))

['Glioma_Tumor', 'Meningioma_Tumor', 'No_Tumor', 'Pituitary_Tumor']

In [5]:
list(os.listdir(path + "/Testing"))

['Glioma_Tumor', 'Meningioma_Tumor', 'No_Tumor', 'Pituitary_Tumor']

In [6]:
labels = ['Glioma_Tumor','No_Tumor','Meningioma_Tumor','Pituitary_Tumor']

class_map = {
    'No_Tumor': 0,
    'Glioma_Tumor': 1,
    'Meningioma_Tumor': 2,
    'Pituitary_Tumor': 3
}

inverse_class_map = {
    0: 'No_Tumor',
    1: 'Glioma_Tumor',
    2: 'Meningioma_Tumor',
    3: 'Pituitary_Tumor'
}

In [7]:
h, w = 300, 300
batch_size = 32
epochs = 10

In [8]:
IMAGE = []
LABELS = []

for label in labels:
    folderPath = os.path.join('../BrainMRI/Training', label)
    for j in tqdm(os.listdir(folderPath)):
        img = cv2.imread(os.path.join(folderPath, j))
        img = cv2.resize(img,(h, w))
        IMAGE.append(img)
        LABELS.append(class_map[label])
            
        
for label in labels:
    folderPath = os.path.join('../BrainMRI/Testing', label)
    for j in tqdm(os.listdir(folderPath)):
        img = cv2.imread(os.path.join(folderPath,j))
        img = cv2.resize(img,(h, w))
        IMAGE.append(img)
        LABELS.append(class_map[label])
        
X = np.array(IMAGE)
y = np.array(LABELS)

100%|██████████| 826/826 [00:01<00:00, 441.32it/s]
100%|██████████| 395/395 [00:00<00:00, 527.61it/s]
100%|██████████| 822/822 [00:01<00:00, 411.99it/s]
100%|██████████| 827/827 [00:02<00:00, 388.66it/s]
100%|██████████| 100/100 [00:00<00:00, 477.85it/s]
100%|██████████| 105/105 [00:00<00:00, 1210.90it/s]
100%|██████████| 115/115 [00:00<00:00, 618.17it/s]
100%|██████████| 74/74 [00:00<00:00, 210.50it/s]


In [9]:
X, y = shuffle(X, y, random_state=42)

X_train, X_test, y_train, y_test = train_test_split(X, to_categorical(y), test_size=0.1, random_state=42)

## Step 2: Inspecting the Data Directories

In [None]:
# Data directories
data_path = "../BrainMRI/"
os.listdir(data_path)

In [None]:
# Training subfolders
os.listdir(data_path+'Training')

In [None]:
# Testing subfolders
os.listdir(data_path+'Testing')

In [None]:
# Training Directories
glioma_path_tr = "../BrainMRI/Training/Glioma_Tumor/"
notumor_path_tr = "../BrainMRI/Training/No_Tumor/"
pituitary_path_tr = "../BrainMRI/Training/Pituitary_Tumor/"
meningioma_path_tr = "../BrainMRI/Training/Meningioma_Tumor/"

training_paths = [glioma_path_tr, pituitary_path_tr, meningioma_path_tr, notumor_path_tr]


In [None]:
# Test Directories
glioma_path_te = "../BrainMRI/Testing/Glioma_Tumor/"
notumor_path_te = "../BrainMRI/Testing/No_Tumor/"
pituitary_path_te = "../BrainMRI/Testing/Pituitary_Tumor/"
meningioma_path_te = "../BrainMRI/Testing/Meningioma_Tumor/"

test_paths = [glioma_path_te, pituitary_path_te, meningioma_path_te, notumor_path_te]

## Step 3: Inspecting the Image File Format

In [None]:
# Checking Image File Format other than [.jpg] for Training Set
count = 0
for path in training_paths:
    list_images = os.listdir(path)
    for i in list_images:
        if i.split('.')[-1] != 'jpg':
            count += 1
            # print('Other Format Found')
        else:
            continue
print(f'Other Than [.jpg]: {count}')

In [None]:
# Checking Image File Format other than [.jpg] for Testing Set
count = 0
for path in test_paths:
    list_images = os.listdir(path)
    for i in list_images:
        if i.split('.')[-1] != 'jpg':
            count += 1
            #print('Other Format Found')
        else:
            continue
print(f'Other Than [.jpg]: {count}')

## Step 4: Visualizing the Different Tumor Classes

In [None]:
# Visualizing all different tumor types in training dataset and 
# The respective size of the images

for path in training_paths:
    list_images = os.listdir(path)
    fig, ax = plt.subplots(ncols=2, nrows=1, figsize=(12,6))
    for i in range(2):
        # plt.figure()
        array = plt.imread(os.path.join(path, list_images[i]))
        ax[i].imshow(array)
        if path.split('/')[-2] == 'NoTumor':
            ax[i].set_title(path.split('/')[-2])
        else:
            ax[i].set_title(path.split('/')[-2]+' Tumor')
    fig.tight_layout()

#### Meningioma Tumor
Meningioma is the most common primary brain tumor, accounting for more than 30% of all brain tumors. Meningiomas originate in the meninges, the outer three layers of tissue that cover and protect the brain just under the skull. Women are diagnosed with meningiomas more often than men. About 85% of meningiomas are noncancerous, slow-growing tumors. Almost all meningiomas are considered benign, but some meningiomas can be persistent and come back after treatment.

#### Pituitary Tumor
A pituitary tumor is an abnormal growth in the pituitary gland. The pituitary is a small gland in the brain. It is located behind the back of the nose. It makes hormones that affect many other glands and many functions in your body. Most pituitary tumors are not cancerous (benign). They don’t spread to other parts of your body. But they can cause the pituitary to make too few or too many hormones, causing problems in the body.

#### Glioma Tumor:
Gliomas are the most prevalent type of adult brain tumor, accounting for 78 percent of malignant brain tumors. They arise from the supporting cells of the brain, called the glia. These cells are subdivided into astrocytes, ependymal cells and oligodendroglial cells (or oligos). Glial tumors include the following:

#### No Tumor
There is no tumor present in the brain cell.

## Step 5: Inspecting a Single RGB Image

In [None]:
# Inspecting the size, shape, and data-type of a single image
plt.figure(figsize=(6,6))
image_array = plt.imread("../BrainMRI/Training/Glioma_Tumor/gg (1).jpg")
plt.imshow(image_array)
plt.show()
print(f"Size :{image_array.size} \nShape: {image_array.shape} \nData Type: {image_array.dtype}")

In [None]:
# Inspecting the different channels of the RGB image
red_channel = image_array[:,:,0]
green_channel = image_array[:,:,1]
blue_channel = image_array[:,:,2]
fig, ax = plt.subplots(ncols=3, nrows=1, figsize=(12,4))
ax[0].imshow(red_channel)
ax[0].set_title('Red Channel')
ax[1].imshow(green_channel)
ax[1].set_title('Green Channel')
ax[2].imshow(blue_channel)
ax[2].set_title('Blue Channel')
plt.show()
fig.tight_layout()

In [None]:
# Inspecting the green channel of the RGB Image using scipy.ndimage
plt.figure(figsize=(12,8))
hist = ndi.histogram(green_channel, min=0, max=255, bins=256)
plt.plot(hist)
plt.show()

In [None]:
# Visualizing the histogram red channel of the RGB image using matplotlib.pyplot
plt.figure(figsize=(12,8))
plt.hist(red_channel.ravel(), bins=256)
plt.show()

In [None]:
# The gray scale of the RGB images
plt.figure(figsize=(6,6))
gray_image = color.rgb2gray(image_array)
print("Shape:", gray_image.shape)
plt.imshow(gray_image)

## Step 6: Filtering, Sharpening, Manually Segmenting

In [None]:
# Reducing Sharpness using filter
gaussian_image = gaussian(gray_image, multichannel=True)

# Show original and resulting image to compare
fig, ax = plt.subplots(1,2, figsize=(12,8))
ax[0].imshow(gray_image)
ax[0].set_title('Original')
ax[1].imshow(gaussian_image)
ax[1].set_title('Reduced sharpness')

The original image looks better quality than filtered image.

In [None]:
# Smooth "im" with Gaussian filters
im_s1 = ndi.gaussian_filter(gray_image, sigma=1)
im_s3 = ndi.gaussian_filter(gray_image, sigma=3)

# Draw bone masks of each image
fig, axes = plt.subplots(1,3, figsize=(15,10))
axes[0].imshow(gray_image)
axes[0].set_title("Original Image")
axes[1].imshow(im_s1)
axes[1].set_title("Sharpen Image: sigma=1")
axes[2].imshow(im_s3)
axes[2].set_title("Sharpen Image: sigma=3")
plt.show()

The original image looks better quality than sharpen image.

In [None]:
# Trying to manually segment and visualize the tumor area inside the head
plt.figure(figsize=(6,6))
# plt.imshow(gray>0.37)
# plt.imshow(array)
plt.imshow(image_array[:,:,0]>90)

## Step 7: Counting the total number of training, testing, and validation images and locating any grayscale images

In [None]:
# Checking for Gray Scale images in training datasets
train_count = 0
gray_count = 0
for path in training_paths:
    for img_name in os.listdir(path):
        img_array = cv2.imread(os.path.join(path, img_name))
        try:
            channel = img_array.shape[2]
            if channel != 3:
                gray_count += 1
            else:
                train_count += 1
        except:
            pass
print(f"Total Grayscale Images: {gray_count}")
print(f'Total Training Images: {train_count}')

In [None]:
# Checking for Gray Scale images in testing datasets
test_count = 0
gray_count = 0

for path in test_paths:
    for img_name in os.listdir(path):
        img_array = cv2.imread(os.path.join(path, img_name))
        try:
            channel = img_array.shape[2]
            if channel != 3:
                gray_count += 1
            else:
                test_count += 1
        except:
            pass
        
print(f"Grayscale Images Found: {gray_count}")
print(f'Total Test Images: {test_count}')          

By inspecting the shape of the images in trainig, testing, and validation sub folders its visible that there is no gray scale images and all of the images are color image (RGB). There are 4956 trainig images, 1194 testing images, and 873 validation images.

## Step 8: Inspecting the height and width of the images in each training, testing, and validation subfolders

In [None]:
# Checking the image shape of each training subfolders before cropping and resizing

base_HW = 512
min_height = 512
min_width = 512
max_height = 512
max_width = 512
count = 0

for path in training_paths:
    
    for img_name in os.listdir(path):
        img_array = cv2.imread(os.path.join(path, img_name))
        height, width, channel = img_array.shape
        
        if (height == base_HW) and (width == base_HW) and (channel == 3):
            continue
            
        elif ((height < base_HW) or (width < base_HW)) and ((height < min_height) or (width < min_width)):
            # print(f"{os.path.join(path, img_name)}")
            min_height = height
            min_width = width
            count += 1
            
        elif ((height > base_HW) or (width > base_HW)) and ((height > max_height) or (width > max_width)):
            # print(f"{os.path.join(path, img_name)}")
            max_height = height
            max_width = width
            count += 1
            
print(f"\nDifferent Shape Count: {count} \nMin Height: {min_height}, Min Width: {min_width} \
      \nMax Height: {max_height}, Max Width {max_width}")

Most of the images have a shape of (512, 512, 3). There are 299 training images that does not contain the same height and width or in other words they contain different shape than (512, 512). So, We need to resize the images in respective subfolders.

In [None]:
# Checking the image shape of each testing subfolders before cropping and resizing

base_hw = 512
min_height = 512
min_width = 512
max_height = 512
max_width = 512
count = 0

for path in test_paths:
    
    for img_name in os.listdir(path):
        img_array = cv2.imread(os.path.join(path, img_name))
        height, width, channel = img_array.shape
        
        if (height == base_hw) and (width == base_hw) and (channel == 3):
            continue
            
        elif ((height < base_HW) or (width < base_HW)) and ((height < min_height) or (width < min_width)):
            # print(f"{os.path.join(path, img_name)}")
            min_height = height
            min_width = width
            count += 1
            
        elif ((height > base_HW) or (width > base_HW)) and ((height > max_height) or (width > max_width)):
            # print(f"{os.path.join(path, img_name)}")
            max_height = height
            max_width = width
            count += 1
            
print(f"\nDifferent Shape Count: {count} \nMin Height: {min_height}, Min Width: {min_width} \
      \nMax Height: {max_height}, Max Width {max_width}")

Most of the images have a shape of (512, 512, 3). There are 196 test images that does not contain the same height and width or in other words they contain different shape than (512, 512). So, We need to resize the images in respective subfolders.

## Step 9: Resizing, rgb to grayscale conversion, and save them to a new directory for modeling

In [None]:
# New Training Directories
new_glioma_path_tr = "../BrainMRI_New/Training/Glioma_Tumor/"
new_notumor_path_tr = "../BrainMRI_New/Training/No_Tumor/"
new_pituitary_path_tr = "../BrainMRI_New/Training/Pituitary_Tumor/"
new_meningioma_path_tr = "../BrainMRI_New/Training/Meningioma_Tumor/"

new_training_paths = [new_glioma_path_tr, new_pituitary_path_tr, new_meningioma_path_tr, new_notumor_path_tr]

# New Test Directories
new_glioma_path_te = "../BrainMRI_New/Testing/Glioma_Tumor/"
new_notumor_path_te = "../BrainMRI_New/Testing/No_Tumor/"
new_pituitary_path_te = "../BrainMRI_New/Testing/Pituitary_Tumor/"
new_meningioma_path_te = "../BrainMRI_New/Testing/Meningioma_Tumor/"

new_test_paths = [new_glioma_path_te, new_pituitary_path_te, new_meningioma_path_te, new_notumor_path_te]

In [None]:
# Training dataset
# rgb to gray conversion
# resizing into (300, 300)

for path, new_path in zip(training_paths, new_training_paths):
    
    for img_name in os.listdir(path):
        
        if os.path.isfile(path+img_name):
        
            old_dir = path+img_name
            new_dir = new_path+img_name
            img = Image.open(old_dir)
            
            try:
                img_gray = ImageOps.grayscale(img)
                resized_gray = img_gray.resize((300,300), PIL.Image.ANTIALIAS)
                resized_gray.save(new_dir, quality=95)
            except ValueError:
                print("There is something wrong")

In [None]:
# test dataset
# rgb to gray conversion
# resizing into (300, 300)

for path, new_path in zip(test_paths, new_test_paths):
    
    for img_name in os.listdir(path):
        
        if os.path.isfile(path+img_name):
        
            old_dir = path+img_name
            new_dir = new_path+img_name
            img = Image.open(old_dir)
            
            try:
                img_gray = ImageOps.grayscale(img)
                resized_gray = img_gray.resize((300,300), PIL.Image.ANTIALIAS)
                resized_gray.save(new_dir, quality=95)
            except ValueError:
                print("There is something wrong")

## Step 10: Inspecting the new resized and grayscale images

In [None]:
# Visualizing the new images of different class in test datasets

for path in new_test_paths:
    list_images = os.listdir(path)
    fig, ax = plt.subplots(ncols=2, nrows=1, figsize=(12,6))
    for i in range(2):
        # plt.figure()
        array = plt.imread(os.path.join(path, list_images[i]))
        ax[i].imshow(array)
        if path.split('/')[-2] == 'NoTumor':
            ax[i].set_title(path.split('/')[-2])
        else:
            ax[i].set_title(path.split('/')[-2]+' Tumor')
    fig.tight_layout()

In [16]:
# Specify the Simple model using Dense Layer [optimizer = 'adam']
early_stopping_monitor = EarlyStopping(patience=3)
model1 = Sequential()
model1.add(Flatten(input_shape=(300,300,3)))
model1.add(Dense(512, activation='relu'))
model1.add(Dense(256, activation='relu'))
model1.add(Dense(128, activation='relu'))
model1.add(Dense(64, activation='relu'))
model1.add(Dense(32, activation='relu'))
model1.add(Dense(16, activation='relu'))
model1.add(Dense(4, activation='softmax'))

# Compile the model
model1.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

# Fit the model
history1 = model1.fit((X_train, y_train), epochs = 20, batch_size=10, validation_data = ImageDataGenerator(1./255).flow((X_test, y_test)), callbacks=[early_stopping_monitor])

Epoch 1/20


ValueError: in user code:

    File "C:\Users\shaja\Anaconda3\lib\site-packages\keras\engine\training.py", line 878, in train_function  *
        return step_function(self, iterator)
    File "C:\Users\shaja\Anaconda3\lib\site-packages\keras\engine\training.py", line 867, in step_function  **
        outputs = model.distribute_strategy.run(run_step, args=(data,))
    File "C:\Users\shaja\Anaconda3\lib\site-packages\keras\engine\training.py", line 860, in run_step  **
        outputs = model.train_step(data)
    File "C:\Users\shaja\Anaconda3\lib\site-packages\keras\engine\training.py", line 808, in train_step
        y_pred = self(x, training=True)
    File "C:\Users\shaja\Anaconda3\lib\site-packages\keras\utils\traceback_utils.py", line 67, in error_handler
        raise e.with_traceback(filtered_tb) from None
    File "C:\Users\shaja\Anaconda3\lib\site-packages\keras\engine\input_spec.py", line 199, in assert_input_compatibility
        raise ValueError(f'Layer "{layer_name}" expects {len(input_spec)} input(s),'

    ValueError: Layer "sequential_5" expects 1 input(s), but it received 2 input tensors. Inputs received: [<tf.Tensor 'IteratorGetNext:0' shape=(None, 300, 300, 3) dtype=uint8>, <tf.Tensor 'IteratorGetNext:1' shape=(None, 4) dtype=float32>]


In [None]:
# Specifying Model Architecture (Convolutional Neural Network)
early_stopping_monitor = EarlyStopping(patience=3)

model2 = keras.Sequential()

# Convolutional layer and maxpool layer 1
model2.add(keras.layers.Conv2D(32,(3,3),activation='relu',input_shape=(300,300,1)))
model2.add(keras.layers.MaxPool2D(2,2))

# Convolutional layer and maxpool layer 2
model2.add(keras.layers.Conv2D(64,(3,3),activation='relu'))
model2.add(keras.layers.MaxPool2D(2,2))

# Convolutional layer and maxpool layer 3
model2.add(keras.layers.Conv2D(128,(3,3),activation='relu'))
model2.add(keras.layers.MaxPool2D(2,2))

# Convolutional layer and maxpool layer 4
model2.add(keras.layers.Conv2D(128,(3,3),activation='relu'))
model2.add(keras.layers.MaxPool2D(2,2))

# This layer flattens the resulting image array to 1D array
model2.add(keras.layers.Flatten())

# Hidden layer with 512 neurons and Rectified Linear Unit activation function 
model2.add(keras.layers.Dense(512,activation='relu'))

# Output layer with single neuron which gives 0 for Cat or 1 for Dog 
#Here we use sigmoid activation function which makes our model output to lie between 0 and 1
model2.add(keras.layers.Dense(4,activation='softmax'))

In [None]:
model2.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
train_dataset = datagen.flow_from_directory("../BrainMRI/Training", target_size=(300,300), batch_size = 32,
                                          class_mode = 'categorical', color_mode="grayscale")
test_dataset  = datagen.flow_from_directory("../BrainMRI/Testing", target_size=(300,300), batch_size = 32,
                                          class_mode = 'categorical', color_mode="grayscale")

In [None]:
#steps_per_epoch = train_imagesize/batch_size

start = time.time()

model2.fit(train_dataset, epochs = 15, batch_size=32, validation_data = test_dataset, callbacks = [early_stopping_monitor])

end = time.time() 
print(end - start)

In [None]:
model2.evaluate(test_dataset)

In [None]:
predictions2 = model2.predict(test_dataset)

In [None]:
# base_model = MobileNet(
#     input_shape=(h, w, 3), 
#     weights='imagenet',
#     include_top=False, 
#     pooling='avg'
# )

base_model = VGG16(
    input_shape=(h, w, 3), 
    weights='imagenet',
    include_top=False, 
    pooling='max'
)

base_model.summary()

In [None]:
base_model.trainable = False

output_class = 4

model = Sequential([
    base_model,
    Dropout(rate=0.5),
    Dense(output_class, activation='softmax')
])

model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
model.summary()

In [None]:
earlystop = EarlyStopping(monitor='val_loss', patience=5)

learning_rate_reduction = ReduceLROnPlateau(monitor='val_acc', 
                                            patience=2, 
                                            verbose=1, 
                                            factor=0.5, 
                                            min_lr=0.00001)

callbacks = [earlystop, learning_rate_reduction]

In [None]:
datagen = ImageDataGenerator(
        rescale=1./255,
        shear_range=0.2,
        zoom_range=0.2,
        horizontal_flip=True)

In [None]:
history = model.fit(datagen.flow(X_train, y_train, batch_size = batch_size), validation_data = (X_test, y_test),
                    steps_per_epoch = len(X_train) / batch_size, epochs = epochs, callbacks = EarlyStopping(patience=3))