In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory tree. The loop body is currently a no-op:
# the print below is commented out, so nothing is listed. Un-comment it to
# print the full path of every available input file.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        pass
        #print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Importing Libraries

In [None]:
import tensorflow as tf
from tensorflow import keras
from keras.callbacks import EarlyStopping
import matplotlib.pyplot as plt
from sklearn import metrics
from sklearn.model_selection import train_test_split
import seaborn as sns
from tensorflow.keras.preprocessing.image import ImageDataGenerator
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import GlobalAveragePooling2D, Dense, Dropout, Flatten
from tensorflow.keras.layers import Conv2D, MaxPool2D, LeakyReLU, BatchNormalization, InputLayer
from keras.losses import BinaryCrossentropy
from tensorflow.keras.optimizers import Adam
from keras_tuner import RandomSearch
from keras_tuner.engine.hyperparameters import HyperParameters
import warnings
# NOTE(review): this silences every warning for the rest of the session,
# including deprecation warnings raised by the libraries imported above.
warnings.filterwarnings('ignore')

# Reading the data:

In [None]:
# Load the CSV that ships with the images; later cells read its 'Image' and
# 'Class' columns to look up the label of each image file.
df = pd.read_csv('/kaggle/input/brain-tumor/Brain Tumor.csv')
df.head()

In [None]:
import cv2
# Load every file in `path` as a 240x240 grayscale image and look up its label
# in `df`. `files`, `images` and `classes` stay index-aligned because later
# cells slice all three by position.
classes = []
images = []
path = '/kaggle/input/brain-tumor/Brain Tumor/Brain Tumor/'
files = os.listdir(path)

for fle in files:
    # The part of the file name before the first '.' is matched against df['Image'].
    fileName = fle.split('.')[0]
    status = df.loc[df['Image'] == fileName, 'Class'].to_numpy()
    # Exactly one label per image is required; anything else would produce a
    # ragged label list that cannot be converted to an array later on.
    if status.size != 1:
        raise ValueError(
            f"Expected exactly one 'Class' entry for image '{fileName}', found {status.size}")
    total = os.path.join(path, fle)
    image = cv2.imread(total)
    # cv2.imread returns None instead of raising when a file cannot be decoded.
    if image is None:
        raise ValueError(f'cv2 could not read image file: {total}')
    image = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
    image = cv2.resize(image, (240, 240))
    classes.append(status)
    images.append(image)

In [None]:
# Stack the loaded images into one float64 array with a trailing channel axis,
# and turn the labels into a float64 array as well.
images_f = np.array(images, dtype=np.float64)[..., np.newaxis]
classes_f = np.array(classes, dtype=np.float64)
# The Python lists are no longer needed once the arrays exist.
del images, classes
images_f.shape

In [None]:
# Viewing a sample image together with its label.
# `i` is a hard-coded example index into images_f / classes_f.
i = 234
plt.imshow(images_f[i], cmap = 'bone')
print(f'The label is: {classes_f[i]}')

In [None]:
# Inspect the prior distribution of each class: one bar per distinct label value.
values, counts = np.unique(classes_f, return_counts=True)

fig, ax = plt.subplots()
ax.bar(values, counts, color='maroon', width=0.4)
ax.set_xticks([0, 1])
ax.set_xticklabels(['Healthy', 'Tumor'])
ax.set_xlabel('Classes')
ax.set_ylabel('Distribution')
plt.show()

In [None]:
# Flatten the labels and split files/labels by position: the first `threshold`
# entries are used for training + validation, the remainder for testing.
clss = classes_f.squeeze()
threshold = 3200

# Training and validation dataframe
ds = pd.DataFrame({'file_name': files[:threshold], 'class': clss[:threshold]})

# Testing dataframe
dt = pd.DataFrame({'file_name': files[threshold:], 'class': clss[threshold:]})
ds.tail()

# Defining Data-loader for our CNN model:

In [None]:
# Create an augmenting data generator used to preview training images.
# Pixel values are rescaled by 1/255.
train_datagen = ImageDataGenerator(
            # augmentation options (flip, brightness, zoom) followed by the 1/255 rescale
            horizontal_flip=True,
            brightness_range=[0.5,1.2],
            zoom_range=0.4,
            rescale=1 / 255.0,
            )

# flow_from_dataframe reads file names and labels from `ds` and the image files from `path`
train_datagen_flow = train_datagen.flow_from_dataframe(
    dataframe = ds,
    directory= path,
    x_col="file_name", 
    y_col="class",
    # class_mode 'raw' passes the values of the 'class' column through unchanged
    # (numeric 0/1 labels for this binary classification task)
    class_mode="raw",
    # every image is resized to one common size
    # NOTE(review): this preview uses 200x200 while load_train / evaluate_model use 240x240
    target_size=(200,200),
    # batch_size is not set, so the library default applies (32 per the Keras documentation)
    #batch_size=32,
    seed=4,
    )

In [None]:
# Get one batch from the data generator and display its images in a 4x8 grid.
features, target = next(train_datagen_flow)
fig = plt.figure(figsize=(12,12))
# The grid has 32 slots; a batch smaller than that must not be indexed past its end.
for i in range(min(32, len(features))):
    fig.add_subplot(4, 8, i+1)
    plt.imshow(features[i])
    plt.title(f'Class: {target[i]}')
    # remove axes ticks for a more compact output
    plt.xticks([])
    plt.yticks([])
# The figure-level title and layout only need to be applied once, after all panels exist.
plt.suptitle('Images with real class',  y=0.9,fontsize=16, color='b')
plt.tight_layout()
print('Classes: 1 => Tumor, 0 => No-Tumor')

In [None]:
from keras import backend as K

def recall_m(y_true, y_pred):
    """Recall for one batch: rounded true positives over rounded actual positives.

    K.epsilon() is added to the denominator so an all-negative batch does not
    divide by zero.
    """
    tp = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
    actual_pos = K.sum(K.round(K.clip(y_true, 0, 1)))
    return tp / (actual_pos + K.epsilon())

def precision_m(y_true, y_pred):
    """Precision for one batch: rounded true positives over rounded predicted positives.

    K.epsilon() is added to the denominator so a batch without positive
    predictions does not divide by zero.
    """
    tp = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
    predicted_pos = K.sum(K.round(K.clip(y_pred, 0, 1)))
    return tp / (predicted_pos + K.epsilon())

def f1_m(y_true, y_pred):
    """F1 score for one batch, computed from precision_m and recall_m.

    K.epsilon() in the denominator guards against precision + recall == 0.
    """
    p = precision_m(y_true, y_pred)
    r = recall_m(y_true, y_pred)
    harmonic_half = (p * r) / (p + r + K.epsilon())
    return 2 * harmonic_half


def load_train(path, labels):

    """
    Build an augmenting batch iterator over the images listed in `labels`.

    path: directory that contains the image files.
    labels: dataframe with a 'file_name' column (image file names) and a
        'class' column (labels, passed through unchanged via class_mode='raw').

    Returns the iterator created by ImageDataGenerator.flow_from_dataframe.
    Images are resized to 240x240, rescaled by 1/255 and augmented with
    flips, brightness changes and zoom.
    """
    augmentation = dict(
        horizontal_flip=True,
        vertical_flip=True,
        brightness_range=[0.5, 1.2],
        zoom_range=0.4,
        rescale=1 / 255.0,
        dtype='float64',
    )
    flow_options = dict(
        dataframe=labels,
        directory=path,
        x_col='file_name',
        y_col='class',
        target_size=(240, 240),
        class_mode='raw',
        seed=12345,
    )
    return ImageDataGenerator(**augmentation).flow_from_dataframe(**flow_options)


# def load_test(path, labels):
    
#     """
#     It loads the train part of dataset from path
#     """
#     train_datagen = ImageDataGenerator(#validation_split=0.25,
#                                        rescale=1 / 255)
#     train_gen_flow = train_datagen.flow_from_dataframe(
#         dataframe=labels,
#         directory=path,
#         x_col='file_name',
#         y_col='class',
#         target_size=(240, 240),
#         #batch_size=32,
#         class_mode='raw',
#         seed=12345)

#     return train_gen_flow


def create_model(input_shape):

    """
    Build and compile the CNN used for binary tumor classification.

    input_shape: shape of a single input image, e.g. (240, 240, 3).

    Returns a compiled keras.Sequential model: five conv + max-pool stages,
    global average pooling, three dense blocks (Dense -> Dropout ->
    BatchNormalization) and a single sigmoid output unit. It is compiled with
    Adam, binary cross-entropy and the metrics binary_accuracy, f1_m,
    precision_m and recall_m. The layer summary is printed as a side effect.
    """

    model = keras.Sequential()
    model.add(InputLayer(input_shape))
    # Convolutional feature extractor: (filters, kernel_size) for each stage.
    for n_filters, kernel in [(32, 5), (64, 3), (96, 3), (128, 3), (96, 3)]:
        model.add(Conv2D(filters=n_filters, kernel_size=kernel, activation="relu", padding="same"))
        model.add(MaxPool2D())
    model.add(keras.layers.GlobalAveragePooling2D())
    # Dense head: each block is Dense -> Dropout -> BatchNormalization.
    for units in (96, 64, 32):
        model.add(Dense(units, activation='relu'))
        model.add(Dropout(rate=0.3))
        model.add(BatchNormalization())
    model.add(Dense(1, activation="sigmoid"))
    # `learning_rate` is the supported argument name; the old `lr` alias is
    # deprecated and is ignored or rejected by current Keras optimizers.
    optimizer = Adam(learning_rate=0.001)
    model.compile(optimizer=optimizer, loss = BinaryCrossentropy(), metrics=['binary_accuracy', f1_m, precision_m, recall_m])
    # summary() prints the table itself and returns None, so it is not wrapped in print().
    model.summary()

    return model


def train_model(model, train_data, test_data, batch_size, epochs=50):

    """
    Fit `model` on `train_data`, validating on `test_data` after every epoch.

    A ModelCheckpoint callback saves the full model (not only the weights)
    whenever the validation loss improves; the file name embeds the epoch
    number and the validation loss.

    batch_size and epochs are forwarded to model.fit unchanged.
    NOTE(review): the Keras docs say batch_size should not be specified when
    the data is a generator that already yields batches — confirm it is needed.

    Returns (model, history) where history is the object returned by model.fit.
    """

    best_checkpoint = tf.keras.callbacks.ModelCheckpoint(
        filepath='my_best_model.epoch{epoch:02d}-Loss{val_loss:.4f}.hdf5',
        monitor='val_loss',
        mode='min',
        save_best_only=True,
        save_weights_only=False)

    fit_history = model.fit(
        train_data,
        validation_data=test_data,
        batch_size=batch_size,
        epochs=epochs,
        callbacks=[best_checkpoint],
        verbose=1)

    return model, fit_history

# CNN Model Creation & Training:

In [None]:
# Hold out 20% of the rows of `ds` for validation, chosen without replacement;
# all remaining row positions are used for training.
test_split = 0.20
n = len(ds)
# A fixed seed keeps the train/validation split identical across notebook runs.
split_rng = np.random.default_rng(42)
test_idx = split_rng.choice(n, int(test_split*n), replace=False)
train_idx = np.setdiff1d(np.arange(n), test_idx)
train_idx.shape, test_idx.shape

In [None]:
# Training batches are randomly augmented by load_train.
train_data = load_train(path, ds.iloc[train_idx])
# Validation batches only get the 1/255 rescale. Random flips, zoom and
# brightness changes would make val_loss noisy, and val_loss is what the
# checkpoint callback uses to decide which model is "best".
val_datagen = ImageDataGenerator(rescale=1 / 255.0, dtype='float64')
test_data = val_datagen.flow_from_dataframe(
    dataframe=ds.iloc[test_idx],
    directory=path,
    x_col='file_name',
    y_col='class',
    target_size=(240, 240),
    class_mode='raw',
    shuffle=False)
# The generators yield RGB images, hence three input channels.
model1 = create_model(input_shape = (240, 240, 3))
model1, history = train_model(model1, train_data, test_data, 32, epochs = 80)

## Keras Tuner (optional)

In [None]:
# def build_model(hp): 
#     '''
#     In the previous model hyper-parameters were fixed. 
#     We need to define the model over some range of hyperparameters to tune it later.
#     '''
#     model = keras.Sequential([
#     keras.layers.InputLayer(input_shape = (240, 240, 3)),
#     keras.layers.Conv2D(
#         filters=hp.Int('conv_1_filter', min_value=32, max_value=128, step=16),
#         kernel_size=hp.Choice('conv_1_kernel', values = [3,5]),
#         activation='relu',
#         padding="same",
#         input_shape=(240, 240, 3)
#     ),
#     keras.layers.MaxPool2D(),
#     keras.layers.Conv2D(
#         filters=hp.Int('conv_2_filter', min_value=32, max_value=128, step=16),
#         kernel_size=hp.Choice('conv_2_kernel', values = [3,5]),
#         activation='relu',
#         padding="same"
#     ),
#     keras.layers.MaxPool2D(),
#     keras.layers.Conv2D(
#         filters=hp.Int('conv_3_filter', min_value=32, max_value=128, step=16),
#         kernel_size=hp.Choice('conv_2_kernel', values = [3,5]),
#         activation='relu',
#         padding="same"
#     ),
#     keras.layers.MaxPool2D(),
#     keras.layers.Conv2D(
#         filters=hp.Int('conv_4_filter', min_value=32, max_value=128, step=16),
#         kernel_size=hp.Choice('conv_2_kernel', values = [3,5]),
#         activation='relu',
#         padding="same"
#     ),
#     keras.layers.MaxPool2D(),
#     keras.layers.GlobalAveragePooling2D(),
#     keras.layers.Dense(
#         units=hp.Int('dense_1_units', min_value=32, max_value=128, step=16),
#         activation='relu'
#     ),
#     keras.layers.Dropout(rate=0.3),
#     keras.layers.BatchNormalization(),
#     keras.layers.Dense(
#         units=hp.Int('dense_2_units', min_value=32, max_value=128, step=16),
#         activation='relu'
#     ),
#     keras.layers.BatchNormalization(),
#     keras.layers.Dropout(rate=0.3),
#     keras.layers.Dense(1, activation='sigmoid')
#     ])

#     model.compile(optimizer=keras.optimizers.Adam(1e-3),#hp.Choice('learning_rate', values=[1e-2, 1e-3])),
#                   loss='binary_crossentropy',
#                   metrics=['accuracy'])

#     return model

In [None]:
# # Defining keras tuner object
# tuner_search=RandomSearch(build_model,
#                           objective='val_accuracy',
#                           max_trials=8,directory='output',project_name="Brain tumor classification"
#                          )

In [None]:
# tuner_search.search(images_f[:threshold],classes_f[:threshold],epochs=15,validation_split=0.3)

In [None]:
# model = tuner_search.get_best_models(num_models=1)[0]
# model.summary()

In [None]:
# Draw a diagram of model1's layers and write it to model.png.
plot_options = dict(
    to_file='model.png',
    show_shapes=True,
    show_dtype=True,
    show_layer_names=True,
    expand_nested=True,
    dpi=96,
    show_layer_activations=True,
    show_trainable=True,
)
tf.keras.utils.plot_model(model1, **plot_options)

In [None]:
# model, history = train_model(model, train_data, test_data, 32, epochs = 75)

In [None]:
# Find the epoch with the lowest validation loss and report its loss and accuracy.
# `m` (the loss) and `i` (the zero-based epoch index) are reused by the plotting cell.
val_losses = history.history['val_loss']
i = np.argmin(val_losses)
m = np.min(val_losses)
print(f"Minimum validation loss: {m!s} and validation accuracy: {history.history['val_binary_accuracy'][i]*100!s} %")
print(f'after iteration no. {i+1!s}.')

In [None]:
# Plot accuracy (left) and loss (right) per epoch. A red cross marks epoch `i`,
# the epoch with the lowest validation loss found in the previous cell.
fig, ax =  plt.subplots(1, 2, figsize = (13, 5))

ax[0].plot(history.history['binary_accuracy'], label='Training accuracy')
ax[0].plot(history.history['val_binary_accuracy'], label = 'Validation accuracy')
# The marker on the accuracy panel must use the validation accuracy at epoch i;
# plotting the loss value here would put it outside the y-limits set below.
ax[0].scatter(i, history.history['val_binary_accuracy'][i], marker = 'x', s = 200, c = 'r')
ax[0].set_xlabel('Epoch')
ax[0].set_ylabel('Accuracy')
ax[0].set_ylim([0.4, 1.1])
ax[0].grid()
ax[0].legend(loc='lower right')

ax[1].plot(history.history['loss'], label='Training loss')
ax[1].plot(history.history['val_loss'], label = 'Validation loss')
ax[1].scatter(i, history.history['val_loss'][i], marker = 'x', s = 200, c = 'r')
ax[1].set_xlabel('Epoch')
ax[1].set_ylabel('BCE Loss')
# Address the loss panel explicitly instead of relying on pyplot's "current axes".
ax[1].grid()
ax[1].legend(loc='upper right')

plt.savefig('Accuracy and Loss.png')
plt.show()

In [None]:
def evaluate_model(model1, dataframe = dt, directory = path):
    """
    Evaluate a compiled model on a held-out set of images.

    model1: compiled Keras model whose first compiled metric is the accuracy.
    dataframe: dataframe with 'file_name' and 'class' columns (defaults to the
        test dataframe `dt`).
    directory: directory containing the image files (defaults to `path`).

    Returns (test_loss, test_acc) as floats.
    Raises ValueError if the model was compiled without any metric.
    """
    # Test images are only rescaled, never augmented.
    test_datagen = ImageDataGenerator(rescale=1 / 255.0,
                                     dtype = 'float64')
    final_test_data = test_datagen.flow_from_dataframe(
        # use the arguments rather than the notebook globals so callers can
        # evaluate on a different dataframe / directory
        dataframe = dataframe,
        directory= directory,
        x_col="file_name",
        y_col="class",
        # 'raw' passes the numeric 0/1 labels through unchanged
        class_mode="raw",
        # same image size the model was trained with
        target_size=(240,240),
        # a fixed order keeps the evaluation deterministic
        shuffle=False,
        seed=4,
        )
    # evaluate() returns a scalar loss when no metrics are compiled, otherwise
    # [loss, metric_1, metric_2, ...]. The models built by create_model have
    # four metrics, so unpacking into exactly two names would fail.
    results = np.atleast_1d(model1.evaluate(final_test_data, verbose=2))
    if results.size < 2:
        raise ValueError('The model was compiled without metrics; no accuracy to report.')
    return float(results[0]), float(results[1])
evaluate_model(model1)

In [None]:
# from sklearn.model_selection import train_test_split

# # Assuming you have images in a numpy array 'images' of shape (3762, 240, 240, 3)
# # And class labels in a numpy array 'labels' of shape (3762,)

# # Step 1: Split the data into training set and temporary set (validation + test set)
# X_train, X_temp, y_train, y_temp = train_test_split(images_f, classes_f, test_size=0.4, random_state=42)
# del images_f, classes_f, 
# # Step 2: Split the temporary set into validation and test sets
# X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)

# # Check the shapes of the datasets
# print("Training set shape:", X_train.shape, y_train.shape)
# print("Validation set shape:", X_val.shape, y_val.shape)
# print("Test set shape:", X_test.shape, y_test.shape)

In [None]:
# model = create_model(input_shape = (240, 240, 1))
# checkpoint_filepath = 'my_best_model.epoch{epoch:02d}-Loss{val_loss:.4f}.hdf5'
# model_checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(
#     filepath=checkpoint_filepath,
#     save_weights_only= False,
#     monitor='val_loss',
#     mode='min',
#     save_best_only=True)

# history = model.fit(X_train, y_train, validation_data=(X_val, y_val), batch_size=32,
#           epochs=75, callbacks = [model_checkpoint_callback], verbose=1)

In [None]:
# fig, ax =  plt.subplots(1, 2, figsize = (13, 5))
# ax[0].plot(history.history['binary_accuracy'], label='Training accuracy')
# ax[0].plot(history.history['val_binary_accuracy'], label = 'Validation accuracy')
# ax[0].set_xlabel('Epoch')
# ax[0].set_ylabel('Accuracy')
# ax[0].set_ylim([0.4, 1.1])
# ax[0].grid(); ax[0].legend(loc='lower right')
# plt.legend(loc='lower right')
# ax[1].plot(history.history['loss'], label='Training loss')
# ax[1].plot(history.history['val_loss'], label = 'Validation loss')
# ax[1].set_xlabel('Epoch')
# ax[1].set_ylabel('BCE Loss')
# plt.grid()
# plt.legend(loc='upper right')
# plt.savefig('Accuracy and Loss.png')
# plt.show()

In [None]:
# Reload the best checkpoint written by train_model and evaluate it on the test set.
# The file name contains the epoch and validation loss of a particular run, so
# it is discovered instead of being hard-coded.
checkpoint_dir = '/kaggle/working'
checkpoint_names = [
    name for name in os.listdir(checkpoint_dir)
    if name.startswith('my_best_model.epoch') and name.endswith('.hdf5')
]
if not checkpoint_names:
    raise FileNotFoundError(f'No my_best_model.epoch*.hdf5 checkpoint found in {checkpoint_dir}')
# With save_best_only=True each checkpoint written improves on the previous
# ones, so the most recently written file is the best model.
best_checkpoint = max(
    (os.path.join(checkpoint_dir, name) for name in checkpoint_names),
    key=os.path.getmtime)
# The model was compiled with custom metric functions; Keras needs them passed
# explicitly to restore the compiled model from the file.
model2 = keras.models.load_model(
    best_checkpoint,
    custom_objects={'f1_m': f1_m, 'precision_m': precision_m, 'recall_m': recall_m})
test_loss, test_acc = evaluate_model(model2)


In [None]:
# Report the test accuracy (as a percentage) returned by evaluate_model for the reloaded checkpoint.
print(f'So here we are getting test accuracy of {test_acc*100} %, which is descent for such simple CNN model.')

In [None]:
# Change into the Kaggle output directory and list the files written there.
%cd /kaggle/working
os.listdir('/kaggle/working')
# from IPython.display import FileLink
# FileLink(r'my_best_model.epoch13-loss0.99.hdf5')
# FileLink(r'Accuracy and Loss.png')

In [None]:
# test_images = []
# test_classes = []
# n = len(dt)
# for i in range(n):
#     fle = dt['file_name'][i]
#     status = dt['class'][i]
#     test_classes.append(status)
#     total = path + fle
#     image= cv2.imread(total)
#     image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
#     image= cv2.resize(image,(240,240))
#     test_images.append(image)
    
# test_images = np.array(test_images, dtype = np.float64)/255.0
# test_classes = np.squeeze(np.array(test_classes, dtype = np.float64))

In [None]:
from tensorflow.keras.models import Model
# Build a feature extractor that shares model2's input and stops at an intermediate layer.
# Note: this rebinds `features`, which an earlier cell used for a batch of images.
ft_input = model2.input
ft_layers = model2.layers
# NOTE(review): for the architecture built by create_model the last four layers are
# Dense(32), Dropout, BatchNormalization, Dense(1), so layers[-4] would be the
# Dense(32) layer — confirm model2 has that architecture.
features = Model(inputs = ft_input, outputs = ft_layers[-4].output)
features.summary()

In [None]:
# X = np.concatenate([X_train, X_val], axis = 0)
# y = np.concatenate([y_train, y_val], axis = 0)
# del X_train, X_val, y_train, y_val

In [None]:
def show_images_with_labels(images, labels, nrows=5, ncols=5, figsize=(12, 12)):
    """Show a grid of images, each titled with its class label.

    images: indexable collection of images accepted by Axes.imshow.
    labels: indexable collection of labels, aligned with `images`.
    nrows, ncols: grid dimensions; at most nrows * ncols images are shown.
    figsize: figure size passed to plt.subplots.

    Grid cells without a corresponding image are left blank.
    """
    fig, axes = plt.subplots(nrows=nrows, ncols=ncols, figsize=figsize)
    fig.suptitle('Sample Images with Class Labels', fontsize=16)

    n_images = len(images)
    for idx, panel in enumerate(axes.flatten()):
        if idx >= n_images:
            # No image left for this cell: just hide its axes.
            panel.axis('off')
            continue

        panel.imshow(images[idx], cmap = 'bone')
        panel.axis('off')
        panel.set_title(f"Class: {labels[idx]}", fontsize=12)

    # Leave room at the top for the figure title.
    fig.tight_layout(rect=[0, 0, 1, 0.96])
    plt.show()

# Display a 5x5 grid of held-out images with their labels.
# The held-out images are the entries from position `threshold` onward (the
# files listed in `dt`). They are taken from images_f / classes_f because
# `test_images` / `test_classes` only exist in a commented-out cell.
preview = slice(threshold, threshold + 25)
show_images_with_labels(
    np.squeeze(images_f[preview], axis=-1),   # drop the trailing channel axis for imshow
    classes_f[preview].reshape(-1))           # one scalar label per image

In [None]:
from tqdm import tqdm
# Run the first n images (the rows of `ds`) through the CNN feature extractor.
image_features = []
n = len(ds)
feature_batch_size = 32
# Number of channels the extractor expects on its input.
n_channels = features.input_shape[-1]
# Predict in batches: one predict() call per image is very slow.
for start in tqdm(range(0, n, feature_batch_size)):
    batch = np.array(images_f[start:min(start + feature_batch_size, n)], dtype = np.float64)/255.0
    # images_f holds single-channel images, while the CNN was created with a
    # 3-channel input; replicate the gray channel so the shapes match.
    # NOTE(review): this approximates the RGB images the training generator
    # produced — confirm the source files are grayscale.
    if batch.shape[-1] == 1 and n_channels != 1:
        batch = np.repeat(batch, n_channels, axis=-1)
    feat = features.predict(batch, verbose=0)
    image_features.append(feat)

# Free the image array; this cell cannot be re-run without reloading the images.
del images_f
image_features = np.concatenate(image_features, axis=0)
image_features.shape

In [None]:
# Take the labels of the first n images and drop singleton axes from both
# the labels and the extracted features.
y = classes_f[:n].squeeze()
image_features = image_features.squeeze()
image_features.shape, y.shape

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import StratifiedKFold
import lightgbm as lgbm

# Set the number of folds for cross-validation
n_folds = 10

# Create StratifiedKFold object for cross-validation
kf = StratifiedKFold(n_splits=n_folds, shuffle=True, random_state=42)

image_features = np.squeeze(image_features)

# Out-of-fold results are written at the position of the sample they belong to.
# With shuffle=True the validation indices are not in the original order, so
# concatenating per-fold predictions would misalign them with `y`.
y_pred_cv = np.empty_like(y)
y_score_cv = np.empty(len(y), dtype=np.float64)

# Iterate over the folds
for train_idx, val_idx in kf.split(image_features, y):
    X_train, X_val = image_features[train_idx], image_features[val_idx]
    y_train, y_val = y[train_idx], y[val_idx]

    # Train a random forest on the CNN features of this fold (seeded for reproducibility)
    model_classifier = RandomForestClassifier(random_state=42)
    model_classifier.fit(X_train, y_train)

    # Predicted labels and positive-class probabilities for the validation part
    y_pred_cv[val_idx] = model_classifier.predict(X_val)
    # predict_proba columns follow classes_ (sorted), so column 1 is the larger label (tumor = 1)
    y_score_cv[val_idx] = model_classifier.predict_proba(X_val)[:, 1]

from sklearn.metrics import accuracy_score, classification_report, roc_auc_score

# Calculate accuracy
accuracy = accuracy_score(y, y_pred_cv)
print("Accuracy:", accuracy)

# Print classification report
print("Classification Report:")
print(classification_report(y, y_pred_cv))

# Calculate ROC-AUC score from the predicted probabilities, not the hard labels
roc_auc = roc_auc_score(y, y_score_cv)
print("ROC-AUC Score:", roc_auc)


In [None]:
# Fit LightGBM on the CNN features of all training images.
model_lgbm = lgbm.LGBMClassifier(device="gpu", gpu_platform_id=0, gpu_device_id=0)
model_lgbm.fit(image_features, y)

# Predict the held-out images listed in `dt`. No `X_test` exists at this point
# of the notebook, so the images are read from disk with the same
# preprocessing as in evaluate_model (1/255 rescale, 240x240, fixed order).
holdout_flow = ImageDataGenerator(rescale=1 / 255.0, dtype='float64').flow_from_dataframe(
    dataframe=dt,
    directory=path,
    x_col='file_name',
    class_mode=None,          # images only; labels are not needed for prediction
    target_size=(240, 240),
    shuffle=False)
# One feature vector per image; the classifier expects a 2-D (samples, features) array.
holdout_features = features.predict(holdout_flow)
pred = np.array(model_lgbm.predict(holdout_features), dtype = np.float64)
    

# Classification using a *Random Forest Classifier* on features extracted from the images:

In [None]:
# Print a summary of the feature table loaded from the CSV.
df.info()

In [None]:
# Descriptive statistics of the feature table.
df.describe()

In [None]:
df1 = df.drop(['Image', 'Class'], axis = 1) # drop the image name and the label from the feature set
df1.head()

In [None]:
from sklearn.preprocessing import StandardScaler

# Features are every column except 'Image' and 'Class'; the target is the 'Class' column.
X = df1 
y = df.Class  # rebinds `y`, which earlier cells used for the image labels
scaler = StandardScaler()
# NOTE(review): the scaler is fitted on every row before the train/test split
# in the next cell, so test rows influence the scaling statistics — consider
# fitting it on the training split only.
X_scaled = scaler.fit_transform(X) # Transforming the data to uniform scale for better accuracy

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the scaled rows for testing; random_state fixes the shuffle.
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score

# Train a random forest on the tabular features and report its test performance.
# The seed matches the one used for the train/test split so the reported
# numbers are reproducible across runs.
model_rf = RandomForestClassifier(random_state=42)
model_rf.fit(X_train, y_train)
pred_rfc = model_rf.predict(X_test)
print(classification_report(y_test, pred_rfc))
acc_rfc = accuracy_score(y_test, pred_rfc)
print(f'Random Forest Classifier has {100*acc_rfc} % accuracy!')

**So our CNN model performs almost on par with scikit-learn's Random Forest Classifier (trained on the pre-extracted tabular features) in terms of test accuracy!**