# Brain Tumor MRI Classification Project

## Project Overview
This project aims to classify brain tumors from MRI images into four distinct categories: Glioma, Meningioma, No Tumor, and Pituitary. In this notebook, we load the preprocessed MRI data, train a MobileNetV2 transfer-learning classifier with data augmentation, and evaluate it on the validation and test sets (with and without test-time augmentation).

## Extract Preprocessed Data

In [None]:
import os
import zipfile
import glob

def extract_preprocessed_data(content_dir='/content'):
    """Make sure the extracted ``preprocessed_data`` folder is available.

    The function first checks whether ``<content_dir>/preprocessed_data``
    already holds every required file and, if so, returns immediately; no
    archive is needed in that case.  Otherwise it looks for
    ``preprocessed_data.zip`` (or any ``*preprocessed*.zip``) directly inside
    ``content_dir``, extracts it there, and verifies that the required files
    are present afterwards.

    Args:
        content_dir: Directory that holds the archive and receives the
            extracted folder.  Defaults to ``/content``.

    Returns:
        True when the complete dataset is available, False otherwise.  A
        short status message is printed in every case.
    """
    required_files = (
        'X_train.npy', 'X_val.npy', 'X_test.npy',
        'y_train.npy', 'y_val.npy', 'y_test.npy',
        'y_train_cat.npy', 'y_val_cat.npy', 'y_test_cat.npy',
        'config.json',
    )
    data_dir = os.path.join(content_dir, 'preprocessed_data')

    def missing_files():
        # Names from required_files that are not on disk yet.
        return [
            name for name in required_files
            if not os.path.exists(os.path.join(data_dir, name))
        ]

    # Check the extracted data first: a complete folder is usable even when
    # the archive itself has been deleted.
    if not missing_files():
        print("preprocessed_data folder already exists")
        return True

    # The canonical archive name wins; other matches are sorted so the
    # choice does not depend on directory listing order.
    zip_candidates = [
        os.path.join(content_dir, 'preprocessed_data.zip'),
        *sorted(glob.glob(
            os.path.join(glob.escape(content_dir), '*preprocessed*.zip')
        )),
    ]
    zip_path = next(
        (candidate for candidate in zip_candidates if os.path.exists(candidate)),
        None,
    )
    if zip_path is None:
        print(f"preprocessed_data.zip not found in {os.path.join(content_dir, '')}")
        return False

    try:
        with zipfile.ZipFile(zip_path, 'r') as zip_ref:
            zip_ref.extractall(content_dir)
    except Exception as e:
        # Notebook-level boundary: report the problem instead of raising so
        # the caller can react to the False return value.
        print(f"ERROR: {str(e)}")
        return False

    # An archive that extracts cleanly can still be incomplete.
    missing = missing_files()
    if missing:
        print(f"ERROR: archive is missing required files: {', '.join(missing)}")
        return False

    print(f"Extraction completed: {os.path.basename(zip_path)}")
    return True

# Run the extraction and, on success, list what ended up in the data folder.
data_ready = extract_preprocessed_data()
if not data_ready:
    print("Cannot proceed without preprocessed data")
else:
    extracted_names = sorted(os.listdir('/content/preprocessed_data'))
    for entry in extracted_names:
        print(f"├── {entry}")

Extraction completed: preprocessed_data.zip
├── X_test.npy
├── X_train.npy
├── X_val.npy
├── config.json
├── y_test.npy
├── y_test_cat.npy
├── y_train.npy
├── y_train_cat.npy
├── y_val.npy
├── y_val_cat.npy


## Environment and Dependencies
We use TensorFlow and Keras to build the neural network, and NumPy and Pandas
for data handling. Matplotlib and Seaborn are used for performance visualization.

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import os
import json
import time
from datetime import datetime
import warnings
warnings.filterwarnings('ignore')
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers, models
from tensorflow.keras.preprocessing.image import ImageDataGenerator
from tensorflow.keras.callbacks import (ModelCheckpoint,ReduceLROnPlateau,LearningRateScheduler)
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.applications import MobileNetV2
# Use one seed value for both the NumPy and the TensorFlow random generators.
SEED = 42
np.random.seed(SEED)
tf.random.set_seed(SEED)

# Plot defaults applied to figures created later in the notebook.
sns.set_style('whitegrid')
plt.rcParams['figure.figsize'] = (12, 8)

# Report the runtime: TensorFlow version and the GPUs TensorFlow can see.
gpu_devices = tf.config.list_physical_devices('GPU')
print(f"TensorFlow Version: {tf.__version__}")
print(f"GPU Available: {gpu_devices}")

TensorFlow Version: 2.19.0
GPU Available: [PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]


## Data Pipeline
The dataset consists of preprocessed MRI scans stored as NumPy arrays. We define paths for
loading data and saving training artifacts.

In [None]:
# Where the preprocessed arrays are read from and where training artifacts go.
DATA_PATH = '/content/preprocessed_data'
OUTPUT_PATH = '/content/training_results'

# Create the output root and one sub-folder per artifact type.
os.makedirs(OUTPUT_PATH, exist_ok=True)
for artifact_dir in ('models', 'histories', 'plots'):
    os.makedirs(f'{OUTPUT_PATH}/{artifact_dir}', exist_ok=True)

# Image arrays for the three splits.
X_train, X_val, X_test = (
    np.load(f'{DATA_PATH}/X_{split}.npy') for split in ('train', 'val', 'test')
)

# Label arrays for the same splits (the "_cat" files; presumably one-hot
# encoded, since they are later used with categorical_crossentropy).
y_train_cat, y_val_cat, y_test_cat = (
    np.load(f'{DATA_PATH}/y_{split}_cat.npy') for split in ('train', 'val', 'test')
)

# Dataset configuration written by the preprocessing step.
with open(f'{DATA_PATH}/config.json', 'r') as config_file:
    config = json.load(config_file)

## Data Augmentation Strategy

To improve model generalization and mitigate overfitting, we implement a moderate augmentation strategy that includes rotations, shifts, and flips. Vertical flipping is deemed safe for MRI
brain scans.

In [None]:
# Random transformations applied to the training images; kept in a dict so the
# whole augmentation policy can be read (or logged) in one place.
augmentation_settings = {
    'rotation_range': 20,
    'width_shift_range': 0.15,
    'height_shift_range': 0.15,
    'shear_range': 0.15,
    'zoom_range': 0.15,
    'horizontal_flip': True,
    'vertical_flip': True,
    'fill_mode': 'nearest',
}
train_datagen = ImageDataGenerator(**augmentation_settings)

## Model Architecture

This time we use MobileNetV2, loaded with ImageNet weights, as the transfer-learning base model. We freeze the layers of the base model and train only the classification head placed on top of it. In the head, the global average pooling output and each hidden Dense layer are followed by Batch Normalization and Dropout.

In [None]:
def build_mobilenetv2(input_shape=(224, 224, 3), num_classes=4,
                      dropout_rate=0.55, hidden_units=(512, 256),
                      weights='imagenet'):
    """Build a classifier made of a frozen MobileNetV2 base and a dense head.

    The head is: global average pooling, then for every entry of
    ``hidden_units`` a BatchNormalization -> Dropout -> Dense(relu) group,
    and finally BatchNormalization -> Dropout -> Dense(softmax).  With the
    default arguments the layer stack is the same as the previously
    hard-coded one (dropout 0.55, hidden layers of 512 and 256 units).

    Args:
        input_shape: Shape of one input image, passed to MobileNetV2.
        num_classes: Number of units in the softmax output layer.
        dropout_rate: Rate used by every Dropout layer in the head.
        hidden_units: Sizes of the hidden Dense layers, in order.
        weights: Forwarded to ``MobileNetV2(weights=...)``; ``'imagenet'``
            loads the pretrained weights.

    Returns:
        An uncompiled ``Sequential`` model.
    """
    # Load MobileNetV2 without its own classification top.
    base_model = MobileNetV2(
        input_shape=input_shape,
        include_top=False,
        weights=weights
    )

    # Freeze the base so only the head added below is trained.
    base_model.trainable = False

    head = [base_model, layers.GlobalAveragePooling2D()]
    for units in hidden_units:
        head += [
            layers.BatchNormalization(),
            layers.Dropout(dropout_rate),
            layers.Dense(units, activation='relu'),
        ]
    head += [
        layers.BatchNormalization(),
        layers.Dropout(dropout_rate),
        layers.Dense(num_classes, activation='softmax'),
    ]

    return models.Sequential(head)

In [None]:
# Input shape is taken from the training array (everything after the sample
# axis); the number of output classes comes from the dataset config.
image_shape = X_train.shape[1:]
class_count = config['num_classes']
model = build_mobilenetv2(input_shape=image_shape, num_classes=class_count)

Downloading data from https://storage.googleapis.com/tensorflow/keras-applications/mobilenet_v2/mobilenet_v2_weights_tf_dim_ordering_tf_kernels_1.0_224_no_top.h5
[1m9406464/9406464[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 0us/step


## Training Process

The model is trained for 100 epochs using the Adam optimizer. We monitor validation accuracy
to checkpoint the best model, and halve the learning rate whenever the validation loss stops improving for 7 epochs.

In [None]:
# Optimizer and metrics are built up front so the compile call stays short.
optimizer = Adam(learning_rate=0.001)
tracked_metrics = [
    'accuracy',
    tf.keras.metrics.Precision(name='precision'),
    tf.keras.metrics.Recall(name='recall'),
]
model.compile(
    optimizer=optimizer,
    loss='categorical_crossentropy',
    metrics=tracked_metrics,
)

In [None]:
# Keep on disk only the model with the best validation accuracy seen so far.
checkpoint_cb = ModelCheckpoint(
    filepath=f'{OUTPUT_PATH}/models/best_model.h5',
    monitor='val_accuracy',
    save_best_only=True,
)

# Multiply the learning rate by 0.5 after 7 epochs without val_loss
# improvement, never going below 1e-7.
reduce_lr_cb = ReduceLROnPlateau(
    monitor='val_loss',
    factor=0.5,
    patience=7,
    min_lr=1e-7,
)

callbacks = [checkpoint_cb, reduce_lr_cb]

In [None]:
# Augmented training batches of 32 images; validation data is passed as-is.
train_batches = train_datagen.flow(X_train, y_train_cat, batch_size=32)
validation_set = (X_val, y_val_cat)

history = model.fit(
    train_batches,
    validation_data=validation_set,
    epochs=100,
    callbacks=callbacks,
)

Epoch 1/100
[1m152/152[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 451ms/step - accuracy: 0.5701 - loss: 1.3596 - precision: 0.5941 - recall: 0.5436



[1m152/152[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m110s[0m 568ms/step - accuracy: 0.5708 - loss: 1.3574 - precision: 0.5948 - recall: 0.5443 - val_accuracy: 0.7713 - val_loss: 0.5924 - val_precision: 0.7917 - val_recall: 0.7585 - learning_rate: 0.0010
Epoch 2/100
[1m152/152[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 359ms/step - accuracy: 0.7555 - loss: 0.6921 - precision: 0.7764 - recall: 0.7393



[1m152/152[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m56s[0m 368ms/step - accuracy: 0.7557 - loss: 0.6917 - precision: 0.7765 - recall: 0.7394 - val_accuracy: 0.8378 - val_loss: 0.4249 - val_precision: 0.8537 - val_recall: 0.8238 - learning_rate: 0.0010
Epoch 3/100
[1m152/152[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 361ms/step - accuracy: 0.7818 - loss: 0.6168 - precision: 0.8021 - recall: 0.7635



[1m152/152[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m56s[0m 371ms/step - accuracy: 0.7819 - loss: 0.6165 - precision: 0.8022 - recall: 0.7635 - val_accuracy: 0.8390 - val_loss: 0.4209 - val_precision: 0.8556 - val_recall: 0.8226 - learning_rate: 0.0010
Epoch 4/100
[1m152/152[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 357ms/step - accuracy: 0.7981 - loss: 0.5194 - precision: 0.8195 - recall: 0.7779



[1m152/152[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m56s[0m 365ms/step - accuracy: 0.7982 - loss: 0.5194 - precision: 0.8195 - recall: 0.7780 - val_accuracy: 0.8623 - val_loss: 0.3513 - val_precision: 0.8854 - val_recall: 0.8471 - learning_rate: 0.0010
Epoch 5/100
[1m152/152[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m55s[0m 365ms/step - accuracy: 0.8373 - loss: 0.4493 - precision: 0.8532 - recall: 0.8155 - val_accuracy: 0.8541 - val_loss: 0.3549 - val_precision: 0.8811 - val_recall: 0.8390 - learning_rate: 0.0010
Epoch 6/100
[1m152/152[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 355ms/step - accuracy: 0.8290 - loss: 0.4379 - precision: 0.8504 - recall: 0.8080



[1m152/152[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m55s[0m 363ms/step - accuracy: 0.8290 - loss: 0.4379 - precision: 0.8504 - recall: 0.8080 - val_accuracy: 0.8681 - val_loss: 0.3544 - val_precision: 0.8819 - val_recall: 0.8448 - learning_rate: 0.0010
Epoch 7/100
[1m152/152[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 356ms/step - accuracy: 0.8209 - loss: 0.4515 - precision: 0.8433 - recall: 0.8005



[1m152/152[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m55s[0m 363ms/step - accuracy: 0.8210 - loss: 0.4514 - precision: 0.8434 - recall: 0.8005 - val_accuracy: 0.8775 - val_loss: 0.3237 - val_precision: 0.8956 - val_recall: 0.8611 - learning_rate: 0.0010
Epoch 8/100
[1m152/152[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m55s[0m 364ms/step - accuracy: 0.8425 - loss: 0.4157 - precision: 0.8617 - recall: 0.8169 - val_accuracy: 0.8716 - val_loss: 0.3280 - val_precision: 0.8938 - val_recall: 0.8541 - learning_rate: 0.0010
Epoch 9/100
[1m152/152[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m56s[0m 365ms/step - accuracy: 0.8427 - loss: 0.4141 - precision: 0.8589 - recall: 0.8204 - val_accuracy: 0.8658 - val_loss: 0.3307 - val_precision: 0.8873 - val_recall: 0.8541 - learning_rate: 0.0010
Epoch 10/100
[1m152/152[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 355



[1m152/152[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m55s[0m 365ms/step - accuracy: 0.8525 - loss: 0.3932 - precision: 0.8728 - recall: 0.8308 - val_accuracy: 0.8868 - val_loss: 0.2916 - val_precision: 0.8978 - val_recall: 0.8716 - learning_rate: 0.0010
Epoch 11/100
[1m152/152[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m56s[0m 370ms/step - accuracy: 0.8473 - loss: 0.3952 - precision: 0.8637 - recall: 0.8318 - val_accuracy: 0.8786 - val_loss: 0.3169 - val_precision: 0.8936 - val_recall: 0.8623 - learning_rate: 0.0010
Epoch 12/100
[1m152/152[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m55s[0m 364ms/step - accuracy: 0.8581 - loss: 0.3697 - precision: 0.8738 - recall: 0.8419 - val_accuracy: 0.8868 - val_loss: 0.3029 - val_precision: 0.8971 - val_recall: 0.8751 - learning_rate: 0.0010
Epoch 13/100
[1m152/152[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m57s[0m 



[1m152/152[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m57s[0m 375ms/step - accuracy: 0.8671 - loss: 0.3516 - precision: 0.8788 - recall: 0.8538 - val_accuracy: 0.8880 - val_loss: 0.2752 - val_precision: 0.9017 - val_recall: 0.8775 - learning_rate: 0.0010
Epoch 24/100
[1m152/152[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m58s[0m 382ms/step - accuracy: 0.8637 - loss: 0.3405 - precision: 0.8768 - recall: 0.8488 - val_accuracy: 0.8786 - val_loss: 0.2893 - val_precision: 0.8940 - val_recall: 0.8658 - learning_rate: 0.0010
Epoch 25/100
[1m152/152[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 368ms/step - accuracy: 0.8606 - loss: 0.3668 - precision: 0.8721 - recall: 0.8421



[1m152/152[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m57s[0m 376ms/step - accuracy: 0.8607 - loss: 0.3667 - precision: 0.8721 - recall: 0.8422 - val_accuracy: 0.8973 - val_loss: 0.2496 - val_precision: 0.9088 - val_recall: 0.8833 - learning_rate: 0.0010
Epoch 26/100
[1m152/152[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m58s[0m 379ms/step - accuracy: 0.8736 - loss: 0.3365 - precision: 0.8816 - recall: 0.8602 - val_accuracy: 0.8915 - val_loss: 0.2697 - val_precision: 0.9042 - val_recall: 0.8810 - learning_rate: 0.0010
Epoch 27/100
[1m152/152[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m56s[0m 368ms/step - accuracy: 0.8641 - loss: 0.3634 - precision: 0.8789 - recall: 0.8499 - val_accuracy: 0.8961 - val_loss: 0.2725 - val_precision: 0.9012 - val_recall: 0.8833 - learning_rate: 0.0010
Epoch 28/100
[1m152/152[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m56s[0m 



[1m152/152[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m57s[0m 373ms/step - accuracy: 0.8780 - loss: 0.3276 - precision: 0.8888 - recall: 0.8665 - val_accuracy: 0.9032 - val_loss: 0.2510 - val_precision: 0.9127 - val_recall: 0.8903 - learning_rate: 0.0010
Epoch 31/100
[1m152/152[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m58s[0m 382ms/step - accuracy: 0.8799 - loss: 0.3226 - precision: 0.8932 - recall: 0.8706 - val_accuracy: 0.8868 - val_loss: 0.2840 - val_precision: 0.9016 - val_recall: 0.8763 - learning_rate: 0.0010
Epoch 32/100
[1m152/152[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m56s[0m 371ms/step - accuracy: 0.8702 - loss: 0.3424 - precision: 0.8798 - recall: 0.8564 - val_accuracy: 0.8996 - val_loss: 0.2702 - val_precision: 0.9143 - val_recall: 0.8833 - learning_rate: 0.0010
Epoch 33/100
[1m152/152[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m57s[0m 



[1m152/152[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m56s[0m 368ms/step - accuracy: 0.8987 - loss: 0.2825 - precision: 0.9108 - recall: 0.8900 - val_accuracy: 0.9078 - val_loss: 0.2591 - val_precision: 0.9142 - val_recall: 0.8950 - learning_rate: 2.5000e-04
Epoch 47/100
[1m152/152[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m56s[0m 367ms/step - accuracy: 0.8996 - loss: 0.2821 - precision: 0.9059 - recall: 0.8871 - val_accuracy: 0.9067 - val_loss: 0.2560 - val_precision: 0.9144 - val_recall: 0.8973 - learning_rate: 1.2500e-04
Epoch 48/100
[1m152/152[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m56s[0m 365ms/step - accuracy: 0.8919 - loss: 0.2834 - precision: 0.9005 - recall: 0.8802 - val_accuracy: 0.9055 - val_loss: 0.2510 - val_precision: 0.9155 - val_recall: 0.8973 - learning_rate: 1.2500e-04
Epoch 49/100
[1m152/152[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m 

In [None]:
# Short identifier used to build the file names of the saved training history
# and of the final model.
MODEL_NAME = 'mobilenetv2'

In [None]:
# Persist the per-epoch metrics collected by model.fit().
# NOTE: history.history is not a plain array (Keras exposes it as a dict), so
# np.save stores it as a pickled object; reading it back is expected to need
# np.load(history_path, allow_pickle=True).
history_path = f'{OUTPUT_PATH}/histories/{MODEL_NAME}_history.npy'
training_metrics = history.history
np.save(history_path, training_metrics)

In [None]:
# Save the model in its state after the last training epoch.  This is a
# separate file from the best_model.h5 checkpoint written during training.
final_model_path = f'{OUTPUT_PATH}/models/{MODEL_NAME}_final.h5'
model.save(filepath=final_model_path)



In [None]:
# Load the checkpoint that ModelCheckpoint kept for the highest val_accuracy.
# The "<MODEL_NAME>_final.h5" file holds the last-epoch model instead, which
# is not necessarily the best one, so it must not be used as "best_model".
best_model = keras.models.load_model(f'{OUTPUT_PATH}/models/best_model.h5')



## Test Time Augmentation (TTA)

TTA is used during the inference phase. We generate 10 augmented versions of each test
image and average their predictions together with the prediction on the original image, which makes the final classification more robust.

In [None]:
def predict_with_tta(model, X, n_augmentations=10):
    """Average the model's predictions over the original and augmented inputs.

    Args:
        model: Object with a ``predict(X, verbose=0)`` method.
        X: Input array; the whole array is augmented and predicted in a
            single batch each round.
        n_augmentations: Number of randomly augmented copies of ``X`` to
            predict on in addition to the original.

    Returns:
        The mean, taken across the ``n_augmentations + 1`` prediction
        arrays, of the per-sample predictions.
    """
    augmenter = ImageDataGenerator(
        rotation_range=15,
        width_shift_range=0.1,
        height_shift_range=0.1,
        horizontal_flip=True,
        vertical_flip=True
    )

    # The un-augmented prediction is always part of the average.
    all_predictions = [model.predict(X, verbose=0)]

    for _ in range(n_augmentations):
        # One batch as large as X, unshuffled, so row i of the augmented
        # batch still corresponds to row i of X.
        full_batch_iter = augmenter.flow(X, batch_size=len(X), shuffle=False)
        augmented_X = next(iter(full_batch_iter))
        all_predictions.append(model.predict(augmented_X, verbose=0))

    return np.mean(all_predictions, axis=0)

def _print_metrics(header, results):
    """Print loss, accuracy (as a percentage), precision and recall.

    ``results`` is indexed as [loss, accuracy, precision, recall].
    """
    print(header)
    print(f"Loss: {results[0]:.4f}")
    print(f"Accuracy: {results[1]*100:.2f}%")
    print(f"Precision: {results[2]:.4f}")
    print(f"Recall: {results[3]:.4f}")


# Test-time augmentation: accuracy is the share of samples whose averaged
# prediction has its arg-max at the same index as the label vector.
test_preds_tta = predict_with_tta(best_model, X_test, n_augmentations=10)
tta_predicted_classes = np.argmax(test_preds_tta, axis=1)
true_classes = np.argmax(y_test_cat, axis=1)
test_acc_tta = np.mean(tta_predicted_classes == true_classes)

print("\nTTA completed!")

# EVALUATION

print("\n")
print("EVALUATION")

# Validation
val_results = best_model.evaluate(X_val, y_val_cat, verbose=0)
_print_metrics("\nValidation Results (Best Model):", val_results)

# Test (standard)
test_results = best_model.evaluate(X_test, y_test_cat, verbose=0)
_print_metrics("\nTest Results (Standard):", test_results)

# Test (with TTA)
print("\nTest Results (With TTA):")
print(f"Accuracy: {test_acc_tta*100:.2f}%")

# The baseline figure is a fixed reference value, not computed in this notebook.
print("\nSUMMARY:")
print("Baseline Test Acc: 93.82%")
print(f"Test Acc (Standard): {test_results[1]*100:.2f}%")
print(f"Test Acc (TTA): {test_acc_tta*100:.2f}%")


TTA completed!


EVALUATION

Validation Results (Best Model):
Loss: 0.2478
Accuracy: 90.43%
Precision: 0.9144
Recall: 0.8973

Test Results (Standard):
Loss: 0.2768
Accuracy: 89.55%
Precision: 0.9014
Recall: 0.8856

Test Results (With TTA):
Accuracy: 93.14%

SUMMARY:
Baseline Test Acc: 93.82%
Test Acc (Standard): 89.55%
Test Acc (TTA): 93.14%
