# Brain Tumor MRI Classification Project

## Project Overview
This project aims to classify brain tumors from MRI images into four distinct categories: Glioma, Meningioma, No Tumor, and Pituitary. In this notebook, we load the preprocessed MRI arrays, fine-tune an ImageNet-pretrained ResNet-50 with data augmentation, and evaluate the trained model on the validation and test sets, with and without test-time augmentation (TTA).

## Extract Preprocessed Data

In [None]:
import os
import zipfile
import glob

def extract_preprocessed_data(base_dir='/content'):
    """Make sure the preprocessed dataset is unpacked under ``base_dir``.

    The function first checks whether ``<base_dir>/preprocessed_data`` already
    contains every required file; if so nothing is extracted. Otherwise it
    looks for ``preprocessed_data.zip`` (or any ``*preprocessed*.zip``) in
    ``base_dir``, extracts it into ``base_dir`` and verifies the result.

    Args:
        base_dir: Directory that holds the archive and receives the
            ``preprocessed_data`` folder. Defaults to the Colab working
            directory ``/content``.

    Returns:
        True when all required files are available after the call, False
        when no archive was found, extraction failed, or the archive did not
        provide every required file. Progress and errors are printed.
    """
    required_files = (
        'X_train.npy', 'X_val.npy', 'X_test.npy',
        'y_train.npy', 'y_val.npy', 'y_test.npy',
        'y_train_cat.npy', 'y_val_cat.npy', 'y_test_cat.npy',
        'config.json',
    )
    data_dir = os.path.join(base_dir, 'preprocessed_data')

    def missing_files():
        # Names from required_files that are not present in data_dir.
        return [
            name for name in required_files
            if not os.path.isfile(os.path.join(data_dir, name))
        ]

    # Check for usable data BEFORE looking for the archive: a complete folder
    # is sufficient even if the zip has been deleted in the meantime.
    if os.path.isdir(data_dir) and not missing_files():
        print("preprocessed_data folder already exists")
        return True

    # Prefer the canonical archive name, then fall back to any look-alike.
    # glob.escape keeps special characters in base_dir from acting as wildcards.
    zip_candidates = [
        os.path.join(base_dir, 'preprocessed_data.zip'),
        *sorted(glob.glob(os.path.join(glob.escape(base_dir), '*preprocessed*.zip'))),
    ]
    zip_path = next((path for path in zip_candidates if os.path.isfile(path)), None)
    if zip_path is None:
        print(f"preprocessed_data.zip not found in {os.path.join(base_dir, '')}")
        return False

    try:
        with zipfile.ZipFile(zip_path, 'r') as zip_ref:
            zip_ref.extractall(base_dir)
    except Exception as e:
        # Deliberately broad: this is a best-effort notebook step that reports
        # the failure and lets the caller decide, instead of raising.
        print(f"ERROR: {str(e)}")
        return False

    # An archive with a different layout would otherwise be reported as a
    # success and only fail later, when the arrays are loaded.
    still_missing = missing_files()
    if still_missing:
        print(f"ERROR: archive is missing required files: {', '.join(still_missing)}")
        return False

    print(f"Extraction completed: {os.path.basename(zip_path)}")
    return True

# Unpack the dataset (if needed) and show what is available; without the
# data there is nothing further this notebook can do.
data_ready = extract_preprocessed_data()
if not data_ready:
    print("Cannot proceed without preprocessed data")
else:
    extracted_names = sorted(os.listdir('/content/preprocessed_data'))
    for entry in extracted_names:
        print(f"├── {entry}")

Extraction completed: preprocessed_data.zip
├── X_test.npy
├── X_train.npy
├── X_val.npy
├── config.json
├── y_test.npy
├── y_test_cat.npy
├── y_train.npy
├── y_train_cat.npy
├── y_val.npy
├── y_val_cat.npy


## Environment and Dependencies
We utilize TensorFlow and Keras for building the neural network, along with NumPy and Pandas
for data handling. Matplotlib and Seaborn are used for performance visualization.

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import os
import json
import time
from datetime import datetime
import warnings
warnings.filterwarnings('ignore')
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers, models
from tensorflow.keras.preprocessing.image import ImageDataGenerator
from tensorflow.keras.callbacks import (ModelCheckpoint,ReduceLROnPlateau,LearningRateScheduler)
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.applications import ResNet50
# Seed NumPy's and TensorFlow's global random generators with the same value
# so that repeated runs start from the same random state.
SEED = 42
np.random.seed(SEED)
tf.random.set_seed(SEED)

# Plot defaults shared by every figure in the notebook.
plt.rcParams['figure.figsize'] = (12, 8)
sns.set_style('whitegrid')

# Report the runtime so results can be tied to a TF version / accelerator.
gpu_devices = tf.config.list_physical_devices('GPU')
print(f"TensorFlow Version: {tf.__version__}")
print(f"GPU Available: {gpu_devices}")

TensorFlow Version: 2.19.0
GPU Available: [PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]


## Data Pipeline
The dataset consists of preprocessed MRI scans stored as NumPy arrays. We define paths for
loading data and saving training artifacts.

In [None]:
# Where the preprocessed arrays live and where training artifacts are written.
DATA_PATH = '/content/preprocessed_data'
OUTPUT_PATH = '/content/training_results'

# Create the output tree up front: one folder per artifact type.
os.makedirs(OUTPUT_PATH, exist_ok=True)
for artifact_dir in ('models', 'histories', 'plots'):
    os.makedirs(f'{OUTPUT_PATH}/{artifact_dir}', exist_ok=True)

# Image arrays for the three splits, loaded in train / val / test order.
X_train, X_val, X_test = (
    np.load(f'{DATA_PATH}/X_{split}.npy') for split in ('train', 'val', 'test')
)
# Matching label arrays (the "_cat" files are used with categorical_crossentropy).
y_train_cat, y_val_cat, y_test_cat = (
    np.load(f'{DATA_PATH}/y_{split}_cat.npy') for split in ('train', 'val', 'test')
)

# Dataset metadata written by the preprocessing step (e.g. 'num_classes').
with open(f'{DATA_PATH}/config.json', 'r') as config_file:
    config = json.load(config_file)

## Data Augmentation Strategy

To improve model generalization and mitigate overfitting, we implement a moderate augmentation strategy that includes rotations, shifts, shear, zoom, and horizontal and vertical flips. Vertical flipping is deemed safe for MRI
brain scans.

In [None]:
# Random transformations applied on the fly to every training batch.
augmentation_kwargs = {
    # Geometric jitter
    'rotation_range': 20,
    'width_shift_range': 0.15,
    'height_shift_range': 0.15,
    'shear_range': 0.15,
    'zoom_range': 0.15,
    # Mirroring along both axes
    'horizontal_flip': True,
    'vertical_flip': True,
    # Pixels exposed by a transform are filled from the nearest valid pixel
    'fill_mode': 'nearest',
}
train_datagen = ImageDataGenerator(**augmentation_kwargs)

## Model Architecture

This time we use ResNet-50 as the transfer-learning backbone, loaded with ImageNet weights. We freeze the early layers of the base model and train its later layers. In the classification head, the pooling layer and each hidden dense layer are followed by Batch Normalization and Dropout.

In [None]:
def build_resnet50(input_shape=(224, 224, 3), num_classes=4, n_trainable_layers=30):
    """Build a ResNet50-based classifier for fine-tuning.

    The ImageNet-pretrained ResNet50 backbone (without its top) is followed by
    global average pooling and a dense head (512 -> 256 -> ``num_classes``)
    in which each stage is paired with BatchNormalization and Dropout(0.5).

    Args:
        input_shape: Shape of one input image, ``(height, width, channels)``.
        num_classes: Number of output classes of the softmax layer.
        n_trainable_layers: How many of the backbone's last layers are
            unfrozen for fine-tuning. ``0`` (or a negative value) freezes the
            whole backbone. BatchNormalization layers stay frozen regardless.

    Returns:
        An uncompiled ``Sequential`` Keras model.
    """
    # NOTE(review): the ImageNet weights were trained on inputs prepared with
    # keras.applications.resnet50.preprocess_input -- confirm that the saved
    # arrays use a compatible scaling.
    base_model = ResNet50(
        weights='imagenet',
        include_top=False,
        input_shape=input_shape
    )

    # Index of the first backbone layer that is allowed to train. Computed
    # explicitly because a negative slice such as layers[:-0] would be empty
    # and silently unfreeze everything.
    layer_count = len(base_model.layers)
    first_trainable = max(layer_count - max(n_trainable_layers, 0), 0)

    for index, layer in enumerate(base_model.layers):
        in_finetune_range = index >= first_trainable
        # Keep every backbone BatchNormalization layer frozen: a frozen BN
        # layer runs in inference mode, so the pretrained moving statistics
        # are not overwritten by small-batch statistics during fine-tuning.
        is_batch_norm = isinstance(layer, layers.BatchNormalization)
        layer.trainable = in_finetune_range and not is_batch_norm

    # Classification head on top of the pooled backbone features.
    model = models.Sequential([
        base_model,
        layers.GlobalAveragePooling2D(),
        layers.BatchNormalization(),
        layers.Dropout(0.5),
        layers.Dense(512, activation='relu'),
        layers.BatchNormalization(),
        layers.Dropout(0.5),
        layers.Dense(256, activation='relu'),
        layers.BatchNormalization(),
        layers.Dropout(0.5),
        layers.Dense(num_classes, activation='softmax')
    ])

    return model

In [None]:
# Derive the network's input shape from the data itself (everything after the
# sample axis) and the class count from the dataset config.
image_shape = X_train.shape[1:]
class_count = config['num_classes']
model = build_resnet50(input_shape=image_shape, num_classes=class_count)

Downloading data from https://storage.googleapis.com/tensorflow/keras-applications/resnet/resnet50_weights_tf_dim_ordering_tf_kernels_notop.h5
[1m94765736/94765736[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 0us/step


## Training Process

The model is trained for 100 epochs using the Adam optimizer. We monitor validation accuracy
to checkpoint the best model, and halve the learning rate whenever the validation loss has not improved for 7 epochs.

In [None]:
# Metrics reported per epoch; the order here is the order in which
# model.evaluate() later returns them (after the loss).
tracked_metrics = [
    'accuracy',
    tf.keras.metrics.Precision(name='precision'),
    tf.keras.metrics.Recall(name='recall'),
]
adam = Adam(learning_rate=0.001)
model.compile(loss='categorical_crossentropy', optimizer=adam, metrics=tracked_metrics)

In [None]:
# Keep on disk only the model with the highest validation accuracy so far.
best_checkpoint = ModelCheckpoint(
    filepath=f'{OUTPUT_PATH}/models/best_model.h5',
    monitor='val_accuracy',
    save_best_only=True,
)
# Halve the learning rate after 7 epochs without validation-loss improvement,
# never going below 1e-7.
lr_on_plateau = ReduceLROnPlateau(
    monitor='val_loss',
    factor=0.5,
    patience=7,
    min_lr=1e-7,
)
callbacks = [best_checkpoint, lr_on_plateau]

In [None]:
# Augmented training batches are generated on the fly; validation data is
# passed as plain arrays and therefore evaluated without augmentation.
train_batches = train_datagen.flow(X_train, y_train_cat, batch_size=32)
validation_arrays = (X_val, y_val_cat)
history = model.fit(
    train_batches,
    validation_data=validation_arrays,
    epochs=100,
    callbacks=callbacks,
)

Epoch 1/100
[1m152/152[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 463ms/step - accuracy: 0.4596 - loss: 1.6115 - precision: 0.4813 - recall: 0.3937



[1m152/152[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m111s[0m 554ms/step - accuracy: 0.4599 - loss: 1.6099 - precision: 0.4817 - recall: 0.3939 - val_accuracy: 0.3232 - val_loss: 3.1414 - val_precision: 0.3198 - val_recall: 0.3127 - learning_rate: 0.0010
Epoch 2/100
[1m152/152[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 399ms/step - accuracy: 0.5745 - loss: 1.0594 - precision: 0.6329 - recall: 0.4962



[1m152/152[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m64s[0m 418ms/step - accuracy: 0.5746 - loss: 1.0592 - precision: 0.6330 - recall: 0.4962 - val_accuracy: 0.6068 - val_loss: 0.9740 - val_precision: 0.7664 - val_recall: 0.4364 - learning_rate: 0.0010
Epoch 3/100
[1m152/152[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m63s[0m 415ms/step - accuracy: 0.6145 - loss: 0.9440 - precision: 0.6756 - recall: 0.5287 - val_accuracy: 0.5286 - val_loss: 1.2055 - val_precision: 0.5644 - val_recall: 0.4807 - learning_rate: 0.0010
Epoch 4/100
[1m152/152[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m62s[0m 406ms/step - accuracy: 0.6354 - loss: 0.8827 - precision: 0.6965 - recall: 0.5445 - val_accuracy: 0.5088 - val_loss: 1.4430 - val_precision: 0.5132 - val_recall: 0.5006 - learning_rate: 0.0010
Epoch 5/100
[1m152/152[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m62s[0m 409



[1m152/152[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m64s[0m 418ms/step - accuracy: 0.6727 - loss: 0.7865 - precision: 0.7297 - recall: 0.5983 - val_accuracy: 0.6476 - val_loss: 0.8895 - val_precision: 0.6773 - val_recall: 0.6196 - learning_rate: 0.0010
Epoch 9/100
[1m152/152[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m62s[0m 405ms/step - accuracy: 0.7011 - loss: 0.7618 - precision: 0.7487 - recall: 0.6246 - val_accuracy: 0.4586 - val_loss: 2.5716 - val_precision: 0.4589 - val_recall: 0.4434 - learning_rate: 0.0010
Epoch 10/100
[1m152/152[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m62s[0m 407ms/step - accuracy: 0.6912 - loss: 0.7592 - precision: 0.7505 - recall: 0.6089 - val_accuracy: 0.6231 - val_loss: 0.9674 - val_precision: 0.6305 - val_recall: 0.5636 - learning_rate: 0.0010
Epoch 11/100
[1m152/152[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m60s[0m 3



[1m152/152[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m62s[0m 406ms/step - accuracy: 0.7088 - loss: 0.6969 - precision: 0.7638 - recall: 0.6466 - val_accuracy: 0.7270 - val_loss: 0.7045 - val_precision: 0.7525 - val_recall: 0.6919 - learning_rate: 0.0010
Epoch 13/100
[1m152/152[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m62s[0m 404ms/step - accuracy: 0.7110 - loss: 0.7194 - precision: 0.7632 - recall: 0.6386 - val_accuracy: 0.2905 - val_loss: 3.6443 - val_precision: 0.2886 - val_recall: 0.2859 - learning_rate: 0.0010
Epoch 14/100
[1m152/152[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m61s[0m 399ms/step - accuracy: 0.7131 - loss: 0.7291 - precision: 0.7624 - recall: 0.6441 - val_accuracy: 0.6091 - val_loss: 1.0903 - val_precision: 0.6124 - val_recall: 0.5881 - learning_rate: 0.0010
Epoch 15/100
[1m152/152[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m63s[0m 



[1m152/152[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m63s[0m 410ms/step - accuracy: 0.7190 - loss: 0.6918 - precision: 0.7750 - recall: 0.6601 - val_accuracy: 0.7981 - val_loss: 0.5162 - val_precision: 0.8230 - val_recall: 0.7596 - learning_rate: 0.0010
Epoch 18/100
[1m152/152[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m62s[0m 410ms/step - accuracy: 0.7171 - loss: 0.6911 - precision: 0.7670 - recall: 0.6542 - val_accuracy: 0.6313 - val_loss: 1.1180 - val_precision: 0.6710 - val_recall: 0.6044 - learning_rate: 0.0010
Epoch 19/100
[1m152/152[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m61s[0m 401ms/step - accuracy: 0.7161 - loss: 0.6871 - precision: 0.7631 - recall: 0.6489 - val_accuracy: 0.4212 - val_loss: 3.8889 - val_precision: 0.4197 - val_recall: 0.4177 - learning_rate: 0.0010
Epoch 20/100
[1m152/152[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m62s[0m 



[1m152/152[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m61s[0m 400ms/step - accuracy: 0.7609 - loss: 0.6126 - precision: 0.7963 - recall: 0.6985 - val_accuracy: 0.8261 - val_loss: 0.4862 - val_precision: 0.8635 - val_recall: 0.7748 - learning_rate: 5.0000e-04
Epoch 27/100
[1m152/152[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m61s[0m 402ms/step - accuracy: 0.7393 - loss: 0.6537 - precision: 0.7757 - recall: 0.6865 - val_accuracy: 0.5613 - val_loss: 1.4285 - val_precision: 0.5896 - val_recall: 0.5146 - learning_rate: 5.0000e-04
Epoch 28/100
[1m152/152[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m60s[0m 396ms/step - accuracy: 0.7422 - loss: 0.6549 - precision: 0.7759 - recall: 0.6870 - val_accuracy: 0.7410 - val_loss: 0.6483 - val_precision: 0.7865 - val_recall: 0.6791 - learning_rate: 5.0000e-04
Epoch 29/100
[1m152/152[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m 



[1m152/152[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m61s[0m 400ms/step - accuracy: 0.7990 - loss: 0.5120 - precision: 0.8290 - recall: 0.7647 - val_accuracy: 0.8273 - val_loss: 0.4370 - val_precision: 0.8542 - val_recall: 0.7865 - learning_rate: 6.2500e-05
Epoch 55/100
[1m152/152[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m61s[0m 400ms/step - accuracy: 0.7931 - loss: 0.5163 - precision: 0.8251 - recall: 0.7562 - val_accuracy: 0.8063 - val_loss: 0.4570 - val_precision: 0.8319 - val_recall: 0.7911 - learning_rate: 6.2500e-05
Epoch 56/100
[1m152/152[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 381ms/step - accuracy: 0.7948 - loss: 0.5355 - precision: 0.8209 - recall: 0.7585



[1m152/152[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m61s[0m 400ms/step - accuracy: 0.7948 - loss: 0.5355 - precision: 0.8208 - recall: 0.7585 - val_accuracy: 0.8576 - val_loss: 0.3717 - val_precision: 0.8762 - val_recall: 0.8343 - learning_rate: 6.2500e-05
Epoch 57/100
[1m152/152[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m61s[0m 403ms/step - accuracy: 0.7958 - loss: 0.5143 - precision: 0.8206 - recall: 0.7564 - val_accuracy: 0.7468 - val_loss: 0.6472 - val_precision: 0.7620 - val_recall: 0.7211 - learning_rate: 6.2500e-05
Epoch 58/100
[1m152/152[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m60s[0m 393ms/step - accuracy: 0.8091 - loss: 0.5071 - precision: 0.8317 - recall: 0.7742 - val_accuracy: 0.8191 - val_loss: 0.4756 - val_precision: 0.8454 - val_recall: 0.7783 - learning_rate: 6.2500e-05
Epoch 59/100
[1m152/152[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m 



[1m152/152[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m62s[0m 408ms/step - accuracy: 0.7909 - loss: 0.5216 - precision: 0.8209 - recall: 0.7576 - val_accuracy: 0.8635 - val_loss: 0.3637 - val_precision: 0.8780 - val_recall: 0.8401 - learning_rate: 6.2500e-05
Epoch 60/100
[1m152/152[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m60s[0m 397ms/step - accuracy: 0.7784 - loss: 0.5512 - precision: 0.8103 - recall: 0.7437 - val_accuracy: 0.8308 - val_loss: 0.4548 - val_precision: 0.8470 - val_recall: 0.8075 - learning_rate: 6.2500e-05
Epoch 61/100
[1m152/152[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m61s[0m 398ms/step - accuracy: 0.8053 - loss: 0.5059 - precision: 0.8369 - recall: 0.7661 - val_accuracy: 0.8331 - val_loss: 0.4157 - val_precision: 0.8601 - val_recall: 0.8110 - learning_rate: 6.2500e-05
Epoch 62/100
[1m152/152[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m 

In [None]:
# Short identifier used as the filename prefix for this run's saved artifacts.
MODEL_NAME = 'resnet50'

In [None]:
# Persist the per-epoch metrics recorded by model.fit().
# NOTE: history.history is a Python dict, so np.save stores it as a pickled
# object array; reading it back requires
# np.load(history_path, allow_pickle=True).item().
history_path = f'{OUTPUT_PATH}/histories/{MODEL_NAME}_history.npy'
np.save(history_path, history.history)

In [None]:
# Save the model in its state after the last completed epoch. This is a
# separate file from models/best_model.h5, which the ModelCheckpoint callback
# writes whenever validation accuracy improves.
final_model_path = f'{OUTPUT_PATH}/models/{MODEL_NAME}_final.h5'
model.save(final_model_path)



In [None]:
# Load the checkpoint with the highest validation accuracy (written by the
# ModelCheckpoint callback to models/best_model.h5), not the last-epoch model
# saved as '<MODEL_NAME>_final.h5'. Everything evaluated through `best_model`
# is reported as "Best Model", so it must come from the best checkpoint.
best_model = keras.models.load_model(f'{OUTPUT_PATH}/models/best_model.h5')



## Test Time Augmentation (TTA)

TTA is utilized during the inference phase. For each test image, we average the prediction on the original image with the predictions on 10 randomly augmented versions of it,
which makes the final classification more robust.

In [None]:
def predict_with_tta(model, X, n_augmentations=10):
    """Average a model's predictions over the original and augmented inputs.

    Args:
        model: Object exposing ``predict(X, verbose=0)``.
        X: Array of input images; the whole array is augmented and predicted
            in a single batch per round.
        n_augmentations: Number of randomly augmented copies of ``X`` to
            predict in addition to the unmodified input.

    Returns:
        The element-wise mean of the ``n_augmentations + 1`` prediction
        arrays.
    """
    # Lighter transformations than the training-time generator.
    augmenter = ImageDataGenerator(
        rotation_range=15,
        width_shift_range=0.1,
        height_shift_range=0.1,
        horizontal_flip=True,
        vertical_flip=True
    )

    # The first entry is always the prediction on the untouched images.
    all_predictions = [model.predict(X, verbose=0)]

    for _ in range(n_augmentations):
        # One batch spanning the full array, unshuffled, so row k of every
        # prediction array refers to the same source image.
        full_batches = augmenter.flow(X, batch_size=len(X), shuffle=False)
        augmented_images = next(iter(full_batches))
        all_predictions.append(model.predict(augmented_images, verbose=0))

    return np.mean(all_predictions, axis=0)

def _print_results(heading, results):
    """Print loss, accuracy, precision and recall from an ``evaluate`` result.

    ``results`` is expected in the order loss, accuracy, precision, recall,
    i.e. the loss followed by the metrics in the order given to compile().
    """
    loss, accuracy, precision, recall = results[:4]
    print(f"\n{heading}:")
    print(f"Loss: {loss:.4f}")
    print(f"Accuracy: {accuracy*100:.2f}%")
    print(f"Precision: {precision:.4f}")
    print(f"Recall: {recall:.4f}")


# Test-time augmentation: averaged class probabilities, then plain accuracy
# by comparing the arg-max class with the arg-max of the one-hot labels.
test_preds_tta = predict_with_tta(best_model, X_test, n_augmentations=10)
tta_labels = np.argmax(test_preds_tta, axis=1)
true_labels = np.argmax(y_test_cat, axis=1)
test_acc_tta = np.mean(tta_labels == true_labels)

print("\nTTA completed!")

# EVALUATION

print("\n")
print("EVALUATION")

# Validation
val_results = best_model.evaluate(X_val, y_val_cat, verbose=0)
_print_results("Validation Results (Best Model)", val_results)

# Test (standard)
test_results = best_model.evaluate(X_test, y_test_cat, verbose=0)
_print_results("Test Results (Standard)", test_results)

# Test (with TTA)
print("\nTest Results (With TTA):")
print(f"Accuracy: {test_acc_tta*100:.2f}%")

# Side-by-side summary; the baseline figure is a fixed reference value.
print("\nSUMMARY:")
print("Baseline Test Acc: 93.82%")
print(f"Test Acc (Standard): {test_results[1]*100:.2f}%")
print(f"Test Acc (TTA): {test_acc_tta*100:.2f}%")


TTA completed!


EVALUATION

Validation Results (Best Model):
Loss: 0.4006
Accuracy: 84.25%
Precision: 0.8587
Recall: 0.8296

Test Results (Standard):
Loss: 0.5175
Accuracy: 79.71%
Precision: 0.8175
Recall: 0.7788

Test Results (With TTA):
Accuracy: 82.99%

SUMMARY:
Baseline Test Acc: 93.82%
Test Acc (Standard): 79.71%
Test Acc (TTA): 82.99%
