# Business Understanding

# Data Understanding

# Data Preparation (Binary Classification)

In [15]:
# Importing necessary libraries
import tensorflow as tf
from tensorflow.keras import models, layers, regularizers
from tensorflow.keras.preprocessing.image import ImageDataGenerator
from tensorflow.keras.wrappers.scikit_learn import KerasClassifier
from sklearn.model_selection import cross_val_predict
import numpy as np
from scikeras.wrappers import KerasClassifier
import os
import shutil
from sklearn.model_selection import train_test_split

In [17]:
# Organizing the data into tumor/nontumor and splitting it into train, val, and test sets

import os
import shutil
from sklearn.model_selection import train_test_split

os.makedirs('tumor', exist_ok=True)
os.makedirs('nontumor', exist_ok=True)

for tumor_dir in ['glioma_tumor', 'meningioma_tumor', 'pituitary_tumor']:
    for img in os.listdir(tumor_dir):
        shutil.move(os.path.join(tumor_dir, img), 'tumor')

for img in os.listdir('normal'):
    shutil.move(os.path.join('normal', img), 'nontumor')

shutil.rmtree('glioma_tumor')
shutil.rmtree('meningioma_tumor')
shutil.rmtree('pituitary_tumor')
shutil.rmtree('normal')

def split_data(source_dir, dest_dir, split_ratios=(0.8, 0.1, 0.1)):
    files = os.listdir(source_dir)
    train_files, temp_files = train_test_split(files, train_size=split_ratios[0], random_state=42)
    val_files, test_files = train_test_split(temp_files, train_size=split_ratios[1]/(split_ratios[1] + split_ratios[2]), random_state=42)
    
    for file in train_files:
        shutil.copy(os.path.join(source_dir, file), os.path.join(dest_dir, 'train', os.path.basename(source_dir), file))
    for file in val_files:
        shutil.copy(os.path.join(source_dir, file), os.path.join(dest_dir, 'val', os.path.basename(source_dir), file))
    for file in test_files:
        shutil.copy(os.path.join(source_dir, file), os.path.join(dest_dir, 'test', os.path.basename(source_dir), file))

for split in ['train', 'val', 'test']:
    os.makedirs(os.path.join(split, 'tumor'), exist_ok=True)
    os.makedirs(os.path.join(split, 'nontumor'), exist_ok=True)

split_data('tumor', '.')
split_data('nontumor', '.')

shutil.rmtree('tumor')
shutil.rmtree('nontumor')


In [18]:
# Counting the number of files in each directory and printing the number of images per category in each dataset
import os

def count_files(directory):
    return sum([len(files) for r, d, files in os.walk(directory)])

for split in ['train', 'val', 'test']:
    tumor_count = count_files(os.path.join(split, 'tumor'))
    nontumor_count = count_files(os.path.join(split, 'nontumor'))
    print(f"{split.capitalize()} set - Tumor images: {tumor_count}, Nontumor images: {nontumor_count}")


Train set - Tumor images: 2126, Nontumor images: 350
Val set - Tumor images: 266, Nontumor images: 44
Test set - Tumor images: 266, Nontumor images: 44


In [19]:
# Preprocessing the data using ImageDataGenerator with augmentation for training and normalization for validation/test

from tensorflow.keras.preprocessing.image import ImageDataGenerator

train_datagen = ImageDataGenerator(
    rescale=1./255,
    rotation_range=20,
    width_shift_range=0.2,
    height_shift_range=0.2,
    shear_range=0.2,
    zoom_range=0.2,
    horizontal_flip=True
)

val_test_datagen = ImageDataGenerator(rescale=1./255)

train_generator = train_datagen.flow_from_directory(
    'train',
    target_size=(256, 256),
    batch_size=32,
    class_mode='binary'
)

val_generator = val_test_datagen.flow_from_directory(
    'val',
    target_size=(256, 256),
    batch_size=32,
    class_mode='binary'
)

test_generator = val_test_datagen.flow_from_directory(
    'test',
    target_size=(256, 256),
    batch_size=32,
    class_mode='binary'
)


Found 2476 images belonging to 2 classes.
Found 310 images belonging to 2 classes.
Found 310 images belonging to 2 classes.


# Modeling (Binary Classification)

## Baseline CNN Model

In [20]:
# Creating, compiling, training, evaluating, and saving a CNN model for binary classification

from tensorflow.keras import models, layers, optimizers
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.models import save_model
from sklearn.metrics import confusion_matrix, precision_score, recall_score

cnn_model = models.Sequential([
    layers.Conv2D(32, (3, 3), activation='relu', input_shape=(256, 256, 3)),
    layers.MaxPooling2D((2, 2)),
    layers.Conv2D(64, (3, 3), activation='relu'),
    layers.MaxPooling2D((2, 2)),
    layers.Conv2D(128, (3, 3), activation='relu'),
    layers.MaxPooling2D((2, 2)),
    layers.Conv2D(128, (3, 3), activation='relu'),
    layers.MaxPooling2D((2, 2)),
    layers.Flatten(),
    layers.Dense(512, activation='relu'),
    layers.Dense(1, activation='sigmoid')
])

cnn_model.compile(
    optimizer=optimizers.Adam(learning_rate=0.0001),
    loss='binary_crossentropy',
    metrics=['accuracy']
)

early_stopping = EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)

history = cnn_model.fit(
    train_generator,
    epochs=50,
    validation_data=val_generator,
    callbacks=[early_stopping]
)

val_loss, val_accuracy = cnn_model.evaluate(val_generator)
print(f"Validation Loss: {val_loss}")
print(f"Validation Accuracy: {val_accuracy}")

save_model(cnn_model, 'binary_cnn_model.h5')

val_generator.reset()
predictions = cnn_model.predict(val_generator)
predicted_classes = np.where(predictions > 0.5, 1, 0)

true_classes = val_generator.classes

conf_matrix = confusion_matrix(true_classes, predicted_classes)
precision = precision_score(true_classes, predicted_classes)
recall = recall_score(true_classes, predicted_classes)

print("Confusion Matrix:")
print(conf_matrix)
print(f"Precision: {precision}")
print(f"Recall: {recall}")


Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Validation Loss: 0.10753267258405685
Validation Accuracy: 0.9451612830162048
Confusion Matrix:
[[  7  37]
 [ 36 230]]
Precision: 0.8614232209737828
Recall: 0.8646616541353384


## CNN with Added Layers and Dropout

In [21]:
# Creating, compiling, training, evaluating, and saving an enhanced CNN model for binary classification

cnn_model = models.Sequential([
    layers.Conv2D(64, (3, 3), activation='relu', input_shape=(256, 256, 3)),
    layers.MaxPooling2D((2, 2)),
    
    layers.Conv2D(128, (3, 3), activation='relu'),
    layers.MaxPooling2D((2, 2)),
    
    layers.Conv2D(256, (3, 3), activation='relu'),
    layers.MaxPooling2D((2, 2)),
    
    layers.Conv2D(256, (3, 3), activation='relu'),
    layers.MaxPooling2D((2, 2)),
    
    layers.Flatten(),
    layers.Dense(512, activation='relu'),
    layers.Dropout(0.5),
    layers.Dense(1, activation='sigmoid')
])

cnn_model.compile(
    optimizer=optimizers.Adam(learning_rate=0.0001),
    loss='binary_crossentropy',
    metrics=['accuracy']
)

early_stopping = EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)

history = cnn_model.fit(
    train_generator,
    epochs=50,
    validation_data=val_generator,
    callbacks=[early_stopping]
)

val_loss, val_accuracy = cnn_model.evaluate(val_generator)
print(f"Validation Loss: {val_loss}")
print(f"Validation Accuracy: {val_accuracy}")

save_model(cnn_model, 'enhanced_cnn_model.h5')

val_generator.reset()
predictions = cnn_model.predict(val_generator)
predicted_classes = np.where(predictions > 0.5, 1, 0)

true_classes = val_generator.classes

conf_matrix = confusion_matrix(true_classes, predicted_classes)
precision = precision_score(true_classes, predicted_classes)
recall = recall_score(true_classes, predicted_classes)

print("Confusion Matrix:")
print(conf_matrix)
print(f"Precision: {precision}")
print(f"Recall: {recall}")


Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Validation Loss: 0.08121751993894577
Validation Accuracy: 0.9612902998924255
Confusion Matrix:
[[  3  41]
 [ 43 223]]
Precision: 0.8446969696969697
Recall: 0.8383458646616542


## Adding Layers and Adjusting Learning Rate

In [22]:
# Creating, compiling, training, evaluating, and saving a more complex CNN model with adjusted learning rate

cnn_model = models.Sequential([
    layers.Conv2D(64, (3, 3), activation='relu', input_shape=(256, 256, 3)),
    layers.MaxPooling2D((2, 2)),
    
    layers.Conv2D(128, (3, 3), activation='relu'),
    layers.MaxPooling2D((2, 2)),
    
    layers.Conv2D(256, (3, 3), activation='relu'),
    layers.MaxPooling2D((2, 2)),
    
    layers.Conv2D(256, (3, 3), activation='relu'),
    layers.MaxPooling2D((2, 2)),
    
    layers.Conv2D(512, (3, 3), activation='relu'),
    layers.MaxPooling2D((2, 2)),
    
    layers.Flatten(),
    layers.Dense(512, activation='relu'),
    layers.Dropout(0.4),
    layers.Dense(1, activation='sigmoid')
])

cnn_model.compile(
    optimizer=optimizers.Adam(learning_rate=0.00005),  # Reduced learning rate for finer tuning
    loss='binary_crossentropy',
    metrics=['accuracy']
)

early_stopping = EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)

history = cnn_model.fit(
    train_generator,
    epochs=50,
    validation_data=val_generator,
    callbacks=[early_stopping]
)

val_loss, val_accuracy = cnn_model.evaluate(val_generator)
print(f"Validation Loss: {val_loss}")
print(f"Validation Accuracy: {val_accuracy}")

save_model(cnn_model, 'complex_cnn_model.h5')

val_generator.reset()
predictions = cnn_model.predict(val_generator)
predicted_classes = np.where(predictions > 0.5, 1, 0)

true_classes = val_generator.classes

conf_matrix = confusion_matrix(true_classes, predicted_classes)
precision = precision_score(true_classes, predicted_classes)
recall = recall_score(true_classes, predicted_classes)

print("Confusion Matrix:")
print(conf_matrix)
print(f"Precision: {precision}")
print(f"Recall: {recall}")


Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Validation Loss: 0.14099209010601044
Validation Accuracy: 0.9387096762657166
Confusion Matrix:
[[  6  38]
 [ 35 231]]
Precision: 0.8587360594795539
Recall: 0.868421052631579


# Binary Classification Model Evaluation

In [23]:
# Evaluating the final model on the test data

from tensorflow.keras.models import load_model

cnn_model = load_model('complex_cnn_model.h5')

test_loss, test_accuracy = cnn_model.evaluate(test_generator)
print(f"Test Loss: {test_loss}")
print(f"Test Accuracy: {test_accuracy}")

test_generator.reset()
predictions = cnn_model.predict(test_generator)
predicted_classes = np.where(predictions > 0.5, 1, 0)

true_classes = test_generator.classes

conf_matrix = confusion_matrix(true_classes, predicted_classes)
precision = precision_score(true_classes, predicted_classes)
recall = recall_score(true_classes, predicted_classes)

print("Confusion Matrix:")
print(conf_matrix)
print(f"Precision: {precision}")
print(f"Recall: {recall}")


Test Loss: 0.152609184384346
Test Accuracy: 0.9225806593894958
Confusion Matrix:
[[  7  37]
 [ 37 229]]
Precision: 0.8609022556390977
Recall: 0.8609022556390977
