# Pneumonia Detection from Chest X-Rays

## Deliverable 1: Deep Learning Project Notebook

This notebook demonstrates a CNN-based pneumonia detection system from chest X-ray images, including exploratory data analysis (EDA), model building and training, Grad-CAM visualization for explainability, and evaluation.

In [None]:
# Install kagglehub if not installed
# !pip install kagglehub

import kagglehub
import os
import zipfile

# Download latest version of the dataset
path = kagglehub.dataset_download("paultimothymooney/chest-xray-pneumonia")
print("Path to dataset files:", path)

# kagglehub.dataset_download returns the path of a directory that already
# holds the extracted dataset files, so opening it with zipfile would fail.
# Only unzip when the returned path is not a directory (i.e. an archive file).
dataset_zip = path
if os.path.isdir(path):
    dataset_dir = path
    print("Dataset directory:", dataset_dir)
else:
    dataset_dir = "./chest_xray_pneumonia"

    with zipfile.ZipFile(dataset_zip, 'r') as zip_ref:
        zip_ref.extractall(dataset_dir)

    print("Dataset extracted to:", dataset_dir)

# Define train and test directories
train_dir = os.path.join(dataset_dir, 'chest_xray', 'train')
test_dir = os.path.join(dataset_dir, 'chest_xray', 'test')

print("Train directory:", train_dir)
print("Test directory:", test_dir)

## 1. Exploratory Data Analysis (EDA)

In [None]:
import matplotlib.pyplot as plt
from matplotlib.image import imread
import numpy as np
import os

# List the file names of each training class; the lists are reused later
# for plotting samples and for picking Grad-CAM example images.
train_normal, train_pneumonia = (
    os.listdir(os.path.join(train_dir, class_name))
    for class_name in ('NORMAL', 'PNEUMONIA')
)

# Report how many entries each class directory contains
print(
    f"Number of training NORMAL images: {len(train_normal)}",
    f"Number of training PNEUMONIA images: {len(train_pneumonia)}",
    sep="\n",
)

# Visualize some example images
def plot_sample_images(folder, label, n=5):
    """Display the first ``n`` images of a class in a single row.

    Args:
        folder: Sequence of image file names (not full paths).
        label: Directory that contains those files; its last path
            component is used as the title of every panel.
        n: Number of columns in the row and maximum number of images shown.
    """
    plt.figure(figsize=(15, 5))
    for position, file_name in enumerate(folder[:n], start=1):
        pixels = imread(os.path.join(label, file_name))
        plt.subplot(1, n, position)
        plt.imshow(pixels, cmap='gray')
        # Title each panel with the class directory name (e.g. NORMAL)
        plt.title(label.rsplit(os.sep, 1)[-1])
        plt.axis('off')
    plt.show()

# Preview the first five images of each training class
plot_sample_images(folder=train_normal, label=os.path.join(train_dir, 'NORMAL'), n=5)
plot_sample_images(folder=train_pneumonia, label=os.path.join(train_dir, 'PNEUMONIA'), n=5)

## 2. Data Preprocessing and Loading

In [None]:
from tensorflow.keras.preprocessing.image import ImageDataGenerator

# Every image is resized to a 150x150 square and served in batches of 32
img_height = img_width = 150
batch_size = 32

# Training images are rescaled to [0, 1] and randomly sheared, zoomed and
# horizontally flipped; test images are only rescaled.
train_datagen = ImageDataGenerator(
    rescale=1. / 255, shear_range=0.2, zoom_range=0.2, horizontal_flip=True)
test_datagen = ImageDataGenerator(rescale=1. / 255)

# Stream labelled batches from the class sub-directories (binary labels)
train_generator = train_datagen.flow_from_directory(
    directory=train_dir,
    target_size=(img_height, img_width),
    batch_size=batch_size,
    class_mode='binary',
)

# shuffle=False keeps the test batches in a fixed order
test_generator = test_datagen.flow_from_directory(
    directory=test_dir,
    target_size=(img_height, img_width),
    batch_size=batch_size,
    class_mode='binary',
    shuffle=False,
)

## 3. Model Architecture (CNN)

In [None]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv2D, MaxPooling2D, Flatten, Dense, Dropout

# Build the CNN layer by layer: three Conv2D + MaxPooling2D stages with
# 32, 64 and 128 filters, followed by a dense classifier head.
model = Sequential()

# Convolutional feature extractor
model.add(Conv2D(32, (3, 3), activation='relu',
                 input_shape=(img_height, img_width, 3)))
model.add(MaxPooling2D(2, 2))
model.add(Conv2D(64, (3, 3), activation='relu'))
model.add(MaxPooling2D(2, 2))
model.add(Conv2D(128, (3, 3), activation='relu'))
model.add(MaxPooling2D(2, 2))

# Classifier head: a single sigmoid unit for the binary label
model.add(Flatten())
model.add(Dense(128, activation='relu'))
model.add(Dropout(0.5))
model.add(Dense(1, activation='sigmoid'))

model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['accuracy'],
)

model.summary()

## 4. Model Training

In [None]:
# Number of full passes over the training generator
epochs = 10

# NOTE: validation_data is the test generator, so the val_* curves recorded
# in `history` are computed on the same data that is evaluated afterwards.
history = model.fit(
    x=train_generator,
    validation_data=test_generator,
    epochs=epochs,
)

## 5. Plot Training History

In [None]:
# Draw loss (left) and accuracy (right) side by side, each with its
# training curve and the matching validation curve.
plt.figure(figsize=(12, 5))

for panel, (metric, heading) in enumerate(
        [('loss', 'Loss'), ('accuracy', 'Accuracy')], start=1):
    plt.subplot(1, 2, panel)
    plt.plot(history.history[metric], label=f'train {metric}')
    plt.plot(history.history[f'val_{metric}'], label=f'val {metric}')
    plt.legend()
    plt.title(heading)

plt.show()

## 6. Model Evaluation on Test Set

In [None]:
# evaluate() returns the loss followed by the compiled metric (accuracy)
loss, accuracy = model.evaluate(x=test_generator)
print(
    f"Test loss: {loss:.4f}",
    f"Test accuracy: {accuracy:.4f}",
    sep="\n",
)

## 7. Grad-CAM Visualization for Explainability

In [None]:
import tensorflow.keras.backend as K
import cv2
import numpy as np

def get_img_array(img_path, size):
    """Load an image file as a model-ready batch of one.

    Args:
        img_path: Path of the image file to load.
        size: Target size passed to ``load_img`` as ``target_size``.

    Returns:
        NumPy array with a leading batch axis added and the pixel values
        divided by 255, matching the ``rescale=1./255`` used by the
        data generators.
    """
    image_utils = tf.keras.preprocessing.image
    pil_image = image_utils.load_img(img_path, target_size=size)
    pixels = image_utils.img_to_array(pil_image)
    # Add the batch axis first, then scale the pixel values
    return np.expand_dims(pixels, axis=0) / 255.0

def make_gradcam_heatmap(img_array, model, last_conv_layer_name, pred_index=None):
    """Compute a Grad-CAM heatmap for a single preprocessed image.

    Args:
        img_array: Batched model input, e.g. the output of ``get_img_array``.
        model: Keras model to explain.
        last_conv_layer_name: Name of the layer whose feature maps are
            weighted by the gradients to form the heatmap.
        pred_index: Index of the output unit to explain. When ``None`` the
            unit with the highest score for the first sample is used.

    Returns:
        NumPy array holding the heatmap over the spatial positions of the
        chosen layer, scaled to [0, 1]. It is all zeros when no position has
        a positive weighted activation.
    """
    # model.inputs is already a list; wrapping it in another list produces a
    # nested structure that recent Keras versions do not accept cleanly.
    grad_model = tf.keras.models.Model(
        model.inputs, [model.get_layer(last_conv_layer_name).output, model.output])

    # Record the forward pass so the score can be differentiated with
    # respect to the feature maps of the chosen layer.
    with tf.GradientTape() as tape:
        conv_outputs, predictions = grad_model(img_array)
        if pred_index is None:
            pred_index = tf.argmax(predictions[0])
        class_channel = predictions[:, pred_index]

    grads = tape.gradient(class_channel, conv_outputs)

    # One importance weight per channel: the gradient averaged over the
    # batch and both spatial axes.
    pooled_grads = tf.reduce_mean(grads, axis=(0, 1, 2))

    # Weighted sum of the feature-map channels of the first sample
    conv_outputs = conv_outputs[0]
    heatmap = conv_outputs @ pooled_grads[..., tf.newaxis]
    heatmap = tf.squeeze(heatmap)

    # Keep only positive evidence, then normalise by the peak. Skip the
    # division when the peak is zero, which would otherwise yield NaN/inf.
    heatmap = tf.maximum(heatmap, 0).numpy()
    peak = heatmap.max()
    if peak > 0:
        heatmap = heatmap / peak
    return heatmap

import matplotlib.cm as cm

# Select the last convolutional layer by looking it up in the model instead
# of hard-coding an auto-generated name such as 'conv2d_2': those names depend
# on how many layers were created earlier in the session, so a hard-coded
# name stops matching once the model-building cell is run again.
last_conv_layer_name = next(
    layer.name for layer in reversed(model.layers)
    if isinstance(layer, tf.keras.layers.Conv2D))

# Pick sample images for visualization
sample_normal_img = os.path.join(train_dir, 'NORMAL', train_normal[0])
sample_pneumonia_img = os.path.join(train_dir, 'PNEUMONIA', train_pneumonia[0])

for img_path, label in [(sample_normal_img, 'Normal'), (sample_pneumonia_img, 'Pneumonia')]:
    img_array = get_img_array(img_path, size=(img_height, img_width))
    heatmap = make_gradcam_heatmap(img_array, model, last_conv_layer_name)

    # Reload the image unscaled (0-255) as the background of the overlay
    img = tf.keras.preprocessing.image.load_img(img_path)
    img = tf.keras.preprocessing.image.img_to_array(img)
    img = cv2.resize(img, (img_width, img_height))

    # Upsample the heatmap to the image size and convert it to 8-bit;
    # NaNs are zeroed and values clamped so the cast cannot wrap around.
    heatmap = cv2.resize(heatmap, (img.shape[1], img.shape[0]))
    heatmap = np.uint8(255 * np.clip(np.nan_to_num(heatmap), 0, 1))

    # applyColorMap returns BGR, but the image and matplotlib use RGB, so
    # convert to keep high-activation regions red rather than blue.
    heatmap = cv2.applyColorMap(heatmap, cv2.COLORMAP_JET)
    heatmap = cv2.cvtColor(heatmap, cv2.COLOR_BGR2RGB)

    # Clip before casting: sums above 255 would otherwise overflow uint8
    superimposed_img = np.clip(heatmap * 0.4 + img, 0, 255)
    superimposed_img = np.uint8(superimposed_img)

    plt.figure(figsize=(8,8))
    plt.title(f"Grad-CAM: {label}")
    plt.axis('off')
    plt.imshow(superimposed_img)
    plt.show()

## 8. Conclusion

This notebook demonstrated a CNN model for detecting pneumonia from chest X-rays; its loss and accuracy on the test set are reported in Section 6. The Grad-CAM visualizations highlight the regions of each X-ray that most influence the model's predictions, which makes those predictions easier to interpret.

Further improvements may include experimenting with transfer learning, hyperparameter tuning, or using ensemble models.