<a href="https://colab.research.google.com/github/arkajyotibhattacharya/fundusAI/blob/dev/OcularDiseaseSegmentationCase.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Ocular Disease Classification
*Dataset*: Ocular Disease Recognition (https://www.kaggle.com/datasets/andrewmvd/ocular-disease-recognition-odir5k/data)

*Additional Dataset*: Cataract Dataset (https://www.kaggle.com/datasets/jr2ngb/cataractdataset)




Basic collaboration guidelines:

_TODO: the collaboration guidelines are missing here (the original sentence breaks off after "The")._




# 1. Import Packages, Connect to Kaggle, and Import Data

In [None]:
# ONLY FOR GOOGLE COLLAB: Install libraries and clean workspace
!pip install pandas numpy matplotlib seaborn tensorflow scikit-learn
!rm -rf /content/sample_data

# ONLY FOR LOCAL USAGE / VSC: Setup correct kernel
# Steps
# - 1. click in top right on select kernel and click on "Python Environments" -> "umcg-48-hour-case (Python 3.12.12)"
# - 2. run code below
# - if needed: install ipykernel and pip package in popup



In [None]:
# Import dependencies and suppress warnings
from warnings import filterwarnings
filterwarnings('ignore')

import pandas as pd
import os
import glob
import matplotlib.pyplot as plt
import numpy as np

import tensorflow as tf
from tensorflow.keras.preprocessing.image import ImageDataGenerator
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras.utils import plot_model


from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

import seaborn as sns
import time
import shutil

### Quick guide to setup connection with Kaggle

Steps:
1. Sign in to (or register at) Kaggle: https://www.kaggle.com/.
2. Click on your profile and go to "Settings": https://www.kaggle.com/settings.
3. Go to "API Tokens (Recommended)" and click on "Generate New Token".
4. Give it a name (e.g. token-umcg-48-hour-case) and create the token.
5. Save the API token and paste it below.

In [None]:
# Set Kaggle credentials as environment variables.
# Values that are already present in the environment are reused; otherwise they are
# requested interactively, so neither the username nor the API token has to be
# written into (and accidentally committed with) the notebook.
from getpass import getpass

if not os.environ.get('KAGGLE_USERNAME'):
    os.environ['KAGGLE_USERNAME'] = input('Kaggle username (e.g. jaapjansen): ').strip()
if not os.environ.get('KAGGLE_KEY'):
    # getpass hides the token while it is typed/pasted
    os.environ['KAGGLE_KEY'] = getpass('Kaggle API token (e.g. KGAT_...): ').strip()

# Initialize and authenticate the Kaggle API client
# NOTE(review): keep this import below the credential setup -- presumably the kaggle
# package looks for credentials as soon as it is imported; verify for the installed version.
from kaggle.api.kaggle_api_extended import KaggleApi
api = KaggleApi()
api.authenticate()

print("Authentication successful!")

# Define the specific ODIR-5K dataset identifier from Kaggle
dataset_identifier = 'andrewmvd/ocular-disease-recognition-odir5k'

# Download and unzip the files into a folder named 'ocular_data'.
# The download is skipped when the CSV read by the next cell is already there, so that
# "Restart & Run All" does not fetch the whole archive again. Delete the folder to force
# a fresh download.
if os.path.exists('./ocular_data/full_df.csv'):
    print("Dataset already present in the 'ocular_data' folder; skipping download.")
else:
    api.dataset_download_files(dataset_identifier, path='./ocular_data', unzip=True)
    print("Download complete! Check the 'ocular_data' folder on the left sidebar.")

In [None]:
# Load the ODIR-5K metadata table that ships with the download
data_ocu = pd.read_csv('ocular_data/full_df.csv')

# The 'filepath' column holds paths as they exist on Kaggle. Drop the first three
# '/'-separated components and re-root the remainder under the local image folder.
image_dir = "ocular_data/ODIR-5K"
data_ocu['paths'] = [
    os.path.normpath(os.path.join(image_dir, '/'.join(kaggle_path.split('/')[3:])))
    for kaggle_path in data_ocu['filepath']
]

# Preview the first rows, including the new 'paths' column
data_ocu.head()

In [None]:
# Optional: import data from Cataract dataset
# Needs the authenticated `api` client created in the Kaggle connection cell.
# Net effect of this cell: './cataract_data' ends up containing only what the archive
# stored under its 'dataset' sub-folder.

# Define the specific cataract dataset identifier from Kaggle
# (this reuses, and therefore overwrites, the name used for the ODIR-5K identifier)
dataset_identifier = 'jr2ngb/cataractdataset'

# Download and unzip the files into a folder named 'cataract_data'
api.dataset_download_files(dataset_identifier, path='./cataract_data', unzip=True)

# Define paths for cleanup
base_path = './cataract_data'
inner_folder = os.path.join(base_path, 'dataset')  # the only part of the download that is kept
temp_path = './temp_storage'

# The next three steps must run in exactly this order: move out, delete, rename back.

# Move the folder we want to keep to a temporary spot
# NOTE(review): if './temp_storage' is left over from an interrupted run, shutil.move
# would place the folder inside it instead of renaming -- confirm / clean up manually.
shutil.move(inner_folder, temp_path)

# Remove the entire original folder (deletes repo and readme)
shutil.rmtree(base_path)

# Rename the kept folder to 'cataract_data'
os.rename(temp_path, base_path)

print("Download complete! Check the 'cataract_data' folder on the left sidebar.")

In [None]:
# Optional: load data from cataract dataset
# Builds `cat_df` with one row per image file: its path ('paths') and a string class
# code ('cataract'), restricted to the normal and cataract folders.

# Image dimensions kept as notebook-level settings (not referenced in this cell)
IMG_HEIGHT = 192
IMG_WIDTH = 256

# Define the root directory for the cataract dataset
IMG_ROOT = './cataract_data/'

# Map folder names to labels (labels are strings)
label_map = {
    '1_normal': '0', # Normal
    '2_cataract': '1', # Cataract
    '2_glaucoma': '2', # Glaucoma
    '3_retina_disease': '3' # Retina Disease
}

# Collect image paths and their labels folder by folder
filepaths = []
labels = []
for folder_name, label_value in label_map.items():
    folder_path = os.path.join(IMG_ROOT, folder_name)
    # glob returns matches in arbitrary order; sorting makes the row order (and thus
    # any later seeded split or sampling) reproducible across machines and runs
    folder_files = sorted(glob.glob(os.path.join(folder_path, '*')))
    filepaths.extend(folder_files)
    labels.extend([label_value] * len(folder_files))

# Create a DataFrame from the collected data
cat_df = pd.DataFrame({
    'paths': filepaths,
    'cataract': labels
})

# Only keep normal and cataract (labels '0' and '1')
cat_df = cat_df[cat_df['cataract'].isin(['0', '1'])]
cat_df

In [None]:
# Join datasets: stack the cataract rows on top of the ODIR-5K rows.
# Both frames carry their own integer index, so the labels would overlap after stacking;
# ignore_index=True gives the result a fresh, unique 0..n-1 index.
# Columns that exist in only one of the two frames are filled with NaN for the other.
joined_cataract_data = pd.concat([cat_df, data_ocu], ignore_index=True)
joined_cataract_data

# 2. Data Exploration

In [None]:
# Show one example fundus image for every distinct value of the 'labels' column
unique_labels = data_ocu['labels'].unique()

# One row of panels, one panel per label
fig, ax = plt.subplots(1, len(unique_labels), figsize=(20, 5))  # Adjust the figure size as needed

for axis, disease_label in zip(ax, unique_labels):
    # Path of the first row in the table that carries this label
    first_path = data_ocu.loc[data_ocu['labels'] == disease_label, 'paths'].iloc[0]

    # Read the file from disk and draw it in its panel
    axis.imshow(plt.imread(first_path))
    axis.set_title(f"Label: {disease_label}")

plt.tight_layout()
plt.show()

In [None]:
# Explore the data and potential null values
# info() prints the dtype and the non-null count of every column, which is what reveals
# missing values; the bare frame below only shows a truncated preview of the rows.
data_ocu.info()
data_ocu

In [None]:
# Check the distribution of labels (the different disease types)
# Counts how many rows carry each distinct value of the 'labels' column; a strongly
# uneven result means the classes are imbalanced.
data_ocu['labels'].value_counts()

In [None]:
# Distribution of the remaining variables

# Histogram (30 bins) of patient age with a kernel-density curve on top
age_fig, age_ax = plt.subplots(figsize=(10, 6))
sns.histplot(data_ocu['Patient Age'], bins=30, kde=True, ax=age_ax)
age_ax.set_title('Age Distribution of Patients')
age_ax.set_xlabel('Age')
age_ax.set_ylabel('Count')
plt.show()

# Frequency of each diagnostic keyword string recorded for the left eye
data_ocu['Left-Diagnostic Keywords'].value_counts()

# 3. Create Train, Validation, and Test Data

In [None]:
# Train, Validation & Test data
dataset = data_ocu   # table to split
y = 'labels'         # target column, also used for stratification

test_size = 0.2  # share of all rows held out as test data
val_size = 0.2   # share of the remaining training rows (!) held out as validation data

# First cut: separate the test rows, keeping the class proportions of `y`
train_data, test_data = train_test_split(
    dataset,
    test_size=test_size,
    stratify=dataset[y],
    random_state=1,
)

# Second cut: carve the validation rows out of what is left for training
train_data, val_data = train_test_split(
    train_data,
    test_size=val_size,
    stratify=train_data[y],
    random_state=1,
)

# Image preprocessing
img_size = (128, 128)   # every image is resized to this (height, width)
rescale = 1./255        # factor applied to the pixel values
batch_size = 32         # number of samples that will be propagated through the network
datagen = ImageDataGenerator(rescale=rescale)  # rescale for faster convergence and preventing certain features from dominating others during training

# Settings that are identical for the three generators
flow_options = dict(
    x_col="paths",
    y_col=y,
    target_size=img_size,
    class_mode="categorical",
    batch_size=batch_size,
)

# Training samples are randomly re-ordered at the start of each epoch
train_generator = datagen.flow_from_dataframe(train_data, shuffle=True, **flow_options)

# Validation and test keep the DataFrame order so predictions line up with the rows
val_generator = datagen.flow_from_dataframe(val_data, shuffle=False, **flow_options)
test_generator = datagen.flow_from_dataframe(test_data, shuffle=False, **flow_options)

In [None]:
# Example of resizing an image: original file next to what the generator delivers

# Position (within test_data) of the image to inspect
idx = 4

# Path and label of that row, straight from the DataFrame
sample_row = test_data.iloc[idx]
sample_path = sample_row['paths']
sample_label = sample_row[y]

# The test generator is not shuffled, so row `idx` sits in batch idx // batch_size
# at offset idx % batch_size
batch_size = test_generator.batch_size
batch_index, sample_index_in_batch = divmod(idx, batch_size)

# Fetch only that batch and pick the image out of it
images, labels = test_generator[batch_index]
resized_image = images[sample_index_in_batch]

# Two panels side by side
comparison_fig, (original_ax, resized_ax) = plt.subplots(1, 2, figsize=(12, 6))

# Left: the file as stored on disk
original_ax.imshow(plt.imread(sample_path))
original_ax.set_title(f"Original Image\nLabel: {sample_label}")
original_ax.axis('off')

# Right: the resized and rescaled array produced by the generator
resized_ax.imshow(resized_image)
resized_ax.set_title(f"Resized {img_size}\n Rescaled {rescale}")
resized_ax.axis('off')

plt.show()

# 4. A Basic CNN Model


## 4.1 Set Hyperparameters, Create a CNN Architecture and Compile the Model

In [None]:
# Set the hyperparameters
# Plain notebook-level settings: the architecture, compile and training cells below read
# these names, so re-run those cells after changing a value here.

## Feature Extraction
conv_layers_one = 32          # number of filters in the first convolutional layer
conv_layers_two = 64          # number of filters in the second convolutional layer
conv_kernel_size = (3,3)      # size of the kernel of the convolutional layer
conv_activation = 'relu'      # activation function of the convolutional layer

## Pooling (Downsampling)
pool_size = (2,2)             # size of pooling kernel

## Classification
fc_units = 128                # number of units in the fully-connected layer
fc_activation = 'relu'        # type of activation function
output_activation = 'softmax' # activation function for output layer

## Training Settings
loss_function = 'categorical_crossentropy'  # loss function
optimizer = 'Adam'                          # optimization technique
epochs = 1                                  # number of complete passes through the training dataset

In [None]:
# Create the CNN architecture

# Set random seed
tf.random.set_seed(1)

# Width of the output layer = number of classes the training generator delivers.
# Taking it from the generator (instead of a variable of the exploration cell) keeps the
# model in sync with whatever `dataset` / `y` the split cell was run with.
num_classes = len(train_generator.class_indices)

# Initialize model
cnn_model = keras.Sequential()

# Input Layer: images of the generator's target size with 3 colour channels
cnn_model.add(layers.Input(shape=(img_size[0], img_size[1], 3)))

# Add a first 2D Convolutional Layer: might look for simple things like edges or lines
cnn_model.add(layers.Conv2D(conv_layers_one, kernel_size=conv_kernel_size, activation=conv_activation))

# Add a 2D Max Pooling layer: downsampling step
cnn_model.add(layers.MaxPooling2D(pool_size=pool_size))

# Add a second 2D Convolutional Layer: combines those edges from first convolutional layer to find more complex shapes (like the curve of an eye or the texture of a cataract)
cnn_model.add(layers.Conv2D(conv_layers_two, kernel_size=conv_kernel_size, activation=conv_activation))

# Add a 2D Max Pooling layer: downsampling step
cnn_model.add(layers.MaxPooling2D(pool_size=pool_size))

# Flatten the results to an array (CNNs see images as 3D cubes. To classify them, we have to "unroll" that cube into one long line of numbers.)
cnn_model.add(layers.Flatten())

# Add a Fully-Connected Layer: the "reasoning" layer of your model
cnn_model.add(layers.Dense(units=fc_units, activation=fc_activation))

# Add an output layer with softmax transformation: one unit per class
cnn_model.add(layers.Dense(units=num_classes, activation=output_activation))

In [None]:
# Attach loss, optimizer and the tracked metric to the network (values come from the
# hyperparameter cell)
cnn_model.compile(
    optimizer=optimizer,
    loss=loss_function,
    metrics=['accuracy'],
)

In [None]:
# Display model structure
# Text overview of the layers of the model
cnn_model.summary()

# Diagram of the same model, annotated with shapes, activations and layer names.
# The call is the last expression of the cell, so its return value is shown as cell output.
# NOTE(review): plot_model presumably needs pydot / graphviz to be installed -- confirm
# for local (non-Colab) environments.
plot_model(
    cnn_model,
    show_shapes=True,
    show_layer_activations=True,
    show_layer_names=True
)

## 4.2 Train the Model

In [None]:
# Train the model and record how long it takes

# Wall-clock timestamp right before training
cnn_model_start_time = time.time()

# Fit on the training generator; the validation generator is evaluated alongside so that
# the history contains both training and validation metrics
cnn_history = cnn_model.fit(
    x=train_generator,
    epochs=epochs,
    validation_data=val_generator,
    verbose=1,
)

# Wall-clock timestamp right after training, and the elapsed time in seconds
cnn_model_end_time = time.time()
cnn_model_running_time = cnn_model_end_time - cnn_model_start_time

## 4.3 Model Evaluation

In [None]:
# Set colors for confusion matrix and accuracy plots to improve aesthetics and interpretability
# Each palette is a list of five hex colours ordered from darkest to lightest.
# In the accuracy/loss plot the green entries are used for the training curves, the red
# entries for the validation curves and a dark entry for the figure title.
colors_dark = ["#1F1F1F", "#313131", '#636363', '#AEAEAE', '#DADADA']
colors_red = ["#331313", "#582626", '#9E1717', '#D35151', '#E9B4B4']
colors_green = ['#01411C','#4B6F44','#4F7942','#74C365','#D0F0C0']

In [None]:
# Plot train and validation accuracy and loss over epochs

# Per-epoch metrics recorded by fit()
train_acc = cnn_history.history['accuracy']
train_loss = cnn_history.history['loss']
val_acc = cnn_history.history['val_accuracy']
val_loss = cnn_history.history['val_loss']

# x-axis: one entry per recorded epoch. It is derived from the history itself rather
# than from the `epochs` setting, so the plot still works when `epochs` was changed
# after training or training stopped early.
epochs_lijst = list(range(len(train_acc)))

fig, ax = plt.subplots(1,2,figsize=(14,7))

fig.text(s='Epochs vs. Training and Validation Accuracy/Loss',size=18,fontweight='bold',
             fontname='monospace',color=colors_dark[1],y=1,x=0.28,alpha=0.8)

# Removes the top and right spines of every axes in the current figure
sns.despine()

# (axes, training series, validation series, training legend, validation legend, y-label)
panels = [
    (ax[0], train_acc, val_acc, 'Training Accuracy', 'Validation Accuracy', 'Accuracy'),
    (ax[1], train_loss, val_loss, 'Training Loss', 'Validation Loss', 'Training & Validation Loss'),
]

for panel_ax, train_values, val_values, train_label, val_label, y_label in panels:
    # Training curve in green, validation curve in red
    panel_ax.plot(epochs_lijst, train_values, marker='o',markerfacecolor=colors_green[2],color=colors_green[3],
                  label=train_label)
    panel_ax.plot(epochs_lijst, val_values, marker='o',markerfacecolor=colors_red[2],color=colors_red[3],
                  label=val_label)
    panel_ax.legend(frameon=False)
    panel_ax.set_xlabel('Epochs')
    panel_ax.set_ylabel(y_label)

# plt.show() is the call that works with the notebook's inline backend
# (fig.show() is meant for GUI backends and only emits a warning there)
plt.show()

In [None]:
# Predict with the model
# The test generator was created with shuffle=False, so the rows of pred_probs are in the
# same order as test_generator.classes and can be compared element by element.
pred_probs = cnn_model.predict(test_generator) # Prediction on the test data
pred_classes = np.argmax(pred_probs, axis=1)   # Select the class with the highest predicted probability as prediction outcome
true_classes = test_generator.classes          # Select the true classes of the test data

In [None]:
# Class names and their integer ids as assigned by the test generator.
# Passing the ids explicitly keeps the confusion matrix at a fixed size and keeps
# classification_report from failing when some class occurs neither in the true nor in
# the predicted labels (target_names would then have more entries than classes found).
class_names = list(test_generator.class_indices.keys())
class_ids = list(test_generator.class_indices.values())

acc = accuracy_score(true_classes, pred_classes) # share of test samples whose predicted class equals the true class
print("Test accuracy:", acc)

cm = confusion_matrix(true_classes, pred_classes, labels=class_ids) # each cell [i, j] = number of times class i was predicted as class j
print("Confusion Matrix:\n", cm)

report = classification_report(true_classes, pred_classes, labels=class_ids, target_names=class_names)
print("Classification Report:\n", report)

# TP = True Positives
# FP = False Positives
# FN = False Negatives

# Precision = TP / (TP + FP): When the model predicts a class, how often is it correct?
# Recall = TP / (TP + FN): Of all actual samples of a class, how many did the model find?
# f1-score = 2 × (Precision × Recall) / (Precision + Recall): How good is the balance between precision and recall?
# support: How many true samples of each class exist in the test set?

# Macro avg: simple average of the metric across classes. Every class counts equally, no matter how many samples it has.
# Weighted avg: average weighted by the number of true samples per class (support).