In [None]:
# ==============================================================================
# Part 1: Model Training on Google Colab
# This script will guide you through setting up Colab, downloading a Kaggle
# dataset, preprocessing images, training a deep learning model, and saving it.
# ==============================================================================

# ------------------------------------------------------------------------------
# Step 1: Setup and Dataset Download
# ------------------------------------------------------------------------------

# Mount Google Drive to store Kaggle API key and trained model
# This allows Colab to access files in your Google Drive.
from google.colab import drive
drive.mount('/content/drive')

# ------------------------------------------------------------------------------
# Important: Do NOT force specific TensorFlow, NumPy, or Protobuf versions here.
# Let Google Colab use its default, stable, pre-installed versions.
# Forcing older versions in a Python 3.11 environment leads to instability.
# We will check the TensorFlow version after import and use that for deployment.
# ------------------------------------------------------------------------------

# Install Kaggle API client
# Kaggle API is used to download datasets directly from Kaggle.
!pip install kaggle
# Ensure Pillow is installed, as it's common for image processing
!pip install Pillow

# Configure Kaggle API credentials
# You need to upload your `kaggle.json` file to your Google Drive.
# Go to Kaggle (kaggle.com), click on your profile picture, then "Account",
# scroll down to "API", and click "Create New API Token" to download `kaggle.json`.
# Then upload this file to the root of your Google Drive.
import os
import json
import subprocess # Used for checking command errors

# ---- Kaggle API credential setup --------------------------------------------
# Copies kaggle.json from Google Drive into the directory the Kaggle API client
# reads it from, and restricts the file to owner read/write.
#
# Failures abort the cell with SystemExit. This replaces exit(), which in a
# notebook kernel requests a kernel shutdown instead of raising at the call
# site and therefore does not reliably stop the remaining statements.
import shutil  # local import: only this step needs it

# Define the path where the kaggle.json file should be placed
kaggle_dir = '/root/.kaggle'
os.makedirs(kaggle_dir, exist_ok=True)

# Path to the kaggle.json file in Google Drive
kaggle_json_drive_path = '/content/drive/MyDrive/kaggle.json'
if not os.path.exists(kaggle_json_drive_path):
    print("ERROR: kaggle.json not found in your Google Drive. Please upload it to /content/drive/MyDrive/kaggle.json")
    print("Go to Kaggle -> Account -> Create New API Token to download kaggle.json.")
    print("Ensure you accept the dataset's terms on Kaggle before attempting download.")
    raise SystemExit("kaggle.json not found in Google Drive.")

# Copy the token and lock down its permissions using Python calls rather than
# `!cp` / `!chmod`: shell escapes never raise, so a failed copy used to fall
# through to the success message and the except branch could never run.
kaggle_json_local_path = os.path.join(kaggle_dir, 'kaggle.json')
try:
    shutil.copyfile(kaggle_json_drive_path, kaggle_json_local_path)
    os.chmod(kaggle_json_local_path, 0o600)
except OSError as e:
    print(f"Error copying or setting permissions for kaggle.json: {e}")
    print("Please ensure your Google Drive is mounted and kaggle.json is accessible.")
    raise SystemExit("Kaggle API key setup failed.") from e
else:
    print("Kaggle API key setup complete.")


print("Proceeding to dataset download.")

# Download the dataset
# We'll use the "1000 Fundus images with 39 categories" dataset as specified.
# Dataset URL: https://www.kaggle.com/datasets/siddharthm18/1000-fundus-images-with-39-categories
kaggle_dataset_path = 'linchundan/fundusimage1000'
dataset_download_path = '/content/fundus_dataset'

# Create a directory for the dataset if it doesn't exist
if not os.path.exists(dataset_download_path):
    os.makedirs(dataset_download_path)

# Change current directory to the dataset path for Kaggle download command
%cd {dataset_download_path}
# Download the dataset using Kaggle API
try:
    print(f"Attempting to download dataset: {kaggle_dataset_path}")
    # Use -q (quiet) for quiet mode, which suppresses progress output
    # check=True raises CalledProcessError on non-zero exit codes
    result = subprocess.run(['kaggle', 'datasets', 'download', kaggle_dataset_path, '-q'], capture_output=True, text=True, check=True)
    print(f"Dataset downloaded successfully to: {dataset_download_path}")

    # Unzip the downloaded dataset
    zip_file_name = os.path.basename(kaggle_dataset_path) + '.zip'
    if os.path.exists(zip_file_name):
        print(f"Unzipping {zip_file_name}...")
        subprocess.run(['unzip', '-q', zip_file_name], check=True)
        print("Dataset unzipped.")
    else:
        print(f"Error: Downloaded zip file '{zip_file_name}' not found. Dataset download might have failed silently.")
        print("Please check your Kaggle API key and ensure you have accepted the dataset terms on Kaggle.")
        exit() # Exit if zip file is not found

except subprocess.CalledProcessError as e:
    print(f"\nERROR: Kaggle dataset download failed with exit code {e.returncode}.")
    print(f"Command: {' '.join(e.cmd)}")
    # Check if stderr output exists before trying to decode/print it
    if e.stderr:
        print(f"Error Output (stderr):\n{e.stderr}")
    elif e.stdout: # Sometimes error might be in stdout
        print(f"Output (stdout):\n{e.stdout}")
    else:
        print("No specific error output from Kaggle command. This often means permission issues.")

    print("\nCommon reasons for this error:")
    print("1. Your Kaggle API key (kaggle.json) might be incorrect or have wrong permissions.")
    print("2. You might not have accepted the terms & conditions for this dataset on Kaggle.")
    print(f"   Please visit: https://www.kaggle.com/datasets/{kaggle_dataset_path} and accept any terms.")
    print("3. There might be a network issue or Kaggle API rate limit exceeded.")
    exit() # Exit the script if download fails
except Exception as e:
    print(f"An unexpected error occurred during dataset download or unzip: {e}")
    exit()


# List the contents of the dataset directory to verify
print(f"Contents of {dataset_download_path}:")
!ls -F {dataset_download_path}

# Set the base directory for the dataset after unzipping
# The images are typically in a folder named 'Dataset' inside the unzipped content.
base_data_dir = os.path.join(dataset_download_path, 'Dataset')
if not os.path.exists(base_data_dir):
    print(f"Warning: 'Dataset' directory not found at {base_data_dir}. Checking for other main directory.")
    # Try to find the actual directory containing images, usually a single folder
    # after unzipping if not directly 'Dataset'.
    subdirs = [d for d in os.listdir(dataset_download_path) if os.path.isdir(os.path.join(dataset_download_path, d))]
    if subdirs:
        # Prioritize 'Dataset' if it exists in subdirs, otherwise pick the first one
        if 'Dataset' in subdirs:
            base_data_dir = os.path.join(dataset_download_path, 'Dataset')
        else:
            base_data_dir = os.path.join(dataset_download_path, subdirs[0])
        print(f"Assuming image directory is: {base_data_dir}")
    else:
        print("Error: Could not locate image directory within the unzipped dataset.")
        # Fallback to the main unzipped directory if no subdirectories are found
        base_data_dir = dataset_download_path
        print(f"Falling back to: {base_data_dir}. This might not be the correct image folder.")


# ------------------------------------------------------------------------------
# Step 2: Data Preprocessing
# ------------------------------------------------------------------------------

import cv2
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
import tensorflow as tf # Import TensorFlow
from tensorflow.keras.utils import to_categorical
from tqdm import tqdm # For progress bars

# Print TensorFlow version for later use in requirements.txt
print(f"TensorFlow version being used: {tf.__version__}")

print("\nStarting data preprocessing...")

IMG_SIZE = 224 # Standard input size for many pre-trained CNNs
BATCH_SIZE = 32 # Batch size for training
NUM_CLASSES = 39 # As per the dataset description

images = []
labels = []
image_paths = [] # Store paths for debugging or verification

# Every sub-directory of base_data_dir is treated as one class; its name is the
# label for all images inside it.
if not os.path.isdir(base_data_dir):
    print(f"ERROR: Base data directory not found or is not a directory: {base_data_dir}")
    print("Please ensure the Kaggle dataset was downloaded and unzipped correctly, and the 'Dataset' folder (or equivalent) exists.")
    # SystemExit (not exit()) so the cell stops right here.
    raise SystemExit(1)

# Directory listings are sorted so images are loaded in the same order on
# every run; otherwise the seeded train/val/test split is not reproducible.
for category in sorted(os.listdir(base_data_dir)):
    category_path = os.path.join(base_data_dir, category)
    if not os.path.isdir(category_path): # Skip stray files next to the class folders
        continue
    print(f"Processing category: {category}")
    for img_name in sorted(os.listdir(category_path)):
        img_path = os.path.join(category_path, img_name)
        try:
            # Read image in BGR format (OpenCV default)
            img = cv2.imread(img_path)
            if img is None:
                print(f"Warning: Could not read image {img_path}. Skipping.")
                continue
            # Resize image to IMG_SIZE x IMG_SIZE
            img = cv2.resize(img, (IMG_SIZE, IMG_SIZE))
            # Convert BGR to RGB (TensorFlow/Keras usually expects RGB)
            img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
            # Normalize pixel values to [0, 1]. Cast to float32 first: dividing
            # the uint8 image directly yields float64 and doubles the memory
            # needed to hold the whole dataset.
            img = img.astype(np.float32) / 255.0
        except Exception as e:
            # Deliberate best-effort: one bad file must not abort the whole load.
            print(f"Error processing {img_path}: {e}. Skipping.")
            continue

        images.append(img)
        labels.append(category)
        image_paths.append(img_path)

images = np.array(images, dtype=np.float32)
labels = np.array(labels)

print(f"Total images loaded: {len(images)}")
print(f"Total labels loaded: {len(labels)}")

# IMPORTANT: Check if any images were loaded before proceeding
if len(images) == 0:
    print("ERROR: No images were loaded. This usually means the dataset download failed or the path to images is incorrect.")
    print("Please review the error messages above related to dataset download and verify your Kaggle API key and dataset access.")
    raise SystemExit(1) # Stop execution if no data is available

# Encode labels to numerical format
# LabelEncoder assigns a unique integer to each unique category string.
label_encoder = LabelEncoder()
integer_encoded_labels = label_encoder.fit_transform(labels)
print(f"Classes found: {label_encoder.classes_}")

# Sanity-check the hard-coded NUM_CLASSES against what was actually found on
# disk. Too many classes cannot be one-hot encoded into NUM_CLASSES columns, so
# fail with a clear message; too few still works (unused columns) but usually
# means the wrong directory was scanned, so warn.
num_classes_found = len(label_encoder.classes_)
if num_classes_found > NUM_CLASSES:
    raise ValueError(
        f"Found {num_classes_found} classes but NUM_CLASSES is {NUM_CLASSES}. "
        "Check the image directory or update NUM_CLASSES."
    )
if num_classes_found < NUM_CLASSES:
    print(f"Warning: Found only {num_classes_found} classes but NUM_CLASSES is {NUM_CLASSES}. "
          "The extra output units will never be used; verify the image directory.")

# Convert integer labels to one-hot encoded format
# One-hot encoding is required for categorical cross-entropy loss in multi-class classification.
one_hot_labels = to_categorical(integer_encoded_labels, num_classes=NUM_CLASSES)

# Split data into training (70%), validation (15%), and test (15%) sets.
# Stratify ensures that the proportion of classes is the same in all splits.
X_train, X_temp, y_train, y_temp = train_test_split(
    images, one_hot_labels, test_size=0.3, random_state=42, stratify=one_hot_labels
)
X_val, X_test, y_val, y_test = train_test_split(
    X_temp, y_temp, test_size=0.5, random_state=42, stratify=y_temp
)

print(f"Train set shape: {X_train.shape}, {y_train.shape}")
print(f"Validation set shape: {X_val.shape}, {y_val.shape}")
print(f"Test set shape: {X_test.shape}, {y_test.shape}")

# Save the label encoder classes to be used later in the backend
# This mapping is crucial to convert model predictions (integers) back to meaningful disease names.
# Note: JSON object keys are always strings, so the indices are written as "0", "1", ...
label_mapping = {index: str(label) for index, label in enumerate(label_encoder.classes_)}
label_mapping_path = '/content/drive/MyDrive/label_mapping.json'
with open(label_mapping_path, 'w') as f:
    json.dump(label_mapping, f)
print(f"Label mapping saved to {label_mapping_path}")


# ------------------------------------------------------------------------------
# Step 3: Model Definition and Training
# ------------------------------------------------------------------------------

# TensorFlow is already imported in Step 2, so no need to import it again here.
from tensorflow.keras.applications import MobileNetV2 # Efficient for mobile/edge devices
from tensorflow.keras.layers import Dense, GlobalAveragePooling2D, Dropout
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.preprocessing.image import ImageDataGenerator
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint

print("\nStarting model definition and training...")

# Load the pre-trained MobileNetV2 model
# include_top=False drops MobileNetV2's own classification layers so a head for
# our number of classes can be attached; weights='imagenet' loads weights
# pre-trained on ImageNet.
# NOTE(review): the comments in this notebook say pixel values were normalized
# earlier; confirm that scaling matches what the ImageNet weights expect
# (see keras.applications.mobilenet_v2.preprocess_input) and keep the backend
# in sync with whatever is used here.
base_model = MobileNetV2(weights='imagenet', include_top=False, input_shape=(IMG_SIZE, IMG_SIZE, 3))

# Freeze the layers of the base model
# Only the new classification layers are trained, which converges faster and
# helps avoid overfitting on a small dataset.
base_model.trainable = False

# Add custom classification layers on top of the base model
x = base_model.output
x = GlobalAveragePooling2D()(x) # Reduces spatial dimensions, suitable for classification
x = Dense(256, activation='relu')(x) # A fully connected layer
x = Dropout(0.5)(x) # Dropout for regularization to prevent overfitting
predictions = Dense(NUM_CLASSES, activation='softmax')(x) # Output layer with softmax for multi-class classification

# Create the full model
model = Model(inputs=base_model.input, outputs=predictions)

# Compile the model
# categorical_crossentropy matches the one-hot encoded labels.
model.compile(optimizer=Adam(learning_rate=0.0001),
              loss='categorical_crossentropy',
              metrics=['accuracy'])

# Data Augmentation for training data
# Creates new variations of training images on-the-fly to improve
# generalization and reduce overfitting.
train_datagen = ImageDataGenerator(
    rotation_range=20,
    width_shift_range=0.2,
    height_shift_range=0.2,
    shear_range=0.2,
    zoom_range=0.2,
    horizontal_flip=True,
    fill_mode='nearest'
)

# No augmentation for validation and test data
val_datagen = ImageDataGenerator() # Only used to flow data, no augmentation here

# Prepare generators for training and validation data.
# The validation generator must NOT shuffle: combined with a truncated step
# count, shuffling made every epoch validate on a different random subset, so
# the val_accuracy driving EarlyStopping / ModelCheckpoint was not comparable
# between epochs.
train_generator = train_datagen.flow(X_train, y_train, batch_size=BATCH_SIZE)
validation_generator = val_datagen.flow(X_val, y_val, batch_size=BATCH_SIZE, shuffle=False)

# Callbacks for better training control
# EarlyStopping stops training if validation accuracy doesn't improve for 'patience' epochs
early_stopping = EarlyStopping(monitor='val_accuracy', patience=10, restore_best_weights=True)
# ModelCheckpoint saves the best model based on validation accuracy
model_checkpoint = ModelCheckpoint(
    filepath='/content/drive/MyDrive/best_fundus_model.h5', # Path to save the best model
    monitor='val_accuracy',
    save_best_only=True,
    verbose=1
)

# Train the model
# Step counts come from the generators themselves (number of batches including
# the final partial one). The previous `len(X) // BATCH_SIZE` dropped the
# remainder and evaluated to 0 validation steps when fewer than BATCH_SIZE
# validation samples exist.
EPOCHS = 50 # Number of training epochs, can be adjusted
history = model.fit(
    train_generator,
    steps_per_epoch=len(train_generator),
    epochs=EPOCHS,
    validation_data=validation_generator,
    validation_steps=len(validation_generator),
    callbacks=[early_stopping, model_checkpoint]
)

print("\nModel training complete.")

print("\nModel training complete.")


# ------------------------------------------------------------------------------
# Step 4: Model Evaluation and Saving
# ------------------------------------------------------------------------------

# Re-load the checkpoint written during training so that the evaluation below
# uses the weights that achieved the best validation accuracy.
from tensorflow.keras.models import load_model

best_model_path = '/content/drive/MyDrive/best_fundus_model.h5'
model = load_model(filepath=best_model_path)

print(f"\nEvaluating the best model loaded from: {best_model_path}")

# Score the held-out test split; evaluate() yields the loss followed by the
# compiled metric (accuracy).
test_scores = model.evaluate(x=X_test, y=y_test, verbose=1)
loss, accuracy = test_scores
print(f"Test Loss: {loss:.4f}")
print(f"Test Accuracy: {accuracy:.4f}")

print("\nModel training and saving complete! The best model is saved to Google Drive.")
print("The label mapping file is also saved to Google Drive, which is essential for the backend.")

In [None]:
import cv2
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
import tensorflow as tf # Import TensorFlow
from tensorflow.keras.utils import to_categorical
from tqdm import tqdm # For progress bars

# Stand-alone re-run of the image loading step.
# Relies on `os` and `base_data_dir` already being defined by an earlier cell.
print(f"TensorFlow version being used: {tf.__version__}")

print("\nStarting data preprocessing...")

IMG_SIZE = 224 # Standard input size for many pre-trained CNNs
BATCH_SIZE = 32 # Batch size for training
NUM_CLASSES = 39 # As per the dataset description

images = []
labels = []
image_paths = [] # Store paths for debugging or verification

if not os.path.isdir(base_data_dir):
    print(f"ERROR: Base data directory not found or is not a directory: {base_data_dir}")
    print("Please ensure the Kaggle dataset was downloaded and unzipped correctly, and the 'Dataset' folder (or equivalent) exists.")
    # SystemExit (not exit()) so the cell stops right here.
    raise SystemExit(1)

# Monitor this loop carefully for any printed errors.
# Listings are sorted so the load order, and therefore the seeded split that
# follows, is the same on every run.
for category in sorted(os.listdir(base_data_dir)):
    category_path = os.path.join(base_data_dir, category)
    if not os.path.isdir(category_path): # Only directories are classes
        continue
    print(f"Processing category: {category}") # This should print for each category
    for img_name in sorted(os.listdir(category_path)):
        img_path = os.path.join(category_path, img_name)
        try:
            img = cv2.imread(img_path) # BGR; None when the file cannot be decoded
            if img is None:
                print(f"Warning: Could not read image {img_path}. Skipping.")
                continue
            img = cv2.resize(img, (IMG_SIZE, IMG_SIZE))
            img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
            # Scale to [0, 1] as float32; plain `img / 255.0` would produce
            # float64 and double the memory held by the dataset.
            img = img.astype(np.float32) / 255.0
        except Exception as e:
            # Deliberate best-effort: one bad file must not abort the whole load.
            print(f"Error processing {img_path}: {e}. Skipping.")
            continue

        images.append(img)
        labels.append(category)
        image_paths.append(img_path)

images = np.array(images, dtype=np.float32)
labels = np.array(labels)

print(f"Total images loaded: {len(images)}") # IMPORTANT: This should be > 0
print(f"Total labels loaded: {len(labels)}") # IMPORTANT: This should be > 0

if len(images) == 0:
    print("ERROR: No images were loaded. This usually means the dataset download failed or the path to images is incorrect.")
    print("Please review the error messages above related to dataset download and verify your Kaggle API key and dataset access.")
    raise SystemExit(1)

# Map each class-folder name to an integer id.
label_encoder = LabelEncoder()
integer_encoded_labels = label_encoder.fit_transform(labels)
print(f"Classes found: {label_encoder.classes_}")

# Sanity-check the hard-coded NUM_CLASSES against what was actually found.
# Too many classes cannot be one-hot encoded into NUM_CLASSES columns, so fail
# with a clear message; too few still works but usually means the wrong
# directory was scanned, so warn.
num_classes_found = len(label_encoder.classes_)
if num_classes_found > NUM_CLASSES:
    raise ValueError(
        f"Found {num_classes_found} classes but NUM_CLASSES is {NUM_CLASSES}. "
        "Check the image directory or update NUM_CLASSES."
    )
if num_classes_found < NUM_CLASSES:
    print(f"Warning: Found only {num_classes_found} classes but NUM_CLASSES is {NUM_CLASSES}. "
          "The extra output units will never be used; verify the image directory.")

# One-hot encode for use with categorical cross-entropy.
one_hot_labels = to_categorical(integer_encoded_labels, num_classes=NUM_CLASSES)

# 70% train, then the remaining 30% split evenly into validation and test.
# Both splits are stratified so class proportions are preserved.
X_train, X_temp, y_train, y_temp = train_test_split(
    images, one_hot_labels, test_size=0.3, random_state=42, stratify=one_hot_labels
)
X_val, X_test, y_val, y_test = train_test_split(
    X_temp, y_temp, test_size=0.5, random_state=42, stratify=y_temp
)

print(f"Train set shape: {X_train.shape}, {y_train.shape}")
print(f"Validation set shape: {X_val.shape}, {y_val.shape}")
print(f"Test set shape: {X_test.shape}, {y_test.shape}")

# This is the crucial part for label_mapping.json: index -> class name, used to
# turn predicted indices back into names. JSON stores the keys as strings.
label_mapping = {index: str(label) for index, label in enumerate(label_encoder.classes_)}
label_mapping_path = '/content/drive/MyDrive/label_mapping.json'
try:
    with open(label_mapping_path, 'w') as f:
        json.dump(label_mapping, f)
except (OSError, TypeError, ValueError) as e:
    # Best-effort: report the failure (e.g. Drive not mounted) and carry on.
    print(f"ERROR: Could not save label_mapping.json: {e}") # Any error here?
else:
    print(f"Label mapping saved to {label_mapping_path}") # Look for this line!