<a href="https://colab.research.google.com/github/aizazaziz/ML_Projects/blob/main/Cat_vs_Dog_CNN_with_Data_Cleaning.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import os
import numpy as np
import pandas as pd
from PIL import Image, UnidentifiedImageError
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Flatten, Conv2D, MaxPooling2D
from tensorflow.keras.preprocessing.image import ImageDataGenerator
from sklearn.model_selection import train_test_split

# --- Helper Function to Clean Corrupted Images ---
def check_and_clean_dataframe(df, image_col='images'):
    """
    Iterates through image paths in the DataFrame, verifies image integrity
    using PIL, and returns a DataFrame containing only valid image paths.
    """
    valid_indices = []
    invalid_count = 0

    print("Starting robust image integrity check (This might take a few moments for large datasets)...")

    # Use iterrows to loop through the DataFrame rows
    for index, row in df.iterrows():
        filepath = row[image_col]

        # Skip if the file doesn't exist (though it shouldn't, just for safety)
        if not os.path.exists(filepath):
            invalid_count += 1
            continue

        try:
            # Check for zero size
            if os.path.getsize(filepath) == 0:
                invalid_count += 1
                continue

            # Attempt to open and verify the image
            img = Image.open(filepath)
            img.verify()
            img.close()
            valid_indices.append(index)

        except (UnidentifiedImageError, OSError, Exception) as e:
            # Catch file corruption, truncated reads, or unidentified format errors
            invalid_count += 1
            # Optional: uncomment to see which files are being removed
            # print(f"Removed invalid file: {filepath} ({type(e).__name__})")

    if invalid_count > 0:
        print(f"\n--- Data Cleaning Summary ---")
        print(f"Total files originally listed: {len(df)}")
        print(f"Total invalid/corrupt files removed: {invalid_count}")
        print(f"Total valid files remaining: {len(valid_indices)}")
        print("-----------------------------\n")
    else:
        print("\nIntegrity check complete: No corrupted files found.")

    return df.loc[valid_indices]


# --- 1. Data Collection and DataFrame Creation ---
image = []
labels = []
DATA_ROOT_DIR = "ct" # Base directory for Cat/Dog images

# Build the list of image paths and labels
for filename in os.listdir(DATA_ROOT_DIR):
    current_dir_path = os.path.join(DATA_ROOT_DIR, filename)

    if os.path.isdir(current_dir_path):
        for path in os.listdir(current_dir_path):
            full_path = os.path.join(current_dir_path, path)

            # Ensure we are only listing files
            if os.path.isfile(full_path):
                # Assuming 'Cat' directory leads to label 0, and others (e.g., 'Dog') to 1
                if filename == "Cat":
                    labels.append(0)
                else:
                    labels.append(1)
                image.append(full_path)

df = pd.DataFrame()
df['images'] = image
df['label'] = labels

if image:
    print(f"Example Path: {df.iloc[0]['images']}, Example Label: {df.iloc[0]['label']}")
else:
    print(f"No images processed. Ensure '{DATA_ROOT_DIR}' directory and its subfolders exist and contain images.")
    exit() # Exit if no images are found

# --- CRITICAL FIX: Clean Corrupted Files from the DataFrame ---
df = check_and_clean_dataframe(df)

# --- 2. Data Splitting and Preparation ---
# Convert label to string for flow_from_dataframe
df['label'] = df['label'].astype('str')
train, test = train_test_split(df, test_size=0.2, random_state=42)

print(f"Train set size (validated): {len(train)}")
print(f"Test set size (validated): {len(test)}")

# --- 3. ImageDataGenerator Setup ---
TARGET_SIZE = (224, 224)

# Data Augmentation for Training
train_datagen = ImageDataGenerator(rescale=1. / 255,
                                   horizontal_flip=True,
                                   rotation_range=20,
                                   fill_mode='nearest',
                                   zoom_range=0.2,
                                   shear_range=0.2)

# Only rescaling for Validation
val_datagen = ImageDataGenerator(rescale=1. / 255)

# Create Iterators
train_iterator = train_datagen.flow_from_dataframe(train,
                                                   x_col='images',
                                                   y_col='label',
                                                   target_size=TARGET_SIZE,
                                                   class_mode='binary',
                                                   batch_size=16,
                                                   shuffle=True)

val_iterator = val_datagen.flow_from_dataframe(test,
                                               x_col='images',
                                               y_col='label',
                                               target_size=TARGET_SIZE,
                                               class_mode='binary',
                                               batch_size=16,
                                               shuffle=True)


# --- 4. CNN Model Definition ---
# The UserWarning about input_shape is expected but harmless in Sequential model context

model = Sequential([
    Conv2D(16, (3, 3), activation='relu', input_shape=(TARGET_SIZE[0], TARGET_SIZE[1], 3)),
    MaxPooling2D((2, 2)),

    Conv2D(32, (3, 3), activation='relu'),
    MaxPooling2D((2, 2)),

    Conv2D(64, (3, 3), activation='relu'),
    MaxPooling2D((2, 2)),

    Flatten(),

    Dense(512, activation='relu'),
    # Final output layer: 1 neuron with 'sigmoid' for binary classification
    Dense(1, activation='sigmoid')
])

# FIX APPLIED: Corrected loss function spelling from 'binary_cross_entropy' to 'binary_crossentropy'
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
model.summary()

# --- 5. Model Training ---

print("\nStarting model training...")

# epochs were reduced to 5 in the previous run, keeping it at 5
history = model.fit(train_iterator,
                    epochs=5,
                    validation_data=val_iterator,
                    steps_per_epoch=len(train_iterator),
                    validation_steps=len(val_iterator)
                    )

print("\nTraining complete.")

# Optional: You can add code here to plot accuracy and loss over epochs.

FileNotFoundError: [Errno 2] No such file or directory: 'ct'

In [2]:
import streamlit as st
import numpy as np
from tensorflow.keras.models import load_model
from PIL import Image
import io

# --- Configuration ---
MODEL_SAVE_PATH = 'cat_dog_cnn_model.keras'
TARGET_SIZE = (224, 224)
CLASS_NAMES = {0: 'Cat', 1: 'Dog'}

# --- Model Loading (Caching to load only once) ---
# Streamlit caches this function's result, so the model loads quickly after the first run.
@st.cache_resource
def load_and_compile_model():
    """Loads the trained Keras model."""
    try:
        model = load_model(MODEL_SAVE_PATH)
        return model
    except Exception as e:
        st.error(f"Error loading the model. Ensure '{MODEL_SAVE_PATH}' exists.")
        st.error(e)
        return None

# --- Prediction Function ---
def predict_image(model, image_file):
    """
    Loads, preprocesses, and makes a prediction on the uploaded image.
    """
    try:
        # Load image from the uploaded file
        img = Image.open(image_file).convert("RGB")

        # Display the uploaded image
        st.image(img, caption='Uploaded Image', use_column_width=True)

        # Preprocess the image
        img = img.resize(TARGET_SIZE)
        img_array = np.array(img)
        img_array = np.expand_dims(img_array, axis=0) # Add batch dimension (1, 224, 224, 3)
        img_array = img_array.astype('float32') / 255.0 # Normalize (matching training)

        # Make the prediction
        prediction = model.predict(img_array)[0]
        confidence = float(prediction[0])

        # Determine the class and confidence
        if confidence >= 0.5:
            predicted_label = CLASS_NAMES[1]
            confidence_score = confidence
        else:
            predicted_label = CLASS_NAMES[0]
            # If the model predicts Cat (0), the confidence is 1 - prediction score
            confidence_score = 1.0 - confidence

        return predicted_label, confidence_score

    except Exception as e:
        st.error(f"An error occurred during prediction: {e}")
        return None, None

# --- Streamlit Main App Layout ---
def main():
    st.set_page_config(page_title="Cat vs. Dog Classifier", layout="centered")

    st.title("üê±üê∂ Simple Cat vs. Dog Image Classifier")
    st.markdown("Upload an image of a cat or a dog to see the model's prediction.")

    # Load the model
    model = load_and_compile_model()

    if model is None:
        st.stop() # Stop the app if model loading failed

    st.subheader("Upload an Image")
    uploaded_file = st.file_uploader("Choose an image file...", type=["jpg", "jpeg", "png"])

    if uploaded_file is not None:

        # Show a spinner while processing
        with st.spinner('Analyzing image...'):
            label, confidence = predict_image(model, uploaded_file)

        st.markdown("---")

        if label and confidence is not None:
            # Display the result using Markdown for emphasis
            st.success(f"## Prediction: {label}")

            # Display confidence using a progress bar
            st.write(f"Confidence: **{confidence * 100:.2f}%**")
            st.progress(confidence)

# Run the main function
if __name__ == '__main__':
    main()

ModuleNotFoundError: No module named 'streamlit'