# Cats-vs.-Dogs Image Classification. Preprocessing
This `utils` notebook contains the data preprocessing and transformation of the cats and dogs image dataset. In particular, the first 1,000 valid images of each class (2,000 images in total) are resized to 150x150 pixels, and their pixel values are saved as two CSV files, each stored in its own ZIP archive.

This notebook is designed to be executed in a Google Colab environment.

- **Dataset Reference**: Sachin, Shaunthesheep (2020). Dataset: Cats-vs-Dogs : image dataset for binary classification. URL: [https://www.kaggle.com/shaunthesheep/microsoft-catsvsdogs-dataset](https://www.kaggle.com/shaunthesheep/microsoft-catsvsdogs-dataset)

## Package dependencies, Drive mount and parameters definition

In [None]:
import kagglehub
import os
import shutil
import cv2
import pandas as pd
import numpy as np
from tensorflow.keras.preprocessing.image import ImageDataGenerator
from google.colab import files
from google.colab import drive
import stat
import sys
import io
import zipfile

# Mount Google Drive at /content/drive; every output path defined below
# lives under this mount point.
drive.mount('/content/drive', force_remount=True)

# Define parameters for dataset processing
# Dataset identifier passed to kagglehub.dataset_download.
DATASET = "shaunthesheep/microsoft-catsvsdogs-dataset"
# Upper bound on the number of images copied per class (cats and dogs).
MAX_IMAGES_PER_CLASS = 1000
# Directory that receives the copied images; the per-class limit is part of
# the folder name (e.g. ".../cats_dogs1000").
OUTPUT_DIR = f"/content/drive/MyDrive/Portfolio/data/cats_dogs{MAX_IMAGES_PER_CLASS}"
# ZIP archive that receives the first CSV.
ZIP_OUTPUT_1 = "/content/drive/MyDrive/Portfolio/data/CatsDogsDataset_1.zip"
# ZIP archive that receives the second CSV.
ZIP_OUTPUT_2 = "/content/drive/MyDrive/Portfolio/data/CatsDogsDataset_2.zip"
# Name given to the CSV member stored inside the ZIP archives.
CSV_FILENAME = "CatsDogsDataset.csv"
# Problematic files to skip (e.g. corrupted files); matched against the bare
# file name, so an entry applies to both the Cat and the Dog folder.
SKIP_FILES = ["7981.jpg", "10125.jpg", "10404.jpg", "10501.jpg", "10820.jpg",
              "10158.jpg", "10401.jpg", "10747.jpg", "10797.jpg"]

## Function definition

In [None]:
def is_valid_image_cv2(file_path):
    """Report whether OpenCV can decode ``file_path`` into a non-empty image.

    Returns:
        True when ``cv2.imread`` yields an array holding at least one element.
        False when it yields ``None``, an empty array, or when anything raises
        while the file is being read.
    """
    try:
        decoded = cv2.imread(file_path)
        # Both failure shapes (no array at all, or an array with zero
        # elements) are folded into a single boolean answer.
        return decoded is not None and decoded.size != 0
    except Exception:
        return False

def copy_images(source_dir, dest_dir, max_images, class_name):
    """Re-encode up to ``max_images`` valid images from ``source_dir`` into ``dest_dir``.

    Files are visited in sorted (lexicographic) file-name order.  Each readable
    image is decoded with OpenCV and written to
    ``<dest_dir>/<class_name>_<NNN>.jpg``, where ``NNN`` is the zero-padded
    running count of images written so far.  Names listed in ``SKIP_FILES``,
    non-files, and images that cannot be decoded or written are skipped.

    Args:
        source_dir: Directory holding the original images.
        dest_dir: Directory to write into; created if it does not exist.
        max_images: Maximum number of images to write.
        class_name: Prefix used for the destination file names.

    Returns:
        The number of images actually written to ``dest_dir``.
    """
    os.makedirs(dest_dir, exist_ok=True)  # Create destination directory if it doesn't exist
    count = 0  # Number of images successfully written so far
    for filename in sorted(os.listdir(source_dir)):
        if count >= max_images:  # Stop if max image limit is reached
            break
        if filename in SKIP_FILES:  # Skip specified problematic files
            continue
        src_path = os.path.join(source_dir, filename)
        if not os.path.isfile(src_path):
            continue
        dest_path = os.path.join(dest_dir, f"{class_name}_{count:03d}.jpg")
        try:
            # Decode once and reuse the array for both validation and writing.
            img = cv2.imread(src_path)
            if img is None or img.size == 0:
                continue  # Not a decodable image
            written = cv2.imwrite(dest_path, img)
        except Exception as exc:
            # Best effort: one bad file must not abort the whole copy.
            print(f"Skipping {src_path}: {exc}")
            continue
        if written:
            # Only count files that really exist on disk, so the numbering in
            # dest_dir has no gaps and the returned total is accurate.
            count += 1
        else:
            print(f"Skipping {src_path}: could not write {dest_path}")
    return count  # Return number of images copied

def create_data_generator(train_dir):
    """Build the directory iterator used to read the copied images.

    Args:
        train_dir: Root directory handed to ``flow_from_directory``.

    Returns:
        The iterator returned by ``ImageDataGenerator().flow_from_directory``,
        configured for 150x150 images, batches of 20, binary labels and no
        shuffling.
    """
    flow_options = dict(
        target_size=(150, 150),  # Every image is resized to 150x150
        batch_size=20,           # 20 images are yielded per batch
        class_mode='binary',     # Binary classification (cats vs dogs)
        shuffle=False,           # Keep order stable so file paths and labels line up
    )
    # No augmentation or rescaling options are passed to the generator itself.
    return ImageDataGenerator().flow_from_directory(train_dir, **flow_options)

def generate_column_names(height=150, width=150):
    """Generate pixel column names (R_i_j, G_i_j, B_i_j) for a flattened RGB image.

    Names are emitted row by row, column by column, with the three channels of
    each pixel adjacent (R, G, B), using 1-based row and column indices.

    Args:
        height: Number of pixel rows (defaults to 150).
        width: Number of pixel columns (defaults to 150).

    Returns:
        A list of ``height * width * 3`` column names.
    """
    return [
        f"{channel}_{row}_{col}"
        for row in range(1, height + 1)      # Rows 1..height
        for col in range(1, width + 1)       # Columns 1..width
        for channel in ("R", "G", "B")       # Channel order within one pixel
    ]

def extract_pixels_and_save_in_batches(generator, output_zip1, output_zip2, csv_filename1, csv_filename2):
    """Extract pixel values in batches and save as two CSVs in separate ZIP files.

    Every image produced by ``generator`` becomes one CSV row: its flattened
    pixel values cast to uint8, followed by its integer label in the ``Class``
    column.  The first ``len(generator.filenames) // 2`` images (in generator
    order) are written to the first CSV and all remaining images to the
    second, regardless of how the images fall into processing batches.

    Args:
        generator: Batch iterator supporting ``len()``, ``next()`` and a
            ``filenames`` attribute; each batch is an ``(images, labels)``
            pair.  Images are assumed to flatten to the number of columns
            produced by ``generate_column_names()`` (150x150x3) - confirm if
            the generator's target size is changed.
        output_zip1: Path of the ZIP file receiving the first CSV.
        output_zip2: Path of the ZIP file receiving the second CSV.
        csv_filename1: Member name of the CSV inside ``output_zip1``.
        csv_filename2: Member name of the CSV inside ``output_zip2``.
    """
    columns = generate_column_names() + ['Class']  # Pixel columns plus class column
    total_images = len(generator.filenames)  # Total number of images
    batch_size = 200  # Rows accumulated in memory before being flushed
    split_point = total_images // 2  # Images with index < split_point go to CSV 1

    # Initialize buffers for two CSVs and write the header row to each
    csv_buffer1 = io.StringIO()
    csv_buffer2 = io.StringIO()
    header = pd.DataFrame(columns=columns)
    header.to_csv(csv_buffer1, index=False)
    header.to_csv(csv_buffer2, index=False)

    def flush(rows, row_labels, first_index):
        """Append accumulated rows to the CSV buffers, splitting at split_point."""
        data = np.hstack((np.array(rows), np.array(row_labels).reshape(-1, 1)))
        df = pd.DataFrame(data, columns=columns)
        # Route by global image index rather than by whole batch, so a batch
        # that straddles the split point is divided between the two CSVs.
        rows_for_first = min(max(split_point - first_index, 0), len(df))
        if rows_for_first:
            df.iloc[:rows_for_first].to_csv(csv_buffer1, header=False, index=False)
        if rows_for_first < len(df):
            df.iloc[rows_for_first:].to_csv(csv_buffer2, header=False, index=False)

    pixel_data = []  # Flattened pixel rows awaiting a flush
    labels = []  # Class labels matching pixel_data
    image_count = 0  # Track total processed images

    for _ in range(len(generator)):
        images, batch_labels = next(generator)  # Get next batch of images and labels
        for image, label in zip(images, batch_labels):
            pixel_data.append(image.flatten().astype(np.uint8))  # Flatten image to 1D array
            labels.append(int(label))
            image_count += 1

            # Flush once batch_size rows have accumulated to bound memory use
            if len(pixel_data) >= batch_size:
                flush(pixel_data, labels, image_count - len(pixel_data))
                pixel_data, labels = [], []  # Clear memory

    # Save any remaining data
    if pixel_data:
        flush(pixel_data, labels, image_count - len(pixel_data))

    # Write CSV buffers to two ZIP files
    with zipfile.ZipFile(output_zip1, 'w', compression=zipfile.ZIP_DEFLATED) as zf:
        zf.writestr(csv_filename1, csv_buffer1.getvalue())
    with zipfile.ZipFile(output_zip2, 'w', compression=zipfile.ZIP_DEFLATED) as zf:
        zf.writestr(csv_filename2, csv_buffer2.getvalue())

def delete_all_images(dataset_path, output_dir):
    """Delete the copied images under ``output_dir``, handling read-only files.

    Deletion is best effort: a failure is reported with a printed warning
    instead of being raised, so cleanup problems never abort the notebook.

    Args:
        dataset_path: Location of the downloaded dataset.  Accepted for
            interface compatibility only; the download is intentionally left
            in place because its location is environment-specific.
        output_dir: Directory tree of copied images to remove.  A path that
            does not exist is ignored.
    """
    def remove_readonly(func, path, _):
        """Clear read-only flag and retry deletion."""
        os.chmod(path, stat.S_IWRITE)
        func(path)

    if not os.path.exists(output_dir):
        return
    try:
        shutil.rmtree(output_dir, onerror=remove_readonly)
    except OSError as exc:
        # Surface the problem so leftover files are not a silent surprise.
        print(f"Warning: could not fully delete {output_dir}: {exc}")

## Execution: Main

In [None]:
# Pipeline driver: download the dataset, copy a capped number of valid images
# per class into OUTPUT_DIR, convert them to pixel CSVs stored in two ZIP
# files, then delete the copied images.
print("Downloading dataset...")
try:
    path = kagglehub.dataset_download(DATASET)
    print(f"Dataset downloaded to: {path}")
except Exception as e:
    # Report the failure, then re-raise so the rest of the cell does not run
    # without a dataset path.
    print(f"Failed to download dataset: {e}")
    raise

# Define paths: source class folders inside the download, and the matching
# per-class destination folders under OUTPUT_DIR.
cat_dir = os.path.join(path, "PetImages", "Cat")
dog_dir = os.path.join(path, "PetImages", "Dog")
output_cat_dir = os.path.join(OUTPUT_DIR, "Cat")
output_dog_dir = os.path.join(OUTPUT_DIR, "Dog")

# Check if folders exist; abort early when the download does not have the
# expected PetImages/Cat and PetImages/Dog layout.
if not os.path.exists(cat_dir) or not os.path.exists(dog_dir):
    print("PetImages/Cat or PetImages/Dog folders not found")
    raise FileNotFoundError("Required dataset folders missing")

# Copy images: at most MAX_IMAGES_PER_CLASS per class, named cat_NNN.jpg /
# dog_NNN.jpg by copy_images.
print("Copying cat images...")
cat_count = copy_images(cat_dir, output_cat_dir, MAX_IMAGES_PER_CLASS, "cat")
print("Copying dog images...")
dog_count = copy_images(dog_dir, output_dog_dir, MAX_IMAGES_PER_CLASS, "dog")
print(f"Copied {cat_count} cat images and {dog_count} dog images")

# Verify output directories: both classes must have at least one image.
if cat_count == 0 or dog_count == 0:
    print("No images copied for one or both classes")
    raise ValueError("Image copying failed")

# Create data generator over OUTPUT_DIR (which contains the Cat and Dog
# subfolders created above).
print("Creating data generator...")
train_generator = create_data_generator(OUTPUT_DIR)
if train_generator.samples == 0:
    print("No images found by data generator")
    raise ValueError("Data generator is empty")

# Extract and save in batches.
# NOTE: CSV_FILENAME is passed for both CSVs, so the member inside each ZIP
# archive has the same name; only the ZIP file names differ.
print("Extracting and saving pixel values in batches...")
extract_pixels_and_save_in_batches(train_generator, ZIP_OUTPUT_1, ZIP_OUTPUT_2, CSV_FILENAME, CSV_FILENAME)

# Delete images: the copies under OUTPUT_DIR are no longer needed once the
# ZIP files have been written.
print("Deleting images...")
delete_all_images(path, OUTPUT_DIR)

In [None]:
# List the Drive data folder (presumably to confirm the ZIP outputs are there).
!ls /content/drive/MyDrive/Portfolio/data/