<a href="https://colab.research.google.com/github/acmeproducts/4mysunshine/blob/main/AI_Image_Catalog_Generator_(Outputting_Multiple_Files).ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# @title AI Image Catalog Generator
# This cell contains the complete Python script for generating the AI Image Catalog.
# Run this entire cell in your Google Colab notebook.

import os
from IPython.display import HTML, display
from datetime import datetime
import shutil # Added for removing directory contents

# --- Start of Program: Define paths and get user input immediately ---

print("--- Starting AI Image Catalog Generator ---")

# Base directory for image generation and the subfolder where generated images reside.
# Both paths live under the Drive mount point, so they are only created AFTER
# Drive has been mounted (end of Step 2). Creating them earlier would write
# plain local folders into '/content/drive', which is exactly what makes the
# mount point non-empty.
BASE_IMAGE_GENERATION_DIR = '/content/drive/MyDrive/Image_Generation'
GENERATED_IMAGES_BASE_DIR = os.path.join(BASE_IMAGE_GENERATION_DIR, 'generated_images')

# --- Step 1: Prompt for the specific folder name or all folders ---
# This is the very first interactive step for the user.
target_folder_input = input("\nStep 1: Enter a specific subfolder name within 'generated_images' to process, or leave blank to process ALL subfolders: ").strip()

# --- Step 2: Mount Google Drive ---
# This is the first interaction requiring user approval after providing the folder input.
print("\nStep 2: Mounting Google Drive...")
try:
    if os.path.ismount('/content/drive'):
        # Drive is already mounted (e.g. the cell is being re-run). The directory
        # listing is then the user's real Drive, so it must never be cleared:
        # removing it would delete Drive files, not stale local leftovers.
        print("  '/content/drive' is already a mount point. Leaving its contents untouched.")
    elif os.path.isdir('/content/drive') and os.listdir('/content/drive'):
        # Not a mount point, so anything in here is local residue that would
        # trigger the "Mountpoint must not already contain files" error.
        print("  '/content/drive' is not empty. Attempting to clear...")
        try:
            shutil.rmtree('/content/drive') # Removes the directory and its contents
            os.makedirs('/content/drive') # Recreate the empty mount point
            print("  '/content/drive' cleared successfully.")
        except OSError as clear_e:
            # Best effort only: let the mount attempt report the real problem.
            print(f"  WARNING: Could not clear '/content/drive': {clear_e}. Proceeding with mount attempt.")

    from google.colab import drive
    drive.mount('/content/drive')
    print("Google Drive mounted successfully.")
except Exception as e:
    # Chain the original error so the underlying cause stays in the traceback.
    raise RuntimeError(f"ERROR: Could not mount Google Drive. Ensure you are running in Colab and grant permissions: {e}") from e

# Ensure the base directories exist on the mounted Drive (quick and non-interactive).
os.makedirs(BASE_IMAGE_GENERATION_DIR, exist_ok=True)
os.makedirs(GENERATED_IMAGES_BASE_DIR, exist_ok=True)

print(f"Ensuring base directory: {BASE_IMAGE_GENERATION_DIR}")
print(f"Ensuring generated images directory: {GENERATED_IMAGES_BASE_DIR}")

# --- Step 3: Authenticate PyDrive2 ---
# This is the second interaction requiring user approval.
# The resulting `drive_service` object is what the helper functions below use
# to look up Google Drive file IDs.
print("\nStep 3: Authenticating with Google Drive for file ID retrieval (PyDrive2)...")
try:
    from pydrive2.auth import GoogleAuth
    from pydrive2.drive import GoogleDrive
    from google.colab import auth
    from oauth2client.client import GoogleCredentials

    # Authenticate the Colab user, then hand the application-default
    # credentials to PyDrive2.
    auth.authenticate_user()
    gauth = GoogleAuth()
    gauth.credentials = GoogleCredentials.get_application_default()
    drive_service = GoogleDrive(gauth)
    print("PyDrive2 authentication successful!")
except Exception as e:
    print(f"Authentication failed for PyDrive2: {e}")
    print("Please ensure you follow the authentication steps carefully (open link, authorize, paste code, press Enter).")
    # Chain the original error so the underlying cause stays in the traceback.
    raise RuntimeError(f"ERROR: PyDrive2 authentication failed. Details: {e}") from e

# --- Step 4: Install necessary Python libraries ---
# This will run AFTER all user prompts and authentication.
# The `!pip` line below is an IPython shell escape, so this cell only runs
# inside a notebook environment, not as a plain Python script.
# NOTE(review): scikit-image and opencv-python are installed here, but nothing
# in the visible code imports them — confirm they are still needed.
print("\nStep 4: Installing required libraries (this may take a few moments)...")
!pip install -qqq scikit-image opencv-python transformers torch
# NOTE(review): this message is printed unconditionally; the result of the
# pip command is not checked here.
print("Libraries installed successfully.")

# Import libraries that depend on the pip install
# These imports are deliberately placed after the `!pip install` command.
from PIL import Image  # used to open and verify image files
import warnings
from transformers import pipeline  # used to build the image-captioning pipeline
import torch  # used to detect whether CUDA is available
import json # NOTE(review): not referenced anywhere in the visible code

# Suppress all warnings for cleaner notebook output
warnings.filterwarnings("ignore")

# --- Helper Functions ---

def get_drive_view_url(file_id):
    """Return the shareable Google Drive "view" link for the given file ID.

    Args:
        file_id: Google Drive file ID; it is interpolated into the URL as-is.

    Returns:
        str: URL of the form ``https://drive.google.com/file/d/<id>/view?usp=sharing``.
    """
    view_url = f'https://drive.google.com/file/d/{file_id}/view?usp=sharing'
    return view_url

def get_images_and_ids_from_drive_folder(drive_service_instance, folder_path):
    """
    Resolve a mounted-Drive folder path to its Drive folder ID, then collect
    every image in that folder that can also be opened from the local mount.

    Args:
        drive_service_instance: Authenticated GoogleDrive object from pydrive2.
        folder_path (str): Full path of the folder inside the Colab Drive mount
            (i.e. under '/content/drive/MyDrive/').
    Returns:
        list: One dict per usable image with the keys 'title', 'id' and
              'local_path'. An empty list is returned when a path component
              cannot be found on Drive or when no valid images exist.
    """
    # Express the local mount path relative to the Drive root, then split it
    # into folder names. A leading 'My Drive' is dropped because the walk
    # below already starts at the 'root' folder.
    drive_relative = folder_path.replace('/content/drive/MyDrive/', 'My Drive/')
    segments = [segment for segment in drive_relative.strip('/').split('/') if segment]
    if segments and segments[0] == 'My Drive':
        segments = segments[1:]

    # Walk down one level per segment, matching child folders by exact title.
    parent_id = 'root'
    for segment in segments:
        child_folders = drive_service_instance.ListFile({'q': f"'{parent_id}' in parents and mimeType='application/vnd.google-apps.folder' and trashed=false"}).GetList()
        matching_folder = next((child for child in child_folders if child['title'] == segment), None)
        if matching_folder is None:
            print(f"Error: Folder '{segment}' not found in the path. Please check your path: '{folder_path}'")
            return []
        parent_id = matching_folder['id']

    # Ask Drive for the images in the resolved folder and keep only those that
    # PIL can open and verify through the local mount.
    verified_images = []
    drive_images = drive_service_instance.ListFile({'q': f"'{parent_id}' in parents and mimeType contains 'image/' and trashed=false"}).GetList()
    for drive_image in drive_images:
        image_title = drive_image['title']
        image_local_path = os.path.join(folder_path, image_title)
        try:
            with Image.open(image_local_path) as opened_image:
                opened_image.verify()  # raises if the file is not a valid image
            verified_images.append({
                'title': drive_image['title'],
                'id': drive_image['id'],
                'local_path': image_local_path
            })
        except (IOError, SyntaxError, FileNotFoundError) as e:
            print(f"WARNING: File '{image_title}' at '{image_local_path}' is not a valid image or could not be opened. Skipping. Error: {e}")
    return verified_images


def generate_html_catalog_file(catalog_data_chunk, part_number, base_dir, timestamp):
    """Generates a single HTML file for a chunk of image data.

    The rows are grouped by folder (folders sorted by name) and rendered as one
    table per folder. Folder names, filenames, captions and links are
    HTML-escaped so that characters such as '<', '&' or quotes in a filename or
    AI caption cannot break the page markup.

    Args:
        catalog_data_chunk: Iterable of dicts with the keys 'folder_name',
            'image_filename', 'ai_caption' and 'file_id'.
        part_number: Part number shown in the page title and used in the filename.
        base_dir (str): Directory the HTML file is written into.
        timestamp (str): Timestamp string embedded in the output filename.

    Returns:
        str | None: Path of the written file, or None if writing failed.
    """
    # Local import keeps this function self-contained.
    from html import escape

    output_html_filename = f"catalog_{timestamp}_Part_{part_number}.html"
    workbench_html_path = os.path.join(base_dir, output_html_filename)

    # Collect fragments in a list and join once, instead of repeated `+=`.
    html_parts = [f"""
    <!DOCTYPE html>
    <html lang="en">
    <head>
        <meta charset="UTF-8">
        <meta name="viewport" content="width=device-width, initial-scale=1.0">
        <title>AI Image Catalog - Part {part_number}</title>
        <style>
            body {{ font-family: sans-serif; margin: 20px; background-color: #f4f4f4; color: #333; }}
            .container {{ max-width: 1200px; margin: auto; background-color: #fff; padding: 20px; border-radius: 8px; box-shadow: 0 0 10px rgba(0, 0, 0, 0.1); }}
            h1 {{ text-align: center; color: #0056b3; }}
            h2 {{ color: #0056b3; margin-top: 30px; border-bottom: 2px solid #eee; padding-bottom: 5px; }}
            table {{ width: 100%; border-collapse: collapse; margin-top: 15px; }}
            th, td {{ border: 1px solid #ddd; padding: 8px; text-align: left; }}
            th {{ background-color: #f2f2f2; }}
            a {{ color: #007bff; text-decoration: none; }}
            a:hover {{ text-decoration: underline; }}
        </style>
    </head>
    <body>
        <div class="container">
            <h1>AI Generated Image Catalog - Part {part_number}</h1>
            <p>Generated at: {datetime.now().strftime("%Y-%m-%d %H:%M:%S")}</p>
    """]

    # Group data by folder for this chunk
    grouped_data_chunk = {}
    for item in catalog_data_chunk:
        grouped_data_chunk.setdefault(item['folder_name'], []).append(item)

    for folder_name in sorted(grouped_data_chunk):
        html_parts.append(f"        <h2>Folder: {escape(str(folder_name))}</h2>\n")
        html_parts.append("""
            <table>
                <thead>
                    <tr>
                        <th>Filename</th>
                        <th>External Link</th>
                        <th>AI Caption</th>
                    </tr>
                </thead>
                <tbody>
        """)
        for data in grouped_data_chunk[folder_name]:
            # Escape everything that originates outside this function before
            # it is placed into element text or an attribute value.
            full_size_link = escape(get_drive_view_url(data['file_id']))
            filename_html = escape(str(data['image_filename']))
            caption_html = escape(str(data['ai_caption']))
            html_parts.append(f"""
                    <tr>
                        <td>{filename_html}</td>
                        <td><a href="{full_size_link}" target="_blank">View File</a></td>
                        <td>{caption_html}</td>
                    </tr>
            """)
        html_parts.append("""
                </tbody>
            </table>
        """)

    html_parts.append("""
        </div>
    </body>
    </html>
    """)
    html_content = "".join(html_parts)

    try:
        # The page declares charset UTF-8, so write it as UTF-8 regardless of
        # the platform's default encoding.
        with open(workbench_html_path, "w", encoding="utf-8") as f:
            f.write(html_content)
        print(f"  HTML Catalog Part {part_number} generated at: {workbench_html_path}")
        return workbench_html_path
    except Exception as e:
        # Deliberate best effort: report the failure and signal it with None so
        # the caller can continue with the remaining parts.
        print(f"  ERROR: Failed to write HTML Catalog Part {part_number} to Google Drive. Error: {e}")
        return None


# --- Step 5: Load Image Captioning pipeline and process images ---
print("\nStep 5: Loading Image Captioning pipeline and processing images...")

# Remains None if the model cannot be loaded; the processing loop then keeps
# its fallback caption for every image.
captioning_pipeline = None
try:
    # Device index passed to the pipeline: 0 when CUDA is available, otherwise -1.
    if torch.cuda.is_available():
        device = 0
    else:
        device = -1
    captioning_pipeline = pipeline("image-to-text", model="Salesforce/blip-image-captioning-base", device=device)
    print("Image Captioning pipeline loaded successfully.")
except Exception as e:
    print(f"ERROR: Failed to load Image Captioning pipeline. Ensure internet connection. Using dummy captions. Error: {e}")

# Turn the Step 1 answer into the list of folder paths to catalog.
folders_to_process = []
if not target_folder_input:
    # Blank input: take every direct subdirectory of the generated-images folder.
    candidate_paths = [
        os.path.join(GENERATED_IMAGES_BASE_DIR, entry_name)
        for entry_name in os.listdir(GENERATED_IMAGES_BASE_DIR)
    ]
    folders_to_process.extend(path for path in candidate_paths if os.path.isdir(path))
else:
    # A name was given: accept it only if it is an existing directory.
    requested_folder_path = os.path.join(GENERATED_IMAGES_BASE_DIR, target_folder_input)
    if not os.path.isdir(requested_folder_path):
        print(f"ERROR: The specified folder '{target_folder_input}' does not exist under '{GENERATED_IMAGES_BASE_DIR}'.")
    else:
        folders_to_process.append(requested_folder_path)

# Nothing to do: stop here with an explanatory error.
if len(folders_to_process) == 0:
    raise FileNotFoundError(f"No folders found to process in '{GENERATED_IMAGES_BASE_DIR}' (or the specified folder was not found). Please ensure images are organized in subfolders and the path is correct, then re-run.")

# Batching state: catalog rows accumulate in `image_batch_for_html` and are
# written out as one HTML part every IMAGES_PER_FILE processed images.
IMAGES_PER_FILE = 100 # Define how many images per HTML file
current_datetime_str = datetime.now().strftime("%Y%m%d_%H%M%S")
generated_file_links = []
image_batch_for_html = []
html_file_part_number = 1
processed_image_count = 0

print("\n--- Starting Image Processing and HTML Generation ---")

for folder_path in folders_to_process:
    folder_name = os.path.basename(folder_path)
    print(f"\nProcessing folder: {folder_name}")

    images_info = get_images_and_ids_from_drive_folder(drive_service, folder_path)
    if not images_info:
        print(f"No images found in folder: {folder_name}. Skipping this folder.")
        continue

    images_in_folder = len(images_info)
    for position, img_info in enumerate(images_info, start=1):
        img_filename = img_info['title']
        img_path = img_info['local_path']
        file_id = img_info['id']

        print(f"  Processing {position}/{images_in_folder} in '{folder_name}': {img_filename}")

        # Fallback text; replaced only when the pipeline produces a caption.
        ai_caption = "Could not generate caption."
        if captioning_pipeline:
            try:
                with Image.open(img_path) as img:
                    caption_text = captioning_pipeline(img)[0]['generated_text']
                ai_caption = caption_text
            except Exception as e:
                print(f"    WARNING: Error generating caption for {img_filename}. Error: {e}")

        catalog_row = {
            'folder_name': folder_name,
            'image_filename': img_filename,
            'ai_caption': ai_caption,
            'file_id': file_id
        }
        image_batch_for_html.append(catalog_row)
        processed_image_count += 1

        # Every IMAGES_PER_FILE processed images, write the batch out as one part.
        if processed_image_count % IMAGES_PER_FILE == 0:
            print(f"\nGenerating HTML file for Part {html_file_part_number}...")
            file_path = generate_html_catalog_file(image_batch_for_html, html_file_part_number, BASE_IMAGE_GENERATION_DIR, current_datetime_str)
            if file_path:
                generated_file_links.append(file_path)
            html_file_part_number += 1
            image_batch_for_html = [] # Start a fresh batch

# --- Handle any remaining images (a final, partially filled batch) ---
if image_batch_for_html:
    print(f"\nGenerating final HTML file for Part {html_file_part_number} (remaining images)...")
    file_path = generate_html_catalog_file(image_batch_for_html, html_file_part_number, BASE_IMAGE_GENERATION_DIR, current_datetime_str)
    if file_path:
        generated_file_links.append(file_path)

# Show one link per generated catalog file, or explain why there are none.
if generated_file_links:
    print("\n--- All Backend Processing Complete ---")
    print("**CLICK THE LINKS BELOW TO OPEN YOUR IMAGE AUTO-CATALOG FILES:**")
    for link_path in generated_file_links:
        display(HTML(f"<a href='file://{link_path}' target='_blank'>Open Image Auto-Catalog: {os.path.basename(link_path)}</a>"))
else:
    print("\nNo images were processed across any selected folders or no HTML files were generated. Catalog will be empty.")