<a href="https://colab.research.google.com/github/ayagup/stablediffusion/blob/main/image_to_text.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# IMPORTANT: SOME KAGGLE DATA SOURCES ARE PRIVATE
# RUN THIS CELL IN ORDER TO IMPORT YOUR KAGGLE DATA SOURCES.
# Log in through kagglehub so that the dataset download in the following cell
# can reach private Kaggle data sources.
import kagglehub
kagglehub.login()


In [None]:
# IMPORTANT: RUN THIS CELL IN ORDER TO IMPORT YOUR KAGGLE DATA SOURCES,
# THEN FEEL FREE TO DELETE THIS CELL.
# NOTE: THIS NOTEBOOK ENVIRONMENT DIFFERS FROM KAGGLE'S PYTHON
# ENVIRONMENT SO THERE MAY BE MISSING LIBRARIES USED BY YOUR
# NOTEBOOK.

# Fetch the 'maygup123/dataset1' Kaggle dataset and keep the value returned by
# kagglehub (presumably the local directory holding the files - confirm
# against the kagglehub documentation).
# NOTE(review): the captioning cell further down uses a hard-coded
# '/kaggle/input/dataset1/...' path rather than this variable.
maygup123_dataset1_path = kagglehub.dataset_download('maygup123/dataset1')

print('Data source import complete.')


In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the input tree and print the full path of every file found in it.
for root, _subdirs, names in os.walk('/kaggle/input'):
    for name in names:
        full_path = os.path.join(root, name)
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All"
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Install sentencepiece (0.1.99 or newer) with the %pip notebook magic.
# NOTE(review): presumably required by a tokenizer loaded later - confirm.
%pip install "sentencepiece>=0.1.99"

In [None]:
"""
Simple Image-to-Text (Image Captioning) Pipeline
Generate text captions from images using HuggingFace Transformers models
"""
import os

# Configure the native libraries through the environment *before* they are
# imported: a value set after the import may arrive too late to have any
# effect (e.g. log lines emitted while the library is being loaded).
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'
os.environ['PYTORCH_CUDA_ALLOC_CONF'] = 'expandable_segments:True'

# Monkey patch to fix transformers chat_template 404 error.
# It is applied once, at module level, before the `from transformers import`
# line below and before any model loading, so that (presumably) every module
# that looks up `list_repo_templates` afterwards sees the patched version.
# These imports intentionally sit above the remaining ones for that reason.
import transformers.utils.hub
import transformers.utils.hub as hub_utils

# Handle on the genuine implementation, captured before it is replaced
# (None if this transformers release does not define the function).
_original_list_repo_templates = getattr(hub_utils, 'list_repo_templates', None)


def _patched_list_repo_templates(*args, **kwargs):
    """Return empty list to avoid 404 errors on chat templates"""
    return []


# Apply the patch globally
hub_utils.list_repo_templates = _patched_list_repo_templates

import torch
from transformers import AutoProcessor, AutoModelForVision2Seq, BlipProcessor, BlipForConditionalGeneration
from PIL import Image
import gc
import time
from typing import Optional, List


def print_header():
    """Write the pipeline title to stdout, framed by two 70-character rules."""
    rule = "=" * 70
    banner = (
        f"\n{rule}",
        "üìù Image-to-Text (Captioning) Pipeline",
        rule,
    )
    for line in banner:
        print(line)


def clear_gpu_memory():
    """Release unused GPU memory held by PyTorch's caching allocator.

    Does nothing (and prints nothing) when CUDA is not available.

    The garbage collector runs *before* ``torch.cuda.empty_cache()`` so that
    objects which are only kept alive by reference cycles are destroyed first;
    the memory they occupied is then part of what ``empty_cache()`` can hand
    back. Collecting afterwards would leave that memory sitting in the cache.
    """
    if not torch.cuda.is_available():
        return

    gc.collect()
    torch.cuda.empty_cache()
    print("üßπ GPU memory cache cleared")


def load_image(image_path: str, max_size: Optional[int] = None, timeout: float = 30.0) -> Image.Image:
    """Load an image as RGB and optionally shrink it.

    Args:
        image_path: Local file path, or an ``http://`` / ``https://`` URL.
        max_size: If set (and non-zero), the image is downscaled with LANCZOS
            resampling so that its longest side equals ``max_size``, keeping
            the aspect ratio. Smaller images are left untouched.
        timeout: Seconds to wait for the HTTP server when ``image_path`` is a
            URL. Without a timeout a stalled server would block forever.

    Returns:
        The loaded (and possibly resized) RGB image.

    Raises:
        requests.HTTPError: If the URL answers with an error status.
        OSError: If the file cannot be opened or decoded as an image.
    """
    print(f"Loading image: {image_path}")

    if image_path.startswith(('http://', 'https://')):
        import requests
        from io import BytesIO
        response = requests.get(image_path, timeout=timeout)
        # Fail on 4xx/5xx here instead of trying to decode an error page.
        response.raise_for_status()
        image = Image.open(BytesIO(response.content)).convert('RGB')
    else:
        # convert() produces an independent in-memory image, so the file handle
        # can be closed as soon as the conversion is done.
        with Image.open(image_path) as source:
            image = source.convert('RGB')

    original_size = image.size
    print(f"Original size: {original_size[0]}x{original_size[1]}")

    # Resize if needed
    longest_side = max(image.size)
    if max_size and longest_side > max_size:
        ratio = max_size / longest_side
        # Clamp to 1px: truncation could otherwise yield a 0-pixel side for
        # images with an extreme aspect ratio, which resize() cannot handle.
        new_size = tuple(max(1, int(dim * ratio)) for dim in image.size)
        image = image.resize(new_size, Image.LANCZOS)
        print(f"Resized to: {new_size[0]}x{new_size[1]}")

    return image


def generate_caption(
    image_path: str,
    model_name: str = "Salesforce/blip-image-captioning-large",
    max_length: int = 50,
    num_beams: int = 5,
    max_image_size: Optional[int] = 1024,
    generate_multiple: bool = False,
    num_captions: int = 3,
) -> List[str]:
    """
    Generate text caption(s) for an image and print a progress report.

    The function loads the image, loads the requested model onto the GPU when
    one is available (CPU otherwise), runs generation and decodes the result.

    Args:
        image_path: Path to input image or URL
        model_name: HuggingFace model identifier. Names containing "blip"
            are loaded with the BLIP classes, everything else with the
            Auto classes.
        max_length: Maximum caption length (in tokens)
        num_beams: Number of beams for beam search. When several captions are
            requested in beam mode it is raised to ``num_captions`` if smaller.
        max_image_size: Maximum image dimension (None disables resizing)
        generate_multiple: Generate multiple captions (uses sampling)
        num_captions: Number of captions to generate (if generate_multiple=True)

    Returns:
        List of generated captions, stripped of surrounding whitespace

    Raises:
        ValueError: If ``generate_multiple`` is set and ``num_captions`` < 1.
    """
    if generate_multiple and num_captions < 1:
        raise ValueError("num_captions must be at least 1 when generate_multiple=True")

    # With beam search (num_beams > 1) generate() cannot return more sequences
    # than it has beams, so widen the beam instead of failing later.
    if generate_multiple and 1 < num_beams < num_captions:
        num_beams = num_captions

    print_header()
    clear_gpu_memory()

    # Device setup
    device = "cuda" if torch.cuda.is_available() else "cpu"

    if device == "cuda":
        gpu_name = torch.cuda.get_device_name(0)
        gpu_memory = torch.cuda.get_device_properties(0).total_memory / 1024**3
        print(f"\nüñ•Ô∏è  Using GPU: {gpu_name}")
        print(f"   Memory: {gpu_memory:.1f} GB\n")
    else:
        print("\nüíª Using CPU\n")

    # Load image
    image = load_image(image_path, max_image_size)

    # Load model
    print(f"\nLoading model: {model_name}")
    start_time = time.time()

    # NOTE(review): trust_remote_code=True lets a model repository execute its
    # own Python code on this machine - only use it with repositories you trust.
    # Use BLIP models with their specific processor
    if "blip" in model_name.lower():
        processor = BlipProcessor.from_pretrained(model_name)
        model = BlipForConditionalGeneration.from_pretrained(
            model_name,
            trust_remote_code=True
        )
    else:
        processor = AutoProcessor.from_pretrained(
            model_name,
            use_fast=True,
            trust_remote_code=True
        )
        model = AutoModelForVision2Seq.from_pretrained(
            model_name,
            trust_remote_code=True
        )

    model = model.to(device)
    model.eval()

    load_time = time.time() - start_time
    print(f"‚úì Model loaded in {load_time:.2f}s")

    # Print generation parameters
    print("\n" + "="*70)
    print("üé¨ Caption Generation Parameters")
    print("="*70)
    print(f"Input image: {image_path}")
    print(f"Image size: {image.size[0]}x{image.size[1]}")
    print(f"Model: {model_name}")
    print(f"Device: {device}")
    print(f"Max caption length: {max_length} tokens")
    print(f"Beam search: {num_beams} beams")
    if generate_multiple:
        print(f"Number of captions: {num_captions}")
    print("="*70)

    # Prepare inputs
    print("\nüìù Generating caption(s)...")
    inputs = processor(images=image, return_tensors="pt")
    inputs = {k: v.to(device) for k, v in inputs.items()}

    # Generate caption(s)
    generation_start = time.time()

    with torch.no_grad():
        if generate_multiple:
            # Generate multiple captions with sampling
            outputs = model.generate(
                **inputs,
                max_length=max_length,
                num_beams=num_beams,
                num_return_sequences=num_captions,
                do_sample=True,
                top_k=50,
                top_p=0.95,
                temperature=0.7,
            )
        else:
            # Generate single best caption
            outputs = model.generate(
                **inputs,
                max_length=max_length,
                num_beams=num_beams,
            )

    generation_time = time.time() - generation_start
    print(f"‚úì Generation completed in {generation_time:.3f}s")

    # Decode captions
    captions = processor.batch_decode(outputs, skip_special_tokens=True)

    # Clean up captions
    captions = [caption.strip() for caption in captions]

    # Print results
    print("\n" + "="*70)
    print("üìã Generated Caption(s)")
    print("="*70)

    if len(captions) == 1:
        print(f"\n{captions[0]}")
    else:
        for idx, caption in enumerate(captions, 1):
            print(f"\n{idx}. {caption}")

    print("\n" + "="*70)

    # Summary
    # start_time was taken before the model was loaded, so the elapsed time
    # already contains load_time; adding it again would count it twice.
    total_time = time.time() - start_time
    print("\n" + "="*70)
    print("‚úÖ Captioning Complete!")
    print("="*70)
    print(f"Total time: {total_time:.2f}s")
    print(f"  - Model loading: {load_time:.2f}s")
    print(f"  - Caption generation: {generation_time:.3f}s")
    print(f"\nCaptions generated: {len(captions)}")
    if captions:
        print(f"Caption length: {len(captions[0].split())} words")
    print("="*70 + "\n")

    # Drop the references to the model and tensors first; while they are
    # still alive their memory is in use and the cache clear cannot free it.
    del model, inputs, outputs
    clear_gpu_memory()

    return captions


2025-10-19 06:17:42.126792: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1760854662.150184     269 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1760854662.157385     269 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


In [None]:
# if __name__ == "__main__":
#     import argparse

#     parser = argparse.ArgumentParser(
#         description='Simple Image-to-Text Captioning',
#         formatter_class=argparse.RawDescriptionHelpFormatter,
#         epilog="""
# Examples:
#   # Basic captioning
#   python simple_image_to_text.py --image photo.jpg

#   # Generate multiple captions
#   python simple_image_to_text.py --image photo.jpg --multiple --num-captions 5

#   # Use different model
#   python simple_image_to_text.py --image photo.jpg --model Salesforce/blip-image-captioning-base

#   # From URL
#   python simple_image_to_text.py --image https://example.com/photo.jpg

#   # Longer captions
#   python simple_image_to_text.py --image photo.jpg --max-length 100
#         """
#     )

#     parser.add_argument('--image', type=str, required=True,
#                         help='Path to input image or URL')
#     parser.add_argument('--model', type=str,
#                         default='Salesforce/blip-image-captioning-large',
#                         help='HuggingFace model name')
#     parser.add_argument('--max-length', type=int, default=50,
#                         help='Maximum caption length in tokens')
#     parser.add_argument('--num-beams', type=int, default=5,
#                         help='Number of beams for beam search')
#     parser.add_argument('--max-size', type=int, default=1024,
#                         help='Maximum image dimension')
#     parser.add_argument('--multiple', action='store_true',
#                         help='Generate multiple captions')
#     parser.add_argument('--num-captions', type=int, default=3,
#                         help='Number of captions to generate (with --multiple)')

#     args = parser.parse_args()

# The fix for the transformers chat_template 404 error (patching list_repo_templates) is applied in the cell that defines generate_caption.


# Now import and use the function
# from simple_image_to_text import generate_caption

# captions = generate_caption(
#     image_path="/kaggle/input/dataset1/google-imagen-lead-image.jpeg",
#     model_name="Salesforce/blip-image-captioning-large"
# )

# Caption the sample image and echo the result; any failure is reported with
# its traceback instead of aborting the notebook cell.
try:
    caption_options = dict(
        image_path='/kaggle/input/dataset1/google-imagen-lead-image.jpeg',
        model_name='Salesforce/blip-image-captioning-large',
        max_length=50,
        num_beams=5,
        max_image_size=1024,
        generate_multiple=True,
        num_captions=3,
    )
    captions = generate_caption(**caption_options)

    # Repeat the captions in a compact form that is easy to pick up from stdout
    if len(captions) != 1:
        print("\nCaptions:")
        for number, text in enumerate(captions, start=1):
            print(f"  {number}. {text}")
    else:
        print(f"\nCaption: {captions[0]}")

except Exception as e:
    print(f"\n‚ùå Error: {e}")
    import traceback
    traceback.print_exc()


Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.



üìù Image-to-Text (Captioning) Pipeline
üßπ GPU memory cache cleared

üñ•Ô∏è  Using GPU: Tesla P100-PCIE-16GB
   Memory: 15.9 GB

Loading image: /kaggle/input/dataset1/google-imagen-lead-image.jpeg
Original size: 1180x885
Resized to: 1024x768

Loading model: Salesforce/blip-image-captioning-large


vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

config.json: 0.00B [00:00, ?B/s]

model.safetensors:   0%|          | 0.00/1.88G [00:00<?, ?B/s]

‚úì Model loaded in 9.68s

üé¨ Caption Generation Parameters
Input image: /kaggle/input/dataset1/google-imagen-lead-image.jpeg
Image size: 1024x768
Model: Salesforce/blip-image-captioning-large
Device: cuda
Max caption length: 50 tokens
Beam search: 5 beams
Number of captions: 3

üìù Generating caption(s)...
‚úì Generation completed in 2.073s

üìã Generated Caption(s)

1. blue bird sitting on top of a pile of macaroons on top of a table

2. blue bird sitting on top of a pile of macaroons on top of a plate

3. blue bird sitting on top of a plate of macarons with colorful macarons


‚úÖ Captioning Complete!
Total time: 21.50s
  - Model loading: 9.68s
  - Caption generation: 2.073s

Captions generated: 3
Caption length: 15 words

üßπ GPU memory cache cleared

Captions:
  1. blue bird sitting on top of a pile of macaroons on top of a table
  2. blue bird sitting on top of a pile of macaroons on top of a plate
  3. blue bird sitting on top of a plate of macarons with colorful macarons


In [None]:
# Pin httpx to exactly 0.27.2.
# NOTE(review): a later cell installs the looser range >=0.27.0,<0.28.0 -
# confirm which of the two pins is the intended one.
%pip install "httpx==0.27.2"

In [None]:
# Print the installed packages with their versions (handy for checking which
# httpx / huggingface-hub versions are actually active).
%pip freeze

In [None]:
!pip install 'httpx>=0.27.0,<0.28.0' 'huggingface-hub>=0.30.0' --quiet --upgrade

print("\n‚úÖ Installation complete!")
print("\n" + "="*70)
print("‚ö†Ô∏è  CRITICAL: You MUST restart your kernel now!")
print("="*70)
print("\nAfter restarting, your vit-gpt2 model should work.")
print("\nIf this still doesn't work, the issue is Kaggle-specific.")
print("In that case, please use BLIP model instead:")
print("  model_name='Salesforce/blip-image-captioning-large'")
print("="*70)
