<a href="https://colab.research.google.com/github/ayagup/stablediffusion/blob/main/text_to_image.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found anywhere below /kaggle/input.
for root, _subdirs, names in os.walk('/kaggle/input'):
    for full_path in (os.path.join(root, name) for name in names):
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All"
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
"""
Simple Text-to-Image Generation Pipeline
Generate images from text prompts using HuggingFace Diffusion models
"""

import os

# These variables are consumed by native libraries, so they are set BEFORE
# importing torch/diffusers: a library that is loaded as a side effect of
# those imports would otherwise never see the values.
# - TF_CPP_MIN_LOG_LEVEL=3 asks TensorFlow's C++ logging to be as quiet as
#   possible (NOTE(review): some very early registration messages may still
#   be printed regardless of this setting).
# - PYTORCH_CUDA_ALLOC_CONF configures PyTorch's CUDA caching allocator.
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'
os.environ['PYTORCH_CUDA_ALLOC_CONF'] = 'expandable_segments:True'

import gc
import time
from typing import Optional

import torch
from diffusers import DiffusionPipeline, StableDiffusionPipeline, StableDiffusionXLPipeline
from PIL import Image


def print_header():
    """Print the pipeline banner: a blank line, then the title framed by two 70-character rules."""
    rule = "=" * 70
    # One print call; sep="\n" puts each piece on its own line, exactly as
    # three consecutive print() calls would.
    print("\n" + rule, "üé® Text-to-Image Generation Pipeline", rule, sep="\n")


def clear_gpu_memory() -> None:
    """Release cached GPU memory; does nothing when CUDA is unavailable.

    The garbage collector runs first so that tensors kept alive only by
    reference cycles are destroyed before the cache is emptied. Emptying the
    cache first would leave the memory of those tensors sitting in PyTorch's
    caching allocator.
    """
    if torch.cuda.is_available():
        gc.collect()
        torch.cuda.empty_cache()
        print("üßπ GPU memory cache cleared")


def generate_image(
    prompt: str,
    model_name: str = "stabilityai/stable-diffusion-2-1",
    output_path: str = "generated_image.png",
    negative_prompt: Optional[str] = None,
    num_inference_steps: int = 50,
    guidance_scale: float = 7.5,
    width: int = 512,
    height: int = 512,
    seed: Optional[int] = None,
) -> Image.Image:
    """
    Generate an image from a text prompt, save it to disk and return it.

    The function loads the requested pipeline (the SDXL pipeline class when
    the model name contains "xl", the standard Stable Diffusion pipeline
    otherwise), runs it on the GPU when CUDA is available (float16 weights)
    or on the CPU (float32 weights), prints progress/timing information and
    writes the result to ``output_path``.

    Args:
        prompt: Text description of desired image
        model_name: HuggingFace model identifier
        output_path: Output path for generated image; a missing parent
            directory is created
        negative_prompt: What to avoid in generation
        num_inference_steps: Number of denoising steps (higher = better quality)
        guidance_scale: How closely to follow prompt (7-12 recommended)
        width: Image width in pixels (must be a multiple of 8)
        height: Image height in pixels (must be a multiple of 8)
        seed: Random seed for reproducibility

    Returns:
        Generated PIL Image

    Raises:
        ValueError: If ``width`` or ``height`` is not a multiple of 8.
    """

    # Fail fast: the Stable Diffusion pipelines reject such sizes with a
    # ValueError anyway, but only after the (slow) model load.
    if width % 8 != 0 or height % 8 != 0:
        raise ValueError(
            f"`height` and `width` have to be divisible by 8 but are {height} and {width}."
        )

    print_header()
    clear_gpu_memory()

    # Device setup
    on_gpu = torch.cuda.is_available()
    device = "cuda" if on_gpu else "cpu"

    if on_gpu:
        gpu_name = torch.cuda.get_device_name(0)
        gpu_memory = torch.cuda.get_device_properties(0).total_memory / 1024**3
        print(f"\nüñ•Ô∏è  Using GPU: {gpu_name}")
        print(f"   Memory: {gpu_memory:.1f} GB\n")
    else:
        print("\nüíª Using CPU (Warning: This will be very slow!)\n")

    # Load model
    print(f"Loading model: {model_name}")
    start_time = time.perf_counter()

    # Both pipeline classes take the same loading arguments, so only the
    # class differs between SDXL and non-SDXL checkpoints.
    if "xl" in model_name.lower():
        pipeline_cls = StableDiffusionXLPipeline
    else:
        pipeline_cls = StableDiffusionPipeline

    pipe = pipeline_cls.from_pretrained(
        model_name,
        torch_dtype=torch.float16 if on_gpu else torch.float32,
        use_safetensors=True,
        variant="fp16" if on_gpu else None,
    )
    pipe = pipe.to(device)

    # Enable memory optimizations
    if on_gpu:
        pipe.enable_attention_slicing()
        pipe.enable_vae_slicing()

    load_time = time.perf_counter() - start_time
    print(f"‚úì Model loaded in {load_time:.2f}s")

    # Set seed for reproducibility
    if seed is not None:
        generator = torch.Generator(device=device).manual_seed(seed)
        print(f"üé≤ Using seed: {seed}")
    else:
        generator = None

    # Print generation parameters
    print("\n" + "="*70)
    print("üé¨ Generation Parameters")
    print("="*70)
    print(f"Prompt: {prompt}")
    if negative_prompt:
        print(f"Negative: {negative_prompt}")
    print(f"Model: {model_name}")
    print(f"Device: {device}")
    print(f"Resolution: {width}x{height}")
    print(f"Steps: {num_inference_steps}")
    print(f"Guidance Scale: {guidance_scale}")
    if seed is not None:
        print(f"Seed: {seed}")
    print("="*70)

    # Generate image
    print("\nüé® Generating image...")
    print(f"Progress: This may take {num_inference_steps * 0.5:.0f}-{num_inference_steps:.0f} seconds...")

    generation_start = time.perf_counter()

    with torch.no_grad():
        image = pipe(
            prompt=prompt,
            negative_prompt=negative_prompt,
            num_inference_steps=num_inference_steps,
            guidance_scale=guidance_scale,
            width=width,
            height=height,
            generator=generator,
        ).images[0]

    generation_time = time.perf_counter() - generation_start
    print(f"‚úì Generation completed in {generation_time:.2f}s")

    # Save image (create the target directory first so a finished generation
    # is not lost to a missing folder)
    output_dir = os.path.dirname(output_path)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)
    image.save(output_path)
    print(f"\nüíæ Image saved: {output_path}")

    # Summary
    # start_time was taken before the model load, so the elapsed time since
    # then already contains load_time; adding it again would count it twice.
    total_time = time.perf_counter() - start_time
    print("\n" + "="*70)
    print("‚úÖ Generation Complete!")
    print("="*70)
    print(f"Total time: {total_time:.2f}s")
    print(f"  - Model loading: {load_time:.2f}s")
    print(f"  - Image generation: {generation_time:.2f}s")
    print(f"\nOutput: {output_path}")
    print(f"Resolution: {width}x{height}")
    print(f"Quality: {num_inference_steps} steps")
    print("="*70 + "\n")

    # Drop the last references to the pipeline before clearing the cache;
    # while `pipe` is alive its weights cannot be released.
    del pipe, generator
    clear_gpu_memory()

    return image




2025-10-18 20:04:07.087134: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1760817847.278311      37 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1760817847.335665      37 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


In [None]:
# if __name__ == "__main__":
#     import argparse

#     parser = argparse.ArgumentParser(
#         description='Simple Text-to-Image Generation',
#         formatter_class=argparse.RawDescriptionHelpFormatter,
#         epilog="""
# Examples:
#   # Basic generation
#   python simple_text_to_image.py --prompt "a beautiful sunset over mountains"

#   # High quality with more steps
#   python simple_text_to_image.py --prompt "a cat in a hat" --steps 100

#   # With negative prompt
#   python simple_text_to_image.py \
#     --prompt "a portrait of a person" \
#     --negative "blurry, low quality, distorted"

#   # Different resolution
#   python simple_text_to_image.py \
#     --prompt "a landscape painting" \
#     --width 768 --height 512

#   # Reproducible generation with seed
#   python simple_text_to_image.py \
#     --prompt "a fantasy castle" \
#     --seed 42
#         """
#     )

#     parser.add_argument('--prompt', type=str, required=True,
#                         help='Text prompt describing the desired image')
#     parser.add_argument('--negative', type=str, default=None,
#                         help='Negative prompt (what to avoid)')
#     parser.add_argument('--model', type=str,
#                         default='stabilityai/stable-diffusion-2-1',
#                         help='HuggingFace model name')
#     parser.add_argument('--output', type=str, default='generated_image.png',
#                         help='Output path for generated image')
#     parser.add_argument('--steps', type=int, default=50,
#                         help='Number of inference steps (20-100)')
#     parser.add_argument('--guidance', type=float, default=7.5,
#                         help='Guidance scale (7-12 recommended)')
#     parser.add_argument('--width', type=int, default=512,
#                         help='Image width')
#     parser.add_argument('--height', type=int, default=512,
#                         help='Image height')
#     parser.add_argument('--seed', type=int, default=None,
#                         help='Random seed for reproducibility')

#     args = parser.parse_args()

# Settings for a single demo generation, written to the Kaggle working directory.
run_settings = dict(
    prompt='very realistic image of a man with the head of a dog',
    model_name='stabilityai/stable-diffusion-2-1',
    output_path='/kaggle/working/output.jpg',
    negative_prompt='bad quality, low resolution image, low quality image',
    num_inference_steps=50,
    guidance_scale=7.5,
    width=512,
    height=512,
    seed=None,
)

# Any failure is reported (message plus traceback) instead of propagating.
try:
    generate_image(**run_settings)
except Exception as err:
    import traceback

    print(f"\n‚ùå Error: {err}")
    traceback.print_exc()



üé® Text-to-Image Generation Pipeline
üßπ GPU memory cache cleared

üñ•Ô∏è  Using GPU: Tesla P100-PCIE-16GB
   Memory: 15.9 GB

Loading model: stabilityai/stable-diffusion-2-1


Loading pipeline components...:   0%|          | 0/6 [00:00<?, ?it/s]

‚úì Model loaded in 1.66s

üé¨ Generation Parameters
Prompt: very realistic image of a man with the head of a dog
Negative: bad quality, low resolution image, low quality image
Model: stabilityai/stable-diffusion-2-1
Device: cuda
Resolution: 512x512
Steps: 50
Guidance Scale: 7.5

üé® Generating image...
Progress: This may take 25-50 seconds...


  0%|          | 0/50 [00:00<?, ?it/s]

‚úì Generation completed in 14.65s

üíæ Image saved: /kaggle/working/output.jpg

‚úÖ Generation Complete!
Total time: 17.98s
  - Model loading: 1.66s
  - Image generation: 14.65s

Output: /kaggle/working/output.jpg
Resolution: 512x512
Quality: 50 steps

üßπ GPU memory cache cleared
