In [None]:
!pip -q install torch diffusers open-clip-torch Pillow numpy langchain langchain-groq gradio


In [None]:
import os

import numpy as np
import open_clip
import torch
from diffusers import StableDiffusionPipeline
from langchain_core.output_parsers import StrOutputParser
from langchain_core.prompts import ChatPromptTemplate
from langchain_groq import ChatGroq
from PIL import Image

In [None]:
# Prefer the GPU whenever PyTorch can see one; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

In [None]:
# open_clip.create_model_and_transforms returns
# (model, train-time image transform, eval-time image transform).
# It does NOT return a tokenizer, so the middle value is kept only for completeness
# and the text tokenizer is obtained separately via open_clip.get_tokenizer.
clip_model, _preprocess_train, preprocess = open_clip.create_model_and_transforms(
    'ViT-B-32', pretrained='laion2b_s34b_b79k', device=device
)
# The model is only used for scoring, so switch it to inference mode.
clip_model.eval()

clip_tokenizer = open_clip.get_tokenizer('ViT-B-32')
tokenizer = clip_tokenizer

In [None]:
# Loaded Stable Diffusion pipelines keyed by (model id, device). Reusing them
# avoids reloading the weights on every call (e.g. on every button click).
_PIPELINE_CACHE = {}


def _get_pipeline(model_id, target_device):
    """Return a StableDiffusionPipeline for ``model_id`` on ``target_device``, loading it once."""
    key = (model_id, str(target_device))
    if key not in _PIPELINE_CACHE:
        # Half precision is only used on CUDA; on CPU the pipeline runs in float32.
        on_cuda = str(target_device).startswith("cuda")
        dtype = torch.float16 if on_cuda else torch.float32
        pipe = StableDiffusionPipeline.from_pretrained(model_id, torch_dtype=dtype)
        _PIPELINE_CACHE[key] = pipe.to(target_device)
    return _PIPELINE_CACHE[key]


def blend_prompts_and_generate(prompt_a, prompt_b, num_candidates=20, seed=42):
    """Generate an image that blends two text prompts and return the best-scoring candidate.

    Both prompts are first rewritten by a Groq-hosted LLM, then encoded with the
    Stable Diffusion text encoder. For ``num_candidates`` blend weights ``alpha``
    evenly spaced in [0.4, 0.6], the embeddings are mixed as
    ``alpha * A + (1 - alpha) * B`` and an image is generated from the same
    seeded starting latents. Each image is scored by CLIP cosine similarity
    against the text ``"{prompt_a},{prompt_b},aesthetic,visually pleasing"``.

    Uses the module-level ``device``, ``clip_model`` and ``preprocess`` objects.

    Args:
        prompt_a: First user prompt (string).
        prompt_b: Second user prompt (string).
        num_candidates: Number of blend weights / images to try (default 20).
        seed: Seed for the starting latents shared by all candidates (default 42).

    Returns:
        The PIL image with the highest score (the first one on ties).

    Raises:
        RuntimeError: If the ``GROQ_API_KEY`` environment variable is not set.
        ValueError: If ``num_candidates`` is smaller than 1.
    """
    if num_candidates < 1:
        raise ValueError("num_candidates must be at least 1")

    # Secrets must never live in the notebook source; read the key from the
    # environment instead. Any key that was ever committed should be revoked.
    api_key = os.environ.get("GROQ_API_KEY")
    if not api_key:
        raise RuntimeError(
            "GROQ_API_KEY environment variable is not set; "
            "export it before calling blend_prompts_and_generate."
        )

    on_cuda = str(device).startswith("cuda")
    autocast_device = "cuda" if on_cuda else "cpu"

    aesthetic_prompts = f"{prompt_a},{prompt_b},aesthetic,visually pleasing"

    pipe = _get_pipeline("runwayml/stable-diffusion-v1-5", device)

    llm = ChatGroq(
        temperature=0.3,
        model="llama3-70b-8192",
        api_key=api_key,
    )

    # The instructions are joined with newlines so that adjacent sentences do
    # not run into each other (plain adjacent string literals concatenate with
    # no separator).
    # NOTE(review): the wording asks for "two prompts", but the chain below is
    # invoked once per prompt — confirm this is the intended instruction.
    system = "\n".join([
        "Enhance the two prompts given by the user by adding descriptors",
        "Enhanced prompts should have all objects and subjects provided by user",
        "Enhance by adding relevant adjectives to the objects and subjects given in the prompt so as to create a visually pleasing image",
        "Apply token weighting as follows:",
        "- Main subject: (word:1.6-1.7)",
        "- Important object: (word:1.2-1.3)",
        "Main subject is what is provided in the prompts user",
        "Do not distract from the prompts provided by user, what user has provided is the main subject",
        "Limit your response to 10-12 words",
        "Return only the two enhanced prompts with weights",
    ])
    prompt_template = ChatPromptTemplate.from_messages([
        ("system", system),
        ("human", "{input}"),
    ])
    # StrOutputParser makes the chain return the LLM reply as a plain string.
    chain = prompt_template | llm | StrOutputParser()

    def enhance_prompt(text):
        """Ask the LLM to rewrite a single user prompt."""
        return chain.invoke({"input": text})

    def encode_prompt(text):
        """Tokenize ``text`` and return the text encoder's hidden states."""
        input_ids = pipe.tokenizer(
            text,
            padding="max_length",
            max_length=pipe.tokenizer.model_max_length,
            truncation=True,  # LLM output may exceed the encoder's token limit
            return_tensors="pt",
        ).input_ids.to(device)
        with torch.no_grad():
            return pipe.text_encoder(input_ids)[0]

    embeddings_a = encode_prompt(enhance_prompt(prompt_a))
    embeddings_b = encode_prompt(enhance_prompt(prompt_b))

    # Every candidate starts from the same seeded noise, so only the blend
    # weight differs between images.
    height, width = 512, 512
    generator = torch.Generator(device=device).manual_seed(seed)
    latents = torch.randn(
        (1, pipe.unet.config.in_channels, height // 8, width // 8),
        generator=generator,
        device=device,
        dtype=embeddings_a.dtype,
    )

    def generate_blended_image(alpha):
        """Run the pipeline on a linear mix of the two prompt embeddings."""
        alpha = float(alpha)  # plain Python float, not a NumPy scalar
        blended_embeddings = alpha * embeddings_a + (1.0 - alpha) * embeddings_b
        with torch.autocast(device_type=autocast_device, enabled=on_cuda):
            return pipe(
                prompt_embeds=blended_embeddings,
                latents=latents,
                num_inference_steps=30,
                guidance_scale=7.5,
                height=height,
                width=width,
            ).images[0]

    # Build the CLIP text tokenizer explicitly: create_model_and_transforms()
    # only returns the model and image transforms, not a tokenizer.
    clip_text_tokenizer = open_clip.get_tokenizer("ViT-B-32")
    text_tokens = clip_text_tokenizer([aesthetic_prompts]).to(device)

    # The reference text is the same for every candidate, so encode it once.
    with torch.no_grad(), torch.autocast(device_type=autocast_device, enabled=on_cuda):
        text_features = clip_model.encode_text(text_tokens)
        text_features = text_features / text_features.norm(dim=-1, keepdim=True)

    def calculate_aesthetic_score(image):
        """Return the CLIP cosine similarity between ``image`` and the reference text."""
        img_tensor = preprocess(image).unsqueeze(0).to(device)
        with torch.no_grad(), torch.autocast(device_type=autocast_device, enabled=on_cuda):
            image_features = clip_model.encode_image(img_tensor)
            image_features = image_features / image_features.norm(dim=-1, keepdim=True)
            # Raw cosine similarity is used on purpose: a softmax over a single
            # text prompt is always 1.0 and cannot rank the candidates.
            return (image_features @ text_features.T)[0, 0].item()

    best_image = None
    best_score = float("-inf")
    for alpha in np.linspace(0.4, 0.6, num_candidates):
        image = generate_blended_image(alpha)
        score = calculate_aesthetic_score(image)
        # Strict comparison keeps the earliest candidate on ties.
        if score > best_score:
            best_image, best_score = image, score

    return best_image  # PIL image


In [None]:
import gradio as gr

def gradio_blend(prompt_a, prompt_b):
    """Gradio click handler: forward both prompts to blend_prompts_and_generate and return its image."""
    return blend_prompts_and_generate(prompt_a, prompt_b)

# Two-column layout: prompt inputs and the trigger button on the left,
# the resulting image on the right.
with gr.Blocks(theme=gr.themes.Soft(primary_hue="purple")) as demo:
    # Page header rendered from inline HTML.
    gr.Markdown(
        """
        <h1 style='text-align: center; color: #6a0dad;'>✨ Prompt Blending ✨</h1>
        <p style='text-align: center;'>Enter two prompts and generate a beautifully blended image.</p>
        """
    )

    with gr.Row():
        with gr.Column():
            prompt_a = gr.Textbox(
                label="Prompt A",
                placeholder="Enter your first prompt...",
            )
            prompt_b = gr.Textbox(
                label="Prompt B",
                placeholder="Enter your second prompt...",
            )
            generate_btn = gr.Button("Blend Prompts ✨")
        with gr.Column():
            output_image = gr.Image(type="pil", label="Blended Image")

    # Clicking the button sends both textbox values to gradio_blend and shows
    # the returned image in the output component.
    generate_btn.click(
        fn=gradio_blend,
        inputs=[prompt_a, prompt_b],
        outputs=output_image,
    )

# share=True asks Gradio for a publicly reachable share link.
demo.launch(share=True)