In [1]:
!pip install transformers pillow pandas tqdm

import pandas as pd
import os
import requests
from PIL import Image
from tqdm import tqdm
from transformers import BlipProcessor, BlipForConditionalGeneration
#from transformers import Blip2Processor, Blip2ForConditionalGeneration

import torch
import re

def ensure_direct_image_url(url):
    """
    For imgur: convert non-direct to i.imgur.com/ID.jpg.
    All other URLs: return as-is.
    """
    if "imgur.com" in url and not re.search(r'\.(jpg|jpeg|png|gif|bmp|webp|tiff)$', url, re.IGNORECASE):
        match = re.search(r'imgur\.com/(?:gallery/|a/)?([^/?#]+)', url)
        if match:
            img_id = match.group(1)
            return f"https://i.imgur.com/{img_id}.jpg"
        match = re.search(r'imgur\.com/([^/?#]+)', url)
        if match:
            img_id = match.group(1)
            return f"https://i.imgur.com/{img_id}.jpg"
    return url

#your logic
def smart_download_image(url, save_path):
    if "dropbox.com" in url:
        url = url.replace("?dl=0", "")
        if "?raw=1" not in url:
            if "?" in url:
                url += "&raw=1"
            else:
                url += "?raw=1"
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/112.0.0.0 Safari/537.36",
        "Accept": "image/avif,image/webp,image/apng,image/svg+xml,image/*,*/*;q=0.8",
        "Referer": url,
        "Accept-Encoding": "identity",
        "Connection": "keep-alive"
    }
    try:
        resp = requests.get(url, headers=headers, timeout=30)
        if resp.status_code == 200 and resp.headers.get('content-type', '').startswith("image"):
            with open(save_path, "wb") as f:
                f.write(resp.content)
            return True
        else:
            print(f"Failed (status {resp.status_code}, type {resp.headers.get('content-type', '')}) for {url}")
    except Exception as e:
        print(f"Download error for {url}: {e}")
    return False

device = "cuda" if torch.cuda.is_available() else "cpu"
blip_processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-large")
blip_model = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-large").to(device)
"""
blip2_processor = Blip2Processor.from_pretrained("Salesforce/blip2-flan-t5-xl")
blip2_model = Blip2ForConditionalGeneration.from_pretrained("Salesforce/blip2-flan-t5-xl").to(device)
"""



Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.


'\nblip2_processor = Blip2Processor.from_pretrained("Salesforce/blip2-flan-t5-xl")\nblip2_model = Blip2ForConditionalGeneration.from_pretrained("Salesforce/blip2-flan-t5-xl").to(device)\n'

In [6]:
def generate_blip_caption(image_path, blip_processor, blip_model):
    try:
        image = Image.open(image_path).convert("RGB")
        inputs = blip_processor(images=image, return_tensors="pt").to(blip_model.device)
        output = blip_model.generate(**inputs, max_length=50, num_beams=11,length_penalty=1.7, repetition_penalty=1.4,early_stopping=True, do_sample=False)
        caption = blip_processor.decode(output[0], skip_special_tokens=True)
        return caption
    except Exception as e:
        print(f"BLIP failed for {image_path}: {e}")
        return ""

"""
def generate_blip2_flan_caption(image_path):
    image = Image.open(image_path).convert("RGB")
    prompt = "Describe this image in extreme detail:"
    inputs = blip2_processor(images=image, text=prompt, return_tensors="pt").to(device)
    output = blip2_model.generate(**inputs, max_new_tokens=35)
    caption = blip2_processor.decode(output[0], skip_special_tokens=True)
    return caption

"""
df = pd.read_csv("RealEdit_train_split_urls.csv")

output = []
os.makedirs("images", exist_ok=True)

N = 10
for i, row in tqdm(df.iterrows(), total=min(len(df), N)):
    if i >= N: break
    orig_url = str(row["input_url"])
    img_url = ensure_direct_image_url(orig_url)
    img_name = row["input_image_name"]
    edit_request = row['instruction']
    local_path = f"images/{img_name}"
    caption = ""
    if smart_download_image(img_url, local_path):
        caption = generate_blip_caption(local_path, blip_processor, blip_model)
        #caption = generate_blip2_flan_caption(local_path)
    if 'is no longer available' in caption:
        os.remove(local_path)
        continue
    output.append({
        "input_image_name": img_name,
        "input_url": orig_url,
        "download_url": img_url,
        "download_success": os.path.exists(local_path) and os.path.getsize(local_path) > 0,
        "caption": caption,
        "edit_request": edit_request
    })

ff = pd.DataFrame(output)

ff.to_csv("captions.csv", index=False)

100%|██████████| 10/10 [01:00<00:00,  6.06s/it]


In [21]:
import json
import openai
from typing import List

from api_key import OPENAI_API_KEY

def generate_edit_instructions(
    base_caption: str,
    example_request: str,
    num_instructions: int,
    model: str = "gpt-3.5-turbo",
    temperature: float = 0.7
) -> List[str]:
    prompt = (
        f"You are an image editing assistant. Given the image caption: '{base_caption}', "
        f"generate {num_instructions} concise and diverse edit instructions "
        f"that could be applied to the image. "
        f"Use this example edit instruction as a reference: '{example_request}'. "
        f"Return the instructions as a JSON array of strings."
    )
    client = openai.OpenAI(api_key = OPENAI_API_KEY)
    response = client.chat.completions.create(
        model=model,
        messages=[
            {"role": "system", "content": "You generate creative image edit instructions."},
            {"role": "user", "content": prompt}
        ],
        temperature=temperature,
        max_tokens=num_instructions * 25,
    )
    content = response.choices[0].message.content.strip()
    match = re.search(r"\[\s*\".*?\"\s*(?:,\s*\".*?\"\s*)*\]", content, re.DOTALL)
    
    #instructions = json.loads(content)
    instructions = json.loads(match.group(0))
    
    if not isinstance(instructions, list):
        raise ValueError("Parsed JSON is not list")
    return instructions

def generate_edited_caption(
    base_caption: str,
    edit_request: str,
    model: str = "gpt-3.5-turbo",
    temperature: float = 0.3
) -> str:
    prompt = (
        f"Original caption: '{base_caption}'\n"
        f"Edit request: '{edit_request}'\n"
        f"Write an edited image caption around the same length as the original, but with the edit request incorporated into it."
    )
    client = openai.OpenAI(api_key = OPENAI_API_KEY)
    response = client.chat.completions.create(
        model=model,
        messages=[
            {"role": "system", "content": "You summarize image edits as text instructions."},
            {"role": "user", "content": prompt}
        ],
        temperature=temperature,
        max_tokens=60,
    )
    return response.choices[0].message.content.strip()



img_to_edit_req = {}
img_to_new_cap = {}

for i, row in ff.iterrows():
    caption = row['caption']
    example_request = row['edit_request']
    img_name = row['input_image_name']
    img_to_edit_req[img_name] = list(generate_edit_instructions(caption, example_request, 5))
    img_to_new_cap[img_name] = list(generate_edited_caption(caption, img_to_edit_req[img_name][x]) for x in range(5))




for img_name, edits in img_to_edit_req.items():
    print(f"{img_name}:")
    for i, edit in enumerate(edits, 1):
        print(f"  {i}. {edit}")
    print()

for img_name, edits in img_to_new_cap.items():
    print(f"{img_name}:")
    for i, edit in enumerate(edits, 1):
        print(f"  {i}. {edit}")
    print()



#generate_edit_instructions(caption, example_request, 5)




RAW RESPONSE:
 [
  "Enhance the lighting and contrast of the image.",
  "Remove any distracting background elements.",
  "Apply a vintage filter to give the image a nostalgic feel.",
  "Add a soft focus effect to create a dreamy atmosphere.",
  "Crop the image to focus on the man and woman taking the selfie."
]
RAW RESPONSE:
 [
  "Convert the image to black and white for a vintage look.",
  "Crop the image to focus on the man and child, emphasizing the bond between them.",
  "Apply a sepia tone filter to give the photo an antique feel.",
  "Add a soft light overlay to create a dreamy atmosphere.",
  "Adjust the color balance to bring out the warm tones in the photo."
]
RAW RESPONSE:
 [
  "Add a vintage sepia tone effect to give the image a nostalgic feel.",
  "Enhance the contrast to make the shadows and highlights more dramatic.",
  "Apply a grainy texture overlay to create a weathered, aged look.",
  "Convert the image to a high contrast black and white to make the subject stand out.

This is an image of a man and woman taking a selfie with enhanced vibrant colors.
