In [None]:
import os

In [None]:
# Configure the CUDA caching allocator through its environment variable.
# This cell runs before the cell that imports torch.
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = ",".join(
    ("max_split_size_mb:256", "expandable_segments:True")
)

In [None]:
import torch

In [None]:
import os, requests, json
from PIL import Image
from tqdm import tqdm
import pandas as pd

import openai
# Never commit a real key to the notebook: take it from the OPENAI_API_KEY
# environment variable. The original placeholder is kept as the fallback so the
# cell still runs (and API calls fail with an auth error) when the variable is unset.
openai.api_key = os.environ.get("OPENAI_API_KEY", "openai_APIKEY")

from diffusers import StableDiffusionPipeline, DDIMScheduler
from transformers import CLIPModel, CLIPProcessor, AutoTokenizer, AutoModel

# Load Stable Diffusion v1.5 with float16 weights and move the pipeline to the GPU.
pipe = StableDiffusionPipeline.from_pretrained("runwayml/stable-diffusion-v1-5", torch_dtype=torch.float16).to("cuda")

# Switch off the `inplace` flag on every text-encoder submodule that has it enabled.
# NOTE(review): presumably required by a later step that needs the un-modified
# activations (e.g. gradients through the text encoder) — confirm and document why.
for module in pipe.text_encoder.modules():
    if getattr(module, "inplace", False):
        module.inplace = False

# Replace the default scheduler with DDIM, reusing the existing scheduler config.
pipe.scheduler = DDIMScheduler.from_config(pipe.scheduler.config)
pipe.enable_attention_slicing()

# xformers is optional: keep going without it, but say so instead of hiding the
# failure, so a missing/broken install is visible in the notebook output.
try:
    pipe.enable_xformers_memory_efficient_attention()
except Exception as exc:
    print(f"xformers memory-efficient attention not enabled: {exc}")


In [None]:
import re

def ensure_direct_image_url(url):
    """Rewrite an Imgur page URL to a direct ``i.imgur.com`` JPEG link.

    URLs that do not contain ``imgur.com``, or that already end in an image
    extension (jpg/jpeg/png/bmp, any letter case), are returned unchanged.
    Dropbox links are not handled here; ``smart_download_image`` rewrites those.

    Args:
        url: The URL string to inspect.

    Returns:
        ``https://i.imgur.com/<id>.jpg`` when an ID can be extracted from the
        Imgur path, otherwise ``url`` unchanged.
    """
    if "imgur.com" not in url:
        return url
    # Case-insensitive so that e.g. ".JPG" links are recognised as already direct.
    if re.search(r'\.(jpg|jpeg|png|bmp)$', url, flags=re.IGNORECASE):
        return url

    # The ID stops at the first '.', '?', '&' or '#', so an extension, query
    # string or fragment never leaks into the rewritten link.
    # NOTE(review): for "gallery/" and "a/" links the captured value is the
    # gallery/album ID, which may not be a valid image ID — confirm against data.
    m = re.search(r'imgur\.com/(?:gallery/|a/)?([^.?&#]+)', url)
    if m:
        # A trailing slash ("imgur.com/abc/") must not end up before ".jpg".
        image_id = m.group(1).rstrip("/")
        if image_id:
            return f"https://i.imgur.com/{image_id}.jpg"
    return url

def _dropbox_raw_url(url):
    """Return ``url`` with its query rewritten to ``raw=1`` and any ``dl`` parameter removed."""
    from urllib.parse import parse_qsl, urlencode, urlsplit, urlunsplit

    parts = urlsplit(url)
    # Keep every other parameter (e.g. rlkey) and drop dl/raw wherever they appear.
    params = [
        (key, value)
        for key, value in parse_qsl(parts.query, keep_blank_values=True)
        if key not in ("dl", "raw")
    ]
    params.append(("raw", "1"))
    return urlunsplit(parts._replace(query=urlencode(params)))


def smart_download_image(url, save_path):
    """Download an image to ``save_path``, converting Dropbox share links to raw links.

    Args:
        url: Image URL. If it contains ``dropbox.com`` its query string is
            rewritten to ``raw=1`` (with any ``dl`` parameter removed).
        save_path: Destination path; the response body is written in binary mode.

    Returns:
        True if the server answered 200 with a content type starting with
        ``image`` and the body was written; False otherwise. Exceptions are
        caught, printed together with the URL that was requested, and reported
        as False.
    """
    headers = {"User-Agent": "Mozilla/5.0", "Accept-Encoding": "identity"}
    try:
        if "dropbox.com" in url:
            url = _dropbox_raw_url(url)
        resp = requests.get(url, headers=headers, timeout=30)
        content_type = resp.headers.get('content-type', '')
        if resp.status_code == 200 and content_type.lower().startswith("image"):
            with open(save_path, "wb") as f:
                f.write(resp.content)
            return True
    except Exception as e:
        print(f"Download error for {url}: {e}")
    return False

# Download the first N input images listed in the CSV into ./originals and
# record metadata for every image that was fetched successfully.
df = pd.read_csv("RealEdit_train_split_urls.csv")
os.makedirs("originals", exist_ok=True)

N = 10  # number of rows, from the top of the CSV, to attempt


def _text_or_empty(value):
    """Return ``value`` as a string, mapping missing values (None/NaN) to ''."""
    return "" if pd.isna(value) else str(value)


image_info = []
# Slice first instead of breaking out of the loop: the progress bar then has an
# exact total and the limit does not depend on the frame's index labels.
rows_to_fetch = df.head(N)
for _, row in tqdm(rows_to_fetch.iterrows(), total=len(rows_to_fetch)):
    fname = row["input_image_name"]
    url = ensure_direct_image_url(str(row["input_url"]))
    save_path = f"originals/{fname}"
    if smart_download_image(url, save_path):
        # Each entry: (file name, resolved URL, subreddit, title, selftext).
        # Missing title/selftext become "" rather than the literal string "nan".
        image_info.append((
            fname,
            url,
            row.get("subreddit", ""),
            _text_or_empty(row.get("title", "")),
            _text_or_empty(row.get("selftext", "")),
        ))
    else:
        print(f"Skipping {fname}: download failed.")


In [None]:
from transformers import CLIPTokenizer

In [None]:
import openai
from openai import OpenAI
import base64

In [None]:
import re

# CLIP tokenizer loaded from the "openai/clip-vit-large-patch14" checkpoint;
# truncate_caption_safely below uses it to count tokens in a caption.
tokenizer = CLIPTokenizer.from_pretrained("openai/clip-vit-large-patch14")

def truncate_caption_safely(caption, max_tokens=77, count_tokens=None):
    """Shorten ``caption`` to at most ``max_tokens`` tokens, cutting at sentence ends.

    Sentences (split after '.', '!' or '?' followed by spaces) are added one at
    a time for as long as the running text stays within the limit. If not even
    the first sentence fits, that sentence is shortened word by word instead of
    returning an empty caption.

    Args:
        caption: Text to shorten.
        max_tokens: Maximum token count, as measured by ``count_tokens``.
        count_tokens: Optional callable ``str -> int``. Defaults to the length
            of ``input_ids`` produced by the module-level ``tokenizer``.

    Returns:
        The longest sentence-aligned prefix that fits, or a word-aligned prefix
        of the first sentence when no whole sentence fits. May be '' when the
        caption is empty or no single word fits.
    """
    if count_tokens is None:
        def count_tokens(text):
            return len(tokenizer(text, return_tensors="pt", truncation=False)["input_ids"][0])

    sentences = re.split(r'(?<=[.!?]) +', caption.strip())

    current_text = ""
    for sentence in sentences:
        proposed_text = (current_text + " " + sentence).strip()
        if count_tokens(proposed_text) > max_tokens:
            break
        current_text = proposed_text

    if current_text:
        return current_text

    # Nothing was accepted: the first sentence alone is over the limit (or the
    # caption is empty). Drop trailing words until what is left fits.
    words = sentences[0].split()
    while words and count_tokens(" ".join(words)) > max_tokens:
        words.pop()
    return " ".join(words)


# Caption every downloaded image with GPT-4o and keep (file name, caption) pairs.
captions = []
client = OpenAI(api_key=openai.api_key)

image_dir = "originals"
# os.listdir returns entries in arbitrary order; sort so that the same N files
# are picked on every run.
image_files = sorted(
    f for f in os.listdir(image_dir) if f.lower().endswith((".jpg", ".jpeg", ".png"))
)[:N]

for image_file in image_files:
    image_path = os.path.join(image_dir, image_file)
    # Files under 1 KiB are treated as failed/placeholder downloads.
    if not os.path.exists(image_path) or os.path.getsize(image_path) < 1024:
        print(f"Skipping {image_file}: file missing or too small.")
        continue

    try:
        with open(image_path, "rb") as img_file:
            image_bytes = img_file.read()
        base64_img = base64.b64encode(image_bytes).decode("utf-8")

        # Label the data URL from the file's own signature instead of always
        # claiming JPEG; anything that is not a PNG keeps the original label.
        mime_type = "image/png" if image_bytes.startswith(b"\x89PNG\r\n\x1a\n") else "image/jpeg"

        response = client.chat.completions.create(
            model="gpt-4o",
            messages=[
                {
                    "role": "user",
                    "content": [
                        {"type": "text", "text": "Describe the image in full detail, but limit your response to under 50 words. Focus on what's visually clear. Avoid exaggeration or hallucination. Do not include information that is not clearly visible in the image."},
                        {"type": "image_url", "image_url": {"url": f"data:{mime_type};base64,{base64_img}"}}
                    ]
                }
            ],
            # Caps the completion length; the reply is trimmed again below.
            max_tokens=77
        )

        caption = response.choices[0].message.content.strip()
        caption = truncate_caption_safely(caption)
        if not caption:
            # An empty caption would only produce meaningless edit prompts later.
            print(f"Skipping {image_file}: empty caption.")
            continue
        captions.append((image_file, caption))

    except Exception as e:
        print(f"{image_file}, Error: {str(e)}")


In [None]:
from tabulate import tabulate
# Put the (file name, caption) pairs in a two-column frame and print it as a
# GitHub-style table, index column included.
# NOTE(review): this rebinds `df`, which an earlier cell bound to the URL CSV.
df = pd.DataFrame(data=captions, columns=["Image", "Caption"])
print(
    tabulate(df, tablefmt="github", headers="keys", showindex=True)
)

In [None]:
import tiktoken

In [None]:
# tiktoken encoding associated with the "gpt-3.5-turbo" model name.
# NOTE(review): token counts from this encoding are not CLIP token counts —
# confirm which tokenizer the notebook's 77-token limit is meant to be measured with.
enc = tiktoken.encoding_for_model("gpt-3.5-turbo")

In [None]:
def _parse_edit_list(reply_text):
    """Extract and decode the outermost JSON array contained in a chat reply.

    Raises:
        ValueError: if the reply is empty, has no '[' ... ']' span, or the span
            is not valid JSON (json.JSONDecodeError is a ValueError subclass).
    """
    if not reply_text:
        raise ValueError("empty reply")
    start = reply_text.find('[')
    end = reply_text.rfind(']')
    if start == -1 or end < start:
        raise ValueError("no JSON array found in reply")
    return json.loads(reply_text[start:end + 1])


def _clip_token_count(text):
    """Token count of ``text`` under the notebook's CLIP ``tokenizer``.

    Uses the same call as truncate_caption_safely, so both length checks in
    this notebook measure captions the same way.
    """
    return len(tokenizer(text, return_tensors="pt", truncation=False)["input_ids"][0])


# For every captioned image, ask GPT for 10 edit instructions with matching
# post-edit captions and keep those whose caption fits the 77-token limit.
# Result: all_edits[file name] -> list of {"instruction", "edited_caption"} dicts.
all_edits = {}
for (fname, cap) in captions:
    print(f"\nGenerating edits for '{fname}' with caption: {cap}")
    prompt = (
    "You are simulating real user editing behavior for a dataset of image edits.\n"
    "Given a description of an image, imagine how actual users would ask to modify it. "
    "These edits should be creative, realistic, and specific — things a person might type into an AI editor, like:\n"
    "- 'Add a dog sitting near the woman'\n"
    "- 'Make the sunset more vibrant'\n"
    "- 'Change the man’s outfit to a business suit'\n"
    "- 'Remove the second person from the left'\n"
    "- 'Make the child look older'\n"
    "Each edit should involve a meaningful visual change to the image, not just generic filters like 'increase contrast'.\n"
    "\n"
    "For each instruction, generate a matching edited image caption that describes the image *after* the edit.\n"
    "Each 'edited_caption' must be **under 77 tokens**, even after tokenization (not just word count).\n"
    "Avoid repetitions. The 10 edits must be diverse (e.g. subject, background, object-level, style).\n"
    "\n"
    "Output a JSON array of 10 items, where each item is an object with two fields:\n"
    "- 'instruction': the user's edit request\n"
    "- 'edited_caption': the caption for the image after applying that edit\n"
    "Do not include any explanation. Return only the JSON array.\n\n"
    f"Image Description: \"{cap}\""
    )
    try:
        response = client.chat.completions.create(
            model="gpt-3.5-turbo",
            messages=[{"role": "user", "content": prompt}],
            temperature=0.8
        )
        gpt_output = response.choices[0].message.content
    except Exception as e:
        print(f"GPT API call failed for {fname}: {e}")
        continue

    # Parse separately so a malformed reply is not reported as an API failure.
    try:
        edits = _parse_edit_list(gpt_output)
    except ValueError as e:
        print(f"Could not parse GPT output for {fname}: {e}")
        continue

    if not isinstance(edits, list) or len(edits) != 10:
        print(f"Unexpected format for {fname}, skipping.")
        continue

    filtered_edits = []
    for item in edits:
        # Guard against items that are not objects or lack the two string
        # fields; indexing them blindly would abort the whole loop.
        if not (
            isinstance(item, dict)
            and isinstance(item.get("instruction"), str)
            and isinstance(item.get("edited_caption"), str)
        ):
            print(f"Skipping malformed edit for '{fname}': {item!r}")
            continue

        # Same acceptance rule as truncate_caption_safely: at most 77 CLIP tokens.
        tok_len = _clip_token_count(item["edited_caption"])
        if tok_len <= 77:
            filtered_edits.append(item)
        else:
            print(f"Skipping overlong caption (len={tok_len}) for '{fname}': {item['edited_caption']}")

    if len(filtered_edits) < 10:
        print(f"Only {len(filtered_edits)} valid edits (under 77 tokens) for {fname}.")

    all_edits[fname] = filtered_edits

    for idx, item in enumerate(filtered_edits, 1):
        print(f" {idx}. INSTR: {item['instruction']} | CAPTION: {item['edited_caption']}")