<a href="https://colab.research.google.com/github/Yang-star-source/Latent_Diffusion_From_Scratch/blob/main/Text_Embedding_Latent_Diffusion.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
from huggingface_hub import hf_hub_download
import os

repo_id = "ziyang06315/cats_images_dataset"
DATASET_PATH = "/content/cats"
os.makedirs(DATASET_PATH,exist_ok=True)
cats_images_dataset = hf_hub_download(repo_id=repo_id,
                                      filename="cat.zip",
                                      repo_type="dataset")
!unzip -q {cats_images_dataset} -d {DATASET_PATH}


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


cat.zip:   0%|          | 0.00/2.43G [00:00<?, ?B/s]

In [None]:
# Install a version of transformers that works with Florence-2
!pip install transformers==4.41.2 timm einops pillow

# CRITICAL: You must restart the session after running this!
# In Colab menu: Runtime > Restart Session

Collecting transformers==4.41.2
  Downloading transformers-4.41.2-py3-none-any.whl.metadata (43 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/43.8 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.8/43.8 kB[0m [31m2.2 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers<0.20,>=0.19 (from transformers==4.41.2)
  Downloading tokenizers-0.19.1-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.7 kB)
Downloading transformers-4.41.2-py3-none-any.whl (9.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m9.1/9.1 MB[0m [31m84.8 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading tokenizers-0.19.1-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.6 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.6/3.6 MB[0m [31m126.4 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: tokenizers, transformers
  Attempting uninstall: tokenizers
 

In [None]:
import os
import torch
from PIL import Image
from transformers import AutoProcessor, AutoModelForCausalLM

# Install flash_attn which is a dependency for the model
!pip install flash_attn

# --- CONFIGURATION ---
IMAGE_FOLDER = "/content/cats"
MODEL_ID = "microsoft/Florence-2-large"

# Pick the GPU when one is visible to torch, otherwise fall back to the CPU
if torch.cuda.is_available():
    DEVICE = "cuda"
else:
    DEVICE = "cpu"
print(f"Device found: {DEVICE}")

if DEVICE == "cpu":
    print("WARNING: You are running on CPU. This will be very slow. Make sure 'Change Runtime Type' is set to T4 GPU.")

# --- LOAD MODEL (Main Branch) ---
print(f"Loading {MODEL_ID}...")

# No 'revision' is passed here; compatibility relies on the pinned pip version.
model = AutoModelForCausalLM.from_pretrained(MODEL_ID, trust_remote_code=True)
# Move the weights to the chosen device and switch to evaluation mode
model = model.to(DEVICE).eval()

processor = AutoProcessor.from_pretrained(MODEL_ID, trust_remote_code=True)

def generate_caption(image_path, task_prompt="<MORE_DETAILED_CAPTION>",
                     max_new_tokens=1024, num_beams=3):
    """Caption a single image with the module-level `model` and `processor`.

    Args:
        image_path: Path of the image file to caption.
        task_prompt: Task token passed to the processor as the text prompt
            and used as the key into the post-processed result. Defaults to
            the detailed-caption task.
        max_new_tokens: Forwarded to `model.generate`.
        num_beams: Forwarded to `model.generate`.

    Returns:
        The entry stored under `task_prompt` in the post-processed output,
        or None when the image could not be opened (the error is printed).
    """
    try:
        # The context manager closes the file handle once convert() has
        # produced an independent in-memory RGB copy.
        with Image.open(image_path) as raw_image:
            image = raw_image.convert("RGB")
    except Exception as e:
        # Best effort: report the unreadable file and let the caller move on.
        print(f"Error opening {image_path}: {e}")
        return None

    inputs = processor(text=task_prompt, images=image, return_tensors="pt").to(DEVICE)

    # Inference only, so no gradients are needed.
    with torch.no_grad():
        generated_ids = model.generate(
            input_ids=inputs["input_ids"],
            pixel_values=inputs["pixel_values"],
            max_new_tokens=max_new_tokens,
            num_beams=num_beams
        )

    # Special tokens are kept in the decoded text that is handed to
    # post_process_generation.
    generated_text = processor.batch_decode(generated_ids, skip_special_tokens=False)[0]

    parsed_answer = processor.post_process_generation(
        generated_text,
        task=task_prompt,
        image_size=(image.width, image.height)
    )

    return parsed_answer[task_prompt]

# --- MAIN LOOP ---
if not os.path.exists(IMAGE_FOLDER):
    os.makedirs(IMAGE_FOLDER)
    print(f"Created folder {IMAGE_FOLDER}. Please upload your images there!")

# Sorted because os.listdir returns entries in arbitrary order; this keeps the
# processing order and the progress log reproducible between runs.
# Named `image_files` (not `files`) so it cannot be confused with the
# `google.colab.files` module imported further down the notebook.
image_files = sorted(
    name for name in os.listdir(IMAGE_FOLDER)
    if name.lower().endswith(('.png', '.jpg', '.jpeg'))
)

if image_files:
    total = len(image_files)
    print(f"Found {total} images. Starting processing...")
    for idx, filename in enumerate(image_files, start=1):
        img_path = os.path.join(IMAGE_FOLDER, filename)
        # The caption is stored next to the image, same stem with a .txt suffix
        txt_path = os.path.join(IMAGE_FOLDER, os.path.splitext(filename)[0] + ".txt")

        # Skip if already exists (makes the loop resumable after an interruption)
        if os.path.exists(txt_path):
            continue

        caption = generate_caption(img_path)
        # None (unreadable image) or an empty caption: write nothing
        if caption:
            # Explicit encoding so the caption files do not depend on the
            # platform's default locale.
            with open(txt_path, "w", encoding="utf-8") as f:
                f.write(caption)
            print(f"[{idx}/{total}] Captioned {filename}")
else:
    print(f"Folder '{IMAGE_FOLDER}' is empty. Upload images to the left sidebar first.")

Collecting flash_attn
  Downloading flash_attn-2.8.3.tar.gz (8.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m8.4/8.4 MB[0m [31m73.8 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: flash_attn
  Building wheel for flash_attn (setup.py) ... [?25l[?25hdone
  Created wheel for flash_attn: filename=flash_attn-2.8.3-cp312-cp312-linux_x86_64.whl size=253780426 sha256=4e2f9e39313266b1544b68138b15b91ee6221eccf14f7902b7c6620351340810
  Stored in directory: /root/.cache/pip/wheels/3d/59/46/f282c12c73dd4bb3c2e3fe199f1a0d0f8cec06df0cccfeee27
Successfully built flash_attn
Installing collected packages: flash_attn
Successfully installed flash_attn-2.8.3
Device found: cuda
Loading microsoft/Florence-2-large...


model.safetensors:   0%|          | 0.00/1.55G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/51.0 [00:00<?, ?B/s]

preprocessor_config.json:   0%|          | 0.00/806 [00:00<?, ?B/s]

processing_florence2.py: 0.00B [00:00, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/microsoft/Florence-2-large:
- processing_florence2.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


tokenizer_config.json:   0%|          | 0.00/34.0 [00:00<?, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
[559/5558] Captioned 2258.png
[560/5558] Captioned 4720.png
[561/5558] Captioned 4800.png
[562/5558] Captioned 5135.png
[563/5558] Captioned 2025.png
[564/5558] Captioned 1547.png
[565/5558] Captioned 0133.png
[566/5558] Captioned 4016.png
[567/5558] Captioned 5316.png
[568/5558] Captioned 4004.png
[569/5558] Captioned 1257.png
[570/5558] Captioned 4472.png
[571/5558] Captioned 2251.png
[572/5558] Captioned 1483.png
[573/5558] Captioned 1295.png
[574/5558] Captioned 1489.png
[575/5558] Captioned 2423.png
[576/5558] Captioned 3982.png
[577/5558] Captioned 0564.png
[578/5558] Captioned 4775.png
[579/5558] Captioned 1518.png
[580/5558] Captioned 2316.png
[581/5558] Captioned 3844.png
[582/5558] Captioned 2185.png
[583/5558] Captioned 4432.png
[584/5558] Captioned 4773.png
[585/5558] Captioned 0750.png
[586/5558] Captioned 1832.png
[587/5558] Captioned 1842.png
[588/5558] Captioned 3288.png
[589/5558] Captioned 0296.png
[590/

In [None]:
import shutil
from google.colab import files

# Zip the folder.
# The base name is absolute so the archive always lands at
# /content/my_captioned_cats.zip regardless of the current working directory.
# make_archive returns the path of the archive it created.
archive_path = shutil.make_archive('/content/my_captioned_cats', 'zip', '/content/cats')

# Trigger download of exactly the file that was just written
files.download(archive_path)

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
from google.colab import drive
import shutil
import os

# 1. Make Google Drive available under /content/drive
print("Mounting Google Drive...")
drive.mount('/content/drive')

# 2. Where the archive currently is, and where its backup copy goes
source_path = '/content/my_captioned_cats.zip'
destination_path = '/content/drive/MyDrive/my_captioned_cats_backup.zip'

# 3. Copy the archive to Drive, or report that it is missing
if not os.path.exists(source_path):
    print("❌ ERROR: Could not find the zip file! Did you name it correctly?")
else:
    print(f"Copying {source_path} to Google Drive...")
    shutil.copy(source_path, destination_path)
    print(f"✅ SUCCESS! File saved to: {destination_path}")
    print("You can now safely close this tab or restart runtime.")

Mounting Google Drive...
Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Copying /content/my_captioned_cats.zip to Google Drive...
✅ SUCCESS! File saved to: /content/drive/MyDrive/my_captioned_cats_backup.zip
You can now safely close this tab or restart runtime.


In [None]:
import zipfile
import os

# Path to your backup on Drive
zip_path = '/content/drive/MyDrive/my_captioned_cats_backup.zip'


def split_archive_members(names):
    """Split zip member names into (caption_files, image_files).

    Extensions are matched case-insensitively, the same way the captioning
    loop selects images, so e.g. 'A.PNG' is counted as an image here too.
    """
    captions = [n for n in names if n.lower().endswith('.txt')]
    images = [n for n in names if n.lower().endswith(('.png', '.jpg', '.jpeg'))]
    return captions, images


def images_missing_caption(captions, images):
    """Return the image names that have no caption file with the same stem."""
    caption_stems = {os.path.splitext(n)[0] for n in captions}
    return [n for n in images if os.path.splitext(n)[0] not in caption_stems]


if os.path.exists(zip_path):
    print(f"Checking {zip_path}...")

    # Open the zip file in Read mode; only the member list is needed
    with zipfile.ZipFile(zip_path, 'r') as z:
        all_files = z.namelist()

    # Count them
    txt_files, img_files = split_archive_members(all_files)

    print(f"Total Files: {len(all_files)}")
    print(f"Images Found: {len(img_files)}")
    print(f"Text Files Found: {len(txt_files)}")

    if len(txt_files) > 0:
        print("\nSUCCESS! The text files are safely inside.")
        print("Example file:", txt_files[0])
    else:
        print("\nWARNING: No text files found in the zip!")

    # Matching counts alone do not prove every image was captioned, so pair
    # them up by stem. Nothing extra is printed when all images are covered.
    uncaptioned = images_missing_caption(txt_files, img_files)
    if uncaptioned:
        print(f"WARNING: {len(uncaptioned)} image(s) have no caption, e.g. {uncaptioned[0]}")
else:
    print("Could not find the zip file on Drive. Check the path.")

Checking /content/drive/MyDrive/my_captioned_cats_backup.zip...
Total Files: 11116
Images Found: 5558
Text Files Found: 5558

SUCCESS! The text files are safely inside.
Example file: 4536.txt
