Before running any cell, connect to a runtime, create a new folder named `outfits`, and upload all of the image files into it.


In [1]:
# 1. Install necessary libraries
# transformers: For LLaVA model
# accelerate/bitsandbytes: For 4-bit memory efficient loading on GPU
# tqdm: For the progress bar
# Pillow: For opening the outfit images
!pip install transformers accelerate bitsandbytes tqdm Pillow

# 2. Imports
import os
import json
import torch
from pathlib import Path
from PIL import Image
import tqdm
from transformers import AutoProcessor, LlavaForConditionalGeneration, BitsAndBytesConfig

Collecting bitsandbytes
  Downloading bitsandbytes-0.48.2-py3-none-manylinux_2_24_x86_64.whl.metadata (10 kB)
Downloading bitsandbytes-0.48.2-py3-none-manylinux_2_24_x86_64.whl (59.4 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m59.4/59.4 MB[0m [31m16.3 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: bitsandbytes
Successfully installed bitsandbytes-0.48.2


In [2]:
def parse_outfit_response(response):
    """
    Parse the LLaVA response into structured outfit data (line-by-line parsing).

    Every line containing a colon is split at the first colon into a label and
    a value. The label is matched case-insensitively (after trimming
    whitespace) against the known field labels; lines with unknown labels or
    without a colon are ignored. If the same label appears more than once, the
    last occurrence wins.

    Args:
        response: Raw text produced by the model, expected to contain lines
            such as ``CATEGORY: dress``.

    Returns:
        A dict with the keys ``category``, ``subcategory``, ``color_primary``,
        ``color_secondary``, ``pattern``, ``material``, ``sleeve_length``,
        ``length``, ``style_aesthetic``, ``fit_silhouette`` and
        ``complete_description``. A field that is missing from the response,
        or present with an empty value, holds its default ("Unknown", "N/A",
        or "" for the description).
    """
    defaults = {
        "category": "Unknown", "subcategory": "Unknown", "color_primary": "Unknown",
        "color_secondary": "N/A", "pattern": "Unknown", "material": "Unknown",
        "sleeve_length": "N/A", "length": "Unknown", "style_aesthetic": "Unknown",
        "fit_silhouette": "Unknown", "complete_description": ""
    }

    # Label as written by the model -> key in the returned dict.
    field_for_label = {
        "CATEGORY": "category",
        "SUBCATEGORY": "subcategory",
        "COLOR_PRIMARY": "color_primary",
        "COLOR_SECONDARY": "color_secondary",
        "PATTERN": "pattern",
        "MATERIAL": "material",
        "SLEEVE_LENGTH": "sleeve_length",
        "LENGTH": "length",
        "STYLE_AESTHETIC": "style_aesthetic",
        "FIT_SILHOUETTE": "fit_silhouette",
        "COMPLETE_DESCRIPTION": "complete_description",
    }

    outfit_data = dict(defaults)

    for line in response.split('\n'):
        # Split on the first colon only, so values may themselves contain ':'.
        label, separator, value = line.partition(':')
        if not separator:
            continue

        field = field_for_label.get(label.strip().upper())
        if field is None:
            continue

        # An empty value carries no information: fall back to the field's
        # default instead of storing "" (applied uniformly to every field).
        outfit_data[field] = value.strip() or defaults[field]

    return outfit_data


def generate_outfit_descriptions(folder_path, output_path):
    """
    Generate detailed outfit descriptions using LLaVA Vision Language Model.

    Loads ``llava-hf/llava-1.5-7b-hf`` with a 4-bit quantization config, sends
    one structured-analysis prompt per image found directly inside
    ``folder_path`` (sub-folders are not searched), parses each answer with
    ``parse_outfit_response`` and writes all results to ``output_path`` as a
    JSON object keyed by image file name.

    Images are processed in ascending numeric order of their file stem; files
    whose stem is not an integer come afterwards, ordered by file name.

    A failure on a single image does not abort the run: the error is printed,
    and the image's JSON entry holds ``"category": "Execution Error"``, the
    error text in ``complete_description`` and ``"error": True``.

    Args:
        folder_path: Directory containing the outfit images.
        output_path: Path of the JSON file to write (overwritten if present).

    Returns:
        None. Results are written to ``output_path`` and progress is printed.
    """
    # 1. Device and Quantization Setup (CRITICAL FOR COLAB GPU)
    device = "cuda" if torch.cuda.is_available() else "cpu"
    print(f"Using device: {device}")

    # NOTE(review): the 4-bit config is applied regardless of the detected
    # device; bitsandbytes 4-bit loading presumably requires a CUDA GPU --
    # confirm before running this on a CPU-only runtime.
    quantization_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_compute_dtype=torch.float16,
        bnb_4bit_quant_type="nf4",
    )

    # 2. Load LLaVA model
    print("Loading LLaVA Vision Language Model (4-bit)...")
    model_id = "llava-hf/llava-1.5-7b-hf"

    processor = AutoProcessor.from_pretrained(model_id)

    # NOTE(review): recent transformers releases warn that `torch_dtype` is
    # deprecated in favor of `dtype`; kept as-is so older releases still work.
    model = LlavaForConditionalGeneration.from_pretrained(
        model_id,
        torch_dtype=torch.float16,
        quantization_config=quantization_config,
        device_map="auto" # Automatically maps to GPU
    )

    # 3. Get all image files and sort them
    folder = Path(folder_path)
    image_extensions = {'.jpg', '.jpeg', '.png', '.webp', '.bmp', '.gif', '.tiff'}

    image_files = [
        entry for entry in folder.iterdir()
        if entry.is_file() and entry.suffix.lower() in image_extensions
    ]

    def sort_key(f):
        # Numeric stems sort by value; all other files follow, ordered by
        # name, so the result never depends on directory listing order.
        try:
            return (0, int(f.stem), f.name)
        except ValueError:
            return (1, 0, f.name)

    image_files.sort(key=sort_key)
    print(f"Found {len(image_files)} images")

    # 4. Prompt for detailed outfit analysis (Standardized for LLaVA)
    analysis_prompt = """Analyze this outfit image in detail and provide structured information about the clothing item/outfit shown.

    Please identify and describe: 1. Category, 2. Subcategory, 3. Primary color, 4. Secondary colors, 5. Pattern, 6. Material/Fabric type, 7. Sleeve length, 8. Overall length/fit, 9. Style aesthetic, 10. Fit/Silhouette, 11. A complete 2-3 sentence description of the outfit.

    Format your response exactly as follows:
    CATEGORY: [category]
    SUBCATEGORY: [subcategory]
    COLOR_PRIMARY: [primary color]
    COLOR_SECONDARY: [secondary colors or N/A]
    PATTERN: [pattern type]
    MATERIAL: [material/fabric]
    SLEEVE_LENGTH: [sleeve length or N/A]
    LENGTH: [length/fit description]
    STYLE_AESTHETIC: [aesthetic style]
    FIT_SILHOUETTE: [fit type]
    COMPLETE_DESCRIPTION: [2-3 sentence description]"""

    # The full prompt is identical for every image, so build it once.
    prompt_template = f"USER: <image>\n{analysis_prompt}\nASSISTANT:"

    # 5. Generate descriptions
    outfit_descriptions = {}
    failed_names = []

    for image_path in tqdm.tqdm(image_files):
        try:
            # The context manager closes the underlying file as soon as the
            # RGB copy has been made.
            with Image.open(image_path) as source_image:
                image = source_image.convert('RGB')

            inputs = processor(images=image, text=prompt_template, return_tensors="pt")

            # Move inputs to the GPU where the model resides
            inputs = {k: v.to(model.device) if isinstance(v, torch.Tensor) else v for k, v in inputs.items()}

            with torch.no_grad():
                output_ids = model.generate(
                    **inputs,
                    max_new_tokens=300,
                    do_sample=False, # Use greedy search for structured output
                )

            # The decoded text includes the prompt; keep only what follows the
            # last "ASSISTANT:" marker.
            response = processor.decode(output_ids[0], skip_special_tokens=True)
            response_text = response.split("ASSISTANT:")[-1].strip()

            outfit_data = parse_outfit_response(response_text)
            outfit_descriptions[image_path.name] = outfit_data

        except Exception as e:
            # Best-effort batch: record the error for this image and carry on,
            # but surface it immediately instead of hiding it in the JSON.
            failed_names.append(image_path.name)
            tqdm.tqdm.write(f"Failed to process {image_path.name}: {e}")
            outfit_descriptions[image_path.name] = {
                "category": "Execution Error",
                "complete_description": f"Error: {str(e)}",
                "error": True
            }

    # 6. Save to JSON file
    with open(output_path, 'w', encoding='utf-8') as f:
        json.dump(outfit_descriptions, f, indent=2)

    succeeded = len(outfit_descriptions) - len(failed_names)
    print(f"\nSaved outfit descriptions to: {output_path}")
    print(f"Successfully processed {succeeded} images")
    if failed_names:
        print(f"Failed to process {len(failed_names)} images: {', '.join(failed_names)}")

In [3]:
# Ensure your 'outfits' folder with images is uploaded to the Colab session!

outfits_folder = "outfits"
output_file = "outfit_descriptions.json"

# Check for a directory specifically: a stray regular file named 'outfits'
# would pass a plain existence check and then fail when the folder is listed.
if not os.path.isdir(outfits_folder):
    print(f"ERROR: Folder '{outfits_folder}' not found! Please upload your images.")
else:
    generate_outfit_descriptions(outfits_folder, output_file)
    print("\n--- Processing Complete ---")
    print(f"Download '{output_file}' from the Colab file browser now!")

Using device: cuda
Loading LLaVA Vision Language Model (4-bit)...


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


processor_config.json:   0%|          | 0.00/173 [00:00<?, ?B/s]

chat_template.json:   0%|          | 0.00/701 [00:00<?, ?B/s]

chat_template.jinja:   0%|          | 0.00/674 [00:00<?, ?B/s]

preprocessor_config.json:   0%|          | 0.00/505 [00:00<?, ?B/s]

Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.


tokenizer_config.json: 0.00B [00:00, ?B/s]

tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

added_tokens.json:   0%|          | 0.00/41.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/552 [00:00<?, ?B/s]

`torch_dtype` is deprecated! Use `dtype` instead!


config.json:   0%|          | 0.00/950 [00:00<?, ?B/s]

model.safetensors.index.json: 0.00B [00:00, ?B/s]

Fetching 3 files:   0%|          | 0/3 [00:00<?, ?it/s]

model-00001-of-00003.safetensors:   0%|          | 0.00/4.99G [00:00<?, ?B/s]

model-00002-of-00003.safetensors:   0%|          | 0.00/4.96G [00:00<?, ?B/s]

model-00003-of-00003.safetensors:   0%|          | 0.00/4.18G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/141 [00:00<?, ?B/s]

Found 58 images


100%|██████████| 58/58 [06:49<00:00,  7.07s/it]


Saved outfit descriptions to: textual_metadata.json
Successfully processed 58 images

--- Processing Complete ---
Download 'textual_metadata.json' from the Colab file browser now!



