After connecting to a runtime, create a new folder named `outfits` and upload all of your outfit images into it before running the cells below.


In [1]:
# 1. Install necessary libraries
# transformers: For LLaVA model
# accelerate/bitsandbytes: For 4-bit memory efficient loading on GPU
# tqdm: For the progress bar
!pip install transformers accelerate bitsandbytes tqdm Pillow

# 2. Imports
import os
import json
import torch
from pathlib import Path
from PIL import Image
import tqdm
from transformers import AutoProcessor, LlavaForConditionalGeneration, BitsAndBytesConfig

Collecting bitsandbytes
  Downloading bitsandbytes-0.48.2-py3-none-manylinux_2_24_x86_64.whl.metadata (10 kB)
Downloading bitsandbytes-0.48.2-py3-none-manylinux_2_24_x86_64.whl (59.4 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m59.4/59.4 MB[0m [31m16.3 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: bitsandbytes
Successfully installed bitsandbytes-0.48.2


In [4]:
def _normalize_field_label(raw_label):
    """Reduce a raw label from the model output to canonical UPPER_SNAKE form.

    Leading list markers / numbering ("- ", "1. ", "* ") and markdown emphasis
    ("**TYPE**") are removed, and spaces or hyphens between words become
    underscores, so "Color Primary", "color-primary" and "**COLOR_PRIMARY**"
    all normalize to "COLOR_PRIMARY".
    """
    label = raw_label.strip().lstrip("0123456789.)-*•# \t").strip("*_` \t")
    words = label.upper().replace("-", " ").replace("_", " ").split()
    return "_".join(words)


def parse_outfit_response(response):
    """
    Parse the LLaVA response into structured outfit data.

    The response is expected to contain one ``LABEL: value`` pair per line
    (TYPE, CATEGORY, SUBCATEGORY, COLOR_PRIMARY, COLOR_SECONDARY, PATTERN,
    MATERIAL, SLEEVE_LENGTH, LENGTH, STYLE_AESTHETIC, FIT_SILHOUETTE,
    COMPLETE_DESCRIPTION). Labels are matched case-insensitively and tolerate
    list markers, markdown emphasis, and spaces/hyphens instead of underscores.

    Args:
        response: Raw text generated by the model. ``None`` or an empty string
            yields the default record.

    Returns:
        dict: One entry per outfit attribute. Attributes that are missing from
        the response, or present with an empty value, keep their default
        ("Unknown", "N/A", or "" for ``complete_description``).
    """
    # Defaults double as the output schema; they survive unless the response
    # supplies a non-empty value for the corresponding label.
    outfit_data = {
        "type": "Unknown", "category": "Unknown", "subcategory": "Unknown",
        "color_primary": "Unknown", "color_secondary": "N/A", "pattern": "Unknown",
        "material": "Unknown", "sleeve_length": "N/A", "length": "Unknown",
        "style_aesthetic": "Unknown", "fit_silhouette": "Unknown",
        "complete_description": ""
    }

    # Response label -> key in outfit_data.
    field_by_label = {
        "TYPE": "type",
        "CATEGORY": "category",
        "SUBCATEGORY": "subcategory",
        "COLOR_PRIMARY": "color_primary",
        "COLOR_SECONDARY": "color_secondary",
        "PATTERN": "pattern",
        "MATERIAL": "material",
        "SLEEVE_LENGTH": "sleeve_length",
        "LENGTH": "length",
        "STYLE_AESTHETIC": "style_aesthetic",
        "FIT_SILHOUETTE": "fit_silhouette",
        "COMPLETE_DESCRIPTION": "complete_description",
    }

    if not response:
        return outfit_data

    for line in response.split('\n'):
        if ':' not in line:
            continue

        # Split on the first colon only so values may themselves contain ':'.
        raw_label, raw_value = line.split(':', 1)
        field = field_by_label.get(_normalize_field_label(raw_label))
        if field is None:
            continue

        # Strip whitespace plus stray markdown emphasis left over from
        # patterns such as "**TYPE:** Upper".
        value = raw_value.strip().strip("*`").strip()

        # An empty value carries no information: keep the default instead of
        # overwriting it with "".
        if value:
            outfit_data[field] = value

    return outfit_data


def _image_sort_key(image_file):
    """Order numerically named files by number first, then all others by name.

    The file name is used as a tie-breaker so the ordering is deterministic
    regardless of the order in which the directory listing returns entries.
    """
    try:
        return (0, int(image_file.stem), image_file.name)
    except ValueError:
        return (1, 0, image_file.name)


def generate_outfit_descriptions(folder_path, output_path):
    """
    Generate detailed outfit descriptions using LLaVA Vision Language Model.

    Every image directly inside ``folder_path`` is sent to the 4-bit quantized
    ``llava-hf/llava-1.5-7b-hf`` model together with a structured prompt, the
    answer is parsed with ``parse_outfit_response`` and all results are written
    to ``output_path`` as one JSON object keyed by image file name.

    A failure on a single image does not abort the run: that image gets a
    record flagged with ``"error": True`` whose ``complete_description`` holds
    the error message.

    Args:
        folder_path: Directory containing the outfit images (not searched
            recursively).
        output_path: Path of the JSON file to write.

    Returns:
        None. Results are written to ``output_path`` and a summary is printed.
    """
    # 1. Device and Quantization Setup (CRITICAL FOR COLAB GPU)
    # `device` is informational only; actual placement is done by device_map.
    # NOTE(review): 4-bit bitsandbytes loading is normally used with a CUDA
    # GPU — confirm behaviour before running on a CPU-only runtime.
    device = "cuda" if torch.cuda.is_available() else "cpu"
    print(f"Using device: {device}")

    quantization_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_compute_dtype=torch.float16,
        bnb_4bit_quant_type="nf4",
    )

    # 2. Load LLaVA model
    print("Loading LLaVA Vision Language Model (4-bit)...")
    model_id = "llava-hf/llava-1.5-7b-hf"

    processor = AutoProcessor.from_pretrained(model_id)

    model = LlavaForConditionalGeneration.from_pretrained(
        model_id,
        torch_dtype=torch.float16,
        quantization_config=quantization_config,
        device_map="auto" # Automatically maps to GPU
    )

    # 3. Get all image files and sort them
    folder = Path(folder_path)
    image_extensions = {'.jpg', '.jpeg', '.png', '.webp', '.bmp', '.gif', '.tiff'}

    image_files = sorted(
        (
            file for file in folder.iterdir()
            if file.is_file() and file.suffix.lower() in image_extensions
        ),
        key=_image_sort_key,
    )
    print(f"Found {len(image_files)} images")

    # 4. Prompt for detailed outfit analysis (Uses the same prompt text from your file)
    analysis_prompt = """Analyze this outfit image in detail and provide structured information about the clothing item/outfit shown.

    Please identify and describe:
    1. Type (Upper for tops/shirts/jackets, Lower for pants/skirts/shorts, Dress for one-piece outfits/dresses)
    2. Category (e.g., Outerwear, Top, Bottom, Dress, Accessories, Footwear)
    3. Subcategory (e.g., Overcoat, T-shirt, Jeans, Sneakers)
    4. Primary color
    5. Secondary colors (if any)
    6. Pattern (Solid, Striped, Plaid, Floral, etc.)
    7. Material/Fabric type (Cotton, Wool, Silk, Leather, etc.)
    8. Sleeve length (for tops/outerwear)
    9. Overall length/fit
    10. Style aesthetic (Business Casual, Streetwear, Minimalist, Vintage, etc.)
    11. Fit/Silhouette (Fitted, Relaxed, Oversized, etc.)
    12. A complete 2-3 sentence description of the outfit

    Format your response as follows:
    TYPE: [Upper/Lower/Dress]
    CATEGORY: [category]
    SUBCATEGORY: [subcategory]
    COLOR_PRIMARY: [primary color]
    COLOR_SECONDARY: [secondary colors or N/A]
    PATTERN: [pattern type]
    MATERIAL: [material/fabric]
    SLEEVE_LENGTH: [sleeve length or N/A]
    LENGTH: [length/fit description]
    STYLE_AESTHETIC: [aesthetic style]
    FIT_SILHOUETTE: [fit type]
    COMPLETE_DESCRIPTION: [2-3 sentence description]"""

    # The chat-formatted prompt is identical for every image, so build it once.
    prompt_template = f"USER: <image>\n{analysis_prompt}\nASSISTANT:"

    # 5. Generate descriptions
    outfit_descriptions = {}

    for image_path in tqdm.tqdm(image_files):
        try:
            # The context manager closes the underlying file handle right
            # away; convert() returns an independent in-memory copy.
            with Image.open(image_path) as raw_image:
                image = raw_image.convert('RGB')

            inputs = processor(images=image, text=prompt_template, return_tensors="pt")

            # Move inputs to the GPU
            inputs = {k: v.to(model.device) if isinstance(v, torch.Tensor) else v for k, v in inputs.items()}

            # Generate response
            with torch.no_grad():
                output_ids = model.generate(
                    **inputs,
                    max_new_tokens=300,
                    do_sample=False, # Use greedy search for structured output
                )

            # The decoded text echoes the prompt; keep only what follows the
            # last "ASSISTANT:" marker.
            response = processor.decode(output_ids[0], skip_special_tokens=True)
            response_text = response.split("ASSISTANT:")[-1].strip()

            # 6. Parse and Store
            outfit_data = parse_outfit_response(response_text)
            outfit_descriptions[image_path.name] = outfit_data

        except Exception as e:
            # 7. Error Handling (Stores error message, does NOT overwrite good data)
            # Start from the parser's default record so failed entries expose
            # the same keys as successful ones.
            error_entry = parse_outfit_response("")
            error_entry.update({
                "type": "Execution Error",
                "category": "Unknown",
                "complete_description": f"Error: {str(e)}",
                "error": True
            })
            outfit_descriptions[image_path.name] = error_entry

    # 8. Save to JSON file
    with open(output_path, 'w', encoding='utf-8') as f:
        json.dump(outfit_descriptions, f, indent=2)

    failed_files = [name for name, data in outfit_descriptions.items() if data.get("error")]
    succeeded_count = len(outfit_descriptions) - len(failed_files)

    print(f"\nSaved outfit descriptions to: {output_path}")
    # Report only real successes; failures are listed separately so they are
    # not mistaken for usable descriptions.
    print(f"Successfully processed {succeeded_count} images")
    if failed_files:
        print(f"Failed to process {len(failed_files)} images: {', '.join(failed_files)}")

    # 9. Print sample entries
    print("\nSample descriptions (first 2 items):")
    for filename, data in list(outfit_descriptions.items())[:2]:
        print(f"\n{filename}:")
        print(f"  Type: {data.get('type', 'Unknown')}")
        print(f"  Style: {data.get('style_aesthetic', 'Unknown')}")

In [5]:
# Ensure your 'outfits' folder with images is uploaded to the Colab session!

outfits_folder = "outfits"
output_file = "outfit_descriptions.json"

# Check for a directory specifically: a stray regular *file* named "outfits"
# would pass a plain existence check and then crash when its contents are listed.
if not os.path.isdir(outfits_folder):
    print(f"ERROR: Folder '{outfits_folder}' not found! Please upload your images.")
else:
    generate_outfit_descriptions(outfits_folder, output_file)
    print("\n--- Processing Complete ---")
    print(f"Download '{output_file}' from the Colab file browser now!")

Using device: cuda
Loading LLaVA Vision Language Model (4-bit)...


Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

Found 58 images


100%|██████████| 58/58 [07:36<00:00,  7.87s/it]


Saved outfit descriptions to: outfit_descriptions.json
Successfully processed 58 images

Sample descriptions (first 2 items):

1.jpg:
  Type: Dress
  Style: Business Casual

2.jpg:
  Type: Dress
  Style: Casual

--- Processing Complete ---
Download 'outfit_descriptions.json' from the Colab file browser now!



