In [1]:
%%capture
!pip install -U bitsandbytes
!pip install qwen-vl-utils



In [2]:
import torch
from transformers import Qwen2VLForConditionalGeneration, AutoProcessor
from qwen_vl_utils import process_vision_info
import json
from PIL import Image
import os

print("Loading Qwen2-VL-2B with 4-bit quantization...")

model = Qwen2VLForConditionalGeneration.from_pretrained(
    "Qwen/Qwen2-VL-2B-Instruct",
    dtype=torch.float16,
    device_map="auto",
    load_in_4bit=True,  # 4-bit quantization for RTX 3050
)

processor = AutoProcessor.from_pretrained("Qwen/Qwen2-VL-2B-Instruct")

print("✓ Model loaded successfully!")


`torch_dtype` is deprecated! Use `dtype` instead!


Loading Qwen2-VL-2B with 4-bit quantization...


Access to the secret `HF_TOKEN` has not been granted on this notebook.
You will not be requested again.
Please restart the session if you want to be prompted again.
The `load_in_4bit` and `load_in_8bit` arguments are deprecated and will be removed in the future versions. Please, pass a `BitsAndBytesConfig` object in `quantization_config` argument instead.


model.safetensors.index.json: 0.00B [00:00, ?B/s]

Fetching 2 files:   0%|          | 0/2 [00:00<?, ?it/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/429M [00:00<?, ?B/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/3.99G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/272 [00:00<?, ?B/s]

preprocessor_config.json:   0%|          | 0.00/347 [00:00<?, ?B/s]

The image processor of type `Qwen2VLImageProcessor` is now loaded as a fast processor by default, even if the model checkpoint was saved with a slow processor. This is a breaking change and may produce slightly different outputs. To continue using the slow processor, instantiate this class with `use_fast=False`. Note that this behavior will be extended to all models in a future release.


tokenizer_config.json: 0.00B [00:00, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

chat_template.json: 0.00B [00:00, ?B/s]

✓ Model loaded successfully!


In [22]:
# @title
def classify_clothing_local(image_path):
    """
    Classify clothing using local Qwen2-VL-2B model
    Returns structured JSON metadata
    """

    # Load image
    image = Image.open(image_path).convert('RGB')

    # Detailed prompt for Indian ethnic wear
    prompt = """Analyze this clothing item image. Focus only on the garment, ignore background and people.

Provide detailed analysis in ONLY valid JSON format:

{
  "specific_type": "detailed garment name (e.g., silk saree, embroidered kurti)",
  "category": "exact category: kurta/kurti/palazzo/churidar/salwar/saree/lehenga/anarkali_suit/gown/dupatta/blouse/choli/dhoti_pants/skirt/shirt/t_shirt/jeans/trousers/crop_top/peplum_top/anarkali_top/cape/jacket/shawl/lehenga_set",
  "color_primary": "dominant color with shade (e.g., deep maroon)",
  "color_secondary": ["secondary color1", "secondary color2"],
  "pattern": "design pattern (floral/paisley/solid/geometric/embroidery/prints)",
  "material": "fabric type (silk/cotton/chiffon/georgette/denim/linen/crepe)",
  "style": "traditional/contemporary/fusion/casual/formal/festive",
  "occasions": ["wedding", "festival", "casual", "office", "party", "daily_wear"],
  "weather": ["summer", "winter", "monsoon", "all_season"],
  "formality": "casual/semi_formal/formal/festive",
  "embellishments": ["embroidery", "sequins", "prints", "zari_work", "mirror_work", "plain"],
  "gender": "male/female/unisex",
  "fit": "loose/fitted/flowy/structured"
}

Return ONLY the JSON object, no other text."""

    # Prepare messages
    messages = [
        {
            "role": "user",
            "content": [
                {"type": "image", "image": image},
                {"type": "text", "text": prompt}
            ]
        }
    ]

    # Prepare inputs
    text = processor.apply_chat_template(
        messages,
        tokenize=False,
        add_generation_prompt=True
    )

    image_inputs, video_inputs = process_vision_info(messages)

    inputs = processor(
        text=[text],
        images=image_inputs,
        videos=video_inputs,
        padding=True,
        return_tensors="pt"
    )

    inputs = inputs.to("cuda")

    # Generate classification
    with torch.no_grad():
        generated_ids = model.generate(
            **inputs,
            max_new_tokens=512,
            temperature=0.3
        )

    # Decode output
    generated_ids_trimmed = [
        out_ids[len(in_ids):]
        for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
    ]

    output_text = processor.batch_decode(
        generated_ids_trimmed,
        skip_special_tokens=True,
        clean_up_tokenization_spaces=False
    )[0]

    # Parse JSON
    try:
        # Extract JSON from output
        json_start = output_text.find('{')
        json_end = output_text.rfind('}') + 1
        json_str = output_text[json_start:json_end]
        classification = json.loads(json_str)

        # Check for inconsistencies and unnecessary information
        if (classification.get("specific_type") == "detailed garment name (e.g., silk saree, embroidered kurti)" or
            classification.get("category") == "exact category: kurta/kurti/palazzo/churidar/salwar/saree/lehenga/anarkali_suit/gown/dupatta/blouse/choli/dhoti_pants/skirt/shirt/t_shirt/jeans/trousers/crop_top/peplum_top/anarkali_top/cape/jacket/shawl/lehenga_set"):
            print(f"Warning: Inconsistent output for {image_path}. Output was: {output_text[:200]}...") # Print truncated output
            return None

        return classification
    except json.JSONDecodeError:
        print(f"Warning: Could not parse JSON for {image_path}. Output was: {output_text[:200]}...") # Print truncated output
        return None
    except Exception as e:
        print(f"An unexpected error occurred during JSON parsing for {image_path}: {e}")
        return None

In [23]:
result = classify_clothing_local("test_kurti.jpg")
print(json.dumps(result, indent=2))

{
  "specific_type": "kurti",
  "category": "casual",
  "color_primary": "beige",
  "color_secondary": [
    "gray"
  ],
  "pattern": "floral",
  "material": "cotton",
  "style": "casual",
  "occasions": [
    "daily_wear"
  ],
  "weather": [
    "summer"
  ],
  "formality": "casual",
  "embellishments": [
    "embroidery"
  ],
  "gender": "female",
  "fit": "flowy"
}


In [10]:
# @title
import requests

image_url = "https://hassus.com/cdn/shop/files/HS23WK008CR_1.jpg?format=pjpg&v=1693392031&width=1100"
image_filename = "test_kurti.jpg"

try:
    response = requests.get(image_url, stream=True)
    response.raise_for_status()  # Raise an HTTPError for bad responses (4xx or 5xx)

    with open(image_filename, 'wb') as file:
        for chunk in response.iter_content(chunk_size=8192):
            file.write(chunk)

    print(f"Image downloaded and saved as {image_filename}")

except requests.exceptions.RequestException as e:
    print(f"Error downloading the image: {e}")

Image downloaded and saved as test_kurti.jpg


In [12]:
%%capture
!wget https://huggingface.co/datasets/barryallen16/fitcheck-annotate-dataset/resolve/main/fitcheck-dataset.zip
!unzip fitcheck-dataset.zip

In [20]:
import os
BasePath='fitcheck-dataset'
for folder_name in os.listdir(BasePath):
    print(folder_name)
    for image_name in os.listdir(BasePath+'/'+folder_name):
      image_path=BasePath+'/'+folder_name+'/'+image_name
      print(image_path)


women-kurthi
fitcheck-dataset/women-kurthi/light-blue.jpg
fitcheck-dataset/women-kurthi/white.jpg
fitcheck-dataset/women-kurthi/purple.jpg
fitcheck-dataset/women-kurthi/olive-green.jpg
fitcheck-dataset/women-kurthi/black.jpg
fitcheck-dataset/women-kurthi/navy-blue.jpg
fitcheck-dataset/women-kurthi/pink.jpg
men-shirts
fitcheck-dataset/men-shirts/maroon.jpg
fitcheck-dataset/men-shirts/light-blue.jpg
fitcheck-dataset/men-shirts/brown.jpg
fitcheck-dataset/men-shirts/white.jpg
fitcheck-dataset/men-shirts/green.jpg
fitcheck-dataset/men-shirts/mustard.jpg
fitcheck-dataset/men-shirts/grey.jpg
fitcheck-dataset/men-shirts/black.jpg
fitcheck-dataset/men-shirts/navy-blue.jpg
fitcheck-dataset/men-shirts/pink.jpg
women-bottoms
fitcheck-dataset/women-bottoms/white-lehenga-skirt.jpg
fitcheck-dataset/women-bottoms/pink-palazzo.jpg
fitcheck-dataset/women-bottoms/light-blue-churidar.jpg
fitcheck-dataset/women-bottoms/black-salwar.jpg
fitcheck-dataset/women-bottoms/navy-blue-sharara.jpg
men-bottoms
fitche

In [24]:
import pandas as pd
import numpy as np
from tqdm import tqdm
import time
import os

def process_indofashion_local(base_path, output_csv="wardrobe_database.csv"):
    """
    Process IndoFashion dataset locally with Qwen2-VL-2B
    """

    results = []
    image_files = []

    # Collect all image paths from subdirectories
    for folder_name in os.listdir(base_path):
        folder_path = os.path.join(base_path, folder_name)
        if os.path.isdir(folder_path):
            for image_name in os.listdir(folder_path):
                if image_name.lower().endswith(('.png', '.jpg', '.jpeg', '.webp')):
                    image_files.append(os.path.join(folder_path, image_name))


    print(f"\n{'='*80}")
    print(f"Processing {len(image_files)} images locally on RTX 3050")
    # Estimate time based on the previous average per image time
    # This part relies on global variables from previous runs, which might not be reliable after modifications.
    # Removing this estimation for now to avoid potential errors.
    # if 'elapsed_time' in globals() and 'results' in globals() and len(results) > 0:
    #     avg_time_per_image = elapsed_time / len(results)
    #     estimated_time_minutes = len(image_files) * avg_time_per_image / 60
    #     print(f"Estimated time: {estimated_time_minutes:.1f} minutes")
    print(f"{'='*80}\n")


    start_time = time.time()

    # Process with progress bar
    for idx, image_path in enumerate(tqdm(image_files, desc="Classifying")):

        try:
            # Classify image
            classification = classify_clothing_local(image_path)

            if classification:
                # Add metadata
                result_entry = {
                    'item_id': f"item_{idx:04d}",
                    'filename': os.path.basename(image_path),
                    'image_path': image_path,
                    'classification': classification # Save the whole classification dictionary
                }
                results.append(result_entry)


            # Save checkpoint every 50 images
            if (idx + 1) % 50 == 0:
                temp_df = pd.DataFrame(results)
                temp_df.to_csv(f"checkpoint_{idx+1}.csv", index=False)
                print(f"\n✓ Checkpoint saved: {idx+1} images processed")

        except Exception as e:
            print(f"\n❌ Error processing {image_path}: {e}")
            continue

    # Calculate time
    elapsed_time = time.time() - start_time

    # Save final results
    df = pd.DataFrame(results)
    df.to_csv(output_csv, index=False)

    print(f"\n{'='*80}")
    print(f"✅ Processing Complete!")
    print(f"✅ Processed: {len(results)} images")
    print(f"✅ Time taken: {elapsed_time/60:.1f} minutes")
    if len(results) > 0:
        print(f"✅ Average: {elapsed_time/len(results):.2f} seconds per image")
    print(f"✅ Saved to: {output_csv}")
    print(f"{'='*80}\n")

    return df


# Run processing
if __name__ == "__main__":
    df = process_indofashion_local("fitcheck-dataset/")


Processing 42 images locally on RTX 3050



Classifying:   2%|▏         | 1/42 [00:25<17:15, 25.25s/it]

  "specific_type": "kurti",
  "category": "exact category: kurta/kurti/palazzo/churidar/salwar/saree/lehenga/anarkali_suit/gown/dupatta/blouse/choli/dhoti_pants/skirt/shirt/t_shirt/jeans/trousers/cr...


Classifying:  14%|█▍        | 6/42 [01:39<10:34, 17.63s/it]

  "specific_type": "detailed garment name (e.g., silk saree, embroidered kurti)",
  "category": "exact category: kurta/kurti/palazzo/churidar/salwar/saree/lehenga/anarkali_suit/gown/dupatta/blouse/c...


Classifying:  67%|██████▋   | 28/42 [06:34<03:25, 14.65s/it]

  "specific_type": "embroidered kurti",
  "category": "exact category: kurta/kurti/palazzo/churidar/salwar/saree/lehenga/anarkali_suit/gown/dupatta/blouse/choli/dhoti_pants/skirt/shirt/t_shirt/jeans...


Classifying: 100%|██████████| 42/42 [09:44<00:00, 13.93s/it]


✅ Processing Complete!
✅ Processed: 39 images
✅ Time taken: 9.7 minutes
✅ Average: 15.00 seconds per image
✅ Saved to: wardrobe_database.csv






In [None]:
# @title
# import pandas as pd
# import numpy as np
# from tqdm import tqdm
# import time
# import os

# def process_indofashion_local(base_path, output_csv="wardrobe_database.csv"):
#     """
#     Process IndoFashion dataset locally with Qwen2-VL-2B
#     """

#     results = []
#     image_files = []

#     # Collect all image paths from subdirectories
#     for folder_name in os.listdir(base_path):
#         folder_path = os.path.join(base_path, folder_name)
#         if os.path.isdir(folder_path):
#             for image_name in os.listdir(folder_path):
#                 if image_name.lower().endswith(('.png', '.jpg', '.jpeg', '.webp')):
#                     image_files.append(os.path.join(folder_path, image_name))


#     print(f"\n{'='*80}")
#     print(f"Processing {len(image_files)} images locally on RTX 3050")
#     # Estimate time based on the previous average per image time
#     # This part relies on global variables from previous runs, which might not be reliable after modifications.
#     # Removing this estimation for now to avoid potential errors.
#     # if 'elapsed_time' in globals() and 'results' in globals() and len(results) > 0:
#     #     avg_time_per_image = elapsed_time / len(results)
#     #     estimated_time_minutes = len(image_files) * avg_time_per_image / 60
#     #     print(f"Estimated time: {estimated_time_minutes:.1f} minutes")
#     print(f"{'='*80}\n")


#     start_time = time.time()

#     # Process with progress bar
#     for idx, image_path in enumerate(tqdm(image_files, desc="Classifying")):

#         try:
#             # Classify image
#             classification = classify_clothing_local(image_path)

#             if classification:
#                 # Add metadata
#                 result_entry = {
#                     'item_id': f"item_{idx:04d}",
#                     'filename': os.path.basename(image_path),
#                     'image_path': image_path,
#                     'classification': classification # Save the whole classification dictionary
#                 }
#                 results.append(result_entry)


#             # Save checkpoint every 50 images
#             if (idx + 1) % 50 == 0:
#                 temp_df = pd.DataFrame(results)
#                 temp_df.to_csv(f"checkpoint_{idx+1}.csv", index=False)
#                 print(f"\n✓ Checkpoint saved: {idx+1} images processed")

#         except Exception as e:
#             print(f"\n❌ Error processing {image_path}: {e}")
#             continue

#     # Calculate time
#     elapsed_time = time.time() - start_time

#     # Save final results
#     df = pd.DataFrame(results)
#     df.to_csv(output_csv, index=False)

#     print(f"\n{'='*80}")
#     print(f"✅ Processing Complete!")
#     print(f"✅ Processed: {len(results)} images")
#     print(f"✅ Time taken: {elapsed_time/60:.1f} minutes")
#     if len(results) > 0:
#         print(f"✅ Average: {elapsed_time/len(results):.2f} seconds per image")
#     print(f"✅ Saved to: {output_csv}")
#     print(f"{'='*80}\n")

#     return df


# # Run processing
# if __name__ == "__main__":
#     df = process_indofashion_local("fitcheck-dataset/")