In [5]:
from transformers import AutoProcessor, AutoModelForZeroShotImageClassification
from PIL import Image
import torch

In [15]:
# Load the FashionCLIP processor and model.
# Both are created from one checkpoint id so they cannot drift apart when the
# checkpoint is swapped for another one.
FASHION_CLIP_CHECKPOINT = "patrickjohncyh/fashion-clip"
processor = AutoProcessor.from_pretrained(FASHION_CLIP_CHECKPOINT)
model = AutoModelForZeroShotImageClassification.from_pretrained(FASHION_CLIP_CHECKPOINT)

In [14]:
# Read the picture to classify and convert it to RGB
image_path = "sweater.png"
with Image.open(image_path) as source_image:
    # convert() returns a new image object, so `image` stays usable after the
    # context manager has released the underlying file.
    image = source_image.convert("RGB")

In [16]:
# Text labels the image is scored against (zero-shot classification)
candidate_labels = [
    "streetwear",
    "vintage style",
    "90s aesthetic",
    "oversized outfit",
    "y2k fashion",
    "boho",
    "classy evening wear",
    "minimalist",
    "grunge",
    "sweater",
]

# Run the labels and the image through the processor in a single call,
# requesting PyTorch tensors ("pt") with padding enabled for the text side
inputs = processor(
    text=candidate_labels,
    images=image,
    padding=True,
    return_tensors="pt",
)

In [17]:
# Inference only: run the model without tracking gradients
with torch.no_grad():
    outputs = model(**inputs)
    logits = outputs.logits_per_image  # one row per image, one column per label
    probs = torch.softmax(logits, dim=1)  # normalise across the label axis

# Pair each label with its probability for the single input image and list
# them from most to least likely
ranking = sorted(
    zip(candidate_labels, probs[0]),
    key=lambda pair: pair[1],
    reverse=True,
)
for label, score in ranking:
    print(f"{label}: {score.item():.4f}")

sweater: 0.9970
boho: 0.0021
vintage style: 0.0004
minimalist: 0.0002
grunge: 0.0001
oversized outfit: 0.0001
streetwear: 0.0001
y2k fashion: 0.0000
90s aesthetic: 0.0000
classy evening wear: 0.0000


In [20]:
# BLIP image-captioning example using transformers
from transformers import BlipProcessor, BlipForConditionalGeneration
from PIL import Image
import torch

In [21]:
# Load BLIP (image captioning, base checkpoint).
# This is the original BLIP model, not BLIP-2: BLIP-2 checkpoints are loaded
# through the separate Blip2* classes in transformers.
# NOTE: this rebinds `processor` and `model`, replacing the FashionCLIP objects
# created earlier in the notebook. Re-run the FashionCLIP loading cell before
# running the zero-shot classification cells again.
BLIP_CHECKPOINT = "Salesforce/blip-image-captioning-base"
processor = BlipProcessor.from_pretrained(BLIP_CHECKPOINT)
model = BlipForConditionalGeneration.from_pretrained(BLIP_CHECKPOINT)

In [22]:
# Load the image to caption and convert it to RGB.
# NOTE: `image` and `inputs` are rebound here, replacing the values produced by
# the earlier FashionCLIP cells.
with Image.open("jeans.png") as image_file:
    # convert() returns a new image object, so `image` stays usable after the
    # context manager has released the underlying file.
    image = image_file.convert("RGB")

# Generate a caption: preprocess the image, let the model generate token ids,
# then decode the first (only) sequence back to text without special tokens.
inputs = processor(image, return_tensors="pt")
out = model.generate(**inputs)
caption = processor.decode(out[0], skip_special_tokens=True)
# The label says "BLIP" because the checkpoint loaded above is the original
# BLIP captioning model, not BLIP-2.
print("BLIP Caption:", caption)

BLIP-2 Caption: a woman in a white top and blue pants
