In [None]:
# Mount Google Drive at /content/drive; later cells read the dataset and helper
# modules from paths underneath this mount point.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
import os
import shutil
from pathlib import Path
# Stage the Fashion-IQ dataset from Google Drive into Colab internal storage.
# The cell is written to be safely re-runnable: every destination is rebuilt
# from scratch on each execution.
drive_base = '/content/drive/MyDrive/Colab Notebooks/image_retrieval/dataset/fashion-iq'
colab_base = '/content/fashion-iq'

# Create destination root in Colab internal storage
os.makedirs(colab_base, exist_ok=True)

# Directories that are copied as-is (no archive involved)
subdirs = ['image_splits', 'captions']

for subdir in subdirs:
    src = os.path.join(drive_base, subdir)
    dst = os.path.join(colab_base, subdir)
    if os.path.exists(dst):
        shutil.rmtree(dst)  # copytree refuses to write into an existing directory
    shutil.copytree(src, dst)
    print(f"✅ Copied {subdir} to {dst}")

# The images are stored on Drive as a single zip archive
zip_path = os.path.join(drive_base, 'images/images.zip')
extract_dir = os.path.join(colab_base, 'images')

# Start from an empty directory, mirroring what the loop above does for the
# copied subdirectories, so files from an earlier run cannot linger.
if os.path.exists(extract_dir):
    shutil.rmtree(extract_dir)
os.makedirs(extract_dir, exist_ok=True)

# Extract with the standard library instead of `!unzip`: it never stops to ask
# an interactive "replace ...?" question, and it raises if the archive is
# missing or corrupt, so the success message below is only printed when the
# extraction really completed.
shutil.unpack_archive(zip_path, extract_dir, format="zip")

print("✅ Unzipped images.zip into Colab storage.")

✅ Copied image_splits to /content/fashion-iq/image_splits
✅ Copied captions to /content/fashion-iq/captions
replace /content/fashion-iq/images/images/245600258X.png? [y]es, [n]o, [A]ll, [N]one, [r]ename: ✅ Unzipped images.zip into Colab storage.


In [None]:
import os
import shutil
from pathlib import Path
# Dataset locations: the working copy in Colab storage and the original on Drive.
colab_base = '/content/fashion-iq'
drive_base = '/content/drive/MyDrive/Colab Notebooks/image_retrieval/dataset/fashion-iq'
# pathlib view of the Colab copy, used by later cells to build sub-paths with `/`.
base_path = Path(colab_base)

In [None]:
import torch
# Use the GPU when PyTorch can see one; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print(f"Using device: {device}")

Using device: cuda


In [None]:
import sys

# Project directory on Drive, added to the import search path ahead of the
# `import data_utils` that follows (presumably that module lives here — confirm).
project_dir = '/content/drive/MyDrive/Colab Notebooks/image_retrieval'
# Guard the append so re-running this cell does not pile up duplicate entries.
if project_dir not in sys.path:
    sys.path.append(project_dir)
import importlib
import data_utils  # initial import
from data_utils import targetpad_transform,_convert_image_to_rgb

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
✅ Copied image_splits to /content/fashion-iq/image_splits
✅ Copied captions to /content/fashion-iq/captions


In [None]:
!pip install ftfy regex tqdm
!pip install git+https://github.com/openai/CLIP.git


Collecting git+https://github.com/openai/CLIP.git
  Cloning https://github.com/openai/CLIP.git to /tmp/pip-req-build-io4dn95y
  Running command git clone --filter=blob:none --quiet https://github.com/openai/CLIP.git /tmp/pip-req-build-io4dn95y
  Resolved https://github.com/openai/CLIP.git to commit dcba3cb2e2827b402d2701e7e1c7d9fed8a20ef1
  Preparing metadata (setup.py) ... [?25l[?25hdone


In [None]:
import clip
import torch
from PIL import Image


In [None]:
# Image preprocessing callable built by the project's data_utils helper; the
# filtering loop applies it to every PIL image before encoding.
# NOTE(review): the meaning of target_ratio/dim is defined in data_utils (not
# visible here) — presumably pad toward a 1:1 aspect ratio and produce 224-px
# inputs; confirm against that module.
transform = targetpad_transform(target_ratio=1.0, dim=224)


In [None]:
# Text prompts each image is compared against. Order matters: the filtering
# loop indexes into this list with the position of the best-scoring prompt.
prompts = (
    # Prompts describing genuine garments
    ["a clothing item", "a fashion dress", "a shirt or top"]
    # Prompts the filtering loop treats as "unrelated" content
    + ["a brand logo", "an image with only text", "an empty product image"]
)

In [None]:
# Load CLIP (ViT-B/32) onto the selected device. `preprocess_clip` is the
# preprocessing pipeline returned alongside the model.
model, preprocess_clip = clip.load("ViT-B/32", device=device)

# Encode the prompts once up front; gradients are not needed for inference.
with torch.no_grad():
    text_tokens = clip.tokenize(prompts).to(device)
    text_features = model.encode_text(text_tokens)
    # Scale every embedding to unit length along its last dimension
    text_features = text_features / text_features.norm(dim=-1, keepdim=True)

In [None]:
from pathlib import Path
from tqdm import tqdm

In [None]:
# Source images: the archive was extracted into <base>/images and its contents
# sit one level further down, in a nested "images" directory.
image_folder = base_path.joinpath('images', 'images')

# Destination for the images that pass the filter; created if it is missing.
output_folder = Path("/content/fashion-iq-cleaned")
output_folder.mkdir(parents=True, exist_ok=True)

In [None]:
# Classify every PNG against the text prompts with CLIP. Images whose
# best-matching prompt is one of the "unrelated" prompts are recorded in
# `unrelated_images`; all others are copied into `output_folder`.

# Built once rather than on every iteration; membership decides the outcome.
unrelated_prompts = {"a brand logo", "an image with only text", "an empty product image"}

unrelated_images = []
for img_path in tqdm(list(image_folder.glob("*.png"))):
    try:
        # The context manager closes the underlying file deterministically;
        # convert() returns an independent RGB image that outlives it.
        with Image.open(img_path) as raw_img:
            img = raw_img.convert("RGB")
        # unsqueeze(0) adds the leading batch dimension the encoder is given
        img_tensor = transform(img).unsqueeze(0).to(device)

        # Only the model forward pass needs to run with gradients disabled
        with torch.no_grad():
            img_features = model.encode_image(img_tensor)
            img_features /= img_features.norm(dim=-1, keepdim=True)
            # Scaled similarity of the image to each prompt, as a distribution
            similarity = (100.0 * img_features @ text_features.T).softmax(dim=-1)

        best_prompt = prompts[similarity.argmax().item()]

        if best_prompt in unrelated_prompts:
            unrelated_images.append(str(img_path))
        else:
            shutil.copy(img_path, output_folder / img_path.name)
    except Exception as e:
        # Best effort: report the failing file and continue with the next one
        print(f"⚠️ Error reading {img_path.name}: {e}")

100%|██████████| 74381/74381 [17:29<00:00, 70.88it/s]


In [None]:
import os
import shutil

# Where the filtered images live in Colab, and where they are persisted on Drive
cleaned_colab_path = "/content/fashion-iq-cleaned"
drive_destination = "/content/drive/MyDrive/Colab Notebooks/image_retrieval/dataset/fashion-iq-cleaned"

# copytree needs a destination that does not exist yet, so any copy left on
# Drive by an earlier run is deleted first.
destination_present = os.path.exists(drive_destination)
if destination_present:
    shutil.rmtree(drive_destination)

# Persist the whole cleaned folder to Drive
shutil.copytree(cleaned_colab_path, drive_destination)

print("✅ Cleaned dataset successfully saved to Google Drive!")


✅ Cleaned dataset successfully saved to Google Drive!


In [None]:
cleaned_path = "/content/fashion-iq-cleaned"

# Suffixes that count as images; entry names are lower-cased before matching,
# so the check is case-insensitive.
image_extensions = (".png", ".jpg", ".jpeg")
matching_names = [
    entry for entry in os.listdir(cleaned_path)
    if entry.lower().endswith(image_extensions)
]
image_count = len(matching_names)

print(f"✅ Number of images in '{cleaned_path}': {image_count}")


✅ Number of images in '/content/fashion-iq-cleaned': 73620
