In [1]:
import ast
import pickle
import warnings
from concurrent.futures import ThreadPoolExecutor, as_completed
from io import BytesIO

import numpy as np
import pandas as pd
import requests
import torch
import torchvision.models as models
import torchvision.transforms as transforms
from PIL import Image
from torchvision.models import ResNet50_Weights
from tqdm import tqdm

In [2]:
# Run on the GPU when PyTorch reports one, otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
print(f"✅ Using device: {device}")

✅ Using device: cpu


In [3]:
# Product table used for feature extraction; the extraction step reads its
# 'feature_image_s3', 'product_id' and 'box' columns.
df = pd.read_csv(
    filepath_or_buffer='/content/drive/MyDrive/Stylumia/Dataset/ready_dataset.csv',
)

In [4]:
# Preprocessing shared by indexing and querying: force every image to 224x224
# (aspect ratio is not preserved), convert it to a tensor, then normalise each
# RGB channel with the mean/std given below.
transform = transforms.Compose([
    transforms.Resize(size=(224, 224)),
    transforms.ToTensor(),
    transforms.Normalize(
        mean=[0.485, 0.456, 0.406],
        std=[0.229, 0.224, 0.225],
    ),
])

In [5]:
# Build the feature extractor: load ResNet-50 with its default pretrained
# weights and keep every child module except the last one, so the network
# emits the output of the layer just before it.
resnet = torch.nn.Sequential(
    *list(models.resnet50(weights=ResNet50_Weights.DEFAULT).children())[:-1]
)
# Inference mode, moved onto the selected device.
resnet.eval().to(device)

Downloading: "https://download.pytorch.org/models/resnet50-11ad3fa6.pth" to /root/.cache/torch/hub/checkpoints/resnet50-11ad3fa6.pth
100%|██████████| 97.8M/97.8M [00:00<00:00, 208MB/s]


Sequential(
  (0): Conv2d(3, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False)
  (1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (2): ReLU(inplace=True)
  (3): MaxPool2d(kernel_size=3, stride=2, padding=1, dilation=1, ceil_mode=False)
  (4): Sequential(
    (0): Bottleneck(
      (conv1): Conv2d(64, 64, kernel_size=(1, 1), stride=(1, 1), bias=False)
      (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn2): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (conv3): Conv2d(64, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)
      (bn3): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (relu): ReLU(inplace=True)
      (downsample): Sequential(
        (0): Conv2d(64, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)


In [6]:
def process_row(row):
    """Download one product image, crop it to its first bounding box and embed it.

    Args:
        row: Mapping-like record (for example a DataFrame row) providing
            ``'feature_image_s3'`` (image URL), ``'product_id'`` and ``'box'``
            (the string form of a list of dicts, each carrying a ``'bbox'``
            entry laid out as ``[x1, y1, x2, y2]``).

    Returns:
        ``(feature, product_id)``, where ``feature`` is the squeezed NumPy
        output of ``resnet`` for the cropped image. ``(None, None)`` is
        returned when the row has no boxes or when any step fails; failures
        additionally emit a ``RuntimeWarning`` naming the product.
    """
    product_id = None
    try:
        product_id = row['product_id']
        image_url = row['feature_image_s3']
        box = ast.literal_eval(row['box'])

        # No detections for this product: an expected skip, not an error.
        if not box:
            return None, None

        # Only the first detection is used.
        x1, y1, x2, y2 = box[0]['bbox'][:4]

        # Download image; fail on HTTP errors here rather than handing an
        # error page to PIL and getting an unrelated decoding error.
        response = requests.get(image_url, timeout=10)
        response.raise_for_status()
        img = Image.open(BytesIO(response.content)).convert('RGB')

        # Crop based on bounding box
        cropped = img.crop((x1, y1, x2, y2))

        # Transform and extract features (batch of one, no gradients needed)
        input_tensor = transform(cropped).unsqueeze(0).to(device)
        with torch.no_grad():
            feature = resnet(input_tensor).squeeze().cpu().numpy()

        return feature, product_id

    except Exception as exc:
        # Best effort: one bad row must not abort the whole extraction run,
        # but report it so dropped products can be traced afterwards.
        warnings.warn(
            f"Skipping product {product_id!r}: {type(exc).__name__}: {exc}",
            RuntimeWarning,
        )
        return None, None

In [7]:
# Embed every dataset row on a small thread pool. Results are collected in
# completion order, so `features` / `product_ids` stay aligned with each other
# but do not follow the row order of `df`.
features, product_ids = [], []

with ThreadPoolExecutor(max_workers=8) as executor:
    futures = [
        executor.submit(process_row, record)
        for _idx, record in df.iterrows()
    ]
    for future in tqdm(as_completed(futures), total=len(futures), desc="Parallel Feature Extraction"):
        feature, pid = future.result()
        if feature is None:
            # process_row marks skipped or failed rows with (None, None).
            continue
        features.append(feature)
        product_ids.append(pid)

print(f"✅ Extracted {len(features)} valid feature vectors.")

Parallel Feature Extraction: 100%|██████████| 17441/17441 [1:30:44<00:00,  3.20it/s]

✅ Extracted 17410 valid feature vectors.





In [8]:
# Turn the list of per-product vectors into one array and pickle it together
# with the matching product ids (entry i of the array belongs to product_ids[i]).
features_np = np.array(features)

with open("/content/drive/MyDrive/Stylumia/Models/similarity_model.pkl", mode="wb") as f:
    pickle.dump(dict(features=features_np, product_ids=product_ids), f)

print("📦 Model saved to: similarity_model.pkl")

📦 Model saved to: similarity_model.pkl


In [9]:
from sklearn.metrics.pairwise import cosine_similarity

def get_top_10_similar(input_image_path, model_path="/content/drive/MyDrive/Stylumia/Models/similarity_model.pkl", top_k=10):
    """Return the ids of the stored products most similar to a query image.

    Args:
        input_image_path: Path of the query image, opened with PIL.
        model_path: Pickle file holding a dict with ``"features"`` (one stored
            vector per product) and ``"product_ids"`` (ids in the same order).
        top_k: How many ids to return. Defaults to 10, which is the original
            behaviour of this function.

    Returns:
        A list of at most ``top_k`` product ids, ordered from most to least
        similar by cosine similarity.

    Raises:
        ValueError: If ``top_k`` is negative.
    """
    if top_k < 0:
        raise ValueError(f"top_k must be non-negative, got {top_k}")

    # SECURITY: pickle.load can execute arbitrary code embedded in the file.
    # Only point model_path at pickles produced by this notebook.
    with open(model_path, "rb") as f:
        model_data = pickle.load(f)
    features = model_data["features"]
    product_ids = model_data["product_ids"]

    # Preprocess input image.
    # NOTE(review): process_row embeds images cropped to their first bounding
    # box, while the query image is embedded uncropped -- confirm this
    # mismatch is intended.
    with Image.open(input_image_path) as img:
        img_tensor = transform(img.convert('RGB')).unsqueeze(0).to(device)

    with torch.no_grad():
        query_feature = resnet(img_tensor).squeeze().cpu().numpy()

    # Compute similarity of the query against every stored vector.
    similarities = cosine_similarity([query_feature], features)[0]
    # Reverse before slicing: same order as argsort()[-k:][::-1] for k >= 1,
    # and top_k == 0 yields an empty list (a[-0:] would return everything).
    top_indices = similarities.argsort()[::-1][:top_k]
    top_products = [product_ids[i] for i in top_indices]

    return top_products

In [10]:
# Example query: look up the stored products closest to a sample image.
top_10 = get_top_10_similar(
    input_image_path="/content/drive/MyDrive/Stylumia/Dataset/img 2.webp",
)
print(top_10)

['49fbac1277f1ec0ad1d1847e55d2d3963e54d1c96b197f748aa2877fae8f56b6', '840dad8a6c079f14bdb54c8cfe78334dc3bd0edfa3276b787f0ae40e000f7be2', 'df5e82e036e08dbed71cee16e97c1f86f632a4c6b198afd02709083adf86617e', '113fa7d82a612dcba811e7985e8d523ff3f90e2699eadb6f17bbede08c773851', '83668a9c4a0c9b165fa32ccc87f6e77b36e9c0853da9dbf7bac551173897d6a4', '5fd3c74fdabcca27d1efca045761fa32ac9f624e3f7c922d8562742293f26e15', '31a7dd6edb2ee346fa9520c56f43830379d9aa47a64a053f02bb23f2ffd68384', '32cd4c1c9070b55845f21f8fef679eb1cdd695807d22c770c38b16e62ba41c96', '57447acee0611a7bc2dc387c18c1f7fda21f09b44f194a30920fed4ea970e6ca', '180aa27e3fcf7a1e2c0f9ef589b519198635dd3b715a367dc9ccaaa97a46bc05']
