In [13]:
import torch
import open_clip
import pandas as pd
import cv2
import numpy as np
from ultralytics import YOLO
from torchvision import transforms
from PIL import Image
from sklearn.metrics.pairwise import cosine_similarity

# Load YOLOv8 model for object detection (pre-trained "nano" checkpoint)
yolo_model = YOLO("yolov8n.pt")

# Load CLIP model for embedding extraction
device = "cuda" if torch.cuda.is_available() else "cpu"
clip_model_name = "ViT-B-32"
clip_model, _, clip_preprocess = open_clip.create_model_and_transforms(clip_model_name, pretrained="openai", device=device)
# The open_clip README notes that models are returned in train mode; switch to
# evaluation mode before inference so any train-only layers are inactive.
clip_model.eval()
clip_tokenizer = open_clip.get_tokenizer(clip_model_name)

# Load stored embeddings from file; the CSV must provide the columns
# "embedding" and "product_title" that are read below.
stored_embeddings_df = pd.read_csv("clip_text_embeddings.csv")

# Convert stored embeddings from string to numpy array.
# SECURITY NOTE(review): eval() executes whatever Python expression is stored in
# each CSV cell. Only run this on a CSV you generated yourself; consider
# ast.literal_eval / json.loads once the serialised format is confirmed.
stored_embeddings_df["embedding"] = stored_embeddings_df["embedding"].apply(lambda x: np.array(eval(x)))
stored_product_titles = stored_embeddings_df["product_title"].tolist()
# Stack the per-row arrays into one 2-D matrix so that row i lines up with
# stored_product_titles[i].
stored_embeddings = np.vstack(stored_embeddings_df["embedding"].values)

# Function to process image and extract objects
def detect_objects(image_path):
    """Run YOLO on an image file and return one cropped array per detection.

    Args:
        image_path: Path of the image file, read with ``cv2.imread``.

    Returns:
        list: Crops sliced out of the loaded image array, one per bounding
        box, in the order the detector results are iterated. Boxes that
        collapse to zero width or height after integer truncation are
        skipped.

    Raises:
        FileNotFoundError: If ``cv2.imread`` could not read ``image_path``.
    """
    image = cv2.imread(image_path)
    if image is None:
        # cv2.imread reports a missing/unreadable file by returning None
        # instead of raising, so fail here with a clear message.
        raise FileNotFoundError(f"Could not read image: {image_path!r}")

    results = yolo_model(image)  # Run object detection

    detected_objects = []
    for result in results:
        for box in result.boxes.xyxy:  # Corner coordinates (x1, y1, x2, y2)
            x1, y1, x2, y2 = map(int, box)
            cropped_img = image[y1:y2, x1:x2]  # Crop detected object
            if cropped_img.size == 0:
                # A degenerate box yields an empty crop, which downstream
                # colour conversion cannot handle.
                continue
            detected_objects.append(cropped_img)

    return detected_objects

# Function to compute CLIP embeddings for detected objects
def get_clip_embedding(image):
    """Encode one image array with CLIP and return its normalised embedding.

    Args:
        image: Image array interpreted as BGR (it is converted with
            ``cv2.COLOR_BGR2RGB`` before preprocessing).

    Returns:
        numpy.ndarray: The L2-normalised output of ``clip_model.encode_image``
        for a batch containing only this image, moved to the CPU.
    """
    rgb_pixels = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
    pil_image = Image.fromarray(rgb_pixels)
    # Add a leading batch axis and move the tensor to the model's device.
    batch = clip_preprocess(pil_image).unsqueeze(0).to(device)

    with torch.no_grad():
        features = clip_model.encode_image(batch)

    # Scale to unit length along the feature axis (in place).
    features /= features.norm(dim=-1, keepdim=True)
    return features.cpu().numpy()

# Function to match detected objects to stored embeddings
def match_objects(image_path, output_csv=None):
    """Match every object detected in an image to its closest stored product.

    Each crop from ``detect_objects`` is embedded with ``get_clip_embedding``
    and compared by cosine similarity against the module-level
    ``stored_embeddings``; the title at the best-scoring row of
    ``stored_product_titles`` is reported. The resulting table is printed.

    Args:
        image_path: Path of the image to analyse.
        output_csv: Optional path of a CSV file to write the table to. With
            the default ``None`` nothing is written and no "saved" message is
            printed.

    Returns:
        None.
    """
    detected_objects = detect_objects(image_path)
    results = []

    for obj in detected_objects:
        obj_embedding = get_clip_embedding(obj)
        # One query row against all stored rows: keep that single row so the
        # argmax below is a plain index into the stored products.
        similarities = cosine_similarity(obj_embedding, stored_embeddings)[0]
        best_match_idx = np.argmax(similarities)  # Find closest match

        results.append({
            "detected_object_index": len(results),
            "best_match_product": stored_product_titles[best_match_idx],
            "similarity_score": similarities[best_match_idx]
        })

    # Fix the column set so the table keeps its header even when nothing was
    # detected.
    df_results = pd.DataFrame(
        results,
        columns=["detected_object_index", "best_match_product", "similarity_score"],
    )

    # Only claim the results were saved when they actually are.
    if output_csv is not None:
        df_results.to_csv(output_csv, index=False)
        print(f"Results saved to {output_csv}")
    print(df_results)


# Run the matching on a single image; match_objects prints one row per detected
# object with its best-matching product title and similarity score.
image_path = "images/test_image.png"  # Change this to your image path
match_objects(image_path)





0: 352x640 2 persons, 36.7ms
Speed: 10.9ms preprocess, 36.7ms inference, 0.4ms postprocess per image at shape (1, 3, 352, 640)
Results saved to matched_results.csv
   detected_object_index                                 best_match_product  \
0                      0  Linen with Blouse Piece Saree (GSKI14456_Green...   
1                      1                                    Cotton Jumpsuit   

   similarity_score  
0          0.295117  
1          0.240274  


In [23]:
# Map detected objects to products using the fine-tuned CLIP model

import torch
import open_clip
import pandas as pd
import cv2
import numpy as np
from ultralytics import YOLO
from torchvision import transforms
from PIL import Image
from sklearn.metrics.pairwise import cosine_similarity

# 🔹 Object detector: pre-trained YOLOv8 "nano" checkpoint
yolo_model = YOLO("yolov8n.pt")

# 🔹 CLIP backbone: build ViT-B-32 with the OpenAI weights as the starting point
device = "cuda" if torch.cuda.is_available() else "cpu"
clip_model_name = "ViT-B-32"
clip_model, _, clip_preprocess = open_clip.create_model_and_transforms(
    clip_model_name,
    pretrained="openai",
    device=device,
)
clip_tokenizer = open_clip.get_tokenizer(clip_model_name)

# 🔹 Replace the weights with the fine-tuned checkpoint, then switch to evaluation mode
# NOTE(review): depending on the PyTorch version / weights_only setting,
# torch.load may unpickle arbitrary objects; only load trusted checkpoints.
fine_tuned_weights = "finetuned_clip.pth"
clip_model.load_state_dict(
    torch.load(fine_tuned_weights, map_location=device)
)
clip_model.eval()

# 🔹 Stored embeddings, one row per product (columns "embedding", "product_title")
# NOTE(review): the earlier base-model cell reads this same file name; confirm
# the CSV was regenerated with the fine-tuned model, otherwise the image and
# stored embeddings come from different models.
stored_embeddings_df = pd.read_csv("clip_text_embeddings.csv")

# Each "embedding" cell is text; turn it back into a numpy array.
# SECURITY NOTE(review): eval() executes whatever Python expression the CSV
# cell contains; only use this on a file you generated yourself.
stored_embeddings_df["embedding"] = stored_embeddings_df["embedding"].apply(
    lambda raw: np.array(eval(raw))
)
stored_product_titles = stored_embeddings_df["product_title"].tolist()
# Row i of the stacked matrix corresponds to stored_product_titles[i].
stored_embeddings = np.vstack(stored_embeddings_df["embedding"].to_numpy())

# 🔹 Function to process image and extract objects
def detect_objects(image_path):
    """Run YOLO on an image file and return one cropped array per detection.

    Args:
        image_path: Path of the image file, read with ``cv2.imread``.

    Returns:
        list: Crops sliced out of the loaded image array, one per bounding
        box, in the order the detector results are iterated. Boxes that
        collapse to zero width or height after integer truncation are
        skipped.

    Raises:
        FileNotFoundError: If ``cv2.imread`` could not read ``image_path``.
    """
    image = cv2.imread(image_path)
    if image is None:
        # cv2.imread reports a missing/unreadable file by returning None
        # instead of raising, so fail here with a clear message.
        raise FileNotFoundError(f"Could not read image: {image_path!r}")

    results = yolo_model(image)  # Run object detection

    detected_objects = []
    for result in results:
        for box in result.boxes.xyxy:  # Corner coordinates (x1, y1, x2, y2)
            x1, y1, x2, y2 = map(int, box)
            cropped_img = image[y1:y2, x1:x2]  # Crop detected object
            if cropped_img.size == 0:
                # A degenerate box yields an empty crop, which downstream
                # colour conversion cannot handle.
                continue
            detected_objects.append(cropped_img)

    return detected_objects

# 🔹 Function to compute CLIP embeddings for detected objects (using fine-tuned model)
def get_clip_embedding(image):
    """Encode one image array with the loaded CLIP model and normalise it.

    Args:
        image: Image array interpreted as BGR (it is converted with
            ``cv2.COLOR_BGR2RGB`` before preprocessing).

    Returns:
        numpy.ndarray: The L2-normalised output of ``clip_model.encode_image``
        for a batch containing only this image, moved to the CPU.
    """
    rgb_pixels = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
    pil_image = Image.fromarray(rgb_pixels)
    # Add a leading batch axis and move the tensor to the model's device.
    batch = clip_preprocess(pil_image).unsqueeze(0).to(device)

    with torch.no_grad():
        features = clip_model.encode_image(batch)

    # Scale to unit length along the feature axis (in place).
    features /= features.norm(dim=-1, keepdim=True)
    return features.cpu().numpy()

# 🔹 Function to match detected objects to stored embeddings
def match_objects(image_path, output_csv="matched_results.csv"):
    """Match every object detected in an image to its closest stored product.

    Each crop from ``detect_objects`` is embedded with ``get_clip_embedding``
    and compared by cosine similarity against the module-level
    ``stored_embeddings``; the title at the best-scoring row of
    ``stored_product_titles`` is reported. The resulting table is written to
    ``output_csv`` and printed.

    Args:
        image_path: Path of the image to analyse.
        output_csv: Path of the CSV file to write the table to. Pass ``None``
            to skip writing. Defaults to ``"matched_results.csv"``.

    Returns:
        None.
    """
    detected_objects = detect_objects(image_path)
    results = []

    for obj in detected_objects:
        obj_embedding = get_clip_embedding(obj)
        # One query row against all stored rows: keep that single row so the
        # argmax below is a plain index into the stored products.
        similarities = cosine_similarity(obj_embedding, stored_embeddings)[0]
        best_match_idx = np.argmax(similarities)  # Find closest match

        results.append({
            "detected_object_index": len(results),
            "best_match_product": stored_product_titles[best_match_idx],
            "similarity_score": similarities[best_match_idx]
        })

    # Fix the column set so the table (and the CSV header) is stable even when
    # nothing was detected.
    df_results = pd.DataFrame(
        results,
        columns=["detected_object_index", "best_match_product", "similarity_score"],
    )

    # 🔹 Save results
    if output_csv is not None:
        df_results.to_csv(output_csv, index=False)
        print(f"Results saved to {output_csv}")
    print(df_results)

# 🔹 Run the matching on a single image; match_objects writes the result table to
# matched_results.csv and prints one row per detected object.
image_path = "ksbkbt_s1e1_frames/keyframe_165.jpg"  # Change this to your image path
match_objects(image_path)



0: 384x640 5 persons, 1 couch, 1 tv, 72.6ms
Speed: 7.9ms preprocess, 72.6ms inference, 11.2ms postprocess per image at shape (1, 3, 384, 640)
Results saved to matched_results.csv
   detected_object_index                                 best_match_product  \
0                      0  Women's Satin Silk Printed Square Scarf Dupatt...   
1                      1  Men's Tweed Woollen Bandhgala Waistcoat Brown ...   
2                      2  Women's Satin Silk Printed Square Scarf Dupatt...   
3                      3                      mens Extras size (s14_indo_p)   
4                      4  Boys Sherwani Set Full Sleeve Nehru Collared G...   
5                      5  Men Gold-Coloured &amp; Copper-Toned Self Desi...   
6                      6                     Indo pants For Men (UP-RI-516)   

   similarity_score  
0          0.383973  
1          0.402881  
2          0.414169  
3          0.113357  
4          0.311615  
5          0.116813  
6          0.088393  
