In [1]:
from spacy import displacy
import spacy

nlp = spacy.load("en_core_web_sm")

sentence = "There is a person in a yellow coat."

doc = nlp(sentence)
displacy.serve(doc, style="dep")




Using the 'dep' visualizer
Serving on http://0.0.0.0:5000 ...
Shutting down server on port 5000.


In [2]:
import spacy
import clip
import torch

# Load the model
device = "cuda" if torch.cuda.is_available() else "cpu"
model, _ = clip.load("ViT-B/32", device=device)

# Initialize spacy
nlp = spacy.load("en_core_web_sm")

In [88]:
# Preprocessing function
def remove_of(sent):
    """
    Removes any "of" phrases from the sentence to improve entity extraction
    :param sent: The sentence to remove "of" phrases from
    :return: The sentence with "of" phrases removed
    """
    # Remove any "of" phrases from the sentence to improve entity extraction
    of_phrases = [
        "the side of", "the handle of", "the bunch of", "the corner of",
        "the top of", "the bottom of", "the front of", "the back of",
        "the edge of", "the middle of", "the end of", "the tip of",
        "the base of", "the center of", "the surface of", "the middle of",
        "the left of", "the right of", "the rear of", "the rim of"
    ]
    
    for phrase in of_phrases:
        if phrase in sent:
            index = sent.find(phrase)
            sent = sent[index + len(phrase):]
            return sent.strip()  # Stripping any leading or trailing whitespace
    return sent

# Extract root or subject
def get_root_or_subject(sent):
    """
    Extracts the root or subject of a sentence
    :param sent: The sentence to extract the root or subject from
    :return: The root or subject of the sentence
    """
    sent = sent.lower()
    
    starting_phrases = [
        "there is", "this is", "there are", "these are", "it is", "there's", "there're", "it's", "this's"
    ]
    
    # Remove the starting phrase from the sentence if it exists
    for phrase in starting_phrases:
        if sent.startswith(phrase):
            sent = sent[len(phrase):]
            break
            
    article_phrases = [
        "a ", "an ", "the "
    ]
    
    # Remove any articles from the sentence to improve entity extraction
    for phrase in article_phrases:
        if sent.startswith(phrase):
            sent = sent[len(phrase):]
            break
            
    # Remove any "of" phrases from the sentence to improve entity extraction
    sent = remove_of(sent)
    
    # Extract the root or subject of the sentence
    doc = nlp(sent)
    for token in doc:
        # Check for nominal subjects
        if token.dep_ in ["nsubj", "nsubjpass"]:
            return token.text
        
    # If no nominal subject, return the root of the sentence
    return [token.text for token in doc if token.dep_ == "ROOT"]
    

def get_clip_embedding_for_text(text):
    text = clip.tokenize([text]).to(device)
    with torch.no_grad():
        text_features = model.encode_text(text)
        text_features /= text_features.norm(dim=-1, keepdim=True)
    return text_features

def compute_similarity(text1, text2):
    text1_embedding = get_clip_embedding_for_text(text1)
    text2_embedding = get_clip_embedding_for_text(text2)
    similarity = (text1_embedding @ text2_embedding.T).squeeze().item()
    return similarity

In [89]:
def extract_entities(sentence):
    doc = nlp(sentence)
    entities = [ent.text for ent in doc.ents] + [noun_chunk.text for noun_chunk in doc.noun_chunks]
    return entities

In [114]:
# implement a function to detect the objects using YOLObv8
from ultralytics import YOLO
import torch
from PIL import Image
import matplotlib.pyplot as plt
import numpy as np
import os
import clip


device = "cuda" if torch.cuda.is_available() else "cpu"


# Load the model
# yolo_model = YOLO("yolov5s.pt").to(device)
yolo_model = torch.hub.load('ultralytics/yolov5', 'yolov5s', pretrained=True).to(device)
yolo_model.conf = 0.7
yolo_model.iou = 0.45
CLASSES = yolo_model.names
print("Classes:", CLASSES)

clip_model, clip_preprocess = clip.load("ViT-B/32", device=device)

Using cache found in C:\Users\adnan/.cache\torch\hub\ultralytics_yolov5_master
YOLOv5  2023-8-20 Python-3.9.16 torch-2.1.0.dev20230609+cu121 CUDA:0 (NVIDIA GeForce GTX 1660 Ti, 6144MiB)

Fusing layers... 
YOLOv5s summary: 213 layers, 7225885 parameters, 0 gradients
Adding AutoShape... 


Classes: {0: 'person', 1: 'bicycle', 2: 'car', 3: 'motorcycle', 4: 'airplane', 5: 'bus', 6: 'train', 7: 'truck', 8: 'boat', 9: 'traffic light', 10: 'fire hydrant', 11: 'stop sign', 12: 'parking meter', 13: 'bench', 14: 'bird', 15: 'cat', 16: 'dog', 17: 'horse', 18: 'sheep', 19: 'cow', 20: 'elephant', 21: 'bear', 22: 'zebra', 23: 'giraffe', 24: 'backpack', 25: 'umbrella', 26: 'handbag', 27: 'tie', 28: 'suitcase', 29: 'frisbee', 30: 'skis', 31: 'snowboard', 32: 'sports ball', 33: 'kite', 34: 'baseball bat', 35: 'baseball glove', 36: 'skateboard', 37: 'surfboard', 38: 'tennis racket', 39: 'bottle', 40: 'wine glass', 41: 'cup', 42: 'fork', 43: 'knife', 44: 'spoon', 45: 'bowl', 46: 'banana', 47: 'apple', 48: 'sandwich', 49: 'orange', 50: 'broccoli', 51: 'carrot', 52: 'hot dog', 53: 'pizza', 54: 'donut', 55: 'cake', 56: 'chair', 57: 'couch', 58: 'potted plant', 59: 'bed', 60: 'dining table', 61: 'toilet', 62: 'tv', 63: 'laptop', 64: 'mouse', 65: 'remote', 66: 'keyboard', 67: 'cell phone', 68

In [162]:
def filter_detector_predictions(detector_results, subject_enr):
    """
    Filters the detector results by the subject/entities
    :param detector_results: bbox, classes, scores
    :param subject_enr: list of subjects or entities to filter the yolo detections by calculating similarity (text-to-text)
    :return: filtered results (bbox, classes, scores)
    """
    predictions_with_sim_scores = list()
    for bbox, cls, score in zip(*detector_results):
        # calculate the similarity between the subject and each class or entity
        for enr in subject_enr:
            sim_score = compute_similarity(enr, "this is a " + cls)
            predictions_with_sim_scores.append((bbox, cls, score, sim_score))
            
    # sort the predictions by the similarity score
    predictions_with_sim_scores.sort(key=lambda x: x[-1], reverse=True)
    print(predictions_with_sim_scores)
    # return prediction with the highest similarity score
    return predictions_with_sim_scores[0]
    

In [163]:
def detect_objects(image_path):
    """
    Detects objects in an image using YOLOv5
    :param image_path: The path to the image to detect objects in
    :return: A tuple of lists of predicted classes, boxes, and scores
    """
    # Run the YOLOv8 model
    yolo_detections = yolo_model(image_path)
    
    predicted_boxes = yolo_detections.xyxy[0].cpu().numpy().tolist()
    predicted_classes = [CLASSES[boxes[-1]] for boxes in predicted_boxes]
    predicted_scores = [boxes[-2] for boxes in predicted_boxes]
    
    return predicted_boxes, predicted_classes, predicted_scores

In [164]:
img_path = r"D:\Study\Trento University Study\Samester-2\Deep Learning\DL\DL Project\dataset\refcocog\images\COCO_train2014_000000374391.jpg"
sentence = "A WITHOUT HAIRY BROWN COLOR TEDDY BEAR"

# img_path = r"D:\Study\Trento University Study\Samester-2\Deep Learning\DL\DL Project\dataset\refcocog\images\COCO_train2014_000000380440.jpg"
# sentence = "the man in yellow coat"

In [165]:
sbj = get_root_or_subject(sentence)
print("Subject or root:", sbj)

enr = extract_entities(sentence)
# enr = ['brown bear', 'soda bottle']
print("Entities:", enr)

enr = enr + sbj
print("Entities + Subject:", enr)

Subject or root: ['without']
Entities: ['HAIRY BROWN']
Entities + Subject: ['HAIRY BROWN', 'without']


In [166]:
res = detect_objects(img_path)
res

([[396.26519775390625,
   191.4452362060547,
   578.55126953125,
   398.91943359375,
   0.9016379714012146,
   77.0],
  [348.9093933105469,
   301.9761962890625,
   443.2050476074219,
   418.247802734375,
   0.8622263669967651,
   39.0]],
 ['teddy bear', 'bottle'],
 [0.9016379714012146, 0.8622263669967651])

In [167]:
filtered_results = filter_detector_predictions(res, sbj)
filtered_results

[([348.9093933105469, 301.9761962890625, 443.2050476074219, 418.247802734375, 0.8622263669967651, 39.0], 'bottle', 0.8622263669967651, 0.83349609375), ([396.26519775390625, 191.4452362060547, 578.55126953125, 398.91943359375, 0.9016379714012146, 77.0], 'teddy bear', 0.9016379714012146, 0.8193359375)]


([348.9093933105469,
  301.9761962890625,
  443.2050476074219,
  418.247802734375,
  0.8622263669967651,
  39.0],
 'bottle',
 0.8622263669967651,
 0.83349609375)

In [None]:
# text to image similarity