In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from collections import defaultdict, Counter

In [2]:
from typing import List, Dict, Any, Tuple

In [3]:
from tqdm.notebook import tqdm, trange

In [4]:
from scipy import stats

In [5]:
import faiss 

from sentence_transformers import SentenceTransformer
#from sentence_transformers.quantization import quantize_embeddings

  from tqdm.autonotebook import tqdm, trange


In [22]:
import inflect
# Shared inflect engine, used to spell numbers out as English words.
number_lexicalizer = inflect.engine()

In [23]:
# Sanity check: a digit is rendered as its English word.
number_lexicalizer.number_to_words(5)

'five'

In [7]:
# Short display name -> model identifier handed to SentenceTransformer.
# Built from ordered pairs so the evaluation order is the order listed here.
models_to_evaluate = dict([
    ("LaBSE", "sentence-transformers/LaBSE"),
    ("miniLM-L12", "sentence-transformers/all-MiniLM-L12-v2"),
    ("miniLM-L6", "sentence-transformers/all-MiniLM-L6-v2"),
    ("mxbai", "mixedbread-ai/mxbai-embed-large-v1"),
    ("jina-base", "jinaai/jina-embeddings-v2-base-en"),
    ("jina-small", "jinaai/jina-embeddings-v2-small-en"),
    ("jina-code", "jinaai/jina-embeddings-v2-base-code"),
    ("textCLIP", "sentence-transformers/clip-ViT-B-32"),
])

In [8]:
# Instantiate every encoder once, keyed by its short display name, so the
# evaluation cells can reuse the loaded models.
# A plain dict is used on purpose: no default value is ever wanted here, and
# an unknown model name should fail loudly with a KeyError.
# NOTE(review): trust_remote_code=True allows a model repository to execute
# its own Python code while loading -- keep it only for sources you trust.
encoder_models = {
    model_name: SentenceTransformer(model_id, trust_remote_code=True)
    for model_name, model_id in models_to_evaluate.items()
}

modules.json:   0%|          | 0.00/461 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/122 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/2.22k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/804 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/1.88G [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/397 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/5.22M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.62M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

2_Dense/config.json:   0%|          | 0.00/114 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/2.36M [00:00<?, ?B/s]

  torch.load(os.path.join(input_path, "pytorch_model.bin"), map_location=torch.device("cpu"))


modules.json:   0%|          | 0.00/122 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/1.91k [00:00<?, ?B/s]

0_CLIPModel/merges.txt:   0%|          | 0.00/525k [00:00<?, ?B/s]

0_CLIPModel/vocab.json:   0%|          | 0.00/961k [00:00<?, ?B/s]

0_CLIPModel/preprocessor_config.json:   0%|          | 0.00/316 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/605M [00:00<?, ?B/s]

0_CLIPModel/tokenizer_config.json:   0%|          | 0.00/604 [00:00<?, ?B/s]

0_CLIPModel/special_tokens_map.json:   0%|          | 0.00/389 [00:00<?, ?B/s]

0_CLIPModel/config.json:   0%|          | 0.00/4.03k [00:00<?, ?B/s]

In [9]:
# Load the restaurant names from restaurants.list (tab-separated, no header
# row); the names are taken from the first column, in file order.
restaurants_frame = pd.read_csv("restaurants.list", sep="\t", header=None)
data = restaurants_frame[0].to_dict()
RESTAURANTS = list(data.values())

In [10]:
# Sentence templates per search need.
#   query     -> templates filled with (target number, attribute)
#   candidate -> template filled with (item name, rating)
#   attribute -> the attribute word inserted into the query templates
templates = {
    "restaurants": dict(
        query=[
            "Find restaurants that are rated with at least {} {}",
            "List restaurants with {} {} rating or higher",
            "I am looking for restaurants with at least {} {} rating",
            "Show me restaurants that have {} {} rating or higher",
            "Which restaurants have at least {} {} rating",
            "Restaurants with at least {} {} rating",
            "List restaurants with at least {} {} rating",
            "Restaurants that have {} {} rating or higher",
            "I want to see great restaurants with at least {} {} rating",
            "Give me suggestions for restaurants with {} {} rating or higher",
        ],
        candidate="{} restaurant has {} stars rating.",
        attribute="stars",
    )
}

In [33]:
def create_test_case(
        attribute: str,
        search_items: List[str],
        query_template: str, 
        candidate_template: str,
        lexicalized: bool=False) -> Tuple[str, List[str], List[int]]:
    """
    Create a test case for the evaluation.

    A target number in [6, 9] is sampled. Between 1 and 10 items receive a
    rating in [target, 10] (the relevant ones); every other item receives a
    rating in [1, target - 1]. The ratings are shuffled before being paired
    with the items, so relevance is not tied to an item's name or to its
    position in ``search_items``.

    Random draws come from NumPy's global generator; seed it with
    ``np.random.seed`` beforehand for reproducible test cases.

    :param attribute: the attribute to be queried (e.g., "stars", "awards")
    :param search_items: a list of items to search over (e.g., restaurants)
    :param query_template: a query template with two placeholders, filled
        with the target number and the attribute (in that order)
    :param candidate_template: a candidate template with two placeholders,
        filled with the item and its rating (in that order)
    :param lexicalized: if True, numbers are written as words ("seven")
        in both the query and the candidates; otherwise as digits
    :return: a tuple of query sentence, candidate sentences, and hit flags
        (1 = rating meets the target, 0 = it does not), where the candidate
        sentences and hit flags are aligned with ``search_items``
    :raises AssertionError: if the attribute is unknown or fewer than 10
        search items are given
    """
    # test if input parameters are valid
    assert attribute in ["stars", "awards"], "Invalid attribute"
    assert len(search_items) >= 10, "Not enough search items"

    # sample a target number (plain int, so it formats and lexicalizes
    # independently of the NumPy scalar type)
    target_number = int(np.random.randint(6, 10))

    # how many items should satisfy the query
    items_to_retrieve = int(np.random.randint(1, 11))

    hit_ratings = np.random.randint(target_number, 11, items_to_retrieve)
    miss_ratings = np.random.randint(
        1, target_number, len(search_items) - items_to_retrieve
    )

    # shuffle so the relevant ratings are not always given to the first items
    item_ratings = np.random.permutation(
        np.concatenate([hit_ratings, miss_ratings])
    ).tolist()

    # flag which ratings are hits (should be returned)
    relevance_score = [
        1 if rating >= target_number else 0 for rating in item_ratings
    ]

    def _render(number: int):
        # spell the number out only when a lexicalized test case is requested
        if lexicalized:
            return number_lexicalizer.number_to_words(number)
        return number

    query_sentence = query_template.format(_render(target_number), attribute)

    candidates = [
        candidate_template.format(item, _render(rating))
        for item, rating in zip(search_items, item_ratings)
    ]

    return query_sentence, candidates, relevance_score

In [34]:
# Marker printed next to a retrieved candidate: 0 = not relevant, 1 = relevant.
relevance_to_emoji = dict([(0, "✖"), (1, "✅")])

In [35]:
# Build one test case per query template for every search need.
# create_test_case draws from NumPy's global random generator, so it is
# seeded here: re-running the notebook then regenerates the same test cases
# and the model scores stay comparable between runs.
RANDOM_SEED = 42
np.random.seed(RANDOM_SEED)

# items to search over, keyed by the same search-need names as `templates`
search_items = {"restaurants": RESTAURANTS}

test_cases = defaultdict(list)

for search_need, need_templates in templates.items():
    for query_template in need_templates["query"]:
        query, candidates, relevance_scores = create_test_case(
            need_templates["attribute"],
            search_items[search_need],
            query_template,
            need_templates["candidate"],
            lexicalized=False
        )

        test_cases[search_need].append(
            {
                "query": query,
                "candidates": candidates,
                "relevance_scores": relevance_scores
            }
        )

In [36]:
# one test case is appended per query template
len(test_cases["restaurants"])

10

In [43]:
# evaluating the models
debug = True

# rank cut-off used for every query (P@k / R@k)
k = 10

# per-model average scores, kept so the models can be compared afterwards
evaluation_results = {}

for model_name, model in encoder_models.items():

    precision_at_10_values, recall_at_10_values = [], []

    # iterate over the test cases
    for t_case in test_cases["restaurants"]:
        # get the query and the candidates
        query, candidates = t_case["query"], t_case["candidates"]
        relevance_scores = t_case["relevance_scores"]
        relevance_array = np.asarray(relevance_scores)

        # encode the query and the candidates
        query_embedding = model.encode(query).reshape(1, -1)
        candidate_embeddings = model.encode(candidates)

        # L2 normalize the embeddings (in place) so that the inner product
        # computed by the index equals the cosine similarity
        faiss.normalize_L2(query_embedding)
        faiss.normalize_L2(candidate_embeddings)

        # get dimensions of the embeddings
        d = query_embedding.shape[1]

        # make search index
        index = faiss.IndexFlatIP(d)
        index.add(candidate_embeddings)

        # never request more neighbours than there are candidates: FAISS
        # pads missing results with index -1, which would silently be read
        # as "the last candidate" below
        top_k = min(k, len(candidates))

        # search the index
        D, I = index.search(query_embedding, top_k)

        if debug:

            print(f"Query: {query}")
            print()

            for i in range(top_k):
                retrieved_sentence = candidates[I[0][i]]
                hit_or_not = relevance_scores[I[0][i]]
                print(f"Rank {i+1:>2}: {retrieved_sentence:60} ", end="")
                print(f"{D[0][i]:>5.4f} ", end="")
                print(f"{relevance_to_emoji[hit_or_not]:>3}")

        # compute precision and recall at k
        relevant_retrieved = int(relevance_array[I[0]].sum())
        total_relevant = int(relevance_array.sum())

        precision_at_10 = relevant_retrieved / k
        # a test case without any relevant item contributes a recall of 0
        recall_at_10 = (
            relevant_retrieved / total_relevant if total_relevant else 0.0
        )

        precision_at_10_values.append(precision_at_10)
        recall_at_10_values.append(recall_at_10)

        print()
        print(f"Total num of relevant items: {total_relevant}")
        print(f"Num of relevant items retrieved: {relevant_retrieved}")

        print()
        print(f"P@{k}: {precision_at_10:.2f}")
        print(f"R@{k}: {recall_at_10:.2f}")
        print()

    # calculate the average precision and recall at k
    avg_precision_at_10 = np.mean(precision_at_10_values)
    avg_recall_at_10 = np.mean(recall_at_10_values)
    evaluation_results[model_name] = {
        f"P@{k}": float(avg_precision_at_10),
        f"R@{k}": float(avg_recall_at_10),
    }

    print(f"Model: {model_name}")
    print(f"Average P@{k}: {avg_precision_at_10:.2f}")
    print(f"Average R@{k}: {avg_recall_at_10:.2f}")

    print("-" * 80)

Query: Find restaurants that are rated with at least nine stars

Rank  1: Nacho Nirvana restaurant has seven stars rating.             0.5603   ✖
Rank  2: Fish ‘n’ Giggles restaurant has seven stars rating.          0.5321   ✖
Rank  3: Knights and Noodles restaurant has seven stars rating.       0.5304   ✖
Rank  4: Carnivore Carnival restaurant has eight stars rating.        0.5301   ✖
Rank  5: Nacho Average Taco restaurant has nine stars rating.         0.5296   ✅
Rank  6: Holy Cannoli! restaurant has nine stars rating.              0.5280   ✅
Rank  7: Noodle Nook restaurant has three stars rating.               0.5243   ✖
Rank  8: Gnocchi Knockout restaurant has seven stars rating.          0.5164   ✖
Rank  9: Seoul Food restaurant has eight stars rating.                0.5060   ✖
Rank 10: Danish Delight restaurant has eight stars rating.            0.5019   ✖

Total num of relevant items: 3
Total num of retrieved items: 2

P@10: 0.20
R@10: 0.67

Query: List restaurants with seven st

In [147]:
# sample target number
target_number = np.random.randint(6, 10)
query_template = "Show me restaurants with at least {} {} rating"
# the template has two placeholders (number, attribute); passing only the
# number would raise an IndexError
query_sentence = query_template.format(target_number, "stars")

# create a list of random ratings between 1 and 10
# where only 10 of those are equal to or higher than the target number
hit_ratings = np.random.randint(target_number, 11, 10)
miss_ratings = np.random.randint(1, target_number, len(RESTAURANTS) - 10)
all_ratings = np.concatenate([hit_ratings, miss_ratings])

candidate_template = "{} restaurant has {} stars rating."

# define a boolean list to check if the rating is hit (should be returned)
is_hit = [
    0 if rating < target_number else 1 for rating in all_ratings
]

candidate_sentences = [
        candidate_template.format(restaurant, rating)
        for restaurant, rating in zip(RESTAURANTS, all_ratings)
]

hit_to_emoji = {0: "✖", 1: "✅"}

In [148]:
# exactly the 10 forced hits may be flagged: every miss rating is drawn strictly below the target
assert sum(is_hit) == 10

In [149]:
# list every candidate sentence next to its relevance marker
for sentence, hit_flag in zip(candidate_sentences, is_hit):
    marker = hit_to_emoji[hit_flag]
    print(f"{sentence:60}  {marker:>5}")

Holy Cannoli! restaurant has 8 stars rating.                      ✅
Sushi Samurai restaurant has 7 stars rating.                      ✅
Nacho Average Taco restaurant has 8 stars rating.                 ✅
Curry Up Now restaurant has 8 stars rating.                       ✅
Oui, Chef! restaurant has 10 stars rating.                        ✅
The Souvlaki Shack restaurant has 10 stars rating.                ✅
Kimchi Commandos restaurant has 9 stars rating.                   ✅
Pad Thai Guy restaurant has 10 stars rating.                      ✅
Tagine Time restaurant has 10 stars rating.                       ✅
Carnivore Carnival restaurant has 9 stars rating.                 ✅
Wok This Way restaurant has 4 stars rating.                       ✖
Hummus a Tune restaurant has 5 stars rating.                      ✖
Rumba Roti restaurant has 4 stars rating.                         ✖
Lederhosen Lounge restaurant has 3 stars rating.                  ✖
Mamma Mia's Pizzeria restaurant has 1 stars rati

In [157]:
debug = True

# Select the encoder explicitly instead of relying on whatever `model` was
# left bound by an earlier loop. "textCLIP" is the last entry of
# models_to_evaluate, i.e. the model a top-to-bottom run would end up with.
model = encoder_models["textCLIP"]

# encode the text query and candidate
query_embedding = model.encode(query_sentence).reshape(1, -1)
candidate_embeddings = model.encode(candidate_sentences)

# L2 normalize the embeddings (in place) so that the inner product
# computed by the index equals the cosine similarity
faiss.normalize_L2(query_embedding)
faiss.normalize_L2(candidate_embeddings)

# take the dimension from the embeddings themselves, as the model evaluation
# loop does, rather than from the model's reported metadata
d = query_embedding.shape[1]

index = faiss.IndexFlatIP(d)
index.add(candidate_embeddings)

k = 10
D, I = index.search(query_embedding, k)


# print the result
# print query 
print(f"Query: {query_sentence}", end="\n\n")

if debug:
    for i in range(k):
        retrieved_sentence = candidate_sentences[I[0][i]]
        hit_or_not = is_hit[I[0][i]]
        print(f"Rank {i+1:>2}: {retrieved_sentence:60} {D[0][i]:>5.2f} ", end="")
        print(f"{hit_to_emoji[hit_or_not]:>3}")

# compute precision and recall at k (hits counted once, cut-off taken from k)
hits_in_top_k = sum(is_hit[i] for i in I[0][:k])
precision_at_10 = hits_in_top_k / k
recall_at_10 = hits_in_top_k / sum(is_hit)

print()
print(f"P@10: {precision_at_10:.2f}")
print(f"R@10: {recall_at_10:.2f}")

Query: Show me restaurants with at least 7 stars rating

Rank  1: Adriatic Appetites restaurant has 6 stars rating.             0.72   ✖
Rank  2: Oui, Chef! restaurant has 10 stars rating.                    0.72   ✅
Rank  3: Wok This Way restaurant has 4 stars rating.                   0.72   ✖
Rank  4: Grillin' & Chillin' restaurant has 5 stars rating.            0.72   ✖
Rank  5: Curry Up Now restaurant has 8 stars rating.                   0.72   ✅
Rank  6: The Souvlaki Shack restaurant has 10 stars rating.            0.71   ✅
Rank  7: Holy Cannoli! restaurant has 8 stars rating.                  0.71   ✅
Rank  8: The Fish Fryer restaurant has 5 stars rating.                 0.71   ✖
Rank  9: Carnivore Carnival restaurant has 9 stars rating.             0.71   ✅
Rank 10: Fon-Do or Fon-Don't restaurant has 5 stars rating.            0.71   ✖

P@10: 0.50
R@10: 0.50
