In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from collections import defaultdict, Counter

In [2]:
from typing import List, Dict, Any, Tuple

In [3]:
from tqdm.notebook import tqdm, trange

In [4]:
from scipy import stats

In [5]:
import faiss 

from sentence_transformers import SentenceTransformer
#from sentence_transformers.quantization import quantize_embeddings

  from tqdm.autonotebook import tqdm, trange


In [22]:
import inflect
number_lexicalizer = inflect.engine()

In [23]:
number_lexicalizer.number_to_words(5)

'five'

In [7]:
models_to_evaluate = {
    "LaBSE": "sentence-transformers/LaBSE",
    "miniLM-L12": "sentence-transformers/all-MiniLM-L12-v2",
    "miniLM-L6": "sentence-transformers/all-MiniLM-L6-v2",
    "mxbai": "mixedbread-ai/mxbai-embed-large-v1",
    "jina-base": "jinaai/jina-embeddings-v2-base-en",
    "jina-small": "jinaai/jina-embeddings-v2-small-en",
    "jina-code": "jinaai/jina-embeddings-v2-base-code",
    "textCLIP": "sentence-transformers/clip-ViT-B-32"
}

In [8]:
encoder_models = defaultdict()

for m in models_to_evaluate:
    encoder_models[m] = SentenceTransformer(
        models_to_evaluate[m], 
        trust_remote_code=True
    )

modules.json:   0%|          | 0.00/461 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/122 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/2.22k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/804 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/1.88G [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/397 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/5.22M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.62M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

2_Dense/config.json:   0%|          | 0.00/114 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/2.36M [00:00<?, ?B/s]

  torch.load(os.path.join(input_path, "pytorch_model.bin"), map_location=torch.device("cpu"))


modules.json:   0%|          | 0.00/122 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/1.91k [00:00<?, ?B/s]

0_CLIPModel/merges.txt:   0%|          | 0.00/525k [00:00<?, ?B/s]

0_CLIPModel/vocab.json:   0%|          | 0.00/961k [00:00<?, ?B/s]

0_CLIPModel/preprocessor_config.json:   0%|          | 0.00/316 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/605M [00:00<?, ?B/s]

0_CLIPModel/tokenizer_config.json:   0%|          | 0.00/604 [00:00<?, ?B/s]

0_CLIPModel/special_tokens_map.json:   0%|          | 0.00/389 [00:00<?, ?B/s]

0_CLIPModel/config.json:   0%|          | 0.00/4.03k [00:00<?, ?B/s]

In [9]:
# read the data from the file resturants.list 
# and convert to list
data = pd.read_csv("restaurants.list", sep="\t", header=None).to_dict()[0]
RESTAURANTS = list(data.values())

In [56]:
templates = {
    "restaurants": {
        "query": [
            "Find restaurants that are rated with at least {} {}", 
            "List all restaurants with {} {} rating or higher",
            "I am looking for restaurants with at least {} {} rating",
            "Show me restaurants that have {} {} rating or higher",
            "Which restaurants have at least {} {} rating",
            "Restaurants with at least {} {} rating",
            "List restaurants with at least {} {} rating",
            "Restaurants that have {} {} rating or higher",
            "I want to see great restaurants with at least {} {} rating",
            "Give me suggestions for restaurants with {} {} rating or higher",
            "I want to know which restaurants have at least {} {} rating",
            "Which restaurants have {} {} rating or higher",
            "Great restaurants that have at least {} {} rating",
            "Show me of restaurants with at least {} {} rating",
        ], 
        "candidate": "{} restaurant has {} stars rating.",
        "attribute": "stars"
    }
}

In [85]:
def create_test_case(
        attribute: str,
        search_items: List[str],
        query_template: str, 
        candidate_template: str, 
        max_items_to_retrieve: int=11) -> Dict[str, Dict[str, Any]]:
    """
    Create a test case for the evaluation.
    :param attribute: the attribute to be queried (e.g., "stars", "awards")
    :param search_items: a list of items to search over (e.g., restaurants)
    :param query_template: a query template to be used
    :param candidate_template: a candidate template to be used
    :return: a tuple of query sentence, candidate sentences, and hit flags
    """
    # test if input parameters are valid
    assert attribute in ["stars", "awards"], "Invalid attribute"
    assert len(search_items) >= 1, "Provide at least one search items"

    # sample a target number 
    target_number = np.random.randint(6, 10)

    # associate a value for each search item
    # where only N of those are equal to or higher than the target number
    items_to_retrieve = np.random.randint(1, max_items_to_retrieve)

    hit_ratings = np.random.randint(target_number, 10, items_to_retrieve)
    miss_ratings = np.random.randint(
        1, target_number, len(search_items) - items_to_retrieve
    )
    
    item_ratings = np.concatenate([hit_ratings, miss_ratings])

    # define a boolean list to check if the rating is hit (should be returned)
    relevance_score = [
        0 if rating < target_number else 1 for rating in item_ratings
    ]

    # create the query sentence
    query_sentence = query_template.format(target_number, attribute)
    target_number_lex = number_lexicalizer.number_to_words(target_number)
    query_sentence_lex = query_template.format(target_number_lex, attribute)

    candidates = [
        candidate_template.format(restaurant, rating)
        for restaurant, rating in zip(search_items, item_ratings)
    ]

    candidates_lex = [
        candidate_template.format(
            restaurant, 
            number_lexicalizer.number_to_words(rating))
        for restaurant, rating in zip(search_items, item_ratings)
    ]


    return { 
        "query": {
            "numeral": query_sentence,
            "lexical": query_sentence_lex
        }, 
        "candidates": {
            "numeral": candidates,
            "lexical": candidates_lex
        },
        "relevance_score": relevance_score
    }

In [88]:
t = create_test_case(
    "stars", 
    ["Taj Mahal", "Burger King", "McDonald's", "KFC", "Pizza Hut", "Subway", 
     "Greggs", "Pret A Manger", "Nando's", "Starbucks", "Costa"],
    templates["restaurants"]["query"][0],
    templates["restaurants"]["candidate"], 
    max_items_to_retrieve=5
)

In [87]:
# code to evaluate a single test case and return precision and recall at 10
def evaluate_test_case(
        query: Dict[str, str], 
        candidates: List[str], 
        relevance_score: List[int], 
        model: SentenceTransformer) -> Tuple[float, float]:
    """
    Evaluate a test case using a given model.
    :param query: a query sentence
    :param candidates: a list of candidate sentences
    :param relevance_score: a list of relevance scores
    :param model: a sentence transformer model to use
    :return: a tuple of precision and recall at 10
    """
    # encode the query and candidates
    query_embedding = model.encode(query)
    candidate_embeddings = model.encode(candidates)

    # compute the cosine similarity between the query and candidates
    similarity = np.dot(candidate_embeddings, query_embedding.T)

    # rank the candidates based on the similarity
    ranked_indices = np.argsort(similarity, axis=0)[::-1]

    # retrieve the relevance scores based on the ranking
    ranked_relevance = np.array(relevance_score)[ranked_indices]

    # compute precision and recall at 10
    precision_at_10 = np.mean(ranked_relevance[:10])
    recall_at_10 = np.sum(ranked_relevance[:10]) / np.sum(relevance_score)

    return precision_at_10, recall_at_10


In [89]:
evaluate_test_case(t["query"]["numeral"], t["candidates"]["numeral"], t["relevance_score"], encoder_models["LaBSE"])

(0.3, 1.0)

In [34]:
relevance_to_emoji = {0: "✖", 1: "✅"}

In [63]:
# iterate over the templates and create test cases
search_items = {"restaurant": RESTAURANTS}


model_name = "LaBSE"
model = encoder_models[model_name]

for k in range(10):
    #print(f"Experiment {k+1}")
    test_cases = defaultdict(list)

    print(f'Create test case {k + 1}')

    for search_need in templates:
        for query_template in templates[search_need]["query"]:
            query, candidates, relevance_scores = create_test_case(
                templates[search_need]["attribute"], 
                RESTAURANTS, 
                query_template, 
                templates[search_need]["candidate"], 
                lexicalized=True
            )

            test_cases[search_need].append(
                {
                    "query": query,
                    "candidates": candidates,
                    "relevance_scores": relevance_scores
                }
            )

    precision_at_10_values, recall_at_10_values = [], []


    # iterate over the test cases
    for t_case in tqdm(test_cases["restaurants"]):
        # get the query and the candidates
        query, candidates = t_case["query"], t_case["candidates"]
        relevance_scores = t_case["relevance_scores"]

        # encode the query and the candidates
        query_embedding = model.encode(query).reshape(1, -1)
        candidate_embeddings = model.encode(candidates)

        # L2 normalize the embeddings
        faiss.normalize_L2(query_embedding)
        faiss.normalize_L2(candidate_embeddings)

        # get dimensions of the embeddings
        d = query_embedding.shape[1]

        # make search index
        index = faiss.IndexFlatIP(d)
        index.add(candidate_embeddings)

        # search the index
        k = 10

        D, I = index.search(query_embedding, k)

        if debug:

            print(f"Query: {query}")
            print()

            for i in range(k):
                retrieved_sentence = candidates[I[0][i]]
                hit_or_not = relevance_scores[I[0][i]]
                print(f"Rank {i+1:>2}: {retrieved_sentence:60} ", end="")
                print(f"{D[0][i]:>5.4f} ", end="")
                print(f"{relevance_to_emoji[hit_or_not]:>3}")


        # compute precision and recall at 10
        retrieved_items = sum(np.array(relevance_scores)[I[0]])

        precision_at_10 = retrieved_items / k
        recall_at_10 = retrieved_items / np.sum(relevance_scores)

        precision_at_10_values.append(precision_at_10)
        recall_at_10_values.append(recall_at_10)

        if debug:

            print()
            print(f"Total num of relevant items: {np.sum(relevance_scores)}")
            print(f"Total num of retrieved items: {retrieved_items}")

            print()
            print(f"P@10: {precision_at_10:.3f}")
            print(f"R@10: {recall_at_10:.3f}")
            print()

    # caluclate the average precision and recall at 10
    avg_precision_at_10 = np.mean(precision_at_10_values)
    avg_recall_at_10 = np.mean(recall_at_10_values)
    print(f"Model: {model_name}")
    print(f"Average P@10: {avg_precision_at_10:.3f}")
    print(f"Average R@10: {avg_recall_at_10:.3f}")

    print("-" * 80)

Create test case 1


  0%|          | 0/14 [00:00<?, ?it/s]

Model: LaBSE
Average P@10: 0.207
Average R@10: 0.302
--------------------------------------------------------------------------------
Create test case 2


  0%|          | 0/14 [00:00<?, ?it/s]

Model: LaBSE
Average P@10: 0.207
Average R@10: 0.328
--------------------------------------------------------------------------------
Create test case 3


  0%|          | 0/14 [00:00<?, ?it/s]

Model: LaBSE
Average P@10: 0.221
Average R@10: 0.387
--------------------------------------------------------------------------------
Create test case 4


  0%|          | 0/14 [00:00<?, ?it/s]

Model: LaBSE
Average P@10: 0.229
Average R@10: 0.504
--------------------------------------------------------------------------------
Create test case 5


  0%|          | 0/14 [00:00<?, ?it/s]

Model: LaBSE
Average P@10: 0.221
Average R@10: 0.491
--------------------------------------------------------------------------------
Create test case 6


  0%|          | 0/14 [00:00<?, ?it/s]

Model: LaBSE
Average P@10: 0.243
Average R@10: 0.401
--------------------------------------------------------------------------------
Create test case 7


  0%|          | 0/14 [00:00<?, ?it/s]

Model: LaBSE
Average P@10: 0.207
Average R@10: 0.394
--------------------------------------------------------------------------------
Create test case 8


  0%|          | 0/14 [00:00<?, ?it/s]

Model: LaBSE
Average P@10: 0.157
Average R@10: 0.265
--------------------------------------------------------------------------------
Create test case 9


  0%|          | 0/14 [00:00<?, ?it/s]

Model: LaBSE
Average P@10: 0.186
Average R@10: 0.396
--------------------------------------------------------------------------------
Create test case 10


  0%|          | 0/14 [00:00<?, ?it/s]

Model: LaBSE
Average P@10: 0.186
Average R@10: 0.393
--------------------------------------------------------------------------------


In [54]:
len(test_cases["restaurants"])

10

In [55]:
# evalauting the models
debug = False

for model_name, model in encoder_models.items():

    precision_at_10_values, recall_at_10_values = [], []

    # iterate over the test cases
    for t_case in test_cases["restaurants"]:
        # get the query and the candidates
        query, candidates = t_case["query"], t_case["candidates"]
        relevance_scores = t_case["relevance_scores"]

        # encode the query and the candidates
        query_embedding = model.encode(query).reshape(1, -1)
        candidate_embeddings = model.encode(candidates)

        # L2 normalize the embeddings
        faiss.normalize_L2(query_embedding)
        faiss.normalize_L2(candidate_embeddings)

        # get dimensions of the embeddings
        d = query_embedding.shape[1]

        # make search index
        index = faiss.IndexFlatIP(d)
        index.add(candidate_embeddings)

        # search the index
        k = 10

        D, I = index.search(query_embedding, k)

        if debug:

            print(f"Query: {query}")
            print()

            for i in range(k):
                retrieved_sentence = candidates[I[0][i]]
                hit_or_not = relevance_scores[I[0][i]]
                print(f"Rank {i+1:>2}: {retrieved_sentence:60} ", end="")
                print(f"{D[0][i]:>5.4f} ", end="")
                print(f"{relevance_to_emoji[hit_or_not]:>3}")


        # compute precision and recall at 10
        retrieved_items = sum(np.array(relevance_scores)[I[0]])

        precision_at_10 = retrieved_items / k
        recall_at_10 = retrieved_items / np.sum(relevance_scores)

        precision_at_10_values.append(precision_at_10)
        recall_at_10_values.append(recall_at_10)

        if debug:

            print()
            print(f"Total num of relevant items: {np.sum(relevance_scores)}")
            print(f"Total num of retrieved items: {retrieved_items}")

            print()
            print(f"P@10: {precision_at_10:.3f}")
            print(f"R@10: {recall_at_10:.3f}")
            print()

    # caluclate the average precision and recall at 10
    avg_precision_at_10 = np.mean(precision_at_10_values)
    avg_recall_at_10 = np.mean(recall_at_10_values)
    print(f"Model: {model_name}")
    print(f"Average P@10: {avg_precision_at_10:.3f}")
    print(f"Average R@10: {avg_recall_at_10:.3f}")

    print("-" * 80)

Model: LaBSE
Average P@10: 0.130
Average R@10: 0.220
--------------------------------------------------------------------------------
Model: miniLM-L12
Average P@10: 0.180
Average R@10: 0.428
--------------------------------------------------------------------------------
Model: miniLM-L6
Average P@10: 0.240
Average R@10: 0.525
--------------------------------------------------------------------------------


KeyboardInterrupt: 

In [None]:
# sample target number
target_number = np.random.randint(6, 10)
query_template = "Show me restaurants with at least {} {} rating"
query_sentence = query_template.format(target_number)

# create a list of randome ratings between 1 and 10
# where only 10 of those are equal to or higher than the target number
hit_ratings = np.random.randint(target_number, 11, 10)
miss_ratings = np.random.randint(1, target_number, len(RESTAURANTS) - 10)
all_ratings = np.concatenate([hit_ratings, miss_ratings])

#ratings = np.random.randint(1, 11, len(RESTAURANTS))


candidate_template = "{} restaurant has {} stars rating."

# define a boolean list to check if the rating is hit (should be returned)
is_hit = [
    0 if rating < target_number else 1 for rating in all_ratings
]

candidate_sentences = [
        candidate_template.format(restaurant, rating)
        for restaurant, rating in zip(RESTAURANTS, all_ratings)
]

hit_to_emoji = {0: "✖", 1: "✅"}

IndexError: Replacement index 1 out of range for positional args tuple

In [None]:
assert sum(is_hit) == 10

In [None]:
for c, h in zip(candidate_sentences, is_hit):
    print(f"{c:60}  {hit_to_emoji[h]:>5}")

Holy Cannoli! restaurant has 8 stars rating.                      ✅
Sushi Samurai restaurant has 7 stars rating.                      ✅
Nacho Average Taco restaurant has 8 stars rating.                 ✅
Curry Up Now restaurant has 8 stars rating.                       ✅
Oui, Chef! restaurant has 10 stars rating.                        ✅
The Souvlaki Shack restaurant has 10 stars rating.                ✅
Kimchi Commandos restaurant has 9 stars rating.                   ✅
Pad Thai Guy restaurant has 10 stars rating.                      ✅
Tagine Time restaurant has 10 stars rating.                       ✅
Carnivore Carnival restaurant has 9 stars rating.                 ✅
Wok This Way restaurant has 4 stars rating.                       ✖
Hummus a Tune restaurant has 5 stars rating.                      ✖
Rumba Roti restaurant has 4 stars rating.                         ✖
Lederhosen Lounge restaurant has 3 stars rating.                  ✖
Mamma Mia's Pizzeria restaurant has 1 stars rati

In [157]:
debug = True

# encode the text query and candidate
query_embedding = model.encode(query_sentence).reshape(1, -1)
candidate_embeddings = model.encode(candidate_sentences)

# L2 normalize the embeddings
faiss.normalize_L2(query_embedding)
faiss.normalize_L2(candidate_embeddings)

# compute the cosine similarity using FAISS
d = model.get_sentence_embedding_dimension()

index = faiss.IndexFlatIP(d)
index.add(candidate_embeddings)

k = 10
D, I = index.search(query_embedding, k)


# print the result
# print query 
print(f"Query: {query_sentence}", end="\n\n")

if debug:
    for i in range(k):
        retrieved_sentence = candidate_sentences[I[0][i]]
        hit_or_not = is_hit[I[0][i]]
        print(f"Rank {i+1:>2}: {retrieved_sentence:60} {D[0][i]:>5.2f} ", end="")
        print(f"{hit_to_emoji[hit_or_not]:>3}")

# compute precision and recall at 10
precision_at_10 = sum([is_hit[i] for i in list(I[0][:10])]) / 10
recall_at_10 = sum([is_hit[i] for i in list(I[0][:10])]) / sum(is_hit)

print()
print(f"P@10: {precision_at_10:.2f}")
print(f"R@10: {recall_at_10:.2f}")

Query: Show me restaurants with at least 7 stars rating

Rank  1: Adriatic Appetites restaurant has 6 stars rating.             0.72   ✖
Rank  2: Oui, Chef! restaurant has 10 stars rating.                    0.72   ✅
Rank  3: Wok This Way restaurant has 4 stars rating.                   0.72   ✖
Rank  4: Grillin' & Chillin' restaurant has 5 stars rating.            0.72   ✖
Rank  5: Curry Up Now restaurant has 8 stars rating.                   0.72   ✅
Rank  6: The Souvlaki Shack restaurant has 10 stars rating.            0.71   ✅
Rank  7: Holy Cannoli! restaurant has 8 stars rating.                  0.71   ✅
Rank  8: The Fish Fryer restaurant has 5 stars rating.                 0.71   ✖
Rank  9: Carnivore Carnival restaurant has 9 stars rating.             0.71   ✅
Rank 10: Fon-Do or Fon-Don't restaurant has 5 stars rating.            0.71   ✖

P@10: 0.50
R@10: 0.50
