In [1]:
# basic imports
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from collections import defaultdict, Counter
from typing import List, Dict, Any, Tuple, Generator
import json

# utilities
from tqdm.notebook import tqdm, trange

# data processing
from scipy import stats
import faiss 

from sentence_transformers import SentenceTransformer

  from tqdm.autonotebook import tqdm, trange


In [11]:
from pprint import pprint

In [2]:
# a module for lexicalizing numbers (e.g. 8 -> eight)")
import inflect
number_lexicalizer = inflect.engine()


# try it out 
number_lexicalizer.number_to_words(8)

'eight'

In [3]:
models_to_evaluate = {
    "miniLM-L6": "sentence-transformers/all-MiniLM-L6-v2",
    # "miniLM-L12": "sentence-transformers/all-MiniLM-L12-v2",
    # "mxbai": "mixedbread-ai/mxbai-embed-large-v1",
    # "jina-base": "jinaai/jina-embeddings-v2-base-en",
    # "jina-small": "jinaai/jina-embeddings-v2-small-en",
    # "jina-code": "jinaai/jina-embeddings-v2-base-code",
    # "LaBSE": "sentence-transformers/LaBSE",
    #"textCLIP": "sentence-transformers/clip-ViT-B-32"
}

In [4]:
encoder_models = defaultdict()

for m in models_to_evaluate:
    encoder_models[m] = SentenceTransformer(
        models_to_evaluate[m], 
        trust_remote_code=True
    )

In [5]:
# open the file and load the JSON data
file_path = './restaurants.json'
with open(file_path, 'r') as file:
    restaurants_data = json.load(file)

restaurant_documents = [
    restaurant['description'] for restaurant in restaurants_data
]

#len(restaurants_data), len(restaurant_documents)

In [7]:
restaurant_queries = [
    ("Show me restaurants rated above {} stars", ">="),
    ("I'm looking for restaurants with at least {} stars", ">="),
    ("Suggest restaurants rated {} stars or higher", ">="),
    ("Find restaurants with more than {} stars", ">="),
    ("List restaurants rated {} stars and above", ">="),
    ("Recommend restaurants rated with {} stars or more", ">="),
    ("Restaurants with a minimum of {} stars", ">="),
    ("Show me restaurants rated above {} stars", ">"),
    ("Restaurants rated no less than {} stars", ">="),
    ("Find Restaurants with star ratings above {} stars", ">=")
]

In [85]:
def generate_test_sample_query(
    query: str,
    target_number: int) -> Dict[str, Any]:
    """
    Function to generate a test sample query. 
    """

    # check if the target number is 10 and the operator is ">"
    if target_number >= 10 or target_number <= 0:
        raise ValueError("target_number should be in the range [1, 10].")
    
    
    # create the query sentence
    # for example, query = "Show me restaurants rated above {} stars" and 
    # target_number = 5 --> query = "Show me restaurants rated above 5 stars"
    query_sent = query.format(target_number)

    # lexicalize the target number
    target_number_lex = number_lexicalizer.number_to_words(target_number)

    # create the query sentence with lexicalized number
    # above example, query_lex = "Show me restaurants rated above five stars"
    query_sent_lex = query.format(target_number_lex)
    
    return {
        "numeral": query_sent,
        "lexical": query_sent_lex,
    }

In [86]:
# lets test generate_test_sample_query
generate_test_sample_query(
    query=restaurant_queries[0][0], 
    target_number=5
)

{'numeral': 'Show me restaurants rated above 5 stars',
 'lexical': 'Show me restaurants rated above five stars'}

In [87]:
def generate_test_sample_candidates(
    operator: str, 
    candidates: List[str],
    target_number: int,  
    max_items_to_retrieve:int, 
    items_to_retrieve:int = 1,
    random_seed: int=42) -> Dict[str, Any]:
    """
    Generate a test sample for the item retrieval task.
    """
    # handle edge cases
    if operator not in [">", ">=", "<", "<="]:
        raise ValueError("Operator must be one of ['>', '>=', '<', '<=']")
    
    if max_items_to_retrieve > len(candidates):
        err_msg = "max_items_to_retrieve can't be greater than num of candidates"
        raise ValueError(err_msg)
    
    # ensure items_to_retrieve is a valide value 
    if items_to_retrieve < 1:
        raise ValueError("items_to_retrieve must be greater than 0.")
    
    # ensure the target items to retrieve is less than the max items to retrieve
    if items_to_retrieve > max_items_to_retrieve:
        raise ValueError("items_to_retrieve <= max_items_to_retrieve.")
      
    # set random seed for reproducibility
    np.random.seed(random_seed)
    
    # associate a value for each search item
    # where only N of those are equal to or higher than the target number
    #items_to_retrieve = np.random.randint(1, max_items_to_retrieve)

    # adjust target number based on operator
    # comment this for now 
    # let us be genrous and consider above n is a number that includes n
    #target_number = target_number + 1 if operator == ">" else target_number

    hit_items = np.random.randint(target_number, 11, items_to_retrieve)
    miss_items = np.random.randint(
        1, target_number, len(candidates) - items_to_retrieve
    )

    # list of number for each item
    items_numbers: List[int] = np.concatenate([hit_items, miss_items])

    # define a boolean list to check if the rating is hit (should be returned)
    relevance_scores: List[int] = [
        0 if rating < target_number else 1 for rating in items_numbers
    ]

    # ensure correct number of relevant items
    if np.sum(relevance_scores) != items_to_retrieve:
        print(relevance_scores)
        raise ValueError("Relevance score != equal to max_items_to_retrieve!")    


    candidates_sent = [
        candidate.format(num)
        for candidate, num in zip(candidates, items_numbers)
    ]

    candidates_sent_lex = [
        candidate.format(number_lexicalizer.number_to_words(num))
        for candidate, num in zip(candidates, items_numbers)
    ]
    
    # construct the result
    return {
        "candidates": {
            "numeral": candidates_sent,
            "lexical": candidates_sent_lex
        },
        "relevance_scores": relevance_scores
    }

In [88]:
# test generate_test_sample_candidates
test_samples = generate_test_sample_candidates(
    operator=">=",
    candidates=restaurant_documents[:10],
    target_number=5,
    max_items_to_retrieve=10,
    #items_to_retrieve=1
)

for case, rel in zip(test_samples["candidates"]["numeral"], test_samples["relevance_scores"]):
    print(f"{case:90} {'✔️' if rel == 1 else '':15} ")

Le Bernardin, a seafood restaurant, has a 8 star rating.                                   ✔️              
Alinea, specializing in molecular gastronomy, is rated 1 stars.                                            
Joe's Place, a classic American diner, received a 3 star rating from customers.                            
Noma, featuring New Nordic cuisine, currently holds a 3 star rating.                                       
Luigi's Pizzeria, serving traditional Italian pies, has been given 4 stars by diners.                      
El Celler de Can Roca, offering modern Spanish dishes, maintains a 1 star rating.                          
Sukiyabashi Jiro, a sushi restaurant, has received 1 stars.                                                
Burger Bonanza, a fast-food burger joint, averages 3 stars across its locations.                           
Osteria Francescana, presenting contemporary Italian cuisine, holds a 2 star rating.                       
Green Fields, a farm-to-tabl

In [89]:
def generate_test_sample(
    query: str,
    candidates: List[str],
    operator: str,
    target_number: int,
    max_items_to_retrieve: int,
    items_to_retrieve: int = 1,
    random_seed: int = 42) -> Dict[str, Any]:
    """
    Generate a test sample for the item retrieval task.
    """
    # generate the query
    query_sample = generate_test_sample_query(query, target_number)
    
    # generate the candidates
    candidates_sample = generate_test_sample_candidates(
        operator=operator,
        candidates=candidates,
        target_number=target_number,
        max_items_to_retrieve=max_items_to_retrieve,
        items_to_retrieve=items_to_retrieve,
        random_seed=random_seed
    )
    
    return {
        "query": query_sample,
        "candidates": candidates_sample['candidates'],
        "relevance_scores": candidates_sample['relevance_scores']
    }

In [92]:
# try out test sample generation
full_test_sample = generate_test_sample(
    query="Show me restaurants rated above {} stars", 
    candidates=restaurant_documents[:10],  
    operator=">=",
    target_number=5,
    max_items_to_retrieve=10,
    items_to_retrieve=2,
    random_seed=42

)

print(f"Query: {full_test_sample['query']['numeral']}", end="\n\n")

for case, rel in zip(full_test_sample["candidates"]["numeral"], full_test_sample["relevance_scores"]):
    print(f"{case:90} {'✔️' if rel == 1 else '':15} ")


Query: Show me restaurants rated above 5 stars

Le Bernardin, a seafood restaurant, has a 8 star rating.                                   ✔️              
Alinea, specializing in molecular gastronomy, is rated 9 stars.                            ✔️              
Joe's Place, a classic American diner, received a 3 star rating from customers.                            
Noma, featuring New Nordic cuisine, currently holds a 3 star rating.                                       
Luigi's Pizzeria, serving traditional Italian pies, has been given 4 stars by diners.                      
El Celler de Can Roca, offering modern Spanish dishes, maintains a 1 star rating.                          
Sukiyabashi Jiro, a sushi restaurant, has received 1 stars.                                                
Burger Bonanza, a fast-food burger joint, averages 3 stars across its locations.                           
Osteria Francescana, presenting contemporary Italian cuisine, holds a 2 star rating.    

In [68]:
# generate full test samples

test_cases = []

# iterate over each query templates and generate test samples
for query, operator in restaurant_queries:
    for _ in range(10):

        # sample a target number 
        # if oeprator is >, then we need to sample a number between 6 and 10
        if operator == ">" or operator == ">=":
            target_number = np.random.randint(5, 11)
        else:
            target_number = np.random.randint(1, 6)

        # get items to retrieve
        max_items_to_retrieve=10
        items_to_retrieve = np.random.randint(1, max_items_to_retrieve)


        test_sample = generate_test_sample(
            query=query,
            candidates=restaurant_documents,
            operator=operator,
            target_number=target_number,
            max_items_to_retrieve=max_items_to_retrieve,
            items_to_retrieve=items_to_retrieve,
            random_seed=42
        )
        
        test_cases.append(test_sample)

In [74]:
# peek into first test case 

print(test_cases[0]["query"]["query"])

for case, rel in zip(test_cases[0]["candidates"]["numeral"], test_cases[0]["relevance_scores"]):
    print(f"{case:90} {'✔️' if rel == 1 else '':15} ")

Show me restaurants rated above 8 stars
Le Bernardin, a seafood restaurant, has a 10 star rating.                                  ✔️              
Alinea, specializing in molecular gastronomy, is rated 8 stars.                            ✔️              
Joe's Place, a classic American diner, received a 10 star rating from customers.           ✔️              
Noma, featuring New Nordic cuisine, currently holds a 10 star rating.                      ✔️              
Luigi's Pizzeria, serving traditional Italian pies, has been given 8 stars by diners.      ✔️              
El Celler de Can Roca, offering modern Spanish dishes, maintains a 8 star rating.          ✔️              
Sukiyabashi Jiro, a sushi restaurant, has received 10 stars.                               ✔️              
Burger Bonanza, a fast-food burger joint, averages 9 stars across its locations.           ✔️              
Osteria Francescana, presenting contemporary Italian cuisine, holds a 10 star rating.      ✔️   

In [56]:
# try out test sample generation
generate_test_sample(
    query="Show me restaurants rated above {} stars", 
    operator=">",
    candidates=[
        "Tickets, known for creative tapas, has been given {} stars.",
        "The Restaurant at Meadowood, showcasing California cuisine, boasts a {} star rating.",
        "Toyo Eatery, celebrating modern Filipino flavors, has earned {} stars.",
        "Septime, a neo-bistro in Paris, holds {} stars.",
    ], 
    max_items_to_retrieve=3, 
    target_number=7,
    random_seed=1234
)

{'query': {'query': 'Show me restaurants rated above 7 stars',
  'query_lex': 'Show me restaurants rated above seven stars'},
 'candidates': {'numeral': ['Tickets, known for creative tapas, has been given 10 stars.',
   'The Restaurant at Meadowood, showcasing California cuisine, boasts a 9 star rating.',
   'Toyo Eatery, celebrating modern Filipino flavors, has earned 6 stars.',
   'Septime, a neo-bistro in Paris, holds 5 stars.'],
  'lexical': ['Tickets, known for creative tapas, has been given ten stars.',
   'The Restaurant at Meadowood, showcasing California cuisine, boasts a nine star rating.',
   'Toyo Eatery, celebrating modern Filipino flavors, has earned six stars.',
   'Septime, a neo-bistro in Paris, holds five stars.']},
 'relevance_scores': [1, 1, 0, 0]}

In [10]:
def generate_test_samples_for_query(
    query_item: Tuple[str, str],
    candidates: List[str],
    max_items_to_retrieve: int, 
    random_seeds: List[int]) -> Generator:
    """
    Generate a test dataset for the item retrieval task.
    """
    # unpack the query item
    query, operator = query_item

    # generate test samples for each target number
    # sample a target number 
    for rand_seed in random_seeds:
        np.random.seed(rand_seed)

        target_number = np.random.randint(6, 10)

        yield generate_test_sample(
            query=query, 
            operator=operator,
            candidates=candidates,
            max_items_to_retrieve=max_items_to_retrieve,
            target_number=target_number,
            random_seed=rand_seed
        )  

In [11]:
test_cases = []

for q in restaurant_queries:
    test_cases.extend(
        [
            t_case for t_case in generate_test_samples_for_query(
                query_item=q,
                candidates=restaurant_documents,
                max_items_to_retrieve=10,
                random_seeds=[
                    42, 1234, 5678, 9101, 321, 765, 13, 1212, 42, 8, 46648
                ]
            )
        ]
    )

In [12]:
hit_to_emoji = {0: "", 1: "✅"}

In [13]:
# evaluate the performance of the model
# code to evaluate a single test case and return precision and recall at 10
def evaluate_test_case(
        query: str, 
        candidates: List[str], 
        relevance_scores: List[int],
        top_k: int, 
        model: SentenceTransformer,
        debug=False) -> Tuple[float, float]:
    """
    Evaluate a test case using a given model.
    :param query: a query sentence
    :param candidates: a list of candidate sentences
    :param relevance_score: a list of relevance scores
    :param top_k: the number of items to retrieve
    :param model: a sentence transformer model to use
    :return: a tuple of precision and recall at 10
    """
    # encode the query and candidates
    query_embedding = model.encode(query) #.reshape(1, -1)
    candidate_embeddings = model.encode(candidates)

    # compute the cosine similarity between the query and candidates
    similarity = np.dot(candidate_embeddings, query_embedding.T)

    # rank the candidates based on the similarity
    ranked_indices = np.argsort(similarity, axis=0)[::-1]

    ranked_similarity = similarity[ranked_indices]

    # retrieve the relevance scores based on the ranking
    ranked_relevance = np.array(relevance_scores)[ranked_indices]

    # print query and top 10 results
    if debug:
        print(f"Query: {query}")
        print()
        print("\n".join(
            [
                f"{i + 1:>3}: {candidates[j]:<90}{ranked_similarity[i]:>5.3f} {hit_to_emoji[ranked_relevance[i]]:>3}" \
                    for i, j in enumerate(ranked_indices[:top_k])
            ]), end=''
        ) 
        print() 

    # compute precision and recall at 10
    precision_at_k = np.sum(ranked_relevance[:top_k]) / top_k
    recall_at_k = np.sum(ranked_relevance[:top_k]) / np.sum(relevance_scores)

    return precision_at_k, recall_at_k


In [14]:
# evaluate the performance of the model
# code to evaluate a single test case and return precision and recall at 10
def evaluate_test_case_faiss(
        query: str, 
        candidates: List[str], 
        relevance_scores: List[int],
        top_k: int, 
        model: SentenceTransformer,
        debug=False) -> Tuple[float, float]:
    """
    Evaluate a test case using a given model.
    :param query: a query sentence
    :param candidates: a list of candidate sentences
    :param relevance_score: a list of relevance scores
    :param top_k: the number of items to retrieve
    :param model: a sentence transformer model to use
    :return: a tuple of precision and recall at 10
    """
    # encode the query and candidates
    query_embedding = model.encode(query).reshape(1, -1)
    candidate_embeddings = model.encode(candidates)

    # use faiss to normalize the vectors
    faiss.normalize_L2(query_embedding)
    faiss.normalize_L2(candidate_embeddings)

    # compute the cosine similarity between the query and candidates using faiss
    index = faiss.IndexFlatIP(candidate_embeddings.shape[1])
    index.add(candidate_embeddings)
    ranked_similarity, ranked_indices = index.search(query_embedding, top_k)

    # retrieve the relevance scores based on the ranking
    relevance_scores = np.array(relevance_scores)
    ranked_relevance = relevance_scores[ranked_indices[0]]

    # print query and top 10 results
    if debug:
        print(f"Query: {query}")
        print()
        print("\n".join(
            [
                f"{i + 1:>3}: {candidates[j]:<90}{ranked_similarity[0][i]:>5.3f} {hit_to_emoji[ranked_relevance[i]]:>3}" \
                    for i, j in enumerate(ranked_indices[0][:top_k])
            ]), end=''
        ) 
        print() 

    # compute precision and recall at 10
    precision_at_k = np.sum(ranked_relevance[:top_k]) / top_k
    recall_at_k = np.sum(ranked_relevance[:top_k]) / np.sum(relevance_scores)

    return precision_at_k, recall_at_k


In [16]:
%time
evaluate_test_case(
    query="Show me restaurants rated above 8 stars",
    candidates=[
        "Tickets, known for creative tapas, has been given 9 stars.",
        "The Restaurant at Meadowood, showcasing California cuisine, boasts a 7 star rating.",
        "Toyo Eatery, celebrating modern Filipino flavors, has earned 8 stars.",
        "Septime, a neo-bistro in Paris, holds 8 stars.",
    ],
    relevance_scores=[1, 0, 0, 0],
    top_k=4,
    model=encoder_models["miniLM-L6"],
    debug=True
)

CPU times: user 2 μs, sys: 0 ns, total: 2 μs
Wall time: 3.81 μs
Query: Show me restaurants rated above 8 stars

  1: The Restaurant at Meadowood, showcasing California cuisine, boasts a 7 star rating.       0.663    
  2: Toyo Eatery, celebrating modern Filipino flavors, has earned 8 stars.                     0.486    
  3: Tickets, known for creative tapas, has been given 9 stars.                                0.440   ✅
  4: Septime, a neo-bistro in Paris, holds 8 stars.                                            0.376    


(0.25, 1.0)

In [16]:
%time
evaluate_test_case(
    query="I have more than 7 apples.",
    candidates=[
        "I have 2 apples.",
        "I have 3 apples.",
        "I have 4 apples.",
        "I have 5 apples.",
        "I have 6 apples.",
        "I have 8 apples.",
        "I have 9 apples.",
        "I have 10 apples.",
        "I have 11 apple.",
        "I have 12 apples.",
    ],
    relevance_scores=[1, 0, 0, 0, 0, 0, 0, 0, 0, 0,],
    top_k=10,
    model=encoder_models["miniLM-L6"],
    debug=True
)

CPU times: user 1 μs, sys: 0 ns, total: 1 μs
Wall time: 2.86 μs
Query: I have more than 7 apples.

  1: I have 8 apples.                                                                          0.926    
  2: I have 6 apples.                                                                          0.923    
  3: I have 9 apples.                                                                          0.918    
  4: I have 5 apples.                                                                          0.916    
  5: I have 12 apples.                                                                         0.910    
  6: I have 10 apples.                                                                         0.897    
  7: I have 4 apples.                                                                          0.881    
  8: I have 3 apples.                                                                          0.880    
  9: I have 2 apples.                                        

(0.1, 1.0)

In [67]:
%time
evaluate_test_case_faiss(
    query="Show me restaurants rated above 8 stars",
    candidates=[
        "Tickets, known for creative tapas, has been given 9 stars.",
        "The Restaurant at Meadowood, showcasing California cuisine, boasts a 7 star rating.",
        "Toyo Eatery, celebrating modern Filipino flavors, has earned 8 stars.",
        "Septime, a neo-bistro in Paris, holds 8 stars.",
    ],
    relevance_scores=[1, 0, 0, 0],
    top_k=4,
    model=encoder_models["LaBSE"],
    debug=True
)

CPU times: user 2 μs, sys: 0 ns, total: 2 μs
Wall time: 4.77 μs
Query: Show me restaurants rated above 8 stars

  1: Toyo Eatery, celebrating modern Filipino flavors, has earned 8 stars.                     0.442    
  2: The Restaurant at Meadowood, showcasing California cuisine, boasts a 7 star rating.       0.413    
  3: Tickets, known for creative tapas, has been given 9 stars.                                0.372   ✅
  4: Septime, a neo-bistro in Paris, holds 8 stars.                                            0.354    


(0.25, 1.0)

In [68]:
%time

recall_values = []

for i in tqdm(range(10)):
    p, r = evaluate_test_case(
        test_cases[i]["query"]["numeral"],
        test_cases[i]["candidates"]["numeral"],
        test_cases[i]["relevance_scores"],
        top_k=10,
        model=encoder_models["miniLM-L6"],
        debug=True
    )
    #print(f"Case {i:>3}:    Recall@10: {r:.3f}")
    print()

    recall_values.append(r)


print(f"Mean Recall@10: {np.mean(recall_values):.3f}")
print(f"Std Recall@10: {np.std(recall_values):.3f}")

CPU times: user 2 μs, sys: 0 ns, total: 2 μs
Wall time: 3.81 μs


  0%|          | 0/10 [00:00<?, ?it/s]

Query: Show me restaurants rated above 8 stars

  1: The Test Kitchen, an eclectic dining experience, has received a 8 star rating.            0.675    
  2: The Restaurant at Meadowood, showcasing California cuisine, boasts a 6 star rating.       0.647    
  3: Mugaritz, an experimental restaurant, has earned a 5 star rating.                         0.639    
  4: Eleven Madison Park, serving contemporary American cuisine, is rated 3 stars.             0.633    
  5: Joe's Place, a classic American diner, received a 9 star rating from customers.           0.598   ✅
  6: Gaa, a modern Indian restaurant, has been awarded 1 stars.                                0.591    
  7: Core by Clare Smyth, a modern British restaurant, has received 7 stars.                   0.578    
  8: Le Bernardin, a seafood restaurant, has a 10 star rating.                                 0.575   ✅
  9: Alo, a contemporary French restaurant, holds 3 stars.                                     0.568    
 10: Li

In [70]:
recall_values = []

for encoder in encoder_models:

    print(f"Model: {encoder}    ", end="")
    for i in tqdm(range(len(test_cases))):
        p, r = evaluate_test_case(
            test_cases[i]["query"]["numeral"],
            test_cases[i]["candidates"]["numeral"],
            test_cases[i]["relevance_scores"],
            top_k=10,
            model=encoder_models[m]
        )
        #print(f"Case {i:>3}:    Recall@10: {r:.3f}")
        #print()

        recall_values.append(r)

    print(f"Mean Recall@10: {np.mean(recall_values):.3f}")
    print(f"Std Recall@10: {np.std(recall_values):.3f}")

Model: miniLM-L6    

  0%|          | 0/110 [00:00<?, ?it/s]

KeyboardInterrupt: 

In [141]:

for i in tqdm(range(len(test_cases))):
    p, r = evaluate_test_case(
        test_cases[i]["query"]["lexical"],
        test_cases[i]["candidates"]["lexical"],
        test_cases[i]["relevance_scores"],
        top_k=10,
        model=encoder_models["miniLM-L6"]
    )
    #print(f"Case {i:>3}:    Recall@10: {r:.3f}")
    #print()

    recall_values.append(r)


print(f"Mean R@10: {np.mean(recall_values):.3f}    ", end="")
print(f" Std R@10: {np.std(recall_values):.3f}")

  0%|          | 0/110 [00:00<?, ?it/s]

Mean Recall@10: 0.347
Std Recall@10: 0.160


In [None]:
 >>> generate_test_sample(
    query="Show me restaurants rated above {} stars"", 
    candidates=[
        "Tickets, known for creative tapas, has been given {} stars."
        "The Restaurant at Meadowood, showcasing California cuisine, boasts a {} star rating."
        "Toyo Eatery, celebrating modern Filipino flavors, has earned {} stars."
        "Septime, a neo-bistro in Paris, holds {} stars."
    ], 
    max_items_to_retrieve=2, 
    target_number=7
)

{
    "query": "Show me restaurants rated above 7 stars",
    "comparison": ">",
    "candidates": [
        "Tickets, known for creative tapas, has been given 6 stars.",
        "The Restaurant at Meadowood, showcasing California cuisine, boasts a 8 star rating.",
        "Toyo Eatery, celebrating modern Filipino flavors, has earned 5 stars.", 
        "Septime, a neo-bistro in Paris, holds 9 stars."
    ],
    "relevance_scores": [0, 1, 0, 1]
}

In [94]:
# read the data from the file resturants.list 
# and convert to list
data = pd.read_csv("restaurants.list", sep="\t", header=None).to_dict()[0]
RESTAURANTS = list(data.values())

data = pd.read_csv("movies.list", sep="\t", header=None).to_dict()[0]
MOVIES = list(data.values())

In [95]:
templates = {
    "restaurants": {
        "query": [
            "Find restaurants that are rated with at least {} {}", 
            "List all restaurants with {} {} rating or higher",
            "I am looking for restaurants with at least {} {} rating",
            "Show me restaurants that have {} {} rating or higher",
            "Which restaurants have at least {} {} rating",
            "Restaurants with at least {} {} rating",
            "List restaurants with at least {} {} rating",
            "Restaurants that have {} {} rating or higher",
            "I want to see great restaurants with at least {} {} rating",
            "Give me suggestions for restaurants with {} {} rating or higher",
            "I want to know which restaurants have at least {} {} rating",
            "Which restaurants have {} {} rating or higher",
            "Great restaurants that have at least {} {} rating",
            "Show me of restaurants with at least {} {} rating",
        ], 
        "candidate": "{} restaurant has {} stars rating.",
        "attribute": "stars"
    }, 
    "movies": {
        "query": [
            "Find movies that are rated with at least {} {}", 
            "List all movies with {} {} rating or higher",
            "I am looking for movies with at least {} {} rating",
            "Show me movies that have {} {} rating or higher",
            "Which movies have at least {} {} rating",
            "Movies with at least {} {} rating",
            "List movies with at least {} {} rating",
            "Movies that have {} {} rating or higher",
            "I want to see great movies with at least {} {} rating",
            "Give me suggestions for movies with {} {} rating or higher",
            "I want to know which movies have at least {} {} rating",
            "Which movies have {} {} rating or higher",
            "Great movies that have at least {} {} rating",
            "Show me of movies with at least {} {} rating",
        ],
        "candidate": "The {} movie is rated with {} {}.",
        "attribute": "stars"
    }

}

In [109]:
def create_test_case(
        attribute: str,
        search_items: List[str],
        query_template: str, 
        candidate_template: str, 
        max_items_to_retrieve: int=11) -> Dict[str, Any]:
    """
    Create a test case for the evaluation.
    :param attribute: the attribute to be queried (e.g., "stars", "awards")
    :param search_items: a list of items to search over (e.g., restaurants)
    :param query_template: a query template to be used
    :param candidate_template: a candidate template to be used
    :return: a tuple of query sentence, candidate sentences, and hit flags
    """
    # test if input parameters are valid
    assert attribute in ["stars", "awards"], "Invalid attribute"
    assert len(search_items) >= 1, "Provide at least one search items"

    # sample a target number 
    target_number = np.random.randint(6, 10)

    # associate a value for each search item
    # where only N of those are equal to or higher than the target number
    items_to_retrieve = np.random.randint(1, max_items_to_retrieve)

    hit_ratings = np.random.randint(target_number, 10, items_to_retrieve)
    miss_ratings = np.random.randint(
        1, target_number, len(search_items) - items_to_retrieve
    )
    
    item_ratings = np.concatenate([hit_ratings, miss_ratings])

    # define a boolean list to check if the rating is hit (should be returned)
    relevance_scores = [
        0 if rating < target_number else 1 for rating in item_ratings
    ]

    # create the query sentence
    query_sentence = query_template.format(target_number, attribute)
    target_number_lex = number_lexicalizer.number_to_words(target_number)
    query_sentence_lex = query_template.format(target_number_lex, attribute)

    candidates = [
        candidate_template.format(restaurant, rating)
        for restaurant, rating in zip(search_items, item_ratings)
    ]

    candidates_lex = [
        candidate_template.format(
            restaurant, 
            number_lexicalizer.number_to_words(rating))
        for restaurant, rating in zip(search_items, item_ratings)
    ]


    return { 
        "query": {
            "numeral": query_sentence,
            "lexical": query_sentence_lex
        }, 
        "candidates": {
            "numeral": candidates,
            "lexical": candidates_lex
        },
        "relevance_scores": relevance_scores
    }

In [110]:
t = create_test_case(
    "stars", 
    ["Taj Mahal", "Burger King", "McDonald's", "KFC", "Pizza Hut", "Subway", 
     "Greggs", "Pret A Manger", "Nando's", "Starbucks", "Costa"],
    templates["restaurants"]["query"][0],
    templates["restaurants"]["candidate"], 
    max_items_to_retrieve=5
)

t

{'query': {'numeral': 'Find restaurants that are rated with at least 6 stars',
  'lexical': 'Find restaurants that are rated with at least six stars'},
 'candidates': {'numeral': ['Taj Mahal restaurant has 9 stars rating.',
   'Burger King restaurant has 4 stars rating.',
   "McDonald's restaurant has 1 stars rating.",
   'KFC restaurant has 5 stars rating.',
   'Pizza Hut restaurant has 3 stars rating.',
   'Subway restaurant has 4 stars rating.',
   'Greggs restaurant has 3 stars rating.',
   'Pret A Manger restaurant has 2 stars rating.',
   "Nando's restaurant has 1 stars rating.",
   'Starbucks restaurant has 1 stars rating.',
   'Costa restaurant has 3 stars rating.'],
  'lexical': ['Taj Mahal restaurant has nine stars rating.',
   'Burger King restaurant has four stars rating.',
   "McDonald's restaurant has one stars rating.",
   'KFC restaurant has five stars rating.',
   'Pizza Hut restaurant has three stars rating.',
   'Subway restaurant has four stars rating.',
   'Greggs 

In [111]:
# code to evaluate a single test case and return precision and recall at 10
def evaluate_test_case(
        query: str, 
        candidates: List[str], 
        relevance_scores: List[int], 
        model: SentenceTransformer) -> Tuple[float, float]:
    """
    Evaluate a test case using a given model.
    :param query: a query sentence
    :param candidates: a list of candidate sentences
    :param relevance_score: a list of relevance scores
    :param model: a sentence transformer model to use
    :return: a tuple of precision and recall at 10
    """
    # encode the query and candidates
    query_embedding = model.encode(query)
    candidate_embeddings = model.encode(candidates)

    # compute the cosine similarity between the query and candidates
    similarity = np.dot(candidate_embeddings, query_embedding.T)

    # rank the candidates based on the similarity
    ranked_indices = np.argsort(similarity, axis=0)[::-1]

    # retrieve the relevance scores based on the ranking
    ranked_relevance = np.array(relevance_scores)[ranked_indices]

    # compute precision and recall at 10
    precision_at_10 = np.mean(ranked_relevance[:10])
    recall_at_10 = np.sum(ranked_relevance[:10]) / np.sum(relevance_scores)

    return precision_at_10, recall_at_10


In [117]:
t["relevance_scores"]

[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]

In [116]:
evaluate_test_case(
    t["query"]["numeral"], 
    t["candidates"]["numeral"], 
    t["relevance_scores"], 
    encoder_models["LaBSE"]
)

(0.1, 1.0)

In [107]:
relevance_to_emoji = {0: "✖", 1: "✅"}

In [114]:
# iterate over the templates and create test cases
search_items = {
    "restaurants": RESTAURANTS, 
    "movies": MOVIES
}


debug = False

model_name = "LaBSE"
model = encoder_models[model_name]

for k in range(10):
    #print(f"Experiment {k+1}")
    test_case_dict = defaultdict(list)

    print(f'Create test case {k + 1}')

    # generate test cases for each search need
    for search_need in templates:
        for query_template in templates[search_need]["query"]:
            test_case_dict[search_need].append(
                create_test_case(
                    templates[search_need]["attribute"], 
                    search_items[search_need], 
                    query_template, 
                    templates[search_need]["candidate"], 
                )
            )

    # evaluate the test cases
    precision_at_10_values, recall_at_10_values = [], []
    for search_need in test_case_dict:

        for t_case in test_case_dict[search_need]:
            precision_at_10, recall_at_10 = evaluate_test_case(
                t_case["query"]["numeral"], 
                t_case["candidates"]["numeral"], 
                t_case["relevance_scores"], 
                model
            )

            precision_at_10_values.append(precision_at_10)
            recall_at_10_values.append(recall_at_10)

            if debug:
                print(f"Query: {t_case['query']['numeral']}")
                print()
                for i, (candidate, relevance) in enumerate(
                    zip(t_case["candidates"]["numeral"], t_case["relevance_scores"])
                ):
                    print(f"Rank {i+1:>2}: {candidate:60} ", end="")
                    print(f"{relevance_to_emoji[relevance]:>3}")

                print()
                print(f"P@10: {precision_at_10:.3f}")
                print(f"R@10: {recall_at_10:.3f}")
                print()

    avg_precision_at_10 = np.mean(precision_at_10_values)
    avg_recall_at_10 = np.mean(recall_at_10_values)
    print(f"Model: {model_name}")
    print(f"Average P@10: {avg_precision_at_10:.3f}")
    print(f"Average R@10: {avg_recall_at_10:.3f}")
    print()

Create test case 1


IndexError: Replacement index 2 out of range for positional args tuple

ModuleNotFoundError: No module named 'bs4'

In [54]:
len(test_cases["restaurants"])

10

In [55]:
# evalauting the models
debug = False

for model_name, model in encoder_models.items():

    precision_at_10_values, recall_at_10_values = [], []

    # iterate over the test cases
    for t_case in test_cases["restaurants"]:
        # get the query and the candidates
        query_sent, candidates = t_case["query"], t_case["candidates"]
        relevance_scores = t_case["relevance_scores"]

        # encode the query and the candidates
        query_embedding = model.encode(query_sent).reshape(1, -1)
        candidate_embeddings = model.encode(candidates)

        # L2 normalize the embeddings
        faiss.normalize_L2(query_embedding)
        faiss.normalize_L2(candidate_embeddings)

        # get dimensions of the embeddings
        d = query_embedding.shape[1]

        # make search index
        index = faiss.IndexFlatIP(d)
        index.add(candidate_embeddings)

        # search the index
        k = 10

        D, I = index.search(query_embedding, k)

        if debug:

            print(f"Query: {query_sent}")
            print()

            for i in range(k):
                retrieved_sentence = candidates[I[0][i]]
                hit_or_not = relevance_scores[I[0][i]]
                print(f"Rank {i+1:>2}: {retrieved_sentence:60} ", end="")
                print(f"{D[0][i]:>5.4f} ", end="")
                print(f"{relevance_to_emoji[hit_or_not]:>3}")


        # compute precision and recall at 10
        retrieved_items = sum(np.array(relevance_scores)[I[0]])

        precision_at_10 = retrieved_items / k
        recall_at_10 = retrieved_items / np.sum(relevance_scores)

        precision_at_10_values.append(precision_at_10)
        recall_at_10_values.append(recall_at_10)

        if debug:

            print()
            print(f"Total num of relevant items: {np.sum(relevance_scores)}")
            print(f"Total num of retrieved items: {retrieved_items}")

            print()
            print(f"P@10: {precision_at_10:.3f}")
            print(f"R@10: {recall_at_10:.3f}")
            print()

    # caluclate the average precision and recall at 10
    avg_precision_at_10 = np.mean(precision_at_10_values)
    avg_recall_at_10 = np.mean(recall_at_10_values)
    print(f"Model: {model_name}")
    print(f"Average P@10: {avg_precision_at_10:.3f}")
    print(f"Average R@10: {avg_recall_at_10:.3f}")

    print("-" * 80)

Model: LaBSE
Average P@10: 0.130
Average R@10: 0.220
--------------------------------------------------------------------------------
Model: miniLM-L12
Average P@10: 0.180
Average R@10: 0.428
--------------------------------------------------------------------------------
Model: miniLM-L6
Average P@10: 0.240
Average R@10: 0.525
--------------------------------------------------------------------------------


KeyboardInterrupt: 

In [None]:
# sample target number
target_number = np.random.randint(6, 10)
query_template = "Show me restaurants with at least {} {} rating"
query_sentence = query_template.format(target_number)

# create a list of randome ratings between 1 and 10
# where only 10 of those are equal to or higher than the target number
hit_ratings = np.random.randint(target_number, 11, 10)
miss_ratings = np.random.randint(1, target_number, len(RESTAURANTS) - 10)
all_ratings = np.concatenate([hit_ratings, miss_ratings])

#ratings = np.random.randint(1, 11, len(RESTAURANTS))


candidate_template = "{} restaurant has {} stars rating."

# define a boolean list to check if the rating is hit (should be returned)
is_hit = [
    0 if rating < target_number else 1 for rating in all_ratings
]

candidate_sentences = [
        candidate_template.format(restaurant, rating)
        for restaurant, rating in zip(RESTAURANTS, all_ratings)
]

hit_to_emoji = {0: "✖", 1: "✅"}

IndexError: Replacement index 1 out of range for positional args tuple

In [None]:
assert sum(is_hit) == 10

In [None]:
for c, h in zip(candidate_sentences, is_hit):
    print(f"{c:60}  {hit_to_emoji[h]:>5}")

Holy Cannoli! restaurant has 8 stars rating.                      ✅
Sushi Samurai restaurant has 7 stars rating.                      ✅
Nacho Average Taco restaurant has 8 stars rating.                 ✅
Curry Up Now restaurant has 8 stars rating.                       ✅
Oui, Chef! restaurant has 10 stars rating.                        ✅
The Souvlaki Shack restaurant has 10 stars rating.                ✅
Kimchi Commandos restaurant has 9 stars rating.                   ✅
Pad Thai Guy restaurant has 10 stars rating.                      ✅
Tagine Time restaurant has 10 stars rating.                       ✅
Carnivore Carnival restaurant has 9 stars rating.                 ✅
Wok This Way restaurant has 4 stars rating.                       ✖
Hummus a Tune restaurant has 5 stars rating.                      ✖
Rumba Roti restaurant has 4 stars rating.                         ✖
Lederhosen Lounge restaurant has 3 stars rating.                  ✖
Mamma Mia's Pizzeria restaurant has 1 stars rati

In [157]:
debug = True

# encode the text query and candidate
query_embedding = model.encode(query_sentence).reshape(1, -1)
candidate_embeddings = model.encode(candidate_sentences)

# L2 normalize the embeddings
faiss.normalize_L2(query_embedding)
faiss.normalize_L2(candidate_embeddings)

# compute the cosine similarity using FAISS
d = model.get_sentence_embedding_dimension()

index = faiss.IndexFlatIP(d)
index.add(candidate_embeddings)

k = 10
D, I = index.search(query_embedding, k)


# print the result
# print query 
print(f"Query: {query_sentence}", end="\n\n")

if debug:
    for i in range(k):
        retrieved_sentence = candidate_sentences[I[0][i]]
        hit_or_not = is_hit[I[0][i]]
        print(f"Rank {i+1:>2}: {retrieved_sentence:60} {D[0][i]:>5.2f} ", end="")
        print(f"{hit_to_emoji[hit_or_not]:>3}")

# compute precision and recall at 10
precision_at_10 = sum([is_hit[i] for i in list(I[0][:10])]) / 10
recall_at_10 = sum([is_hit[i] for i in list(I[0][:10])]) / sum(is_hit)

print()
print(f"P@10: {precision_at_10:.2f}")
print(f"R@10: {recall_at_10:.2f}")

Query: Show me restaurants with at least 7 stars rating

Rank  1: Adriatic Appetites restaurant has 6 stars rating.             0.72   ✖
Rank  2: Oui, Chef! restaurant has 10 stars rating.                    0.72   ✅
Rank  3: Wok This Way restaurant has 4 stars rating.                   0.72   ✖
Rank  4: Grillin' & Chillin' restaurant has 5 stars rating.            0.72   ✖
Rank  5: Curry Up Now restaurant has 8 stars rating.                   0.72   ✅
Rank  6: The Souvlaki Shack restaurant has 10 stars rating.            0.71   ✅
Rank  7: Holy Cannoli! restaurant has 8 stars rating.                  0.71   ✅
Rank  8: The Fish Fryer restaurant has 5 stars rating.                 0.71   ✖
Rank  9: Carnivore Carnival restaurant has 9 stars rating.             0.71   ✅
Rank 10: Fon-Do or Fon-Don't restaurant has 5 stars rating.            0.71   ✖

P@10: 0.50
R@10: 0.50
