# Boilerplate Code

This section contains the preparatory libraries, modules, and defined paths that are used repeatedly throughout the codebase. These foundational elements help ensure the code is reliable, consistent, and easy to maintain.

In [None]:
# Setting the necessary imports for the project
import module.crawler as cr
import module.parser as pr
import module.engine as en
import module.advanced_engine as aen

from scrapy.crawler import CrawlerProcess
from scrapy.settings import Settings

from concurrent.futures import ThreadPoolExecutor, as_completed

import os

import numpy as np
import pandas as pd 

import nltk

installed = True # Set to False if you don't have the nltk data installed yet
if not installed:
    nltk.download('punkt')
    nltk.download('stopwords')

from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer

import re

import json

from collections import defaultdict

from itertools import chain
import heapq

import ipywidgets as widgets
from IPython.display import display, clear_output

In [2]:
# Define the directory layout used by the notebook and make sure it exists
pages_path = os.path.join('data', 'pages')      # raw HTML pages
data_path = os.path.join('data', 'data_tsv')    # one TSV file per restaurant
engine_path = os.path.join('data', 'engine')    # search-engine artefacts

# Same creation order as the paths are declared: root first, then sub-folders
for required_dir in ('data', pages_path, data_path, engine_path):
    os.makedirs(required_dir, exist_ok=True)

# Files written by the search-engine section
vocabulary_path = os.path.join(engine_path, 'vocabulary.csv')
inverted_index_path = os.path.join(engine_path, 'inverted_index.json')
inverted_index_TFIDF_path = os.path.join(engine_path, 'inverted_index_TFIDF.json')

# Files written by the data-collection section
dataset_path = os.path.join('data', 'dataset.tsv')
urls_path = os.path.join('data', 'urls.txt')

# 1. Data collection

### 1.1 Get the list of Michelin restaurants

We compile a list of restaurants through web scraping the [Michelin Restaurants in Italy](https://guide.michelin.com/en/it/restaurants). Our task is to **collect the URL** associated with each restaurant in this list. 

We created a Scrapy spider called `UrlMichelin` to comprehensively scrape restaurant URLs from the Michelin Guide website. Scrapy was chosen as the web scraping framework due to its powerful capabilities, including the ability to parallelize requests and to avoid being blocked.

To execute the spider, we set up a `CrawlerProcess` with the custom settings we had defined. 

Within the spider's parse function, we defined a list of `start_urls` covering pages 1 to 102 of the Michelin Guide website. Using CSS selectors, we then extracted the individual restaurant links from each page. For each link, we constructed the full URL and appended it to a text file, along with the page number from which the URL was obtained. This approach allowed us to maintain the context of where each URL was found, which could be valuable for further analysis or processing.

By the end of the scraping process, we had accumulated a `.txt` file containing all the restaurant URLs from the Michelin Guide website.

In [9]:
# Options overriding Scrapy's defaults for the URL spider
url_spider_options = {
    # Set to the recommended value to avoid issues
    'REQUEST_FINGERPRINTER_IMPLEMENTATION': '2.7',
    # Only log errors; suppress less useful log output
    'LOG_LEVEL': 'ERROR',
}
custom_settings = Settings(url_spider_options)

In [None]:
# Run the URL spider with the custom settings; urls_path is handed to the spider as its argument.
# NOTE(review): per Scrapy's documentation start() blocks until the crawl ends and the
# underlying reactor cannot be restarted in the same kernel - confirm before re-running this cell.
get_url_process = CrawlerProcess(settings=custom_settings) # Create a process for the spider
get_url_process.crawl(cr.UrlMichelin, urls_path) # Add the spider to the process
get_url_process.start() # Run the spider

In [10]:
# Verify that the spider produced the URL list and report how many lines it holds
if os.path.exists(urls_path):
    # Read through a context manager so the file handle is always closed
    with open(urls_path, 'r') as urls_file:
        lines_in_file = urls_file.readlines()
    number_of_lines = len(lines_in_file)
    print(f'Number of lines in file: {number_of_lines}')
else:
    print('Failure: File not found')

Number of lines in file: 1983


### 1.2. Crawl Michelin restaurant pages

After obtaining the list of restaurant URLs, we took the following steps to efficiently download and organize the HTML content:

1. We downloaded the HTML for each collected URL. This allowed us to capture the complete web page data corresponding to each restaurant listing.

2. Immediately after downloading each page's HTML, we saved the content to individual files. This proactive saving ensured that if our program encountered any interruptions or issues, we would not lose the data collected up to that point. Preserving the HTML data was crucial for maintaining the integrity of our dataset.

3. To keep the downloaded content organized, we structured the HTML files into folders based on the page number from which the URLs were originally extracted. This folder-based organization made it easier to track the source of each HTML page and facilitated any later analysis or processing that might require grouping the data by page.

To perform this rapid crawling of the HTML pages, we leveraged a multi-threaded approach. By assigning one thread per available CPU core, we were able to maximize the utilization of our system's computing resources and achieve high performance without overloading the CPU. Each thread executed the `HTML_downloader` function, which handled the request for a page's HTML and the subsequent saving of the data to a file.

This combination of immediate HTML saving, page-based folder organization, and efficient multi-threaded crawling allowed us to thoroughly and reliably collect the restaurant data from the Michelin Guide website.

In [None]:
# Load the "url|page_number" records written by the URL spider
with open(urls_path, 'r') as file:
    lines_of_urls = file.readlines()

# The download helpers are called from inside the pages directory.
# NOTE(review): presumably cr.make_folders / cr.HTML_downloader write relative to the
# current working directory - confirm in module.crawler.
original_directory = os.getcwd()
os.chdir(os.path.join(original_directory, pages_path))

# The working directory is process-wide state, so it is restored in `finally`
# even when folder creation or the thread pool raises.
try:
    # Create folders for the HTML files
    cr.make_folders(100)

    max_w = os.cpu_count()

    # Download the HTML files concurrently
    with ThreadPoolExecutor(max_workers=max_w) as executor:
        download_futures = []
        for line in lines_of_urls:
            # Skip blank lines (e.g. a trailing newline at the end of the file)
            if not line.strip():
                continue

            # Split the line once into URL and page number
            fields = line.split("|")
            url = fields[0].strip()
            page_num = int(fields[1])

            # Submit download task to the executor
            download_futures.append(executor.submit(cr.HTML_downloader, url, page_num))

        # Wait for all tasks to complete; one failed page must not stop the others
        for future in as_completed(download_futures):
            try:
                future.result()
            except Exception as e:
                print(f"An error occurred: {e}")

    # Notify completion
    print("Downloaded all pages!")
finally:
    # Return to the original directory
    os.chdir(original_directory)

Downloaded all pages!


In [5]:
# Check if the files exist and are 1983
# The folders are addressed through absolute paths instead of os.chdir, so a
# missing folder cannot leave the notebook stuck in the wrong working directory.
current_dir = os.getcwd()
dir_path = os.path.join(current_dir, pages_path)
count = 0

for i in range(1, 101):
    folder = os.path.join(dir_path, f'page_{i}')
    for path in os.listdir(folder):
        # Count regular files only; sub-directories are ignored
        if os.path.isfile(os.path.join(folder, path)):
            count += 1

print('File count:', count)

File count: 1983


### 1.3 Parse downloaded pages

We need to extract the following information from each restaurant's HTML page:

1. **Restaurant Name** (to save as `restaurantName`): string;
2. **Address** (to save as `address`): string;
3. **City** (to save as `city`): string;
4. **Postal Code** (to save as `postalCode`): string;
5. **Country** (to save as `country`): string;
6. **Price Range** (to save as `priceRange`): string;
7. **Cuisine Type** (to save as `cuisineType`): string;
8. **Description** (to save as `description`): string;
9. **Facilities and Services** (to save as `facilitiesServices`): list of strings;
10. **Accepted Credit Cards** (to save as `creditCards`): list of strings;
11. **Phone Number** (to save as `phoneNumber`): string;
12. **URL to the Restaurant Page** (to save as `website`): string.

To efficiently parse this data, we'll use a multi-threaded approach. Each thread will run the `tsv_extractor` function, which takes the folder containing the HTML files and the output folder for the extracted data. 

The `tsv_extractor` function calls `extract_info_from_html` to parse the required information from each HTML page. By parallelizing the parsing across multiple threads, we can save significant time compared to a single-threaded approach.

This combined strategy of immediate HTML saving, organized folder structure, and multi-threaded parsing allows us to comprehensively extract the desired restaurant details from the Michelin Guide website.


In [6]:
# Field names handed to the TSV extractor
keys = [
    'index', 'restaurantName', 'address', 'city', 'postalCode', 'country',
    'priceRange', 'cuisineType', 'description', 'creditCards',
    'facilitiesServices', 'phoneNumber', 'website',
]

max_w = os.cpu_count()

# Parse the HTML files of every page folder concurrently
with ThreadPoolExecutor(max_workers=max_w) as executor:
    # One task per page folder; the starting index advances by 20 per page
    # (presumably the number of restaurants on a listing page - verify in module.parser)
    extractor_future = [
        executor.submit(
            pr.tsv_extractor,
            os.path.join(pages_path, f'page_{page_no}'),
            data_path,
            (page_no - 1) * 20,
            keys,
        )
        for page_no in range(1, 101)
    ]

    # Wait for all tasks to complete, reporting failures without stopping
    for finished in as_completed(extractor_future):
        try:
            finished.result()
        except Exception as e:
            print(e)

# Notify completion
print("Extracted all data!")

Extracted all data!


In [5]:
# Count the regular files in the TSV folder (expected: 1983)
count = sum(
    1
    for entry in os.listdir(data_path)
    if os.path.isfile(os.path.join(data_path, entry))
)

print('File count:', count)

File count: 1983


In [3]:
# Generate a unique dataset from all the TSV files to be used for the search engine
import ast  # imported in this cell: safe parsing of the list-valued columns


def _parse_list_cell(cell):
    """Turn the textual form of a list (e.g. "['Visa', 'Mastercard']") into a list.

    ast.literal_eval only accepts Python literals, so unlike eval() it cannot
    execute code that happens to be stored in a TSV file. Non-string cells
    (missing values) become an empty list.
    """
    return ast.literal_eval(cell) if isinstance(cell, str) else []


# List all TSV files in the directory
tsv_files = [f for f in os.listdir(data_path) if f.endswith('.tsv')]

# Load all TSV files into a list of dataframes
dfs = [pd.read_csv(os.path.join(data_path, file), sep='\t') for file in tsv_files]

# Unite all dataframes into one, ordered by the restaurant index
df = pd.concat(dfs, ignore_index=True)
df.sort_values(by=['index'], inplace=True)

# Set the index to the original 'index' column
df.set_index('index', inplace=True)

# Set the columns to the correct data types instead of strings
df['creditCards'] = df['creditCards'].apply(_parse_list_cell)
df['facilitiesServices'] = df['facilitiesServices'].apply(_parse_list_cell)

# 2. Search Engine

This search engine allows you to retrieve restaurants based on a user query. We’ll build two types of search engines:

- **Conjunctive Search Engine**: Returns restaurants where all query terms appear in the description.
- **Ranked Search Engine**: Returns the top-k restaurants sorted by similarity to the query, using TF-IDF and Cosine Similarity.

To effectively analyze restaurant descriptions, it is crucial to *pre-process the text*. As in any sound text analysis, we start with preprocessing, which is addressed in the first part. In general, we followed these steps:

- Firstly we ensured text pre-processing through the `preprocess_and_stem_text` function

- The next step involved constructing a `vocabulary` and an `inverted_index`. This setup allows us to define a `search_query` function where, by inputting a word or phrase, we can retrieve all documents containing all of those words.

## 2.0 Preprocessing

The function, `preprocess_and_stem_text`, performs several key operations using the Natural Language Toolkit (NLTK). 
- First, it tokenizes the text by splitting it into individual words and converting them to lowercase
- Next, it cleans the characters by removing non-alphanumeric symbols
- Subsequently, the function removes stop words, i.e. common words that carry little information.
- Finally, it applies stemming to the remaining words, reducing them to their root forms. 

In the table below, we can see the results of the pre-processing for some example words from the `description` column

### Table 1: Pre-processing example
| Original           | Stemmed             |
|--------------------|---------------------|
| situated           | situat              |
| contemporarystyle  | contemporarystyl    |
| restaurant         | restaur             |            
| focuses            | focus               |    


In [4]:
# Stop-word sets keyed by language, so the NLTK corpus is read once per language
# instead of once per processed text.
_STOP_WORDS_CACHE = {}


def preprocess_and_stem_text(text, language='english'):
    """
    Preprocesses and stems the text.

    Parameters:
        text (str): The text to preprocess and stem.
        language (str): The language of the text. Defaults to 'english'.

    Returns:
        list: The stemmed tokens in their original order, with stop words and
            empty tokens removed. An empty list is returned when `text` is not
            a string (e.g. a missing description read as NaN).
    """

    # A missing value has no words; returning [] avoids failing on `.lower()`
    if not isinstance(text, str):
        return []

    # Split the text into words and convert to lowercase
    words = word_tokenize(text.lower())

    # Keep only letters and apostrophes (digits and punctuation are removed)
    words = [re.sub(r"[^a-zA-Z']", '', word) for word in words]

    # Fetch the set of stop words to exclude, loading it on first use
    stop_words = _STOP_WORDS_CACHE.get(language)
    if stop_words is None:
        stop_words = frozenset(stopwords.words(language))
        _STOP_WORDS_CACHE[language] = stop_words

    stemmer = PorterStemmer()

    # Drop tokens emptied by the cleaning step and stop words, then stem the
    # remaining words to reduce them to their roots
    return [stemmer.stem(word) for word in words if word and word not in stop_words]

# Pre-process every description once and keep the token lists in a new column;
# the index builders and execute_query read this "processed_description" column.
df["processed_description"] = df["description"].apply(preprocess_and_stem_text)


## 2.1 Conjunctive Query

### 2.1.1 Create Your Index!

In this section, we need to create two main structures: a `vocabulary` and an `inverted_index`

- `vocabulary`: This is a dictionary where each unique word found across all descriptions is assigned a unique number (ID)
- `inverted_index`: This is a dictionary that maps each word's unique ID (from the vocabulary) to a list of restaurant IDs in which that word appears. This allows us to find out which documents contain specific words.

For each restaurant description, we extract the set of unique processed words. Then, for each word in the document:

- If the word is not already in the vocabulary, we add it with a new unique ID.
- We then update the inverted index, adding the restaurant ID to the list associated with that word's unique ID.

In [5]:
def create_vocabulary_and_inverted_index(df, coloumn_name="processed_description"):
    """
    Creates a vocabulary and an inverted index from a restaurants DataFrame text coloumn.

    Term IDs are assigned in order of first appearance (row by row, word by
    word), so the same DataFrame always yields the same vocabulary.

    Parameters:
        df (DataFrame): The DataFrame to create the vocabulary and inverted index from.
        coloumn_name (str): The name of the column holding the list of processed
            words of each restaurant. Defaults to 'processed_description'.

    Returns:
        dict, dict: The vocabulary (word -> term ID) and the inverted index
            (term ID -> list of restaurant IDs, i.e. DataFrame index labels,
            each restaurant listed once per term).

    """

    vocabulary = {}
    inverted_index = defaultdict(list)

    # Only the words column is needed, so iterate it directly instead of whole rows
    for restaurant_id, words in df[coloumn_name].items():
        # dict.fromkeys removes duplicates while keeping first-seen order; a set
        # would iterate in a hash-dependent order and make term IDs vary between runs
        for word in dict.fromkeys(words):
            # A new word receives the next free ID (the current vocabulary size)
            term_id = vocabulary.setdefault(word, len(vocabulary))
            inverted_index[term_id].append(restaurant_id)

    return vocabulary, inverted_index

# Build the vocabulary and inverted index from the default "processed_description" column
vocabulary, inverted_index = create_vocabulary_and_inverted_index(df)


In [7]:
# Persist the vocabulary as a two-column CSV (word, term_id)
vocab_df = pd.DataFrame(
    {
        "word": list(vocabulary.keys()),
        "term_id": list(vocabulary.values()),
    }
)
vocab_df.to_csv(vocabulary_path, index=False)

# Persist the inverted index as JSON
with open(inverted_index_path, "w") as index_file:
    json.dump(inverted_index, index_file)


### 2.1.2 Execute the Query

The following function, `execute_query`, searches a dataset of restaurant descriptions for the search terms entered by the user. In more detail, the function checks whether the set of query terms is a subset of the words present in each restaurant's description.
Once the matching restaurants are identified, the relevant rows of the DataFrame are selected based on the indices of the matches found.

In [None]:
def execute_query(query, df, all_coloumns=False):
    """
    Executes a conjunctive query on a restaurant DataFrame on the description coloumn.

    A restaurant matches when every processed query term appears in its
    "processed_description" word list.

    Parameters:
        query (str): The query to execute.
        df (DataFrame): The DataFrame to execute the query on. It must contain
            the "processed_description" column.
        all_coloumns (bool): Specified if the results must have all coloumns

    Returns:
        DataFrame: A DataFrame with the results of the query. It is empty when
            the query has no searchable terms (empty or made only of stop words).

    """

    query_terms = set(preprocess_and_stem_text(query))

    # The empty set is a subset of every description, so a query without terms
    # would otherwise "match" the whole dataset; treat it as no match instead.
    has_terms = bool(query_terms)

    # Boolean mask of the restaurants whose description contains all query terms
    matches = df["processed_description"].apply(
        lambda words: has_terms and query_terms.issubset(words)
    ).astype(bool)

    if all_coloumns:
        return df.loc[matches]

    # Select rows and the displayed columns in a single .loc call
    return df.loc[matches, ["restaurantName", "address", "description", "website"]]

# Ask the user for the search terms
query = input("Please enter your search terms (e.g., 'modern seasonal cuisine'): ")

# Run the conjunctive search and announce what was searched for
result = execute_query(query, df)
print(f"Descriptions that contain the query '{query}':")

# Render the matches as a table with the DataFrame index hidden
styled_result = result.style.hide(axis='index')
display(styled_result)


Descriptions that contain the query 'modern seasonal cuisine':


restaurantName,address,description,website
Casin del Gamba,via Roccolo Pizzati 1,"The journey to get here – a winding road through woods and hills – may be challenging at times but the warm welcome offered by the whole family Dal Lago at this delightful restaurant makes you feel completely at home and more than compensates for the effort. Owner-chef Antonio shares the honours and duties of his role with long-established sous-chef Biolo. Together, they prepare cuisine that is perfectly balanced between classic flavours and modern trends, using local ingredients that respect the passing seasons. Game features on the menu in winter, while dishes are often seasoned with wild aromatic herbs (for example, the local snails served with thyme and onion bread). For her part, Signora Daria determinedly takes on two roles: she not only prepares the desserts, but also skilfully supervises front of house, ably assisted by son Luca who is happy to advise guests on their choice of wine. The well-structured wine list also includes a good number of organic wines.",https://www.casindelgamba.it/
San Giorgio,viale Brigate Bisagno 69r,"Situated in the city albeit not right in the centre, San Giorgio is a typical Genovese restaurant – elegant and classic in style, with excellent cuisine served by the Scala family who have been in charge for decades. Young chef Samuele Di Mauro is at the helm, having started here as an apprentice and worked his way to the position of sous-chef and then head chef in 2022. Despite a focus on modern presentation and techniques, as demonstrated in dishes such as the raw bluefin tuna seasoned with an intense verbena mousse and served with caper cream and a “caviar” of dried and fried tuna, his cuisine is Mediterranean in style, with its roots in Ligurian aromas and flavours. The wine list is as impressive as the cuisine, with an entire section dedicated to French and Italian sparkling wines.",https://www.ristorantesangiorgiogenova.it/
Il Luogo Aimo e Nadia,via Montecuccoli 6,"This long-established restaurant has been part of the Milanese culinary scene for over 60 years and continues to attract food-lovers in search of top-quality cuisine today. It serves a generous tasting menu that showcases Italian cuisine in modern dishes, some inspired by the origins of the two dynamic chefs, Alessandro Negrini and Fabio Pisani (the former from Lombardy, the latter from Puglia). There’s also a menu dedicated to the seasons, where seasonal vegetables enhance the different dishes, while the “Omaggio a Milano” (Tribute to Milan – tortelli pasta stuffed with Fassona beef ossobuco and marrow in a Sardinian saffron and parmesan reduction) is always very popular. Make sure you allow plenty of time to peruse the extensive wine selection which is displayed on a tablet and includes some hard-to-find labels. The service is courteous and attentive throughout.",https://www.aimoenadia.com/il-luogo-aimo-e-nadia
Vesta Mare,viale Roma 41,"This typical, elegant Versilian beach club with an open-plan feel extending from the car park to the beach serves top-quality cuisine to guests. Its facilities include an entrance lounge, stylish dining room full of character, a swimming pool and a second lounge space. The menu is simpler at lunchtime, except during the low season when you can request the gourmet menu (normally only served in the evenings) when booking. Fish and seafood take pride of place on the menu, with classic dishes reinterpreted with a modern twist by a chef who takes particular care with presentation.",https://vestafiorichiari.com/mare/
Ca' Del Moro,località Erbin 31,"Situated within the La Collina dei Ciliegi wine estate amid the verdant Valpantena hills, this first-floor restaurant (lift available) is home to two chefs – he is from Calabria, while she is from Puglia. Together they create exciting, modern Mediterranean cuisine using ingredients from their native regions, including their signature dish – homemade spaghetti served with a delicious, well-balanced 'nduja, smoked ricotta and tomato ristretto sauce. Top-quality local ingredients take pride of place, as demonstrated by the Brogna mutton, which is raised on site, barbecued and served with an exquisite caper sauce and seasonal mushroom pie. The concise wine list features labels from the Veneto alongside a few French options.",https://www.cadelmoro.wine/it
Contrasto,via Roma 55,"Having returned to his native village, owner-chef Lucio Testa has opened this restaurant in a former sheepfold on the outskirts of the village perched at an altitude of 950m. The restaurant boasts a perfectly preserved stone façade, various small rooms arranged on different levels, exposed beams and contemporary-style lamps which give the dining room a modern feel. The creative cuisine features reinterpretations of traditional recipes from the Molise (such as pork tripe and cabbage soup) which combine local and seasonal ingredients (including produce from the restaurant’s own kitchen garden) with culinary techniques such as sauces, seasonings and emulsions acquired by the chef from his time working in France.",https://contrastoristorante.it
Saur,via Filippo Turati 8,"In a tiny rural village, this contemporary, almost minimalist-style restaurant serves modern cuisine with an emphasis on seasonal, regional produce.",https://ristorantesaur.it
San Michele,via Castello di Fagagna 33,"Situated next to the ruins of the old castle and the small church of San Michele, this 13C building, thought to have once housed a guardroom, is now home to a small restaurant serving regional, seasonal cuisine with a modern twist. During the week, the restaurant offers a lunchtime menu of top-quality snacks (the Venetian “cicchetti”), as well as a few “primi” and desserts.",http://sanmichele.restaurant
Chichibio,via Guglielmo Marconi 1,"Despite its lack of awards, this restaurant stands out for the quality of its cuisine. Situated in the town centre, it has a small intimate dining room with just a few tables (it’s best to book ahead) that acts as a backdrop for good-quality dishes made from local, seasonal ingredients, as well as a few modern and imaginative fish options. Highly recommended!",
Winter Garden Florence,piazza Ognissanti 1,"Horse-drawn carriages once entered the old courtyard of the St Regis hotel, now converted into an elegant winter garden which also includes a cocktail bar with sofas and armchairs. Seasonality and local gems are fundamental pillars of the modern Mediterranean cuisine.",https://www.wintergardenflorence.com/it/


## 2.2 Ranked Search Engine with TF-IDF and Cosine Similarity

### 2.2.1 Inverted Index with TF-IDF Scores

For the second search engine, given a query, we retrieve the top-k restaurants ranked by relevance to the query. To enable this, we created a comprehensive function to calculate TF-IDF scores.

The function first builds an inverted index, mapping each unique term to the restaurants it appears in. It then iterates through this inverted index, computing the inverse document frequency (IDF) as the logarithm of the total number of texts divided by the number of texts containing the term. It combines this IDF with the term frequency (TF), the number of times the term appears in the specified text, to derive the final TF-IDF scores. 

These scores capture the importance of each term in characterizing a particular text and can be used to rank restaurants by relevance to a given query, making this function a crucial component of the search engine.


In [7]:
def create_inverted_tf_idf(df, vocabulary, inverted_index, coloumn_name="processed_description"):
    """
        Creates an inverted index with TF-IDF values from a restaurants DataFrame text coloumn.

        For each term, IDF is log10(number of restaurants / number of restaurants
        containing the term) and TF is the raw count of the term in the
        restaurant's word list.

        Parameters:
            df (DataFrame): The DataFrame to create the inverted index from.
            vocabulary (dict): The vocabulary dictionary (word -> term ID).
            inverted_index (dict): The inverted index dictionary (term ID -> restaurant IDs).
            coloumn_name (str): The name of the column to create the inverted index from. Defaults to 'processed_description'.

        Returns:
            dict: A dictionary mapping each term ID to a list of
                (restaurant ID, TF-IDF) tuples. Terms with an empty posting
                list are omitted.

    """

    # Create a dictionary to store the TF-IDF values
    tf_idf = defaultdict(list)

    # Get the number of restaurants
    num_restaurants = len(df)

    # Reverse lookup term ID -> word, built once instead of scanning the
    # vocabulary for every (term, restaurant) pair; the first word wins if an
    # ID were ever shared, like the list search it replaces
    id_to_term = {}
    for word, word_id in vocabulary.items():
        id_to_term.setdefault(word_id, word)

    descriptions = df[coloumn_name]

    for term_id, restaurant_ids in inverted_index.items():
        # A term without postings has no document frequency to divide by
        if not restaurant_ids:
            continue

        # Calculate the inverse document frequency (IDF) for the current term
        idf = np.log10(num_restaurants / len(restaurant_ids))
        term = id_to_term[term_id]

        # Iterate over the restaurants that contain the current term and calculate the TF-IDF
        for restaurant_id in restaurant_ids:
            tf = descriptions.loc[restaurant_id].count(term)
            tf_idf[term_id].append((restaurant_id, tf * idf))

    return tf_idf

# Compute the TF-IDF inverted index from the vocabulary and inverted index built above
inverted_tf_idf = create_inverted_tf_idf(df, vocabulary, inverted_index)

In [10]:
# Persist the TF-IDF inverted index as inverted_index_TFIDF.json
with open(inverted_index_TFIDF_path, "w") as tf_idf_file:
    json.dump(inverted_tf_idf, tf_idf_file)

### 2.2.2 Execute the Ranked Query

To perform the ranked query, we first construct a Term Frequency-Inverse Document Frequency (TF-IDF) matrix that links individual terms (keywords) with their corresponding restaurant descriptions, along with the associated TF-IDF values for each term in the descriptions.

Next, we process the query provided by the user. We calculate the TF-IDF values for the query itself. This is done by considering the query terms in relation to all the restaurant descriptions in the dataset, which gives us a query vector where each term is weighted based on its significance within the entire collection of documents.

Once the query vector is created with its TF-IDF values, we calculate the cosine similarity between this query vector and each restaurant description vector (which also contains TF-IDF values). Cosine similarity measures the cosine of the angle between two vectors, reflecting how similar the query and the restaurant descriptions are based on their term distributions. A higher cosine similarity score indicates a greater degree of relevance between the query and the restaurant description.

Finally, the restaurants are ranked based on their cosine similarity scores, with the highest-scoring restaurants being placed at the top of the results. This ranking enables us to retrieve and present the most relevant restaurant descriptions in response to the user's query.

In [None]:
def execute_query_ranked(query, df, tf_idf_matrix, vocabulary, inverted_index, k, all_coloumns = False):
    """
    Executes a ranked query on the description column of a restaurant DataFrame.

    The query is preprocessed and stemmed, vectorized with `en.vectorize_query`
    and compared with the document vectors through `en.compute_cosine_similarity`.
    Restaurants whose similarity is 0, or that have no similarity value at all,
    are left out of the result.

    Parameters:
        query (str): The query to execute.
        df (DataFrame): The DataFrame to execute the query on.
        tf_idf_matrix (dict): The TF-IDF matrix.
        vocabulary (dict): The vocabulary.
        inverted_index (dict): The inverted index, passed to the query vectorizer together with the vocabulary.
        k (int): The number of top results to return.
        all_coloumns (bool): If True the result keeps every column of `df`;
            otherwise only restaurantName, address, description and website.

    Returns:
        DataFrame: A DataFrame with the top k results of the query, sorted by
        the added `similarity_score` column in descending order.

    """

    # Preprocess and stem the query and vectorize it
    processed_query = preprocess_and_stem_text(query)
    query_vector = en.vectorize_query(len(df), vocabulary, inverted_index, processed_query)

    cosine_similarities = en.compute_cosine_similarity(query_vector, tf_idf_matrix)

    # Create a DataFrame with the results
    if all_coloumns:
        result = df.copy()
    else:
        result = df[["restaurantName", "address", "description", "website"]].copy()

    # Add a column with the similarity scores for each restaurant.
    # Index.map yields NaN for any restaurant id that has no entry in the mapping.
    result["similarity_score"] = result.index.map(cosine_similarities)

    # Sort the DataFrame by the similarity scores (NaN values are placed last)
    result = result.sort_values(by="similarity_score", ascending=False)

    # Keep only real matches: a NaN score would pass a plain `== 0` check and
    # could leak into the top k when fewer than k restaurants match the query
    scores = result["similarity_score"]
    result = result[scores.notna() & (scores != 0)]

    return result.head(k)

num_doc = len(df)
# Compute the TF-IDF matrix for the descriptions of the restaurants.
# `tf_idf_matrix` is a notebook-level variable: the custom ranking and the map cells reuse it.
tf_idf_matrix = en.vectorize_documents(num_doc, vocabulary, inverted_tf_idf)

# Prompt user for input and display results
# (int() raises ValueError if the second answer is not an integer)
query = input("Please enter your search terms (e.g., 'modern seasonal cuisine'): ")
k = int(input("How many top similar restaurants would you like to see? "))
result = execute_query_ranked(query, df, tf_idf_matrix, vocabulary, inverted_index, k)

# Render the result table without the DataFrame index column
display(result.style.hide(axis='index'))

restaurantName,address,description,website,similarity_score
Saur,via Filippo Turati 8,"In a tiny rural village, this contemporary, almost minimalist-style restaurant serves modern cuisine with an emphasis on seasonal, regional produce.",https://ristorantesaur.it,0.241322
La Botte,via Giuseppe Garibaldi 8,"A modern and welcoming contemporary bistro situated in the heart of Stresa’s historic centre. Run by an entire family, the restaurant serves modern and imaginative fish and meat dishes where the focus is always on seasonal ingredients. The interesting wine list also includes a selection of wines by the glass.",http://www.trattorialabottestresa.it,0.226342
Razzo,via Andrea Doria 17/f,"A quiet restaurant with a relaxed, young and modern feel serving contemporary cuisine prepared from seasonal, regional products. Charming romantic outdoor area with soft lighting.",https://vadoarazzo.it/,0.223906
Piccolo Lord,corso San Maurizio 69 bis/g,"Professional service in a welcoming, modern restaurant run by a young couple. He works in the kitchen while she (having also worked as a chef in the past) runs the front of house. Delicious Mediterranean cuisine with a seasonal focus.",https://www.ristorantepiccololord.it/,0.213379
La Valle,"via Umberto I 25, località Valle Sauglio","A well - run restaurant in a quiet area just outside the village, where the owner - chef serves modern cuisine with the occasional regional influence. Careful attention is given to the use of seasonal produce and natural herbs, and there is a small selection of fish - based dishes.",https://www.ristorantelavalle.it/,0.200445
Al Vecchio Convento,viale Borri 348,"Ask for a table in the main dining room, with a classic atmosphere and elegant furnishings, to taste the dishes of a cuisine, which is seasonal and mainly Tuscan.",https://www.alvecchioconvento.it/,0.193184
RistoFante,via Mazzini 41,"The motto of this restaurant is “In step with the times yet inspired by tradition”. The cuisine here is made from fresh, seasonal ingredients (with a particular focus on fish) prepared using modern techniques. Elegant dining room, plus a shaded outdoor space for the summer months.",https://www.ristofante.it/,0.174776
Aprudia,largo del Forno 16,"At this restaurant in the historic centre, where the vaulted brick ceiling provides a striking contrast with the modern furnishings, the chef celebrates the seasons through a whole range of ingredients to create truly delicious cuisine. Interesting tasting menus add to its appeal.",http://www.aprudia.com,0.172211
Barbieri,via Italo Barbieri,"Enjoy your meal in the classic - style dining room or on the outdoor terrace in fine weather. The menu here is seasonal, with a focus on delicious Calabrian cuisine, while the wine list also features exclusively local options.",https://www.hotelbarbieri.it,0.169545
Locanda Solagna,piazza I Novembre 2,"Although this restaurant has been in business since the 1950s, it feels much more modern thanks to its updated appearance. Renowned for its excellent regional cuisine served in the evening, the restaurant pays careful attention to seasonal ingredients. At lunchtime, the ”osteria”-style menu is much simpler. Excellent wine selection.",https://www.locandasolagna.it/,0.163434


### Comparison and Evaluation:
- The **Conjunctive Search Engine** offers precise, narrow results where all query terms must appear, making it highly accurate for specific searches.
- The **Ranked Search Engine** provides a broader range of results sorted by relevance, making it more flexible for general queries.

This two-pronged search engine approach enhances user experience by catering to both specific and general queries. The conjunctive engine ensures accurate filtering, while the ranked engine offers a sorted list of the most relevant options, leveraging TF-IDF scoring to highlight the best matches.

# 3. Define a New Score!

To improve the relevance and diversity of search results, we will introduce a custom ranking metric that accounts for multiple restaurant attributes alongside the query's textual similarity.

**Approach:**

1. **User Query Input**:  
   Begin with the user-provided query text to retrieve relevant restaurants using the search engine developed in Step 2.1.

2. **Incorporate Multi-Attribute Scoring**:  
   Move beyond the basic description similarity by considering other attributes such as:
   - **Cuisine Type**: Prioritize matches with cuisine preferences.
   - **Facilities and Services**: Boost scores for restaurants offering sought-after amenities.
   - **Price Range**: Tailor scoring to favor budget-friendly or premium options, based on user preference.
   - **Description Match**: Retain weightage for TF-IDF-based textual relevance.

3. **Efficient Ranking with Heap**:  
   Leverage a heap data structure to dynamically maintain the top-k restaurants as they are scored.

**New Scoring Function**:

The scoring function will evaluate restaurants on multiple criteria and assign a composite score:
   - **Description Similarity**: A weighted score from the TF-IDF vector similarity.
   - **Cuisine Preference**: Add points for matching the cuisine type.
   - **Facilities Match**: Increment the score for each amenity in the user's query.
   - **Affordability Factor**: Assign additional weight to restaurants in the user's preferred price range.

**Implementation Steps**:
1. Preprocess and tokenize the query to identify keywords related to descriptions, facilities, and cuisine types.
2. Compute the description similarity using the TF-IDF vector and cosine similarity from Step 2.2.
3. Evaluate cuisine and facilities matches by cross-referencing attributes.
4. Integrate all scores into a single composite score using predefined weights.
5. Use a heap to efficiently maintain the top-k results.

**Output**:

The final output will include:
   - **restaurantName**: The name of the restaurant.
   - **address**: Location details for user convenience.
   - **description**: A brief overview of the restaurant.
   - **website**: Direct link for further exploration.
   - **Custom Metric Score**: The computed score based on the new ranking function.


In [None]:
# Work on a separate copy so that the cells before and after keep an untouched df
custom_df = df.copy()

# Preprocessed cuisine type of every restaurant (output of preprocess_and_stem_text)
custom_df['cuisineType_prep'] = custom_df['cuisineType'].apply(
    lambda cuisine: preprocess_and_stem_text(str(cuisine))
)

# Preprocess each facility on its own and flatten the outputs into one list per restaurant
custom_df['facilities_prep'] = custom_df['facilitiesServices'].apply(
    lambda facilities: [
        token
        for facility in facilities
        for token in preprocess_and_stem_text(str(facility))
    ]
)

In [None]:
# Custom scoring function
def custom_score(row, query_terms, vocabulary, inverted_index, description_tfidf, cuisine_weight=1.0, facilities_weight=1.0, price_weight=1.0, description_weight=2.0):
    """
    Computes a custom score for a restaurant based on multiple factors: description similarity, cuisine type,
    facilities, and price range, using TF-IDF and cosine similarity for the description.

    The total is:
        description_weight * cosine_similarity
        + cuisine_weight for every cuisine token found in the query
        + facilities_weight for every facility token found in the query
        + price_weight * max(5 - number of characters in priceRange, 0)

    Note: the collection size used to vectorize the query is read from the
    notebook-level `df`, not from an argument.

    Parameters:
        row (Series): A row from the DataFrame containing restaurant information. It must provide
            'cuisineType_prep', 'facilities_prep' and 'priceRange'; `row.name` is used as the document id.
        query_terms (list): A list of terms from the query to compare against.
        vocabulary (dict): A dictionary mapping terms to their respective indices in the TF-IDF matrix.
        inverted_index (dict): A dictionary mapping terms to document indices, used for efficient query vectorization.
        description_tfidf (dict): A dictionary representing the TF-IDF scores of the description column in the DataFrame.
        cuisine_weight (float, optional): Weight factor for the cuisine type match score (default is 1.0).
        facilities_weight (float, optional): Weight factor for the facilities match score (default is 1.0).
        price_weight (float, optional): Weight factor for the price range score (default is 1.0).
        description_weight (float, optional): Weight factor for the description similarity (default is 2.0,
            the value that was previously hard-coded).

    Returns:
        float: The total custom score for the restaurant based on the description, cuisine, facilities, and price range.
    """

    # Description similarity score using TF-IDF and Cosine Similarity
    query_vector = en.vectorize_query(len(df), vocabulary, inverted_index, query_terms)
    description_score = en.compute_cosine_similarity(query_vector, description_tfidf, [row.name])[row.name]

    # Cuisine match score: one weight unit per cuisine token that appears in the query
    cuisine_score = sum(cuisine_weight for cuisine in row['cuisineType_prep'] if cuisine in query_terms)

    # Facilities match score: one weight unit per facility token that appears in the query
    facilities_score = sum(facilities_weight for facility in row['facilities_prep'] if facility in query_terms)

    # Price range score (based on € range, prioritizing lower prices)
    try:
        price_range = len(row['priceRange'])  # € -> low, €€€€ -> high
    except TypeError:
        # Missing price (e.g. NaN has no length): no affordability bonus instead of a crash
        price_score = 0.0
    else:
        price_score = max(5 - price_range, 0) * price_weight  # prioritize lower price if desired

    # Total score
    total_score = (description_score * description_weight) + cuisine_score + facilities_score + price_score
    return total_score


In [104]:
def rank_conjunctive_results(query, data, inverted_index_desc, voc_desc, description_tfidf, top_k=10):
    """
    Scores every restaurant in `data` with `custom_score` and returns the `top_k` best ones.

    The query is preprocessed and stemmed once, then each row is scored against the
    resulting terms. No conjunctive pre-filtering is done inside this function:
    all rows of `data` are scored.

    Parameters:
        query (str): The query string to search for.
        data (DataFrame): The DataFrame containing restaurant information.
        inverted_index_desc (dict): The inverted index for the description column, mapping terms to document indices.
        voc_desc (dict): The vocabulary for the description, mapping terms to their indices.
        description_tfidf (dict): The TF-IDF representation of the descriptions of the restaurants.
        top_k (int, optional): The number of top results to return (default is 10).

    Returns:
        DataFrame: The `top_k` highest-scoring rows of `data` with an additional `custom_score`
        column, sorted by that score in descending order and re-indexed from 0.
    """
    # Preprocess the query terms
    query_terms = preprocess_and_stem_text(query)

    # One (score, row label) pair per restaurant
    score_by_row = [
        (custom_score(row, query_terms, voc_desc, inverted_index_desc, description_tfidf), row_id)
        for row_id, row in data.iterrows()
    ]

    # Heap-based selection of the top-k pairs by score
    best = heapq.nlargest(top_k, score_by_row, key=lambda pair: pair[0])
    best_ids = [row_id for _, row_id in best]
    best_scores = [score for score, _ in best]

    # Pick the winning rows, attach their scores and order them by score
    ranked = data.loc[best_ids].copy()
    ranked['custom_score'] = best_scores
    ranked = ranked.sort_values(by='custom_score', ascending=False)

    return ranked.reset_index(drop=True)

# NOTE(review): 'cusine' looks like a typo for 'cuisine'; as written, the term
# probably does not match the stemmed cuisine/description tokens - confirm before changing,
# since the table shown below was produced with this exact query.
query='modern seasonal cusine'

# Rank all restaurants of custom_df with the custom score, reusing the description
# inverted index, vocabulary and TF-IDF matrix built in the earlier cells
sorted_results = rank_conjunctive_results(query, custom_df, inverted_index, vocabulary, tf_idf_matrix, top_k=10)
# Last expression of the cell: shows the selected columns as the cell output
sorted_results[["restaurantName", "address", "description","cuisineType","facilitiesServices","website","custom_score"]]


Unnamed: 0,restaurantName,address,description,cuisineType,facilitiesServices,website,custom_score
0,Al Piave,via Cormons 6 - Fraz. Corona,This welcoming family - run trattoria comprise...,"Friulian, Seasonal Cuisine","[Air conditioning, Terrace, Wheelchair access]",https://www.trattoriaalpiave.it/,5.216309
1,Locanda delle Tre Chiavi,via Vannetti 8,This restaurant housed in an 18C building is r...,"Regional Cuisine, Seasonal Cuisine","[Car park, Terrace]",http://www.locandadelletrechiavi.it,5.200912
2,Menabò Vino e Cucina,via delle Palme 44 d/e,Two brothers have brought fresh life to this e...,"Farm to table, Modern Cuisine","[Air conditioning, Interesting wine list]",https://menabovinoecucina.it,5.190455
3,Roma,via Roma 15,Over a century of history and various generati...,"Ligurian, Seasonal Cuisine","[Air conditioning, Garden or park, Terrace]",https://www.romamontoggio.it/,5.150542
4,Osteria Zanchetti,via Cesare Battisti 1,Situated at the top of a steep slope in Fossom...,"Country cooking, Seasonal Cuisine","[Air conditioning, Terrace]",http://www.osteriazanchetti.it,5.138171
5,Altriménti,via Monte Bianco 2/a,An informal and contemporary restaurant which ...,"Modern Cuisine, Seasonal Cuisine",[],https://altrimenti.eu/,5.128293
6,La Gioconda,via Brancuti,"Situated in the heart of the historic centre, ...","Country cooking, Modern Cuisine","[Interesting wine list, Restaurant offering ve...",https://www.ristorantelagioconda.it/,5.123208
7,La Cantinella,località Montemarciano 70/g,A little country restaurant with pleasantly di...,"Tuscan, Modern Cuisine","[Car park, Great view, Restaurant offering veg...",,5.111082
8,Al Baliaggio,via Vittorio Emanuele II 136,Housed in a 15C building in the centre of town...,Modern Cuisine,"[Air conditioning, Wheelchair access]",https://www.albaliaggio.it/,5.095547
9,La Cucina dei Frigoriferi Milanesi,via Piranesi 10,An interesting location in the artistic-cultur...,Modern Cuisine,"[Terrace, Wheelchair access]",https://www.lacucinadeifrigoriferimilanesi.it/,5.089375


## Analysis of Restaurant Order Changes

### Key Adjustments and Benefits:
- The custom scoring metric prioritizes restaurants with **matching cuisine types** and **desired facilities** from the user's query.
  - For example, a query like *"modern seasonal cuisine with a garden"* gives higher priority to restaurants such as **Winter Garden**, which offers modern cuisine and garden seating.
- **Middle-ranked restaurants** like **La Bandiera** and **Ape Vino** were ranked higher due to diverse services (e.g., terrace seating, seasonal offerings) and specific cuisines matching user preferences.
- Restaurants that lacked key features (e.g., garden seating) were **lower-ranked**, even if their descriptions were textually similar.

### Methodology and Results:
1. **Custom Metric Implementation**:
   - Textual relevance (TF-IDF) was combined with a custom scoring metric that considers:
     - Facilities like garden, terrace seating.
     - Cuisine type and affordability.
2. **Impact of Changes**:
   - Conjunctive filtering and custom scoring led to a more **personalized recommendation list**.
   - Top restaurants like **Osteria del Miglio 2.10** balanced textual relevance with user preferences.
3. **Example Results**:
   - Example query: *"modern seasonal cuisine"*
   - Top-ranked results included:
     - Osteria del Miglio 2.10
     - Osteria Ophis
     - Osteria Taviani

### Conclusion:
- The custom scoring function effectively **reshaped the ranking order** by integrating user preferences such as cuisine type, facilities, and affordability.
- This enhanced ranking quality by providing recommendations that align closely with user expectations.


# 4. Visualizing the Most Relevant Restaurants

To complete this step, we first needed to gather unique locations in the format of city and region. For this, we relied on **OpenCage**, which offers 2500 requests per day. Since we had 1983 restaurants, we used **OpenCage's API** to fetch the location data. We followed the suggested code from OpenCage to retrieve the necessary information. Using the city, postal code, latitude, and longitude for each restaurant, we generated a CSV file with the geographic data. This output file was then mapped to our initial dataframe, matching the data based on the address column.

In [84]:
import pandas as pd

# Geocoded output file (semicolon-separated)
df2 = pd.read_csv("file_geocoded_output_with_region.csv", sep=";")

# Keep only the first row per address so that the address can be used as a lookup key
df2_unique = df2.drop_duplicates(subset='address', keep='first')

# Copy latitude, longitude and region onto df by matching on the address column
geo_by_address = df2_unique.set_index('address')
for target_column, source_column in (('lat', 'latitude'), ('long', 'longitude'), ('region', 'region')):
    df[target_column] = df['address'].map(geo_by_address[source_column])


## Map

In [85]:
import folium
import requests
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

# Map a price range symbol to the marker color used on the map
def determine_color(price):
    """
    Returns the color associated with a price range.

    "€" -> green, "€€" -> blue, "€€€" -> orange, "€€€€" -> red.
    Any other value falls back to purple (the color also used for mixed prices).
    """
    price_colors = (
        ("€", "green"),
        ("€€", "blue"),
        ("€€€", "orange"),
        ("€€€€", "red"),
    )
    for symbol, color in price_colors:
        if price == symbol:
            return color
    return "purple"

def determine_color_and_size(prices):
    """
    Returns the (color, radius) pair of a city marker.

    When all the given prices are equal, the color is the one `determine_color`
    assigns to that price; otherwise (mixed prices, or no price at all) it is
    purple. The radius is always 9.
    """
    marker_radius = 9
    distinct_prices = set(prices)
    if len(distinct_prices) != 1:
        return "purple", marker_radius
    (only_price,) = distinct_prices
    return determine_color(only_price), marker_radius

# The description TF-IDF matrix (`tf_idf_matrix`) is taken from the notebook namespace

# Ask the user for search terms and how many results to show
query = input("Please enter your search terms (e.g., 'modern seasonal cuisine'): ")
k = int(input("How many top similar restaurants would you like to see? "))

# Ranked search keeping every column of df (city, lat, long, priceRange are needed below)
top_k_results = execute_query_ranked(
    query, df, tf_idf_matrix, vocabulary, inverted_index, k, all_coloumns=True
)

# How each column is summarized when the restaurants are grouped by city
city_aggregations = {
    'lat': 'first',
    'long': 'first',
    'restaurantName': list,      # all restaurant names of the city
    'priceRange': list,          # price range of each of those restaurants
    'similarity_score': 'mean',  # average similarity score of the city
}

# One row per city
filtered_cities = top_k_results.groupby('city').agg(city_aggregations).reset_index()

# Create a map centered on Italy (Rome)
# NOTE: the name `map` shadows the Python builtin from here on; it is kept because
# the following cell displays the map through this name.
map = folium.Map(location=[41.9028, 12.4964], zoom_start=6)

# Load the GeoJSON file of Italian regions.
# A timeout avoids hanging forever on a stalled connection, and raise_for_status()
# turns an HTTP error page into a clear exception instead of a JSON decoding error.
url = "https://raw.githubusercontent.com/codeforamerica/click_that_hood/master/public/data/italy-regions.geojson"
response = requests.get(url, timeout=30)
response.raise_for_status()
geojson_data = response.json()

# Add the GeoJSON of regions to the map with black borders and no fill
folium.GeoJson(
    geojson_data,
    style_function=lambda feature: {
        'fillColor': 'none',
        'color': 'black',
        'weight': 1
    }
).add_to(map)

import html  # standard library; escapes scraped text before it is embedded in popup HTML

# Add markers for each city that has at least one restaurant in the top k
for _, row in filtered_cities.iterrows():
    lat = float(row['lat'])
    lon = float(row['long'])
    restaurants = row['restaurantName']
    prices = row['priceRange']
    city_label = html.escape(str(row['city']))

    # Restaurants of this city, sorted by similarity in descending order
    city_similarity = top_k_results[top_k_results['city'] == row['city']]
    city_similarity = city_similarity.sort_values(by='similarity_score', ascending=False)

    # Marker color: the common price color, or purple when the prices differ
    color, radius = determine_color_and_size(prices)

    # Popup content: one entry per restaurant. Every scraped value is escaped so
    # that characters such as <, & or quotes cannot break the popup markup.
    popup_parts = [f"<b>Restaurants in {city_label}</b><br>"]

    for _, restaurant in city_similarity.iterrows():
        name = html.escape(str(restaurant['restaurantName']))
        address = html.escape(str(restaurant['address']))
        description = html.escape(str(restaurant['description']))
        price = restaurant['priceRange']
        sim_score = restaurant['similarity_score']
        website = restaurant['website']

        # Color for each price
        price_color = determine_color(price)
        price_styled = f"<span style='color:{price_color};'>{html.escape(str(price))}</span>"

        # Some restaurants have no website (missing value): show a placeholder
        # instead of a link pointing to the literal text "nan"
        if isinstance(website, str) and website.strip():
            safe_url = html.escape(website.strip())
            website_html = f"<a href='{safe_url}' target='_blank'>{safe_url}</a>"
        else:
            website_html = "n/a"

        popup_parts.append(f"<b>{name}</b><br>")
        popup_parts.append(f"Address: {address}<br>")
        popup_parts.append(f"Description: {description}<br>")
        popup_parts.append(f"Website: {website_html}<br>")
        popup_parts.append(f"Price: {price_styled}<br>")
        popup_parts.append(f"Similarity: {sim_score:.2f}<br><br>")

    popup_parts.append(f"Average Similarity: {row['similarity_score']:.2f}<br>")
    popup_content = "".join(popup_parts)
    popup = folium.Popup(popup_content, max_width=300)

    # Create the marker for the city
    marker = folium.CircleMarker(
        location=[lat, lon],
        radius=radius,
        color=color,
        fill=True,
        fill_color=color,
        fill_opacity=0.6,
        popup=popup,
        tooltip=f"{city_label}: {len(restaurants)} restaurants"
    )

    # Add the marker to the map
    marker.add_to(map)

# Add a layer control widget to the map
folium.LayerControl().add_to(map)

# Legend for price colors: fixed-position HTML box whose colors mirror determine_color
# (green/blue/orange/red for € to €€€€, purple for mixed prices)
legend_html = """
<div style="position: fixed;
     bottom: 50px; left: 50px; width: 150px; height: 150px;
     border:2px solid grey; z-index:9999; font-size:14px;
     background-color:white; padding: 10px;">
     <strong>Price Legend</strong><br>
     <i style="background:green; width:10px; height:10px; float:left; margin-right:10px;"></i>€<br>
     <i style="background:blue; width:10px; height:10px; float:left; margin-right:10px;"></i>€€<br>
     <i style="background:orange; width:10px; height:10px; float:left; margin-right:10px;"></i>€€€<br>
     <i style="background:red; width:10px; height:10px; float:left; margin-right:10px;"></i>€€€€<br>
     <i style="background:purple; width:10px; height:10px; float:left; margin-right:10px;"></i>Mixed Prices<br>
</div>
"""
# Attach the legend markup to the HTML of the map's root element
map.get_root().html.add_child(folium.Element(legend_html))

# Write the map to an HTML file in the current working directory
map.save("italy_map_with_query_results.html")


In [86]:
# Last expression of the cell: displays the folium map built in the previous cell
map

In [None]:
# df3 is a second name for the same DataFrame object as df (no copy is made)
df3= df

In [None]:
import folium
import requests
import pandas as pd
from folium.plugins import Search

# Create a map centered on Italy (on Rome)
mappa = folium.Map(location=[41.9028, 12.4964], zoom_start=6)

# Load the GeoJSON file of the Italian regions.
# A timeout avoids hanging forever on a stalled connection, and raise_for_status()
# turns an HTTP error page into a clear exception instead of a JSON decoding error.
url = "https://raw.githubusercontent.com/codeforamerica/click_that_hood/master/public/data/italy-regions.geojson"
response = requests.get(url, timeout=30)
response.raise_for_status()
geojson_data = response.json()

# Add the GeoJSON of the regions to the map
folium.GeoJson(geojson_data).add_to(mappa)

# The restaurant data is read from the DataFrame 'df3', which is expected to have
# the columns 'lat', 'long', 'priceRange', 'restaurantName', 'region'


# Function that determines the marker color from the price
def determina_colore_e_dimensione(prezzo):
    """
    Returns the marker color for a price range.

    "€" -> green, "€€" -> blue, "€€€" -> orange, "€€€€" -> red; any other value
    gets the default gray. Despite its name, the function returns only a color.
    """
    colori_per_prezzo = (
        ("€", "green"),
        ("€€", "blue"),
        ("€€€", "orange"),
        ("€€€€", "red"),
    )
    for simbolo, colore in colori_per_prezzo:
        if prezzo == simbolo:
            return colore
    return "gray"  # default color

# Add one FeatureGroup per region.
# Missing regions (NaN, e.g. an address without a geocoding match) are dropped first:
# NaN never compares equal to itself, so such a value would only produce an empty
# layer labelled "nan" in the layer control.
regioni = df3['region'].dropna().unique()  # all distinct region names
layers = {}  # region name -> FeatureGroup of that region

# Create a FeatureGroup for every region
for regione in regioni:
    feature_group = folium.FeatureGroup(name=regione)  # new group for the region
    # Restaurants located in the current region
    ristoranti_regione = df3[df3['region'] == regione]

    # Add a marker for every restaurant of this region
    for _, ristorante in ristoranti_regione.iterrows():
        lat = float(ristorante['lat'])
        lon = float(ristorante['long'])
        colore = determina_colore_e_dimensione(ristorante['priceRange'])

        # Add the marker to the group of the region
        folium.CircleMarker(
            location=[lat, lon],  # position from latitude and longitude
            radius=10,  # marker size
            color=colore,  # marker color
            fill=True,
            fill_color=colore,
            fill_opacity=0.6,
            popup=ristorante['restaurantName'],  # restaurant name
            tooltip=ristorante['region']  # shows the region when hovering over the marker
        ).add_to(feature_group)

    # Add the group of the region to the map
    feature_group.add_to(mappa)
    layers[regione] = feature_group  # keep track of the group in the layers dictionary

# Add a control to switch the region layers on and off
folium.LayerControl().add_to(mappa)

# Save the map to an HTML file
mappa.save("italy_map_with_restaurants_by_region.html")
mappa


# 5. Advanced Search Engine


To create an advanced restaurant search, we provide users with the following options:

1. **Specify Search Criteria**: Users can specify search terms for the following features (any or all of them):
   - `restaurantName`
   - `city`
   - `cuisineType`

2. **Price Range Filter**: Allow users to set a price range (e.g., between `€` and `€€€`) to filter the results by affordability.

3. **Region Filter**: Enable users to specify a list of Italian regions to limit the search to restaurants within those regions.

4. **Accepted Credit Cards**: Provide an option to filter by accepted credit card types. Users can specify one or more preferred card types (e.g., Visa, MasterCard, Amex).

5. **Services and Facilities**: Allow users to filter based on specific services and facilities provided by the restaurant. For example, users may look for amenities like Wi-Fi, Terrace, Air Conditioning, or Parking. 

In [88]:
# In order not to interfere with the previous code, we work on a copy of the DataFrame
advanced_df = df.copy()

To enable searches across the fields of restaurant name, cuisine type, and city, we follow the same approach used previously for the description. This involves preprocessing and stemming each field, creating vocabularies, inverted indexes, and TF-IDF matrices. For the final cosine similarity calculation, we combine the similarities from each field by weighting them according to their importance. The weights reflect the significance of each field, with restaurant name, city, and cuisine type prioritized in that order, as the first two are more specific.

In [89]:
# Run preprocess_and_stem_text on the text of each searchable column and
# store the output in a companion "<column>_prep" column
for source_column, prep_column in (
    ('cuisineType', 'cuisineType_prep'),
    ('city', 'city_prep'),
    ('restaurantName', 'restaurantName_prep'),
):
    advanced_df[prep_column] = advanced_df[source_column].apply(
        lambda value: preprocess_and_stem_text(str(value))
    )


In [90]:
# Create vocabularies and inverted indexes for the new columns
# (num_doc is rebound here to the number of rows of advanced_df)
num_doc = len(advanced_df)

# Cuisine type: vocabulary + inverted index, TF-IDF inverted index, document matrix
voc_type, inv_index_type = create_vocabulary_and_inverted_index(advanced_df, 'cuisineType_prep')
tf_idf_type = create_inverted_tf_idf(advanced_df, voc_type, inv_index_type, 'cuisineType_prep')
tf_idf_matrix_type = en.vectorize_documents(num_doc, voc_type, tf_idf_type)


# City: same three steps
voc_city, inv_index_city = create_vocabulary_and_inverted_index(advanced_df, 'city_prep')
tf_idf_city = create_inverted_tf_idf(advanced_df, voc_city, inv_index_city, 'city_prep')
tf_idf_matrix_city = en.vectorize_documents(num_doc, voc_city, tf_idf_city)

# Restaurant name: same three steps
voc_name, inv_index_name = create_vocabulary_and_inverted_index(advanced_df, 'restaurantName_prep')
tf_idf_name = create_inverted_tf_idf(advanced_df, voc_name, inv_index_name, 'restaurantName_prep')
tf_idf_matrix_name = en.vectorize_documents(num_doc, voc_name, tf_idf_name)

# Create dictionaries with the matrices, vocabularies and weights for the columns.
# All four dictionaries share the keys 'type', 'city' and 'name'.
dict_matrices = {
    'type': tf_idf_matrix_type, 
    'city': tf_idf_matrix_city, 
    'name': tf_idf_matrix_name
    }

dict_index = {
    'type': inv_index_type, 
    'city': inv_index_city, 
    'name': inv_index_name
}

dict_voc = {
    'type': voc_type,
    'city': voc_city, 
    'name': voc_name
    }

# Relative importance of each field in the combined similarity.
# NOTE(review): the weights add up to 0.90, not 1 - confirm whether this is intended
# or whether the advanced_engine module normalizes them.
dict_weights =  {  
    "name": 0.60,
    "city": 0.20,
    "type": 0.10 
}

To refine the search, we first check if any filters, such as price range, credit card acceptance, or specific services, have been set. Restaurants that do not meet the selected criteria are excluded. After this filtering step, we rank the remaining restaurants using cosine similarity on the fields of name, city, and cuisine type (if specified). The ranking process is implemented using a function provided in the `advanced_engine` module.

In [99]:
def search_restaurants(df, dict_matrices, dict_voc, dict_weights, dict_index, name=None, city=None, cuisine_type=None, price_range=None, accepted_credit_cards=None, services=None, regions=None):
    """
    Searches for restaurants based on the specified filters and text queries.

    The rows of `df` are first narrowed down by the hard filters (price range,
    credit cards, services, regions). The surviving rows are then passed,
    together with one query vector per non-empty text field, to
    `aen.execute_query_rank_advanced`.

    Parameters:
        df (DataFrame): The DataFrame to search in. It is not modified.
        dict_matrices (dict): A dictionary with the matrices for the search columns.
        dict_voc (dict): A dictionary with the vocabularies for the search columns.
        dict_weights (dict): A dictionary with the weights for the search columns.
        dict_index (dict): A dictionary with the inverted indexes for the search columns.
        name (str): The name of the restaurant. Defaults to None.
        city (str): The city where the restaurant is located. Defaults to None.
        cuisine_type (str): The type of cuisine. Defaults to None.
        price_range (list): Two-element sequence [min, max], both inclusive, expressed
            as a number of '€' symbols (1 to 4). Defaults to None.
        accepted_credit_cards (list): Credit cards that must all be accepted. Defaults to None.
        services (list): Services that must all be offered. Defaults to None.
        regions (list): Regions to keep; a row matches if its region is one of them. Defaults to None.

    Returns:
        DataFrame: The result of `aen.execute_query_rank_advanced`, restricted to the
        columns restaurantName, address, cuisineType, priceRange and region.
    """

    results = df.copy()

    # Size of the collection before any filter is applied; this is the value
    # handed to the query vectorizer.
    num_rows = len(results)

    def keep_rows(frame, column, predicate):
        # astype(bool) guarantees a boolean mask even when `frame` is already empty
        mask = frame[column].apply(predicate).astype(bool)
        # Copy so that later column assignments do not target a view of `frame`
        return frame[mask].copy()

    def contains_all(required):
        required = set(required)

        def predicate(available):
            try:
                return required.issubset(available)
            except TypeError:
                # Missing cells (None/NaN) are not iterable: they satisfy no requirement
                return False

        return predicate

    if price_range:
        # Define the price levels and filter the results
        price_levels = {'€': 1, '€€': 2, '€€€': 3, '€€€€': 4}
        min_level, max_level = price_range[0], price_range[1]

        def in_price_range(symbol):
            level = price_levels.get(symbol)
            # Missing or unrecognised price symbols have no level and never match
            return level is not None and min_level <= level <= max_level

        results = keep_rows(results, 'priceRange', in_price_range)
    if accepted_credit_cards:
        results = keep_rows(results, 'creditCards', contains_all(accepted_credit_cards))
    if services:
        results = keep_rows(results, 'facilitiesServices', contains_all(services))
    if regions:
        results = results[results['region'].isin(regions)].copy()

    # One query vector per text field the caller actually filled in
    dict_query = {}
    for field, text in (('name', name), ('city', city), ('type', cuisine_type)):
        if text:
            processed_text = preprocess_and_stem_text(text)
            dict_query[field] = en.vectorize_query(num_rows, dict_voc[field], dict_index[field], processed_text)

    results = aen.execute_query_rank_advanced(results, dict_query, dict_matrices, dict_weights)

    return results[["restaurantName", "address", "cuisineType", "priceRange","region"]]

To facilitate this search, we utilize `ipywidgets` to create convenient search bars and multi-select lists. This approach is motivated by the need to provide an interactive and intuitive interface for users working within a Jupyter Notebook environment. Traditional command-line queries or manual filtering can be cumbersome and error-prone, particularly for non-technical users or when dealing with large datasets. 

By leveraging `ipywidgets`, users can easily input search criteria through dynamic widgets. This design not only improves usability but also encourages exploration and experimentation, as users can iteratively refine their searches with immediate visual feedback. Overall, the motivation lies in enhancing accessibility, efficiency, and the overall user experience in performing complex searches within the Jupyter Notebook ecosystem.

In [None]:
# Widgets for GUI


def _text_filter(label, hint):
    """Create a free-text search field with the given label and placeholder."""
    return widgets.Text(description=label, placeholder=hint)


def _multi_select(choices, label):
    """Create a fixed-size multi-select list; each call gets its own Layout object."""
    return widgets.SelectMultiple(
        options=choices,
        description=label,
        layout=widgets.Layout(width="400px", height="200px"),
    )


def _distinct_items(list_column):
    """Flatten a column of lists and return its distinct items."""
    return pd.Series(list(chain.from_iterable(list_column))).unique()


# Text widgets for the search filters
restaurant_name = _text_filter("Name:", "Enter restaurant name")
city = _text_filter("City:", "Enter city name")
cuisine_type = _text_filter("Cuisine:", "Enter cuisine type")

# Price range slider (1 to 4 '€' symbols)
price_range = widgets.IntRangeSlider(
    value=[1, 4],
    min=1,
    max=4,
    step=1,
    description="Price (€)",
    orientation='vertical',
)

# Multi-select widget for accepted credit cards
credit_cards = _distinct_items(advanced_df['creditCards'])
accepted_credit_cards = _multi_select(credit_cards, "Credit Cards:")

# Multi-select widget for services
services_opt = _distinct_items(advanced_df['facilitiesServices'])
services = _multi_select(services_opt, "Services:")

# Multi-select widget for regions (rows without a region are ignored)
regions_opt = advanced_df['region'].dropna().unique()
regions = _multi_select(regions_opt, "Regions:")

# Search button and output area
search_button = widgets.Button(description="Search", disabled=False)
output_area = widgets.Output()


# Button click event function
def on_search_button_clicked(b):
    """
    Event handler for the search button.

    Runs `search_restaurants` with the current widget values and shows either
    the matching restaurants or a "no results" message inside `output_area`.
    Whether or not the search succeeds, every search widget is reset to its
    initial value and the button is re-enabled afterwards.

    Parameters:
        b (widgets.Button): The button that triggered the event.
    """
    # Disable the button while processing
    b.disabled = True

    with output_area:
        # Replace whatever the previous search left in the output area
        clear_output(wait=True)

        # No `except` clause: an error simply propagates after `finally` has run
        try:
            results = search_restaurants(
                advanced_df,
                dict_matrices,
                dict_voc,
                dict_weights,
                dict_index,
                restaurant_name.value,
                city.value,
                cuisine_type.value,
                price_range.value,
                list(accepted_credit_cards.value),
                list(services.value),
                list(regions.value)
            )

            if results.shape[0] > 0:
                # Display the results without the DataFrame index
                print(f"\nFound {results.shape[0]} restaurants matching your criteria.")
                display(results.style.hide(axis='index'))
            else:
                print("No results found.")
        finally:
            # Reset the values of the widgets after processing
            accepted_credit_cards.value = []
            services.value = []
            regions.value = []
            restaurant_name.value = ""
            city.value = ""
            cuisine_type.value = ""
            price_range.value = [1, 4]

            # Re-enable the button after processing
            b.disabled = False


# Run the search handler whenever the button is clicked
search_button.on_click(on_search_button_clicked)


def _spacer():
    """Return an empty labelled row used as vertical spacing between widget rows."""
    return widgets.HBox([widgets.Label()], layout=widgets.Layout(margin="5px 0 0 0"))


# Box holding the search button, centred in a column flex layout
box_layout = widgets.Layout(
    display='flex',
    flex_flow='column',
    align_items='center',
    width='50%',
)
box = widgets.HBox(children=[search_button], layout=box_layout)

# Rows from top to bottom: text filters, list/slider filters, search button
container = widgets.VBox([
    _spacer(),
    widgets.HBox([restaurant_name, city, cuisine_type]),
    _spacer(),
    widgets.HBox([price_range, accepted_credit_cards, services, regions]),
    _spacer(),
    box,
])

# Display the interface
display(container, output_area)


VBox(children=(HBox(children=(Label(value=''),), layout=Layout(margin='5px 0 0 0')), HBox(children=(Text(value…

Output()