In [1]:
import module.crawler as cr
import module.parser as pr

from scrapy.crawler import CrawlerProcess
from scrapy.settings import Settings

from concurrent.futures import ThreadPoolExecutor, as_completed

import os

import numpy as np
import pandas as pd 

import nltk
nltk.download('punkt')
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer

import re

import json

from collections import defaultdict

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\domaz\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\domaz\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
# Settings the necessary direcory
os.makedirs('data', exist_ok=True)

pages_path = os.path.join('data', 'pages')
os.makedirs(pages_path, exist_ok=True)

data_path = os.path.join('data', 'data_tsv')
os.makedirs(data_path, exist_ok=True)

engine_path = os.path.join('data', 'engine')
os.makedirs(engine_path, exist_ok=True)

vocabulary_path = os.path.join(engine_path, 'vocabulary.csv')
inverted_index_path = os.path.join(engine_path, 'inverted_index.json')
inverted_index_TFIDF_path = os.path.join(engine_path, 'inverted_index_TFIDF.json')

dataset_path = os.path.join('data', 'dataset.tsv')
urls_path = os.path.join('data', 'urls.txt')

# 1. Data collection

### 1.1 Get the list of Michelin restaurants

You should begin by compiling a list of restaurants to include in your document corpus. Specifically, you will focus on web scraping the [Michelin Restaurants in Italy](https://guide.michelin.com/en/it/restaurants). Your task is to **collect the URL** associated with each restaurant in this list. The output of this step should be a `.txt` file where each line contains a single restaurant’s URL. By the end, you should have approximately 2,037 restaurants on your list.


In [3]:
# Create custom settings for the url spider
custom_settings = Settings({
    'REQUEST_FINGERPRINTER_IMPLEMENTATION': '2.7',  # Set to recommended value to avoid issues
    'LOG_LEVEL': 'ERROR'  # Suppress other logging
})

In [6]:
get_url_process = CrawlerProcess(settings=custom_settings) # Create a process for the spider
get_url_process.crawl(cr.UrlMichelin, urls_path) # Add the spider to the process
get_url_process.start() # Run the spider

In [7]:
# Check if the file exists
if os.path.exists(urls_path):
    # Check the output file and see if the number of lines is correct
    lines_in_file = open(urls_path, 'r').readlines()
    number_of_lines = len(lines_in_file)
    print(f'Number of lines in file: {number_of_lines}')
else:
    print('Failure: File not found')

Number of lines in file: 1983


### 1.2. Crawl Michelin restaurant pages

Once you have all the URLs on the list, you should:

1. Download the HTML corresponding to each of the collected URLs.
2. After collecting each page, immediately save its `HTML` in a file. This way, if your program stops for any reason, you will not lose the data collected up to the stopping point.
3. Organize the downloaded `HTML` pages into folders. Each folder will contain the `HTML` of the restaurants from page 1, page 2, ... of the Michelin restaurant list.

__Tip__: Due to the large number of pages to download, consider using methods that can help shorten the process. If you employed a particular process or approach, kindly describe it.


In [10]:
lines_of_urls = []
with open(urls_path, 'r') as file:
    lines_of_urls = file.readlines()

original_directory = os.getcwd()
os.chdir(os.path.join(original_directory, pages_path))

# Create folders for the HTML files
cr.make_folders(100)

max_w = os.cpu_count()

# Download the HTML files concurrently
with ThreadPoolExecutor(max_workers=max_w) as executor:
    download_futures = []
    for line in lines_of_urls:
        # Split the line into URL and page number
        page_num = int(line.split("|")[1])
        url = line.split("|")[0].strip()
                
        # Submit download task to the executor
        download_futures.append(executor.submit(cr.HTML_downloader, url, page_num))

    # Wait for all tasks to complete
    for future in as_completed(download_futures):
        try:
            future.result()
        except Exception as e:
            print(f"An error occurred: {e}")

# Notify completion
print("Downloaded all pages!")

# Return to the original directory
os.chdir(original_directory)

Downloaded all pages!


In [11]:
# Check if the files exist and are 1983

current_dir = os.getcwd()
dir_path = os.path.join(current_dir, pages_path)
os.chdir(dir_path)
count = 0

for i in range(1,101):
    folder = f'page_{i}'
    for path in os.listdir(folder):
        if os.path.isfile(os.path.join(folder, path)):
            count += 1

os.chdir(current_dir)

print('File count:', count)

File count: 1983


### 1.3 Parse downloaded pages

At this point, you should have all the HTML documents about the restaurant of interest, and you can start to extract specific information. The list of the information we desire for each restaurant and their format is as follows:

1. **Restaurant Name** (to save as `restaurantName`): string;
2. **Address** (to save as `address`): string;
3. **City** (to save as `city`): string;
4. **Postal Code** (to save as `postalCode`): string;
5. **Country** (to save as `country`): string;
6. **Price Range** (to save as `priceRange`): string;
7. **Cuisine Type** (to save as `cuisineType`): string;
8. **Description** (to save as `description`): string;
9. **Facilities and Services** (to save as `facilitiesServices`): list of strings;
10. **Accepted Credit Cards** (to save as `creditCards`): list of strings;
11. **Phone Number** (to save as `phoneNumber`): string;
12. **URL to the Restaurant Page** (to save as `website`): string.

For each restaurant, you create a `restaurant_i.tsv` file of this structure:

```
restaurantName \t address \t  ... \t url
```

If an information is missing, you just leave it as an empty string.

In [3]:
keys = ['index', 'restaurantName', 'address', 'city', 'postalCode', 'country', 'priceRange', 'cuisineType', 'description', 'creditCards', 'facilitiesServices', 'phoneNumber', 'website']

max_w = os.cpu_count()

# Download the data from HTML files concurrently
with ThreadPoolExecutor(max_workers=max_w) as executor:
    extractor_future = []
    for i in range(1,101):
        start_dir = os.path.join(pages_path, f'page_{i}')
        start_index = (i-1)*20 
        extractor_future.append(executor.submit(pr.tsv_extractor, start_dir, data_path, start_index, keys))

    # Wait for all tasks to complete
    for future in as_completed(extractor_future):
        try:
            future.result()
        except Exception as e:
            print(e)

# Notify completion
print("Extracted all data!")

Extracted all data!


In [6]:
# Check if the files exist and are 1983
count = 0

for path in os.listdir(data_path):
        if os.path.isfile(os.path.join(data_path, path)):
            count += 1

print('File count:', count)

File count: 1983


# 2. Search Engine

This search engine allows you to retrieve restaurants based on a user query. We’ll build two types of search engines:

- **Conjunctive Search Engine**: Returns restaurants where all query terms appear in the description.
- **Ranked Search Engine**: Returns the top-k restaurants sorted by similarity to the query, using TF-IDF and Cosine Similarity.

To effectively analyze restaurant descriptions, it is crucial to *pre-process the text*. As in any optimal text analysis, we must proceed with preprocessing, which we addressed in the first part.In general we followed these steps.:

- Firstly we ensured text pre-processing through the `preprocess_and_stem_text`function

- The next step involved constructing a `vocabulary` and an `inverted_index`. This setup allows us to define a `search_query` function where, by inputting a word or phrase, we can retrieve all documents containing all of those words.

In [3]:
# List all TSV files in the directory
tsv_files = [f for f in os.listdir(data_path) if f.endswith('.tsv')]

# Load all TSV files into a list of dataframes
dfs = [pd.read_csv(os.path.join(data_path, file), sep='\t') for file in tsv_files]

# Unite all dataframes into one
df = pd.concat(dfs, ignore_index=True)
df.sort_values(by=['index'], inplace=True)
df.set_index('index', inplace=True)

## 2.0 Preprocessing

The function, `preprocess_and_stem_text`, performs several key operations using the Natural Language Toolkit (NLTK). 
- First, it tokenizes the text by splitting it into individual words and converting them to lowercase
- Next, it cleans the characters by removing non-alphanumeric symbols
- Subsequently, the function removes stop words, common words that do not provide so much information.
- Finally, it applies stemming to the remaining words, reducing them to their root forms. 

In the table below, we can see the results of the pre-processing for some example words from the `description` column

### Table 1: Pre-processing example
| Original           | Stemmed             |
|--------------------|---------------------|
| situated           | situat              |
| contemporarystyle  | contemporarystyl    |
| restaurant         | restaur             |            
| focuses            | focus               |    


In [4]:
costum = {"one","well","feature","also"}

def preprocess_and_stem_text(text):
    words = word_tokenize(text.lower())
    # Split the text into words and convert to lowercase.
    words = [re.sub(r"[^a-zA-Z']", '', word) for word in words]
    # Remove non-alphanumeric characters, keeping only letters and apostrophes
    stop_words = set(stopwords.words('english'))
    # Create a set of stop words to exclude
    stemmer = PorterStemmer() 
    
    filtered_stemmed_words = []
    for word in words:
        if word and word not in stop_words:
            stemmed_word = stemmer.stem(word) #stemming in order to reduce words to their roots.
            filtered_stemmed_words.append(stemmed_word)
    
    return filtered_stemmed_words

df["processed_description"] = df["description"].apply(preprocess_and_stem_text)


## 2.1 Conjunctive Query

### 2.1.1 Create Your Index!

In this section, we need to create two main structures: a `vocabulary` and an `inverted_index`

- `vocabulary`: This is a dictionary where each unique word found across all descriptions is assigned a unique number (ID)
- `inverted_index`: This is a dictionary that maps each word's unique ID (from the vocabulary) to a list of restaurant IDs in which that word appears. This allows find out which documents contain specific words.

For each restaurant description, we extract the set of unique processed words. Then, for each word in the document:

- If the word is not already in the vocabulary, we add it with a new unique ID.
- We then update the inverted index, adding the restaurant ID to the list associated with that word's unique ID.

In [5]:
def create_vocabulary_and_inverted_index(df):

    vocabulary = {}
    inverted_index = defaultdict(list)
    
    # Term ID counter
    term_id = 0

    # Iterate over each row in the DataFrame
    for idx, row in df.iterrows():
        # Convert the processed description into a set to avoid duplicates
        description = set(row["processed_description"])
        restaurant_id = idx

        # Iterate over each unique word in the description
        for word in description:
            
            if word not in vocabulary:
                vocabulary[word] = term_id
                term_id += 1

            # Get the term ID for the current word
            term_id_for_word = vocabulary[word]
            
            inverted_index[term_id_for_word].append(restaurant_id)
    
    return vocabulary, inverted_index

vocabulary, inverted_index = create_vocabulary_and_inverted_index(df)


In [6]:
# Save vocabulary as vocabulary.csv
vocab_df = pd.DataFrame(list(vocabulary.items()), columns=["word", "term_id"])
vocab_df.to_csv(vocabulary_path, index=False)

# Save inverted index as inverted_index.json
with open(inverted_index_path, "w") as f:
    json.dump(inverted_index, f)


### 2.1.2 Execute the Query

The following function, `execute_query`, allows to search through a dataset of restaurant descriptions based on input search terms by the user. In more details, the function checks if the set of query terms is a subset of the words present in each restaurant's description.
Once the matching restaurants are identified, relevant rows from the DataFrame are selected based on the indices of the matches found

In [6]:
def execute_query(query, df):
    
    processed_query = preprocess_and_stem_text(query)
    query_terms = set(processed_query)
    
    # Find the indices of restaurants that match the query
    matching_indices = df[df["processed_description"].apply(lambda desc: query_terms.issubset(desc))].index
    
    # iloc is used to select rows and filter the columns
    result = df.iloc[matching_indices][["restaurantName", "address", "description", "website"]]
    
    return result

query = input("Please enter your search terms (e.g., 'modern seasonal cuisine'): ")


result = execute_query(query, df)
print("Descriptions that contain the query '{}':".format(query))

# Display the results without showing the index
display(result.style.hide(axis='index'))


Descriptions that contain the query 'modern seasonal cuisine':


restaurantName,address,description,website
Fratelli Bruzzone,via Maria Vittoria 34/a,"The Bruzzone brothers deserve a round of applause for this restaurant and its extraordinary traditional Piedmontese cuisine. In just two small dining rooms (booking is recommended), guests can choose from an unforgettable array of dishes including anchovies in green sauce, baked onions with bagna cauda (an anchovy-based dip), superb agnolotti (stuffed pasta), tripe, chicken, bonet (a type of dessert) and hazelnut tart with zabaglione. With so much choice, it’s difficult to stop eating – and once you’ve left, you’ll want to return as soon as possible!",https://www.fratellibruzzone.com/FB/home.html
Vert Osteria Contemporanea,Località Bogonza,"Housed in a rustic building on the green slopes facing the lake, this restaurant has a welcoming feel with its stone and wood decor and large windows looking out at a beautiful, shaded garden where meals are served in fine weather. Delicious Italian cuisine takes pride of place on the menu.",https://vertosteria.it/
Il Ristorante - Niko Romito,via di Ripetta 73,"Situated on the fifth floor of the Hotel Bulgari with views of the Mausoleum of Augustus, Il Ristorante - Niko Romito boasts a spacious terrace and an elegant, welcoming dining room with mahogany walls adorned with works of art. The menu features traditional Italian cuisine reinterpreted with a light and imaginative touch as demonstrated by memorable dishes such as the chef’s own take on classic spaghetti in a tomato sauce.",https://www.bulgarihotels.com/it_IT/rome/dining/il-ristorante-niko-romito
l' Ciocio - Osteria di Suvereto,piazza dei Giudici 1,"Although this restaurant in the historic centre of Suvereto has the welcoming ambience of a traditional trattoria, its cuisine is more creative and elaborate in style (while still offering a few Tuscan classics). One of the house favourites is the “reale familiare”: a delicious cut of beef served with capers and anchovies.",https://www.osteriadisuvereto.it/
Controcorrente,via Colombo 101,"In this minimalist restaurant, situated below the medieval San Giovanni tower in the picturesque pedestrianised historic centre of Noli, the owner-chef serves contemporary cuisine that echoes the dining room’s modern decor. The menu features meat and fish dishes alike, all prepared from almost exclusively Ligurian ingredients, including delicious purple Sanremo prawn tartare with celery mayonnaise, lemon cream and plankton chips, as well as lamb cooked in its own juices with Albenga artichokes.",http://www.ristorantecontrocorrente.it
Futura,piazza San Giustino 7,"Situated in the heart of the historic centre overlooking the monumental Piazza San Giustino, this restaurant has a traditional feel in keeping with its location, enhanced by typical brick walls in dining rooms arranged on two floors. In contrast, the predominantly meat-based cuisine is contemporary in style, with some interesting technical and creative twists.",https://futuraristorante.it/
Uliassi,banchina di Levante 6,"The Adriatic Riviera, a perfect picture-postcard summer holiday destination, is the setting for this fine restaurant run by chef Mauro Uliassi. Hidden among the private beach clubs that crowd Senigallia’s seafront, this three-Michelin-star restaurant comes as a surprise, surrounded as it is by beach parasols and the sound of the sea lapping on the shore. The strength and originality of Uliassi’s cuisine lies in his ability to make good use of local ingredients, taking inspiration from the culinary traditions that have been so familiar and successful along this coastline over the past few decades and giving them a personal, creative twist. Fish and seafood obviously take pride of place, although there’s also room on the menu for meat, inspired by the Marche’s traditional game dishes – the delicious skewered pigeon is cooked to perfection! The colourful dining experience here is enhanced by the owner’s warm welcome, with Mauro’s sister Catia and his son Filippo working alongside the chef. It’s also worth mentioning that the restaurant has a keen focus on the environment, hence its banning of the use of plastic and doing its utmost to persuade its fish suppliers not to use polystyrene to transport their fish.",http://www.uliassi.com
Scaraboci,via XX Settembre 27,"Located just a few metres from the enchanting seafront at Marciana, this restaurant is one of the island’s gastronomic gems, offering a menu that features interesting combinations of beautifully presented meat and especially fish dishes There’s also a private terrace for romantic dining in summer (book well in advance) and although wines are not available by the glass, choosing a top-quality half-bottle is always an option.",
Contesto Alimentare,via Accademia Albertina 21/e,"Situated on the central Via Accademia Albertina, this small, simple and unfussy restaurant with small tables set close together serves top-quality Piedmontese cuisine. Alongside its delicious regional specialities, the menu also features dishes from elsewhere in Italy, including Sicily. The tajarin pasta made from 40 egg yolks and served with a veal ragu is superb, as are the rabbit and pork belly dishes. There’s a focus on meat options, followed by some delicious desserts, which include specialities from Piedmont such as panna cotta and typical bacio di dama biscuits.",https://www.contestoalimentare.it/
13 Comuni,piazza della Vittoria 31,"Situated in the village square, this restaurant is run by a couple – she’s front of house, while he’s in the kitchen – who use top - quality local produce to prepare a mix of traditional and modern dishes. House specialities include gnocchi with sheep’s ricotta cheese and hazel butter (the butter isn’t actually made with hazelnuts but is simply melted), and confit pork ribs in beer. The mountain - style decor continues in the simple guestrooms.",https://www.13comuni.it/it/


## 2.2 Ranked Search Engine with TF-IDF and Cosine Similarity

### 2.2.1 Inverted Index with TF-IDF Scores

In [7]:
def create_inverted_tf_idf(df, vocabulary, inverted_index):

    # Create a dictionary to store the TF-IDF values
    tf_idf = defaultdict(list)

    # Get the number of restaurants
    num_restaurants = len(df)

    voc_values = list(vocabulary.values())
    voc_keys = list(vocabulary.keys())

    for term_id, restaurant_ids in inverted_index.items():
        # Calculate the inverse document frequency (IDF) for the current term
        idf = np.log(num_restaurants / len(restaurant_ids))
        
        # Iterate over the restaurants that contain the current term and calculate the TF-IDF
        for restaurant_id in restaurant_ids:
            desc = df["processed_description"].loc[restaurant_id]
            term = voc_keys[voc_values.index(term_id)]
            tf = desc.count(term)
            tf_idf[term_id].append((restaurant_id, tf * idf))
        
    return tf_idf

inverted_tf_idf = create_inverted_tf_idf(df, vocabulary, inverted_index)

In [169]:
# Save inverted index as inverted_index.json
with open(inverted_index_TFIDF_path, "w") as f:
    json.dump(inverted_tf_idf, f)

### 2.2.2 Execute the Ranked Query

In [8]:
def vectorize_documents(df, vocabulary, inverted_tf_idf):
    num_doc = len(df)
    num_terms = len(vocabulary)
    
    # Create a matrix to store the TF-IDF values for each term in each document
    tf_idf_matrix = np.zeros((num_doc, num_terms))
    
    # Iterate over each term in the vocabulary
    for term_id, term_weights in inverted_tf_idf.items():
        for restaurant_id, tf_idf in term_weights:
            tf_idf_matrix[restaurant_id - 1, term_id] = tf_idf  # Adjust restaurant_id by subtracting 1
    
    return tf_idf_matrix

tf_idf_matrix = vectorize_documents(df, vocabulary, inverted_tf_idf)

In [11]:
def compute_cosine_similarity(query, tf_idf_matrix, vocabulary):
    processed_query = preprocess_and_stem_text(query)
    query_terms = set(processed_query)

    # Find the term IDs for the query terms and compute the query vector
    query_vector = defaultdict(float)
    for term in query_terms:
        term_id = vocabulary.get(term)
        query_vector[term_id] = processed_query.count(term) # Compute the term frequency for the query

    # Compute the cosine similarity between the query vector and each document vector
    cosine_similarities = {}
    for restaurant_id, doc_vector in enumerate(tf_idf_matrix):
        numerator = 0.0
        for term_id, tf in list(query_vector.items()):
            numerator += tf * doc_vector[term_id]
        denominator = np.linalg.norm(list(query_vector.values())) * np.linalg.norm(doc_vector)
        cosine_similarities[restaurant_id+1] = numerator / denominator if denominator != 0 else 0.0 # Adjust restaurant_id by adding 1
    
    # Normalize the cosine similarities to be between 0 and 1
    max_similarity = max(cosine_similarities.values())
    if max_similarity > 0:
        cosine_similarities = {restaurant_id: cosine_similarity / max_similarity for restaurant_id, cosine_similarity in cosine_similarities.items()}

    
    return cosine_similarities

In [19]:
def execute_query_cosine_similarity(query, df, tf_idf_matrix, vocabulary, k):
    
    cosine_similarities = compute_cosine_similarity(query, tf_idf_matrix, vocabulary)

    result = df.copy()

    result["similarity_score"] = result.index.map(cosine_similarities)
    
    result.sort_values(by="similarity_score", ascending=False, inplace=True)

    return result.head(10)[["restaurantName", "address", "description", "website", "similarity_score"]]


# Prompt user for input and display results
query = input("Please enter your search terms (e.g., 'modern seasonal cuisine'): ")
k = int(input("How many top similar restaurants would you like to see? "))
result = execute_query_cosine_similarity(query, df, tf_idf_matrix, vocabulary, k)

display(result.style.hide(axis='index'))

restaurantName,address,description,website,similarity_score
Saur,via Filippo Turati 8,"In a tiny rural village, this contemporary, almost minimalist-style restaurant serves modern cuisine with an emphasis on seasonal, regional produce.",https://ristorantesaur.it,1.0
Razzo,via Andrea Doria 17/f,"A quiet restaurant with a relaxed, young and modern feel serving contemporary cuisine prepared from seasonal, regional products. Charming romantic outdoor area with soft lighting.",https://vadoarazzo.it/,0.927831
La Botte,via Giuseppe Garibaldi 8,"A modern and welcoming contemporary bistro situated in the heart of Stresa’s historic centre. Run by an entire family, the restaurant serves modern and imaginative fish and meat dishes where the focus is always on seasonal ingredients. The interesting wine list also includes a selection of wines by the glass.",http://www.trattorialabottestresa.it,0.916019
Piccolo Lord,corso San Maurizio 69 bis/g,"Professional service in a welcoming, modern restaurant run by a young couple. He works in the kitchen while she (having also worked as a chef in the past) runs the front of house. Delicious Mediterranean cuisine with a seasonal focus.",https://www.ristorantepiccololord.it/,0.88421
La Valle,"via Umberto I 25, località Valle Sauglio","A well - run restaurant in a quiet area just outside the village, where the owner - chef serves modern cuisine with the occasional regional influence. Careful attention is given to the use of seasonal produce and natural herbs, and there is a small selection of fish - based dishes.",https://www.ristorantelavalle.it/,0.830615
Al Vecchio Convento,viale Borri 348,"Ask for a table in the main dining room, with a classic atmosphere and elegant furnishings, to taste the dishes of a cuisine, which is seasonal and mainly Tuscan.",https://www.alvecchioconvento.it/,0.73303
RistoFante,via Mazzini 41,"The motto of this restaurant is “In step with the times yet inspired by tradition”. The cuisine here is made from fresh, seasonal ingredients (with a particular focus on fish) prepared using modern techniques. Elegant dining room, plus a shaded outdoor space for the summer months.",https://www.ristofante.it/,0.724246
Aprudia,largo del Forno 16,"At this restaurant in the historic centre, where the vaulted brick ceiling provides a striking contrast with the modern furnishings, the chef celebrates the seasons through a whole range of ingredients to create truly delicious cuisine. Interesting tasting menus add to its appeal.",http://www.aprudia.com,0.713616
Locanda Solagna,piazza I Novembre 2,"Although this restaurant has been in business since the 1950s, it feels much more modern thanks to its updated appearance. Renowned for its excellent regional cuisine served in the evening, the restaurant pays careful attention to seasonal ingredients. At lunchtime, the ”osteria”-style menu is much simpler. Excellent wine selection.",https://www.locandasolagna.it/,0.677244
Mima,via Madonnelle 9,"You’ll be won over by the seasonal Mediterranean cuisine created by the young yet experienced chef at this restaurant. Accommodation is also available in modern guestrooms, plus there’s an enchanting roof garden in which to sip an aperitif while the sun goes down.",http://www.domo20.com/restaurant,0.64406


# 3. Define a New Score!

# 4. Visualizing the Most Relevant Restaurants

To complete this step, we first needed to gather unique locations in the format of city and region. For this, we relied on **OpenCage**, which offers 2500 requests per day. Since we had 1983 restaurants, we used **OpenCage's API** to fetch the location data. We followed the suggested code from OpenCage to retrieve the necessary information. Using the city, postal code, latitude, and longitude for each restaurant, we generated a CSV file with the geographic data. This output file was then mapped to our initial dataframe, matching the data based on the address column.

In [None]:
import pandas as pd

df2 = pd.read_csv("file_geocoded_with_region.csv", sep=";") 


df2_unique = df2.drop_duplicates(subset='address', keep='first')


df['lat'] = df['address'].map(df2_unique.set_index('address')['latitude'])
df['long'] = df['address'].map(df2_unique.set_index('address')['longitude'])
df['region'] = df['address'].map(df2_unique.set_index('address')['region'])


## Map

In [None]:
import folium
import requests
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

# Function to determine color based on price
def determine_color(price):
    if price == "€":
        return "green"   # Green for €
    elif price == "€€":
        return "blue"    # Blue for €€
    elif price == "€€€":
        return "orange"  # Orange for €€€
    elif price == "€€€€":
        return "red"     # Red for €€€€
    return "purple"      # Purple for mixed prices

def determine_color_and_size(prices):
    unique_prices = set(prices)
    if len(unique_prices) == 1:
        return determine_color(unique_prices.pop()), 9
    return "purple", 9  # Purple for mixed prices and fixed size

# Prepare the TF-IDF matrix for descriptions
tfidf_matrix, vectorizer = preprocess_data(df)

# Ask the user for search terms and how many results to show
query = input("Please enter your search terms (e.g., 'modern seasonal cuisine'): ")
k = int(input("How many top similar restaurants would you like to see? "))
top_k_results = execute_ranked_query(query, df, tfidf_matrix, vectorizer, k)

# Group the restaurants by city
filtered_cities = top_k_results.groupby('city').agg({
    'lat': 'first', 
    'long': 'first',  
    'restaurantName': list,  # List of restaurant names
    'priceRange': list,  # List of price ranges for each restaurant
    'similarity_score': 'mean',  # Average similarity score for the city
}).reset_index()

# Create a map centered on Italy (Rome)
map = folium.Map(location=[41.9028, 12.4964], zoom_start=6)

# Load the GeoJSON file of Italian regions
url = "https://raw.githubusercontent.com/codeforamerica/click_that_hood/master/public/data/italy-regions.geojson"
geojson_data = requests.get(url).json()

# Add the GeoJSON of regions to the map with black borders
folium.GeoJson(
    geojson_data,
    style_function=lambda feature: {
        'fillColor': 'none',  
        'color': 'black',     
        'weight': 1          
    }
).add_to(map)

# Add markers for each city that has at least one restaurant in the top k
for index, row in filtered_cities.iterrows():
    lat = float(row['lat'])
    lon = float(row['long'])
    restaurants = row['restaurantName']
    prices = row['priceRange']
    city_similarity = top_k_results[top_k_results['city'] == row['city']]
    
    # Sort the restaurants by similarity in descending order
    city_similarity = city_similarity.sort_values(by='similarity_score', ascending=False)
    
    # Determine the color and size based on prices
    color, radius = determine_color_and_size(prices)
    
    # Create content for the popup showing restaurants and their prices
    popup_content = f"<b>Restaurants in {row['city']}</b><br>"
    
    # Add the restaurants sorted by similarity
    for _, restaurant in city_similarity.iterrows():
        name = restaurant['restaurantName']
        price = restaurant['priceRange']
        sim_score = restaurant['similarity_score']
        address = restaurant['address']
        description = restaurant['description']
        website = restaurant['website']
        
        # Color for each price
        price_color = determine_color(price)
        price_styled = f"<span style='color:{price_color};'>{price}</span>"
        
        popup_content += f"<b>{name}</b><br>"
        popup_content += f"Address: {address}<br>"
        popup_content += f"Description: {description}<br>"
        popup_content += f"Website: <a href='{website}' target='_blank'>{website}</a><br>"
        popup_content += f"Price: {price_styled}<br>"
        popup_content += f"Similarity: {sim_score:.2f}<br><br>"
    
    popup_content += f"Average Similarity: {row['similarity_score']:.2f}<br>"
    popup = folium.Popup(popup_content, max_width=300)

    # Create the marker for the city 
    marker = folium.CircleMarker(
        location=[lat, lon],
        radius=radius, 
        color=color,  
        fill=True,
        fill_color=color,  
        fill_opacity=0.6,
        popup=popup,
        tooltip=f"{row['city']}: {len(restaurants)} restaurants"
    )

    # Add the marker to the map
    marker.add_to(map)

# Add layer control to enable/disable regions
folium.LayerControl().add_to(map)

# legend for price colors
legend_html = """
<div style="position: fixed;
     bottom: 50px; left: 50px; width: 150px; height: 150px;
     border:2px solid grey; z-index:9999; font-size:14px;
     background-color:white; padding: 10px;">
     <strong>Price Legend</strong><br>
     <i style="background:green; width:10px; height:10px; float:left; margin-right:10px;"></i>€<br>
     <i style="background:blue; width:10px; height:10px; float:left; margin-right:10px;"></i>€€<br>
     <i style="background:orange; width:10px; height:10px; float:left; margin-right:10px;"></i>€€€<br>
     <i style="background:red; width:10px; height:10px; float:left; margin-right:10px;"></i>€€€€<br>
     <i style="background:purple; width:10px; height:10px; float:left; margin-right:10px;"></i>Mixed Prices<br>
</div>
"""
map.get_root().html.add_child(folium.Element(legend_html))

map.save("italy_map_with_query_results.html")


In [None]:
map

In [None]:
df3= df

In [None]:
import folium
import requests
import pandas as pd
from folium.plugins import Search

# Crea una mappa centrata sull'Italia (su Roma)
mappa = folium.Map(location=[41.9028, 12.4964], zoom_start=6)

# Carica il file GeoJSON delle regioni italiane
url = "https://raw.githubusercontent.com/codeforamerica/click_that_hood/master/public/data/italy-regions.geojson"
geojson_data = requests.get(url).json()

# Aggiungi il GeoJSON delle regioni alla mappa
folium.GeoJson(geojson_data).add_to(mappa)

# Carica il DataFrame con i dati dei ristoranti
# Supponiamo che il tuo DataFrame 'df3' abbia le colonne 'lat', 'long', 'priceRange', 'restaurantName', 'region'
# Esempio di DataFrame


# Funzione per determinare il colore in base al prezzo
def determina_colore_e_dimensione(prezzo):
    if prezzo == "€":
        return "green"  # Colore verde
    elif prezzo == "€€":
        return "blue"  # Colore blu
    elif prezzo == "€€€":
        return "orange"  # Colore arancione
    elif prezzo == "€€€€":
        return "red"  # Colore rosso
    else:
        return "gray"  # Colore di default

# Aggiungi un FeatureGroup per ogni regione
regioni = df3['region'].unique()  # Ottieni tutte le regioni uniche
layers = {}  # Dizionario per gestire i layer delle regioni

# Crea un FeatureGroup per ogni regione
for regione in regioni:
    feature_group = folium.FeatureGroup(name=regione)  # Crea un nuovo gruppo per la regione
    # Filtra i ristoranti per la regione corrente
    ristoranti_regione = df3[df3['region'] == regione]
    
    # Aggiungi i marker per ogni ristorante in questa regione
    for index, ristorante in ristoranti_regione.iterrows():
        lat = float(ristorante['lat'])
        lon = float(ristorante['long'])
        colore = determina_colore_e_dimensione(ristorante['priceRange'])
        
        # Aggiungi il marker alla mappa
        folium.CircleMarker(
            location=[lat, lon],  # Posizione basata su latitudine e longitudine
            radius=10,  # Dimensione del marker
            color=colore,  # Colore del marker
            fill=True,
            fill_color=colore,
            fill_opacity=0.6,
            popup=ristorante['restaurantName'],  # Nome del ristorante
            tooltip=ristorante['region']  # Mostra la regione quando passa sopra il marker
        ).add_to(feature_group)
    
    # Aggiungi il gruppo per la regione alla mappa
    feature_group.add_to(mappa)
    layers[regione] = feature_group  # Aggiungi il gruppo al dizionario layers

# Aggiungi un controllo per il layer delle regioni
folium.LayerControl().add_to(mappa)

# Salva la mappa in un file HTML
mappa.save("italy_map_with_restaurants_by_region.html")
mappa
