# Homework 3 

In [None]:
import module.crawler as cr
import module.costum_parser as pr
from scrapy.crawler import CrawlerProcess
from scrapy.settings import Settings
from concurrent.futures import ThreadPoolExecutor, as_completed
import os
import numpy as np
import pandas as pd
import nltk
nltk.download('punkt')
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
import string
import re
from collections import defaultdict



[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\User\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\User\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [5]:
# Set up the necessary directories and file paths (everything lives under ./data)
pages_path = os.path.join('data', 'pages')          # downloaded HTML pages
data_path = os.path.join('data', 'data_tsv')        # extracted .tsv files
dataset_path = os.path.join('data', 'dataset.tsv')  # merged dataset
urls_path = os.path.join('data', 'urls.txt')        # collected restaurant URLs

# exist_ok=True makes this cell safe to re-run
for _directory in ('data', pages_path, data_path):
    os.makedirs(_directory, exist_ok=True)

# 1. Data collection


### 1.1 Get the list of Michelin restaurants

You should begin by compiling a list of restaurants to include in your document corpus. Specifically, you will focus on web scraping the [Michelin Restaurants in Italy](https://guide.michelin.com/en/it/restaurants). Your task is to **collect the URL** associated with each restaurant in this list. The output of this step should be a `.txt` file where each line contains a single restaurant’s URL. By the end, you should have approximately 2,037 restaurants on your list.


In [5]:
# Custom Scrapy settings for the URL spider
_url_spider_overrides = {
    # Explicit fingerprinter version (the value recommended to avoid issues)
    'REQUEST_FINGERPRINTER_IMPLEMENTATION': '2.7',
    # Only report errors; suppress all other logging
    'LOG_LEVEL': 'ERROR',
}
custom_settings = Settings(_url_spider_overrides)

In [6]:
# Run the URL spider with the custom settings. `urls_path` is passed to the
# spider as an argument (presumably the output file: the next cell reads it
# back and counts its lines).
# NOTE(review): CrawlerProcess.start() runs the Twisted reactor, which normally
# cannot be restarted in the same kernel; restart the kernel before re-running
# this cell (confirm against the Scrapy documentation).
get_url_process = CrawlerProcess(settings=custom_settings) # Create a process for the spider
get_url_process.crawl(cr.UrlMichelin, urls_path) # Add the spider to the process
get_url_process.start() # Run the spider

In [7]:
# Check that the URL file exists and report how many lines (URLs) it contains
if os.path.exists(urls_path):
    # The context manager closes the file handle even if reading fails
    with open(urls_path, 'r') as urls_file:
        lines_in_file = urls_file.readlines()
    number_of_lines = len(lines_in_file)
    print(f'Number of lines in file: {number_of_lines}')
else:
    print('Failure: File not found')

Number of lines in file: 1983


### 1.2. Crawl Michelin restaurant pages

Once you have all the URLs on the list, you should:

1. Download the HTML corresponding to each of the collected URLs.
2. After collecting each page, immediately save its `HTML` in a file. This way, if your program stops for any reason, you will not lose the data collected up to the stopping point.
3. Organize the downloaded `HTML` pages into folders. Each folder will contain the `HTML` of the restaurants from page 1, page 2, ... of the Michelin restaurant list.

__Tip__: Due to the large number of pages to download, consider using methods that can help shorten the process. If you employed a particular process or approach, kindly describe it.


In [10]:
# Read the collected URLs; each line has the form "<url>|<page number>"
with open(urls_path, 'r') as file:
    lines_of_urls = file.readlines()

# The downloader is run from inside the pages directory, so move there for the
# duration of the download.
original_directory = os.getcwd()
os.chdir(os.path.join(original_directory, pages_path))

try:
    # Create folders for the HTML files
    cr.make_folders(100)

    max_w = os.cpu_count()

    # Download the HTML files concurrently
    with ThreadPoolExecutor(max_workers=max_w) as executor:
        download_futures = []
        for line in lines_of_urls:
            # Ignore blank lines (e.g. a trailing newline at the end of the file)
            if not line.strip():
                continue

            # Split once on the last "|" into URL and page number
            url, _, page_field = line.rpartition("|")
            page_num = int(page_field)
            url = url.strip()

            # Submit download task to the executor
            download_futures.append(executor.submit(cr.HTML_downloader, url, page_num))

        # Wait for all tasks to complete; a failed download is reported but
        # does not stop the remaining ones
        for future in as_completed(download_futures):
            try:
                future.result()
            except Exception as e:
                print(f"An error occurred: {e}")

    # Notify completion
    print("Downloaded all pages!")
finally:
    # Always return to the original directory, even if something above raised,
    # so that later cells resolve their relative paths correctly
    os.chdir(original_directory)

Downloaded all pages!


In [11]:
# Count the downloaded HTML files (the expected total is the number of URLs, 1983)

current_dir = os.getcwd()
dir_path = os.path.join(current_dir, pages_path)
count = 0

# Build full paths instead of changing the working directory: an error while
# listing a folder can then never leave the kernel in the wrong directory.
for i in range(1, 101):
    folder = os.path.join(dir_path, f'page_{i}')
    if not os.path.isdir(folder):
        # A missing page folder simply contributes no files to the count
        continue
    for path in os.listdir(folder):
        if os.path.isfile(os.path.join(folder, path)):
            count += 1

print('File count:', count)

File count: 1983


### 1.3 Parse downloaded pages

At this point, you should have all the HTML documents about the restaurant of interest, and you can start to extract specific information. The list of the information we desire for each restaurant and their format is as follows:

1. **Restaurant Name** (to save as `restaurantName`): string;
2. **Address** (to save as `address`): string;
3. **City** (to save as `city`): string;
4. **Postal Code** (to save as `postalCode`): string;
5. **Country** (to save as `country`): string;
6. **Price Range** (to save as `priceRange`): string;
7. **Cuisine Type** (to save as `cuisineType`): string;
8. **Description** (to save as `description`): string;
9. **Facilities and Services** (to save as `facilitiesServices`): list of strings;
10. **Accepted Credit Cards** (to save as `creditCards`): list of strings;
11. **Phone Number** (to save as `phoneNumber`): string;
12. **URL to the Restaurant Page** (to save as `website`): string.

For each restaurant, you create a `restaurant_i.tsv` file of this structure:

```
restaurantName \t address \t  ... \t url
```

If an information is missing, you just leave it as an empty string.

In [12]:
# Column names handed to the extractor
keys = ['index', 'restaurantName', 'address', 'city', 'postalCode', 'country', 'priceRange', 'cuisineType', 'description', 'creditCards', 'facilitiesServices', 'phoneNumber', 'website']

max_w = os.cpu_count()

# Extract the data from the HTML files concurrently, one task per page folder
with ThreadPoolExecutor(max_workers=max_w) as executor:
    # Folder page_<n> gets the starting index (n-1)*20
    # (presumably 20 restaurants per listing page; verify against the crawler)
    extractor_future = [
        executor.submit(
            pr.tsv_extractor,
            os.path.join(pages_path, f'page_{page_number}'),
            data_path,
            (page_number - 1) * 20,
            keys,
        )
        for page_number in range(1, 101)
    ]

    # Wait for every task; print the error of any task that failed
    for finished in as_completed(extractor_future):
        try:
            finished.result()
        except Exception as error:
            print(error)

# Notify completion
print("Extracted all data!")

Extracted all data!


In [13]:
# Count the regular files in the TSV directory (the expected total is 1983)
count = sum(
    os.path.isfile(os.path.join(data_path, entry))
    for entry in os.listdir(data_path)
)

print('File count:', count)

File count: 1983


In [6]:
# Unify all per-restaurant TSV files into one TSV dataset

# List all TSV files in the directory
tsv_files = [f for f in os.listdir(data_path) if f.endswith('.tsv')]

# Postal codes and phone numbers are identifiers, not numbers: read them as
# text so that type inference does not turn e.g. "00187" into the integer 187.
text_columns = {'postalCode': str, 'phoneNumber': str}

# Load all TSV files into a list of dataframes
dfs = [
    pd.read_csv(os.path.join(data_path, file), sep='\t', dtype=text_columns)
    for file in tsv_files
]

# Unite all dataframes into one, ordered by restaurant index
merged_df = pd.concat(dfs, ignore_index=True)
merged_df.sort_values(by=['index'], inplace=True)

# Save the merged dataframe to a TSV file
merged_df.to_csv(dataset_path, sep='\t', index=False)

# Notify completion
print("Unified all data!")


Unified all data!


# 2. Search Engine

This search engine allows you to retrieve restaurants based on a user query. We’ll build two types of search engines:

- **Conjunctive Search Engine**: Returns restaurants where all query terms appear in the description.
- **Ranked Search Engine**: Returns the top-k restaurants sorted by similarity to the query, using TF-IDF and Cosine Similarity.

To effectively analyze restaurant descriptions, it is crucial to *pre-process the text*. As in any sound text analysis, we start with preprocessing, which is addressed in the first part (section 2.0). In general, we followed these steps:

- First, we pre-process the text with the `preprocess_and_stem_text` function.

- The next step involved constructing a `vocabulary` and an `inverted_index`. This setup allows us to define an `execute_query` function where, by inputting a word or phrase, we can retrieve all documents containing all of those words.

## 2.0 Preprocessing

The function, `preprocess_and_stem_text`, performs several key operations using the Natural Language Toolkit (NLTK). 
- First, it tokenizes the text by splitting it into individual words and converting them to lowercase
- Next, it cleans each token by removing every character that is not a letter or an apostrophe (so digits and punctuation are dropped)
- Subsequently, the function removes stop words, i.e. common words that do not provide much information.
- Finally, it applies stemming to the remaining words, reducing them to their root forms. 

In the table below, we can see the results of the pre-processing for some example words from the `description` column

### Table 1: Pre-processing example
| Original           | Stemmed             |
|--------------------|---------------------|
| situated           | situat              |
| contemporarystyle  | contemporarystyl    |
| restaurant         | restaur             |            
| focuses            | focus               |    


In [132]:

# Load the restaurant dataset into a DataFrame.
# NOTE(review): this reads "dataset.tsv" from the current working directory,
# while the merge cell writes the dataset to `dataset_path` (data/dataset.tsv).
# Confirm which location is intended.
df= pd.read_csv("dataset.tsv", sep='\t', encoding= "utf-8")
# NOTE(review): `costum` looks like a set of extra stop words, but nothing
# visible in this notebook uses it. TODO: confirm, then wire it in or remove it.
costum = {"one","well","feature","also"}

def preprocess_and_stem_text(text):
    """Tokenize, clean, filter and stem a piece of English text.

    Steps, in order:
      1. lowercase the text and tokenize it with NLTK's ``word_tokenize``;
      2. strip from each token every character that is not an ASCII letter
         or an apostrophe (digits and punctuation are removed);
      3. drop tokens that became empty and English stop words;
      4. reduce the remaining tokens to their stem with the Porter stemmer.

    Args:
        text: The text to process. Any non-string value (for example the NaN
            that pandas uses for a missing description, or None) is treated
            as empty text.

    Returns:
        list[str]: The stemmed tokens, in their order of appearance.
    """
    # A missing description is read by pandas as a float NaN, which has no
    # .lower(); return "no tokens" instead of crashing the whole .apply().
    if not isinstance(text, str):
        return []

    # Set of stop words to exclude
    stop_words = set(stopwords.words('english'))
    stemmer = PorterStemmer()

    filtered_stemmed_words = []
    for token in word_tokenize(text.lower()):
        # Keep only letters and apostrophes
        cleaned = re.sub(r"[^a-zA-Z']", '', token)
        if cleaned and cleaned not in stop_words:
            # Stemming reduces the word to its root
            filtered_stemmed_words.append(stemmer.stem(cleaned))

    return filtered_stemmed_words

# Pre-process every description once and keep the token lists in a new column
_raw_descriptions = df["description"]
df["processed_description"] = _raw_descriptions.apply(preprocess_and_stem_text)


## 2.1 Conjunctive Query

### 2.1.1 Create Your Index!

In this section, we need to create two main structures: a `vocabulary` and an `inverted_index`

- `vocabulary`: This is a dictionary where each unique word found across all descriptions is assigned a unique number (ID)
- `inverted_index`: This is a dictionary that maps each word's unique ID (from the vocabulary) to a list of restaurant IDs in which that word appears. This allows us to find out which documents contain specific words.

For each restaurant description, we extract the set of unique processed words. Then, for each word in the document:

- If the word is not already in the vocabulary, we add it with a new unique ID.
- We then update the inverted index, adding the restaurant ID to the list associated with that word's unique ID.

In [133]:
from collections import defaultdict

def create_vocabulary_and_inverted_index(df):
    """Build the term vocabulary and the inverted index of the descriptions.

    Args:
        df: DataFrame with an ``"index"`` column (the restaurant ID) and a
            ``"processed_description"`` column holding, for each row, an
            iterable of already pre-processed tokens.

    Returns:
        tuple: ``(vocabulary, inverted_index)`` where ``vocabulary`` maps each
        word to an integer term ID and ``inverted_index`` (a
        ``defaultdict(list)``) maps each term ID to the list of restaurant IDs
        whose description contains that word.
    """
    vocabulary = {}
    inverted_index = defaultdict(list)

    for restaurant_id, tokens in zip(df["index"], df["processed_description"]):
        # dict.fromkeys removes duplicates while keeping first-occurrence
        # order. Iterating a set of strings instead would give a different
        # order, and therefore different term IDs, on every interpreter run.
        for word in dict.fromkeys(tokens):
            # A new word receives the next free ID (the current vocabulary size)
            term_id = vocabulary.setdefault(word, len(vocabulary))
            inverted_index[term_id].append(restaurant_id)

    return vocabulary, inverted_index

# Build the vocabulary (word -> term ID) and the inverted index
# (term ID -> restaurant IDs) from the pre-processed descriptions
vocabulary, inverted_index = create_vocabulary_and_inverted_index(df)


In [84]:
import json

# Save vocabulary as vocabulary.csv: one (word, term_id) row per vocabulary entry
vocab_rows = list(vocabulary.items())
vocab_df = pd.DataFrame(vocab_rows, columns=["word", "term_id"])
vocab_df.to_csv("vocabulary.csv", index=False)

# Save inverted index as inverted_index.json
# (JSON object keys are strings, so the term IDs are written as strings)
with open("inverted_index.json", "w") as index_file:
    index_file.write(json.dumps(inverted_index))


### 2.1.2 Execute the Query

The following function, `execute_query`, searches the dataset of restaurant descriptions for the search terms entered by the user. In more detail, the function checks whether the set of query terms is a subset of the words present in each restaurant's description.
Once the matching restaurants are identified, the relevant rows of the DataFrame are selected.

In [134]:
def execute_query(query, df):
    """Return the restaurants whose description contains every query term.

    The query goes through the same pre-processing as the descriptions
    (``preprocess_and_stem_text``), so the comparison is made on stems.

    Args:
        query: Free-text search string.
        df: DataFrame with a ``"processed_description"`` column (token list
            per row) plus the ``restaurantName``, ``address``,
            ``description`` and ``website`` columns.

    Returns:
        DataFrame: The matching rows, restricted to ``restaurantName``,
        ``address``, ``description`` and ``website``. It is empty when the
        query holds no usable term (empty, or stop words only).
    """
    columns = ["restaurantName", "address", "description", "website"]
    query_terms = set(preprocess_and_stem_text(query))

    # The empty set is a subset of every description, so without this guard
    # an empty query would "match" every restaurant.
    if not query_terms:
        return df.iloc[0:0][columns]

    # True for the rows whose tokens include all the query terms
    matches = df["processed_description"].apply(query_terms.issubset).astype(bool)

    # Select with the boolean mask itself: passing index *labels* to the
    # positional .iloc is only correct when the index is the default 0..n-1.
    return df.loc[matches, columns]

query = input("Please enter your search terms (e.g., 'modern seasonal cuisine'): ")


result = execute_query(query, df)
print("Descriptions that contain the query '{}':".format(query))

# Display the results without showing the index.
# Styler.hide_index() was deprecated in pandas 1.4 and removed in pandas 2.0 in
# favour of Styler.hide(axis="index"); use whichever this pandas provides.
styled_result = result.style
if hasattr(styled_result, "hide"):
    styled_result = styled_result.hide(axis="index")
else:
    styled_result = styled_result.hide_index()
display(styled_result)


Descriptions that contain the query 'modern seasonal cuisine':


restaurantName,address,description,website
Casin del Gamba,via Roccolo Pizzati 1,"The journey to get here – a winding road through woods and hills – may be challenging at times but the warm welcome offered by the whole family Dal Lago at this delightful restaurant makes you feel completely at home and more than compensates for the effort. Owner-chef Antonio shares the honours and duties of his role with long-established sous-chef Biolo. Together, they prepare cuisine that is perfectly balanced between classic flavours and modern trends, using local ingredients that respect the passing seasons. Game features on the menu in winter, while dishes are often seasoned with wild aromatic herbs (for example, the local snails served with thyme and onion bread). For her part, Signora Daria determinedly takes on two roles: she not only prepares the desserts, but also skilfully supervises front of house, ably assisted by son Luca who is happy to advise guests on their choice of wine. The well-structured wine list also includes a good number of organic wines.",https://www.casindelgamba.it/
San Giorgio,viale Brigate Bisagno 69r,"Situated in the city albeit not right in the centre, San Giorgio is a typical Genovese restaurant – elegant and classic in style, with excellent cuisine served by the Scala family who have been in charge for decades. Young chef Samuele Di Mauro is at the helm, having started here as an apprentice and worked his way to the position of sous-chef and then head chef in 2022. Despite a focus on modern presentation and techniques, as demonstrated in dishes such as the raw bluefin tuna seasoned with an intense verbena mousse and served with caper cream and a “caviar” of dried and fried tuna, his cuisine is Mediterranean in style, with its roots in Ligurian aromas and flavours. The wine list is as impressive as the cuisine, with an entire section dedicated to French and Italian sparkling wines.",https://www.ristorantesangiorgiogenova.it/
Il Luogo Aimo e Nadia,via Montecuccoli 6,"This long-established restaurant has been part of the Milanese culinary scene for over 60 years and continues to attract food-lovers in search of top-quality cuisine today. It serves a generous tasting menu that showcases Italian cuisine in modern dishes, some inspired by the origins of the two dynamic chefs, Alessandro Negrini and Fabio Pisani (the former from Lombardy, the latter from Puglia). There’s also a menu dedicated to the seasons, where seasonal vegetables enhance the different dishes, while the “Omaggio a Milano” (Tribute to Milan – tortelli pasta stuffed with Fassona beef ossobuco and marrow in a Sardinian saffron and parmesan reduction) is always very popular. Make sure you allow plenty of time to peruse the extensive wine selection which is displayed on a tablet and includes some hard-to-find labels. The service is courteous and attentive throughout.",https://www.aimoenadia.com/il-luogo-aimo-e-nadia
Vesta Mare,viale Roma 41,"This typical, elegant Versilian beach club with an open-plan feel extending from the car park to the beach serves top-quality cuisine to guests. Its facilities include an entrance lounge, stylish dining room full of character, a swimming pool and a second lounge space. The menu is simpler at lunchtime, except during the low season when you can request the gourmet menu (normally only served in the evenings) when booking. Fish and seafood take pride of place on the menu, with classic dishes reinterpreted with a modern twist by a chef who takes particular care with presentation.",https://vestafiorichiari.com/mare/
Ca' Del Moro,località Erbin 31,"Situated within the La Collina dei Ciliegi wine estate amid the verdant Valpantena hills, this first-floor restaurant (lift available) is home to two chefs – he is from Calabria, while she is from Puglia. Together they create exciting, modern Mediterranean cuisine using ingredients from their native regions, including their signature dish – homemade spaghetti served with a delicious, well-balanced 'nduja, smoked ricotta and tomato ristretto sauce. Top-quality local ingredients take pride of place, as demonstrated by the Brogna mutton, which is raised on site, barbecued and served with an exquisite caper sauce and seasonal mushroom pie. The concise wine list features labels from the Veneto alongside a few French options.",https://www.cadelmoro.wine/it
Contrasto,via Roma 55,"Having returned to his native village, owner-chef Lucio Testa has opened this restaurant in a former sheepfold on the outskirts of the village perched at an altitude of 950m. The restaurant boasts a perfectly preserved stone façade, various small rooms arranged on different levels, exposed beams and contemporary-style lamps which give the dining room a modern feel. The creative cuisine features reinterpretations of traditional recipes from the Molise (such as pork tripe and cabbage soup) which combine local and seasonal ingredients (including produce from the restaurant’s own kitchen garden) with culinary techniques such as sauces, seasonings and emulsions acquired by the chef from his time working in France.",https://contrastoristorante.it
Saur,via Filippo Turati 8,"In a tiny rural village, this contemporary, almost minimalist-style restaurant serves modern cuisine with an emphasis on seasonal, regional produce.",https://ristorantesaur.it
San Michele,via Castello di Fagagna 33,"Situated next to the ruins of the old castle and the small church of San Michele, this 13C building, thought to have once housed a guardroom, is now home to a small restaurant serving regional, seasonal cuisine with a modern twist. During the week, the restaurant offers a lunchtime menu of top-quality snacks (the Venetian “cicchetti”), as well as a few “primi” and desserts.",http://sanmichele.restaurant
Chichibio,via Guglielmo Marconi 1,"Despite its lack of awards, this restaurant stands out for the quality of its cuisine. Situated in the town centre, it has a small intimate dining room with just a few tables (it’s best to book ahead) that acts as a backdrop for good-quality dishes made from local, seasonal ingredients, as well as a few modern and imaginative fish options. Highly recommended!",tel:+39 328 905 4831
Winter Garden Florence,piazza Ognissanti 1,"Horse-drawn carriages once entered the old courtyard of the St Regis hotel, now converted into an elegant winter garden which also includes a cocktail bar with sofas and armchairs. Seasonality and local gems are fundamental pillars of the modern Mediterranean cuisine.",https://www.wintergardenflorence.com/it/


## 2.2 Ranked Search Engine with TF-IDF and Cosine Similarity

### 2.2.1 Inverted Index with TF-IDF Scores

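Here we build a second inverted index in which every posting also stores a TF-IDF score. The restaurant descriptions are vectorized with scikit-learn's `TfidfVectorizer`; then, for every non-zero entry of the resulting TF-IDF matrix, the pair `(doc_id, tfidf_score)` is appended to the list of the corresponding term, where `doc_id` is the row position of the restaurant in the dataset.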

In [86]:
import pandas as pd
import numpy as np
from collections import defaultdict
from sklearn.feature_extraction.text import TfidfVectorizer

# Inverted index with TF-IDF scores
def build_new_inverted_index(data):
    """Build an inverted index whose postings carry TF-IDF scores.

    The raw ``description`` column is vectorized with a default
    ``TfidfVectorizer``.

    Args:
        data: DataFrame with a ``'description'`` column of text.

    Returns:
        tuple: ``(inverted_index, vectorizer)`` where ``inverted_index`` is a
        ``defaultdict(list)`` mapping each term to a list of
        ``(doc_id, tfidf_score)`` pairs, ``doc_id`` being the row position of
        the document in ``data``, and ``vectorizer`` is the fitted vectorizer.
    """
    vectorizer = TfidfVectorizer()
    tfidf_matrix = vectorizer.fit_transform(data['description'])
    terms = vectorizer.get_feature_names_out()

    postings = defaultdict(list)
    for doc_id in range(tfidf_matrix.shape[0]):
        # Sparse row of this document: only its non-zero terms are stored
        doc_row = tfidf_matrix[doc_id]
        for column, weight in zip(doc_row.indices, doc_row.data):
            postings[terms[column]].append((doc_id, weight))

    return postings, vectorizer

# Generate the inverted index
# NOTE: this rebinds the global `inverted_index` created in section 2.1.1
# (term ID -> restaurant IDs) to the TF-IDF index
# (term -> list of (doc_id, tfidf_score) pairs).
inverted_index, vectorizer = build_new_inverted_index(df)

### 2.2.2 Execute the Ranked Query

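The query is transformed into a TF-IDF vector with the same vectorizer that was fitted on the restaurant descriptions, and it is compared with every description using cosine similarity. The restaurants are then sorted by decreasing similarity, and the top-`k` are returned together with their similarity score.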

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import pandas as pd
import numpy as np

# TF-IDF vectors for each restaurant description
def preprocess_data(data):
    """Fit a default ``TfidfVectorizer`` on ``data['description']``.

    Args:
        data: DataFrame with a ``'description'`` column of text.

    Returns:
        tuple: ``(tfidf_matrix, vectorizer)``, i.e. the TF-IDF matrix of the
        descriptions and the fitted vectorizer.
    """
    fitted_vectorizer = TfidfVectorizer()
    description_matrix = fitted_vectorizer.fit_transform(data['description'])
    return description_matrix, fitted_vectorizer

# Execute the ranked query using TF-IDF and Cosine Similarity
def execute_ranked_query(query, data, tfidf_matrix, vectorizer, k):
    """Return up to ``k`` restaurants ranked by cosine similarity to the query.

    Args:
        query: Free-text search string.
        data: DataFrame whose rows are in the same order as the rows of
            ``tfidf_matrix``.
        tfidf_matrix: TF-IDF matrix of the descriptions.
        vectorizer: The vectorizer that was fitted to produce ``tfidf_matrix``.
        k: Maximum number of results; values below 1 yield no results.

    Returns:
        DataFrame: ``restaurantName``, ``address``, ``description``,
        ``website`` and ``similarity_score`` of the best matches, sorted by
        decreasing score, with a fresh 0..n-1 index. Restaurants with a
        similarity of zero are never returned, so fewer than ``k`` rows come
        back when fewer than ``k`` descriptions share a term with the query.
    """
    query_vector = vectorizer.transform([query])

    # Calculate cosine similarity between the query and all documents
    similarity_scores = cosine_similarity(query_vector, tfidf_matrix).flatten()

    # A negative k would make the slice below drop items from the end instead
    # of limiting the result, so clamp it to zero.
    k = max(int(k), 0)

    # Get indices of documents sorted by decreasing score, top k only
    ranked_indices = np.argsort(similarity_scores)[::-1][:k]

    # Drop the documents that share no term with the query: a score of zero
    # means "not relevant", not "least relevant match".
    ranked_indices = ranked_indices[similarity_scores[ranked_indices] > 0]

    results = data.iloc[ranked_indices][['restaurantName', 'address', 'description', 'website']].copy()
    results['similarity_score'] = similarity_scores[ranked_indices]
    results.reset_index(drop=True, inplace=True)

    return results

# Build TF-IDF matrix
tfidf_matrix, vectorizer = preprocess_data(df)

# Prompt user for input and display results
query = input("Please enter your search terms (e.g., 'modern seasonal cuisine'): ")
k = int(input("How many top similar restaurants would you like to see? "))
result = execute_ranked_query(query, df, tfidf_matrix, vectorizer, k)

# Display the results without showing the index.
# Styler.hide_index() was deprecated in pandas 1.4 and removed in pandas 2.0 in
# favour of Styler.hide(axis="index"); use whichever this pandas provides.
styled_result = result.style
if hasattr(styled_result, "hide"):
    styled_result = styled_result.hide(axis="index")
else:
    styled_result = styled_result.hide_index()
display(styled_result)

restaurantName,address,description,website,similarity_score
Saur,via Filippo Turati 8,"In a tiny rural village, this contemporary, almost minimalist-style restaurant serves modern cuisine with an emphasis on seasonal, regional produce.",https://ristorantesaur.it,0.311022
Razzo,via Andrea Doria 17/f,"A quiet restaurant with a relaxed, young and modern feel serving contemporary cuisine prepared from seasonal, regional products. Charming romantic outdoor area with soft lighting.",https://vadoarazzo.it/,0.264815
La Botte,via Giuseppe Garibaldi 8,"A modern and welcoming contemporary bistro situated in the heart of Stresa’s historic centre. Run by an entire family, the restaurant serves modern and imaginative fish and meat dishes where the focus is always on seasonal ingredients. The interesting wine list also includes a selection of wines by the glass.",http://www.trattorialabottestresa.it,0.246574
