# Homework 3 

In [None]:
import module.crawler as cr
import module.costum_parser as pr
from scrapy.crawler import CrawlerProcess
from scrapy.settings import Settings
from concurrent.futures import ThreadPoolExecutor, as_completed
import os
import numpy as np
import pandas as pd
import nltk
nltk.download('punkt')
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
import string
import re
from collections import defaultdict



[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\User\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\User\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [5]:
# Prepare the directories and file paths used by the rest of the notebook
pages_path = os.path.join('data', 'pages')
data_path = os.path.join('data', 'data_tsv')

# 'data' comes first so that it exists before its sub-directories are created
for directory in ('data', pages_path, data_path):
    os.makedirs(directory, exist_ok=True)

# Output files written by the later steps
dataset_path = os.path.join('data', 'dataset.tsv')
urls_path = os.path.join('data', 'urls.txt')

# 1. Data collection


### 1.1 Get the list of Michelin restaurants

You should begin by compiling a list of restaurants to include in your document corpus. Specifically, you will focus on web scraping the [Michelin Restaurants in Italy](https://guide.michelin.com/en/it/restaurants). Your task is to **collect the URL** associated with each restaurant in this list. The output of this step should be a `.txt` file where each line contains a single restaurant’s URL. By the end, you should have approximately 2,037 restaurants on your list.


In [5]:
# Scrapy settings shared by the crawler process that collects the restaurant URLs
custom_settings = Settings({
    'REQUEST_FINGERPRINTER_IMPLEMENTATION': '2.7',  # Set to recommended value to avoid issues
    'LOG_LEVEL': 'ERROR'  # Only messages at ERROR level are logged
})

In [6]:
# NOTE(review): cr.UrlMichelin presumably writes one "<url>|<page number>" line per
# restaurant to urls_path (the download step splits lines on "|") - confirm in module/crawler.py
get_url_process = CrawlerProcess(settings=custom_settings) # Create a process configured with the settings above
get_url_process.crawl(cr.UrlMichelin, urls_path) # Schedule the spider, passing it the output file path
get_url_process.start() # Run the scheduled spider

In [7]:
# Check that the URL file exists and report how many lines it contains
if os.path.exists(urls_path):
    # Read inside a context manager so the file handle is closed right away
    with open(urls_path, 'r') as urls_file:
        lines_in_file = urls_file.readlines()
    number_of_lines = len(lines_in_file)
    print(f'Number of lines in file: {number_of_lines}')
else:
    print('Failure: File not found')

Number of lines in file: 1983


### 1.2. Crawl Michelin restaurant pages

Once you have all the URLs on the list, you should:

1. Download the HTML corresponding to each of the collected URLs.
2. After collecting each page, immediately save its `HTML` in a file. This way, if your program stops for any reason, you will not lose the data collected up to the stopping point.
3. Organize the downloaded `HTML` pages into folders. Each folder will contain the `HTML` of the restaurants from page 1, page 2, ... of the Michelin restaurant list.

__Tip__: Due to the large number of pages to download, consider using methods that can help shorten the process. If you employed a particular process or approach, kindly describe it.


In [10]:
# Load the "<url>|<page number>" lines collected in step 1.1
with open(urls_path, 'r') as file:
    lines_of_urls = file.readlines()

original_directory = os.getcwd()
# cr.make_folders / cr.HTML_downloader are called without a target directory,
# so they are run from inside pages_path
os.chdir(os.path.join(original_directory, pages_path))

failed_downloads = 0
try:
    # Create folders for the HTML files
    cr.make_folders(100)

    max_w = os.cpu_count()

    # Download the HTML files concurrently
    with ThreadPoolExecutor(max_workers=max_w) as executor:
        download_futures = []
        for line in lines_of_urls:
            # A blank line (e.g. a trailing newline) has no "|" field to parse
            if not line.strip():
                continue

            # Split the line into URL and page number
            fields = line.split("|")
            url = fields[0].strip()
            page_num = int(fields[1])

            # Submit download task to the executor
            download_futures.append(executor.submit(cr.HTML_downloader, url, page_num))

        # Wait for all tasks to complete, reporting (but not aborting on) failures
        for future in as_completed(download_futures):
            try:
                future.result()
            except Exception as e:
                failed_downloads += 1
                print(f"An error occurred: {e}")
finally:
    # Always return to the original directory, even if something above raised;
    # every later cell relies on paths relative to it
    os.chdir(original_directory)

# Notify completion without claiming success when some downloads failed
if failed_downloads:
    print(f"Finished with {failed_downloads} failed download(s).")
else:
    print("Downloaded all pages!")

Downloaded all pages!


In [11]:
# Count the downloaded HTML files (one file per collected URL is expected)

current_dir = os.getcwd()
dir_path = os.path.join(current_dir, pages_path)
count = 0

# Full paths are used instead of os.chdir, so a missing folder cannot leave the
# notebook stranded in the wrong working directory
for i in range(1, 101):
    folder = os.path.join(dir_path, f'page_{i}')
    for path in os.listdir(folder):
        if os.path.isfile(os.path.join(folder, path)):
            count += 1

print('File count:', count)

File count: 1983


### 1.3 Parse downloaded pages

At this point, you should have all the HTML documents about the restaurant of interest, and you can start to extract specific information. The list of the information we desire for each restaurant and their format is as follows:

1. **Restaurant Name** (to save as `restaurantName`): string;
2. **Address** (to save as `address`): string;
3. **City** (to save as `city`): string;
4. **Postal Code** (to save as `postalCode`): string;
5. **Country** (to save as `country`): string;
6. **Price Range** (to save as `priceRange`): string;
7. **Cuisine Type** (to save as `cuisineType`): string;
8. **Description** (to save as `description`): string;
9. **Facilities and Services** (to save as `facilitiesServices`): list of strings;
10. **Accepted Credit Cards** (to save as `creditCards`): list of strings;
11. **Phone Number** (to save as `phoneNumber`): string;
12. **URL to the Restaurant Page** (to save as `website`): string.

For each restaurant, you create a `restaurant_i.tsv` file of this structure:

```
restaurantName \t address \t  ... \t url
```

If a piece of information is missing, just leave it as an empty string.

In [12]:
keys = ['index', 'restaurantName', 'address', 'city', 'postalCode', 'country', 'priceRange', 'cuisineType', 'description', 'creditCards', 'facilitiesServices', 'phoneNumber', 'website']

max_w = os.cpu_count()

# Parse the HTML files concurrently: one extraction task per page folder
with ThreadPoolExecutor(max_workers=max_w) as executor:
    extractor_future = [
        executor.submit(
            pr.tsv_extractor,
            os.path.join(pages_path, f'page_{page}'),
            data_path,
            (page - 1) * 20,  # start index handed to the extractor for this folder
            keys,
        )
        for page in range(1, 101)
    ]

    # Collect the tasks as they finish and print any error they raised
    for future in as_completed(extractor_future):
        try:
            future.result()
        except Exception as e:
            print(e)

# Notify completion
print("Extracted all data!")

Extracted all data!


In [13]:
# Count the regular files that were written to the TSV directory
count = sum(
    1
    for entry in os.listdir(data_path)
    if os.path.isfile(os.path.join(data_path, entry))
)

print('File count:', count)

File count: 1983


In [6]:
# Merge all the per-restaurant TSV files into a single TSV dataset

# Collect the TSV file names found in the data directory
tsv_files = [f for f in os.listdir(data_path) if f.endswith('.tsv')]

# Read each TSV file into its own dataframe
dfs = []
for file in tsv_files:
    dfs.append(pd.read_csv(os.path.join(data_path, file), sep='\t'))

# Stack the dataframes and order the rows by their 'index' column
merged_df = pd.concat(dfs, ignore_index=True)
merged_df.sort_values(by=['index'], inplace=True)

# Write the merged dataframe without the pandas row labels
merged_df.to_csv(dataset_path, sep='\t', index=False)

# Notify completion
print("Unified all data!")


Unified all data!


# 2. Search Engine

This search engine allows you to retrieve restaurants based on a user query. We’ll build two types of search engines:

- Conjunctive Search Engine: Returns restaurants where all query terms appear in the description.
- Ranked Search Engine: Returns the top-k restaurants sorted by similarity to the query, using TF-IDF and Cosine Similarity.

To analyze restaurant descriptions effectively, it is crucial to *pre-process the text*. As in any sound text analysis, we start with preprocessing, which we address in the first part (section 2.0). In general, we followed these steps:

- Firstly we ensured the removal of stop words in English, as well as customized common words related to Italian cuisine and gourmet dining, such as "pasta," "pizza," and other frequently used terms. 

- The next step involved constructing a `vocabulary` to extract all unique words from the various descriptions and associate each with a unique integer. We decided to increment this integer sequentially for simplicity.

- Additionally, we created an `inverted_index` that maps these integers back to the specific documents in which the corresponding words appear. This setup allows us to define a `search_query` function where, by inputting a word or phrase, we can retrieve all documents containing all of those words.

## 2.0 Preprocessing

In [None]:

# Load the merged dataset from where section 1.3 saved it (dataset_path), rather than
# from a bare "dataset.tsv" that only resolves if a copy sits in the working directory
df = pd.read_csv(dataset_path, sep='\t', encoding="utf-8")

def preprocess_and_stem_text(text):
    """Tokenize, clean, filter and stem a piece of English text.

    The text is lower-cased and tokenized with NLTK, every character that is
    not a letter or an apostrophe is stripped from each token, English stop
    words and empty tokens are dropped, and the remaining tokens are stemmed
    with the Porter stemmer.

    Args:
        text: The text to process. Non-string values (e.g. the NaN pandas
            produces for an empty TSV field, or None) are treated as empty.

    Returns:
        The list of stemmed tokens, in their original order (duplicates kept).
    """
    # Missing descriptions are read back from the TSV as NaN (a float), which
    # has no .lower(); treat anything that is not real text as "no tokens"
    if not isinstance(text, str) or not text.strip():
        return []

    words = word_tokenize(text.lower())
    # Strip every character that is not a letter or an apostrophe
    words = [re.sub(r"[^a-zA-Z']", '', word) for word in words]
    stop_words = set(stopwords.words('english'))  # English stop words
    stemmer = PorterStemmer()

    # Tokens emptied by the cleaning step are skipped together with stop words
    return [
        stemmer.stem(word)
        for word in words
        if word and word not in stop_words
    ]

# Store the list of stemmed tokens of every description in a new column
df["processed_description"] = df["description"].apply(preprocess_and_stem_text)




## 2.1 Conjunctive Query

### 2.1.1 Create Your Index!

In [59]:
from collections import defaultdict

def create_vocabulary_and_inverted_index(df):
    """Build the term vocabulary and the inverted index of the corpus.

    Args:
        df: DataFrame with an ``index`` column (the restaurant identifier) and
            a ``processed_description`` column holding one list of tokens per
            row.

    Returns:
        A ``(vocabulary, inverted_index)`` tuple. ``vocabulary`` maps each
        distinct token to a sequential integer ``term_id``, assigned in order
        of first appearance. ``inverted_index`` is a ``defaultdict(list)``
        mapping each ``term_id`` to the identifiers of the restaurants whose
        description contains that token (each restaurant listed once).
    """
    vocabulary = {}
    inverted_index = defaultdict(list)

    restaurant_ids = df["index"].tolist()
    token_lists = df["processed_description"].tolist()

    for restaurant_id, tokens in zip(restaurant_ids, token_lists):
        # dict.fromkeys removes duplicates but keeps first-occurrence order.
        # Iterating a set of strings instead would hand out different term ids
        # on every run, because string hashing is randomized per process.
        for word in dict.fromkeys(tokens):
            # len(vocabulary) is the next free id when the word is new
            term_id = vocabulary.setdefault(word, len(vocabulary))
            inverted_index[term_id].append(restaurant_id)

    return vocabulary, inverted_index

# Build the vocabulary and the inverted index for the whole dataset
vocabulary, inverted_index = create_vocabulary_and_inverted_index(df)


In [60]:
import json

# Write the vocabulary to vocabulary.csv, one (word, term_id) row per term
vocab_df = pd.DataFrame({
    "word": list(vocabulary.keys()),
    "term_id": list(vocabulary.values()),
})
vocab_df.to_csv("vocabulary.csv", index=False)

# Write the inverted index to inverted_index.json
with open("inverted_index.json", "w") as index_file:
    json.dump(inverted_index, index_file)


In [74]:
def execute_query(query, df):
    """Run a conjunctive (AND) query against the restaurant descriptions.

    The query goes through ``preprocess_and_stem_text``, the same function used
    to build ``processed_description``, and a restaurant matches only when
    every resulting query term occurs in its ``processed_description``.

    Args:
        query: Free-text search string.
        df: DataFrame with a ``processed_description`` column (list of tokens
            per row) plus the ``restaurantName``, ``address``, ``description``
            and ``website`` columns.

    Returns:
        A DataFrame with the ``restaurantName``, ``address``, ``description``
        and ``website`` columns of the matching rows. It is empty when no row
        matches or when the query contains no usable term.
    """
    columns = ["restaurantName", "address", "description", "website"]

    # Preprocess the query terms exactly like the descriptions
    query_terms = set(preprocess_and_stem_text(query))

    # An empty set is a subset of everything, so without this guard a blank
    # (or stop-word-only) query would "match" every restaurant
    if not query_terms:
        return df.iloc[0:0][columns]

    # Boolean mask of the rows whose token list contains all the query terms
    matches = df["processed_description"].apply(
        lambda desc: query_terms.issubset(desc)
    ).astype(bool)

    # Select with the mask through .loc: index *labels* must not be passed to
    # the positional .iloc, which picks wrong rows on any non-default index
    return df.loc[matches, columns]

# Prompt the user for the query
query = input("Please enter your search terms (e.g., 'modern seasonal cuisine'): ")

# Run the query and collect the results
result = execute_query(query, df)

# Display the results (last expression of the notebook cell)
result



Unnamed: 0,restaurantName,address,description,website
0,20Tre,via David Chiossone 20 r,Situated in the heart of Genoa’s historic cent...,https://www.ristorante20tregenova.it/
7,Donevandro,via Garibaldi 2,"Up until a few years ago, the owner-chef at th...",http://www.donevandroristorante.it
8,Etra,piazza De Ferrari 4,Etra is an anagram of the Italian word “arte” ...,https://www.etra.art/
9,Il Ristorante Alain Ducasse Napoli,Via Cristoforo Colombo 45,"Alain Ducasse, one of the great names in conte...",https://theromeocollection.com/en/romeo-napoli...
11,La Buca,corso Garibaldi 45,Choose one of the tables on the outdoor summer...,https://www.labucaristorante.com/
...,...,...,...,...
1964,Erbaluigia,via San Frediano 10/12,"This attractive restaurant with a simple, mini...",https://erbaluigia.com/
1968,Locanda 53 Supper Club,via Vergolano 53,"Partners in life and business, Evelyn and Carl...",https://it.locanda53.it
1976,Sotto l'Arco,via Aretusi 5,Villa Aretusi is a pleasant 17C villa surround...,https://www.villa-aretusi.it/ristorante-sotto-...
1981,Café Les Paillotes,piazza Le Laudi 2,This old acquaintance of the Michelin Guide no...,https://www.lespaillotes.it/


## 2.2 Ranked Search Engine with TF-IDF and Cosine Similarity

### 2.2.1 Inverted Index with TF-IDF Scores

In [14]:
import pandas as pd
import numpy as np
from collections import defaultdict
from sklearn.feature_extraction.text import TfidfVectorizer

def build_new_inverted_index(data):
    """Build an inverted index whose postings carry TF-IDF scores.

    Args:
        data: DataFrame (or mapping) with a ``description`` entry holding the
            raw restaurant descriptions. Missing descriptions are treated as
            empty text.

    Returns:
        A ``(inverted_index, vectorizer)`` tuple. ``inverted_index`` is a
        ``defaultdict(list)`` mapping each term to a list of
        ``(doc_id, tfidf_score)`` pairs, where ``doc_id`` is the position of
        the document in ``data`` (not the dataset's ``index`` column).
        ``vectorizer`` is the fitted ``TfidfVectorizer``.
    """
    # Vectorize descriptions using TF-IDF
    vectorizer = TfidfVectorizer()
    # Empty TSV fields are read back as NaN, which the vectorizer rejects
    descriptions = pd.Series(data['description']).fillna('')
    tfidf_matrix = vectorizer.fit_transform(descriptions)
    terms = vectorizer.get_feature_names_out()

    # Walk the sparse matrix row by row; each row stores only its non-zero
    # entries (column ids in .indices, scores in .data)
    inverted_index = defaultdict(list)
    for doc_id, row in enumerate(tfidf_matrix):
        for term_id, tfidf_score in zip(row.indices, row.data):
            term = terms[term_id]
            inverted_index[term].append((doc_id, tfidf_score))

    return inverted_index, vectorizer

# Generate the TF-IDF inverted index.
# NOTE: this rebinds `inverted_index`, replacing the term_id -> restaurant ids
# index built in section 2.1.1
inverted_index, vectorizer = build_new_inverted_index(df)

### 2.2.2 Execute the Ranked Query

In [67]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import pandas as pd
import numpy as np

def preprocess_data(data):
    """Fit a TF-IDF model on the restaurant descriptions.

    Args:
        data: DataFrame (or mapping) with a ``description`` entry holding the
            raw restaurant descriptions. Missing descriptions are treated as
            empty text.

    Returns:
        A ``(tfidf_matrix, vectorizer)`` tuple: the document-term TF-IDF
        matrix (one row per description, in the order of ``data``) and the
        fitted ``TfidfVectorizer``.
    """
    # Default vectorizer settings: no stop-word list is passed here
    vectorizer = TfidfVectorizer()
    # Empty TSV fields are read back as NaN, which the vectorizer rejects
    descriptions = pd.Series(data['description']).fillna('')
    tfidf_matrix = vectorizer.fit_transform(descriptions)
    return tfidf_matrix, vectorizer

def execute_ranked_query(query, data, tfidf_matrix, vectorizer, k):
    """Return the restaurants most similar to the query, best match first.

    The query is projected into the TF-IDF space of ``vectorizer`` and
    compared with every row of ``tfidf_matrix`` by cosine similarity.

    Args:
        query: Free-text search string.
        data: DataFrame whose rows are in the same order as the rows of
            ``tfidf_matrix``.
        tfidf_matrix: Document-term TF-IDF matrix fitted on ``data``.
        vectorizer: The fitted vectorizer that produced ``tfidf_matrix``.
        k: Maximum number of restaurants to return.

    Returns:
        A DataFrame with the ``restaurantName``, ``address``, ``description``
        and ``website`` columns plus ``similarity_score``, sorted by
        decreasing score and re-indexed from 0. It holds at most ``k`` rows
        and only rows with a similarity greater than zero.
    """
    columns = ['restaurantName', 'address', 'description', 'website']

    # A negative k would make the slice below drop rows from the end instead
    # of limiting the result size
    k = max(int(k), 0)

    query_vector = vectorizer.transform([query])

    # Calculate cosine similarity between the query and all documents
    similarity_scores = cosine_similarity(query_vector, tfidf_matrix).flatten()

    # Positions of the k best-scoring documents, highest score first
    ranked_indices = np.argsort(similarity_scores)[::-1][:k]

    # A zero score means the document shares no term with the query; without
    # this filter an unmatched query would still return k arbitrary rows
    ranked_indices = ranked_indices[similarity_scores[ranked_indices] > 0]

    results = data.iloc[ranked_indices][columns].copy()
    results['similarity_score'] = similarity_scores[ranked_indices]
    results.reset_index(drop=True, inplace=True)

    return results

# Build the TF-IDF matrix (this rebinds `vectorizer` to the newly fitted one)
tfidf_matrix, vectorizer = preprocess_data(df)

# Prompt the user for the query and for the number of results to show
query = input("Please enter your search terms (e.g., 'modern seasonal cuisine'): ")
k = int(input("How many top similar restaurants would you like to see? "))  # int() raises ValueError on non-numeric input
result = execute_ranked_query(query, df, tfidf_matrix, vectorizer, k)

# Display the results (last expression of the notebook cell)
result

Unnamed: 0,restaurantName,address,description,website,similarity_score
0,La Botte,via Giuseppe Garibaldi 8,A modern and welcoming contemporary bistro sit...,http://www.trattorialabottestresa.it,0.388885
1,Osteria del Borgo,via Pietro Custodi 5,"Situated in the heart of the old town, this re...",tel:+39 349 160 3750,0.378145
2,Braunwirt,piazza Chiesa 3,A modern and welcoming restaurant in the heart...,https://www.braunwirt.it/,0.330758
3,Degusteria Italiana,via Lambertesca 7r,"This small, welcoming restaurant in the heart ...",http://www.degusteriaitalianafirenze.com,0.284699
