In [7]:
import crawler as cr
import parser as pr

from scrapy.crawler import CrawlerProcess
from scrapy.settings import Settings

from concurrent.futures import ThreadPoolExecutor, as_completed

import os

import numpy as np
import pandas as pd 


### 1.1 Get the list of Michelin restaurants

You should begin by compiling a list of restaurants to include in your document corpus. Specifically, you will focus on web scraping the [Michelin Restaurants in Italy](https://guide.michelin.com/en/it/restaurants). Your task is to **collect the URL** associated with each restaurant in this list. The output of this step should be a `.txt` file where each line contains a single restaurant’s URL. By the end, you should have approximately 2,037 restaurants on your list.


In [2]:
# Scrapy options used by the URL-collecting spider.
url_spider_options = {
    # Recommended fingerprinter implementation; avoids issues with the default one
    'REQUEST_FINGERPRINTER_IMPLEMENTATION': '2.7',
    # Only report errors so the cell output stays short
    'LOG_LEVEL': 'ERROR',
}
custom_settings = Settings(url_spider_options)

In [3]:
# Run the URL spider to completion inside this kernel.
# NOTE: start() blocks until the crawl is finished, and the Twisted reactor behind it
# cannot be restarted, so re-running this cell needs a kernel restart first.
get_url_process = CrawlerProcess(custom_settings)
get_url_process.crawl(cr.UrlMichelin)  # schedule the spider from the `crawler` module
get_url_process.start()

In [4]:
# Sanity check on the spider output: report how many lines urls.txt contains.
if os.path.exists('urls.txt'):
    # Read through a context manager so the file handle is closed deterministically
    # (the bare open(...).readlines() form left it to the garbage collector).
    with open('urls.txt', 'r') as url_file:
        lines_in_file = url_file.readlines()
    number_of_lines = len(lines_in_file)
    print(f'Number of lines in file: {number_of_lines}')
else:
    print('Failure: File not found')

Number of lines in file: 1983


### 1.2. Crawl Michelin restaurant pages

Once you have all the URLs on the list, you should:

1. Download the HTML corresponding to each of the collected URLs.
2. After collecting each page, immediately save its `HTML` in a file. This way, if your program stops for any reason, you will not lose the data collected up to the stopping point.
3. Organize the downloaded `HTML` pages into folders. Each folder will contain the `HTML` of the restaurants from page 1, page 2, ... of the Michelin restaurant list.

__Tip__: Due to the large number of pages to download, consider using methods that can help shorten the process. If you employed a particular process or approach, kindly describe it.


In [None]:
# Read the "<url>|<page number>" records collected in step 1.1
lines_of_urls = []
with open('urls.txt', 'r') as file:
    lines_of_urls = file.readlines()

original_directory = os.getcwd()
os.makedirs('pages', exist_ok=True)

# The downloads run with `pages/` as working directory
# (presumably cr.HTML_downloader writes relative to the cwd -- TODO confirm).
os.chdir(os.path.join(original_directory, 'pages'))

# (what failed, why) pairs, reported at the end instead of being silently dropped
failed_downloads = []

try:
    # Create folders for the HTML files
    cr.make_folders(100)

    max_w = os.cpu_count()

    # Download the HTML files concurrently
    with ThreadPoolExecutor(max_workers=max_w) as executor:
        download_futures = []
        url_by_future = {}
        for line in lines_of_urls:
            # Tolerate blank lines (e.g. a trailing newline at the end of the file)
            if not line.strip():
                continue

            # Split the line into URL and page number
            try:
                fields = line.split("|")
                page_num = int(fields[1])
                url = fields[0].strip()
            except (IndexError, ValueError) as e:
                # A malformed record should not abort every other download
                failed_downloads.append((line.strip(), e))
                continue

            # Submit download task to the executor
            future = executor.submit(cr.HTML_downloader, url, page_num)
            download_futures.append(future)
            url_by_future[future] = url

        # Wait for all tasks to complete; result() re-raises any exception from
        # the worker thread, which would otherwise go unnoticed.
        for future in as_completed(download_futures):
            try:
                future.result()
            except Exception as e:
                failed_downloads.append((url_by_future[future], e))
finally:
    # Always return to the original directory, even if something above raised,
    # so later cells keep resolving relative paths correctly.
    os.chdir(original_directory)

# Notify completion
if failed_downloads:
    print(f"{len(failed_downloads)} download(s) failed:")
    for failed_item, error in failed_downloads:
        print(f"  {failed_item}: {error}")
else:
    print("Downloaded all pages!")

Downloaded all pages!


In [None]:
# Count the downloaded HTML files across pages/page_1 ... pages/page_100
# (the expected total is the number of lines in urls.txt).

current_dir = os.getcwd()
dir_path = os.path.join(current_dir, 'pages')
count = 0

# Work with absolute paths instead of os.chdir(): if a folder is missing, the
# kernel's working directory is not left pointing at `pages/` for later cells.
for i in range(1, 101):
    folder = os.path.join(dir_path, f'page_{i}')
    if not os.path.isdir(folder):
        # A missing page folder simply contributes no files to the total
        continue
    for path in os.listdir(folder):
        if os.path.isfile(os.path.join(folder, path)):
            count += 1

print('File count:', count)

File count: 1983


### 1.3 Parse downloaded pages

At this point, you should have all the HTML documents for the restaurants of interest, and you can start extracting specific information. The information we want for each restaurant, and the format of each field, is as follows:

1. **Restaurant Name** (to save as `restaurantName`): string;
2. **Address** (to save as `address`): string;
3. **City** (to save as `city`): string;
4. **Postal Code** (to save as `postalCode`): string;
5. **Country** (to save as `country`): string;
6. **Price Range** (to save as `priceRange`): string;
7. **Cuisine Type** (to save as `cuisineType`): string;
8. **Description** (to save as `description`): string;
9. **Facilities and Services** (to save as `facilitiesServices`): list of strings;
10. **Accepted Credit Cards** (to save as `creditCards`): list of strings;
11. **Phone Number** (to save as `phoneNumber`): string;
12. **URL to the Restaurant Page** (to save as `website`): string.

For each restaurant, you create a `restaurant_i.tsv` file of this structure:

```
restaurantName \t address \t  ... \t url
```

If a piece of information is missing, leave it as an empty string.

In [3]:
# Output location for the per-restaurant TSV files
current_dir = os.getcwd()
dest_dir = os.path.join(current_dir, 'data_tsv')
os.makedirs('data_tsv', exist_ok=True)

# Column names handed to the extractor.
# NOTE(review): the task description lists facilitiesServices before creditCards;
# confirm how pr.tsv_extractor uses this ordering before changing it.
keys = [
    'index',
    'restaurantName',
    'address',
    'city',
    'postalCode',
    'country',
    'priceRange',
    'cuisineType',
    'description',
    'creditCards',
    'facilitiesServices',
    'phoneNumber',
    'website',
]

max_w = os.cpu_count()

# Parse the saved HTML concurrently: one task per pages/page_<n> folder.
with ThreadPoolExecutor(max_workers=max_w) as executor:
    # Each folder gets a starting index offset of 20 per page
    # (presumably 20 restaurants per listing page -- TODO confirm).
    extractor_future = [
        executor.submit(
            pr.tsv_extractor,
            os.path.join('pages', f'page_{page}'),
            dest_dir,
            20 * (page - 1),
            keys,
        )
        for page in range(1, 101)
    ]

    # Block until every task is done, printing any exception a task raised
    for future in as_completed(extractor_future):
        try:
            future.result()
        except Exception as e:
            print(e)

# Notify completion
print("Extracted all data!")

Extracted all data!


In [4]:
# Count the regular files in data_tsv/ (one is expected per downloaded page)
current_dir = os.getcwd()
dir_path = os.path.join(current_dir, 'data_tsv')

# dir_path is built from the current working directory, so joining against it
# resolves to the same files as the relative 'data_tsv' prefix.
count = sum(
    1
    for entry in os.listdir(dir_path)
    if os.path.isfile(os.path.join(dir_path, entry))
)

print('File count:', count)

File count: 1983


In [10]:
# Merge every per-restaurant TSV into a single dataset.tsv, ordered by `index`

# Directory containing the TSV files
current_dir = os.getcwd()
dir_path = os.path.join(current_dir, 'data_tsv')

# List all TSV files in the directory; sorted because os.listdir() returns
# entries in arbitrary order and a deterministic read order is easier to debug.
tsv_files = sorted(f for f in os.listdir(dir_path) if f.endswith('.tsv'))
if not tsv_files:
    # pd.concat([]) would fail with a much less helpful message
    raise FileNotFoundError(f'No .tsv files found in {dir_path}')

# Keep text fields that look numeric as strings. Without this, pandas infers
# integers and e.g. a postal code "00186" would be written back as 186.
text_columns = {'postalCode': str, 'phoneNumber': str}

# Load all TSV files into a list of dataframes
dfs = [
    pd.read_csv(os.path.join(dir_path, file), sep='\t', dtype=text_columns)
    for file in tsv_files
]

# Unite all dataframes into one, ordered by the restaurant index
merged_df = pd.concat(dfs, ignore_index=True)
merged_df = merged_df.sort_values(by=['index'])

# Save the merged dataframe to a TSV file
merged_df.to_csv('dataset.tsv', sep='\t', index=False)

# Notify completion
print("Unified all data!")


Unified all data!
