## Scraping "Guide Michelin" with Selenium

In [None]:
%pip install selenium

In [None]:
import csv
import re
import time
from pathlib import Path

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait

In [None]:
class SeleniumScraper:
    """Scrape restaurant cards from paginated Guide Michelin listing pages.

    Each page URL is built as ``base_url + query + page_number``. Every
    ``.card__menu`` element found on a page is converted into a dict and
    accumulated in ``self.restaurants``, which ``save_to_csv`` then writes out.

    Args:
        query: Path fragment appended to the base URL; the page number is
            appended directly after it (so it normally ends with ``page/``).
        max_pages: Highest page number to request. Scraping stops earlier as
            soon as a page fails to show any card within ``wait_timeout``.
        rate_limiter_delay: Seconds to sleep after each successfully scraped
            page.
        wait_timeout: Seconds to wait for the cards of a page to appear.
    """

    _CARD_SELECTOR = ".card__menu"
    _RATING_SELECTOR = ".flex-fill div div div div span img"
    _TITLE_LINK_SELECTOR = ".card__menu-content--title a"
    _IMAGE_LINK_SELECTOR = ".card__menu-image a"
    _FAVORITE_SELECTOR = (
        ".card__menu-image .card__menu-image--top div div .js-favorite-restaurant"
    )
    _CSV_FIELDS = [
        "id", "name", "location", "latitude", "longitude",
        "rating", "priceCategory", "type", "img", "link",
    ]
    _STAR_DIGIT = re.compile(r"\d")

    def __init__(self, query, max_pages, *, rate_limiter_delay=1.5, wait_timeout=10):
        self.base_url = "https://guide.michelin.com/"
        self.query = query
        self.max_pages = max_pages

        chrome_options = webdriver.ChromeOptions()
        chrome_options.add_argument('--headless')
        chrome_options.add_argument('--no-sandbox')
        chrome_options.add_argument('--disable-dev-shm-usage')
        self.driver = webdriver.Chrome(options=chrome_options)
        self.restaurants = []
        self.rate_limiter_delay = rate_limiter_delay
        self.wait_timeout = wait_timeout

    def scrape(self):
        """Visit pages 1..max_pages and collect the restaurants of each one.

        Stops at the first page whose cards do not load (or whose extraction
        raises). The browser is always shut down when this method returns or
        raises, so the instance cannot be used for a second scrape afterwards.
        """
        try:
            for page in range(1, self.max_pages + 1):
                print(f"Scraping page: {page}")
                url = f"{self.base_url}{self.query}{page}"
                self.driver.get(url)

                try:
                    # Cards are rendered dynamically; wait until some exist.
                    WebDriverWait(self.driver, self.wait_timeout).until(
                        EC.presence_of_all_elements_located(
                            (By.CSS_SELECTOR, self._CARD_SELECTOR)
                        )
                    )
                    self.extract_restaurants()
                except Exception as e:
                    print(f"No more pages or error: {e}")
                    break

                # Rate limiter
                time.sleep(self.rate_limiter_delay)
        finally:
            # Release the browser even if navigation itself failed.
            self.driver.quit()

    def _extract_number_stars(self, element):
        """Return the number of stars encoded in a rating image, as an int.

        The count is the first digit found in the file name of the image's
        ``src``. Returns 0 when there is no rating image, when the image has
        no ``src``, or when the file name holds no digit (Bib Gourmand).
        """
        if element is None:
            return 0  # Just no stars

        src = element.get_attribute("src")
        if not src:
            return 0

        img_name = src.split("/")[-1].split(".")[0]
        digit = self._STAR_DIGIT.search(img_name)
        # Always an int so the "rating" column has a single type.
        return int(digit.group()) if digit else 0

    def _extract_card(self, card):
        """Build the restaurant dict for a single ``.card__menu`` element."""
        try:
            rating_img = card.find_element(By.CSS_SELECTOR, self._RATING_SELECTOR)
        except Exception:
            # No rating image on this card: treated as zero stars.
            rating_img = None

        # Resolve shared elements once instead of once per attribute; each
        # lookup is a separate round trip to the browser.
        title_link = card.find_element(By.CSS_SELECTOR, self._TITLE_LINK_SELECTOR)
        favorite = card.find_element(By.CSS_SELECTOR, self._FAVORITE_SELECTOR)
        image_link = card.find_element(By.CSS_SELECTOR, self._IMAGE_LINK_SELECTOR)

        # NOTE: the award year is not collected.
        return {
            "id": card.get_attribute("data-id"),
            "name": title_link.text.strip(),
            "location": favorite.get_attribute("data-dtm-city"),
            "latitude": float(card.get_attribute("data-lat")),
            "longitude": float(card.get_attribute("data-lng")),
            "rating": self._extract_number_stars(rating_img),
            # The price category is the length of the price attribute;
            # a missing attribute counts as 0 instead of failing the card.
            "priceCategory": len(favorite.get_attribute("data-dtm-price") or ""),
            "type": favorite.get_attribute("data-cooking-type"),
            "img": image_link.get_attribute("ci-bg-url"),
            "link": title_link.get_attribute("href"),
        }

    def extract_restaurants(self):
        """Append one dict per card of the current page to ``self.restaurants``.

        A card whose extraction fails is reported on stdout and skipped; the
        remaining cards are still processed.
        """
        cards = self.driver.find_elements(By.CSS_SELECTOR, self._CARD_SELECTOR)

        for card in cards:
            try:
                self.restaurants.append(self._extract_card(card))
            except Exception as e:
                print(f"Error extracting card: {e}")

    def save_to_csv(self, filename="restaurants.csv"):
        """Write ``self.restaurants`` to ``filename`` as UTF-8 CSV.

        Missing parent directories are created first. Failures are reported
        on stdout rather than raised.
        """
        try:
            # Do not lose a finished scrape because the target folder is missing.
            Path(filename).parent.mkdir(parents=True, exist_ok=True)
            with open(filename, mode="w", encoding="utf-8", newline="") as file:
                writer = csv.DictWriter(file, fieldnames=self._CSV_FIELDS)
                writer.writeheader()
                writer.writerows(self.restaurants)
            print(f"Data saved to {filename}")
        except Exception as e:
            print(f"Error saving to CSV: {e}")

In [None]:
# Scrape the France restaurant selection and store the raw result as CSV.
# max_pages is only an upper bound: scrape() stops at the first page whose
# cards fail to load.
france_query = "us/en/selection/france/restaurants/page/"
france_csv_path = "../data/raw/michelin_restaurants.csv"

scraper = SeleniumScraper(query=france_query, max_pages=500)
scraper.scrape()
scraper.save_to_csv(france_csv_path)

Scraping page: 1
Scraping page: 2
Scraping page: 3
Scraping page: 4
Scraping page: 5
Scraping page: 6
Scraping page: 7
Scraping page: 8
Scraping page: 9
Error extracting card: Message: stale element reference: stale element not found in the current frame
  (Session info: chrome=131.0.6778.108); For documentation on this error, please visit: https://www.selenium.dev/documentation/webdriver/troubleshooting/errors#stale-element-reference-exception
Stacktrace:
	GetHandleVerifier [0x00007FF7F1CB6CF5+28821]
	(No symbol) [0x00007FF7F1C23880]
	(No symbol) [0x00007FF7F1AC578A]
	(No symbol) [0x00007FF7F1ACC2ED]
	(No symbol) [0x00007FF7F1ACF308]
	(No symbol) [0x00007FF7F1B606B0]
	(No symbol) [0x00007FF7F1B3F2FA]
	(No symbol) [0x00007FF7F1B5F412]
	(No symbol) [0x00007FF7F1B3F0A3]
	(No symbol) [0x00007FF7F1B0A778]
	(No symbol) [0x00007FF7F1B0B8E1]
	GetHandleVerifier [0x00007FF7F1FEFCED+3408013]
	GetHandleVerifier [0x00007FF7F200745F+3504127]
	GetHandleVerifier [0x00007FF7F1FFB63D+3455453]
	GetHandl

### Michelin's Green Star

![green_star_michelin](https://d3h1lg3ksw6i6b.cloudfront.net/media/image/2022/10/11/31d3d763e68745dca54c19db6978db5f_Green-Star-hero-image.jpg)

> The MICHELIN Green Star is an annual award which highlights restaurants at the forefront of the industry when it comes to their sustainable practices. They hold themselves accountable for both their ethical and environmental standards, and work with sustainable producers and suppliers to avoid waste and reduce or even remove plastic and other non-recyclable materials from their supply chain.
> 
> These restaurants offer dining experiences that combine culinary excellence with outstanding eco-friendly commitments and are a source of inspiration both for keen foodies and the hospitality industry as a whole.

[What is a MICHELIN Green Star?](https://guide.michelin.com/kr/en/article/features/what-is-a-michelin-green-star-kr)

In [None]:
# Scrape the sustainable-gastronomy (Green Star) listing and store it as CSV.
# max_pages is only an upper bound: scrape() stops at the first page whose
# cards fail to load.
green_star_query = "us/en/restaurants/sustainable_gastronomy/page/"
green_star_csv_path = "../data/raw/green_star_michelin_restaurants.csv"

scraper = SeleniumScraper(query=green_star_query, max_pages=500)
scraper.scrape()
scraper.save_to_csv(green_star_csv_path)

## API Fetching to get TripAdvisor Reviews

[TripAdvisor COM](https://rapidapi.com/ntd119/api/tripadvisor-com1/playground/apiendpoint_69ee0260-7cc1-470b-b652-8c58aef0dcd5)

### RapidAPI

#### GET [`auto-complete`](https://rapidapi.com/ntd119/api/tripadvisor-com1/playground/apiendpoint_0171288d-04e1-4055-9a72-49337370286b)
Returns the `geoID` required by the `restaurants/search` endpoint.

#### GET [`restaurants/search`](https://rapidapi.com/ntd119/api/tripadvisor-com1/playground/apiendpoint_d9566671-eb47-40b7-99b5-abde0dc7414e) to get Restaurant ID in RapidAPI context
Returns the `contentID` needed to fetch a restaurant's reviews from the `restaurants/reviews` endpoint.

#### GET [`restaurants/reviews`](https://rapidapi.com/ntd119/api/tripadvisor-com1/playground/apiendpoint_69ee0260-7cc1-470b-b652-8c58aef0dcd5) 
To fetch the restaurants' reviews

### [Google Places API](https://developers.google.com/maps/documentation/places/web-service/op-overview?hl=fr)

#### GET [`v1/places:searchText`](https://developers.google.com/maps/documentation/places/web-service/text-search?hl=fr)
To get the necessary `PLACE_ID` for the `v1/places/PLACE_ID` endpoint

#### GET [`v1/places/PLACE_ID`](https://developers.google.com/maps/documentation/places/web-service/place-details?hl=fr)

## API Fetching to get more reviews on Reddit
with [Reddit API](https://www.reddit.com/dev/api/) or [PRAW](https://praw.readthedocs.io/en/stable/)