## Scraping "Guide Michelin" with Selenium

In [None]:
%pip install selenium

In [9]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import time
import csv
import re

In [12]:
class SeleniumScraper:
    def __init__(self, query, max_pages):
        self.base_url = "https://guide.michelin.com/"
        self.query = query
        self.max_pages = max_pages
        
        chrome_options = webdriver.ChromeOptions()
        chrome_options.add_argument('--headless')
        chrome_options.add_argument('--no-sandbox')
        chrome_options.add_argument('--disable-dev-shm-usage')
        self.driver = webdriver.Chrome(options=chrome_options)
        self.restaurants = []
        self.rate_limiter_delay = 1.5

    def scrape(self):
        page = 1
        while page <= self.max_pages:
            print(f"Scraping page: {page}")
            url = f"{self.base_url}{self.query}{page}"
            self.driver.get(url)
            
            try:
                # Wait for cards to load dynamically
                WebDriverWait(self.driver, 10).until(
                    EC.presence_of_all_elements_located((By.CSS_SELECTOR, ".card__menu"))
                )
                
                # Extract restaurant data
                self.extract_restaurants()
                page += 1

                # Rate limiter
                time.sleep(self.rate_limiter_delay)
            except Exception as e:
                print(f"No more pages or error: {e}")
                break

        self.driver.quit()
        
    def _extract_number_stars(self, element):
        if element == None:
            return 0 # Just no stars
        attribute = element.get_attribute("src")
        img_name = attribute.split("/")[-1].split('.')[0]
        match = re.findall(r'\d', img_name)
        if len(match) > 0:
            return match[0] # Nb of stars in the pictures' file name
            
        return 0 # Bib Gourmand
           
    def extract_restaurants(self):
        cards = self.driver.find_elements(By.CSS_SELECTOR, ".card__menu")

        for card in cards:
            try:
                try:
                    # Attempt to find the element
                    rating_container = card.find_element(By.CSS_SELECTOR, ".flex-fill div div div div span img")
                except Exception:
                    # If not found, assign a default value
                    rating_container = None

                rating = self._extract_number_stars(rating_container)
                id_ = card.get_attribute("data-id")
                img = card.find_element(By.CSS_SELECTOR, ".card__menu-image a").get_attribute("ci-bg-url")
                name = card.find_element(By.CSS_SELECTOR, ".card__menu-content--title a").text.strip()
                link = card.find_element(By.CSS_SELECTOR, ".card__menu-content--title a").get_attribute("href")
                lat = float(card.get_attribute("data-lat"))
                lon = float(card.get_attribute("data-lng"))
                
                data_container = card.find_element(By.CSS_SELECTOR, ".card__menu-image .card__menu-image--top div div .js-favorite-restaurant")
                location = data_container.get_attribute("data-dtm-city")
                country = data_container.get_attribute("data-restaurant-selection")
                type_ = data_container.get_attribute("data-cooking-type")
                price_category = len(data_container.get_attribute("data-dtm-price"))


                self.restaurants.append({
                    "id": id_,
                    "name": name,
                    "location": location,
                    "country": country,
                    "latitude": lat,
                    "longitude": lon,
                    "rating": rating,
                    "priceCategory": price_category,
                    "type": type_,
                    #"year": year,
                    "img": img,
                    "link": link,
                })
            except Exception as e:
                print(f"Error extracting card: {e}")

    def save_to_csv(self, filename="restaurants.csv"):
        headers = ["id", "name", "location", "country", "latitude", "longitude", "rating", "priceCategory", "type", "img", "link"]

        try:
            with open(filename, mode="w", encoding="utf-8", newline="") as file:
                writer = csv.DictWriter(file, fieldnames=headers)
                writer.writeheader()
                for restaurant in self.restaurants:
                    writer.writerow(restaurant)
            print(f"Data saved to {filename}")
        except Exception as e:
            print(f"Error saving to CSV: {e}")

In [None]:
scraper = SeleniumScraper(query="us/en/selection/france/restaurants/page/", max_pages=500)
scraper.scrape()
scraper.save_to_csv("../data/raw/michelin_restaurants.csv")

Scraping page: 1
Scraping page: 2
Scraping page: 3
Scraping page: 4
Scraping page: 5
Scraping page: 6
Scraping page: 7
Scraping page: 8
Scraping page: 9
Error extracting card: Message: stale element reference: stale element not found in the current frame
  (Session info: chrome=131.0.6778.108); For documentation on this error, please visit: https://www.selenium.dev/documentation/webdriver/troubleshooting/errors#stale-element-reference-exception
Stacktrace:
	GetHandleVerifier [0x00007FF7F1CB6CF5+28821]
	(No symbol) [0x00007FF7F1C23880]
	(No symbol) [0x00007FF7F1AC578A]
	(No symbol) [0x00007FF7F1ACC2ED]
	(No symbol) [0x00007FF7F1ACF308]
	(No symbol) [0x00007FF7F1B606B0]
	(No symbol) [0x00007FF7F1B3F2FA]
	(No symbol) [0x00007FF7F1B5F412]
	(No symbol) [0x00007FF7F1B3F0A3]
	(No symbol) [0x00007FF7F1B0A778]
	(No symbol) [0x00007FF7F1B0B8E1]
	GetHandleVerifier [0x00007FF7F1FEFCED+3408013]
	GetHandleVerifier [0x00007FF7F200745F+3504127]
	GetHandleVerifier [0x00007FF7F1FFB63D+3455453]
	GetHandl

### Avoid `stale element reference` error

In [10]:
from selenium.common.exceptions import StaleElementReferenceException, TimeoutException

class SeleniumScraper:
    def __init__(self, query, max_pages):
        self.base_url = "https://guide.michelin.com/"
        self.query = query
        self.max_pages = max_pages
        
        chrome_options = webdriver.ChromeOptions()
        chrome_options.add_argument('--headless')
        chrome_options.add_argument('--no-sandbox')
        chrome_options.add_argument('--disable-dev-shm-usage')
        self.driver = webdriver.Chrome(options=chrome_options)
        self.restaurants = []
        self.rate_limiter_delay = 1.5

    def _safely_get_element(self, container, by, value, timeout=10, max_attempts=3):
        """
        Safely retrieve a web element, handling stale element references.
        
        :param container: Parent WebElement or WebDriver to search within
        :param by: Selenium By locator (e.g., By.CSS_SELECTOR, By.XPATH)
        :param value: Locator value
        :param timeout: Maximum wait time for element
        :param max_attempts: Maximum number of retry attempts
        :return: Web element or None if not found
        """
        for attempt in range(max_attempts):
            try:
                # Find the element
                element = container.find_element(by, value)
                
                # Additional check to ensure element is not stale
                element.tag_name
                return element
            
            except (StaleElementReferenceException, Exception):
                # If it's not the last attempt, continue to retry
                if attempt < max_attempts - 1:
                    print(f"Attempt {attempt + 1} failed. Retrying element location...")
                    time.sleep(timeout)
                    continue
                else:
                    print(f"Failed to locate element after {max_attempts} attempts")
                    return None

    def scrape(self):
        page = 1
        while page <= self.max_pages:
            print(f"Scraping page: {page}")
            url = f"{self.base_url}{self.query}{page}"
            self.driver.get(url)
            
            try:
                # Wait for cards to load dynamically
                WebDriverWait(self.driver, 40).until(
                    EC.presence_of_all_elements_located((By.CSS_SELECTOR, ".card__menu"))
                )
                
                # Extract restaurant data
                self._extract_restaurants()
                page += 1

                # Rate limiter
                time.sleep(self.rate_limiter_delay)
            except Exception as e:
                print(f"No more pages or error: {e}")
                break

        self.driver.quit()
        
    def _extract_number_stars(self, element):
        try:
            images = element.find_elements(By.TAG_NAME, 'img')
            if len(images) - 1 == 0:
                return -1 # Just no stars
            else:
                nb_stars = 0 # Bib Gourmand
                for img in images:
                    attribute = img.get_attribute("src")
                    if "1star.svg" in attribute:
                        nb_stars += 1 # Nb of stars in the container
                return nb_stars
        except Exception:
            pass
           
    def _extract_image_url(self, card):
        """
        Robust method to extract image URL from the Michelin Guide card element
        
        Args:
            card (WebElement): The card element to extract image from
        
        Returns:
            str: Image URL or empty string if not found
        """
        try:
            # First, try the ci-bg-url attribute on the image wrapper
            image_wrapper = self._safely_get_element(
                card, 
                By.CSS_SELECTOR, 
                ".card__menu-image .image-wrapper[ci-bg-url]"
            )
            
            if image_wrapper:
                url = image_wrapper.get_attribute("ci-bg-url")
                if url and url.strip():
                    return url
            
            # Fallback: try finding the image element with data-gallery-image
            image_element = self._safely_get_element(
                card, 
                By.CSS_SELECTOR, 
                ".card__menu-image .icon-box img[data-gallery-image]"
            )
            
            if image_element:
                gallery_images = image_element.get_attribute("data-gallery-image")
                if gallery_images:
                    # Split gallery images and return the first one
                    image_urls = gallery_images.split(',')
                    return image_urls[0] if image_urls else ""
            
            # Fallback: try to get background image via JavaScript
            url = self.driver.execute_script("""
                var card = arguments[0];
                var imageWrapper = card.querySelector('.card__menu-image .image-wrapper[ci-bg-url]');
                if (imageWrapper) {
                    return imageWrapper.getAttribute('ci-bg-url');
                }
                return '';
            """, card)
            
            if url and url.strip():
                return url
        
        except Exception as e:
            print(f"Error extracting image URL: {e}")
        
        print("No image URL found for this card")
        return ""
    
    def _extract_restaurants(self):
        try:
            # Use WebDriverWait to ensure cards are loaded and stable
            cards = WebDriverWait(self.driver, 40).until(
                EC.presence_of_all_elements_located((By.CSS_SELECTOR, ".card__menu"))
            )

            for card in cards:
                try:
                    # Use safe element retrieval for each extraction
                    try:
                        rating_container = self._safely_get_element(
                            card, 
                            By.CSS_SELECTOR, 
                            ".flex-fill div div div div span"
                        )
                    except Exception:
                        rating_container = None

                    rating = self._extract_number_stars(rating_container)
                    
                    # Safely get each element with retry mechanism
                    
                    img_link = self._extract_image_url(card)
                    if not img_link:
                        print(f"Could not extract image for restaurant: {name}")
                    
                    name_element = self._safely_get_element(
                        card, 
                        By.CSS_SELECTOR, 
                        ".card__menu-content--title a"
                    )
                    
                    name = name_element.text.strip()
                    link = name_element.get_attribute("href")
                    
                    id_ = card.get_attribute("data-id")
                    lat = float(card.get_attribute("data-lat"))
                    lon = float(card.get_attribute("data-lng"))
                    
                    data_container = self._safely_get_element(
                        card, 
                        By.CSS_SELECTOR, 
                        ".card__menu-image .card__menu-image--top div div .js-favorite-restaurant"
                    )
                    
                    if name == None:
                        name = data_container.get_attribute("data-restaurant-name")
                    location = data_container.get_attribute("data-dtm-city")
                    country = data_container.get_attribute("data-restaurant-selection")
                    type_ = data_container.get_attribute("data-cooking-type")
                    price_category = len(data_container.get_attribute("data-dtm-price"))

                    self.restaurants.append({
                        "id": id_,
                        "name": name,
                        "location": location,
                        "country": country,
                        "latitude": lat,
                        "longitude": lon,
                        "rating": rating,
                        "priceCategory": price_category,
                        "type": type_,
                        "img": img_link,
                        "link": link,
                    })
                except Exception as e:
                    print(f"Error extracting individual card: {e}")
        except Exception as e:
            print(f"Error in extracting restaurant cards: {e}")

    def save_to_csv(self, filename="restaurants.csv"):
        headers = ["id", "name", "location", "country", "latitude", "longitude", "rating", "priceCategory", "type", "img", "link"]

        try:
            with open(filename, mode="w", encoding="utf-8", newline="") as file:
                writer = csv.DictWriter(file, fieldnames=headers)
                writer.writeheader()
                for restaurant in self.restaurants:
                    writer.writerow(restaurant)
            print(f"Data saved to {filename}")
        except Exception as e:
            print(f"Error saving to CSV: {e}")

### Michelin's Green Star

![green_star_michelin](https://d3h1lg3ksw6i6b.cloudfront.net/media/image/2022/10/11/31d3d763e68745dca54c19db6978db5f_Green-Star-hero-image.jpg)

> The MICHELIN Green Star is an annual award which highlights restaurants at the forefront of the industry when it comes to their sustainable practices. They hold themselves accountable for both their ethical and environmental standards, and work with sustainable producers and suppliers to avoid waste and reduce or even remove plastic and other non-recyclable materials from their supply chain.
> 
> These restaurants offer dining experiences that combine culinary excellence with outstanding eco-friendly commitments and are a source of inspiration both for keen foodies and the hospitality industry as a whole.

[What is a MICHELIN Green Star?](https://guide.michelin.com/kr/en/article/features/what-is-a-michelin-green-star-kr)

In [11]:
%%time
scraper = SeleniumScraper(query="us/en/restaurants/sustainable_gastronomy/page/", max_pages=500)
scraper.scrape()
scraper.save_to_csv("../data/raw/green_star_michelin_restaurants.csv")

Scraping page: 1
Scraping page: 2
Scraping page: 3
Scraping page: 4
Scraping page: 5
Scraping page: 6
Scraping page: 7
Scraping page: 8
Scraping page: 9
Attempt 1 failed. Retrying element location...
Attempt 2 failed. Retrying element location...
Failed to locate element after 3 attempts
Attempt 1 failed. Retrying element location...
Attempt 2 failed. Retrying element location...
Failed to locate element after 3 attempts
No image URL found for this card
Could not extract image for restaurant: Caffè La Crepa
Scraping page: 10
Scraping page: 11
Scraping page: 12
Scraping page: 13
Scraping page: 14
Scraping page: 15
Scraping page: 16
Scraping page: 17
Scraping page: 18
Scraping page: 19
Scraping page: 20
Scraping page: 21
Scraping page: 22
Scraping page: 23
Scraping page: 24
Scraping page: 25
Scraping page: 26
Scraping page: 27
Scraping page: 28
Scraping page: 29
Scraping page: 30
Scraping page: 31
No more pages or error: Message: 

Data saved to ../data/raw/green_star_michelin_restaurant

KeyboardInterrupt: 

## API Fetching to get Reviews

### [Google Places API](https://developers.google.com/maps/documentation/places/web-service/op-overview?hl=fr)

The HTTP request to send to the API is the following
```http
POST https://places.googleapis.com/v1/places:searchText?fields=places.id%2Cplaces.formattedAddress%2Cplaces.websiteUri%2Cplaces.rating%2Cplaces.reviews%2Cplaces.regularOpeningHours.periods%2Cplaces.googleMapsLinks.directionsUri HTTP/1.1

Authorization: Bearer [YOUR_ACCESS_TOKEN]
Accept: application/json
Content-Type: application/json

{
  "textQuery": "Choko Ona,Espelette",
  "includedType": "restaurant"
}
```

In [13]:
import pandas as pd

In [14]:
restaurants = pd.read_csv("../data/raw/green_star_michelin_restaurants.csv")
restaurants.head()

Unnamed: 0,id,name,location,country,latitude,longitude,rating,priceCategory,type,img,link
0,110681,One White Street,New York,USA,40.719355,-74.006101,1,4,Contemporary,https://axwwgrkdco.cloudimg.io/v7/__gmpics3__/...,https://guide.michelin.com/us/en/new-york-stat...
1,110644,Dirt Candy,New York,USA,40.71794,-73.99087,1,4,Vegetarian,https://axwwgrkdco.cloudimg.io/v7/__gmpics3__/...,https://guide.michelin.com/us/en/new-york-stat...
2,110607,Family Meal at Blue Hill,New York,USA,40.731971,-73.999639,1,4,American,https://axwwgrkdco.cloudimg.io/v7/__gmpics3__/...,https://guide.michelin.com/us/en/new-york-stat...
3,110478,Blue Hill at Stone Barns,Tarrytown,USA,41.100715,-73.829297,2,4,American,https://axwwgrkdco.cloudimg.io/v7/__gmpics3__/...,https://guide.michelin.com/us/en/new-york-stat...
4,110338,Oyster Oyster,Washington,USA,38.909302,-77.023015,1,3,Vegetarian,https://axwwgrkdco.cloudimg.io/v7/__gmpics3__/...,https://guide.michelin.com/us/en/district-of-c...


In [15]:
restaurants.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 581 entries, 0 to 580
Data columns (total 11 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   id             581 non-null    int64  
 1   name           581 non-null    object 
 2   location       581 non-null    object 
 3   country        581 non-null    object 
 4   latitude       581 non-null    float64
 5   longitude      581 non-null    float64
 6   rating         581 non-null    int64  
 7   priceCategory  581 non-null    int64  
 8   type           581 non-null    object 
 9   img            580 non-null    object 
 10  link           581 non-null    object 
dtypes: float64(2), int64(3), object(6)
memory usage: 50.1+ KB


I keep having an issue with that one image, I don't get why... So let's manually declare it

In [16]:
restaurants[restaurants['img'].isnull()]

Unnamed: 0,id,name,location,country,latitude,longitude,rating,priceCategory,type,img,link
176,105897,Ahimè,Bologna,Italia,44.496621,11.337969,0,2,Country cooking,,https://guide.michelin.com/us/en/emilia-romagn...


In [18]:
restaurants.at[176, 'img'] = "https://d3h1lg3ksw6i6b.cloudfront.net/guide/placeholder/pic_poilist_default_3.jpg"
restaurants.iloc[176]

id                                                          105897
name                                                         Ahimè
location                                                   Bologna
country                                                     Italia
latitude                                                 44.496621
longitude                                                11.337969
rating                                                           0
priceCategory                                                    2
type                                               Country cooking
img              https://d3h1lg3ksw6i6b.cloudfront.net/guide/pl...
link             https://guide.michelin.com/us/en/emilia-romagn...
Name: 176, dtype: object

#### GET [`v1/places:searchText`](https://developers.google.com/maps/documentation/places/web-service/text-search?hl=fr)
To get the necessary `PLACE_ID` for the `v1/places/PLACE_ID` endpoint

In [6]:
GOOGLE_PLACES_API_URL = "https://places.googleapis.com/v1/places:searchText"

In [None]:
import requests
import os

def __get_location_types(row):
    return f"{row['name']}, {row['location']} {row['country']}"

def get_data():
    """Fetches nearby places data either from a test file or the Google Places API.

    Returns:
        list: A list of places fetched from the API or test file.
    """
        
    try:
        headers = {
            # "Authorization": f"Bearer { GOOGLE_PLACES_API_KEY }",
            "Authorization": f"Bearer { os.environ["GOOGLE_PLACES_API_KEY"] }",
            "Content-Type": "application/json",
            "X-Goog-Api-Key": "",
        }
        
        params = {
            # places.id,places.formattedAddress,places.websiteUri,places.rating,places.reviews,places.regularOpeningHours.periods,places.googleMapsLinks.directionsUri
            "fields": [
                "places.id",
                "places.formattedAddress",
                "places.websiteUri",
                "places.rating",
                "places.reviews",
                "places.regularOpeningHours.periods",
                "places.googleMapsLinks.directionsUri",
            ]
        }

        req_data = {
            "textQuery": __get_location_types(),
            "includedType": "restaurant"
        }
        
        # Make a request to the Google Places API
        response = requests.post(url=GOOGLE_PLACES_API_URL, params=params, data=req_data, headers=headers)
        response.raise_for_status()
        data = response.json()
        
        # Return the places
        return data["places"][0]

    except requests.RequestException as e:
        return print(f"error: {e}")
        


The Google Places API, as well as the TripAdvisor, only returns up to 5 reviews per place. So it would be relevant to get reviews from others users through another service. As an important aggregator of communities, Reddit could be a pertinent resource.

## API Fetching to get more reviews on Reddit
with [Reddit API](https://www.reddit.com/dev/api/) or [PRAW](https://praw.readthedocs.io/en/stable/)

subreddits such as [r/food](https://www.reddit.com/r/food/), [r/restaurant](https://www.reddit.com/r/restaurant/) or [r/MichelinStars](https://www.reddit.com/r/MichelinStars/)