## Imports

In [9]:
import pandas as pd 
import numpy as np 
import requests
from selenium import webdriver
import matplotlib.pyplot as plt
import folium
from math import radians, sin, cos, asin, sqrt, ceil
import time
import os

from selenium import webdriver
import chromedriver_binary

from selenium.webdriver.common.keys import Keys

from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as ec
from selenium.webdriver.chrome.options import Options

from bs4 import BeautifulSoup

from dotenv import load_dotenv, find_dotenv

In [11]:
path = find_dotenv()
load_dotenv(path)
yelp_key=os.getenv('YELP_KEY')
google_key=os.getenv('GOOGLE_PLACE_KEY')

## Get list of restaurants from Yelp API

In [12]:
#Haversine Distance function used to compute radius

def haversine_distance(lon1, lat1, lon2, lat2):
    """
    Compute distance between two pairs of coordinates (lon1, lat1, lon2, lat2)
    """
    lon1, lat1, lon2, lat2 = map(radians, [lon1, lat1, lon2, lat2])
    dlon = lon2 - lon1
    dlat = lat2 - lat1
    a = sin(dlat / 2) ** 2 + cos(lat1) * cos(lat2) * sin(dlon / 2) ** 2
    return 2 * 6371 * asin(sqrt(a))

In [13]:
#Divide Paris into subzones to request more restaurants from Yelp API

def subzones_paris(n_subzones):
    """
    Divide Paris into n_subzones squares. Returns list of centers (lat,lon)
    and the radius (in meters) to use for the API request
    """
    #lat-lon boundaries for Paris
    up_left=np.array([48.9002,2.2617])
    up_right=np.array([48.9002,2.4488])
    down_right=np.array([48.8202,2.4488])
    down_left=np.array([48.8202,2.2617])

    coor = np.array([up_left,up_right,down_right,down_left])

    #compute distance
    dist_h=up_right-up_left
    dist_v=up_left-down_left

    #divide in n_subzones

    epsilon_h=dist_h/n_subzones**0.5
    epsilon_v=dist_v/n_subzones**0.5

    centers=[]
    start=up_left

    for i in range(int(n_subzones**0.5)):
        for j in range(int(n_subzones**0.5)):
            start=(i+0.5)*epsilon_v+(j+0.5)*epsilon_h+up_left
            centers.append(start)
            
    
    radius = haversine_distance(up_left[0], up_left[1], centers[0][0], centers[0][1])
    
    return centers,radius*1000

In [14]:
def map_centers(centers,radius):
    centers_start=centers[int((len(centers)-1)/2)]
    m = folium.Map(location=list(centers_start), zoom_start=13)
    
    for c in centers:
        folium.Circle(radius=radius,location=list(c),
                      color="crimson",fill=False).add_to(m)
        
    return m 

In [15]:
map_centers(*subzones_paris(16))

In [16]:
def get_restaurants(centers,radius,api_key):
    """
    Returns DataFrame of restaurants in Paris
    """
    headers={'Authorization':f'Bearer {yelp_key}'}
    url = 'https://api.yelp.com/v3/businesses/search'

    data = []
    
    for i,c in enumerate(centers):
        print(f'---------- Requesting API for subzone #{i+1} ----------')
        for offset in range(0, 200, 50):
            print(f'   ------- Requesting API with offset = {offset} -------   ')
            params = {
                'limit': 50, 
                'categories':['restaurants'],
                'sort_by':'review_count',
                'offset': offset,
                'latitude':c[0],
                'longitude':c[1],
                'radius':int(radius)
            }

            response = requests.get(url, headers=headers, params=params)
            if response.status_code == 200:
                data += response.json()['businesses']
            elif response.status_code == 400:
                print('400 Bad Request')
                break
                
    print(f'#####   Request completed, {len(data)} businesses fetched   ###')
    return data

In [18]:
centers,radius = subzones_paris(4)
data = get_restaurants(centers,radius,yelp_key)

---------- Requesting API for subzone #1 ----------
   ------- Requesting API with offset = 0 -------   
   ------- Requesting API with offset = 50 -------   
   ------- Requesting API with offset = 100 -------   
   ------- Requesting API with offset = 150 -------   
---------- Requesting API for subzone #2 ----------
   ------- Requesting API with offset = 0 -------   
   ------- Requesting API with offset = 50 -------   
   ------- Requesting API with offset = 100 -------   
   ------- Requesting API with offset = 150 -------   
---------- Requesting API for subzone #3 ----------
   ------- Requesting API with offset = 0 -------   
   ------- Requesting API with offset = 50 -------   
   ------- Requesting API with offset = 100 -------   
   ------- Requesting API with offset = 150 -------   
---------- Requesting API for subzone #4 ----------
   ------- Requesting API with offset = 0 -------   
   ------- Requesting API with offset = 50 -------   
   ------- Requesting API with off

## Yelp Reviews Scrapping

In [19]:
### Create DF for Yelp data


def create_df_yelp(data):

    df = pd.DataFrame(columns=[
        'alias', 'name', 'url', 'categories', 'latitude', 'longitude',
        'address', 'zip_code', 'price', 'rating', 'review_count'
    ])

    features_to_loop = [
        'alias', 'name', 'url', 'categories', 'price', 'rating', 'review_count'
    ]

    #populate DF
    #if condition to avoid raising errors in case restaurant doesn't have all informations

    for i, d in enumerate(data):

        for f in features_to_loop:
            if f in d:
                df.loc[i, f] = d[f]
            else:
                df.loc[i, f] = ''

        if 'location' in d:
            if 'latitude' in d['coordinates']:
                df.loc[i, 'latitude'] = d['coordinates']['latitude']
            else:
                df.loc[i, 'latitude'] = ''

            if 'longitude' in d['coordinates']:
                df.loc[i, 'longitude'] = d['coordinates']['longitude']
            else:
                df.loc[i, 'longitude'] = ''

            if 'address1' in d['location']:
                df.loc[i, 'address'] = d['location']['address1']
            else:
                df.loc[i, 'address'] = ''

            if 'zip_code' in d['location']:
                df.loc[i, 'zip_code'] = d['location']['zip_code']
            else:
                df.loc[i, 'zip_code'] = 0

    #clean DF
    #dtypes
    df['latitude'] = df['latitude'].astype(float)
    df['longitude'] = df['longitude'].astype(float)
    df['zip_code'] = df['zip_code'].replace('', 0).astype(int)
    df['rating'] = df['rating'].astype(float)
    df['review_count'] = df['review_count'].astype(float)

    #url
    df['url'] = df['url'].apply(lambda txt: txt.split('?', 1)[0])

    #price
    prices = {'€': '1', '€€': '2', '€€€': '3', '€€€€': '4'}

    for euro, num in prices.items():
        df['price'] = df['price'].replace(euro, num)

    df['price'] = df['price'].replace('', 0).astype(int)

    #categories
    df['categories'] = df['categories'].apply(
        lambda dicts: ', '.join([d['alias'] for d in dicts]))

    return df.drop_duplicates()

In [20]:
df = create_df_yelp(data)

In [21]:
df.head()

Unnamed: 0,alias,name,url,categories,latitude,longitude,address,zip_code,price,rating,review_count
0,bouillon-chartier-paris,Bouillon Chartier,https://www.yelp.com/biz/bouillon-chartier-paris,french,48.87194,2.34317,7 rue du Faubourg Montmartre,75009,2,3.5,953.0
1,le-potager-du-père-thierry-paris-2,Le Potager du Père Thierry,https://www.yelp.com/biz/le-potager-du-p%C3%A8...,french,48.884476,2.341335,16 rue des Trois Frères,75018,2,4.5,427.0
2,la-cave-gourmande-paris-2,La Cave Gourmande,https://www.yelp.com/biz/la-cave-gourmande-par...,bistros,48.884499,2.339847,96 rue des Martyrs,75018,2,4.5,336.0
3,holybelly-19-paris,Holybelly 19,https://www.yelp.com/biz/holybelly-19-paris,"breakfast_brunch, cafes, desserts",48.87213,2.36078,19 rue Lucien Sampaix,75010,2,4.5,296.0
4,le-relais-de-venise-l-entrecôte-paris-2,Le Relais de Venise - l'Entrecôte,https://www.yelp.com/biz/le-relais-de-venise-l...,"desserts, bbq",48.878045,2.285062,271 boulevard Pereire,75017,3,4.0,275.0


In [22]:
df.loc[45]

alias                                    soul-kitchen-paris
name                                           Soul Kitchen
url             https://www.yelp.com/biz/soul-kitchen-paris
categories               coffee, desserts, breakfast_brunch
latitude                                           48.88916
longitude                                          2.342305
address                                      33 rue Lamarck
zip_code                                              75018
price                                                     2
rating                                                  4.5
review_count                                          129.0
Name: 45, dtype: object

In [None]:
 xpath_rate = f'//yelp-react-root/div[1]/div[4]/div/div/div[2]/div/div[1]/div[2]/section/div[2]/div/ul/li[{i}]/div/div[2]/div/div[1]/span/div'
            

In [45]:
url_ = df.loc[50,'url']

In [None]:
#get rating and date
        for i,rate in enumerate(soup.find_all(class_="arrange__373c0__2iVWK gutter-1__373c0__1NOjO vertical-align-middle__373c0__2sr2a border-color--default__373c0__r305k")[:10]):
            if verbose>1:
                print(f'--Getting rate and date #{i+1}...')
            rates.append(rate.find('span').find('div').attrs['aria-label'])
            dates.append(rate.find(class_="css-wc5edd").string)
                

In [155]:
### SCRAPPING ONE RESTAURANT --- YELP ###

def get_reviews_yelp(url,verbose=0,quiet_mode=True,load_strategy='eager'):
    
    #pages
    
    options=Options()
    if quiet_mode:
        options.add_argument('--headless')
    options.page_load_strategy = load_strategy
    driver = webdriver.Chrome(options=options)
    
    driver.get(url)
    xpath_page="//yelp-react-root/div[1]/div[4]/div/div/div[2]/div/div[1]/div[2]/section/div[2]/div/div[4]/div[2]/span"
    
    page = driver.find_element_by_xpath(xpath_page).text
    n_pages=int(page.split('of ')[1])
    reviews=[]
    rates=[]
    dates=[]
    
    print(f'### {n_pages} pages to scrap ###')
    
    for n in range(n_pages):
        
        if verbose>0:
            print(f'--- Fetching reviews of page #{n+1}...')
        
        #url of the n_page
        url_=f'{url}?start={n*10}'
        driver.get(url_)
        xpath_all_review=f"//yelp-react-root/div[1]/div[4]/div/div/div[2]/div/div[1]/div[2]/section/div[2]/div/ul"
            
            
           
        review_block = driver.find_element_by_xpath(xpath_all_review)
        all_reviews = review_block.find_elements_by_tag_name('li')
        for r in all_reviews:
            reviews.append(r.text)

        if verbose>1:
            print(f'--Getting reviews #{i}...')

        #review = driver.find_element_by_xpath(xpath_review)
        #rate = driver.find_element_by_xpath(xpath_rate)

        #reviews.append(review.text)
        #rates.append(rate.get_attribute('aria-label'))
            
        
            
    return dates,rates,reviews

In [156]:
df.loc[50,'url']

'https://www.yelp.com/biz/la-fourmi-paris'

In [157]:
dates,rates,reviews = get_reviews_yelp(df.loc[50,'url'],verbose=2)

  page = driver.find_element_by_xpath(xpath_page).text


### 2 pages to scrap ###
--- Fetching reviews of page #1...


  review_block = driver.find_element_by_xpath(xpath_all_review)


--Getting reviews #10...
--- Fetching reviews of page #2...
--Getting reviews #10...


In [158]:
dates,rates,reviews 

([],
 [],
 ['Ariane V.\nParis, France\n28\n45\n74\n2/11/2013\nLa fourmi literally means "the ant", and when you go there you understand why: the place is hiving of activity like a true anthill.\nIt\'s kind of complicated to find a table but somehow you always manage to. And once you have you feel home, looking at Pigalle neighborhood from the large windows.\nThe place is a true French bar, where students and young workers meet at night, to grab a cheap beer or a generous (thought very simple) meal. The staff is friendly but won\'t linger to talk to people as they are always busy.\nUseful 3\nFunny\nCool',
  "Jeremy S.\nEncinitas, CA\n916\n1257\n2303\n8/3/2015\n1 photo\nGreat spot to grab a drink and relax amidst a nice breeze of Montmarte. This location actually has a cool little breeze that runs through the Rue onto the outdoor seating. The beer selection is rather limited but the draft pints served is not bad at all. The servers here are rather attentive and the location is not too ba

In [151]:
len(dates) , len(reviews) , len(rates)

(0, 0, 0)

In [125]:
options=Options()
if True:
    options.add_argument('--headless')
options.page_load_strategy = 'normal'
driver = webdriver.Chrome(options=options)
    
driver.get(url_)
reviews=[]
for i in range(1,11):
    xpath_review=f"//yelp-react-root/div[1]/div[4]/div/div/div[2]/div/div[1]/div[2]/section/div[2]/div/ul"
    review = driver.find_element_by_xpath(xpath_review)
    reviews.append(review)
reviews

  review = driver.find_element_by_xpath(xpath_review)


AttributeError: 'list' object has no attribute 'shape'

## Google Review Scrapping

In [35]:
## Get place ID 

def get_place_google_id(name,latitude,longitude):

    url = 'https://maps.googleapis.com/maps/api/place/findplacefromtext/json'
    params={
        'key' : google_key,
        'input' : name,
        'inputtype' : 'textquery',
        'locationbias' : f'point:{latitude},{longitude}'
    }
    
    response = requests.get(url,params=params)
    
    #if conditions to avoid raising errors
    if response.status_code != 200:
        return ''
        
    if 'candidates' in response.json():
        response = response.json()['candidates']
        if len(response)==0:
            return ''
        if 'place_id' in response[0]:
            return response[0]['place_id']
    
    return ''


In [36]:
## id_ for testing purposes -- don't request API multiple times

name = df.loc[0,'name']
lat = df.loc[0,'latitude']
lon = df.loc[0,'longitude']
id_=get_place_google_id(name,lat,lon)

id_

'ChIJ4xutfT5u5kcRaJn2NkiOhPU'

In [37]:
## Get place url

def get_place_google_url(place_id):
    url='https://maps.googleapis.com/maps/api/place/details/json'
    params={
        'key' : google_key,
        'place_id' : place_id,
        'fields' : 'url'
    }
    
    response = requests.get(url,params=params)
    
    #if conditions to avoid raising errors
    if response.status_code != 200:
        return ''
    
    if 'result' in response.json():
        response = response.json()['result']
        if 'url' in response:
            return response['url']
    
    return ''


In [38]:
## url_for_test for testing purposes -- don't request API multiple times

url_for_test=get_place_google_url(id_)
url_for_test

'https://maps.google.com/?cid=17691421677029071208'

In [39]:
### Get all reviews from a Google page


def get_reviews_google(url,scroll_limit=None,quiet_mode=True,return_count=False):
    
    options=Options()
    if quiet_mode:
        options.add_argument('--headless')    
    driver = webdriver.Chrome(options=options)
    driver.get(url)

    
    ###Expand all the reviews using Selenium
        # privacy pop-up
    xpath = "/html/body/c-wiz/div/div/div/div[2]/div[1]/div[4]/form/div[1]/div/button/span"
    driver.find_element_by_xpath(xpath).click()

        #review_count click
    xpath = '//*[@id="pane"]/div/div[1]/div/div/div[2]/div[1]/div[1]/div[2]/div/div[1]/span[1]/span/span[1]/span[2]'

    review_count = driver.find_element_by_xpath(xpath).text
    review_count=review_count.split(' ', 1)[0]

    driver.find_element_by_xpath(xpath).click()

        #scroll to show all reviews
    time.sleep(2)
    if scroll_limit:
        review_count=scroll_limit
    scrollable_div = driver.find_element_by_xpath('//*[@id="pane"]/div/div[1]/div/div/div[2]')
    for i in range(0,(round(int(review_count)/10-1))):
            driver.execute_script('arguments[0].scrollTop = arguments[0].scrollHeight', 
                    scrollable_div)
            time.sleep(2)


    ### Scrap the reviews info using BS      
    soup = BeautifulSoup(driver.page_source, 'html.parser')
    
    #Scrap the reviews text
    reviews_soup = soup.find_all('div', class_='ODSEW-ShBeI NIyLF-haAclf gm2-body-2')
    reviews = [r.text for r in reviews_soup]
    
    #Scrap the reviews rate
    review_rates_soup = [s.find('span',class_='ODSEW-ShBeI-H1e3jb') for s in reviews_soup]
    review_rates = [rr.attrs['aria-label'][1] for rr in review_rates_soup]
    
    #Scrap the reviews date
    review_dates_soup=[s.find('span', class_='ODSEW-ShBeI-RgZmSc-date') for s in reviews_soup]
    review_dates=[rd.text for rd in review_dates_soup]
    
    
    if return_count:
        return review_count,review_dates,review_rates,reviews
    
    return review_dates,review_rates,reviews


In [40]:
review_dates,review_rates,reviews = get_reviews_google(url_for_test,scroll_limit=40,quiet_mode=False)

  driver.find_element_by_xpath(xpath).click()
  review_count = driver.find_element_by_xpath(xpath).text
  driver.find_element_by_xpath(xpath).click()
  scrollable_div = driver.find_element_by_xpath('//*[@id="pane"]/div/div[1]/div/div/div[2]')


In [41]:
review_dates

['il y a une semaine',
 'il y a 4\xa0jours',
 'il y a 4\xa0jours',
 'il y a un mois',
 'il y a un mois',
 'il y a 2\xa0mois',
 'il y a 2\xa0mois',
 'il y a une semaine',
 'il y a un mois',
 'il y a 2\xa0mois',
 'il y a 3\xa0mois',
 'il y a 2\xa0mois',
 'il y a 3\xa0mois',
 'il y a 3\xa0mois',
 'il y a 4\xa0mois',
 'il y a 3\xa0semaines',
 'il y a 4\xa0mois',
 'il y a 3\xa0mois',
 'il y a 4\xa0mois',
 'il y a 2\xa0semaines',
 'il y a une semaine',
 'il y a une semaine',
 'il y a une semaine',
 'il y a un mois',
 'il y a un mois',
 'il y a 4\xa0mois',
 'il y a 3\xa0semaines',
 'il y a 3\xa0mois',
 'il y a un mois',
 'il y a 3\xa0mois',
 'il y a une semaine',
 'il y a une semaine',
 'il y a 3\xa0mois',
 'il y a un mois',
 'il y a 4\xa0mois',
 'il y a une semaine',
 'il y a 3\xa0mois',
 'il y a 3\xa0semaines',
 'il y a une semaine',
 'il y a 3\xa0mois']

In [42]:
reviews

["                  Jean-Luc Paredes  Local\xa0Guide6\xa0avis               il y a une semaine Nouveau    Excellent ! Typique ! La vrai brasserie parisienne telle qu'on l'espère ! Un serveur au top. Des voisins de table sympa. Tout était réuni pour passer un agréable moment !       Visité en novembre                J'aime    Partager       ",
 "                  Pascal vacaresse  Local\xa0Guide · 48\xa0avis               il y a 4\xa0jours Nouveau    De passage à Paris pour quelques jours. Une visite dans ce lieu me semblait incontournable. Victime de leur succès 1h30 d'attente pour avoir une table. Restauration convenable et les prix accessibles pour tous. Un peu bruyant. …    Plus      Visité en novembre             +2         J'aime    Partager       ",
 "                  Marie Claude Suchet  Local\xa0Guide · 172\xa0avis               il y a 4\xa0jours Nouveau    C'est vrai c'est vraiment une brasserie hors du temps où vous pouvez manger seul, car c'est un véritable spectacle. Nous 

In [43]:
data_to_clean = pd.DataFrame({'data':review_dates,
                              'rate':review_rates,
                              'review':reviews})
data_to_clean.head()

Unnamed: 0,data,rate,review
0,il y a une semaine,4,Jean-Luc Paredes Local Guid...
1,il y a 4 jours,4,Pascal vacaresse Local Guid...
2,il y a 4 jours,5,Marie Claude Suchet Local G...
3,il y a un mois,4,Michel C. Local Guide · 140...
4,il y a un mois,5,Jackie Ibanez Local Guide ·...


In [44]:
data_to_clean.to_csv('data_to_clean.csv')