In [18]:
import pandas as pd
import requests
from selenium import webdriver
import matplotlib.pyplot as plt
import folium
import time
from selenium import webdriver
import chromedriver_binary
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as ec
from selenium.webdriver.chrome.options import Options
from bs4 import BeautifulSoup
from dotenv import load_dotenv, dotenv_values, find_dotenv
from fobokiller.utils import subzones_paris
from os.path import join,dirname
import os
import csv

# Locate a .env file and load its variables into the process environment.
env_path = find_dotenv()
# Alternative kept for reference: build the .env path relative to this file.
# NOTE(review): presumably disabled because __file__ is not defined in a notebook - confirm.
#env_path = join(dirname(dirname(__file__)), '.env')
load_dotenv(env_path)

# Alternative kept for reference: read the key directly from the parsed .env mapping.
#api_key = dotenv_values()["YELP_KEY"]

# Yelp API key used by get_restaurants; os.getenv returns None when YELP_KEY is not set.
api_key = os.getenv('YELP_KEY')

def get_restaurants(centers, radius):
    """
    Fetch restaurants around each center from the Yelp business-search endpoint.

    For every center, up to four pages of 50 results (offsets 0, 50, 100, 150),
    sorted by review count, are requested.

    Parameters
    ----------
    centers : iterable of (latitude, longitude) pairs
    radius : search radius passed to the API (converted with ``int``)

    Returns
    -------
    list of dict
        The raw ``businesses`` entries returned by the API, concatenated over
        all centers and pages. This is NOT a DataFrame; pass the list to
        ``create_df_yelp`` to build one.
    """
    headers = {'Authorization': f'Bearer {api_key}'}
    url = 'https://api.yelp.com/v3/businesses/search'

    businesses = []

    for i, center in enumerate(centers):
        print(f'---------- Requesting API for subzone #{i+1} ----------')
        for offset in range(0, 200, 50):
            print(
                f'   ------- Requesting API with offset = {offset} -------   ')
            params = {
                'limit': 50,
                'categories': ['restaurants'],
                'sort_by': 'review_count',
                'offset': offset,
                'latitude': center[0],
                'longitude': center[1],
                'radius': int(radius)
            }

            response = requests.get(url, headers=headers, params=params)
            if response.status_code == 200:
                # A 200 body without 'businesses' contributes nothing instead of raising.
                businesses += response.json().get('businesses', [])
            elif response.status_code == 400:
                # Further offsets for this subzone would fail the same way.
                print('400 Bad Request')
                break
            else:
                # Previously ignored silently (e.g. 401 bad key, 429 rate limit):
                # report it so missing pages are visible, then try the next page.
                print(f'{response.status_code} unexpected response, page skipped')

    print(f'#####   Request completed, {len(businesses)} businesses fetched   ###')
    return businesses


### Create DF for Yelp data


def _price_level(symbol):
    """Map a Yelp price marker ('€' to '€€€€') to 1-4; unknown or missing values map to 0."""
    levels = {'€': 1, '€€': 2, '€€€': 3, '€€€€': 4}
    if isinstance(symbol, str) and symbol in levels:
        return levels[symbol]
    try:
        # Accept values that are already numeric (e.g. '2' or 2).
        return int(symbol)
    except (TypeError, ValueError):
        return 0


def create_df_yelp(data):
    """
    Build a cleaned DataFrame from the raw Yelp business dicts.

    Parameters
    ----------
    data : list of dict
        Business entries as returned by ``get_restaurants``.

    Returns
    -------
    pandas.DataFrame
        One row per distinct business with the columns
        alias, name, url, categories, latitude, longitude, address,
        zip_code, price, rating, review_count.

        * ``url`` has its query string removed.
        * ``categories`` is the comma-separated list of category aliases.
        * ``latitude``, ``longitude``, ``rating`` and ``review_count`` are
          floats; missing values become NaN.
        * ``zip_code`` and ``price`` are ints; missing or unparsable values
          become 0.

    Missing keys never raise: a business without ``coordinates``,
    ``location``, ``rating`` etc. yields the defaults above.
    """
    columns = [
        'alias', 'name', 'url', 'categories', 'latitude', 'longitude',
        'address', 'zip_code', 'price', 'rating', 'review_count'
    ]

    # Build plain records first and create the frame once; assigning cell by
    # cell through df.loc is slow and made the missing-key handling fragile.
    records = []
    for business in data:
        # 'coordinates' and 'location' are independent keys: look each one up
        # on its own instead of assuming both are present together.
        coordinates = business.get('coordinates') or {}
        location = business.get('location') or {}
        categories = business.get('categories') or []

        records.append({
            'alias': business.get('alias', ''),
            'name': business.get('name', ''),
            # keep only the part of the url before the query string
            'url': (business.get('url') or '').split('?', 1)[0],
            'categories': ', '.join(c['alias'] for c in categories),
            'latitude': coordinates.get('latitude'),
            'longitude': coordinates.get('longitude'),
            'address': location.get('address1', ''),
            'zip_code': location.get('zip_code', 0),
            'price': _price_level(business.get('price')),
            'rating': business.get('rating'),
            'review_count': business.get('review_count'),
        })

    df = pd.DataFrame(records, columns=columns)

    # dtypes: coerce instead of astype on raw values so that '' / None do not raise
    for column in ('latitude', 'longitude', 'rating', 'review_count'):
        df[column] = pd.to_numeric(df[column], errors='coerce').astype(float)
    df['zip_code'] = (pd.to_numeric(df['zip_code'], errors='coerce')
                      .fillna(0)
                      .astype(int))
    df['price'] = df['price'].astype(int)

    return df.drop_duplicates()



def get_place_google_id(name,latitude,longitude):
    """
    Look up the Google place_id of a business by name, biased towards a point.

    Parameters
    ----------
    name : text sent as the ``input`` of the find-place query
    latitude, longitude : coordinates used for the ``locationbias`` point

    Returns
    -------
    str
        The ``place_id`` of the first candidate, or ``''`` when the name is
        empty, the request fails, the body is not JSON, or no candidate with
        a ``place_id`` is returned.
    """
    # Nothing to search for: skip the (billable) request entirely.
    if not name:
        return ''

    url = 'https://maps.googleapis.com/maps/api/place/findplacefromtext/json'
    params={
        'key' :  os.getenv('GOOGLE_PLACE_KEY'),
        'input' : name,
        'inputtype' : 'textquery',
        'locationbias' : f'point:{latitude},{longitude}'
    }

    response = requests.get(url,params=params)

    #if conditions to avoid raising errors
    if response.status_code != 200:
        return ''

    # Decode the body once; a non-JSON body is treated like "no result".
    try:
        payload = response.json()
    except ValueError:
        return ''

    if 'candidates' in payload:
        candidates = payload['candidates']
        if len(candidates)==0:
            return ''
        if 'place_id' in candidates[0]:
            return candidates[0]['place_id']

    return ''



## Get place url


def get_place_google_url(place_id):
    """
    Return the Google Maps url of a place from its place_id.

    Parameters
    ----------
    place_id : identifier as returned by ``get_place_google_id``

    Returns
    -------
    str
        The ``url`` field of the place details, or ``''`` when ``place_id`` is
        empty, the request fails, the body is not JSON, or no url is returned.
    """
    # get_place_google_id returns '' on failure; do not spend a (billable)
    # request on an id that cannot match anything.
    if not place_id:
        return ''

    url = 'https://maps.googleapis.com/maps/api/place/details/json'
    params = {
        'key': os.getenv('GOOGLE_PLACE_KEY'),
        'place_id': place_id,
        'fields': 'url'
    }

    response = requests.get(url, params=params)

    #if conditions to avoid raising errors
    if response.status_code != 200:
        return ''

    # Decode the body once; a non-JSON body is treated like "no result".
    try:
        payload = response.json()
    except ValueError:
        return ''

    if 'result' in payload:
        result = payload['result']
        if 'url' in result:
            return result['url']

    return ''



def get_reviews_google(url,scroll_limit=None,quiet_mode=True,return_count=False):
    """
    Scrape review dates, ratings and texts from a Google Maps place page.

    NOTE(review): a second function named ``get_reviews_google`` is defined
    further down in this same cell; once the cell has run, that later
    definition replaces this one, so this version is not the one callers get.

    Parameters
    ----------
    url : page opened in Chrome
    scroll_limit : when truthy, replaces the review count read from the page
        when computing how many times to scroll
    quiet_mode : when true, Chrome is started with ``--headless``
    return_count : when true, the review count is returned as first element

    Returns
    -------
    tuple
        ``(review_dates, review_rates, reviews)``, or
        ``(review_count, review_dates, review_rates, reviews)`` when
        ``return_count`` is true. ``review_count`` is either the first
        space-separated token of the page label (a string) or ``scroll_limit``.

    NOTE(review): the driver is never closed or quit in this function.
    """
    options=Options()
    if quiet_mode:
        options.add_argument('--headless')
    driver = webdriver.Chrome(options=options)
    driver.get(url)


    ### Expand all the reviews using Selenium
    # privacy pop-up: click the button found at this absolute XPath
    xpath = "/html/body/c-wiz/div/div/div/div[2]/div[1]/div[4]/form/div[1]/div/button/span"
    driver.find_element_by_xpath(xpath).click()

    # review count: read its text, then click it
    xpath = '//*[@id="pane"]/div/div[1]/div/div/div[2]/div[1]/div[1]/div[2]/div/div[1]/span[1]/span/span[1]/span[2]'

    review_count = driver.find_element_by_xpath(xpath).text
    # keep only the first space-separated token of the label
    review_count=review_count.split(' ', 1)[0]

    driver.find_element_by_xpath(xpath).click()

    # disabled click on another button, kept for reference
    #driver.find_element_by_xpath("/html/body/div[3]/div[9]/div[8]/div/div[1]/div/div/div[38]/div/button/span/span").click()

    # scroll to show all reviews
    time.sleep(2)
    if scroll_limit:
        review_count=scroll_limit
    scrollable_div = driver.find_element_by_xpath('//*[@id="pane"]/div/div[1]/div/div/div[2]')
    # number of scrolls is round(count / 10 - 1)
    # NOTE(review): presumably each scroll loads about 10 more reviews - confirm
    for i in range(0,(round(int(review_count)/10-1))):
        driver.execute_script('arguments[0].scrollTop = arguments[0].scrollHeight',
                scrollable_div)
        time.sleep(2)


    ### Scrape the review information with BeautifulSoup
    soup = BeautifulSoup(driver.page_source, 'html.parser')

    # review texts: full text of every review container
    reviews_soup = soup.find_all('div', class_='ODSEW-ShBeI NIyLF-haAclf gm2-body-2')
    reviews = [r.text for r in reviews_soup]

    # review ratings: second character of the rating span's aria-label
    # NOTE(review): presumably the label starts with a space followed by the digit - confirm
    review_rates_soup = [s.find('span',class_='ODSEW-ShBeI-H1e3jb') for s in reviews_soup]
    review_rates = [rr.attrs['aria-label'][1] for rr in review_rates_soup]
    # review dates: text of the date span
    review_dates_soup=[s.find('span', class_='ODSEW-ShBeI-RgZmSc-date') for s in reviews_soup]
    review_dates=[rd.text for rd in review_dates_soup]


    if return_count:

        return review_count,review_dates,review_rates,reviews


    return review_dates,review_rates,reviews

### Get all reviews from a Google page


def _parse_review_count(label):
    """Return the integer at the start of a label such as '1,812 reviews' (0 if none)."""
    digits = ''.join(ch for ch in label.split(' ', 1)[0] if ch.isdigit())
    return int(digits) if digits else 0


def _click_if_present(driver, xpath):
    """Best-effort click on an optional control; a missing element is ignored."""
    try:
        driver.find_element(By.XPATH, xpath).click()
    except Exception:
        # The sort menu and its entries are not always rendered; the scrape
        # must carry on without them. (Was a bare ``except:``, which also
        # swallowed KeyboardInterrupt.)
        pass


def get_reviews_google(url,
                       scroll_limit=None,
                       quiet_mode=True,
                       return_count=False):
    """
    Open a Google Maps place page and return its review containers.

    Steps: dismiss the privacy pop-up, read the review count and open the
    review list, try to sort by most recent, scroll the list to load more
    reviews, then parse the page with BeautifulSoup.

    Parameters
    ----------
    url : page opened in Chrome (UI language forced to English)
    scroll_limit : when truthy, used instead of the review count read from the
        page to decide how many times to scroll
    quiet_mode : when true, Chrome runs headless
    return_count : accepted for backward compatibility; currently ignored

    Returns
    -------
    bs4 ResultSet
        Every ``div`` with class ``ODSEW-ShBeI NIyLF-haAclf gm2-body-2``;
        suitable as input of ``get_review_summary``.

    Changes from the previous version:
    * the browser is always quit, even on error (it used to stay open);
    * the review list is scrolled by a single loop - the duplicated second
      loop divided a string by 10 and raised TypeError when ``scroll_limit``
      was not given;
    * counts with thousands separators ("1,812") are parsed;
    * ``find_element(By.XPATH, ...)`` and ``options=`` replace the legacy
      ``find_element_by_xpath`` / ``chrome_options=`` spellings.
    """
    privacy_xpath = "/html/body/c-wiz/div/div/div/div[2]/div[1]/div[4]/form/div[1]/div/button/span"
    review_count_xpath = '//*[@id="pane"]/div/div[1]/div/div/div[2]/div[1]/div[1]/div[2]/div/div[1]/span[1]/span/span[1]/span[2]'
    sort_button_xpath = "/html/body/div[3]/div[9]/div[8]/div/div[1]/div/div/div[2]/div[7]/div[2]/button/span"
    most_recent_xpath = "/html/body/div[3]/div[3]/div[1]/ul/li[2]"
    scroll_pane_xpath = '//*[@id="pane"]/div/div[1]/div/div/div[2]'

    options = webdriver.ChromeOptions()
    options.add_experimental_option('prefs',
                                    {'intl.accept_languages': 'en,en_US'})
    if quiet_mode:
        options.add_argument('--headless')

    driver = webdriver.Chrome(options=options)
    try:
        driver.get(url)

        # privacy pop-up
        driver.find_element(By.XPATH, privacy_xpath).click()
        time.sleep(2)

        # read the number of reviews, then open the review list
        review_count = _parse_review_count(
            driver.find_element(By.XPATH, review_count_xpath).text)
        driver.find_element(By.XPATH, review_count_xpath).click()

        # optional: open the sort menu and pick the second entry
        time.sleep(1)
        _click_if_present(driver, sort_button_xpath)
        time.sleep(2)
        _click_if_present(driver, most_recent_xpath)

        # scroll to show all reviews
        time.sleep(2)
        if scroll_limit:
            review_count = scroll_limit
        print(review_count)

        scrollable_div = driver.find_element(By.XPATH, scroll_pane_xpath)
        # a negative result simply yields no scroll
        for _ in range(round(int(review_count) / 10 - 1)):
            driver.execute_script(
                'arguments[0].scrollTop = arguments[0].scrollHeight',
                scrollable_div)
            time.sleep(2)

        # Parse while the browser is still open; the parsed tree does not
        # depend on the driver afterwards.
        page = BeautifulSoup(driver.page_source, 'html.parser')
    finally:
        driver.quit()

    return page.find_all('div',
                         class_='ODSEW-ShBeI NIyLF-haAclf gm2-body-2')


def get_review_summary(result_set):
    """
    Turn scraped review containers into a DataFrame.

    Parameters
    ----------
    result_set : iterable of parsed review elements
        Each element must offer ``find('span', class_=...)`` as BeautifulSoup
        tags do (e.g. the value returned by ``get_reviews_google``).

    Returns
    -------
    pandas.DataFrame
        Columns ``Review Rate``, ``Review Time`` and ``Review Text``:

        * ``Review Rate``: the rating span's ``aria-label`` up to the first
          non-breaking space;
        * ``Review Time``: the date text without a leading ``"il y a "`` and
          with ``une`` / ``un`` replaced by ``1``;
        * ``Review Text``: the text of the review span.

        A span that is absent from a review yields ``''`` for that field.
    """
    french_prefix = "il y a "

    rows = {'Review Rate': [], 'Review Time': [], 'Review Text': []}
    for review in result_set:
        rate_span = review.find('span', class_='ODSEW-ShBeI-H1e3jb')
        time_span = review.find('span', class_='ODSEW-ShBeI-RgZmSc-date')
        text_span = review.find('span', class_='ODSEW-ShBeI-text')

        rate = rate_span["aria-label"] if rate_span is not None else ''
        when = time_span.text if time_span is not None else ''
        text = text_span.text if text_span is not None else ''

        # Remove the prefix as a prefix. str.strip("il y a ") treats its
        # argument as a set of characters and also ate the start of strings
        # such as "a year ago" (-> "ear ago").
        when = when.strip()
        if when.startswith(french_prefix):
            when = when[len(french_prefix):].strip()
        # "une" must be replaced before "un", otherwise "une" becomes "1e"
        when = when.replace("une", "1").replace("un", "1")

        rows['Review Rate'].append(rate.split("\xa0")[0])
        rows['Review Time'].append(when)
        rows['Review Text'].append(text)

    return pd.DataFrame(rows)


def get_all_gr(url, iid, name, alias):
    """
    Scrape the Google reviews of one restaurant, save them and upload them.

    The reviews are fetched with a visible browser and ``scroll_limit=10``,
    summarised into a DataFrame, tagged with the ``id``, ``name`` and
    ``alias`` columns, and written to ``<name>.csv`` (spaces become
    underscores, apostrophes are removed). Every ``*.csv`` matched by gsutil
    is then copied to the bucket.

    Returns the tagged DataFrame.
    """
    scraped_reviews = get_reviews_google(url, scroll_limit=10, quiet_mode=False)
    summary = get_review_summary(scraped_reviews)

    # same value on every row of the restaurant
    for column, value in (("id", iid), ("name", name), ("alias", alias)):
        summary[column] = value

    csv_name = name.replace(" ", "_").replace("'", "") + ".csv"
    summary.to_csv(csv_name)

    os.system("""gsutil cp '*.csv' 'gs://wagon-data-722-manoharan/restaurant/'""")
    return summary


In [19]:
# Unpack the search centers and the radius produced by subzones_paris(1);
# both are passed to get_restaurants in the next cell.
centers, radius = subzones_paris(1)

In [20]:
# NOTE(review): despite its name, `df` holds the raw list of business dicts
# returned by get_restaurants here, not a DataFrame.
df = get_restaurants(centers, radius)

---------- Requesting API for subzone #1 ----------
   ------- Requesting API with offset = 0 -------   
   ------- Requesting API with offset = 50 -------   
   ------- Requesting API with offset = 100 -------   
   ------- Requesting API with offset = 150 -------   
#####   Request completed, 200 businesses fetched   ###


In [21]:
# Rebinds `df` from the raw list to the cleaned DataFrame.
# NOTE(review): the input is overwritten, so this cell cannot be re-run without
# first re-running the fetch cell.
df = create_df_yelp(df)

In [22]:
# Keep only the first rows returned by head() as an independent copy.
# NOTE(review): presumably to limit the number of Google API calls below - confirm.
df = df.head().copy()

In [23]:
# One Google find-place request per row; "id" is '' when the lookup fails.
df["id"] = df.apply(lambda x: get_place_google_id(x["name"], x["latitude"],
                                                      x["longitude"]),
                        axis=1)

In [24]:
# One Google place-details request per row; "lien" (French for "link") is ''
# when no url is returned.
df["lien"] = df.apply(lambda x: get_place_google_url(x["id"]), axis=1)

In [25]:
# NOTE(review): in the saved output below, "id" and "lien" are empty for every
# row, i.e. both Google lookups returned ''. Check GOOGLE_PLACE_KEY before
# relying on these columns.
df

Unnamed: 0,alias,name,url,categories,latitude,longitude,address,zip_code,price,rating,review_count,id,lien
0,l-as-du-fallafel-paris,L'As du Fallafel,https://www.yelp.com/biz/l-as-du-fallafel-paris,"kosher, sandwiches, falafel",48.857498,2.35908,34 rue des Rosiers,75004,1,4.5,1812.0,,
1,angelina-paris,Angelina,https://www.yelp.com/biz/angelina-paris,"breakfast_brunch, tea, cakeshop",48.865092,2.328464,226 rue de Rivoli,75001,3,4.0,1349.0,,
2,le-comptoir-de-la-gastronomie-paris,Le Comptoir de la Gastronomie,https://www.yelp.com/biz/le-comptoir-de-la-gas...,french,48.864516,2.345402,34 rue Montmartre,75001,2,4.5,1107.0,,
3,bouillon-chartier-paris,Bouillon Chartier,https://www.yelp.com/biz/bouillon-chartier-paris,french,48.87194,2.34317,7 rue du Faubourg Montmartre,75009,2,3.5,953.0,,
4,l-avant-comptoir-paris-3,L'Avant Comptoir,https://www.yelp.com/biz/l-avant-comptoir-paris-3,"tapas, wine_bars",48.85202,2.3388,3 carrefour de l'Odéon,75006,2,4.5,612.0,,


In [17]:
# Check that the key was loaded WITHOUT echoing it: a displayed value is saved
# with the notebook and leaks the credential to anyone who can read the file.
# NOTE(review): the Yelp key previously printed by this cell was stored in the
# notebook output; treat it as compromised and rotate it.
api_key is not None

True