In [18]:
# scraping imports
from bs4 import BeautifulSoup
import requests
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as ec
from selenium.webdriver.chrome.options import Options

# import re
import time
import json
import helium
import chromedriver_binary

# data wrangling imports
import numpy as np
import pandas as pd

# other imports
import csv
import random

In [2]:
# creating csv file to collect all data
fieldnames = [
    'restaurant_name', 'avg_review_score', 'street', 'zip_code', 'city_name',
    'type_of_cuisine', 'minimum_order_value', 'delivery_fee', 'pricyness',
    'latitude', 'longitude', 'avg_delivery_time']

with open("../geotracker/data/wolt.csv", "w") as file:
    writer = csv.DictWriter(file, fieldnames=fieldnames)


def save_in_csv(resto_info):
    fieldnames = [
        'restaurant_name', 'avg_review_score', 'street', 'zip_code',
        'city_name', 'type_of_cuisine', 'minimum_order_value', 'delivery_fee',
        'pricyness', 'latitude', 'longitude', 'avg_delivery_time']
    with open("../geotracker/data/wolt.csv", "a", newline='') as file:
        writer = csv.DictWriter(file, fieldnames=fieldnames)
        writer.writerow(resto_info)



In [25]:
# creating headers for the csv file
headers = {}
for x,y in zip(fieldnames,fieldnames):
    headers[x] = y
    
save_in_csv(headers)

In [27]:
def zip_codes():
    """method to extract and create a list with all zip codes"""
    filename = '../raw_data/Berlin Zip Codes - Sheet1.csv'
    zip_codes_list = []
    with open(filename, 'r') as csvfile:
        datareader = csv.reader(csvfile)
        for row in datareader:
            zip_codes_list.append(row[1])
    return zip_codes_list[1:]

# list with all Berlin zipcodes
zip_code_list = zip_codes()


In [4]:
def links(url):
    """gets all links and scrape restaurant links"""
    driver = webdriver.Chrome()
    driver.get(url)

    chrome_options = Options()
    chrome_options.add_argument("--headless")

    wait = WebDriverWait(driver, 15).until(ec.presence_of_element_located((By.XPATH, '/html/body/div[1]/div/div/div[2]/div[2]/div/div/div/div[2]/div[1]')))

    ####  Scroller
    SCROLL_PAUSE_TIME = 1.5
    last_height = driver.execute_script("return document.body.scrollHeight")

    while True:
        # Scroll down to bottom
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")

        # Wait to load page
        time.sleep(SCROLL_PAUSE_TIME)

        # Calculate new scroll height and compare with last scroll height
        new_height = driver.execute_script("return document.body.scrollHeight")
        if new_height == last_height:
            break
        last_height = new_height

    # list that will contain all links
    links_list = []

    links = driver.find_elements_by_class_name("VenueVerticalList__Grid-sc-1w1a9dr-1")[0]
    elems = links.find_elements_by_xpath("//a[@href]")

    for link in elems:
        links_list.append(link.get_attribute('href'))

    driver.quit()

    # cleaning None values from links_list
    links_list = [x for x in links_list if x != None]

    # cleaning links_list from non-restaurant profile links
    clean_link_root = 'https://wolt.com/en/deu/berlin/restaurant/'
    clean_links_list = [x for x in links_list if x[:42] == clean_link_root]

    return clean_links_list

In [5]:
# iterate over zip_codes
def all_links_by_zipcode(zipcode_list):
    """iterates over all zipcodes and creates a dictionary with zipcode num : all links"""
    
    base_url = "https://wolt.com/en/discovery?tab=restaurants"
    
    links_by_zip_code = {}

    for zipcode in zipcode_list:
        links_by_zip_code[zipcode] = links(f"{base_url}&search?q={zipcode}")
    
    return links_by_zip_code


In [36]:
def fetch_page(page):
    response = requests.get(page, headers={"Accept-Language":"en-US"} )
    soup = BeautifulSoup(response.content, "html.parser")

    restaurant_info = {}

    # Restaurant Name (MUST): restaurant_name
    try:
        restaurant_info['restaurant_name'] = soup.find(class_="VenueHeroBanner__TitleSpan-sc-3gkm9v-2 ifxphB").text
    except Exception:
        restaurant_info['restaurant_name'] = None

    # Number of Reviews (MUST) : reviews Average Review score (out of 5) (MUST) : avg_review_score - out of 10
    try:
        restaurant_info['avg_review_score'] = soup.find(class_="RatingsButton-module__score___fTqMn").text
    except Exception:
        restaurant_info['avg_review_score'] = None

    # Street and House Number (MUST): street
    try:
        restaurant_info['street'] = soup.find(class_="VenueSideInfo-module__primary___xK8qF").text
    except Exception:
        restaurant_info['street'] = None

    # ZIP code (MUST): zip_code
    try:
        restaurant_info['zip_code'] = soup.find(class_="VenueSideInfo-module__secondary___Kuira").text[:5]
    except Exception:
        restaurant_info['zip_code'] = None
    # City Name (MUST) city_name
    try:
        restaurant_info['city_name'] = soup.find(class_="VenueSideInfo-module__secondary___Kuira").text[6:]
    except Exception:
        restaurant_info['zip_code'] = None


    # Type of cuisine (SHOULD) type_of_cuisine
    try:
        type_of_cuisine = []
        for elem in soup.find_all(class_="RelatedSearches__Item-sc-1ohvfsu-1"):
            tmp = elem.find('a').text
            type_of_cuisine.append(tmp)
        restaurant_info['type_of_cuisine'] = type_of_cuisine
    except Exception:
        restaurant_info['type_of_cuisine'] = None

    # Minimum order value (COULD): minimum_order_value
    # Delivery fee (COULD): delivery_fee

    try:
        tmp = soup.find_all(
        class_=
        'Tags__Root-sc-1dm36sr-0 ghWIPZ VenueHeroBanner__StyledConnectedTags-sc-3gkm9v-5 ljMfkO')[0].text.split(" ")
        # tmp[1] = tmp[1][:-4]
        if len(tmp) ==4:
            restaurant_info['minimum_order_value'] = tmp[3]
            restaurant_info['delivery_fee'] = tmp[1]
        else:
            restaurant_info['minimum_order_value'] = tmp[2]
            restaurant_info['delivery_fee'] = None
    except Exception:
        restaurant_info['minimum_order_value'] = None
        restaurant_info['delivery_fee'] = None

    ##selenium
    # Pricyness (in 1-5) (SHOULD): pricyness
    driver = webdriver.Chrome()
    driver.get(page)
    soup_s = BeautifulSoup(driver.page_source, "html.parser")

    chrome_options = Options()
    chrome_options.add_argument("--headless")

    info =[]

    try:
        for s in soup_s.select("script[type='application/ld+json']"):
            home_url = json.loads(s.get_text(strip=True))
            info.append(home_url)
        restaurant_info['pricyness'] = info[0]['priceRange']

        # Latitude (introduce later) (MUST): latitude (generated during cleaning)
        restaurant_info['latitude'] = info[0]['geo']['latitude']

        # Longitude (introduce later) (MUST): longitude (generated during cleaning)
        restaurant_info['longitude'] = info[0]['geo']['longitude']

    except Exception:
        restaurant_info['pricyness'] = None
        restaurant_info['latitude'] = None
        restaurant_info['longitude'] = None

    driver.quit()

    # Average delivery time (COULD): avg_delivery_time
    try:
        restaurant_info['avg_delivery_time'] = soup.find_all(class_="DeliveryInfo__DeliveryInformationText-sc-8y92dv-2 dYkBpD")[0].find("strong").text
    except Exception:
        restaurant_info['avg_delivery_time'] =  None

    # save_in_csv(restaurant_info)
    save_in_csv(restaurant_info)


In [55]:
def start_scraper(all_links):
    i = 0
    for key, values in all_links.items():
        for link in values:
            fetch_page(link)
            time.sleep(random.randrange(1, 100) / 100)
        i += 1
        print(f" Scraped {key} zip code ({i}/194)..............")


In [35]:
all_links = all_links_by_zipcode(zip_code_list)

In [48]:
# saving all_links into a csv
with open("../geotracker/data/links_list.csv", 'w') as f:
    w = csv.writer(f)
    w.writerows(all_links.items())

In [52]:
# to see how many links to be scraped
sum([len(x) for x in all_links.values()])

25801

In [56]:
start_scraper(all_links)

 Scraped 10117 zip code (1/194)..............
 Scraped 10115 zip code (2/194)..............
 Scraped 10119 zip code (3/194)..............
 Scraped 10178 zip code (4/194)..............
 Scraped 10179 zip code (5/194)..............
 Scraped 10243 zip code (6/194)..............
 Scraped 10245 zip code (7/194)..............
 Scraped 10247 zip code (8/194)..............
 Scraped 10405 zip code (9/194)..............
 Scraped 10435 zip code (10/194)..............
 Scraped 10437 zip code (11/194)..............
 Scraped 10587 zip code (12/194)..............
 Scraped 10623 zip code (13/194)..............
 Scraped 10707 zip code (14/194)..............
 Scraped 10719 zip code (15/194)..............
 Scraped 10785 zip code (16/194)..............
 Scraped 10787 zip code (17/194)..............
 Scraped 10961 zip code (18/194)..............
 Scraped 10963 zip code (19/194)..............
 Scraped 10969 zip code (20/194)..............
 Scraped 10997 zip code (21/194)..............
 Scraped 10999 zip cod

WebDriverException: Message: disconnected: Unable to receive message from renderer
  (Session info: chrome=96.0.4664.55)


In [None]:
#deberia haber limpiado las listas de links de repetidos!!!!