In [4]:
# scraping imports
from bs4 import BeautifulSoup
import requests
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as ec
from selenium.webdriver.chrome.options import Options

# import re
import time
import json
import helium
import chromedriver_binary

# data wrangling imports
import numpy as np
import pandas as pd

# other imports
import csv

In [5]:
# Create (or truncate) the CSV file that collects all scraped data and write
# the header row, so the columns are labelled before save_in_csv appends rows.
# Re-running this cell empties the file again.
fieldnames = [
    'restaurant_name', 'avg_review_score', 'street', 'zip_code', 'city_name',
    'type_of_cuisine', 'minimum_order_value', 'delivery_fee', 'pricyness',
    'latitude', 'longitude', 'avg_delivery_time']

# newline='' is what the csv module expects for files it writes to; it keeps
# the line terminator identical to the rows appended later by save_in_csv.
with open("../geotracker/data/wolt.csv", "w", newline='') as file:
    writer = csv.DictWriter(file, fieldnames=fieldnames)
    # Previously the writer was created but never used, leaving a file
    # without column names.
    writer.writeheader()


def save_in_csv(resto_info):
    """Append one restaurant record as a new row of the wolt.csv data file.

    resto_info is a dict keyed by column name; columns it does not contain
    are written as empty cells.
    """
    columns = [
        'restaurant_name',
        'avg_review_score',
        'street',
        'zip_code',
        'city_name',
        'type_of_cuisine',
        'minimum_order_value',
        'delivery_fee',
        'pricyness',
        'latitude',
        'longitude',
        'avg_delivery_time',
    ]
    # Append mode: rows written by earlier calls are kept.
    with open("../geotracker/data/wolt.csv", "a", newline='') as csv_file:
        csv.DictWriter(csv_file, fieldnames=columns).writerow(resto_info)



In [6]:
def zip_codes(filename='../raw_data/Berlin Zip Codes - Sheet1.csv'):
    """Return the zip codes stored in the second column of a CSV file.

    Args:
        filename: path of the CSV file to read. Defaults to the Berlin
            zip-code sheet this notebook has always used, so calling
            zip_codes() without arguments behaves as before.

    Returns:
        list of str: the second-column value of every row after the first
        (header) row, in file order. Rows with fewer than two columns, such
        as blank lines, are skipped instead of raising IndexError.
    """
    # newline='' lets the csv module handle line endings itself.
    with open(filename, 'r', newline='') as csvfile:
        rows = csv.reader(csvfile)
        next(rows, None)  # drop the header row; no-op on an empty file
        return [row[1] for row in rows if len(row) > 1]

# Module-level list of every Berlin zip code (as strings), loaded from the
# zip-code sheet via zip_codes().
zip_code_list = zip_codes()

In [7]:
def links(url, headless=True):
    """Collect the restaurant profile links shown on a Wolt listing page.

    Opens ``url`` in Chrome, waits for the listing container to be present,
    scrolls to the bottom repeatedly until the page height stops growing,
    then gathers the ``href`` of the anchors and keeps only those that point
    at a Berlin restaurant profile.

    Args:
        url: address of the listing page to scrape.
        headless: run Chrome without a visible window (default). The
            original code built a headless ``Options`` object but never
            handed it to the driver; pass ``False`` to get the old visible
            browser back, e.g. if the page renders differently headless.

    Returns:
        list of str: links starting with
        ``https://wolt.com/en/deu/berlin/restaurant/``, in document order
        (duplicates are not removed).

    Raises:
        selenium.common.exceptions.TimeoutException: if the listing
            container does not appear within 15 seconds.
        IndexError: if no element with the venue grid class is found.
    """
    SCROLL_PAUSE_TIME = 1.5
    clean_link_root = 'https://wolt.com/en/deu/berlin/restaurant/'

    # Options must be configured before the driver is created to take effect.
    chrome_options = Options()
    if headless:
        chrome_options.add_argument("--headless")

    driver = webdriver.Chrome(options=chrome_options)
    try:
        driver.get(url)

        WebDriverWait(driver, 15).until(ec.presence_of_element_located((By.XPATH, '/html/body/div[1]/div/div/div[2]/div[2]/div/div/div/div[2]/div[1]')))

        # Scroll until the document height no longer changes, which is taken
        # as the sign that no further venues are being loaded.
        last_height = driver.execute_script("return document.body.scrollHeight")
        while True:
            driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
            time.sleep(SCROLL_PAUSE_TIME)
            new_height = driver.execute_script("return document.body.scrollHeight")
            if new_height == last_height:
                break
            last_height = new_height

        # find_elements(By...) replaces the find_elements_by_* helpers, which
        # newer Selenium releases no longer provide.
        grid = driver.find_elements(By.CLASS_NAME, "VenueVerticalList__Grid-sc-1w1a9dr-1")[0]
        # NOTE(review): an XPath starting with `//` is evaluated against the
        # whole document even when called on an element, so this gathers
        # every anchor on the page, not only those inside the grid; the
        # prefix filter below is what actually restricts the result. Switch
        # to ".//a[@href]" if grid-only scoping is wanted.
        anchors = grid.find_elements(By.XPATH, "//a[@href]")
        hrefs = [anchor.get_attribute('href') for anchor in anchors]
    finally:
        # Always close the browser, even when waiting or scraping fails.
        driver.quit()

    # Drop missing hrefs and anything that is not a restaurant profile link.
    return [
        href for href in hrefs
        if href is not None and href.startswith(clean_link_root)
    ]

In [9]:
# iterate over zip_codes
def all_links_by_zipcode(zipcode_list):
    """Map every zip code in zipcode_list to the links scraped for it.

    For each zip code a listing URL is built from the discovery base URL and
    handed to links(); the returned list becomes the value stored under that
    zip code.

    NOTE(review): the search term is appended as "&search?q=<zip>", which is
    an unusual query-string shape; confirm this is the form the site expects.
    """
    base_url = "https://wolt.com/en/discovery?tab=restaurants"
    return {
        zipcode: links(f"{base_url}&search?q={zipcode}")
        for zipcode in zipcode_list
    }


# TODO: next, one function has to be called from inside the other.
# It needs to iterate and record the zip code somewhere as a sanity check,
# and everything has to go through the CSV-writing code.


In [10]:
def fetch_page(page, headless=True):
    """Scrape one restaurant page and append the result to the CSV file.

    Most fields are read from the HTML returned by ``requests``. Price range
    and coordinates are read from the first JSON-LD script block of the page
    as rendered by Chrome through Selenium.

    Args:
        page: URL of the restaurant profile page.
        headless: run Chrome without a visible window (default). The
            original code built a headless ``Options`` object but never
            handed it to the driver; pass ``False`` for a visible browser.

    Returns:
        dict: the scraped record, with the keys restaurant_name,
        avg_review_score, street, zip_code, city_name, type_of_cuisine,
        minimum_order_value, delivery_fee, pricyness, latitude, longitude
        and avg_delivery_time. The same dict is passed to save_in_csv.
        (Earlier versions returned None; callers that ignore the return
        value are unaffected.)

    Raises:
        requests.exceptions.RequestException: on timeout, connection
            failure or an HTTP error status.
        AttributeError, IndexError, KeyError: when an expected element is
            missing from the page. Only a missing delivery time is
            tolerated (stored as None), as before.
    """
    # A timeout keeps one unresponsive page from stalling the whole run.
    response = requests.get(page, headers={"Accept-Language": "en-US"}, timeout=30)
    # Fail with a clear HTTP error instead of an obscure parsing error later.
    response.raise_for_status()
    soup = BeautifulSoup(response.content, "html.parser")

    restaurant_info = {}

    # Restaurant name
    restaurant_info['restaurant_name'] = soup.find(class_="VenueHeroBanner__TitleSpan-sc-3gkm9v-2 ifxphB").text

    # Average review score
    restaurant_info['avg_review_score'] = soup.find(class_="RatingsButton-module__score___fTqMn").text

    # Street and house number
    restaurant_info['street'] = soup.find(class_="VenueSideInfo-module__primary___xK8qF").text

    # Zip code and city share one element: the first five characters are
    # stored as the zip code, everything from index 6 on as the city name.
    zip_and_city = soup.find(class_="VenueSideInfo-module__secondary___Kuira").text
    restaurant_info['zip_code'] = zip_and_city[:5]
    restaurant_info['city_name'] = zip_and_city[6:]

    # Type of cuisine: link text of every related-search item
    restaurant_info['type_of_cuisine'] = [
        item.find('a').text
        for item in soup.find_all(class_="RelatedSearches__Item-sc-1ohvfsu-1")
    ]

    # Minimum order value and delivery fee come from the space-separated
    # text of the first tags element. With exactly four tokens the fee is
    # token 1 and the minimum order token 3; otherwise token 2 is taken as
    # the minimum order and no fee is recorded.
    tag_tokens = soup.find_all(
        class_='Tags__Root-sc-1dm36sr-0 ghWIPZ VenueHeroBanner__StyledConnectedTags-sc-3gkm9v-5 ljMfkO')[0].text.split(" ")
    if len(tag_tokens) == 4:
        restaurant_info['minimum_order_value'] = tag_tokens[3]
        restaurant_info['delivery_fee'] = tag_tokens[1]
    else:
        restaurant_info['minimum_order_value'] = tag_tokens[2]
        restaurant_info['delivery_fee'] = None

    # Selenium: fetch the rendered page source. Options must be configured
    # before the driver is created to take effect.
    chrome_options = Options()
    if headless:
        chrome_options.add_argument("--headless")
    driver = webdriver.Chrome(options=chrome_options)
    try:
        driver.get(page)
        rendered_html = driver.page_source
    finally:
        # Close the browser even if loading fails, and before parsing, so a
        # parsing error can no longer leave a Chrome process behind.
        driver.quit()

    soup_s = BeautifulSoup(rendered_html, "html.parser")
    info = [
        json.loads(script.get_text(strip=True))
        for script in soup_s.select("script[type='application/ld+json']")
    ]

    # Price range and coordinates are taken from the first JSON-LD block.
    restaurant_info['pricyness'] = info[0]['priceRange']
    restaurant_info['latitude'] = info[0]['geo']['latitude']
    restaurant_info['longitude'] = info[0]['geo']['longitude']

    # Average delivery time is optional: a missing element (IndexError) or a
    # missing <strong> child (AttributeError) is stored as None.
    try:
        restaurant_info['avg_delivery_time'] = soup.find_all(class_="DeliveryInfo__DeliveryInformationText-sc-8y92dv-2 dYkBpD")[0].find("strong").text
    except (IndexError, AttributeError):
        restaurant_info['avg_delivery_time'] = None

    save_in_csv(restaurant_info)
    return restaurant_info


In [11]:
# Two sample restaurant profile URLs. Nothing visible in this notebook
# references this list - presumably kept for trying fetch_page by hand;
# TODO confirm or remove.
urls = [
    "https://wolt.com/en/deu/berlin/restaurant/burgermeister-potsdamer-platz",
    "https://wolt.com/en/deu/berlin/restaurant/kwa-kebab-with-attitude"]

def start_scraper(zipcode_list):
    """Scrape every restaurant page found for the given zip codes.

    The links are first collected per zip code with all_links_by_zipcode
    and the resulting mapping is printed. fetch_page is then run on each
    link in turn, with a 5 ms pause after every page.
    """
    links_per_zip = all_links_by_zipcode(zipcode_list)
    print(links_per_zip)

    # Walk the link lists zip code by zip code, in the mapping's order.
    page_urls = (url for group in links_per_zip.values() for url in group)
    for url in page_urls:
        fetch_page(url)
        time.sleep(0.005)
            
    


In [14]:
# Trial run of the scraper restricted to a single zip code (10435).
# "prueba" is Spanish for "test".
prueba = [10435]
start_scraper(prueba)

{10435: ['https://wolt.com/en/deu/berlin/restaurant/crackbuns', 'https://wolt.com/en/deu/berlin/restaurant/monsieur-vuong', 'https://wolt.com/en/deu/berlin/restaurant/umami-prenzlauer-berg', 'https://wolt.com/en/deu/berlin/restaurant/kuchi-mitte', 'https://wolt.com/en/deu/berlin/restaurant/burgermeister-potsdamer-platz', 'https://wolt.com/en/deu/berlin/restaurant/soul-sushi-berlin-mitte', 'https://wolt.com/en/deu/berlin/restaurant/kwa-kebab-with-attitude', 'https://wolt.com/en/deu/berlin/restaurant/chicken-buzz', 'https://wolt.com/en/deu/berlin/restaurant/brammibals-donuts-alte-potsdamer-str', 'https://wolt.com/en/deu/berlin/restaurant/chupenga-burritos-salads-georgenstr', 'https://wolt.com/en/deu/berlin/restaurant/rosenburger', 'https://wolt.com/en/deu/berlin/restaurant/district-mot', 'https://wolt.com/en/deu/berlin/restaurant/stay-green', 'https://wolt.com/en/deu/berlin/restaurant/hasir-burger', 'https://wolt.com/en/deu/berlin/restaurant/la-gino', 'https://wolt.com/en/deu/berlin/rest

In [48]:
# driver = webdriver.Chrome()
# driver.get(example_url)
# wait = WebDriverWait(driver, 15).until(ec.presence_of_element_located((By.XPATH, '//*[@id="app"]/div/div/div[2]/div[3]/div/div/div/button[2]')))
# driver.find_elements_by_xpath('//*[@id="app"]/div/div/div[2]/div[3]/div/div/div/button[2]')[0].click()
# button = driver.find_elements_by_xpath(
#     '//*[@id="mainContent"]/div/div[4]/div[2]/div[2]/div[2]/button')[0]
# button.click()
# wait = WebDriverWait(driver, 15).until(
#     ec.presence_of_element_located((
#         By.XPATH,
#         '/html/body/div[6]/div/aside/div[2]/div[2]/div[2]/div/div[1]/div[4]/div[2]/p[1]/span[2]'
#     )))

# element = driver.find_element_by_xpath(
#     '/html/body/div[6]/div/aside/div[2]/div[2]/div[2]/div/div[1]/div[4]/div[2]/p[1]/span[2]'
# ).get_attribute("innerHTML")

# print(element)




In [None]:
# def try_pass(operation, op2):
#     try:
#         tmp = operation
#     except: return None
#     return tmp.op2