In [1]:
import pickle
import re
import time
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.firefox.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as ec

In [2]:
# Unpickle zip code list
with open('../geotracker/data/zip.pkl', 'rb') as f:
    zip_codes = pickle.load(f)

In [3]:
# Create iterable list of URLS for the scraper
base_url = "https://www.lieferando.de/en/delivery/food/berlin-"
urls = [base_url + zip_code for zip_code in zip_codes]
urls[0]

'https://www.lieferando.de/en/delivery/food/berlin-10115'

In [13]:
len(urls)

190

# Restaurant page overview for each ZIP Code

In [5]:
def scroll_down():
    """A method for scrolling the page."""

    # Get scroll height.
    last_height = driver.execute_script("return document.body.scrollHeight")

    while True:

        # Scroll down to the bottom.
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")

        # Wait to load the page.
        time.sleep(2)

        # Calculate new scroll height and compare with last scroll height.
        new_height = driver.execute_script("return document.body.scrollHeight")

        if new_height == last_height:

            break

        last_height = new_height

In [6]:
# Set selenium options
options = Options()
#options.add_argument("--headless")  # Starts driver without opening a window
driver = webdriver.Firefox(options=options)

In [7]:
# Extract the information for restaurants from this page

driver.get(urls[0])
time.sleep(10)
scroll_down()
#names = driver.find_elements(by=By.CLASS_NAME, value='restaurantname notranslate')
#names = driver.find_elements(By.CSS_SELECTOR, "h3")
restaurants_names = driver.find_elements(By.XPATH, "//a[@class='restaurantname notranslate']")

restaurant_urls = []
restaurant_names = []
for restaurant in restaurants_names:
    restaurant_urls.append(restaurant.get_attribute("href"))
    restaurant_names.append(restaurant.text)

print(restaurant_urls)
print(restaurant_names)

['https://www.lieferando.de/en/menu/burger-king-berlin-schoenhauser-allee', 'https://www.lieferando.de/en/menu/bowl-time', 'https://www.lieferando.de/en/menu/burger-king-berlin-mitte-bahnhof', 'https://www.lieferando.de/en/menu/burger-king-berlin-europaplatz', 'https://www.lieferando.de/en/menu/bring-bakery', 'https://www.lieferando.de/en/menu/otacos-berlin-spandauer-strasse', 'https://www.lieferando.de/en/menu/kreuzburger-friedrichshain', 'https://www.lieferando.de/en/menu/xi-gon', 'https://www.lieferando.de/en/menu/habba-habba-berlin', 'https://www.lieferando.de/en/menu/riga-s-pizza-box-berlin-berlin', 'https://www.lieferando.de/en/menu/maison-viet-10437', 'https://www.lieferando.de/en/menu/dunkin-donuts-berlin-badstrasse', 'https://www.lieferando.de/en/menu/dunkin-donuts-berlin-unter-den-linden', 'https://www.lieferando.de/en/menu/dawat-indische-spezialitaeten', 'https://www.lieferando.de/en/menu/madni', 'https://www.lieferando.de/en/menu/comcha-vegan', 'https://www.lieferando.de/en

In [None]:
print(len(restaurant_names))
print(len(restaurant_urls))

In [None]:
reviews = driver.find_elements(By.CLASS_NAME, "review_rating")
print(len(reviews))

In [None]:
#restaurants = driver.find_elements(By.XPATH, "//h2[@class='restaurantname']")

soup = BeautifulSoup(driver.page_source, "html.parser")


In [None]:
reviews = soup.find_all("div", class_="review-rating")
print(len(reviews))
reviews[0]

In [None]:
def convert_rating(rating_attribute):
    found = re.search(r"[0-9]{1,3}", rating_attribute)
    return int(found[0]) / 20
    
convert_rating('width: 100%;')

In [None]:
clean_reviews = reviews[ :len(restaurants_names)] # Remove the last element in the list which does not contain a review
restaurant_reviews = []
for clean_review in clean_reviews:
    rating = clean_review.find("span").get("style")
    restaurant_reviews.append(convert_rating(rating))
    
len(restaurant_reviews)
    


In [None]:
total_ratings = soup.find_all("span", class_="rating-total")
clean_ratings = total_ratings[ :len(restaurants_names)]

restaurant_rating_totals = []
for clean_rating in clean_ratings:
    total_rating = clean_rating.text.strip()
    total_rating = re.search(r"[0-9]{1,5}", total_rating)[0]
    restaurant_rating_totals.append(int(total_rating))

print(len(restaurant_rating_totals))
restaurant_rating_totals[0:5]


In [None]:
kitchens = soup.find_all("div", class_="kitchens")
clean_kitchens = kitchens[ :len(restaurant_names)]

restaurant_kitchens = []
for clean_kitchen in clean_kitchens:
    restaurant_kitchens.append(clean_kitchen.find("span").text)
    
len(restaurant_kitchens)
    

In [None]:
# Final restaurant list

restaurant_list = []
for restaurant_name, restaurant_url, restaurant_review, restaurant_rating_total, restaurant_kitchen in zip(restaurant_names, restaurant_urls, restaurant_reviews, restaurant_rating_totals, restaurant_kitchens):
    restaurant_list.append(
        dict(restaurant_name=restaurant_name,
             restaurant_url=restaurant_url, 
             restaurant_review=restaurant_review, 
             restaurant_rating_total=restaurant_rating_total, 
             restaurant_kitchen=restaurant_kitchen)
    )
    
restaurant_list[:2]

# Feature extraction from the restaurant pages

In [None]:
driver.get(restaurant_list[0].get("restaurant_url"))
wait = WebDriverWait(driver, 15)
wait.until(ec.visibility_of_element_located(
    (By.XPATH, "//button[@class='info info-icon js-open-info-tab']")))


button = driver.find_element(By.XPATH, "//button[@class='info info-icon js-open-info-tab']")
button.click()

In [None]:
address = driver.find_element(By.XPATH, "//section[@class='card-body notranslate']").text

address

In [None]:
# Extract street, ZIP code and City from string
street = re.search(r".*", address)[0]
zip_code = re.search(r"[0-9]{5}", address)[0]
city = re.search(r"[\w]+$", address)[0]

print("street:", street)
print("zip code:", zip_code)
print("city:", city)

In [None]:
copy_rest_list = restaurant_list.copy()

In [None]:
copy_rest_list[0]["street"] = street
copy_rest_list[0]["zip_code"] = zip_code
copy_rest_list[0]["city"] = city

copy_rest_list[0]

In [None]:
import pandas as pd
df = pd.DataFrame(restaurant_list)
df.drop_duplicates(subset=['restaurant_url'], inplace=True)
restaurant_urls = df["restaurant_url"]

list(restaurant_urls)
for index, url in restaurant_urls:
    print(index, url)

# Stuff that didn't work

In [None]:
delivery_costs = soup.find_all("div", class_="delivery-cost js-delivery-cost notranslate")
len(delivery_costs)

In [None]:
delivery_times = soup.find_all("div", class_="avgdeliverytime avgdeliverytimefull open")

restaurant_delivery_times = [item.text for item in delivery_times]

len(restaurant_delivery_times) # Different because it's empty for restaurants that are closed for delivery or closed right now

In [None]:
restaurant_delivery_cost = [item.text for item in delivery_costs]
print(len(restaurant_delivery_cost)) 
set(restaurant_delivery_cost) # unique Delivery costs don't include "Free"

# Excludes restaurants where delivery is free and which are closed

In [None]:
restaurant_wrapper = soup.find_all("div", class_="detailswrapper")

restaurant_wrapper[0]

# This just contains skeleton code

# Checking whether pickled files are okay

In [12]:
with open('../geotracker/data/restaurant_list_pickles/restaurant_list_7.pkl', 'rb') as f:
    restaurant_list = pickle.load(f)
    
len(restaurant_list)

5709

In [26]:
with open('../raw_data/lieferando_pickles/city_lists/city_list_60.pkl', 'rb') as f:
    city_list = pickle.load(f)
    
len(city_list)

60

In [27]:
import pandas as pd
pd.Series(city_list).value_counts()

Berlin    60
dtype: int64

In [30]:
with open('../raw_data/lieferando_pickles/zip_code_lists/zip_code_list_400.pkl', 'rb') as f:
    zip_code_list = pickle.load(f)
    
len(zip_code_list)

400

In [31]:
pd.Series(zip_code_list).value_counts()

10437    46
10117    39
10119    38
10178    28
10435    23
         ..
12347     1
10965     1
13187     1
13088     1
13407     1
Length: 61, dtype: int64

In [29]:
with open('../raw_data/lieferando_pickles/street_lists/street_list_80.pkl', 'rb') as f:
    street_list = pickle.load(f)
    
len(street_list)

80

In [None]:
pd.Series(zip_code_list).value_counts()

10437    46
10117    39
10119    38
10178    28
10435    23
         ..
12347     1
10965     1
13187     1
13088     1
13407     1
Length: 61, dtype: int64

In [41]:
import numpy as np
np.random.randint(1,4)

3

In [23]:
from concurrent.futures import ThreadPoolExecutor

threading.csv_writer_lock()

AttributeError: module 'threading' has no attribute 'csv_writer_lock'