In [63]:
import pickle
import re
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.firefox.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as ec

In [30]:
# Unpickle zip code list
with open('../geotracker/data/zip.pkl', 'rb') as f:
    zip_codes = pickle.load(f)

In [31]:
# Create iterable list of URLS for the scraper
base_url = "https://www.lieferando.de/en/delivery/food/berlin-"
urls = [base_url + zip_code for zip_code in zip_codes]
urls[0]

'https://www.lieferando.de/en/delivery/food/berlin-10115'

# Restaurant page overview for each ZIP Code

In [34]:
# Set selenium options
options = Options()
#options.add_argument("--headless")  # Starts driver without opening a window
driver = webdriver.Firefox(options=options)

In [123]:
# Extract the information for restaurants from this page

driver.get(urls[0])
WebDriverWait(driver, 5)
#names = driver.find_elements(by=By.CLASS_NAME, value='restaurantname notranslate')
#names = driver.find_elements(By.CSS_SELECTOR, "h3")
restaurants_names = driver.find_elements(By.XPATH, "//a[@class='restaurantname notranslate']")

restaurant_urls = []
restaurant_names = []
for restaurant in restaurants_names:
    restaurant_urls.append(restaurant.get_attribute("href"))
    restaurant_names.append(restaurant.text)

print(restaurant_urls)
print(restaurant_names)

['https://www.lieferando.de/en/menu/diazo-greifenhagener', 'https://www.lieferando.de/en/menu/hot-million-burger-reinickendorfer', 'https://www.lieferando.de/en/menu/burger-king-berlin-schoenhauser-allee', 'https://www.lieferando.de/en/menu/burger-king-berlin-alexanderplatz', 'https://www.lieferando.de/en/menu/bowl-time', 'https://www.lieferando.de/en/menu/burger-king-berlin-mitte-bahnhof', 'https://www.lieferando.de/en/menu/bring-bakery', 'https://www.lieferando.de/en/menu/otacos-berlin-spandauer-strasse', 'https://www.lieferando.de/en/menu/kreuzburger-friedrichshain', 'https://www.lieferando.de/en/menu/pizza-nostra-berlin', 'https://www.lieferando.de/en/menu/xi-gon', 'https://www.lieferando.de/en/menu/sushi-palace-berlin-pankow', 'https://www.lieferando.de/en/menu/habba-habba-berlin', 'https://www.lieferando.de/en/menu/madni', 'https://www.lieferando.de/en/menu/dunkin-donuts-berlin-unter-den-linden', 'https://www.lieferando.de/en/menu/dunkin-donuts-berlin-badstrasse', 'https://www.li

In [36]:
print(len(restaurant_names))
print(len(restaurant_urls))

705
705


In [56]:
reviews = driver.find_elements(By.CLASS_NAME, "review_rating")
print(len(reviews))

0


In [48]:
#restaurants = driver.find_elements(By.XPATH, "//h2[@class='restaurantname']")

soup = BeautifulSoup(driver.page_source, "html.parser")


In [57]:
reviews = soup.find_all("div", class_="review-rating")
print(len(reviews))
reviews[0]

706


<div class="review-rating">
<div class="review-stars notranslate">
<span class="review-stars-range" style="width: 80%;">
</span>
</div>
<span class="rating-total">(150)</span>
<span class="rating-total-short">(150)</span>
</div>

In [79]:
def convert_rating(rating_attribute):
    found = re.search(r"[0-9]{1,3}", rating_attribute)
    return int(found[0]) / 20
    
convert_rating('width: 100%;')

5.0

In [82]:
clean_reviews = reviews[ :len(restaurants_names)] # Remove the last element in the list which does not contain a review
restaurant_reviews = []
for clean_review in clean_reviews:
    rating = clean_review.find("span").get("style")
    restaurant_reviews.append(convert_rating(rating))
    
len(restaurant_reviews)
    


705

In [116]:
total_ratings = soup.find_all("span", class_="rating-total")
clean_ratings = total_ratings[ :len(restaurants_names)]

restaurant_rating_totals = []
for clean_rating in clean_ratings:
    total_rating = clean_rating.text.strip()
    total_rating = re.search(r"[0-9]{1,5}", total_rating)[0]
    restaurant_rating_totals.append(int(total_rating))

print(len(restaurant_rating_totals))
restaurant_rating_totals[0:5]


705


[150, 128, 209, 128, 6]

In [97]:
kitchens = soup.find_all("div", class_="kitchens")
clean_kitchens = kitchens[ :len(restaurant_names)]

restaurant_kitchens = []
for clean_kitchen in clean_kitchens:
    restaurant_kitchens.append(clean_kitchen.find("span").text)
    
len(restaurant_kitchens)
    

705

In [118]:
# Final restaurant list

restaurant_list = []
for restaurant_name, restaurant_url, restaurant_review, restaurant_rating_total, restaurant_kitchen in zip(restaurant_names, restaurant_urls, restaurant_reviews, restaurant_rating_totals, restaurant_kitchens):
    restaurant_list.append(
        dict(restaurant_name=restaurant_name,
             restaurant_url=restaurant_url, 
             restaurant_review=restaurant_review, 
             restaurant_rating_total=restaurant_rating_total, 
             restaurant_kitchen=restaurant_kitchen)
    )
    
restaurant_list[:2]

[{'restaurant_name': 'Diazo',
  'restaurant_url': 'https://www.lieferando.de/en/menu/diazo-greifenhagener',
  'restaurant_review': 4.0,
  'restaurant_rating_total': 150,
  'restaurant_kitchen': 'Italian style pizza, Burgers, Pasta'},
 {'restaurant_name': 'Hot Million Burger',
  'restaurant_url': 'https://www.lieferando.de/en/menu/hot-million-burger-reinickendorfer',
  'restaurant_review': 4.5,
  'restaurant_rating_total': 128,
  'restaurant_kitchen': 'Italian style pizza, Burgers, Pasta'}]

# Feature extraction from the restaurant pages

In [160]:
driver.get(restaurant_list[0].get("restaurant_url"))
wait = WebDriverWait(driver, 15)
wait.until(ec.visibility_of_element_located(
    (By.XPATH, "//button[@class='info info-icon js-open-info-tab']")))


button = driver.find_element(By.XPATH, "//button[@class='info info-icon js-open-info-tab']")
button.click()

In [156]:
address = driver.find_element(By.XPATH, "//section[@class='card-body notranslate']").text

address

'Greifenhagener Straße 61\n10437 Berlin'

In [157]:
# Extract street, ZIP code and City from string
street = re.search(r".*", address)[0]
zip_code = re.search(r"[0-9]{5}", address)[0]
city = re.search(r"[\w]+$", address)[0]

print("street:", street)
print("zip code:", zip_code)
print("city:", city)

street: Greifenhagener Straße 61
zip code: 10437
city: Berlin


In [158]:
copy_rest_list = restaurant_list.copy()

In [159]:
copy_rest_list[0]["street"] = street
copy_rest_list[0]["zip_code"] = zip_code
copy_rest_list[0]["city"] = city

copy_rest_list[0]

{'restaurant_name': 'Diazo',
 'restaurant_url': 'https://www.lieferando.de/en/menu/diazo-greifenhagener',
 'restaurant_review': 4.0,
 'restaurant_rating_total': 150,
 'restaurant_kitchen': 'Italian style pizza, Burgers, Pasta',
 'street': 'Greifenhagener Straße 61',
 'zip_code': '10437',
 'city': 'Berlin'}

In [165]:
import pandas as pd
df = pd.DataFrame(restaurant_list)
df.drop_duplicates(subset=['restaurant_url'], inplace=True)
restaurant_urls = df["restaurant_url"]

list(restaurant_urls)
for index, url in restaurant_urls:
    print(index, url)

['https://www.lieferando.de/en/menu/diazo-greifenhagener',
 'https://www.lieferando.de/en/menu/hot-million-burger-reinickendorfer',
 'https://www.lieferando.de/en/menu/burger-king-berlin-schoenhauser-allee',
 'https://www.lieferando.de/en/menu/burger-king-berlin-alexanderplatz',
 'https://www.lieferando.de/en/menu/bowl-time',
 'https://www.lieferando.de/en/menu/burger-king-berlin-mitte-bahnhof',
 'https://www.lieferando.de/en/menu/bring-bakery',
 'https://www.lieferando.de/en/menu/otacos-berlin-spandauer-strasse',
 'https://www.lieferando.de/en/menu/kreuzburger-friedrichshain',
 'https://www.lieferando.de/en/menu/pizza-nostra-berlin',
 'https://www.lieferando.de/en/menu/xi-gon',
 'https://www.lieferando.de/en/menu/habba-habba-berlin',
 'https://www.lieferando.de/en/menu/madni',
 'https://www.lieferando.de/en/menu/dunkin-donuts-berlin-badstrasse',
 'https://www.lieferando.de/en/menu/maison-viet-10437',
 'https://www.lieferando.de/en/menu/riga-s-pizza-box-berlin-berlin',
 'https://www.li

# Stuff that didn't work

In [None]:
delivery_costs = soup.find_all("div", class_="delivery-cost js-delivery-cost notranslate")
len(delivery_costs)

442

In [None]:
delivery_times = soup.find_all("div", class_="avgdeliverytime avgdeliverytimefull open")

restaurant_delivery_times = [item.text for item in delivery_times]

len(restaurant_delivery_times) # Different because it's empty for restaurants that are closed for delivery or closed right now

589

In [None]:
restaurant_delivery_cost = [item.text for item in delivery_costs]
print(len(restaurant_delivery_cost)) 
set(restaurant_delivery_cost) # unique Delivery costs don't include "Free"

# Excludes restaurants where delivery is free and which are closed

442


{'0,50 €',
 '0,90 €',
 '1,00 €',
 '1,50 €',
 '1,90 €',
 '1,99 €',
 '2,00 €',
 '2,50 €',
 '2,90 €',
 '2,99 €',
 '3,00 €',
 '3,90 €',
 '5,00 €'}

In [None]:
restaurant_wrapper = soup.find_all("div", class_="detailswrapper")

restaurant_wrapper[0]

# This just contains skeleton code

<div class="detailswrapper">
<div class="shim skeleton__title"></div>
<div class="shim skeleton__kitchens"></div>
<div class="skeleton__details">
<div class="shim skeleton__detail"></div>
<div class="shim skeleton__detail"></div>
<div class="shim skeleton__detail"></div>
</div>
</div>