In [1]:
# data wrangling imports
import numpy as np
import pandas as pd 

# other imports
import csv
import re

In [2]:
# I had to run 2 scrapes, and just in case divided the database in 2
df_1 = pd.read_csv("../geotracker/data/wolt_2611.csv")
df_2 = pd.read_csv("../geotracker/data/wolt.csv")
wolt_df = pd.concat([df_1, df_2])


In [3]:
# creating column to identify the data source
wolt_df["database"] = "wolt"

# dropping duplicated columns
wolt_df.drop_duplicates(inplace=True)

# preprocessing and turning into float min delivery_fee column
wolt_df.delivery_fee.replace("deliveryMin.", np.NaN, inplace=True)
wolt_df.delivery_fee = wolt_df.delivery_fee.apply(lambda x: 1.90
                                                  if x == "1.90Min." else x)
wolt_df.reset_index(drop="index", inplace=True)


In [4]:
# preprocessing and turning into float min avg_delivery_time column
mask_1 = wolt_df['avg_delivery_time'].notnull()
mask_2 = wolt_df.avg_delivery_time != "Takeaway"
mask = mask_1 & mask_2


In [5]:
wolt_df.tail()

Unnamed: 0,restaurant_name,avg_review_score,street,zip_code,city_name,type_of_cuisine,minimum_order_value,delivery_fee,pricyness,latitude,longitude,avg_delivery_time,database
1356,Shiso Burger Mitte,8.6,Auguststrasse 29C,10119.0,Berlin,"['bap burgers and pastrami', 'hamburger', 'asi...",10.0,1.9,€€,52.527281,13.39882,,wolt
1357,Yakoolza,8.6,Wilhelmstraße 87,10117.0,Berlin,"['sushi', 'asian', 'fish']",10.0,,€€,52.513098,13.381989,45–55 min,wolt
1358,Don Sushi 15,8.4,Brückenstraße 15,10179.0,Berlin,"['sushi', 'Don', 'bowl']",10.0,,€€,52.511576,13.416528,45–55 min,wolt
1359,Cafe Fleury,9.2,Weinbergsweg 20,10119.0,Berlin,"['french', 'baguette', 'homemade']",10.0,,€€,52.531174,13.402212,45–55 min,wolt
1360,Kuchi Mitte,8.8,Gipsstraße 3,10119.0,Berlin,"['asian', 'japanese', 'sushi']",10.0,,€€,52.527184,13.399308,30–40 min,wolt


In [6]:
wolt_df["min_avg_delivery_time"] = wolt_df[
    mask].loc[:, "avg_delivery_time"].apply(
        lambda x: x.replace("–", " ").split(" ")[0] if str(x) != np.nan else np.nan)
    
wolt_df["max_avg_delivery_time"] = wolt_df[
    mask].loc[:, "avg_delivery_time"].apply(
        lambda x: x.replace("–", " ").split(" ")[1])

wolt_df[["min_avg_delivery_time", "max_avg_delivery_time"
 ]] = wolt_df[["min_avg_delivery_time",
               "max_avg_delivery_time"]].apply(pd.to_numeric)


In [7]:
# calculating clean avg_delivery_time : avg_delivery_time_clean
wolt_df["avg_delivery_time_clean"] = wolt_df[[
    "min_avg_delivery_time", "max_avg_delivery_time"
]].mean(axis=1)
wolt_df.drop(columns=["avg_delivery_time"], inplace=True)

In [8]:
# preprocessing type_of_cuisine

# converting into a proper list
wolt_df.type_of_cuisine = wolt_df.type_of_cuisine.apply(lambda x: x.replace(
    "'", "").replace(" ", "").replace("[", "").replace("]", "").split(","))

# creating separate columns for each of the 3 types of cuisine options
wolt_df["type_of_cuisine_categorized"] = wolt_df["type_of_cuisine"].apply(
    pd.Series)[0]
wolt_df["type_of_cuisine_2"] = wolt_df["type_of_cuisine"].apply(pd.Series)[1]
wolt_df["type_of_cuisine_3"] = wolt_df["type_of_cuisine"].apply(pd.Series)[2]

toc1 = wolt_df.type_of_cuisine_categorized.unique().tolist()
toc2 = wolt_df.type_of_cuisine_2.unique().tolist()
toc3 = wolt_df.type_of_cuisine_3.unique().tolist()
unique_toc = list(set(toc1 + toc2 + toc3))

# dictionary containing all keywords and categorizing
unique_toc_dict = {
    'thai': 'asian',
    'fine-dining': 'middle eastern',
    'cheese': np.nan,
    'fusion': np.nan,
    'butterchicken': 'middle eastern',
    'chinese': 'asian',
    'traditional': np.nan,
    'café': 'cafes',
    'german': 'european',
    'cocktail': 'bars',
    'hamburger': 'fastfood',
    'baklava': 'middle eastern',
    'pokebowl': 'poke',
    'bagel': 'breakfast/dessert',
    'waffles': 'cafes',
    'worklunch': 'fastfood',
    'tapas': 'mediterranean',
    'mediterranean': 'mediterranean',
    'pastries': 'breakfast/dessert',
    'sliders': 'fastfood',
    'turkish': 'middle eastern',
    'steak': 'steak',
    'snacks': 'snacks',
    'Sashimi': 'asian',
    'pancakes': 'breakfast/dessert',
    'Georgian': 'european',
    'risotto': 'italian',
    'glutenfree': 'healthy',
    'bento': 'asian',
    'bistro': 'european',
    'shawarma': 'middle eastern',
    'meatballs': 'european',
    'sushi': 'asian',
    'fish': 'seafood',
    'Don': np.nan,
    'bakery': 'breakfast/dessert',
    'delicious': np.nan,
    'mexican': 'mexican',
    'summerrolls': 'asian',
    'chickennuggets': 'fastfood',
    'korean': 'asian',
    'vegan': 'vegetarian or vegan',
    'chocolate': 'breakfast/dessert',
    'porridge': 'breakfast/dessert',
    'Schnitzel': 'european',
    'Arabic': 'middle eastern',
    'moussaka': 'middle eastern',
    'Dessert': 'breakfast/dessert',
    'vegetarian': 'vegetarian or vegan',
    'donut': 'breakfast/dessert',
    'friedchicken': 'fastfood',
    'smoothie': 'breakfast/dessert',
    'beer': 'bars',
    'indian': 'middle eastern',
    'rice': 'asian',
    'fruit': 'breakfast/dessert',
    'icecoffee': 'cafes',
    'Pastrami': np.nan,
    'dumplings': 'asian',
    'currywurst': 'fastfood',
    'asian': 'asian',
    'pizza': 'italian',
    'grill': 'steak',
    'seafood': 'seafood',
    'wine': 'bars',
    'schnitzel': 'european',
    'american': 'american',
    'healthy': 'healthy',
    'hummus': 'middle eastern',
    'russian': 'russian',
    'Donburi': np.nan,
    'baguette': 'breakfast/dessert',
    'salad': 'healthy',
    'fries': 'fastfood',
    'Austrian': 'european',
    'gyoza': 'asian',
    'potato': np.nan,
    'naan': 'nan',
    'icecream': 'breakfast/dessert',
    'pita': 'mediterranean',
    'sausage': 'european',
    'neapolitanpizza': 'italian',
    'spaghetti': 'italian',
    'cake': 'breakfast/dessert',
    'ribs': 'american',
    'dessert': "breakfast/dessert",
    'Doughnut': 'breakfast/dessert',
    'matcha': 'breakfast/dessert',
    'focaccia': 'italian',
    'homemade': np.nan,
    'milkshake': 'american',
    'taco': 'mexican',
    'curry': 'fastfood',
    'israeli': 'middle eastern',
    'bapburgersandpastrami': '',
    'sandwich': 'fastfood',
    'bowl': 'poke',
    'maki': 'asian',
    'roll': 'asian',
    'brunch': 'breakfast/dessert',
    'pho': 'asian',
    'vietnamese': 'asian',
    'burgers': 'fastfood',
    'muchapizza': 'italian',
    'masala': 'middle eastern',
    'contemporary': np.nan,
    'falafel': 'middle eastern',
    'gyros': 'greek',
    'friedrice': 'asian',
    'chicken': 'snacks',
    'italian': 'european',
    'spaetzle': 'european',
    'streetfood': 'fastfood',
    'Mozzarella': 'italian',
    'ramen': 'asian',
    'antipasti': 'italian',
    'noodles': 'italian',
    'hotdog': 'snacks',
    'coffee': 'cafes',
    'oriental': 'middle eastern',
    'bubbletea': 'breakfast/dessert',
    'greek': 'mediterranean',
    'middleeastern': 'middle eastern',
    'pasta': 'italian',
    'fresh': 'healthy',
    'tandoori': 'middle eastern',
    'wrap': 'healthy',
    'european': 'european',
    'tea': 'breakfast/dessert',
    'bao': 'asian',
    'beyondmeat': 'vegetarian or vegan',
    'duck': 'asian',
    'galette': 'european',
    'panini': 'italian',
    'soup': 'healthy',
    'fastfood': 'fastfood',
    'LatinAmerican': 'south american',
    'juice': 'breakfast/dessert',
    'halal': 'middle eastern',
    'burger': 'american',
    'vegetable': 'vegetarian or vegan',
    'Hawaii': 'american',
    'breakfast': 'breakfast/dessert',
    'poke': 'poke',
    'drinks': 'bars',
    'burrito': 'mexican',
    'salmon': 'seafood',
    'homecooking': np.nan,
    'spanish': 'mediterranean',
    'meat': 'steak',
    'french': 'european',
    'veggieburger': 'vegetarian or vegan',
    'lunch': 'steak',
    'meze': 'asian',
    'homemademeals': np.nan,
    'kebab': 'middle eastern',
    'beef': 'steak',
    'tex-mex': 'mexican',
    'japanese': 'asian',
    '': np.nan
}

# in case we need to add more columns

for x in unique_toc:
    if x not in unique_toc_dict.keys():
        unique_toc_dict[x] = np.nan

# updating type of cuisine columns
wolt_df[
    'type_of_cuisine_categorized'] = wolt_df.type_of_cuisine_categorized.map(
        unique_toc_dict)
wolt_df.type_of_cuisine_2 = wolt_df.type_of_cuisine_2.map(unique_toc_dict)
wolt_df.type_of_cuisine_3 = wolt_df.type_of_cuisine_3.map(unique_toc_dict)

# encoding pricyness column

wolt_df["pricyness"] = wolt_df["pricyness"].map({
    '€': 1,
    '€€': 2,
    '€€€': 3,
    np.nan: np.nan,
    '€€€€': 4
})

# converting zipcode into an integer
wolt_df["zip_code"] = wolt_df["zip_code"].astype('Int64')


In [9]:
# creating clean df
wolt_df_clean = wolt_df[['restaurant_name', 'avg_review_score', 'minimum_order_value',
    'delivery_fee', 'pricyness', 'avg_delivery_time_clean',
    'type_of_cuisine_categorized', 'street', 'zip_code', 'city_name',
    'latitude', 'longitude', 'database']]

In [10]:
# saving de into csv
wolt_df_clean.to_csv("../geotracker/data/wolt_clean_data.csv")

In [11]:
wolt_df_clean.type_of_cuisine_categorized.unique()

array(['vegetarian or vegan', 'european', 'poke', 'middle eastern',
       'asian', 'italian', 'steak', 'american', 'breakfast/dessert', nan,
       'cafes', 'healthy', 'seafood', 'fastfood', 'bars', '', 'snacks',
       'mexican', 'mediterranean', 'russian'], dtype=object)