In [1]:
# data wrangling
import pandas as pd
import numpy as np
from fuzzywuzzy import fuzz
from fuzzywuzzy import process
from fuzzymatcher import link_table, fuzzy_left_join

# data visualization
import matplotlib.pyplot as plt
import seaborn as sns


In [None]:
all_data = pd.read_csv("../geotracker/data/all_data.csv").iloc[:, 1:]
# converting lat/lon & coords into 5 decimal figures
all_data["latitude"] = all_data.latitude.round(5)
all_data["longitude"] = all_data.longitude.round(5)
all_data["coordinates"] = all_data.latitude.astype(
    str) + "," + all_data.longitude.astype(str)


In [None]:
all_data.shape

(7087, 13)

In [None]:
# dividing all data into 3 dataframes; each for one column

wolt = all_data[all_data.database == "wolt"]
lieferando = all_data[all_data.database == "lieferando"]
maps = all_data[all_data.database == "here_maps"]

maps.rename(columns={"restaurant_name": "name"}, inplace=True)


# adjusting name cols to make sense for fuzzy merge
maps['name'] = maps['name'].str.lower()
maps['name'] = maps.loc[:, 'name'].apply(lambda x: str(x).replace(" ", ""))

wolt['restaurant_name'] = wolt.restaurant_name.str.lower()
wolt['restaurant_name'] = wolt.loc[:, "restaurant_name"].apply(
    lambda x: str(x).replace(" ", ""))




def fuzzy_merge(df_1, df_2, key1, key2, threshold=90, limit=1):
    """
    :param df_1: the left table to join
    :param df_2: the right table to join
    :param key1: key column of the left table
    :param key2: key column of the right table
    :param threshold: how close the matches should be to return a match, based on Levenshtein distance
    :param limit: the amount of matches that will get returned, these are sorted high to low
    :return: dataframe with boths keys and matches
    """
    s = df_2[key2].tolist()

    m = df_1[key1].apply(lambda x: process.extract(x, s, limit=limit))
    df_1['match'] = m

    m2 = df_1['match'].apply(
        lambda x: ', '.join([i[0] for i in x if i[1] >= threshold]))
    df_1['match'] = m2

    return df_1

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().rename(
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  maps['name'] = maps['name'].str.lower()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  maps['name'] = maps.loc[:, 'name'].apply(lambda x: str(x).replace(" ", ""))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead


In [None]:
fuzzy_merge(maps, wolt, "name", "restaurant_name", threshold=75, limit=1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_1['match'] = m
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_1['match'] = m2


Unnamed: 0,name,avg_review_score,reviews,type_of_cuisine,address,street,zip_code,city_name,coordinates,latitude,longitude,database,opening_hours,match
3740,ihrkochontour,,,european,Seelenbinderstraße 112 Köpenick 12555 Berlin,Seelenbinderstraße 112,12555,Köpenick,"52.45481,13.59031",52.45481,13.59031,here_maps,Wed-Sat: 17:00 - 22:30 Sun: 14:00 - 19:00,
3741,pizzawerkköpenick,,,italian,Seelenbinderstraße 112 Köpenick 12555 Berlin,Seelenbinderstraße 112,12555,Köpenick,"52.45481,13.59031",52.45481,13.59031,here_maps,,
3742,pizzawerk,,,italian,Seelenbinderstraße 112 Köpenick 12555 Berlin,Seelenbinderstraße 112,12555,Köpenick,"52.45481,13.59031",52.45481,13.59031,here_maps,Tue-Sun: 17:00 - 22:00,
3743,veracruz,,,mexican,Fürstenwalder Damm 260 Friedrichshagen 12587 B...,Fürstenwalder Damm 260,12587,Friedrichshagen,"52.45268,13.59594",52.45268,13.59594,here_maps,"Mon-Thu, Sun: 12:00 - 23:00 Fri, Sat: 12:00 - ...",
3744,marinasol,,,european,Müggelseedamm 70 Friedrichshagen 12587 Berlin,Müggelseedamm 70,12587,Friedrichshagen,"52.44842,13.61015",52.44842,13.61015,here_maps,Mon-Sun: 12:00 - 22:00,marinablu
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7082,restaurantmaracas,,,steak,Neuköllner Straße 201 Rudow 12357 Berlin,Neuköllner Straße 201,12357,Rudow,"52.43003,13.47804",52.43003,13.47804,here_maps,Mon-Sun: 12:00 - 22:00,
7083,verde'spizza&gelato,,,italian,JoachimGottschalkWeg 21 Gropiusstadt 12353 Berlin,JoachimGottschalkWeg 21,12353,Gropiusstadt,"52.42294,13.47549",52.42294,13.47549,here_maps,Mon-Sat: 09:00 - 22:00,tiny'spizza
7084,eiscafékunterbunt,,,breakfast/dessert,Waldsassener Straße 59A Marienfelde 12279 Berlin,Waldsassener Straße 59A,12279,Marienfelde,"52.40663,13.34947",52.40663,13.34947,here_maps,Mon-Sun: 12:00 - 19:00,
7085,osteriaballerino,,,mediterranean,Schwanenwerderweg Nikolassee 14129 Berlin,Schwanenwerderweg,14129,Nikolassee,"52.44167,13.18912",52.44167,13.18912,here_maps,Mon-Sun: 10:00 - 22:00,


In [None]:
maps[maps.match != ""][["name", "match", "coordinates"]]

Unnamed: 0,name,match,coordinates
3744,marinasol,marinablu,"52.44842,13.61015"
3752,elloco,hellocharge,"52.44732,13.57702"
3754,domino'spizza,tiny'spizza,"52.45415,13.57784"
3756,burgerburo,burgersburgersmitte,"52.45736,13.57798"
3760,nordsee,nordsee-berlinspandauerstraße,"52.45846,13.57895"
...,...,...,...
7063,scheich-schnellrestaurant,omrestaurant,"52.41608,13.49607"
7066,schrösel's,chef's,"52.41899,13.49848"
7070,kaiserdrachenchinarestaurant,chenche,"52.4354,13.50174"
7075,ristorantealbergo,ristorantemarea,"52.41501,13.49555"


In [None]:
maps["name_matcher"] = maps.apply(lambda x: x["match"]
                                  if x["match"] != "" else x["name"],
                                  axis=1)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  maps["name_matcher"] = maps.apply(lambda x: x["match"]


In [None]:
maps["latitude"] = maps.latitude.round(3)
maps["longitude"] = maps.longitude.round(3)
wolt["latitude"] = wolt.latitude.round(3)
wolt["longitude"] = wolt.longitude.round(3)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  maps["latitude"] = maps.latitude.round(3)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  maps["longitude"] = maps.longitude.round(3)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  wolt["latitude"] = wolt.latitude.round(3)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using 

In [None]:
# merged_table = pd.merge(maps,
#                         wolt[["restaurant_name", "coordinates"]],
#                         left_on=["name_matcher", "coordinates"],
#                         right_on=["restaurant_name", "coordinates"],
#                         how="left")

merged_table = pd.merge(maps,
                        wolt[["restaurant_name", "latitude", "longitude"]],
                        on=["latitude", "longitude"],
                        how="inner")

In [None]:
merged_table.drop_duplicates()

Unnamed: 0,name,avg_review_score,reviews,type_of_cuisine,address,street,zip_code,city_name,coordinates,latitude,longitude,database,opening_hours,match,name_matcher,restaurant_name
0,rotisserieweingrün,,,european,Gertraudenstraße 10 Mitte 10178 Berlin,Gertraudenstraße 10,10178,Mitte,"52.51273,13.40303",52.513,13.403,here_maps,Mon-Sat: 17:00 - 23:00 Sun: 00:00 - 00:00,rotisserieweingrün,rotisserieweingrün,rotisserieweingrün
1,cafeampetriplatz,,,european,Gertraudenstraße Mitte 10178 Berlin,Gertraudenstraße,10178,Mitte,"52.51294,13.40318",52.513,13.403,here_maps,Mon-Sat: 11:00 - 22:00 Sun: 11:00 - 20:00,,cafeampetriplatz,rotisserieweingrün
2,jedermann's,,,breakfast/dessert,Unter den Linden 12 Mitte 10117 Berlin,Unter den Linden 12,10117,Mitte,"52.51739,13.39015",52.517,13.390,here_maps,Mon-Sun: 10:00 - 23:00,fresh's,fresh's,littlegreenrabbitunterdenlinden
3,thedigitaleatery,,,international,Unter den Linden 17 Mitte 10117 Berlin,Unter den Linden 17,10117,Mitte,"52.51681,13.39048",52.517,13.390,here_maps,Mon-Fri: 09:00 - 19:00 Sat: 11:00 - 20:00 Sun:...,,thedigitaleatery,littlegreenrabbitunterdenlinden
4,vietnamrestaurant,,,asian,Unter den Linden 12 Mitte 10117 Berlin,Unter den Linden 12,10117,Mitte,"52.51739,13.39015",52.517,13.390,here_maps,Mon-Sat: 10:30 - 22:00 Sun: 11:30 - 22:00,eden-vietnameserestaurant,eden-vietnameserestaurant,littlegreenrabbitunterdenlinden
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
151,arman,,,asian,Mehringdamm 45 Kreuzberg 10961 Berlin,Mehringdamm 45,10961,Kreuzberg,"52.49199,13.38771",52.492,13.388,here_maps,"Mon-Thu: 12:00 - 13:00 Fri, Sat: 12:00 - 14:00...",asman,asman,cancunkreuzberg
152,sushicube,,,,Zossener Straße 18 Kreuzberg 10961 Berlin,Zossener Straße 18,10961,Kreuzberg,"52.49081,13.39403",52.491,13.394,here_maps,"Mon-Sat: 12:00 - 00:00, 12:00 - 23:30 Sun: 12:...",sushicube,sushicube,sesam
153,sushicube,,,,Zossener Straße 18 Kreuzberg 10961 Berlin,Zossener Straße 18,10961,Kreuzberg,"52.49081,13.39403",52.491,13.394,here_maps,"Mon-Sat: 12:00 - 00:00, 12:00 - 23:30 Sun: 12:...",sushicube,sushicube,sushicube
154,tuttiisland,,,middle eastern,Zossener Straße 17 Kreuzberg 10961 Berlin,Zossener Straße 17,10961,Kreuzberg,"52.49094,13.39407",52.491,13.394,here_maps,Mon-Sun: 11:00 - 23:00,,tuttiisland,sesam


In [None]:
wolt.shape

(468, 13)