In [1]:
import requests
import string
import numpy as np
import pandas as pd
import json
import glob
import os
import re

In [2]:
def get_circlegrid(topleft, bottomright, spacing=2, overlap=1):
    """Build a ``spacing`` x ``spacing`` grid of points over a bounding box.

    Parameters
    ----------
    topleft : sequence of float
        ``(latitude, longitude)`` of one corner, in decimal degrees.
    bottomright : sequence of float
        ``(latitude, longitude)`` of the opposite corner, in decimal degrees.
    spacing : int, default 2
        Number of grid points along each axis. Must be at least 2, because
        the radius divides by ``spacing - 1``.
    overlap : float, default 1
        Multiplier applied to the radius (values above 1 enlarge it).

    Returns
    -------
    points : dict[int, str]
        Running integer index -> ``"lat,lon"`` string for every grid point,
        enumerated latitude-major.
    mradius : float
        Radius in metres: half the latitude step between neighbouring grid
        points, scaled by ``overlap`` and converted from degrees to metres.
    """
    lats = np.linspace(topleft[0], bottomright[0], spacing)
    lons = np.linspace(bottomright[1], topleft[1], spacing)

    # Every latitude is paired with every longitude, numbered consecutively.
    points = {
        index: f'{lat},{lon}'
        for index, (lat, lon) in enumerate(
            (lat, lon) for lat in lats for lon in lons
        )
    }

    side_length = bottomright[0] - topleft[0]
    degradius = (side_length / (2 * (spacing - 1))) * overlap

    # 40075 km is the equatorial circumference; one degree shrinks with the
    # cosine of the latitude. np.cos expects radians, and the factor depends
    # on the latitude (index 0), not on the longitude.
    km_per_degree = 40075 * np.cos(np.radians(topleft[0])) / 360
    mradius = abs(1000 * degradius * km_per_degree)

    return points, mradius

In [3]:
def split_title_tags(tags):
    """Turn the ``tags`` attribute of a record into ``tag_<n>`` columns.

    Parameters
    ----------
    tags : object
        Anything exposing a ``.tags`` attribute that is either a float
        (NaN, i.e. no tags), ``None``, or an iterable of dict-like objects
        with a ``'title'`` key.

    Returns
    -------
    dict
        ``{'tag_0': None}`` when there are no tags; otherwise
        ``{'tag_0': title, 'tag_1': title, ...}`` with duplicate titles
        removed and the first-seen order preserved.
    """
    raw_tags = tags.tags

    # A float here means NaN (missing value); treat None the same way.
    if raw_tags is None or isinstance(raw_tags, float):
        return {'tag_0': None}

    # dict.fromkeys de-duplicates while keeping insertion order, so the
    # tag_<n> numbering is reproducible between runs (a set is not).
    unique_titles = dict.fromkeys(tag.get('title') for tag in raw_tags)

    return {f'tag_{i}': title for i, title in enumerate(unique_titles)}

In [4]:
# Load the combined CSV and show it.
combined_csv_path = "../csv/combined_csv.csv"
df = pd.read_csv(combined_csv_path)
df

Unnamed: 0,position,title,category/title,vicinity,openingHours/text,tag_0,tag_1,tag_2,tag_3,tag_4,...,tag_9,tag_10,Coordinates,Name,Category,Address,Opening Hours,Cuisine_1,Cuisine_2,Cuisine_3
0,"[52.45481, 13.59031]",Ihr Koch on Tour,Restaurant,"Seelenbinderstraße 112<br/>Köpenick, 12555 Berlin",Wed-Sat: 17:00 - 22:30<br/>Sun: 14:00 - 19:00,German,,,,,...,,,,,,,,,,
1,"[52.45481, 13.59031]",Pizzawerkköpenick,Restaurant,"Seelenbinderstraße 112<br/>Köpenick, 12555 Berlin",,Pizza,Burgers,Italian,,,...,,,,,,,,,,
2,"[52.45481, 13.59031]",Pizzawerk,Restaurant,"Seelenbinderstraße 112<br/>Köpenick, 12555 Berlin",Tue-Sun: 17:00 - 22:00,Pizza,,,,,...,,,,,,,,,,
3,"[52.45268, 13.59594]",Veracruz,Restaurant,"Fürstenwalder Damm 260<br/>Friedrichshagen, 12...","Mon-Thu, Sun: 12:00 - 23:00<br/>Fri, Sat: 12:0...",Mexican,,,,,...,,,,,,,,,,
4,"[52.44842, 13.61015]",Marina Sol,Restaurant,"Müggelseedamm 70<br/>Friedrichshagen, 12587 Be...",Mon-Sun: 12:00 - 22:00,Balkan,Grill,Mediterranean,European,Seafood,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11574,"[52.43003, 13.47804]",Restaurant Maracas,Restaurant,"Neuköllner Straße 201<br/>Rudow, 12357 Berlin",Mon-Sun: 12:00 - 22:00,Steak,Grill,Mexican,,,...,,,,,,,,,,
11575,"[52.42294, 13.47549]",Verde's Pizza & Gelato,Restaurant,"Joachim-Gottschalk-Weg 21<br/>Gropiusstadt, 12...",Mon-Sat: 09:00 - 22:00,Pizza,American,Italian,Barbecue,,...,,,,,,,,,,
11576,"[52.40663, 13.34947]",Eiscafé Kunterbunt,Food & Drink,"Waldsassener Straße 59A<br/>Marienfelde, 12279...",Mon-Sun: 12:00 - 19:00,Ice cream,,,,,...,,,,,,,,,,
11577,"[52.441673, 13.189119]",Osteria Ballerino,Restaurant,"Schwanenwerderweg<br/>Nikolassee, 14129 Berlin",Mon-Sun: 10:00 - 22:00,Mediterranean,European,Italian,,,...,,,,,,,,,,


In [5]:
# Remove the capitalised columns; several of these names are reused by the
# rename step later in the notebook.
df = df.drop(
    columns=[
        "Coordinates",
        "Name",
        "Category",
        "Address",
        "Opening Hours",
        "Cuisine_1",
        "Cuisine_2",
        "Cuisine_3",
    ]
)

In [6]:
# Keep only tag_0 and tag_1; drop tag_2 through tag_10 in one call.
df = df.drop(columns=[f"tag_{i}" for i in range(2, 11)])

In [7]:
# Rows sharing both position and title are treated as the same entry.
df = df.drop_duplicates(subset=["position", "title"])

In [8]:
# Replace NaN with '' so the string helpers defined below can be applied.

for text_column in ("openingHours/text", "vicinity"):
    df[text_column] = df[text_column].fillna("")

In [9]:
def punctuation(x):
    """Return ``x`` with every character in ``string.punctuation`` removed."""
    # A translation table that maps each ASCII punctuation mark to "deleted".
    strip_table = str.maketrans('', '', string.punctuation)
    return x.translate(strip_table)

def remove_br(x):
    """Return ``x`` with every ``<br/>`` tag swapped for a single space."""
    return " ".join(x.split("<br/>"))

In [10]:
# Strip "<br/>" from both text columns; punctuation is removed from the
# address only, so the opening hours keep their ':' and '-'.

df["openingHours/text"] = df["openingHours/text"].apply(remove_br)
df["vicinity"] = df["vicinity"].apply(remove_br).apply(punctuation)

In [11]:
# Give the remaining columns their final names.

column_names = {
    "position": "Coordinates",
    "title": "Name",
    "category/title": "Type",
    "vicinity": "Address",
    "openingHours/text": "Opening Hours",
    "tag_0": "Cuisine_1",
    "tag_1": "Cuisine_2",
}
df = df.rename(columns=column_names)

In [42]:
# Renumber the rows 0..n-1 and discard the old index.
df = df.reset_index(drop=True)

In [43]:
# Preview the frame after the index reset.
# NOTE(review): the saved output of this cell already lists Street / Bezirk /
# PLZ, which are only created by address_splitterDE further down, and the
# execution counts around here are not sequential. The output is therefore
# stale; re-run with Restart & Run All.
df

Unnamed: 0,Coordinates,Name,Type,Address,Opening Hours,Cuisine_1,Cuisine_2,Street,Bezirk,PLZ
0,"[52.45481, 13.59031]",Ihr Koch on Tour,Restaurant,Seelenbinderstraße 112 Köpenick 12555 Berlin,Wed-Sat: 17:00 - 22:30 Sun: 14:00 - 19:00,German,,Seelenbinderstraße 112,Köpenick,12555
1,"[52.45481, 13.59031]",Pizzawerkköpenick,Restaurant,Seelenbinderstraße 112 Köpenick 12555 Berlin,,Pizza,Burgers,Seelenbinderstraße 112,Köpenick,12555
2,"[52.45481, 13.59031]",Pizzawerk,Restaurant,Seelenbinderstraße 112 Köpenick 12555 Berlin,Tue-Sun: 17:00 - 22:00,Pizza,,Seelenbinderstraße 112,Köpenick,12555
3,"[52.45268, 13.59594]",Veracruz,Restaurant,Fürstenwalder Damm 260 Friedrichshagen 12587 B...,"Mon-Thu, Sun: 12:00 - 23:00 Fri, Sat: 12:00 - ...",Mexican,,Fürstenwalder Damm 260,Friedrichshagen,12587
4,"[52.44842, 13.61015]",Marina Sol,Restaurant,Müggelseedamm 70 Friedrichshagen 12587 Berlin,Mon-Sun: 12:00 - 22:00,Balkan,Grill,Müggelseedamm 70,Friedrichshagen,12587
...,...,...,...,...,...,...,...,...,...,...
3343,"[52.43003, 13.47804]",Restaurant Maracas,Restaurant,Neuköllner Straße 201 Rudow 12357 Berlin,Mon-Sun: 12:00 - 22:00,Steak,Grill,,,
3344,"[52.42294, 13.47549]",Verde's Pizza & Gelato,Restaurant,JoachimGottschalkWeg 21 Gropiusstadt 12353 Berlin,Mon-Sat: 09:00 - 22:00,Pizza,American,,,
3345,"[52.40663, 13.34947]",Eiscafé Kunterbunt,Food & Drink,Waldsassener Straße 59A Marienfelde 12279 Berlin,Mon-Sun: 12:00 - 19:00,Ice cream,,,,
3346,"[52.441673, 13.189119]",Osteria Ballerino,Restaurant,Schwanenwerderweg Nikolassee 14129 Berlin,Mon-Sun: 10:00 - 22:00,Mediterranean,European,,,


In [40]:
# Number of rows in the Address column.
len(df["Address"])

3348

In [44]:
def address_splitterDE(column):
    """Split German address strings into Street, Bezirk and PLZ columns.

    The results are written into the module-level DataFrame ``df``. Each
    target column is created (filled with ``""``) if it does not exist yet,
    at position 7/8/9 or at the end when the frame has fewer columns.

    Parameters
    ----------
    column : pandas.Series
        Address strings of the form ``"<street> <district> <5-digit PLZ> ..."``.
        Its index labels must be row labels of ``df``.

    Returns
    -------
    None
        ``df`` is modified in place. A field whose pattern does not match
        keeps its current value; the other fields of that row are still
        extracted. Non-string entries are skipped.
    """
    # Create each missing column on its own, so an already existing
    # "Street" does not prevent "Bezirk" / "PLZ" from being added.
    for position, name in ((7, "Street"), (8, "Bezirk"), (9, "PLZ")):
        if name not in df.columns:
            df.insert(min(position, df.shape[1]), name, "")

    extractors = (
        # Everything before the "<word> <5 digits>" pair.
        ("Street", lambda text: re.match(r'^(\w*\s)*(?=\w+\s\d{5})', text)),
        # The single word directly in front of the 5-digit code.
        ("Bezirk", lambda text: re.search(r'(?<=(\d|\w))\s\w+\s(?=\d{5})', text)),
        # The first 5-digit run.
        ("PLZ", lambda text: re.search(r'\d{5}', text)),
    )

    for label, text in column.items():
        if not isinstance(text, str):
            continue
        for name, find in extractors:
            found = find(text)
            if found is not None:
                # .at writes straight into df; chained df[name][label] = ...
                # may assign to a temporary copy and be lost.
                df.at[label, name] = found[0].strip(" ")

# Fill Street / Bezirk / PLZ from the cleaned address strings, then preview.
address_splitterDE(df["Address"])

df

    

Unnamed: 0,Coordinates,Name,Type,Address,Opening Hours,Cuisine_1,Cuisine_2,Street,Bezirk,PLZ
0,"[52.45481, 13.59031]",Ihr Koch on Tour,Restaurant,Seelenbinderstraße 112 Köpenick 12555 Berlin,Wed-Sat: 17:00 - 22:30 Sun: 14:00 - 19:00,German,,Seelenbinderstraße 112,Köpenick,12555
1,"[52.45481, 13.59031]",Pizzawerkköpenick,Restaurant,Seelenbinderstraße 112 Köpenick 12555 Berlin,,Pizza,Burgers,Seelenbinderstraße 112,Köpenick,12555
2,"[52.45481, 13.59031]",Pizzawerk,Restaurant,Seelenbinderstraße 112 Köpenick 12555 Berlin,Tue-Sun: 17:00 - 22:00,Pizza,,Seelenbinderstraße 112,Köpenick,12555
3,"[52.45268, 13.59594]",Veracruz,Restaurant,Fürstenwalder Damm 260 Friedrichshagen 12587 B...,"Mon-Thu, Sun: 12:00 - 23:00 Fri, Sat: 12:00 - ...",Mexican,,Fürstenwalder Damm 260,Friedrichshagen,12587
4,"[52.44842, 13.61015]",Marina Sol,Restaurant,Müggelseedamm 70 Friedrichshagen 12587 Berlin,Mon-Sun: 12:00 - 22:00,Balkan,Grill,Müggelseedamm 70,Friedrichshagen,12587
...,...,...,...,...,...,...,...,...,...,...
3343,"[52.43003, 13.47804]",Restaurant Maracas,Restaurant,Neuköllner Straße 201 Rudow 12357 Berlin,Mon-Sun: 12:00 - 22:00,Steak,Grill,Neuköllner Straße 201,Rudow,12357
3344,"[52.42294, 13.47549]",Verde's Pizza & Gelato,Restaurant,JoachimGottschalkWeg 21 Gropiusstadt 12353 Berlin,Mon-Sat: 09:00 - 22:00,Pizza,American,JoachimGottschalkWeg 21,Gropiusstadt,12353
3345,"[52.40663, 13.34947]",Eiscafé Kunterbunt,Food & Drink,Waldsassener Straße 59A Marienfelde 12279 Berlin,Mon-Sun: 12:00 - 19:00,Ice cream,,Waldsassener Straße 59A,Marienfelde,12279
3346,"[52.441673, 13.189119]",Osteria Ballerino,Restaurant,Schwanenwerderweg Nikolassee 14129 Berlin,Mon-Sun: 10:00 - 22:00,Mediterranean,European,Schwanenwerderweg,Nikolassee,14129


In [45]:
# Write the result without the row index; the 'utf-8-sig' codec puts a
# UTF-8 byte-order mark at the start of the file.
df.to_csv(
    "final.csv",
    index=False,
    header=True,
    encoding="utf-8-sig",
)