# Mapping locations

In [1]:
# Basic imports
import pandas as pd
import numpy as np
import requests
import json
import pycountry 
# Reading the file
file_name = "../data/locations.txt"
with open(file_name,encoding="utf8") as f:
    locations = f.readlines()
locations = pd.Series([x.strip() for x in locations])
print('number of unique locations : ',len(locations))

number of unique locations :  21815


In [2]:
# Read the already mapped locations
location_to_canton = pd.read_csv('../data/location_to_canton.csv')
del location_to_canton['Unnamed: 0']
location_to_canton.head()

Unnamed: 0,ID,Location
0,ZH,CH 8545 Rickenbach Sulz
1,ZH,"Ruschlikon, CH"
2,ZH,"CH-8700- Kusnacht, Switzerland"
3,ZH,"hombrechtikon, switzerland"
4,ZH,"Basel, Brugg/Windisch (CH)"


In [3]:
# Remove the already mapped locations from the data
idx_loc = [not any(location_to_canton['Location'].isin([loc])) for loc in locations]
locations = locations[idx_loc]
print('number of remaining locations : ',len(locations))

number of remaining locations :  21605


In [4]:
countries = {}

for country in pycountry.countries:
    if country.name!='Switzerland':
        countries[country.name] = [country.name.lower(), country.alpha_2]
    
countries

{'Afghanistan': ['afghanistan', 'AF'],
 'Albania': ['albania', 'AL'],
 'Algeria': ['algeria', 'DZ'],
 'American Samoa': ['american samoa', 'AS'],
 'Andorra': ['andorra', 'AD'],
 'Angola': ['angola', 'AO'],
 'Anguilla': ['anguilla', 'AI'],
 'Antarctica': ['antarctica', 'AQ'],
 'Antigua and Barbuda': ['antigua and barbuda', 'AG'],
 'Argentina': ['argentina', 'AR'],
 'Armenia': ['armenia', 'AM'],
 'Aruba': ['aruba', 'AW'],
 'Australia': ['australia', 'AU'],
 'Austria': ['austria', 'AT'],
 'Azerbaijan': ['azerbaijan', 'AZ'],
 'Bahamas': ['bahamas', 'BS'],
 'Bahrain': ['bahrain', 'BH'],
 'Bangladesh': ['bangladesh', 'BD'],
 'Barbados': ['barbados', 'BB'],
 'Belarus': ['belarus', 'BY'],
 'Belgium': ['belgium', 'BE'],
 'Belize': ['belize', 'BZ'],
 'Benin': ['benin', 'BJ'],
 'Bermuda': ['bermuda', 'BM'],
 'Bhutan': ['bhutan', 'BT'],
 'Bolivia, Plurinational State of': ['bolivia, plurinational state of', 'BO'],
 'Bonaire, Sint Eustatius and Saba': ['bonaire, sint eustatius and saba',
  'BQ'],
 

In [5]:
# other countries locations 
idx_out_sw = [any(countries[ct][0] in loc.lower() for ct in countries) for loc in locations]
# drop other countries locations
locations1 = locations[[not(idx) for idx in idx_out_sw]]
print(len(locations1))

18933


In [6]:
# other countries ID's locations 
idx_out_sw = [any(countries[ct][1] in loc for ct in countries) for loc in locations1]
# drop other countries locations
locations1 = locations1[[not(idx) for idx in idx_out_sw]]
print(len(locations1))

16689


In [7]:
import unicodedata as ud

latin_letters= {}

def is_latin(uchr):
    try: return latin_letters[uchr]
    except KeyError:
         return latin_letters.setdefault(uchr, 'LATIN' in ud.name(uchr))

def only_roman_chars(unistr):
    return all(is_latin(uchr)
           for uchr in unistr
           if uchr.isalpha())

# drop non latin 

idx_non_latin = [only_roman_chars(loc) for loc in locations1]
locations2 = locations1[idx_non_latin]
len(locations2)

16102

In [8]:
# take swiss cantons locations
ch_sub = pycountry.subdivisions.get(country_code='CH')
cantons = {}

for canton in ch_sub:
    cantons[canton.code.split('-')[1]] = [canton.code.split('-')[1],canton.name.lower()]
    
cantons

{'AG': ['AG', 'aargau'],
 'AI': ['AI', 'appenzell innerrhoden'],
 'AR': ['AR', 'appenzell ausserrhoden'],
 'BE': ['BE', 'bern'],
 'BL': ['BL', 'basel-landschaft'],
 'BS': ['BS', 'basel-stadt'],
 'FR': ['FR', 'fribourg'],
 'GE': ['GE', 'genève'],
 'GL': ['GL', 'glarus'],
 'GR': ['GR', 'graubünden'],
 'JU': ['JU', 'jura'],
 'LU': ['LU', 'luzern'],
 'NE': ['NE', 'neuchâtel'],
 'NW': ['NW', 'nidwalden'],
 'OW': ['OW', 'obwalden'],
 'SG': ['SG', 'sankt gallen'],
 'SH': ['SH', 'schaffhausen'],
 'SO': ['SO', 'solothurn'],
 'SZ': ['SZ', 'schwyz'],
 'TG': ['TG', 'thurgau'],
 'TI': ['TI', 'ticino'],
 'UR': ['UR', 'uri'],
 'VD': ['VD', 'vaud'],
 'VS': ['VS', 'valais'],
 'ZG': ['ZG', 'zug'],
 'ZH': ['ZH', 'zürich']}

In [9]:
# locations containing the canton information
idx_with_ct = [any([cantons[ct][1] in loc.lower() for ct in cantons]) for loc in locations2]
loc_with_canton = locations2[idx_with_ct]
len(loc_with_canton)

2299

In [10]:
# remaining locations
locations3 = locations2[[not(idx) for idx in idx_with_ct]]
len(locations3)

13803

In [11]:
# adding locations containing canton ID's
idx_with_ct = [any([cantons[ct][0] in loc for ct in cantons]) for loc in locations3]
loc_with_canton = loc_with_canton.append(locations3[idx_with_ct])
len(loc_with_canton)

2553

In [12]:
# remaining locations
locations3 = locations3[[not(idx) for idx in idx_with_ct]]
len(locations3)

13549

In [13]:
locations3

0                                  nan
1                             Chiemgau
3                     #standrewsestate
4         iPhone: 42.198639,-71.278198
5                          Guadalajara
7                            Bucuresti
8                      From Everywhere
9                   Unterseen, Schweiz
10           Ü T: 31.319998,-85.861966
13               Chavannes-près-Renens
17       Holmfirth, West Yorkshire, UK
19                           Morgane ♡
20                  Powhatan, Virginia
22                   swiss cheese land
24        Usually at 38,000ft / London
25                            Bebo's ❤
27                  Gland, Switzerland
28                              Coruña
31                  Muntelier, Schweiz
32              Hertfordshire, England
34              Rouen, Haute-Normandie
35                      rue du paradis
37                               Breda
38             Most of the time on set
39                      Jengkalifornia
42           Mont-sur-Lau

In [14]:
# take only switzerland locations
ch_names = ['switzerland','suisse','schweiz']
idx_in_ch = [(any([s in loc for s in ch_names]) or 'CH' in loc) for loc in locations3]
loc_in_ch = locations3[idx_in_ch]
print(len(loc_in_ch))

247


In [15]:
# remone unsignificant (only refering to switzerland)
ch_names = ['switzerland','switzerland.','suisse.','suisse','schweiz','schweiz.','CH','CHE','CH.','CHE.']
idx_eq_ch = [any([s.lower() == loc.lower() for s in ch_names]) for loc in loc_in_ch]
loc_in_ch = loc_in_ch[[not(idx) for idx in idx_eq_ch]]
print(len(loc_in_ch))

241


In [16]:
# more cleaning
vis_unclean = ['paris','lille',]
idx_unclean = [any([s in loc.lower() for s in vis_unclean]) for loc in loc_in_ch]

loc_in_ch = loc_in_ch[[not(idx) for idx in idx_unclean]]
print(len(loc_in_ch))

238


In [17]:
loc_in_ch = loc_in_ch.reset_index()
loc_in_ch

Unnamed: 0,index,0
0,378,los angeles/switzerland
1,577,"im schönen freiamt, schweiz"
2,1217,la suisse de la merde
3,2379,"switzerland, planet earth"
4,2541,PCHS'17
5,2567,"Rapperswil-Jona / Basel, CHE"
6,2731,Sommewhere in switzerland
7,2886,"CH, Earth, Milky Way, Laniakea"
8,4047,R-34// 305CH
9,4528,"St.Petersburg, FL Oberburg, CH"


In [18]:
del loc_in_ch['index']

In [19]:
loc_in_ch[0:3]

Unnamed: 0,0
0,los angeles/switzerland
1,"im schönen freiamt, schweiz"
2,la suisse de la merde


In [20]:
loc_in_ch1 = loc_in_ch[0:109]
loc_in_ch2 = loc_in_ch[109:218]
loc_in_ch3 = loc_in_ch[218:]

In [26]:
# mapping location -> canton using google api
# function taking the name of the place and returning the long address of it
def getAddress(name,gkey):
  glink="https://maps.googleapis.com/maps/api/place/textsearch/json?query={}&key={}".format(name + " Switzerland", gkey)
  r = requests.get(glink)
  dic = json.loads(r.text)
  try:
    return dic['results'][0]['formatted_address']
  except:
    return ''

geocodeLink = "https://maps.googleapis.com/maps/api/geocode/json?sensor=true"
from time import sleep

# Get the resulting Json file from the request, and put it in a json dictionary
def getGeocodeJsonDictionary(name,gkey):
    link =  u"{}&address=[{}]".format(geocodeLink, name, gkey)
    sleep(0.2) # a workaround for the rate-limit of Google geocode API
    r = requests.get(link)
    dic = json.loads(r.text)
    return dic

# Get the canton from the administrative_area_level_1 field in the dictionary
def getCantonFromGeocodeDictionary(dic):
    canton = ''
    if dic['results']:
        a = dic['results'][0]['address_components']
        for x in a:
            if x['types'][0]=='administrative_area_level_1':
                canton = x['short_name']    
    return canton

# Get the canton by composing the two previous functions
def getCanton(name):
    return getCantonFromGeocodeDictionary(getGeocodeJsonDictionary(name,gkey))

In [22]:
#part 1
addresses1 = [[a, getAddress(a,gkey)] for a in loc_in_ch1[0]]
addresses1

[['los angeles/switzerland',
  '5721 W Imperial Hwy, Los Angeles, CA 90045, United States'],
 ['im schönen freiamt, schweiz', ''],
 ['la suisse de la merde', ''],
 ['switzerland, planet earth', ''],
 ["PCHS'17", ''],
 ['Rapperswil-Jona / Basel, CHE', ''],
 ['Sommewhere in switzerland', ''],
 ['CH, Earth, Milky Way, Laniakea', ''],
 ['R-34// 305CH', ''],
 ['St.Petersburg, FL Oberburg, CH', ''],
 ['www.lenews.ch - Bassins, CH', ''],
 ['#3052 (switzerland)', ''],
 ['Cloud Hopping , CH', ''],
 ['CH, Bodensee', 'Lake Constance'],
 ['switzerland/toronto', '3230 Yonge St, Toronto, ON M4N 3P6, Canada'],
 ['switzerland; planet jeds', ''],
 ['The CHI', 'Chi, 3982, Switzerland'],
 ['LONDWICH', ''],
 ['Basel, CH', 'Basel, Switzerland'],
 ['zuerich, switzerland', 'Zürich, Switzerland'],
 ['Biel, CH', 'Biel/Bienne, Switzerland'],
 ['Zentralschweiz / Basel', 'Alpenstrasse 1, 6002 Luzern, Switzerland'],
 ['vevey,switzerland', 'Vevey, Switzerland'],
 ['Manchester, UK / Geneva, CH',
  "Route de l'Aéropo

In [23]:
for a in addresses1:
    if a[1]=='':
        a[1]=a[0]
    else:
        if not(a[1].endswith('Switzerland')):
            a[1]=''

addresses1 = [a for a in addresses1 if a[1] != '']
len(addresses1)

103

In [27]:
cantons1 = [[a[0], getCanton(a[1])] for a in addresses1]

In [28]:
cantons1

[['im schönen freiamt, schweiz', ''],
 ['la suisse de la merde', ''],
 ['switzerland, planet earth', ''],
 ["PCHS'17", ''],
 ['Rapperswil-Jona / Basel, CHE', ''],
 ['Sommewhere in switzerland', ''],
 ['CH, Earth, Milky Way, Laniakea', ''],
 ['R-34// 305CH', 'NY'],
 ['St.Petersburg, FL Oberburg, CH', ''],
 ['www.lenews.ch - Bassins, CH', ''],
 ['#3052 (switzerland)', ''],
 ['Cloud Hopping , CH', ''],
 ['switzerland; planet jeds', ''],
 ['The CHI', 'VS'],
 ['LONDWICH', ''],
 ['Basel, CH', 'BS'],
 ['zuerich, switzerland', 'ZH'],
 ['Biel, CH', 'BE'],
 ['Zentralschweiz / Basel', 'LU'],
 ['vevey,switzerland', 'VD'],
 ['Manchester, UK / Geneva, CH', 'GE'],
 ['* Anime World * CH', ''],
 ['Montreux, CH', 'VD'],
 ['lausanne, CH', 'VD'],
 ['lausanne,CH', 'VD'],
 ['CH-6170', 'LU'],
 ['Opfikon (CH)', 'ZH'],
 ['Cholerweg 9, 5212 Hausen, CH', 'AG'],
 ['meilen, switzerland', 'ZH'],
 ['CH-8132 Egg', 'ZH'],
 ['sion, switzerland', 'VS'],
 ['switzerland, basel', 'BS'],
 ['interlaken,switzerland', 'BE'],
 

In [29]:
loc_in_ch_left = loc_in_ch[0][[not (any([(loc in c[0]) for c in cantons1])) for loc in loc_in_ch[0]]]

In [30]:
len(loc_in_ch_left)

132

In [31]:
df_cantons = pd.DataFrame(cantons1)
df_cantons

Unnamed: 0,0,1
0,"im schönen freiamt, schweiz",
1,la suisse de la merde,
2,"switzerland, planet earth",
3,PCHS'17,
4,"Rapperswil-Jona / Basel, CHE",
5,Sommewhere in switzerland,
6,"CH, Earth, Milky Way, Laniakea",
7,R-34// 305CH,NY
8,"St.Petersburg, FL Oberburg, CH",
9,"www.lenews.ch - Bassins, CH",


In [32]:
df_cantons.columns = [['Location','ID']]
df_cantons

Unnamed: 0,Location,ID
0,"im schönen freiamt, schweiz",
1,la suisse de la merde,
2,"switzerland, planet earth",
3,PCHS'17,
4,"Rapperswil-Jona / Basel, CHE",
5,Sommewhere in switzerland,
6,"CH, Earth, Milky Way, Laniakea",
7,R-34// 305CH,NY
8,"St.Petersburg, FL Oberburg, CH",
9,"www.lenews.ch - Bassins, CH",


In [33]:
df_cantons = pd.concat([df_cantons,location_to_canton]).reset_index()
del df_cantons['index']
df_cantons

Unnamed: 0,ID,Location
0,,"im schönen freiamt, schweiz"
1,,la suisse de la merde
2,,"switzerland, planet earth"
3,,PCHS'17
4,,"Rapperswil-Jona / Basel, CHE"
5,,Sommewhere in switzerland
6,,"CH, Earth, Milky Way, Laniakea"
7,NY,R-34// 305CH
8,,"St.Petersburg, FL Oberburg, CH"
9,,"www.lenews.ch - Bassins, CH"


In [34]:
df_canton_name = pd.read_csv('../data/canton_name.csv')
df_canton_name

Unnamed: 0,ID
0,ZH
1,BE
2,LU
3,UR
4,SZ
5,OW
6,NW
7,GL
8,ZG
9,FR


In [35]:
df = pd.merge(df_canton_name, df_cantons,on = ['ID'], how='outer') # merge the two datasets to have all the cantons

In [36]:
loc_with_canton

11             Zurich, Dortmund, Barcelona
12                         luzern, schweiz
15                   Recherswil, Solothurn
16                                    Vaud
18                     Schaffhausen, Swiss
33                       (Saudi tourists )
36               Zurich, London, Paris, SF
40                   Zurich, Dinerolandia.
58          Basel - Bern - Zürich, Schweiz
66                             Péry, Berne
79                             Lake Zurich
91                          Bülach, Zürich
97                                 Berneck
107            Lucerne, Zurich or anywhere
130                        Verbier, Valais
146               Freiamt, Aargau, Schweiz
155                  Tenero-Contra, Ticino
164                              Bern 3000
194                     Sion, Valais-Suiça
209          Saint Gallen, Bern und Zürich
212             Zug, London, Dubai, Joburg
214            Suisse , Neuchâtel , Bevaix
238         Herzogenbuchsee, Bern, Schweiz
241        

In [37]:
import unicodedata

def remove_accents(input_str):
    nfkd_form = unicodedata.normalize('NFKD', input_str)
    return u"".join([c for c in nfkd_form if not unicodedata.combining(c)])


In [38]:
import re


# take swiss cantons locations
ch_sub = pycountry.subdivisions.get(country_code='CH')
cantons_ = {}

for canton in ch_sub:
    cantons_[canton.code.split('-')[1]] = [canton.code.split('-')[1],remove_accents(canton.name.lower())]
    
def add_canton(loc):
    l_parts = re.sub(r'[^\w\s]','',remove_accents(loc.lower())).split()
    cant = ''
    for ct in cantons_:
        for l in l_parts:
            if (cantons_[ct][1] == l) or (cantons_[ct][0].lower() == l):
                cant = cantons_[ct][0]
    return cant

cantons_extra = [[loc, add_canton(loc)] for loc in loc_with_canton]

In [39]:
cantons_extra

[['Zurich, Dortmund, Barcelona', 'ZH'],
 ['luzern, schweiz', 'LU'],
 ['Recherswil, Solothurn', 'SO'],
 ['Vaud', 'VD'],
 ['Schaffhausen, Swiss', 'SH'],
 ['(Saudi tourists )', ''],
 ['Zurich, London, Paris, SF', 'ZH'],
 ['Zurich, Dinerolandia.', 'ZH'],
 ['Basel - Bern - Zürich, Schweiz', 'BE'],
 ['Péry, Berne', ''],
 ['Lake Zurich', 'ZH'],
 ['Bülach, Zürich', 'ZH'],
 ['Berneck', ''],
 ['Lucerne, Zurich or anywhere', 'ZH'],
 ['Verbier, Valais', 'VS'],
 ['Freiamt, Aargau, Schweiz', 'AG'],
 ['Tenero-Contra, Ticino', 'TI'],
 ['Bern 3000', 'BE'],
 ['Sion, Valais-Suiça', ''],
 ['Saint Gallen, Bern und Zürich', 'BE'],
 ['Zug, London, Dubai, Joburg', 'ZG'],
 ['Suisse , Neuchâtel , Bevaix', 'NE'],
 ['Herzogenbuchsee, Bern, Schweiz', 'BE'],
 ['Locarno, Ticino, CH', 'TI'],
 ['Vaud, Suiza', 'VD'],
 ['Bern (Switzerland)', 'BE'],
 ['Belp, Bern', 'BE'],
 ['Valais, Suisse #250', 'VS'],
 ['Bern, Bern', 'BE'],
 ['Zollikon, Zürich', 'ZH'],
 ['Ballwil, Luzern', 'LU'],
 ['Bulle, Fribourg - Switzerland', 'FR'

In [40]:
df_cantons_e = pd.DataFrame(cantons_extra)
df_cantons_e.columns = [['Location','ID']]
df_cantons_e

Unnamed: 0,Location,ID
0,"Zurich, Dortmund, Barcelona",ZH
1,"luzern, schweiz",LU
2,"Recherswil, Solothurn",SO
3,Vaud,VD
4,"Schaffhausen, Swiss",SH
5,(Saudi tourists ),
6,"Zurich, London, Paris, SF",ZH
7,"Zurich, Dinerolandia.",ZH
8,"Basel - Bern - Zürich, Schweiz",BE
9,"Péry, Berne",


In [41]:
df_cantons = pd.concat([df_cantons,df_cantons_e]).reset_index()
del df_cantons['index']

In [42]:
df_cantons = df_cantons.dropna()

In [43]:
df_cantons = df_cantons[df_cantons['ID']!=''].reset_index()
del df_cantons['index']

In [44]:
df_cantons

Unnamed: 0,ID,Location
0,NY,R-34// 305CH
1,VS,The CHI
2,BS,"Basel, CH"
3,ZH,"zuerich, switzerland"
4,BE,"Biel, CH"
5,LU,Zentralschweiz / Basel
6,VD,"vevey,switzerland"
7,GE,"Manchester, UK / Geneva, CH"
8,VD,"Montreux, CH"
9,VD,"lausanne, CH"


In [45]:
df.to_csv('../data/location_to_canton.csv')