In [1]:
import pandas as pd
from hotcities.readers import load
from hotcities.config import read_config, default_config

In [2]:
# Switch between the test configuration file and the package default;
# read by the cell that builds `config`.
testing: bool = False

In [3]:
# Select the configuration used by every `load` call below: the test
# fixture file when `testing` is enabled, otherwise the package default.
if testing:
    config = read_config('../test/res/config.test.ini')
else:
    config = default_config

In [4]:
# Load the 'cities' dataset through the project reader using the selected
# configuration. The result is used as a pandas DataFrame below.
cities = load('cities', config=config)

In [5]:
# Inclusive lower bound on `population`; cities below it are dropped in the
# filtering cell that follows.
min_population: int = 500000

In [6]:
# Reduce the city table in one pass:
#   1. keep rows whose population is at least `min_population`,
#   2. order them from most to least populous,
#   3. keep only the columns used later in the notebook,
#   4. renumber the rows from 0.
cities = (
    cities
    .loc[cities['population'] >= min_population]
    .sort_values(by='population', ascending=False)
    .loc[:, ['geonameid', 'name', 'population', 'countrycode', 'lng', 'lat', 'timezone']]
    .reset_index(drop=True)
)

In [7]:
# Preview the first rows of the filtered, population-sorted city table.
cities.head()

Unnamed: 0,geonameid,name,population,countrycode,lng,lat,timezone
0,1796236,Shanghai,22315474,CN,121.45806,31.22222,Asia/Shanghai
1,745044,Istanbul,14804116,TR,28.94966,41.01384,Europe/Istanbul
2,3435910,Buenos Aires,13076300,AR,-58.37723,-34.61315,America/Argentina/Buenos_Aires
3,1275339,Mumbai,12691836,IN,72.88261,19.07283,Asia/Kolkata
4,3530597,Mexico City,12294193,MX,-99.12766,19.42847,America/Mexico_City


In [8]:
# Load the 'countries' dataset through the project reader using the same
# configuration as the other tables.
countries = load('countries', config=config)

In [9]:
def _primary_language(languages):
    """Return the primary language code of a comma-separated language list.

    The first entry of the list is taken and any region suffix after a
    hyphen is removed, so 'zh-CN,en' yields 'zh'. The code is kept at its
    full length: slicing to two characters (the previous behaviour) turned
    three-letter codes such as 'tet' into 'te', a different or invalid code
    that can no longer match the `lang` values of the alternate names.

    Missing values are returned unchanged so the column stays null for them.
    """
    if pd.isnull(languages):
        return languages
    first_tag = languages.split(',')[0]
    return first_tag.split('-')[0]


# Build the new column with `assign` rather than item assignment so the
# frame returned by `load` is not modified in place, then keep only the
# columns needed to join a language onto each city.
countries = countries.assign(lang=countries['languages'].apply(_primary_language))
countries = countries[['countrycode', 'countryname', 'lang']]

In [10]:
# Preview the country table with its derived `lang` column.
countries.head()

Unnamed: 0,countrycode,countryname,lang
0,AD,Andorra,ca
1,AE,United Arab Emirates,ar
2,AF,Afghanistan,fa
3,AG,Antigua and Barbuda,en
4,AI,Anguilla,en


In [11]:
# Load the 'alternatenames' dataset through the project reader.
# NOTE(review): `low_memory=False` is presumably forwarded to the underlying
# pandas CSV reader to avoid mixed-dtype chunk inference — confirm in `load`.
alternatenames = load('alternatenames', config=config, low_memory=False)

In [12]:
# Keep only the alternate names that can serve as a city's local name:
#   - the name belongs to one of the selected cities,
#   - it has a language code, and that code is not the 'link' marker,
#   - its short-name, colloquial and historic flags are all empty.
# Afterwards renumber the rows and keep only the join keys and the name.
alternatenames = (
    alternatenames
    .loc[
        alternatenames['geonameid'].isin(cities['geonameid'])
        & alternatenames['lang'].notna()
        & alternatenames['lang'].ne('link')
        & alternatenames['isshortname'].isna()
        & alternatenames['iscolloquial'].isna()
        & alternatenames['ishistoric'].isna()
    ]
    .reset_index(drop=True)
    .loc[:, ['geonameid', 'lang', 'alternatename']]
)

In [13]:
# Preview the filtered alternate names (one row per city/language/name).
alternatenames.head()

Unnamed: 0,geonameid,lang,alternatename
0,292223,de,Dubai
1,292223,en,Dubai
2,292223,es,Dubái
3,292223,ar,دبي
4,292223,bg,Дубай


In [14]:
# Collapse to a single name per (city, language) pair by taking the first
# name of each group in its existing row order. The result is indexed by
# (geonameid, lang).
alternatenames = (
    alternatenames
    .groupby(['geonameid', 'lang'])
    .agg({'alternatename': lambda names: names.iloc[0]})
)

In [15]:
# Preview the per-(geonameid, lang) table produced by the aggregation.
alternatenames.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,alternatename
geonameid,lang,Unnamed: 2_level_1
14256,de,Azad Shahr
14256,fa,Āzādshahr
23814,fa,Kahrīz
53654,af,Mogadisjoe
53654,am,ሞቃዲሾ


In [16]:
# Attach each city's country name and language code via its country code.
# A left join keeps every city even when no country row matches; the result
# is then ordered by population (largest first) and renumbered from 0.
cities_with_lang = (
    cities
    .merge(countries, how='left', on='countrycode')
    .sort_values(by='population', ascending=False)
    .reset_index(drop=True)
)

In [17]:
# Preview the cities with their country name and language code attached.
cities_with_lang.head()

Unnamed: 0,geonameid,name,population,countrycode,lng,lat,timezone,countryname,lang
0,1796236,Shanghai,22315474,CN,121.45806,31.22222,Asia/Shanghai,China,zh
1,745044,Istanbul,14804116,TR,28.94966,41.01384,Europe/Istanbul,Turkey,tr
2,3435910,Buenos Aires,13076300,AR,-58.37723,-34.61315,America/Argentina/Buenos_Aires,Argentina,es
3,1275339,Mumbai,12691836,IN,72.88261,19.07283,Asia/Kolkata,India,en
4,3530597,Mexico City,12294193,MX,-99.12766,19.42847,America/Mexico_City,Mexico,es


In [18]:
# Look up each city's name in its country's language by joining on
# (geonameid, lang). A left join keeps cities that have no matching
# alternate name; their `localname` is left empty. The joined column is
# renamed, and the table is ordered by population and renumbered from 0.
cities_with_localname = (
    cities_with_lang
    .merge(alternatenames, how='left', on=['geonameid', 'lang'])
    .rename(columns={'alternatename': 'localname'})
    .sort_values(by='population', ascending=False)
    .reset_index(drop=True)
)

In [19]:
# Preview the final table: one row per city including its `localname`.
cities_with_localname.head()

Unnamed: 0,geonameid,name,population,countrycode,lng,lat,timezone,countryname,lang,localname
0,1796236,Shanghai,22315474,CN,121.45806,31.22222,Asia/Shanghai,China,zh,上海
1,745044,Istanbul,14804116,TR,28.94966,41.01384,Europe/Istanbul,Turkey,tr,İstanbul
2,3435910,Buenos Aires,13076300,AR,-58.37723,-34.61315,America/Argentina/Buenos_Aires,Argentina,es,Ciudad Autónoma de Buenos Aires
3,1275339,Mumbai,12691836,IN,72.88261,19.07283,Asia/Kolkata,India,en,Mumbai
4,3530597,Mexico City,12294193,MX,-99.12766,19.42847,America/Mexico_City,Mexico,es,Ciudad de México
