In [1]:
from pathlib import Path
import geojson
import os
import json
from shapely.geometry import mapping
from shapely.geometry import Point

In [2]:
import pandas as pd
import geopandas as gpd
import numpy as np
from tqdm import tqdm

  shapely_geos_version, geos_capi_version_string


In [6]:
# Country to process. Must be a key of both `root_dirs` and `file_paths`
# below; the available choices are 'Guatemala' and 'Paraguay'.
COUNTRY = 'Paraguay'

# Data folder of each country (absolute paths on the author's machine).
root_dirs = dict(
    Guatemala=Path('/home/adeel/2021 - RAI Toolkit-20210528T125906Z-001/2021 - RAI Toolkit/Country Data/Guatemala_4-19-2021'),
    Paraguay=Path('/home/adeel/2021 - RAI Toolkit-20210528T125906Z-001/2021 - RAI Toolkit/Country Data/Paraguay'),
)

# Input text file of each country, given relative to that country's data
# folder in `root_dirs`.
file_paths = dict(
    Guatemala=Path('GT/GT.txt'),
    Paraguay=Path('PY/PY.txt'),
)

# Switch the working directory to the selected country's data folder so that
# relative paths (such as the entries of `file_paths`) resolve against it.
os.chdir(root_dirs[COUNTRY])

In [28]:
import unicodedata as ud

# https://stackoverflow.com/a/15547803/5908685
def rmdiacritics(char):
    '''
    Return the base character of char, by "removing" any
    diacritics like accents or curls and strokes and the like.

    The Unicode name of char is cut at the first ' WITH ' and the shortened
    name is looked up again. The character is returned unchanged when it has
    no Unicode name, when its name contains no ' WITH ', or when the
    shortened name does not exist.

    char must be a single character.
    '''
    # Code points without a Unicode name (control characters such as newline
    # or tab) make ud.name() raise ValueError unless a default is supplied;
    # the empty default simply means "nothing to strip".
    desc = ud.name(char, '')
    cutoff = desc.find(' WITH ')
    if cutoff == -1:
        return char
    try:
        return ud.lookup(desc[:cutoff])
    except KeyError:
        # removing "WITH ..." produced an invalid name
        return char

def remove_diacritics(s: str):
    '''
    Apply rmdiacritics to every character of s and join the results.

    A value that is not a str is handed back untouched, so the function can
    be mapped over data that mixes strings with other values.
    '''
    if isinstance(s, str):
        return ''.join(rmdiacritics(ch) for ch in s)
    return s

# 'Palá --> ' + remove_diacritics('Palá')

In [32]:
def load_geonames_data(path):
    '''
    Load a header-less, tab-separated GeoNames dump as a GeoDataFrame of points.

    Processing steps:
    - read the file and label its columns with the 19 names listed below,
    - drop every row whose 'feature class' is 'A',
    - copy the original 'name' into a new 'orig_name' column and replace
      'name' with the lower-cased 'asciiname',
    - lower-case 'alternatenames' and strip its diacritics,
    - cast 'longitude' / 'latitude' to float and build a point geometry
      from them.

    Args:
        path: location of the dump; anything pd.read_csv accepts.

    Returns:
        gpd.GeoDataFrame holding the columns below plus 'orig_name' and
        'geometry'. The index is not reset after filtering, so it has gaps
        where rows were dropped.
    '''
    import csv  # local import: only needed for the QUOTE_NONE constant

    column_names = [
        'geonameid',
        'name',
        'asciiname',
        'alternatenames',
        'latitude',
        'longitude',
        'feature class',
        'feature code',
        'country code',
        'cc2',
        'admin1 code',
        'admin2 code',
        'admin3 code',
        'admin4 code',
        'population',
        'elevation',
        'dem',
        'timezone',
        # NOTE(review): trailing space looks like a truncated
        # 'modification date'; kept verbatim so the returned column names
        # stay the same for existing callers.
        'modification '
    ]
    places_df = pd.read_csv(
        path,
        delimiter='\t',
        header=None,
        # Fields are not quoted; with the default quoting a stray '"' inside
        # a name would start a "quoted" field and swallow tabs and newlines.
        quoting=csv.QUOTE_NONE,
        # Only an empty field means "missing". Without this, literal values
        # such as 'NA', 'null' or 'nan' would be turned into NaN.
        keep_default_na=False,
        na_values=[''],
    )
    places_df.columns = column_names

    # .copy() gives an independent frame, so the column assignments below do
    # not operate on a slice of the raw frame (SettingWithCopyWarning).
    places_df = places_df.loc[places_df['feature class'] != 'A'].copy()

    places_df['orig_name'] = places_df['name']
    places_df['name'] = places_df['asciiname'].str.lower()
    places_df['alternatenames'] = places_df['alternatenames'].str.lower().map(remove_diacritics)
    places_df['longitude'] = places_df['longitude'].astype(float)
    places_df['latitude'] = places_df['latitude'].astype(float)

    points = gpd.points_from_xy(places_df['longitude'], places_df['latitude'])
    return gpd.GeoDataFrame(places_df, geometry=points)
    

In [33]:
# Load the input file of the selected country (the path is relative to the
# working directory chosen by the os.chdir call above) and display the
# resulting frame as the cell output.
places_df = load_geonames_data(file_paths[COUNTRY])
places_df

Unnamed: 0,geonameid,name,asciiname,alternatenames,latitude,longitude,feature class,feature code,country code,cc2,...,admin2 code,admin3 code,admin4 code,population,elevation,dem,timezone,modification,orig_name,geometry
0,3434138,isla santa isabel,Isla Santa Isabel,"isla entre rios,isla entre rios,isla santa isabel",-27.42043,-57.49094,T,ISL,PY,,...,,,,0,,58,America/Asuncion,2017-10-04,Isla Santa Isabel,POINT (-57.49094 -27.42043)
1,3436529,24 de julio,24 De Julio,,-25.08333,-57.30000,P,PPL,PY,,...,306.0,,,0,,71,America/Asuncion,2018-04-09,24 De Julio,POINT (-57.30000 -25.08333)
2,3436530,tres de mayo,Tres de Mayo,"3 de mayo,tres de mayo",-26.48134,-56.09592,P,PPL,PY,,...,611.0,,,0,,158,America/Asuncion,2020-06-10,Tres de Mayo,POINT (-56.09592 -26.48134)
3,3436531,fortin zenteno,Fortin Zenteno,"aliguata,alihuata,fortin zenteno,fortin zenten...",-23.16667,-59.98333,S,FT,PY,PY,...,1507.0,,,0,,133,America/Asuncion,2020-06-10,Fortín Zenteno,POINT (-59.98333 -23.16667)
4,3436532,zeballos-cue,Zeballos-Cue,"zeballos-cue,zeballos-cue,zeballos-cue,zeballo...",-25.23019,-57.57118,P,PPLX,PY,,...,0.0,,,0,,78,America/Asuncion,2018-04-09,Zeballos-Cué,POINT (-57.57118 -25.23019)
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
18502,12174563,kokue guazu san antonio,Kokue Guazu San Antonio,"kokue guasu san antonio,kokue guazu san antoni...",-25.32743,-57.39412,P,PPL,PY,,...,,,,0,,154,America/Asuncion,2020-07-13,Kokue Guazú San Antonio,POINT (-57.39412 -25.32743)
18503,12174826,parques del yacht,Parques del Yacht,parques del yacht,-25.36612,-57.63104,P,PPLX,PY,,...,,,,0,,96,America/Asuncion,2020-07-13,Parques del Yacht,POINT (-57.63104 -25.36612)
18504,12174827,san juan,San Juan,san juan,-25.36281,-57.59567,P,PPLX,PY,,...,,,,0,,120,America/Asuncion,2020-07-13,San Juan,POINT (-57.59567 -25.36281)
18505,12174828,isla san francisco,Isla San Francisco,isla san francisco,-25.13408,-57.52246,P,PPLX,PY,,...,,,,0,,54,America/Asuncion,2020-07-13,Isla San Francisco,POINT (-57.52246 -25.13408)


In [20]:
# Export every row of `places_df` as a GeoJSON point feature and write the
# resulting FeatureCollection to `filename` in the current working directory.
filename = 'geonames_places.json'

# Every column except the geometry becomes a feature property. The values are
# paired with the real column labels explicitly: the namedtuples produced by
# itertuples() rename labels that are not valid Python identifiers (for
# example 'feature class') to positional names such as '_6'.
property_columns = [col for col in places_df.columns if col != 'geometry']
rows = places_df[property_columns].itertuples(index=False, name=None)

features = []
with tqdm(rows, total=len(places_df)) as bar:
    for values in bar:
        record = dict(zip(property_columns, values))
        point = Point(record['longitude'], record['latitude'])
        features.append({
            'type': 'Feature',
            'geometry': mapping(point),
            # Missing values become None (JSON null); json.dump would
            # otherwise write them as bare NaN, which is not valid JSON.
            'properties': {
                key: (None if pd.isna(value) else value)
                for key, value in record.items()
            },
        })

gjson = geojson.FeatureCollection(features)
with open(filename, 'w') as out_file:
    json.dump(gjson, out_file)


100%|██████████| 10006/10006 [00:00<00:00, 13152.80it/s]
