![Natural Earth logo](https://www.naturalearthdata.com/wp-content/themes/NEV/images/nev_logo.png "Natural Earth logo")
<div align="center">

## Scraping cities and populated places data
</div>

This document explains how populated places data is processed and saved in a PostgreSQL database.

Link to the data from Natural Earth: [populated places data](https://www.naturalearthdata.com/http//www.naturalearthdata.com/download/10m/cultural/ne_10m_populated_places.zip)

<hr>

# 1. Download and load data

## The data is a shapefile compressed into a zip file. Import geopandas to read the spatial data and requests to download the zip file from the web.

In [None]:
import geopandas as gpd
import requests
import os

In [None]:
# Download URL of the Natural Earth 10m populated places archive.
# The adjacent literals are concatenated into a single string.
cities_link = (
    "https://www.naturalearthdata.com/http//www.naturalearthdata.com"
    "/download/10m/cultural/ne_10m_populated_places.zip"
)

## Download the zip file and check the status code. 200 means OK.

In [None]:
# Request the archive as a stream so the body is fetched lazily by the
# cell that writes it to disk. The timeout keeps a stalled connection from
# hanging the notebook indefinitely.
r = requests.get(
    cities_link,
    stream=True,
    headers={"User-Agent": "XY"},
    timeout=60,
)
# Stop here on a 4xx/5xx answer instead of saving an error page as the zip.
r.raise_for_status()
# Last expression of the cell: displays the status code (200 is OK).
r.status_code

## Save zip file inside temp folder.

In [None]:
# Create the download folder (and any missing parents). `exist_ok=True`
# makes the call a no-op when the folder is already there, without the
# separate existence check.
os.makedirs("../temp/places/", exist_ok=True)

In [None]:
# Write the streamed response to disk. 64 KiB chunks produce the same file
# as the former 128-byte chunks with far fewer loop iterations.
with open("../temp/places/ne_10m_populated_places.zip", "wb") as fd:
    for chunk in r.iter_content(chunk_size=64 * 1024):
        fd.write(chunk)

In [None]:
# Path handed to geopandas: "zip://<archive>!<member>" points at the .shp
# file inside the downloaded archive.
archive_path = "../temp/places/ne_10m_populated_places.zip"
zip_file = f"zip://{archive_path}!ne_10m_populated_places.shp"

## Load shapefile into geopandas dataframe.

In [None]:
# Read the shapefile addressed by `zip_file` into a GeoDataFrame.
cities_shp = gpd.read_file(zip_file, layer="ne_10m_populated_places")

## Check the number of rows and columns and the CRS of the shapefile data.

In [None]:
cities_shp.shape

In [None]:
cities_shp.crs

## Reduce the dataframe to only the columns that are valuable for our dataset.

In [None]:
cities_shp.columns.values.tolist()

In [None]:
# Columns kept for the database import: feature class, name, the three
# external identifiers, the ISO2 country code and the geometry.
kept_columns = [
    "FEATURECLA",
    "NAME",
    "WIKIDATAID",
    "WOF_ID",
    "GEONAMESID",
    "ISO_A2",
    "geometry",
]
filter_cities = cities_shp[kept_columns]

In [None]:
filter_cities

## Replace "-99" values with None.

In [None]:
# Display the rows in which at least one column holds the "-99" placeholder.
# `axis` is passed by keyword because pandas 2.0+ no longer accepts it
# positionally (`.any(1)` raises a TypeError there).
filter_cities[filter_cities.eq("-99").any(axis=1)]

In [None]:
# Replace every "-99" placeholder with None. The dict form states the
# replacement value explicitly: with `replace("-99", None)` pandas versions
# before 1.4 ignored the explicit None and forward-filled the cell instead.
finall_cities = filter_cities.replace({"-99": None})

# 2. Split and view available data

## Check available populated place types.

In [None]:
# Distinct FEATURECLA values present in the cleaned dataframe.
city_types = set(finall_cities["FEATURECLA"])

In [None]:
city_types

## Filter and view country capitals.

In [None]:
# Boolean mask: rows whose feature class is "Admin-0 capital".
filter_capitals = finall_cities["FEATURECLA"].eq("Admin-0 capital")

In [None]:
finall_cities[filter_capitals].explore(popup=True,)

## Filter and view capitals of dependent countries.

In [None]:
# Boolean mask: rows whose feature class is "Admin-0 region capital".
filter_region_capital = finall_cities["FEATURECLA"].eq("Admin-0 region capital")

In [None]:
finall_cities[filter_region_capital].explore(popup=True,)

## Filter and view historical capitals.

In [None]:
# Boolean mask: rows whose feature class is "Admin-0 capital alt".
filter_history_capital = finall_cities["FEATURECLA"].eq("Admin-0 capital alt")

In [None]:
finall_cities[filter_history_capital].explore(popup=True,)

## Filter and view state and province capitals.

In [None]:
# Boolean mask: rows belonging to either of the two Admin-1 capital classes.
adm_capital_classes = ["Admin-1 capital", "Admin-1 region capital"]
filter_adm_capital = finall_cities["FEATURECLA"].isin(adm_capital_classes)

In [None]:
finall_cities[filter_adm_capital].explore(popup=True,)

## Filter and view other populated places.

In [None]:
# Boolean mask for populated places; both capitalisations of the class
# name are matched.
populated_place_classes = ["Populated place", "Populated Place"]
city = finall_cities["FEATURECLA"].isin(populated_place_classes)

In [None]:
finall_cities[city].explore(popup=True,)

## Filter and view scientific stations.

In [None]:
# Boolean mask: rows whose feature class is "Scientific station".
scientific_station = finall_cities["FEATURECLA"].eq("Scientific station")

In [None]:
finall_cities[scientific_station].explore(popup=True,)

## Filter and view historic places.

In [None]:
# Boolean mask: rows whose feature class is "Historic place".
historic_place = finall_cities["FEATURECLA"].eq("Historic place")

In [None]:
finall_cities[historic_place].explore(popup=True,)

## Filter and view meteo stations.

In [None]:
# Boolean mask: rows whose feature class is "Meteorological Station".
meteo_station = finall_cities["FEATURECLA"].eq("Meteorological Station")

In [None]:
finall_cities[meteo_station].explore(popup=True,)

# 3. Split table into scientific stations and cities

## Get the list of ISO2 codes that are not available in the country DB table, so that these items, which would cause errors when saved as a FK, can be skipped.

In [None]:
# Imported in this cell because it runs before the notebook's model-import
# cell in section 4; without it a top-to-bottom run can fail on `Country`.
from apps.administrative_area.models import Country

# ISO2 codes of all countries stored in the database. The empty order_by()
# drops any ordering from the query, which is irrelevant for a set.
country_iso2 = set(Country.objects.order_by().values_list("iso2", flat=True))

In [None]:
# Distinct ISO_A2 values found in the cleaned places dataframe.
city_iso2 = set(finall_cities["ISO_A2"].unique())

In [None]:
# ISO2 codes used by the places data that have no row in the country table.
iso2_diff = city_iso2 - country_iso2

In [None]:
iso2_diff

## Filter scientific stations

In [None]:
# Boolean mask: rows that are meteorological or scientific stations.
station_classes = ["Meteorological Station", "Scientific station"]
science = finall_cities["FEATURECLA"].isin(station_classes)

In [None]:
# Rows selected by the `science` mask.
scientific_stations = finall_cities.loc[science]

In [None]:
scientific_stations

## Filter cities

In [None]:
# Every row that is not selected by the `science` mask.
cities = finall_cities.loc[~science]

In [None]:
cities

# 4. Save in database

In [None]:
from apps.utils.converters import pandas_to_gis_multipoint
from apps.administrative_area.models import Country, City, CityType
from apps.civic_structure.models import ScientificStation, ScientificStationType

## Make type name mapping dictionary

In [None]:
# Maps each FEATURECLA value from the shapefile to the type name stored in
# the database. Several source classes share one database name.
map_types = {
    # capitals
    "Admin-0 capital": "Capital",
    "Admin-0 capital alt": "Old capital",
    "Admin-0 region capital": "Capital",
    "Admin-1 capital": "Region capital",
    "Admin-1 region capital": "Region capital",
    # other places
    "Historic place": "Historic place",
    "Populated Place": "Populated place",
    "Populated place": "Populated place",
    # stations
    "Meteorological Station": "Meteorological Station",
    "Scientific station": "Scientific station",
}

## Create Cities FK

In [None]:
# Distinct FEATURECLA values present in the cities dataframe.
cities_types = set(cities["FEATURECLA"])

In [None]:
cities_types

In [None]:
# Make sure a CityType row exists for every database type name used by the
# cities. Names are de-duplicated first because several FEATURECLA values
# map to the same name, and sorted so rows are created in a stable order.
city_type_names = {map_types[item] for item in cities_types if item}
for type_name in sorted(city_type_names):
    CityType.objects.get_or_create(name=type_name)

## Iterate through the cities dataframe, convert the data, get the FKs, and update or create an entry in the database.

In [None]:
# Save every city row. Foreign-key lookups are memoized so each country and
# each city type is queried once instead of once per row.
country_cache = {}
city_type_cache = {}

for row in cities.itertuples(index=False, name="Pandas"):
    iso2 = row.ISO_A2
    if iso2 in iso2_diff:
        # Codes missing from the Country table are stored without a country.
        country = None
    else:
        if iso2 not in country_cache:
            # .first() yields None instead of raising when nothing matches.
            country_cache[iso2] = Country.objects.filter(iso2=iso2).first()
        country = country_cache[iso2]

    type_name = map_types[row.FEATURECLA]
    if type_name not in city_type_cache:
        city_type_cache[type_name] = CityType.objects.filter(name=type_name).first()
    city_type = city_type_cache[type_name]

    # Rows are matched on (whosonfirst_id, name); the remaining fields are
    # written on both update and create.
    City.objects.update_or_create(
        whosonfirst_id=row.WOF_ID,
        name=row.NAME,
        defaults={
            "geonames_id": row.GEONAMESID,
            "point": pandas_to_gis_multipoint(row.geometry),
            "wikidata_id": row.WIKIDATAID,
            "city_type": city_type,
            "country": country,
        },
    )

## Create Scientific Stations FK

In [None]:
# Distinct FEATURECLA values present in the scientific stations dataframe.
science_types = set(scientific_stations["FEATURECLA"])

In [None]:
science_types

In [None]:
# Make sure a ScientificStationType row exists for every database type name
# used by the stations. Names are de-duplicated and sorted so each one is
# created once, in a stable order.
station_type_names = {map_types[item] for item in science_types if item}
for type_name in sorted(station_type_names):
    ScientificStationType.objects.get_or_create(name=type_name)

In [None]:
# Save every scientific station row. Foreign-key lookups are memoized so
# each country and each station type is queried once instead of once per row.
country_cache = {}
station_type_cache = {}

for row in scientific_stations.itertuples(index=False, name="Pandas"):
    iso2 = row.ISO_A2
    if iso2 in iso2_diff:
        # Codes missing from the Country table are stored without a country.
        country = None
    else:
        if iso2 not in country_cache:
            # .first() yields None instead of raising when nothing matches.
            country_cache[iso2] = Country.objects.filter(iso2=iso2).first()
        country = country_cache[iso2]

    type_name = map_types[row.FEATURECLA]
    if type_name not in station_type_cache:
        station_type_cache[type_name] = ScientificStationType.objects.filter(
            name=type_name
        ).first()
    science_type = station_type_cache[type_name]

    # Rows are matched on (whosonfirst_id, name); the remaining fields are
    # written on both update and create.
    ScientificStation.objects.update_or_create(
        whosonfirst_id=row.WOF_ID,
        name=row.NAME,
        defaults={
            "geonames_id": row.GEONAMESID,
            "point": pandas_to_gis_multipoint(row.geometry),
            "wikidata_id": row.WIKIDATAID,
            "science_station_type": science_type,
            "country": country,
        },
    )