![Natural Earth logo](https://www.naturalearthdata.com/wp-content/themes/NEV/images/nev_logo.png "Natural Earth logo")
<div align="center">

## Scraping countries data and boundaries
</div>

This document explains how country data is processed and saved in a PostgreSQL database.

Link to data from Natural Earth: [countries data](https://www.naturalearthdata.com/http//www.naturalearthdata.com/download/50m/cultural/ne_50m_admin_0_countries.zip)

<hr>

# 1. Download and load data
## The data is a shapefile compressed into a zip archive. We need geopandas to read the spatial data and requests to download the zip file from the web.

In [None]:
import geopandas as gpd
import requests
import os

In [None]:
countries_link = 'https://www.naturalearthdata.com/http//www.naturalearthdata.com/download/50m/cultural/ne_50m_admin_0_countries.zip'

## Get zip file and check status code. 200 is OK.

In [None]:
# Request the archive as a stream so the body is fetched only when it is
# iterated. The timeout keeps the cell from hanging forever if the server
# stops responding; requests applies no timeout by default.
r = requests.get(
    countries_link,
    stream=True,
    headers={"User-Agent": "XY"},
    timeout=30,
)
r.status_code

## Save zip file inside temp folder.

In [None]:
# Create the download folder (and any missing parents); exist_ok makes the
# call a no-op when the directory is already there, with no separate check.
os.makedirs('../temp/country/', exist_ok=True)

In [None]:
# Stream the response body to disk. 8 KiB chunks produce the same file as
# smaller ones but avoid thousands of tiny reads and writes.
with open('../temp/country/ne_50m_admin_0_countries.zip', 'wb') as fd:
    for chunk in r.iter_content(chunk_size=8192):
        fd.write(chunk)

In [None]:
zip_file = "zip://../temp/country/ne_50m_admin_0_countries.zip!ne_50m_admin_0_countries.shp"

## Load shapefile into geopandas dataframe.

In [None]:
countries_shp = gpd.read_file(
    zip_file, layer='ne_50m_admin_0_countries'
)

## Check the number of rows and columns and the CRS of the shapefile data.

In [None]:
countries_shp.shape

In [None]:
countries_shp.crs

## Modify dataframe to display only columns that are valuable for our dataset.

In [None]:
countries_shp.columns.values.tolist()

In [None]:
# Keep only the attributes needed for the database, plus the geometry.
# The explicit copy makes this frame independent of countries_shp, so the
# in-place value fixes done later do not trigger chained-assignment warnings.
filter_countries = countries_shp[
    [
        "NAME", "NAME_LONG",
        "FIPS_10", "ISO_A2",
        "ISO_A3", "POSTAL",
        "TYPE", "CONTINENT",
        "SUBREGION", "WIKIDATAID",
        "geometry",
    ]
].copy()

## Rename the columns to match the database model field names.

In [None]:
# Relabel the columns positionally (same order as the selection above).
# set_axis returns a new frame and its ``inplace`` option was removed in
# pandas 2.0, so the result is assigned back instead of mutating in place.
filter_countries = filter_countries.set_axis(
    [
        'name', 'name_long',
        'fips_10', 'iso2',
        'iso3', 'postal',
        'country_type', 'continent',
        'subregion', 'wikidata_id',
        'geometry',
    ],
    axis=1,
)

In [None]:
filter_countries

# 2. Create foreign key records if they do not exist and save them in the database

In [None]:
from apps.landform.models import Continent
from apps.administrative_area.models import CountryType, Subregion

## Get unique values from "country_type" column and save if not yet in database.

In [None]:
types = set(filter_countries["country_type"])

In [None]:
types

In [None]:
# Make sure a CountryType row exists for every distinct name; the resulting
# list is the last expression of the cell, so it is shown as the cell output.
[CountryType.objects.get_or_create(name=item) for item in types]

In [None]:
CountryType.objects.values_list('name', flat=True)

## Get unique values from "continent" column and save if not yet in database.

In [None]:
continents = set(filter_countries["continent"])

In [None]:
continents

In [None]:
# Make sure a Continent row exists for every distinct name; the resulting
# list is the last expression of the cell, so it is shown as the cell output.
[Continent.objects.get_or_create(name=item) for item in continents]

In [None]:
Continent.objects.values_list('name', flat=True)

## Get unique values from "subregion" column and save if not yet in database.

In [None]:
subregions = set(filter_countries["subregion"])

In [None]:
subregions

In [None]:
# Make sure a Subregion row exists for every distinct name; the resulting
# list is the last expression of the cell, so it is shown as the cell output.
[Subregion.objects.get_or_create(name=item) for item in subregions]

In [None]:
Subregion.objects.values_list('name', flat=True)

# 3. Fix -99 value for country codes (error creating fk)
## Some country code values are '-99'. It is especially important to fix the iso2 values, because they will be connected to other data by foreign key, e.g. regions and state data.

In [None]:
# Show every row that holds the '-99' placeholder in at least one column.
# The axis is passed by keyword: pandas 2.0 no longer accepts it positionally.
filter_countries[filter_countries.eq('-99').any(axis=1)]

## Manually replace the data that is available

In [None]:
# Hand-entered codes for countries whose codes are '-99' in the source data,
# keyed by the country's short name and then by the column to overwrite.
manual_codes = {
    'S. Sudan': {'fips_10': 'OD'},
    'Norway': {'fips_10': 'NO', 'iso2': 'NO', 'iso3': 'NOR'},
    'Israel': {'fips_10': 'IS'},
    'France': {'iso2': 'FR', 'iso3': 'FRA'},
}

# Rows with at least one '-99' placeholder (axis passed by keyword, as
# pandas 2.0 no longer accepts it positionally).
has_placeholder = filter_countries.eq('-99').any(axis=1)

for index, row in filter_countries[has_placeholder].iterrows():
    # Countries without a manual entry are left untouched here.
    for column, code in manual_codes.get(row['name'], {}).items():
        filter_countries.at[index, column] = code

## Change unavailable data from '-99' to None

In [None]:
# Turn the remaining '-99' placeholders into None. The dict form is used on
# purpose: with replace('-99', None), pandas older than 1.4 ignores the
# explicit None and forward-fills the value from the previous row instead.
finall_countries = filter_countries.replace({'-99': None})

In [None]:
finall_countries

# 4. Save in database

## Preview the available data from the dataframe on a map.

In [None]:
finall_countries.explore(column="continent", popup=True, style_kwds=dict(color="black"))

In [None]:
from django.contrib.gis.geos import GEOSGeometry, Polygon, MultiPolygon
from apps.administrative_area.models import Country

## Convert geometry field from geopandas dataframe to GIS Multipolygon.

In [None]:
def convert_geometry(geometry):
    """Build a GEOS geometry from ``geometry``, wrapping a Polygon in a MultiPolygon.

    The input is passed through ``str()`` and handed to ``GEOSGeometry``.
    A result whose type is 'Polygon' is returned inside a ``MultiPolygon``;
    any other geometry type is returned as parsed.
    """
    parsed = GEOSGeometry(str(geometry))
    if parsed.geom_type != 'Polygon':
        return parsed
    return MultiPolygon(parsed)

## Iterate through the dataframe, convert the data, get the foreign keys, and update or create the entry in the database.

In [None]:
# Foreign-key rows already fetched, keyed by (model, name). Many countries
# share the same type, continent and subregion, so each distinct value is
# queried once instead of once per country.
_fk_cache = {}


def _first_by_name(model, name):
    """Return the first ``model`` row whose ``name`` field equals ``name``.

    Raises IndexError when no row matches, exactly as indexing the filtered
    queryset with ``[0]`` does.
    """
    key = (model, name)
    if key not in _fk_cache:
        _fk_cache[key] = model.objects.filter(name=name)[0]
    return _fk_cache[key]


for row in finall_countries.itertuples(index=False):
    geometry = convert_geometry(row.geometry)
    country_type = _first_by_name(CountryType, row.country_type)
    continent = _first_by_name(Continent, row.continent)
    subregion = _first_by_name(Subregion, row.subregion)

    # Fields written on both update and create.
    updated_values = {
        'name': row.name,
        'name_long': row.name_long,
        'fips_10': row.fips_10,
        'iso2': row.iso2,
        'iso3': row.iso3,
        'continent': continent,
        'subregion': subregion,
        'country_type': country_type,
        'geometry': geometry,
    }

    # An existing country is matched on wikidata_id + postal and updated;
    # otherwise a new row is created with these values.
    Country.objects.update_or_create(
        wikidata_id=row.wikidata_id,
        postal=row.postal,
        defaults=updated_values,
    )