![Natural Earth logo](https://www.naturalearthdata.com/wp-content/themes/NEV/images/nev_logo.png "Natural Earth logo")
<div align="center">
    
## Scraping states data and boundaries
</div>

This document explains how states data is processed and saved in a PostgreSQL database.

Link to data from Natural Earth: [states data](https://www.naturalearthdata.com/http//www.naturalearthdata.com/download/10m/cultural/ne_10m_admin_1_states_provinces.zip)


# 1. Download and load data
## The data is in shapefile format, compressed into a zip file. We need to import geopandas to read the spatial data and requests to get the zip file from the web.

In [None]:
import geopandas as gpd
import requests
import os

In [None]:
# Download URL of the Natural Earth states/provinces zip archive.
states_link = (
    'https://www.naturalearthdata.com/http//www.naturalearthdata.com'
    '/download/10m/cultural/ne_10m_admin_1_states_provinces.zip'
)

## Get the zip file and check the status code. 200 means OK.

In [None]:
# Request the archive as a stream so the body is fetched lazily, sending an
# explicit User-Agent header. The timeout keeps the call from blocking
# forever if the server stops responding.
r = requests.get(
    states_link,
    stream=True,
    headers={"User-Agent": "XY"},
    timeout=30,
)
# Last expression of the cell: shows the HTTP status code (200 is OK).
r.status_code

## Save zip file inside temp folder.

In [None]:
# Create the download folder (and any missing parents). exist_ok=True avoids
# the check-then-create race and keeps the cell safe to re-run.
os.makedirs('../temp/state/', exist_ok=True)

In [None]:
# Stop here on an HTTP error so an error page is never saved under a .zip name.
r.raise_for_status()

# Write the streamed response body to disk. 64 KiB chunks keep memory use low
# without the per-chunk overhead of very small (128-byte) reads.
with open('../temp/state/ne_10m_admin_1_states_provinces.zip', 'wb') as fd:
    for chunk in r.iter_content(chunk_size=64 * 1024):
        fd.write(chunk)

In [None]:
# Location of the .shp member inside the saved archive, written as
# "zip://<archive path>!<member name>".
zip_file = (
    "zip://../temp/state/ne_10m_admin_1_states_provinces.zip"
    "!ne_10m_admin_1_states_provinces.shp"
)

## Load shapefile into geopandas dataframe.

In [None]:
# Read the states/provinces layer of the zipped shapefile into a dataframe.
states_shp = gpd.read_file(zip_file, layer="ne_10m_admin_1_states_provinces")

## Check the number of rows and columns, and the CRS, of the shapefile data.

In [None]:
# Display the (rows, columns) size of the loaded dataframe.
states_shp.shape

In [None]:
# Display the coordinate reference system attached to the loaded dataframe.
states_shp.crs

## Reduce the dataframe to only the columns that are valuable for our dataset.

In [None]:
# Display every column name as a plain Python list.
states_shp.columns.tolist()

In [None]:
# Columns kept for our dataset, in the order they appear in the result.
kept_columns = [
    "name",
    "adm1_code",
    "iso_3166_2",
    "fips",
    "type_en",
    "wikidataid",
    "iso_a2",
    "geometry",
]
filter_states = states_shp[kept_columns]

## Rename the columns to match the database model field names.

In [None]:
# Rename only the columns whose names differ from the database model names.
# An explicit old -> new mapping does not depend on column order, and the
# reassignment replaces set_axis(..., inplace=True), whose `inplace` argument
# was deprecated in pandas 1.5 and removed in pandas 2.0.
filter_states = filter_states.rename(
    columns={
        "iso_3166_2": "iso_3166_1_2",
        "type_en": "state_type",
        "wikidataid": "wikidata_id",
    }
)

In [None]:
# Display rows 2625 through 2634 (by position) as a spot check of the renamed data.
filter_states.iloc[2625:2635]

# 2. Fix errors in database
## Some "iso_a2" cells have the value "-1"; change them to None.

In [None]:
# Display every row that holds the string '-1' in at least one column.
# `axis` is passed by keyword: the positional form any(1) is no longer
# accepted by pandas 2.0.
filter_states[filter_states.eq('-1').any(axis=1)]

In [None]:
# Replace the '-1' placeholder with None. The dict form is used because, before
# pandas 1.4, replace('-1', None) ignored the explicit None and pad-filled the
# cell from the previous row instead.
filter_errors = filter_states.replace({'-1': None})

# 3. Create foreign key for state type if not in db

In [None]:
from apps.administrative_area.models import StateType

In [None]:
# Distinct values found in the state_type column.
types = {*filter_errors["state_type"]}

In [None]:
# Display the distinct state types collected above.
types

In [None]:
# Make sure a StateType row exists for every non-empty type name. The list of
# get_or_create results is kept because it is the cell's displayed output.
[
    StateType.objects.get_or_create(name=type_name)
    for type_name in filter(None, types)
]

In [None]:
# Display the names of all StateType rows now stored in the database.
StateType.objects.values_list('name', flat=True)

# 4. Save in database

## Preview data on map

In [None]:
# Preview the cleaned data on a map, using state_type as the colouring column
# and with popups enabled.
filter_errors.explore(column="state_type", popup=True)

In [None]:
from django.contrib.gis.geos import GEOSGeometry, Polygon, MultiPolygon
from apps.administrative_area.models import Country, State

## Convert geometry field from geopandas dataframe to GIS Multipolygon.

In [None]:
def convert_geometry(geometry):
    """Build a GEOS geometry from *geometry*, promoting polygons to MultiPolygon.

    The input is converted with ``str()`` and parsed by ``GEOSGeometry``.
    A result whose ``geom_type`` is ``'Polygon'`` is wrapped in a
    ``MultiPolygon``; any other geometry type is returned as parsed.
    """
    geos_geometry = GEOSGeometry(str(geometry))
    if geos_geometry.geom_type != 'Polygon':
        return geos_geometry
    return MultiPolygon(geos_geometry)

## Get the set of iso2 codes not available in the country db table, then skip these items to avoid an error while saving the state FK.

In [None]:
# iso2 codes of all countries stored in the database, collected into a set.
country_codes = Country.objects.order_by().values_list('iso2', flat=True)
country_iso2 = set(country_codes)

In [None]:
# Distinct iso_a2 values that occur in the states data.
state_iso2 = {*filter_errors['iso_a2'].unique()}

In [None]:
# Codes used by the states data that have no matching country row.
iso2_diff = state_iso2 - country_iso2

In [None]:
# Display the iso2 codes that are missing from the country table.
iso2_diff

## Iterate through the dataframe, convert the data, get the FKs, and update or create the entry in the database.

In [None]:
# Load the foreign-key targets once, instead of running two queries per row.
# only() limits the fetched columns to the lookup key (plus the primary key).
# setdefault keeps the first row seen for a key, mirroring the original "[0]"
# pick.
countries_by_iso2 = {}
for country in Country.objects.only('iso2'):
    countries_by_iso2.setdefault(country.iso2, country)

state_types_by_name = {}
for state_type in StateType.objects.only('name'):
    state_types_by_name.setdefault(state_type.name, state_type)

for row in filter_errors.itertuples(index=False, name='Pandas'):
    updated_values = {
        "name": row.name,
        "iso_3166_1_2": row.iso_3166_1_2,
        "fips": row.fips,
        # None when no StateType row has this name.
        "state_type": state_types_by_name.get(row.state_type),
        "wikidata_id": row.wikidata_id,
        "geometry": convert_geometry(row.geometry),
        # None when the iso2 code is not in the country table, so saving
        # the FK never fails on a missing country.
        "country": countries_by_iso2.get(row.iso_a2),
    }

    # Update the state matched on its adm1 code, or create it if missing.
    State.objects.update_or_create(
        adm1_code=row.adm1_code,
        defaults=updated_values,
    )