In [None]:
# Enable IPython's autoreload extension in mode 2, so edited modules (e.g. src.*) are
# re-imported automatically before each cell executes.
%load_ext autoreload
%autoreload 2

In [None]:
import ee
import geemap
import geopandas as gpd
import numpy as np
import pandas as pd
from rasterstats import zonal_stats
import rioxarray as rxr
from rioxarray.merge import merge_arrays

from src.utils.geometry import load_country_boundaries, reproject_geo
from src.constants import RAW_PATH, EXTERNAL_PATH, PROCESSED_PATH


from src.utils.gee import init_gee
init_gee()

In [None]:
# Fetch every Earth Engine operation, then keep only those whose metadata state
# is one of PENDING / READY / RUNNING.
ops = ee.data.listOperations()
ops_ = list(filter(lambda o: o['metadata']['state'] in ['PENDING', 'READY', 'RUNNING'], ops))

# Creation

## Global Human Settlement Layer (GHSL)
## SMOD (Settlement Model) product
Downloaded the data from [here](https://ghsl.jrc.ec.europa.eu/download.php?ds=smod)

The settlement grid at level 2 represents these definitions on a layer grid. Each pixel is classified as follows: 

* Class 30: “Urban Centre grid cell”, if the cell belongs to an Urban Centre spatial entity; 
* Class 23: “Dense Urban Cluster grid cell”, if the cell belongs to a Dense Urban Cluster spatial entity; 
* Class 22: “Semi-dense Urban Cluster grid cell”, if the cell belongs to a Semi-dense Urban Cluster 
spatial entity; 
* Class 21: “Suburban or peri-urban grid cell”, if the cell belongs to an Urban Cluster at the first 
hierarchical level but is not part of a Dense or Semi-dense Urban Cluster; 
* Class 13: “Rural cluster grid cell”, if the cell belongs to a Rural Cluster spatial entity; 
* Class 12: “Low Density Rural grid cell”, if the cell is classified as a Rural grid cell at the first hierarchical 
level, has more than 50 inhabitants and is not part of a Rural Cluster; 
* Class 11: “Very low density rural grid cell”, if the cell is classified as a Rural grid cell at the first hierarchical 
level, has fewer than 50 inhabitants and is not part of a Rural Cluster; 
* Class 10: “Water grid cell”, if the cell has a 0.5 share covered by permanent surface water and is 
neither populated nor built.

In [None]:
# Locate the raw SMOD tiles and load the national boundary used for clipping.
GSHL_SMOD_PATH = RAW_PATH / 'ghsl_smod'
geo_ukr = load_country_boundaries('Ukraine')
smod_paths = sorted(GSHL_SMOD_PATH.glob('*.tif'))

# Open every tile found on disk.
smods = list(map(rxr.open_rasterio, smod_paths))

# Express the boundary in the CRS of the first tile
# (presumably reproject_geo converts from EPSG:4326 to the target CRS — confirm).
smod_crs = smods[0].rio.crs
geo_ukr_reproj = reproject_geo(geo_ukr, 'EPSG:4326', smod_crs)

# Mosaic the tiles, then clip the mosaic to the reprojected boundary.
smods_merged = merge_arrays(smods)
smod_ukr = smods_merged.rio.clip([geo_ukr_reproj])

In [None]:
# Persist the clipped SMOD raster under PROCESSED_PATH so later steps can reuse it.
smod_ukr.rio.to_raster(PROCESSED_PATH / 'gshl_smod_ukraine.tif')

In [None]:
# Mask out nodata pixels (they become NaN) so only classified cells are drawn.
smod_ukr.where(smod_ukr!=smod_ukr.rio.nodata).plot()

In [None]:
# Share of valid (non-nodata) pixels whose SMOD class is at least `min_level`
# (i.e. strictly above class 12).
min_level = 13

# Nodata pixels become NaN, so `.count()` below only sees classified cells.
smod_valid = smod_ukr.where(smod_ukr != smod_ukr.rio.nodata)
# Further blank out every cell below the chosen class.
smod_settled = smod_valid.where(smod_ukr >= min_level)

n_pix_urban = smod_settled.count().item()
n_pix_tot = smod_valid.count().item()
print(f'Percentage of urban or rural cluster: {100*n_pix_urban / n_pix_tot:.2f}%')
smod_settled.plot();

In [None]:
# Binarize the data
smod_bin = smod_ukr.where((smod_ukr>=min_level)|(smod_ukr==smod_ukr.rio.nodata), 0).where(smod_ukr<min_level,1)
smod_bin.where(smod_bin!=smod_bin.rio.nodata).plot()

In [None]:
# Percentage of classified pixels (value 0 or 1) that are settled (value 1).
# `.item()` converts the 0-d DataArray into a plain Python number so the `.2f`
# format spec below works on every xarray version (older releases reject format
# specs on DataArray objects); this mirrors the `.item()` calls used for the
# pixel counts earlier in the notebook.
perc_build_up = (100*(smod_bin==1).sum() / ((smod_bin==0).sum() + (smod_bin==1).sum())).item()
print(f"{perc_build_up:.2f}% of Ukraine is town or city")

## Ukraine Administrative Boundaries
Downloaded the data from [here](https://data.humdata.org/dataset/cod-ab-ukr?)

In [None]:
# level 4 is the finest one
ukraine_regions_path = sorted((EXTERNAL_PATH / 'UKR_admin_boundaries').glob('*_adm4*.shp'))[0]
ukr_regions = gpd.read_file(ukraine_regions_path)[['ADM4_EN','ADM3_EN','ADM2_EN','ADM1_EN', 'geometry']]
ukr_regions.shape

In [None]:
# Mean of the binary settlement mask inside every ADM4 polygon. Because the mask
# holds 0/1, the mean is a fraction in [0, 1] (despite the "perc" column name).
regions_in_raster_crs = ukr_regions.to_crs(smod_bin.rio.crs)
zonal_means = zonal_stats(
    vectors=regions_in_raster_crs,
    raster=smod_bin.squeeze().values,
    affine=smod_bin.rio.transform(),
    stats=['mean'],
    nodata=smod_bin.rio.nodata,
)
df = pd.DataFrame(zonal_means)

# Assignment aligns on index labels between `df` and `ukr_regions`.
ukr_regions['perc_build_up'] = df['mean']
ukr_regions.head()

In [None]:
# Dump all ADM4 units (with the columns added so far) to a GeoJSON in the current
# working directory.
ukr_regions.to_file('./all_adm4.geojson', driver='GeoJSON')

In [None]:
# Interactive map of all ADM4 units.
ukr_regions.explore()

## Match with population data
From WorldPop: [Ukraine 1km population](https://data.humdata.org/dataset/worldpop-population-density-for-ukraine?)

In [None]:
# Open the WorldPop 2020 1 km raster for Ukraine.
# NOTE(review): the file name says "population_density", yet the next cell sums pixel
# values per region as a head count — confirm the raster's units.
ukr_pop = rxr.open_rasterio(EXTERNAL_PATH / 'ukr_population_density_2020_1km.tif')

In [None]:
# Sum of the population raster inside every ADM4 polygon.
regions_in_pop_crs = ukr_regions.to_crs(ukr_pop.rio.crs)
zonal_sums = zonal_stats(
    vectors=regions_in_pop_crs,
    raster=ukr_pop.squeeze().values,
    affine=ukr_pop.rio.transform(),
    stats=['sum'],
    nodata=ukr_pop.rio.nodata,
)
df = pd.DataFrame(zonal_sums)

# Assignment aligns on index labels between `df` and `ukr_regions`.
ukr_regions['population'] = df['sum']
ukr_regions.head()

## Filter settlements

In [None]:
# Minimum 'perc_build_up' value for an ADM4 unit to be kept as a settlement.
threshold_urban = 0.1

# Boolean mask first, then an independent copy of the selected rows.
is_settlement = ukr_regions['perc_build_up'] >= threshold_urban
poly_settlements = ukr_regions[is_settlement].copy()
print(f'There are {poly_settlements.shape[0]} settlements in Ukraine with {100*threshold_urban}% or more urban pixels')

In [None]:
# Interactive map of the ADM4 units retained as settlements.
poly_settlements.explore()

In [None]:
from shapely.geometry import box
# Extra column holding, for each settlement, the rectangle built from its geometry's bounds.
poly_settlements['geometry_box'] = poly_settlements.geometry.apply(lambda g: box(*g.bounds))

In [None]:
import warnings
# Summary of the retained settlements: count, share of the country's area (exact
# polygons and bounding boxes) and total population.
# NOTE(review): all warnings are silenced inside this block; if the geometries are
# still in a geographic CRS, the area ratios are computed in squared degrees and are
# only approximate — confirm, or reproject to an equal-area CRS first.
with warnings.catch_warnings():
    warnings.simplefilter("ignore")
    print(f'{len(poly_settlements)} settlements')
    print(f'representing {100*poly_settlements.area.sum() / geo_ukr.area:.2f}% of the country')
    print(f'({100*poly_settlements.geometry_box.area.sum() / geo_ukr.area:.2f}% when box)')
    print(f'and {int(poly_settlements.population.sum())/10**6:.1f}M people')

In [None]:
# Write the settlements to GeoJSON without the helper 'geometry_box' column.
poly_settlements.drop('geometry_box', axis=1).to_file(PROCESSED_PATH / 'ukraine_settlements.geojson', driver='GeoJSON')

## To GEE

In [None]:
from shapely.geometry import box


def _bounding_box(geom):
    """Return the axis-aligned rectangle built from ``geom.bounds``."""
    return box(*geom.bounds)


# Reload the saved settlements and swap each polygon for its bounding rectangle.
gdf_path = PROCESSED_PATH / 'ukraine_settlements.geojson'
gdf = gpd.read_file(gdf_path)
gdf['geometry'] = gdf['geometry'].apply(_bounding_box)
gdf.shape

In [None]:
# filter box that are fully within another one
joined_gdf = gdf.sjoin(gdf, how='inner', predicate='within')
indices_within_others = joined_gdf[joined_gdf.index!=joined_gdf.index_right].index
gdf = gdf[~gdf.index.isin(indices_within_others)]
gdf.shape

In [None]:
import warnings
# Share of the country's area covered by the remaining bounding boxes.
# NOTE(review): all warnings are silenced inside this block; if the geometries are
# in a geographic CRS the ratio is computed in squared degrees and is only
# approximate — confirm.
with warnings.catch_warnings():
    warnings.simplefilter("ignore")
    geo_ukr = load_country_boundaries('Ukraine')
    print(f'area final = {100*gdf.area.sum() / geo_ukr.area:.2f}% of the country')

In [None]:
# Turn the current index into a regular column and expose it as 'settlement_id'.
gdf_with_index = gdf.reset_index()
gdf_final = gdf_with_index.rename(columns={'index':'settlement_id'})

# Save the frame that is uploaded to Earth Engine.
final_path = PROCESSED_PATH / 'ukraine_settlements_gee_final.geojson'
gdf_final.to_file(final_path, driver='GeoJSON')

In [None]:
# ASSETS_PATH is not among the notebook's top-level imports, so bring it into scope
# here; without this the cell fails with NameError on a fresh kernel.
# NOTE(review): assumed to live in src.constants next to the other path constants — confirm.
from src.constants import ASSETS_PATH

# Convert the settlement boxes to an Earth Engine object and export them to an asset.
gdf_ee = geemap.geopandas_to_ee(gdf_final)
# ASSETS_PATH is concatenated as a string prefix (presumably it ends with '/' — confirm).
asset_id = ASSETS_PATH + 's1tsdd_Ukraine/ukraine_settlements'
geemap.ee_export_vector_to_asset(
    gdf_ee,
    description="Ukraine Settlements",
    assetId=asset_id,
)

# Exploration

In [None]:
from src.data.settlements import load_gdf_settlements

# Load the settlements through the project helper; this rebinds `gdf`, replacing the
# frame built in the "To GEE" section.
gdf = load_gdf_settlements()
gdf.head()

In [None]:
# Interactive map of the loaded settlements.
gdf.explore()

In [None]:
# Export the rows whose ADM4 name is 'Chernihiv' to a shapefile in the current working directory.
gdf[gdf.ADM4_EN=='Chernihiv'].to_file('Chernihiv.shp')

In [None]:
from src.constants import PREDS_PATH
from src.data.settlements import load_gdf_settlements

# Prediction files already produced for run '240224'; each file stem is parsed as an integer id.
preds = sorted((PREDS_PATH / '240224' / 'buildings_with_preds').glob('*.geojson'))
existing_ids = list(map(lambda p: int(p.stem), preds))

# Keep only the settlements whose index is not among those ids.
gdf_settlements = load_gdf_settlements()
already_predicted = gdf_settlements.index.isin(existing_ids)
gdf_settlements = gdf_settlements[~already_predicted]
gdf_settlements

In [None]:
# Print the ADM4 name of a few settlements, looked up by index label
# (`.loc` raises KeyError for a label that is not in the filtered frame).
for id_ in [2397,2398,2405,2406,2407]:
    print(gdf_settlements.loc[id_].ADM4_EN)

In [None]:
from src.data.settlements import MSFT_SETTLEMENTS_PATH
from tqdm import tqdm

# Among the remaining settlements, print the index of each one whose building file is not empty.
for i in tqdm(gdf_settlements.index):
    gdf_buildings = gpd.read_file(MSFT_SETTLEMENTS_PATH / f'{i}.geojson')
    if gdf_buildings.empty:
        continue
    print(i)

In [None]:
# Display the settlements frame.
gdf

In [None]:
from shapely.geometry import Point

# Which settlement geometries contain this point? (x=37.502168, y=47.941873 —
# presumably lon/lat; confirm against the frame's CRS.)
coords = Point(37.502168,47.941873)
contains_point = gdf.geometry.contains(coords)
gdf[contains_point]

In [None]:
import geopandas as gpd
from src.constants import PREDS_PATH
# Read the prediction file '0.geojson' of run '240224'.
# Note: this rebinds `gdf`, which held the settlements frame in the cells above.
run_name = '240224'
folder =  PREDS_PATH / run_name / 'buildings_with_preds'
gdf = gpd.read_file(folder / '0.geojson')
gdf.head()