![title](../images/header.png)

Add attributes from polygons and assets
-------
This notebook lets you add additional columns to your cleaned CEO data, including administrative areas for sub-national estimates and updated strata from assets in GEE. 
###### For more information contact aurelie.shapiro@fao.org or remi.dannunzio@fao.org

In [1]:
from pathlib import Path

import numpy as np
import pandas as pd
import geopandas as gpd
from matplotlib import pyplot as plt

import geopandas
import rasterio
import matplotlib.pyplot as plt
from shapely.geometry import Point

import ee
import geemap
import pandas as pd

In [2]:
# Initialize Earth Engine
ee.Initialize()

##### read points from scripts 1-7

In [5]:
#points = pd.read_csv('/home/sepal-user/module_results/esbae/DRC/DRC_all_ceo_2015_2022_2023_clean.csv', delimiter=',', low_memory=False)
#points = pd.read_csv('/home/sepal-user/module_results/esbae/DRC/DRC_all_ceo_2015_2022_2023_prov.csv', delimiter=',', low_memory=False)
points = pd.read_csv('/home/sepal-user/module_results/esbae/GAB/GAB_all_ceo_2015_2022_2023_clean.csv', delimiter=',', low_memory=False)
#points = pd.read_csv('/home/sepal-user/module_results/esbae/CMR/CMR_all_ceo_2015_2022_2023_clean.csv',delimiter=',', low_memory=False)
#points = pd.read_csv('/home/sepal-user/module_results/esbae/EQG/EQG_all_ceo_2015_2022_2023_prov.csv',delimiter=',', low_memory=False)
#points = pd.read_csv('/home/sepal-user/module_results/esbae/EQG/EQG_all_ceo_2015_2022_2023_prov.csv',delimiter=',', low_memory=False)
#points = pd.read_csv('/home/sepal-user/module_results/esbae/CAR/CAR_all_ceo_2015_2022_2023_pref.csv',delimiter=',', low_memory=False)
#read GEE points
#points = ee.FeatureCollection('users/faocongo/sbae/sbae_hex16_cmr')

In [6]:
len(points)

10315

In [7]:
all_columns = points.columns.tolist()

# Display the list of column names
print(all_columns)

['UID', 'ID', 'plotid', 'point_id', 'sampleid', 'lon', 'lat', 'sample_geom', 'Ref_FNF_2015', 'Ref_Regeneration', 'Ref_Change_1522', 'Ref_Change_Type_1522', 'ArtFor', 'ArtMine', 'InfraR', 'Urb', 'IndFor', 'Other', 'Other_Desc', 'IndMine', 'IndAg', 'ArtAg', 'Ref_Year_1522', 'Ref_NFtype_2015', 'Ref_Ftype_2015', 'collection', 'interpreted', 'Ref_LCover_2015', 'Ref_Change_Year_1522', 'Ref_Change_LCover_1522', 'Def2016', 'Def2017', 'Def2018', 'Def2019', 'Def2020', 'Def2021', 'Def2022', 'Deg2016', 'Deg2017', 'Deg2018', 'Deg2019', 'Deg2020', 'Deg2021', 'Deg2022', 'Defall', 'Degall', 'Stable', 'NF', 'DensFor', 'DensDryFor', 'SecFor', 'DryOpenFor', 'Mangrove', 'Swamp', 'Gallery', 'Plantation', 'Woodland', 'Shrubland', 'Grassland', 'Aquatic', 'Bare', 'Cultivated', 'Builtup', 'Water', 'DensFor_Def', 'DensDryFor_Def', 'SecFor_Def', 'DryOpenFor_Def', 'Mangrove_Def', 'Swamp_Def', 'Gallery_Def', 'Plantation_Def', 'Woodland_Def', 'DensFor_Deg', 'DensDryFor_Deg', 'SecFor_Deg', 'DryOpenFor_Deg', 'Mangrov

In [8]:
#drop any columns if needed
#esbae = esbae.drop('index_right', axis=1)

In [9]:
#enter Lat and Lon columns
LATcol = 'lat'
LONcol = 'lon'

In [11]:
gdf = geopandas.GeoDataFrame(points, geometry=geopandas.points_from_xy(points[LONcol], points[LATcol]), crs="EPSG:4326")

#### add attributes from local shapefile to associate points with sub-national juridictions

In [12]:
# this shapefile should be uploaded to your SEPAL workspace
poly = "/home/sepal-user/data/admin/geoBoundaries-GAB-ADM1.shp"
#data/admin/RDC_Province_26.shp"
poly_shp = gpd.read_file(poly)
poly_shp.head()

Unnamed: 0,shapeName,shapeISO,shapeID,shapeGroup,shapeType,shapeName1,geometry
0,Woleu-Ntem,GA-9,98808126B17543062715179,GAB,ADM1,Woleu-Ntem,"POLYGON ((13.15133 1.26335, 12.85648 1.25701, ..."
1,Estuaire,GA-1,98808126B6174087794558,GAB,ADM1,Estuaire,"POLYGON ((10.4189 0.98965, 10.42187 0.82788, 1..."
2,Moyen-Ogooué,GA-3,98808126B18014785017467,GAB,ADM1,Moyen-Ogooue,"POLYGON ((10.83434 0.22485, 11.03868 0.22511, ..."
3,Ogooué-Lolo,GA-7,98808126B26189030995048,GAB,ADM1,Ogooue-Lolo,"POLYGON ((13.28884 0.0763, 13.28682 0.02301, 1..."
4,Haut-Ogooué,GA-2,98808126B76399587683442,GAB,ADM1,Haut-Ogooue,"POLYGON ((12.77941 -1.86166, 12.78632 -1.83539..."


In [13]:
# from the table above, identify the column name you want to associate
#admin_name = 'ADM1_FR'
admin_name = 'shapeName'
#admin_name = 'admin1Name'
#admin_name = 'NOM'
#new_name = 'Prefecture'
new_name = 'Province'

In [14]:
poly_crs = poly_shp.crs
print("Current CRS:", poly_crs)

Current CRS: EPSG:4326


In [15]:
#poly_shp = poly_shp.to_crs(gdf.crs)

In [16]:
# Reproject both GeoDataFrames to the same projected CRS (e.g., EPSG:3395 for World Mercator)
gdf = gdf.to_crs("EPSG:3395")
poly_shp = poly_shp.to_crs("EPSG:3395")

# Step 1: Perform the spatial join for intersecting points
joined_data_intersecting = gpd.sjoin(gdf, poly_shp[[admin_name, 'geometry']], how="left", predicate="within", lsuffix='left', rsuffix='right')

# Drop the 'index_right' column if it exists, to avoid conflicts
if 'index_right' in joined_data_intersecting.columns:
    joined_data_intersecting = joined_data_intersecting.drop(columns='index_right')

# Step 2: Identify points without an intersection (NaN values in the Admin_Name column)
no_intersection_points = joined_data_intersecting[joined_data_intersecting[admin_name].isna()]

# Step 3: Perform the nearest spatial join for points without an intersection
nearest_join = gpd.sjoin_nearest(no_intersection_points.drop(columns=admin_name), poly_shp[[admin_name, 'geometry']], how="left", distance_col="distance_to_polygon")

# Drop the 'index_right' column if it exists in the nearest join result
if 'index_right' in nearest_join.columns:
    nearest_join = nearest_join.drop(columns='index_right')

# Step 4: Combine intersecting and nearest joined data into one GeoDataFrame
# Retain only the specified `admin_name` column from `poly_shp`
joined_data_combined = pd.concat([
    joined_data_intersecting.dropna(subset=[admin_name]),
    nearest_join
])

# Select only the columns from `gdf` plus `admin_name`
columns_to_keep = list(gdf.columns) + [admin_name]
joined_data = joined_data_combined[columns_to_keep]

# Display the resulting GeoDataFrame
joined_data.head()

Unnamed: 0,UID,ID,plotid,point_id,sampleid,lon,lat,sample_geom,Ref_FNF_2015,Ref_Regeneration,...,Ref_LCover_2022,Ref_Change_Year_2023,Ref_Change_LCover_2023,Def2023,Deg2023,interpreted_qc,Ref_Change_Type_1523,Ref_Change_Year_1523,geometry,shapeName
1,1,1,187092,187092.0,187092,13.994761,0.869878,POINT(13.994761 0.869878),1,,...,,,,,,,Stable,Stable,POINT (1557889.668 96189.875),Ogooué-Ivindo
2,2,2,185303,185303.0,185303,13.254088,-0.492938,POINT(13.254088 -0.492938),1,,...,,,,,,,Stable,Stable,POINT (1475438.327 -54506.944),Ogooué-Lolo
3,3,3,185899,185899.0,185899,14.175518,1.275546,POINT(14.175518 1.275546),1,,...,,,,,,,Stable,Stable,POINT (1578011.445 141054.383),Ogooué-Ivindo
4,4,4,161928,161928.0,161928,12.303429,-1.380904,POINT(12.303429 -1.380904),1,,...,,,,,,,Stable,Stable,POINT (1369611.451 -152707.442),Ogooué-Lolo
5,5,5,173884,173884.0,173884,12.853122,-0.772983,POINT(12.853122 -0.772983),1,,...,,,,,,,Stable,Stable,POINT (1430802.996 -85474.663),Ogooué-Lolo


In [17]:
nearest_join.head()

Unnamed: 0,UID,ID,plotid,point_id,sampleid,lon,lat,sample_geom,Ref_FNF_2015,Ref_Regeneration,...,Ref_Change_Year_2023,Ref_Change_LCover_2023,Def2023,Deg2023,interpreted_qc,Ref_Change_Type_1523,Ref_Change_Year_1523,geometry,shapeName,distance_to_polygon
0,0,0,157,157.0,157,9.56202,0.937144,POINT(9.56202 0.937144),1,,...,,,,,,Stable,Stable,POINT (1064439.197 103628.702),Estuaire,965.390719
120,120,120,157707,157707.0,157707,11.698154,-2.402482,POINT(11.698154 -2.402482),1,,...,,,,,,Stable,Stable,POINT (1302232.547 -265731.63),Ngounié,4152.75296
122,122,122,156677,156677.0,156677,11.663324,-2.431369,POINT(11.663324 -2.431369),1,,...,,,,,,Stable,Stable,POINT (1298355.289 -268928.672),Ngounié,9154.509766
270,270,270,225142,225142.0,225142,14.417681,-1.912686,POINT(14.417681 -1.912686),1,,...,,,,,,Stable,Stable,POINT (1604968.907 -211533.688),Haut-Ogooué,3765.237586
285,285,285,65607,65607.0,65607,9.845223,-2.614749,POINT(9.845223 -2.614749),1,,...,,,,,,Stable,Stable,POINT (1095965.211 -289225.73),Ogooué-Maritime,2493.522277


In [18]:
joined_data[admin_name].value_counts(dropna=False)

shapeName
Haut-Ogooué        1799
Ngounié            1500
Ogooué-Ivindo      1286
Ogooué-Maritime    1127
Woleu-Ntem         1084
Nyanga             1022
Estuaire            895
Moyen-Ogooué        809
Ogooué-Lolo         793
Name: count, dtype: int64

In [19]:
province_pts = pd.pivot_table(joined_data,values='plotid',index=[admin_name],columns=['collection'],aggfunc="count",margins=True,
                             margins_name='Total',dropna=False)
province_pts

collection,coll_1522_1,coll_1522_2,coll_1522_3,coll_1522_4,coll_1522_5,coll_2023_1,coll_2023_2,coll_2023_3,Total
shapeName,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
Estuaire,41.0,282.0,228.0,,162.0,50.0,84.0,48.0,895
Haut-Ogooué,62.0,707.0,,3.0,617.0,67.0,170.0,173.0,1799
Moyen-Ogooué,41.0,249.0,3.0,2.0,353.0,45.0,88.0,28.0,809
Ngounié,66.0,510.0,,9.0,642.0,69.0,155.0,49.0,1500
Nyanga,45.0,360.0,,,340.0,53.0,138.0,86.0,1022
Ogooué-Ivindo,80.0,400.0,,9.0,607.0,72.0,81.0,37.0,1286
Ogooué-Lolo,58.0,262.0,,9.0,383.0,40.0,28.0,13.0,793
Ogooué-Maritime,38.0,412.0,118.0,4.0,337.0,47.0,121.0,50.0,1127
Woleu-Ntem,68.0,342.0,,10.0,514.0,54.0,68.0,28.0,1084
Total,499.0,3524.0,349.0,46.0,3955.0,497.0,933.0,512.0,10315


In [20]:
# Remove rows where the 'Province' column has NaN values
joined_data = joined_data.dropna(subset=[admin_name])

In [21]:
len(joined_data)

10315

In [22]:
poly_shp = poly_shp.to_crs("EPSG:3395")

In [23]:
#calculate area of poly
poly_shp['area_sqm'] = poly_shp.geometry.area

In [24]:
poly_shp['area_ha'] = poly_shp['area_sqm'] / 10000

In [25]:
pd.set_option('display.float_format', lambda x: '%.0f' % x)

# Display the GeoDataFrame with the new area columns
print(poly_shp[[admin_name, 'area_ha']])

         shapeName  area_ha
0       Woleu-Ntem  3674113
1         Estuaire  1965458
2     Moyen-Ogooué  1797702
3      Ogooué-Lolo  2932509
4      Haut-Ogooué  3377135
5          Ngounié  3752265
6  Ogooué-Maritime  2227467
7           Nyanga  1956576
8    Ogooué-Ivindo  4243471


In [26]:
# Rename columns
column_mapping = {
   admin_name: new_name
#    'NAME_2': 'Territoire',
#    'Unnamed: 0':'Index'
}

# Use the rename() method to rename columns
joined_data.rename(columns=column_mapping, inplace=True)

In [27]:
list_columns = joined_data.columns.tolist()

# Display the list of column names
print(list_columns)

['UID', 'ID', 'plotid', 'point_id', 'sampleid', 'lon', 'lat', 'sample_geom', 'Ref_FNF_2015', 'Ref_Regeneration', 'Ref_Change_1522', 'Ref_Change_Type_1522', 'ArtFor', 'ArtMine', 'InfraR', 'Urb', 'IndFor', 'Other', 'Other_Desc', 'IndMine', 'IndAg', 'ArtAg', 'Ref_Year_1522', 'Ref_NFtype_2015', 'Ref_Ftype_2015', 'collection', 'interpreted', 'Ref_LCover_2015', 'Ref_Change_Year_1522', 'Ref_Change_LCover_1522', 'Def2016', 'Def2017', 'Def2018', 'Def2019', 'Def2020', 'Def2021', 'Def2022', 'Deg2016', 'Deg2017', 'Deg2018', 'Deg2019', 'Deg2020', 'Deg2021', 'Deg2022', 'Defall', 'Degall', 'Stable', 'NF', 'DensFor', 'DensDryFor', 'SecFor', 'DryOpenFor', 'Mangrove', 'Swamp', 'Gallery', 'Plantation', 'Woodland', 'Shrubland', 'Grassland', 'Aquatic', 'Bare', 'Cultivated', 'Builtup', 'Water', 'DensFor_Def', 'DensDryFor_Def', 'SecFor_Def', 'DryOpenFor_Def', 'Mangrove_Def', 'Swamp_Def', 'Gallery_Def', 'Plantation_Def', 'Woodland_Def', 'DensFor_Deg', 'DensDryFor_Deg', 'SecFor_Deg', 'DryOpenFor_Deg', 'Mangrov

In [28]:
joined_data = joined_data.to_crs("EPSG:4326")

In [27]:
# isave output
out_file_shp_att =  '/home/sepal-user/module_results/esbae/DRC/DRC_all_ceo_2015_2022_2023_prov.csv'
joined_data.to_csv(out_file_shp_att,index=False)

#### assign strata from GEE asset

In [29]:
gdf = joined_data

In [30]:
# Assuming df is your DataFrame - check for unique values
if gdf['UID'].nunique() == len(gdf):
    print("UID has entirely unique values.")
else:
    print("UID does not have entirely unique values.")

if gdf['ID'].nunique() == len(gdf):
    print("ID has entirely unique values.")
else:
    print("ID does not have entirely unique values.")

UID has entirely unique values.
ID does not have entirely unique values.


In [31]:
# select 2022-2023 strata asset you want to sample 
#asset2223 = ee.Image('users/faocongo/sbae/EQG_FNF_kmeans_strat_5_2022_2023')
#asset2223 = ee.Image('users/faocongo/sbae/CAF_FNF_kmeans_strat_5_2022_2023')
asset2223 = ee.Image('users/faocongo/sbae/GAB_FNF_kmeans_strat_5_2022_2023')
#asset2223 = ee.Image('users/faocongo/sbae/COD_FNF_kmeans_strat_5_2022_2023')
#asset2223 = ee.Image('users/faocongo/sbae/CMR_FNF_kmeans_strat_5_2022_2023')

In [32]:
# what to rename sampled band to. it is always called first
newName2223 = 'TNT_stratum_2223'

In [33]:
# select 2015-2022 strata asset you want to sample 
#asset1522 = ee.Image('users/faocongo/sbae/EQG_FNF_kmeans_strat_5_2015_2022')
#asset1522 = ee.Image('users/faocongo/sbae/CAF_FNF_kmeans_strat_5_2015_2022')
asset1522 = ee.Image('users/faocongo/sbae/GAB_FNF_kmeans_strat_5_2015_2022')
#asset1522 = ee.Image('users/faocongo/sbae/COD_FNF_kmeans_strat_5_2015_2022')
#asset1522 = ee.Image('users/faocongo/sbae/CMR_FNF_kmeans_strat_5_2015_2022')

In [34]:
newName1522 = 'TNT_stratum_1522'

In [35]:
#resolution of asset
scale = 70

In [36]:
#name of band
band = 'strata'

In [37]:
#sample 2223 stratum
gdf = gdf.reset_index(drop=True)
#with just a few columns
#this can take a little while with many points
dfs = []
for i in range(0, len(points), 5000):
    tmp_df = gdf.loc[i:i+4999]
    tmp_fc = geemap.gdf_to_ee(tmp_df[['UID', 'geometry']])
    tmp_smp = asset2223.select(band).reduceRegions(**{
        'collection': tmp_fc,
        'scale': scale,
        'reducer': ee.Reducer.first()   
    })

    tmp_res = geemap.ee_to_gdf(tmp_smp)
    dfs.append(tmp_res)
    
sampled_df = pd.concat(dfs)

In [38]:
sampled_df

Unnamed: 0,geometry,UID,first
0,POINT (13.99476 0.86988),1,1
1,POINT (13.25409 -0.49294),2,2
2,POINT (14.17552 1.27555),3,1
3,POINT (12.30343 -1.3809),4,1
4,POINT (12.85312 -0.77298),5,1
...,...,...,...
310,POINT (9.05048 -1.37204),10134,1
311,POINT (9.20203 -0.53131),10215,0
312,POINT (9.43469 0.41507),10249,2
313,POINT (13.90183 -2.45927),10259,1


In [40]:
sampled_df = sampled_df.rename(columns={'first':newName2223})
sampled_df.head()

Unnamed: 0,geometry,UID,TNT_stratum_2223
0,POINT (13.99476 0.86988),1,1
1,POINT (13.25409 -0.49294),2,2
2,POINT (14.17552 1.27555),3,1
3,POINT (12.30343 -1.3809),4,1
4,POINT (12.85312 -0.77298),5,1


In [42]:
#sample 1522 stratum
gdf = sampled_df.reset_index(drop=True)
#with just a few columns
#this can take a little while with many points
dfs = []
for i in range(0, len(points), 5000):
    tmp_df = gdf.loc[i:i+4999]
    tmp_fc = geemap.gdf_to_ee(tmp_df[['UID', 'geometry',newName2223]])
    tmp_smp = asset1522.select(band).reduceRegions(**{
        'collection': tmp_fc,
        'scale': scale,
        'reducer': ee.Reducer.first()   
    })

    tmp_res = geemap.ee_to_gdf(tmp_smp)
    dfs.append(tmp_res)
    
resampled_df = pd.concat(dfs)

In [46]:
resampled_df = resampled_df.rename(columns={'first':newName1522})
resampled_df.head()

Unnamed: 0,geometry,TNT_stratum_2223,UID,TNT_stratum_1522
0,POINT (13.99476 0.86988),1,1,3
1,POINT (13.25409 -0.49294),2,2,1
2,POINT (14.17552 1.27555),1,3,2
3,POINT (12.30343 -1.3809),1,4,2
4,POINT (12.85312 -0.77298),1,5,1


In [47]:
# recover columns
# Step 1: Merge the two DataFrames on the 'ID' column (assuming 'ID' is the common key)
merged_df = pd.merge(resampled_df, joined_data, on='UID', how='left', suffixes=('', '_dup'))

# Step 2: Remove duplicate columns
# Here, we assume that any duplicate column from df2 will have the '_dup' suffix added by the merge
# We will keep the columns from df1 and drop the duplicate ones from df2.
merged_df = merged_df.loc[:, ~merged_df.columns.str.endswith('_dup')]
merged_df.head()

Unnamed: 0,geometry,TNT_stratum_2223,UID,TNT_stratum_1522,ID,plotid,point_id,sampleid,lon,lat,...,Ref_Ftype_2022,Ref_LCover_2022,Ref_Change_Year_2023,Ref_Change_LCover_2023,Def2023,Deg2023,interpreted_qc,Ref_Change_Type_1523,Ref_Change_Year_1523,Province
0,POINT (13.99476 0.86988),1,1,3,1,187092,187092,187092,14,1,...,,,,,,,,Stable,Stable,Ogooué-Ivindo
1,POINT (13.25409 -0.49294),2,2,1,2,185303,185303,185303,13,0,...,,,,,,,,Stable,Stable,Ogooué-Lolo
2,POINT (14.17552 1.27555),1,3,2,3,185899,185899,185899,14,1,...,,,,,,,,Stable,Stable,Ogooué-Ivindo
3,POINT (12.30343 -1.3809),1,4,2,4,161928,161928,161928,12,-1,...,,,,,,,,Stable,Stable,Ogooué-Lolo
4,POINT (12.85312 -0.77298),1,5,1,5,173884,173884,173884,13,-1,...,,,,,,,,Stable,Stable,Ogooué-Lolo


In [48]:
merged_df_cols = merged_df.columns.tolist()
# Display the list of column names
print(merged_df_cols)

['geometry', 'TNT_stratum_2223', 'UID', 'TNT_stratum_1522', 'ID', 'plotid', 'point_id', 'sampleid', 'lon', 'lat', 'sample_geom', 'Ref_FNF_2015', 'Ref_Regeneration', 'Ref_Change_1522', 'Ref_Change_Type_1522', 'ArtFor', 'ArtMine', 'InfraR', 'Urb', 'IndFor', 'Other', 'Other_Desc', 'IndMine', 'IndAg', 'ArtAg', 'Ref_Year_1522', 'Ref_NFtype_2015', 'Ref_Ftype_2015', 'collection', 'interpreted', 'Ref_LCover_2015', 'Ref_Change_Year_1522', 'Ref_Change_LCover_1522', 'Def2016', 'Def2017', 'Def2018', 'Def2019', 'Def2020', 'Def2021', 'Def2022', 'Deg2016', 'Deg2017', 'Deg2018', 'Deg2019', 'Deg2020', 'Deg2021', 'Deg2022', 'Defall', 'Degall', 'Stable', 'NF', 'DensFor', 'DensDryFor', 'SecFor', 'DryOpenFor', 'Mangrove', 'Swamp', 'Gallery', 'Plantation', 'Woodland', 'Shrubland', 'Grassland', 'Aquatic', 'Bare', 'Cultivated', 'Builtup', 'Water', 'DensFor_Def', 'DensDryFor_Def', 'SecFor_Def', 'DryOpenFor_Def', 'Mangrove_Def', 'Swamp_Def', 'Gallery_Def', 'Plantation_Def', 'Woodland_Def', 'DensFor_Deg', 'DensD

In [49]:
#drop any columns if needed
columns_to_drop = ['sample_geom']
merged_df = merged_df.drop(columns_to_drop, axis=1)

In [50]:
# Using loc to replace only rows where 'stratum' is 0
merged_df.loc[merged_df['TNT_stratum_1522'] == 0, 'TNT_stratum_1522'] = 1

In [52]:
# Using loc to replace only rows where 'stratum' is 0
merged_df.loc[merged_df['TNT_stratum_2223'] == 0, 'TNT_stratum_2223'] = 1

In [53]:
merged_df['TNT_stratum_1522'] = merged_df['TNT_stratum_1522'].fillna(1)
merged_df['TNT_stratum_2223'] = merged_df['TNT_stratum_2223'].fillna(1)

In [56]:
print(merged_df.dtypes)

geometry                geometry
TNT_stratum_2223         float64
UID                        int64
TNT_stratum_1522         float64
ID                         int64
                          ...   
Deg2023                  float64
interpreted_qc           float64
Ref_Change_Type_1523      object
Ref_Change_Year_1523      object
Province                  object
Length: 109, dtype: object


In [57]:
pd.pivot_table(
    merged_df,
    values='UID',
    index=['TNT_stratum_1522'],
    columns=['collection'],
    aggfunc="count",
    margins=True,              # Adds totals
    margins_name='Total',      # Name of the margins/total column
    dropna=False               # Include NaN values in the table
)

collection,coll_1522_1,coll_1522_2,coll_1522_3,coll_1522_4,coll_1522_5,coll_2023_1,coll_2023_2,coll_2023_3,Total
TNT_stratum_1522,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
1,329,2236,208,28.0,2649,329,516,222,6517
2,112,644,69,13.0,738,99,125,82,1882
3,43,320,39,5.0,318,46,130,85,986
4,11,227,21,,158,18,119,78,632
5,4,97,12,,92,5,43,45,298
Total,499,3524,349,46.0,3955,497,933,512,10315


In [58]:
pd.pivot_table(
    merged_df,
    values='UID',
    index=['TNT_stratum_2223'],
    columns=['collection'],
    aggfunc="count",
    margins=True,              # Adds totals
    margins_name='Total',      # Name of the margins/total column
    dropna=False               # Include NaN values in the table
)

collection,coll_1522_1,coll_1522_2,coll_1522_3,coll_1522_4,coll_1522_5,coll_2023_1,coll_2023_2,coll_2023_3,Total
TNT_stratum_2223,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
1,365,2548,220,38,2770,361,549,315,7166
2,73,534,51,4,670,67,134,77,1610
3,43,286,48,2,342,48,146,72,987
4,14,129,28,1,137,18,84,30,441
5,4,27,2,1,36,3,20,18,111
Total,499,3524,349,46,3955,497,933,512,10315


In [87]:
# Remove rows where the stratum has NaN values - outside the country
#merged_df = merged_df.dropna(subset=['TNT_stratum_2223'])

In [59]:
merged_df['sampling'].value_counts(dropna=False)

sampling
ceo_1522    8373
ceo_2023    1942
Name: count, dtype: int64

In [60]:
# Check if the lengths of merged_df and ceo are the same
if len(merged_df) == len(points):
    print("The lengths of merged_df and ceo are the same.")
else:
    print("The lengths of merged_df and ceo are different.")

# Optionally, you can print the lengths for clarity
print("Length of merged_df:", len(merged_df))
print("Length of ceo:", len(points))

The lengths of merged_df and ceo are the same.
Length of merged_df: 10315
Length of ceo: 10315


#### export

In [90]:
out_file =  '/home/sepal-user/module_results/esbae/GAB/GAB_all_ceo_2015_2022_2023_TNTstrat.csv'
#sampled_df.to_csv(out_file,index=False)
merged_df.to_csv(out_file,index=False)