![title](images/header.png)

# Vb - Supervised Classification with CAFI data
-------
This notebook performs a supervised classification of the probability of forest change for one country, using the merged eSBAE and CEO data and a model trained on the interpreted data from CAFI DDD phase I (2015-2020).  
The change probability is then divided into 3 strata using k-means.  
The modeled data are exported to CSV for the following script, which extracts samples for validation in CEO.

This script requires an m16 or r16 instance to run
###### For more information contact aurelie.shapiro@fao.org or remi.dannunzio@fao.org

In [1]:
from pathlib import Path

import numpy as np
import pandas as pd
import geopandas as gpd
import uuid
from matplotlib import pyplot as plt

from sampling_handler.misc import py_helpers
from sampling_handler.ensemble import classification as clf
from sampling_handler.ensemble import helpers as h
from sampling_handler.sampling import sfc
from sampling_handler import KMeansSubSampling


import geopandas
import rasterio
import matplotlib.pyplot as plt
from shapely.geometry import Point

  warn("cupy is not available in this environment, GPU fonctionnalities won't be available")


### Parameters

In [2]:
# three-letter code of the country being processed (presumably ISO 3166-1 alpha-3 — confirm)
ISO = 'CMR'

#### Import the CEO validated data from CAFI DDD first phase - output from script 5a
##### this file includes 13,115 points, randomly distributed over CAFI, with interpretation
##### additional attributes from CAFI rasters have been added (fragmentation, tree cover)
##### this file is downloaded when you pull the GitHub repository: https://github.com/aurelgrooves/CAFI_DDD

In [3]:
# CEO-interpreted phase I points, read from the CSV shipped with the cloned repository
ceo_phase1_csv = '/home/sepal-user/CAFI_DDD/CAFI_esbae_ceo_phaseI_att.csv'
trn_ceo = pd.read_csv(ceo_phase1_csv, sep=',')
trn_ceo

Unnamed: 0,geometry,UniqueID,dates,ts,images,mon_images,bfast_change_date,bfast_magnitude,bfast_means,cusum_change_date,...,sampling,ECO_CODE,ECO_NAME,ECO_NUM,ECODE_NAME,TropBiome,CAFI_LC_2015,Frag_2015,Treecov_2015,Biomass_2010
0,POINT (25.60212035 -11.69188784),1,"['20100314', '20100501', '20100509', '20100517...","{'green': [438.0061079394925, 444.750798915826...",181,97,2018.589,-738.678406,-1.354781,2016.597,...,str_random,AT0704,Central Zambezian Miombo Woodlands,4.0,AT0704. Central Zambezian Miombo woodlands,2.0,2,5,52,94
1,POINT (19.01297774 6.404943059),2,"['20100211', '20100331', '20101110', '20110318...","{'green': [605.875099198608, 568.6520036652914...",153,90,2016.381,-1332.806396,-1.765601,2020.038,...,str_random,AT0712,Northern Congolian Forest-Savanna Mosaic,12.0,AT0712. Northern Congolian forest-savanna mosaic,2.0,9,3,54,194
2,POINT (22.79183081 7.078410027),3,"['20100105', '20100129', '20101020', '20101129...","{'green': [497.9944621895307, 644.717567057849...",158,93,2016.915,-2419.585449,-2.557598,2020.003,...,str_random,AT0705,East Sudanian Savanna,5.0,AT0705. East Sudanian savanna,2.0,11,2,33,9
3,POINT (27.43010212 -6.534839454),4,"['20100211', '20100510', '20100518', '20101001...","{'green': [463.976386765175, 447.3358682219745...",129,60,-1.000,0.000000,0.000000,2017.430,...,str_random,AT0704,Central Zambezian Miombo Woodlands,4.0,AT0704. Central Zambezian Miombo woodlands,2.0,11,2,28,52
4,POINT (22.21214796 -9.940981517),5,"['20100429', '20100515', '20101006', '20101115...","{'green': [370.4567016055455, 385.308451001878...",140,79,2018.342,13.038783,-0.582325,2018.518,...,str_random,AT0704,Central Zambezian Miombo Woodlands,4.0,AT0704. Central Zambezian Miombo woodlands,2.0,2,3,51,87
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13110,POINT (18.90329319 4.178378347),13112,"['20100211', '20100331', '20100518', '20110318...","{'green': [674.2453004364089, 653.694492020193...",123,80,-1.000,0.000000,0.000000,2016.468,...,str_random,AT0124,Northeastern Congolian Lowland Forests,24.0,AT0124. Northeastern Congolian lowland forests,1.0,1,2,41,38
13111,POINT (17.17745164 -5.075794729),13113,"['20100119', '20100401', '20101002', '20101119...","{'green': [344.4054229887222, 539.972540387191...",113,63,2017.433,-750.342834,-1.426068,2018.485,...,str_random,AT0718,Southern Congolian Forest-Savanna Mosaic,18.0,AT0718. Southern Congolian forest-savanna mosaic,2.0,1,2,59,39
13112,POINT (21.96690965 3.213589406),13114,"['20100129', '20100529', '20100809', '20100817...","{'green': [733.2082639924525, 385.167344664216...",116,67,2017.219,-84.756882,-0.863496,2018.118,...,str_random,AT0124,Northeastern Congolian Lowland Forests,24.0,AT0124. Northeastern Congolian lowland forests,1.0,1,2,70,392
13113,POINT (27.81844527 -3.425142538),13115,"['20100117', '20100202', '20100330', '20100509...","{'green': [327.6101261002253, 342.841394483886...",134,71,2016.816,362.227112,0.848210,2020.036,...,str_random,AT0124,Northeastern Congolian Lowland Forests,24.0,AT0124. Northeastern Congolian lowland forests,1.0,1,2,75,180


In [4]:
# list every column name available in the training table
print(trn_ceo.columns.values)

['geometry' 'UniqueID' 'dates' 'ts' 'images' 'mon_images'
 'bfast_change_date' 'bfast_magnitude' 'bfast_means' 'cusum_change_date'
 'cusum_confidence' 'cusum_magnitude' 'red_mean' 'red_sd' 'red_min'
 'red_max' 'nir_mean' 'nir_sd' 'nir_min' 'nir_max' 'swir1_mean' 'swir1_sd'
 'swir1_min' 'swir1_max' 'swir2_mean' 'swir2_sd' 'swir2_min' 'swir2_max'
 'ndfi_mean' 'ndfi_sd' 'ndfi_min' 'ndfi_max' 'brightness_mean'
 'brightness_sd' 'brightness_min' 'brightness_max' 'greenness_mean'
 'greenness_sd' 'greenness_min' 'greenness_max' 'wetness_mean'
 'wetness_sd' 'wetness_min' 'wetness_max' 'bs_slope_mean' 'bs_slope_sd'
 'bs_slope_max' 'bs_slope_min' 'ewma_jrc_date' 'ewma_jrc_change'
 'ewma_jrc_magnitude' 'mosum_jrc_date' 'mosum_jrc_change'
 'mosum_jrc_magnitude' 'cusum_jrc_date' 'cusum_jrc_change'
 'cusum_jrc_magnitude' 'ccdc_change_date' 'ccdc_magnitude' 'aspect'
 'dw_class_mode' 'dw_tree_prob__max' 'dw_tree_prob__min'
 'dw_tree_prob__stdDev' 'dw_tree_prob_mean' 'elevation' 'esa_lc20'
 'esa_lc21' '

In [None]:
# optional: keep only the training points whose TropBiome equals the chosen biome
biome = 2
in_biome = trn_ceo['TropBiome'] == biome
trn_ceo = trn_ceo[in_biome]

In [5]:
# build point geometries from the LON/LAT columns of the training table;
# no CRS is assigned to the resulting GeoDataFrame here
ceo_points = gpd.points_from_xy(x=trn_ceo['LON'], y=trn_ceo['LAT'])
trn_gdf = gpd.GeoDataFrame(trn_ceo, geometry=ceo_points)

### Augmented dataset for country of interest (2015-2022)
#### this is the output folder from eSBAE script 4

In [6]:
# Collect every GeoJSON file in the eSBAE data-augmentation output folder of the
# country set in ISO, instead of toggling hard-coded per-country paths by hand
# (country folders used so far in this notebook: GAB, DRC, COG, EQG, CMR, CAR).
# sorted() turns the one-shot glob generator into a list with a stable order, so
# the variable can be inspected and the file order does not depend on the filesystem.
augmentation_dir = Path('/home/sepal-user/module_results/esbae') / ISO / '04_Dataset_Augmentation' / 'Landsat'
files = sorted(augmentation_dir.glob('*geojson'))

In [7]:
# output file: eSBAE data for your country with a modeled change probability using CAFI phase I data
# (folder and file prefix follow ISO, so they always match the country being processed)
all_points_out_file = f'/home/sepal-user/module_results/esbae/{ISO}/{ISO}_esbae_1520_model.csv'

In [None]:
# combined eSBAE data are saved here in parquet format
# The path is derived from ISO: the previous hard-coded DRC path did not match the
# country whose files are read above, so the data landed in another country's folder.
parquet_out = f'/home/sepal-user/module_results/esbae/{ISO}/{ISO}_esbae_biome2.parquet'

In [None]:
# eSBAE data with the additional attributes sampled from raster data are saved here
# The path is derived from ISO: the previous hard-coded DRC path did not match the
# country whose files are read above, so the data landed in another country's folder.
esbae_att_out = f'/home/sepal-user/module_results/esbae/{ISO}/{ISO}_esbae_1522_att_biome2.csv'

In [None]:
# eSBAE columns to include in data export
# ('dates' and 'ts' are deliberately left out of this list)
cols_to_export = [
    'geometry', 'UniqueID',
    'images', 'mon_images',
    # bfast_* and cusum_* columns
    'bfast_change_date', 'bfast_magnitude', 'bfast_means',
    'cusum_change_date', 'cusum_confidence', 'cusum_magnitude',
    # mean / sd / min / max for every band and index, in that order
    *[
        f'{band}_{stat}'
        for band in ('red', 'nir', 'swir1', 'swir2', 'ndfi',
                     'brightness', 'greenness', 'wetness')
        for stat in ('mean', 'sd', 'min', 'max')
    ],
    # bs_slope_* lists max before min, so it is spelled out
    'bs_slope_mean', 'bs_slope_sd', 'bs_slope_max', 'bs_slope_min',
    # date / change / magnitude for each *_jrc_* group
    *[
        f'{algo}_jrc_{field}'
        for algo in ('ewma', 'mosum', 'cusum')
        for field in ('date', 'change', 'magnitude')
    ],
    'ccdc_change_date', 'ccdc_magnitude', 'aspect',
    'dw_class_mode', 'dw_tree_prob__max', 'dw_tree_prob__min',
    'dw_tree_prob__stdDev', 'dw_tree_prob_mean', 'elevation',
    'esa_lc20', 'esa_lc21',
    *[f'esri_lc{yy}' for yy in range(17, 22)],
    'gfc_gain', 'gfc_loss', 'gfc_lossyear', 'gfc_tc00',
    'lang_tree_height', 'potapov_tree_height', 'slope',
    *[f'tmf_{year}' for year in range(2016, 2021)],
    'tmf_defyear', 'tmf_degyear', 'tmf_main', 'tmf_sub',
    # attributes sampled from the auxiliary rasters
    'CAFI_LC_2015', 'Frag_2015', 'Treecov_2015', 'Biomass_2010',
]

#### auxiliary data to further augment data

In [None]:
# land-cover raster; opened with rasterio further down and sampled into the CAFI_LC_2015 column
LC_map = "/home/sepal-user/data/lc_map/CAFI_LC_2015_19_wgs84.tif"

In [None]:
# raster sampled into the Frag_2015 column further down
frag_map = "/home/sepal-user/data/other_data/cafi_frag_wgs84.tif"

In [None]:
# raster sampled into the Treecov_2015 column further down
treecov = "/home/sepal-user/data/other_data/treecover_2015_1ha_wgs84.tif"

In [None]:
# raster sampled into the Biomass_2010 column further down
biomass = "/home/sepal-user/data/other_data/ESA_Biomass_2010.tif"

In [None]:
# polygon shapefile that is spatially joined to the eSBAE points further down
# (presumably the source of the TropBiome attribute — confirm)
biomes = "/home/sepal-user/data/other_data/CAFI_tnc_ecoregions.shp"

#### end of parameters

# 1 - CEO data preparation
### 1.1 - Import CEO interpreted data

In [None]:
# number of training points
trn_gdf.shape[0]

### 1.2 - Inspect CEO interpreted data

Our ultimate goal is to use the CEO data to get the probability of forest change via a series of classifications and their probability outputs, i.e. a Forest/Non-Forest and a Change/No-Change classification.
Therefore we need to extract and transform the relevant information into columns of 0s and 1s. 

First let's look at the columns the CEO file contains.

In [None]:
trn_gdf.columns.tolist()

In [None]:
trn_gdf['UniqueID'].nunique(dropna=False)

### 1.3 Print unique values for columns of interest and generate Change/No-Change labels

In [None]:
# show unique values for the columns (in red)
print('Change', trn_gdf['Ref_Change_Type_1520'].unique())

In [None]:
# add stable forest column for classification
trn_gdf['Change_Bin'] = trn_gdf['Ref_Change_Type_1520'].apply(lambda x: 1 if x == 'Def' or  x == 'Deg' else 0)
np.unique(trn_gdf['Change_Bin'], return_counts=True)

In [None]:
# add stable forest column for classification
trn_gdf['Stable_Bin'] = trn_gdf['Ref_Change_Type_1520'].apply(lambda x: 1 if x == 'Stable' else 0)
np.unique(trn_gdf['Stable_Bin'], return_counts=True)

In [None]:
# add def column for classification
trn_gdf['Def_Bin'] = trn_gdf['Ref_Change_Type_1520'].apply(lambda x: 1 if x == 'Def' else 0)
np.unique(trn_gdf['Def_Bin'], return_counts=True)

In [None]:
# add def column for classification
trn_gdf['Deg_Bin'] = trn_gdf['Ref_Change_Type_1520'].apply(lambda x: 1 if x == 'Deg' else 0)
np.unique(trn_gdf['Deg_Bin'], return_counts=True)

In [None]:
# add NF column for classification
trn_gdf['NF_Bin'] = trn_gdf['Ref_Change_Type_1520'].apply(lambda x: 1 if x == 'NF' else 0)
np.unique(trn_gdf['NF_Bin'], return_counts=True)

In [None]:
trn_gdf.head()

In [None]:
# Count the missing values of every column of the training table
merged_columns = trn_gdf.columns.tolist()
missing_counts = {name: trn_gdf[name].isna().sum() for name in merged_columns}

# Print the counts
print(missing_counts)

# 2 - Join CEO and Time-Series data

### 2.1 - Load data augmented time-series data from script 4
area must intersect with the CAFI region

In [None]:
# one job per file: [path as string, False]
# NOTE(review): the meaning of the False flag is defined by py_helpers.geojson_to_gdf — confirm there
files = [[str(geojson_path), False] for geojson_path in files]

# read the files in parallel (4 worker processes) and collect the results in a list
gdf_parts = py_helpers.run_in_parallel(
    py_helpers.geojson_to_gdf, files, workers=4, parallelization='processes'
)

# stack the per-file results into a single frame
cdf = pd.concat(gdf_parts)

In [None]:
# save the combined eSBAE frame as parquet so the GeoJSON files do not have to be read again
cdf.to_parquet(parquet_out)

In [None]:
# if you already ran the steps above, you can start from the parquet file instead
cdf = gpd.read_parquet(parquet_out)

In [None]:
# column names of the combined eSBAE frame
all_columns = cdf.columns.tolist()

# Print the list of columns
print(all_columns)

In [None]:
# number of eSBAE points
len(cdf.index)

In [None]:
# column names of the training table, for comparison with the eSBAE columns printed above
print(trn_gdf.columns)

In [None]:
# Count the missing values of every column of the eSBAE table
merged_columns = cdf.columns.tolist()
missing_counts = {name: cdf[name].isna().sum() for name in merged_columns}

# Print the counts
print(missing_counts)

### 2.2 - add raster and polygon attributes

In [None]:
# Read the polygon shapefile using gpd.read_file()
poly_shp = gpd.read_file(biomes)
poly_shp.head()

In [None]:
# show the CRS the polygons are stored in
poly_crs = poly_shp.crs
print("Current CRS:", poly_crs)

In [None]:
# reproject the polygons to the CRS of the eSBAE points before the spatial join
poly_shp = poly_shp.to_crs(cdf.crs)

In [None]:
# Perform the spatial join: attach to every eSBAE point the attributes of the
# polygon it lies within; how="left" keeps the points that match no polygon.
# `predicate` replaces the `op` keyword, which geopandas deprecated in 0.10 and
# removed in 1.0.
joined_data = gpd.sjoin(cdf, poly_shp, how="left", predicate="within", lsuffix='left', rsuffix='right')

In [None]:
# number of points per TropBiome value, including points with no value (NaN)
joined_data['TropBiome'].value_counts(dropna=False)

In [None]:
# points without a TropBiome value are assigned 2
# NOTE(review): the reason for choosing 2 as the fallback is not stated in the notebook
joined_data['TropBiome'] = joined_data['TropBiome'].fillna(2)

In [None]:
# columns brought in by the spatial join that are removed again (adapt the list if needed)
columns_to_drop = [
    'index_right', 'ECO_ID_U', 'CLS_CODE', 'ECO_NOTES',
    'WWF_REALM', 'WWF_REALM2', 'WWF_MHTNUM', 'WWF_MHTNAM',
    'RealmMHT', 'ER_UPDATE', 'ER_DATE_U', 'ER_RATION',
    'SOURCEDATA', 'Shape__Are', 'Shape__Len', 'GlobalID',
]
cdf = joined_data.drop(columns=columns_to_drop)

In [None]:
# open the four auxiliary rasters so they can be sampled at the point locations below
# NOTE(review): none of these datasets is closed in the visible part of the notebook
LC_src = rasterio.open(LC_map)

In [None]:
frag_src = rasterio.open(frag_map)

In [None]:
tree_src = rasterio.open(treecov)

In [None]:
biomass_src =  rasterio.open(biomass)

In [None]:
# (x, y) tuple of every eSBAE point, in row order, used to sample the rasters
coord_list = list(zip(cdf["geometry"].x, cdf["geometry"].y))

In [None]:
cdf["CAFI_LC_2015"] = [x for x in LC_src.sample(coord_list)]
cdf["CAFI_LC_2015"] = cdf["CAFI_LC_2015"].str[0]
cdf.head()

In [None]:
cdf["Frag_2015"] = [x for x in frag_src.sample(coord_list)]
cdf["Frag_2015"] = cdf["Frag_2015"].str[0]
cdf.head()

In [None]:
cdf["Treecov_2015"] = [x for x in tree_src.sample(coord_list)]
cdf["Treecov_2015"] = cdf["Treecov_2015"].str[0]
cdf.head()

In [None]:
cdf["Biomass_2010"] = [x for x in biomass_src.sample(coord_list)]
cdf["Biomass_2010"] = cdf["Biomass_2010"].str[0]
cdf.head()

In [None]:
# column names after adding the polygon and raster attributes
print(cdf.columns)

In [None]:
# export cdf file to avoid having to resample all the time (the row index is not written)
cdf.to_csv(esbae_att_out, index=False)

### 2.3 - Make a union with training data

In [None]:
# if raster attributes were already calculated
#cdf_file = gpd.read_file('/home/sepal-user/module_results/esbae/COG/COG_esbae_1522_att.csv',delimiter=',')

In [None]:
# optional: keep only the eSBAE points whose TropBiome equals `biome`
# (`biome` is set in the optional training-data filter cell near the top)
cdf_in_biome = cdf['TropBiome'] == biome
cdf = cdf[cdf_in_biome]

In [None]:
# stack the eSBAE points and the CEO training points into one table with a fresh 0..n-1 index;
# a column that exists in only one of the two inputs is NaN for the rows of the other
db = pd.concat([cdf, trn_gdf], ignore_index=True)

In [None]:
# total number of rows
len(db.index)

In [None]:
db.head()

In [None]:
print(db.columns.values)

In [None]:
# sanity check: the union has exactly the rows of both inputs
len(cdf.index)+len(trn_gdf.index) == len(db.index)

#### check for NAs

In [None]:
# Count the missing values of every export column in the combined table
missing_counts = {name: db[name].isna().sum() for name in cols_to_export}

# Print the counts
print(missing_counts)

In [None]:
# Specify the columns in which NaN is replaced with 0 (described as categorical data)
# NOTE(review): the list also holds columns that look continuous by name
# (elevation, slope, aspect, tree heights) — confirm that 0 is the intended fill for them
columns_to_fill = ['gfc_lossyear', 'gfc_gain','gfc_loss', 'lang_tree_height', 'esa_lc20', 'esa_lc21', 'esri_lc20',  'esri_lc21' , 'esri_lc19', 'esri_lc18', 'esri_lc17', 
                   'potapov_tree_height','elevation','slope', 'tmf_2016', 'tmf_2017', 'tmf_2018', 'tmf_2019', 'tmf_2020', 'tmf_defyear', 'tmf_degyear', 'tmf_main', 'tmf_sub', 'aspect']

# Set NaN values in the selected columns to 0
db[columns_to_fill] = db[columns_to_fill].fillna(0)

In [None]:
# Specify the columns to impute with their column mean (treated as continuous data)
# NOTE(review): 'dw_class_mode' looks like a class label by name — confirm a mean is meaningful for it
columns_to_impute = ['dw_class_mode', 'dw_tree_prob__max', 'dw_tree_prob__stdDev','dw_tree_prob_mean', 'dw_tree_prob__min']

# Impute NaN values with the mean of each column, written back with one explicit assignment.
# The former `db[column].fillna(mean_value, inplace=True)` operated on a column
# selection: pandas flags this as chained assignment (FutureWarning in 2.x) and
# under copy-on-write it no longer changes `db` at all.
column_means = db[columns_to_impute].mean()
db[columns_to_impute] = db[columns_to_impute].fillna(column_means)

### 2.4 Check the amount of ground truth data we have for each class

In [None]:
# label counts in the combined table; NaN rows have no label
# (presumably the eSBAE points, which carry no CEO interpretation — confirm)
db['Change_Bin'].value_counts(dropna=False)

In [None]:
db['Def_Bin'].value_counts(dropna=False)

In [None]:
db['Deg_Bin'].value_counts(dropna=False)

## 3. Run change/no-change classification

In [None]:
# eSBAE columns for analysis, i.e. the predictor columns handed to the classifier.
# 'geometry', 'point_id', 'dates' and 'ts' are deliberately not part of the list.
eSBAE_analysis = (
    ['images', 'mon_images']
    # bfast_* and cusum_* columns
    + ['bfast_change_date', 'bfast_magnitude', 'bfast_means']
    + ['cusum_change_date', 'cusum_confidence', 'cusum_magnitude']
    # mean / sd / min / max per band or index
    + ['red_mean', 'red_sd', 'red_min', 'red_max']
    + ['nir_mean', 'nir_sd', 'nir_min', 'nir_max']
    + ['swir1_mean', 'swir1_sd', 'swir1_min', 'swir1_max']
    + ['swir2_mean', 'swir2_sd', 'swir2_min', 'swir2_max']
    + ['ndfi_mean', 'ndfi_sd', 'ndfi_min', 'ndfi_max']
    + ['brightness_mean', 'brightness_sd', 'brightness_min', 'brightness_max']
    + ['greenness_mean', 'greenness_sd', 'greenness_min', 'greenness_max']
    + ['wetness_mean', 'wetness_sd', 'wetness_min', 'wetness_max']
    + ['bs_slope_mean', 'bs_slope_sd', 'bs_slope_max', 'bs_slope_min']
    # *_jrc_* columns
    + ['ewma_jrc_date', 'ewma_jrc_change', 'ewma_jrc_magnitude']
    + ['mosum_jrc_date', 'mosum_jrc_change', 'mosum_jrc_magnitude']
    + ['cusum_jrc_date', 'cusum_jrc_change', 'cusum_jrc_magnitude']
    + ['ccdc_change_date', 'ccdc_magnitude', 'aspect']
    # dw_*, esa_*, esri_*, gfc_* columns, heights and terrain
    + ['dw_class_mode', 'dw_tree_prob__max', 'dw_tree_prob__min']
    + ['dw_tree_prob__stdDev', 'dw_tree_prob_mean', 'elevation']
    + ['esa_lc20', 'esa_lc21']
    + ['esri_lc17', 'esri_lc18', 'esri_lc19', 'esri_lc20', 'esri_lc21']
    + ['gfc_gain', 'gfc_loss', 'gfc_lossyear', 'gfc_tc00']
    + ['lang_tree_height', 'potapov_tree_height', 'slope']
    # tmf_* columns
    + ['tmf_2016', 'tmf_2017', 'tmf_2018', 'tmf_2019', 'tmf_2020']
    + ['tmf_defyear', 'tmf_degyear', 'tmf_main', 'tmf_sub']
    # attributes sampled from the auxiliary rasters
    + ['CAFI_LC_2015', 'Frag_2015', 'Treecov_2015', 'Biomass_2010']
)

In [None]:
# change / no-change model: target Change_Bin ('Def' or 'Deg'), predictors eSBAE_analysis;
# the returned probabilities are stored as a new column of db
# NOTE(review): how training and prediction rows are separated, and what outlier=False
# switches off, is defined inside clf.binary_probability_classification — confirm there
db['chg_prob_cafi1520'] = clf.binary_probability_classification(db, 'Change_Bin', eSBAE_analysis, outlier=False)

In [None]:
# deforestation model: target Def_Bin, same predictors
db['def_prob_cafi1520'] = clf.binary_probability_classification(db, 'Def_Bin', eSBAE_analysis, outlier=False)

In [None]:
# degradation model: target Deg_Bin, same predictors
db['deg_prob_cafi1520'] = clf.binary_probability_classification(db, 'Deg_Bin', eSBAE_analysis, outlier=False)

In [None]:
# stable model: target Stable_Bin, same predictors
db['stable_prob_cafi1520'] = clf.binary_probability_classification(db, 'Stable_Bin', eSBAE_analysis, outlier=False)

In [None]:
# keep only the rows with a positive point_id
# (presumably the eSBAE points; rows without a point_id fail the comparison — confirm)
has_point_id = db["point_id"] > 0
clf_db = db[has_point_id]

In [None]:
# change probability — left panel: points coloured by probability; right panel: histogram
fig, ax = plt.subplots(1,2, figsize=(15,7))

clf_db.plot('chg_prob_cafi1520', markersize=0.05, ax=ax[0], legend=True, cmap='magma')
clf_db.hist('chg_prob_cafi1520', ax=ax[1])

In [None]:
# deforestation probability — same two panels
fig, ax = plt.subplots(1,2, figsize=(15,7))

clf_db.plot('def_prob_cafi1520', markersize=0.05, ax=ax[0], legend=True, cmap='magma')
clf_db.hist('def_prob_cafi1520', ax=ax[1])

In [None]:
# degradation probability — same two panels
fig, ax = plt.subplots(1,2, figsize=(15,7))

clf_db.plot('deg_prob_cafi1520', markersize=0.05, ax=ax[0], legend=True, cmap='magma')
clf_db.hist('deg_prob_cafi1520', ax=ax[1])

In [None]:
# stable probability — same two panels
fig, ax = plt.subplots(1,2, figsize=(15,7))

clf_db.plot('stable_prob_cafi1520', markersize=0.05, ax=ax[0], legend=True, cmap='magma')
clf_db.hist('stable_prob_cafi1520', ax=ax[1])

In [None]:
# columns available after classification
clf_db.columns

In [None]:
# joined_df is a second name for the same object as clf_db (no copy is made here)
joined_df = clf_db

In [None]:
# number of points that go into the stratification
len(joined_df)

### 3.2 Stratify with Kmeans

In [None]:
# sample size passed to every kmeans_stratifier call below
sample_size_no = 10000

In [None]:
# split the change probability into 3 strata; the call returns the frame and a `samples` object
# NOTE(review): no random seed is passed here — check whether h.kmeans_stratifier is deterministic
joined_df, samples = h.kmeans_stratifier(joined_df, 'chg_prob_cafi1520', strata=3, sample_size=sample_size_no)

In [None]:
# copy the 'stratum' column of the returned frame under a name specific to the change model
joined_df['kmeans_chg'] = joined_df['stratum']

In [None]:
h.plot_cluster(joined_df, prob_column='chg_prob_cafi1520', strata_column='kmeans_chg')

# left panel: points coloured by stratum; right panel: histogram of the stratum values
fig, ax = plt.subplots(1,2, figsize=(15,7))

joined_df.plot('kmeans_chg', markersize=0.15, ax=ax[0], legend=True, cmap='magma')
joined_df.hist('kmeans_chg', ax=ax[1], bins=50)
# display the `samples` object returned by the stratifier
samples

In [None]:
# split the deforestation probability into 3 strata; the call returns the frame and a `samples` object
joined_df, samples = h.kmeans_stratifier(joined_df, 'def_prob_cafi1520', strata=3, sample_size=sample_size_no)

In [None]:
# copy the 'stratum' column of the returned frame under a name specific to the deforestation model
joined_df['kmeans_def'] = joined_df['stratum']

In [None]:
h.plot_cluster(joined_df, prob_column='def_prob_cafi1520', strata_column='kmeans_def')

# left panel: points coloured by stratum; right panel: histogram of the stratum values
fig, ax = plt.subplots(1,2, figsize=(15,7))

joined_df.plot('kmeans_def', markersize=0.15, ax=ax[0], legend=True, cmap='magma')
joined_df.hist('kmeans_def', ax=ax[1], bins=50)
# display the `samples` object returned by the stratifier
samples

In [None]:
# split the degradation probability into 3 strata; the call returns the frame and a `samples` object
joined_df, samples = h.kmeans_stratifier(joined_df, 'deg_prob_cafi1520', strata=3, sample_size=sample_size_no)

In [None]:
# copy the 'stratum' column of the returned frame under a name specific to the degradation model
joined_df['kmeans_deg'] = joined_df['stratum']

In [None]:
h.plot_cluster(joined_df, prob_column='deg_prob_cafi1520', strata_column='kmeans_deg')

# left panel: points coloured by stratum; right panel: histogram of the stratum values
fig, ax = plt.subplots(1,2, figsize=(15,7))

joined_df.plot('kmeans_deg', markersize=0.15, ax=ax[0], legend=True, cmap='magma')
joined_df.hist('kmeans_deg', ax=ax[1], bins=50)
# display the `samples` object returned by the stratifier
samples

In [None]:
# cross-tabulate the deforestation strata (rows) against the degradation strata (columns), with totals
cross_tab = pd.crosstab(index= joined_df['kmeans_def'], columns=joined_df['kmeans_deg'], margins=True, margins_name="Total")
cross_tab

In [None]:
# number of points (count of point_id) per change stratum, with an overall total
pd.pivot_table(joined_df,values='point_id',index=['kmeans_chg'],aggfunc="count", margins=True)

### 4. align column names, get coordinates


In [None]:
# copy the point identifier into the PLOTID column
joined_df['PLOTID'] = joined_df['point_id']

In [None]:
# list the columns before removing the ones that are not exported
joined_df.columns.tolist()

In [None]:
# Remove the time-series columns, the generic 'stratum' column (already copied
# to kmeans_chg / kmeans_def / kmeans_deg) and the CEO reference attributes.
# 'PLOTID' is deliberately NOT in this list: it was set from 'point_id' in the
# cell just above, and dropping it here would undo that step.
# 'LON' and 'LAT' are dropped because they are rebuilt from the geometry right after.
columns_not_exported = [
    'dates', 'ts', 'stratum', 'UniqueID',
    'OID', 'CEO_PLOTID', 'CEO_Ref_Code', 'LON', 'LAT', 'Ref_FNF_2015',
    'Ref_LCover', 'Ref_Change_Type_1520', 'Ref_Year', 'countDrivers',
    'Ref_InfraR', 'Ref_Urb', 'Ref_ArtAg', 'Ref_IndAg',
    'Ref_ArtFor', 'Ref_IndFor', 'Ref_ArtMin', 'Ref_IndMin',
    'Ref_Other', 'Source', 'Ref_Change_Year_1520',
    'Ref_Change_1520', 'collection', 'interpreted',
    'Def2016', 'Def2017', 'Def2018', 'Def2019',
    'Def2020', 'Deg2016', 'Deg2017', 'Deg2018',
    'Deg2019', 'Deg2020', 'Defall', 'Degall',
    'Stable', 'NF',
]
joined_df = joined_df.drop(columns=columns_not_exported)

In [None]:
# rebuild the coordinate columns from the point geometry
joined_df['LON'] = joined_df.geometry.x
joined_df['LAT'] = joined_df.geometry.y

### 5. export the points for your area with the change probability model

In [None]:
# write all points with their modeled probabilities and strata to CSV (the row index is not written)
joined_df.to_csv(all_points_out_file,index=False)

### now you may proceed to script 5c if you want to extract sample points for CEO