# V - Merging esbae with CEO data 
### This script serves to merge validated data from CEO with eSBAE points for quick estimation of areas
#### This script requires at least an m16 instance; select the Python 3 kernel
###### For more information contact aurelie.shapiro@fao.org or remi.dannunzio@fao.org

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
# Set the display option to a large value to prevent text wrapping
pd.set_option('display.max_colwidth', None)

#### combine all CEO sample files
##### download your CEO sample files from collect.earth and upload to SEPAL using FileZilla.
##### add as many files as you have projects

In [2]:
# Load one CEO sample-data export per project into its own DataFrame (ceo1, ceo2, ...).
# Only one country's set of reads should be active; the other stays commented out.
#DRC
#ceo1 = pd.read_csv('/home/sepal-user/module_results/esbae/DRC/ceo-CAFI-DDD-RDC-eSBAE-validation-1-sample-data_prov.csv')
#ceo2 = pd.read_csv('/home/sepal-user/module_results/esbae/DRC/ceo-CAFI-DDD-RDC-eSBAE-validation-2-sample-data.csv')
#ceo3 = pd.read_csv('/home/sepal-user/module_results/esbae/DRC/ceo-CAFI-DDD-RDC-eSBAE-validation-3-sample-data.csv')
#GAB
ceo1 = pd.read_csv('/home/sepal-user/module_results/esbae/GAB/ceo-CAFI-DDD-Gabon-eSBAE-validation-1-sample-data.csv')
ceo2 = pd.read_csv('/home/sepal-user/module_results/esbae/GAB/ceo-CAFI-DDD-Gabon-eSBAE-validation-2-sample-data.csv')
ceo3 = pd.read_csv('/home/sepal-user/module_results/esbae/GAB/ceo-CAFI-DDD-Gabon-eSBAE-validation-3-sample-data.csv')
ceo4 = pd.read_csv('/home/sepal-user/module_results/esbae/GAB/ceo-CAFI-DDD-Gabon-eSBAE-validation-4-sample-data.csv')
ceo5 = pd.read_csv('/home/sepal-user/module_results/esbae/GAB/ceo-CAFI-DDD-Gabon-eSBAE-validation-5-sample-data.csv')

##### drop duplicates from ceo collections
###### keep first duplicate by date

In [3]:
column_names = ceo1.columns.tolist()
# Print the list of column names
print(column_names)

['plotid', 'sampleid', 'lon', 'lat', 'email', 'flagged', 'collection_time', 'analysis_duration', 'imagery_title', 'imagery_attributions', 'sample_geom', 'pl_aspect', 'pl_brightness_max', 'pl_red_mean', 'pl_greenness_max', 'pl_cusum_confidence', 'pl_cusum_change_date', 'pl_nir_max', 'pl_red_min', 'pl_bfast_means', 'pl_elevation', 'pl_red_sd', 'pl_cnc_1520', 'pl_swir1_sd', 'pl_index', 'pl_tmf_sub', 'pl_dw_tree_prob__min', 'pl_bs_slope_max', 'pl_red_max', 'pl_images', 'pl_dist', 'pl_bfast_magnitude', 'pl_ccdc_magnitude', 'pl_simple_combined', 'pl_tmf_defyear', 'pl_greenness_mean', 'pl_tmf_2019', 'pl_swir2_sd', 'pl_nir_sd', 'pl_gfc_tc00', 'pl_greenness_min', 'pl_tmf_degyear', 'pl_tmf_2020', 'pl_tmf_2017', 'pl_bs_slope_min', 'pl_lang_tree_height', 'pl_bs_slope_sd', 'pl_dw_class_mode', 'pl_swir1_mean', 'pl_wetness_sd', 'pl_bs_slope_mean', 'pl_ccdc_change_date', 'pl_dw_tree_prob_mean', 'pl_nir_mean', 'pl_dw_tree_prob__stddev', 'pl_tmf_2016', 'pl_nir_min', 'pl_esri_lc17', 'pl_wetness_mean', 'p

##### merge all ceo files

In [4]:
#make a list of the ceo files first
ceo_files = [ceo1, ceo2, ceo3,ceo4,ceo5]

##### add a column named "interpreted" which = 1 when the point has been validated in CEO, otherwise 0
##### count the number of validated points per ceo file
##### replace forest and non-forest types with codes

In [None]:
# Clean every CEO project export in turn, then stack them into one table.
# NOTE: the DataFrames held in `ceo_files` are modified in place.
modified_ceo_files = []

for i, ceo in enumerate(ceo_files):

    # Sort by collection time, oldest first
    ceo.sort_values(by='collection_time', ascending=True, inplace=True)

    # Within one project keep only the first (earliest) row of each point id
    ceo.drop_duplicates(subset='pl_point_id', keep='first', inplace=True)

    # Renumber the rows 0..n-1 after sorting and dropping
    ceo.reset_index(drop=True, inplace=True)

    # 'collection' is the 1-based position of the project in `ceo_files`
    ceo['collection'] = i + 1

    # 'interpreted' is 1 when an interpreter email is recorded, otherwise 0
    ceo['interpreted'] = np.where(ceo['email'].notna(), 1, 0)

    # Number of rows per 'interpreted' value, as {value: count}
    interpreted_counts = ceo['interpreted'].value_counts().to_dict()

    modified_ceo_files.append(ceo)

    # Report how many points of this project were / were not validated
    print(f"ceo {i + 1} # of validated points: {interpreted_counts.get(1, 0)}, # of not validated points: {interpreted_counts.get(0, 0)}")

# Stack the cleaned projects into one DataFrame with a fresh 0..n-1 index
merged_ceo = pd.concat(modified_ceo_files, ignore_index=True)


# Rename map: CEO export / survey-question headers -> short analysis names
column_mapping = {
    'pl_point_id': 'point_id',
    'pl_chg_prob': 'chg_prob',
    'pl_simple_combined': 'simple_combined',
    'pl_strata': 'stratum',
    'forêt ou non-forêt en 2015?': 'Ref_FNF_2015',
    "Y'a t'il de la régéneration": 'Ref_Regeneration',
    'Changement entre 2015-2022': 'Ref_Change_1522',
    'Type de Changement (2015-2022)': 'Ref_Change_Type_1522',
    'Exploitation forestière artisanale (<5ha)': 'ArtFor',
    'Activité minière artisanale': 'ArtMine',
    'Infrastructure routière': 'InfraR',
    'Infrastructure d’urbanisation ou habitations': 'Urb',
    'Exploitation forestière industrielle (>5ha)': 'IndFor',
    'Autre moteur': 'Other',
    'Descriptif autre moteur': 'Other_Desc',
    'Activité minière industrielle': 'IndMine',
    'Agriculture industrielle': 'IndAg',
    'Agriculture paysanne': 'ArtAg',
    'Année de Changement': 'Ref_Year',
    'Type de non-forêt en 2015': 'Ref_NFType',
    'Type de forêt en 2015': 'Ref_FType',
}

# Columns carried over from the merged CEO exports; the list order is the output column order
columns_to_keep = [
    'plotid',
    'sampleid',
    'lon',
    'lat',
    'email',
    'flagged',
    'collection_time',
    'analysis_duration',
    'sample_geom',
    'pl_point_id',
    'pl_chg_prob',
    'pl_simple_combined',
    'pl_strata',
    'forêt ou non-forêt en 2015?',
    "Y'a t'il de la régéneration",
    'Changement entre 2015-2022',
    'Type de Changement (2015-2022)',
    'Exploitation forestière artisanale (<5ha)',
    'Activité minière artisanale',
    'Infrastructure routière',
    'Infrastructure d’urbanisation ou habitations',
    'Exploitation forestière industrielle (>5ha)',
    'Autre moteur',
    'Descriptif autre moteur',
    'Activité minière industrielle',
    'Agriculture industrielle',
    'Agriculture paysanne',
    'Année de Changement',
    'Type de non-forêt en 2015',
    'Type de forêt en 2015',
    'Commentaires',
    'collection',
    'interpreted',
]

# Subset to the wanted columns, then apply the short names
merged_ceo = merged_ceo[columns_to_keep].rename(columns=column_mapping)

# Every yes/no survey column is recoded the same way: 'oui' -> '1', 'non' -> '0'
yes_no_codes = {'oui': '1', 'non': '0'}
yes_no_columns = ['Ref_Change_1522', 'ArtFor', 'IndFor', 'ArtAg', 'IndAg',
                  'ArtMine', 'IndMine', 'Urb', 'InfraR', 'Other']

# Per-column recoding table: {column: {answer text: code}}; codes stay strings
replace_dict = {
    'Ref_FNF_2015': {'non-forêt': '0', 'forêt': '1'},
    'Ref_FType': {
        '1 - forêt dense': '1',
        '2 - forêt dense sèche': '2',
        '3 - forêt secondaire': '3',
        '4 - forêt claire sèche': '4',
        '7 - forêt mangrove': '7',
        '8 - forêt marécageuse': '8',
        '9 - forêt galérie': '9',
        '10 - plantation forestière': '10',
        '11- savane arborée': '11',
    },
    'Ref_NFType': {
        '12 - savane arbustive': '12',
        '13 - savane herbacée': '13',
        '14 - prairie aquatique': '14',
        '15 - sol nu végétation éparse': '15',
        '16 - terres cultivées': '16',
        '17 - zone baties': '17',
        '18 - eau': '18',
    },
}
replace_dict.update({name: dict(yes_no_codes) for name in yes_no_columns})

# Recode the listed answers in place, column by column
merged_ceo.replace(replace_dict, inplace=True)

In [None]:
merged_ceo_interpreted = merged_ceo[merged_ceo['interpreted'] == 1]

In [None]:
ceo_column_names = merged_ceo_interpreted.columns.tolist()
# Print the list of column names
print(ceo_column_names)

In [None]:
merged_ceo_interpreted.head()

In [None]:
# Order the validated rows by project number so that, for a point validated in
# several projects, the row from the lowest-numbered collection comes first
merged_ceo_interpreted = merged_ceo_interpreted.sort_values(by='collection', ascending=True)

# Keep only that first row per point_id (i.e. the lowest collection number wins),
# then renumber the rows 0..n-1
merged_ceo_nodup = (
    merged_ceo_interpreted
    .drop_duplicates(subset='point_id', keep='first')
    .reset_index(drop=True)
)

In [None]:
merged_ceo_nodup['Ref_FNF_2015'].value_counts()

In [None]:
merged_ceo_nodup['Ref_Change_1522'].value_counts()

In [None]:
merged_ceo_nodup['Ref_Change_Type_1522'].value_counts()

In [None]:
merged_ceo_nodup['interpreted'].value_counts()

In [None]:
merged_ceo_nodup['collection'].value_counts()

##### Check if values in 'point_id' are unique

In [None]:
# this might take some time
#unique_check = merged_ceo_nodup['point_id'].nunique() == len(merged_ceo_nodup['point_id'])

# Print the result
#if unique_check:
#    print("All values are unique!")
#else:
#    print("F$%§!")

##### if there are duplicates determine which points and which project (collection)

In [None]:
# Find duplicate values in the 'point_id' column
#duplicate_ids = merged_ceo_nodup[merged_ceo_nodup.duplicated(subset='point_id', keep='first')]

# If you want to print specific columns along with duplicates, you can do:
# For example, print only the 'point_id' and 'collection' columns
#print("\nDuplicate Rows with 'ID' and 'Name':")
#print(duplicate_ids[['point_id', 'collection']])
#len(duplicate_ids)

In [None]:
#merged_ceo_nodup.head()

## add new columns for change and year

#### clean up columns and replace values

In [None]:
def f(x):
    """Return the change class of one validated row: 'NF', 'Def', 'Deg' or 'Stable'.

    x is a row (anything indexable by column name) providing
    'Ref_FNF_2015' and 'Ref_Change_Type_1522'.

    Non-forest in 2015 (code '0') always wins. Otherwise the CEO answer
    decides, and anything that is neither deforestation nor degradation
    counts as stable.

    The result is written back into 'Ref_Change_Type_1522', so the short
    labels 'Def' and 'Deg' are accepted as well as the original CEO answers.
    Without that, running the cell a second time would turn every change
    point into 'Stable'.
    """
    if x['Ref_FNF_2015'] == '0':
        return 'NF'

    change = x['Ref_Change_Type_1522']
    if change in ('déforestation', 'Def'):
        return 'Def'
    if change in ('dégradation', 'Deg'):
        return 'Deg'
    return 'Stable'

merged_ceo_nodup['Ref_Change_Type_1522'] = merged_ceo_nodup.apply(f, axis=1)
merged_ceo_nodup['Ref_Change_Type_1522'].value_counts()

In [None]:
def y(x):
    """Return the change-and-year label of one row.

    x is a row (anything indexable by column name) providing
    'Ref_Change_Type_1522' and, for change rows, 'Ref_Year'.

    'Def' / 'Deg' rows with a year from 2016 to 2022 give e.g. 'Def2016'
    or 'Deg2022'. 'NF' and 'Stable' rows keep their label. Everything
    else, including a change row with a missing or out-of-range year,
    gives ''.
    """
    change = x['Ref_Change_Type_1522']

    for prefix in ('Def', 'Deg'):
        if change == prefix:
            # 'Ref_Year' is only looked up for change rows
            for year in range(2016, 2023):
                if x['Ref_Year'] == year:
                    return f'{prefix}{year}'

    if change == 'NF':
        return 'NF'
    if change == 'Stable':
        return 'Stable'
    return ''

# Label every validated point with its combined change class and year
merged_ceo_nodup['Ref_Change_Year_1522'] = merged_ceo_nodup.apply(y, axis=1)

# Number of points per label, listed in alphabetical label order
annual_counts = merged_ceo_nodup['Ref_Change_Year_1522'].value_counts().sort_index()
print(annual_counts)

In [None]:
# One 0/1 indicator column per deforestation year, named after the label it flags
for label in ('Def2016', 'Def2017', 'Def2018', 'Def2019', 'Def2020', 'Def2021', 'Def2022'):
    merged_ceo_nodup[label] = np.where(merged_ceo_nodup['Ref_Change_Year_1522'] == label, 1, 0)

In [None]:
# One 0/1 indicator column per degradation year, named after the label it flags
for label in ('Deg2016', 'Deg2017', 'Deg2018', 'Deg2019', 'Deg2020', 'Deg2021', 'Deg2022'):
    merged_ceo_nodup[label] = np.where(merged_ceo_nodup['Ref_Change_Year_1522'] == label, 1, 0)

In [None]:
# 0/1 indicators for deforestation / degradation regardless of year
for flag, label in (('Defall', 'Def'), ('Degall', 'Deg')):
    merged_ceo_nodup[flag] = np.where(merged_ceo_nodup['Ref_Change_Type_1522'] == label, 1, 0)

In [None]:
# 0/1 indicators for stable forest and for non-forest points
for label in ('Stable', 'NF'):
    merged_ceo_nodup[label] = np.where(merged_ceo_nodup['Ref_Change_Year_1522'] == label, 1, 0)

In [None]:
annual_counts.sum()

In [None]:
#check points by change and stratum
ceo_table = pd.pivot_table(merged_ceo_nodup, values='interpreted', index=['stratum'],
                       columns=['Ref_Change_Year_1522'], aggfunc="count")
ceo_table

In [None]:
# Export the cleaned CEO data.
# index=False: the row index is only the 0..n-1 counter from reset_index, and
# writing it would bring it back as a stray 'Unnamed: 0' column whenever the
# file is re-read with a plain pd.read_csv (and from there into the merged table).
ceo_out = '/home/sepal-user/module_results/esbae/GAB/GAB_esbae_all_ceo_clean.csv'
merged_ceo_nodup.to_csv(ceo_out, index=False)

In [None]:
#merged_ceo_nodup = pd.read_csv('/home/sepal-user/module_results/esbae/GAB/GAB_esbae_all_ceo_clean.csv')

#### read all esbae points with augmented data (output from script 5a)

In [5]:
# Read the CSV files
#DRC
#esbae = pd.read_csv('/home/sepal-user/module_results/esbae/DRC/DRC_esbae_2015_2022_model.csv')
#GAB
#merged_ceo_nodup = pd.read_csv('/home/sepal-user/module_results/esbae/GAB/GAB_esbae_all_ceo_clean.csv')
esbae = pd.read_csv('/home/sepal-user/module_results/esbae/GAB/GAB_esbae_2015_2022_model.csv')
len(esbae)

222878

##### Rename columns to be same as ceo file 

In [7]:
# Lower-case the id / coordinate column names so they match the CEO table
esbae = esbae.rename(columns={'PLOTID': 'plotid', 'LAT': 'lat', 'LON': 'lon'})

In [8]:
print(esbae.dtypes)

images                    int64
mon_images                int64
bfast_change_date       float64
bfast_magnitude         float64
bfast_means             float64
cusum_change_date       float64
cusum_confidence        float64
cusum_magnitude         float64
red_mean                float64
red_sd                  float64
red_min                 float64
red_max                 float64
nir_mean                float64
nir_sd                  float64
nir_min                 float64
nir_max                 float64
swir1_mean              float64
swir1_sd                float64
swir1_min               float64
swir1_max               float64
swir2_mean              float64
swir2_sd                float64
swir2_min               float64
swir2_max               float64
ndfi_mean               float64
ndfi_sd                 float64
ndfi_min                float64
ndfi_max                float64
brightness_mean         float64
brightness_sd           float64
brightness_min          float64
brightne

In [9]:
len(esbae)

222878

In [10]:
eSBAE_cols = esbae.columns
eSBAE_cols

Index(['images', 'mon_images', 'bfast_change_date', 'bfast_magnitude',
       'bfast_means', 'cusum_change_date', 'cusum_confidence',
       'cusum_magnitude', 'red_mean', 'red_sd', 'red_min', 'red_max',
       'nir_mean', 'nir_sd', 'nir_min', 'nir_max', 'swir1_mean', 'swir1_sd',
       'swir1_min', 'swir1_max', 'swir2_mean', 'swir2_sd', 'swir2_min',
       'swir2_max', 'ndfi_mean', 'ndfi_sd', 'ndfi_min', 'ndfi_max',
       'brightness_mean', 'brightness_sd', 'brightness_min', 'brightness_max',
       'greenness_mean', 'greenness_sd', 'greenness_min', 'greenness_max',
       'wetness_mean', 'wetness_sd', 'wetness_min', 'wetness_max',
       'bs_slope_mean', 'bs_slope_sd', 'bs_slope_max', 'bs_slope_min',
       'ccdc_change_date', 'ccdc_magnitude', 'aspect', 'dw_class_mode',
       'dw_tree_prob__max', 'dw_tree_prob__min', 'dw_tree_prob__stdDev',
       'dw_tree_prob_mean', 'elevation', 'esa_lc20', 'esa_lc21', 'esri_lc17',
       'esri_lc18', 'esri_lc19', 'esri_lc20', 'esri_lc21', 'gfc_

### Merge ceo and sbae tables
#### replace validated points in esbae points

In [13]:
#make list of point_ids of validated points
validated_id_list = merged_ceo_nodup['point_id'].tolist()
len(validated_id_list)

6539

In [14]:
# Merge ceo and sbae on "point_id" with an outer join
esbae_ceo = pd.merge(esbae, merged_ceo_nodup, on='point_id', how='outer')

In [15]:
# Fill NaN values in 'stratum' with values from the esbae DataFrame
esbae_ceo['stratum'] = esbae_ceo['stratum_y'].fillna(esbae_ceo['stratum_x'])

In [16]:
len(esbae_ceo)

222878

In [17]:
esbae_ceo.head()

Unnamed: 0.1,images,mon_images,bfast_change_date,bfast_magnitude,bfast_means,cusum_change_date,cusum_confidence,cusum_magnitude,red_mean,red_sd,red_min,red_max,nir_mean,nir_sd,nir_min,nir_max,swir1_mean,swir1_sd,swir1_min,swir1_max,swir2_mean,swir2_sd,swir2_min,swir2_max,ndfi_mean,ndfi_sd,ndfi_min,ndfi_max,brightness_mean,brightness_sd,brightness_min,brightness_max,greenness_mean,greenness_sd,greenness_min,greenness_max,wetness_mean,wetness_sd,wetness_min,wetness_max,bs_slope_mean,bs_slope_sd,bs_slope_max,bs_slope_min,ccdc_change_date,ccdc_magnitude,aspect,dw_class_mode,dw_tree_prob__max,dw_tree_prob__min,dw_tree_prob__stdDev,dw_tree_prob_mean,elevation,esa_lc20,esa_lc21,esri_lc17,esri_lc18,esri_lc19,esri_lc20,esri_lc21,gfc_gain,gfc_loss,gfc_lossyear,gfc_tc00,lang_tree_height,potapov_tree_height,slope,tmf_2016,tmf_2017,tmf_2018,tmf_2019,tmf_2020,tmf_defyear,tmf_degyear,tmf_main,tmf_sub,geometry,point_id,cnc_1520,chg_prob_x,simple_combined_x,stratum_x,kmeans,plotid_x,lon_x,lat_x,Unnamed: 0,plotid_y,sampleid,lon_y,lat_y,email,flagged,collection_time,analysis_duration,sample_geom,chg_prob_y,simple_combined_y,stratum_y,Ref_FNF_2015,Ref_Regeneration,Ref_Change_1522,Ref_Change_Type_1522,ArtFor,ArtMine,InfraR,Urb,IndFor,Other,Other_Desc,IndMine,IndAg,ArtAg,Ref_Year,Ref_NFType,Ref_FType,Commentaires,collection,interpreted,Ref_Change_Year_1522,Def2016,Def2017,Def2018,Def2019,Def2020,Def2021,Def2022,Deg2016,Deg2017,Deg2018,Deg2019,Deg2020,Deg2021,Deg2022,Defall,Degall,Stable,NF,stratum
0,48,26,2016.041,-181.454315,-1.639592,2020.4,0.375197,3861.461538,9599.660256,448.342959,8149.0,10000.0,9599.660256,448.342959,8149.0,10000.0,9599.660256,448.342959,8149.0,10000.0,9599.660256,448.342959,8149.0,10000.0,9599.660256,448.342959,8149.0,10000.0,9599.660256,448.342959,8149.0,10000.0,9599.660256,448.342959,8149.0,10000.0,9599.660256,448.342959,8149.0,10000.0,13.958219,70.383567,193.584787,-138.033823,2020.4,-632.011504,128.0,1.0,73.0,7.0,18.0,67.0,207.340393,10.0,10.0,2.0,2.0,2.0,2.0,2.0,0.0,0.0,,90.0,40.0,33.0,3.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0,10.0,10.0,POINT (10.57392349486188 -1.5930084720027642),76012,,0.006,0.006,1,1,76012,10.573923,-1.593008,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,1.0
1,43,24,2016.129,5686.856934,3.850222,2019.151,0.018619,1780.680556,9744.340278,345.157588,8472.0,10000.0,9744.340278,345.157588,8472.0,10000.0,9744.340278,345.157588,8472.0,10000.0,9744.340278,345.157588,8472.0,10000.0,9744.340278,345.157588,8472.0,10000.0,9744.340278,345.157588,8472.0,10000.0,9744.340278,345.157588,8472.0,10000.0,9744.340278,345.157588,8472.0,10000.0,-0.642127,26.124178,60.085454,-103.812423,2016.107,4552.756313,118.0,1.0,73.0,3.0,28.0,55.0,199.150024,10.0,10.0,2.0,2.0,2.0,2.0,2.0,0.0,0.0,,90.0,34.0,36.0,2.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0,10.0,10.0,POINT (10.578907239035795 -1.58336462319752),76013,,0.059333,0.059333,1,1,76013,10.578907,-1.583365,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,1.0
2,37,23,2021.077,889.901794,1.144007,2021.69,-0.029096,1450.304348,9635.413043,404.692043,8307.0,10000.0,9635.413043,404.692043,8307.0,10000.0,9635.413043,404.692043,8307.0,10000.0,9635.413043,404.692043,8307.0,10000.0,9635.413043,404.692043,8307.0,10000.0,9635.413043,404.692043,8307.0,10000.0,9635.413043,404.692043,8307.0,10000.0,9635.413043,404.692043,8307.0,10000.0,44.244464,42.038331,164.402831,-101.459056,0.0,0.0,161.0,1.0,74.0,57.0,4.0,71.0,568.627991,10.0,10.0,2.0,2.0,2.0,2.0,2.0,0.0,0.0,,90.0,28.0,26.0,11.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0,10.0,10.0,POINT (12.064807356210801 1.3036806593856438),76312,,0.024667,0.024667,1,1,76312,12.064807,1.303681,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,1.0
3,32,20,2018.011,-212.911392,-0.766407,2016.984,0.194783,1930.883333,9590.675,326.133895,8815.0,10000.0,9590.675,326.133895,8815.0,10000.0,9590.675,326.133895,8815.0,10000.0,9590.675,326.133895,8815.0,10000.0,9590.675,326.133895,8815.0,10000.0,9590.675,326.133895,8815.0,10000.0,9590.675,326.133895,8815.0,10000.0,9590.675,326.133895,8815.0,10000.0,14.744563,30.981018,153.492013,-77.202391,2021.318,61.988871,65.0,1.0,74.0,9.0,18.0,66.0,594.630798,10.0,10.0,2.0,2.0,2.0,2.0,2.0,0.0,0.0,,95.0,31.0,30.0,4.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0,10.0,10.0,POINT (12.069790548758524 1.313328693505733),76313,,0.040667,0.040667,1,1,76313,12.069791,1.313329,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,1.0
4,43,22,2016.677,343.551605,0.471041,2016.436,0.300318,4431.090909,9607.431818,662.563232,6879.0,10000.0,9607.431818,662.563232,6879.0,10000.0,9607.431818,662.563232,6879.0,10000.0,9607.431818,662.563232,6879.0,10000.0,9607.431818,662.563232,6879.0,10000.0,9607.431818,662.563232,6879.0,10000.0,9607.431818,662.563232,6879.0,10000.0,9607.431818,662.563232,6879.0,10000.0,204.169622,90.299033,437.393554,-12.191784,0.0,0.0,92.0,1.0,74.0,55.0,5.0,71.0,577.762207,10.0,10.0,2.0,2.0,2.0,2.0,2.0,0.0,0.0,,90.0,33.0,34.0,4.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0,10.0,10.0,POINT (12.074773937046153 1.322976607263328),76314,,0.022667,0.022667,1,1,76314,12.074774,1.322977,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,1.0


#### clean up _x and _y columns

In [18]:
merged_columns = esbae_ceo.columns.tolist()
print(merged_columns)

['images', 'mon_images', 'bfast_change_date', 'bfast_magnitude', 'bfast_means', 'cusum_change_date', 'cusum_confidence', 'cusum_magnitude', 'red_mean', 'red_sd', 'red_min', 'red_max', 'nir_mean', 'nir_sd', 'nir_min', 'nir_max', 'swir1_mean', 'swir1_sd', 'swir1_min', 'swir1_max', 'swir2_mean', 'swir2_sd', 'swir2_min', 'swir2_max', 'ndfi_mean', 'ndfi_sd', 'ndfi_min', 'ndfi_max', 'brightness_mean', 'brightness_sd', 'brightness_min', 'brightness_max', 'greenness_mean', 'greenness_sd', 'greenness_min', 'greenness_max', 'wetness_mean', 'wetness_sd', 'wetness_min', 'wetness_max', 'bs_slope_mean', 'bs_slope_sd', 'bs_slope_max', 'bs_slope_min', 'ccdc_change_date', 'ccdc_magnitude', 'aspect', 'dw_class_mode', 'dw_tree_prob__max', 'dw_tree_prob__min', 'dw_tree_prob__stdDev', 'dw_tree_prob_mean', 'elevation', 'esa_lc20', 'esa_lc21', 'esri_lc17', 'esri_lc18', 'esri_lc19', 'esri_lc20', 'esri_lc21', 'gfc_gain', 'gfc_loss', 'gfc_lossyear', 'gfc_tc00', 'lang_tree_height', 'potapov_tree_height', 'slope'

In [19]:
# Drop the CEO-side (_y) copies of the columns present in both tables, plus both
# raw stratum columns (they were already combined into 'stratum')
esbae_ceo = esbae_ceo.drop(
    columns=['plotid_y', 'lon_y', 'lat_y', 'chg_prob_y', 'simple_combined_y', 'stratum_y', 'stratum_x']
)

# Strip the merge suffix from the remaining columns. The pattern is anchored to
# the end of the name so that only a trailing '_x' / '_y' is removed and a
# column that merely contains '_x' or '_y' somewhere inside keeps its name.
esbae_ceo.columns = esbae_ceo.columns.str.replace(r'_[xy]$', '', regex=True)

In [20]:
print(esbae_ceo.columns.values)

['images' 'mon_images' 'bfast_change_date' 'bfast_magnitude' 'bfast_means'
 'cusum_change_date' 'cusum_confidence' 'cusum_magnitude' 'red_mean'
 'red_sd' 'red_min' 'red_max' 'nir_mean' 'nir_sd' 'nir_min' 'nir_max'
 'swir1_mean' 'swir1_sd' 'swir1_min' 'swir1_max' 'swir2_mean' 'swir2_sd'
 'swir2_min' 'swir2_max' 'ndfi_mean' 'ndfi_sd' 'ndfi_min' 'ndfi_max'
 'brightness_mean' 'brightness_sd' 'brightness_min' 'brightness_max'
 'greenness_mean' 'greenness_sd' 'greenness_min' 'greenness_max'
 'wetness_mean' 'wetness_sd' 'wetness_min' 'wetness_max' 'bs_slope_mean'
 'bs_slope_sd' 'bs_slope_max' 'bs_slope_min' 'ccdc_change_date'
 'ccdc_magnitude' 'aspect' 'dw_class_mode' 'dw_tree_prob__max'
 'dw_tree_prob__min' 'dw_tree_prob__stdDev' 'dw_tree_prob_mean'
 'elevation' 'esa_lc20' 'esa_lc21' 'esri_lc17' 'esri_lc18' 'esri_lc19'
 'esri_lc20' 'esri_lc21' 'gfc_gain' 'gfc_loss' 'gfc_lossyear' 'gfc_tc00'
 'lang_tree_height' 'potapov_tree_height' 'slope' 'tmf_2016' 'tmf_2017'
 'tmf_2018' 'tmf_2019' 'tmf_20

In [21]:
# Rows with no 'interpreted' value after the merge get 0; store the flag as integer
esbae_ceo['interpreted'] = esbae_ceo['interpreted'].fillna(0).astype(int)
esbae_ceo['interpreted'].value_counts(dropna=False)

0    216339
1      6539
Name: interpreted, dtype: int64

In [22]:
esbae_ceo['Ref_Change_Year_1522'].value_counts(dropna=False)

NaN        216339
Stable       5070
NF           1238
Deg2022        38
Deg2020        28
Deg2021        26
Deg2018        25
Def2016        21
Deg2019        16
Deg2017        15
Deg2016        14
Def2022        13
Def2017        13
Def2021         7
Def2018         7
Def2020         6
Def2019         2
Name: Ref_Change_Year_1522, dtype: int64

In [23]:
#check points by change and stratum
points_strata = pd.pivot_table(esbae_ceo, values = 'point_id', index = 'Ref_Change_Year_1522',columns = 'stratum', aggfunc =  "count")
points_strata

stratum,1.0,2.0,3.0
Ref_Change_Year_1522,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Def2016,,2.0,19.0
Def2017,,2.0,11.0
Def2018,,,7.0
Def2019,,,2.0
Def2020,,1.0,5.0
Def2021,,1.0,6.0
Def2022,,6.0,7.0
Deg2016,,2.0,12.0
Deg2017,,1.0,14.0
Deg2018,,1.0,24.0


In [None]:
esbae_ceo['stratum'].value_counts()

#### check for missing values

In [None]:
# Number of missing values in the merged table for each of the original eSBAE columns
missing_counts = {name: esbae_ceo[name].isna().sum() for name in eSBAE_cols}

# Print the counts
print(missing_counts)

##### plot deforestation points by year

In [None]:
# Deforestation labels to chart, one per year
values_to_plot = ['Def2016','Def2017','Def2018','Def2019','Def2020','Def2021','Def2022']

# Keep only the rows carrying one of those labels
is_selected = esbae_ceo['Ref_Change_Year_1522'].isin(values_to_plot)
filtered_df = esbae_ceo[is_selected]

# Points per label, in alphabetical (= chronological) label order
defvalue_counts = filtered_df['Ref_Change_Year_1522'].value_counts().sort_index()

# Bar chart of the counts on a new 8 x 6 figure
fig, ax = plt.subplots(figsize=(8, 6))
defvalue_counts.plot(kind='bar', color='skyblue', ax=ax)

# Axis labels and title
ax.set_xlabel('Category')
ax.set_ylabel('Count')
ax.set_title('# of def points per year')

# Show the plot
plt.show()

##### plot degradation points by year

In [None]:
# Degradation labels to chart, one per year
values_to_plot = ['Deg2016','Deg2017','Deg2018','Deg2019','Deg2020','Deg2021','Deg2022']

# Keep only the rows carrying one of those labels
is_selected = esbae_ceo['Ref_Change_Year_1522'].isin(values_to_plot)
filtered_df = esbae_ceo[is_selected]

# Points per label, in alphabetical (= chronological) label order
degvalue_counts = filtered_df['Ref_Change_Year_1522'].value_counts().sort_index()

# Bar chart of the counts on a new 8 x 6 figure
fig, ax = plt.subplots(figsize=(8, 6))
degvalue_counts.plot(kind='bar', color='skyblue', ax=ax)

# Axis labels and title
ax.set_xlabel('Category')
ax.set_ylabel('Count')
ax.set_title('# of deg points per year')

# Show the plot
plt.show()

In [None]:
print(esbae_ceo.columns.values)

In [None]:
# Points per change-and-year label in the merged table, in alphabetical label order
merged_annual_counts = esbae_ceo['Ref_Change_Year_1522'].value_counts().sort_index()
merged_annual_counts

In [None]:
esbae_ceo.describe()

In [None]:
print(esbae_ceo.dtypes)

##### export merged esbae and ceo data to one csv file for next step

In [None]:
# if you want to remove all the eSBAE variables
#DRC
#esbae = esbae[['point_id', 'PLOTID', 'LAT','LON','stratum', 'chg_prob', 'simple_combined','NOM']]
#GAB
#esbae = esbae[['point_id', 'PLOTID', 'LAT','LON','stratum', 'chg_prob', 'simple_combined']]

In [None]:
esbae_ceo_data_out =  '/home/sepal-user/module_results/esbae/GAB/GAB_esbae_ceo_merge.csv'
esbae_ceo.to_csv(esbae_ceo_data_out,index=False)
#OR
#esbae_ceo[esbae_ceo_columns].to_csv(data_out,index=True)

#### now move on to calculation of areas - script 6b