![title](images/header.png)

Merging CEO data (phase I 2015-2020) and phase II (2015-2022) for training with calibration and validation split
-------
This notebook merges validated data from the stratified random CEO data (2015-2022) with ceo data from phase II (2015-2022)
###### For more information contact aurelie.shapiro@fao.org or remi.dannunzio@fao.org

### this script merges multiple ceo files and data from CAFI phase I and phase II
#### you can use the last section to split CEO data into proportions of training and validation
#### in the parameters you will select your CEO clean files (from CAFI DDD eSBAE merge and clean CEO data) 

#### first install libraries and packages

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
# Set the display option to a large value to prevent text wrapping
pd.set_option('display.max_colwidth', None)

#### merge multiple CEO files and combine columns

### parameters

In [2]:
ISO = 'CMR'

In [3]:
# clean ceo data from script 05d
#ceo_phase_II = pd.read_csv('/home/sepal-user/module_results/esbae/GAB/GAB_all_ceo_1522_clean.csv')
#ceo_phase_II = pd.read_csv('/home/sepal-user/module_results/esbae/DRC/DRC_all_ceo_1522_clean.csv')
#ceo_phase_II = pd.read_csv('/home/sepal-user/module_results/esbae/EQG/EQG_all_ceo_1522_clean.csv')
#ceo_phase_II = pd.read_csv('/home/sepal-user/module_results/esbae/COG/COG_all_ceo_1622_clean.csv',low_memory=False)
#ceo_phase_IIa = pd.read_csv('/home/sepal-user/module_results/esbae/COG/COG_all_ceo_1522_clean.csv',low_memory=False)
#ceo_phase_II = pd.read_csv('/home/sepal-user/module_results/esbae/CAR/CAR_all_ceo_1522_clean.csv',low_memory=False)
ceo_phase_II = pd.read_csv('/home/sepal-user/module_results/esbae/CMR/CMR_all_ceo_1522_clean.csv',low_memory=False)

In [4]:
ceo_phase_II.head()

Unnamed: 0,plotid,sampleid,lon,lat,email,flagged,collection_time,analysis_duration,sample_geom,point_id,Ref_FNF_2015,Ref_Regeneration,Ref_Change_1522,Ref_Change_Type_1522,ArtFor,ArtMine,InfraR,Urb,IndFor,Other,Other_Desc,IndMine,IndAg,ArtAg,Ref_Year,Ref_NFtype,Ref_Ftype,Commentaires,collection,interpreted,Ref_LCover,Ref_Change_Year_1522,Def2016,Def2017,Def2018,Def2019,Def2020,Def2021,Def2022,Deg2016,Deg2017,Deg2018,Deg2019,Deg2020,Deg2021,Deg2022,Defall,Degall,Stable,NF,ISO
0,3195,3195,11.966747,7.395009,aurelie.shapiro@fao.org,False,2023-06-29 08:45,177.2 secs,POINT(11.966747 7.395009),3195,1,,0,Stable,0,0,0,0,0,0,,0,0,,,,4.0,rentrez vos commentaires,1,1,4,Stable,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,CMR
1,382369,382369,13.462191,4.616583,e.tchanak@gmail.com,False,2023-08-17 09:12,32.1 secs,POINT(13.462191 4.616583),382369,1,,1,Deg,1,0,0,0,0,0,,0,0,0.0,2022.0,,1.0,rentrez vos commentaires,1,1,1,Deg2022,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,CMR
2,381976,381976,13.326331,4.472166,e.tchanak@gmail.com,False,2023-08-17 09:06,10.4 secs,POINT(13.326331 4.472166),381976,1,,0,Stable,0,0,0,0,0,0,,0,0,,,,1.0,rentrez vos commentaires,1,1,1,Stable,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,CMR
3,381269,381269,13.454098,4.943217,e.tchanak@gmail.com,False,2023-08-17 09:06,39.0 secs,POINT(13.454098 4.943217),381269,1,,0,Stable,0,0,0,0,0,0,,0,0,,,,1.0,rentrez vos commentaires,1,1,1,Stable,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,CMR
4,379902,379902,13.021454,4.519678,e.tchanak@gmail.com,False,2023-08-17 09:05,38.4 secs,POINT(13.021454 4.519678),379902,1,,0,Stable,0,0,0,0,0,0,,0,0,,,,1.0,rentrez vos commentaires,1,1,1,Stable,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,CMR


In [5]:
phase_II_columns = ceo_phase_II.columns.tolist()
phase_II_columns

['plotid',
 'sampleid',
 'lon',
 'lat',
 'email',
 'flagged',
 'collection_time',
 'analysis_duration',
 'sample_geom',
 'point_id',
 'Ref_FNF_2015',
 'Ref_Regeneration',
 'Ref_Change_1522',
 'Ref_Change_Type_1522',
 'ArtFor',
 'ArtMine',
 'InfraR',
 'Urb',
 'IndFor',
 'Other',
 'Other_Desc',
 'IndMine',
 'IndAg',
 'ArtAg',
 'Ref_Year',
 'Ref_NFtype',
 'Ref_Ftype',
 'Commentaires',
 'collection',
 'interpreted',
 'Ref_LCover',
 'Ref_Change_Year_1522',
 'Def2016',
 'Def2017',
 'Def2018',
 'Def2019',
 'Def2020',
 'Def2021',
 'Def2022',
 'Deg2016',
 'Deg2017',
 'Deg2018',
 'Deg2019',
 'Deg2020',
 'Deg2021',
 'Deg2022',
 'Defall',
 'Degall',
 'Stable',
 'NF',
 'ISO']

In [6]:
ceo_phase_II.drop(columns=['sampleid', 'email', 'flagged', 'collection_time', 'analysis_duration'], inplace=True)

In [7]:
# output file names
# CEO phase I and phase II data
ceo_phase_I_II_out =  '/home/sepal-user/module_results/esbae/CMR/CMR_ceo_1522_phase_I_II.csv'
# stable data split into training and validation
ceo_phase_I_II_stable_out_train = '/home/sepal-user/module_results/esbae/CMR/CMR_all_ceo_1522_phaseI_II_stable_train.csv'
ceo_phase_I_II_stable_out_val = '/home/sepal-user/module_results/esbae/CMR/CMR_all_ceo_1522_phaseI_II_stable_val.csv'

#### CEO phase I data (2015-2020)
##### this file is downloaded from git: https://github.com/aurelgrooves/CAFI_DDD

In [8]:
ceo_phase_I = pd.read_csv('/home/sepal-user/CAFI_DDD/CAFI_all_ceo_validation_2015_2020_uniqueID.csv', low_memory=False)
phase_I_columns = ceo_phase_I.columns.tolist()
phase_I_columns

['plotid_orig',
 'CEO_PLOTID',
 'lon',
 'lat',
 'ISO',
 'CEO_Ref_Code',
 'Ref_FNF_2015',
 'Ref_LCover',
 'Ref_Change_Type_1520',
 'Ref_Year',
 'countDrivers',
 'InfraR',
 'Urb',
 'ArtAg',
 'IndAg',
 'ArtFor',
 'IndFor',
 'ArtMin',
 'IndMin',
 'Other',
 'Source',
 'Ref_Change_Year',
 'Change_Bin',
 'UniqueID']

In [9]:
ceo_phase_I.head()

Unnamed: 0,plotid_orig,CEO_PLOTID,lon,lat,ISO,CEO_Ref_Code,Ref_FNF_2015,Ref_LCover,Ref_Change_Type_1520,Ref_Year,countDrivers,InfraR,Urb,ArtAg,IndAg,ArtFor,IndFor,ArtMin,IndMin,Other,Source,Ref_Change_Year,Change_Bin,UniqueID
0,171,,24.043094,6.731301,CAF,12,0,12,NF,,0,0,0,0,0,0,0,0,0,0,ceo,NF,0,306
1,140,,18.704676,8.068803,CAF,12,0,12,NF,,0,0,0,0,0,0,0,0,0,0,ceo,NF,0,307
2,173,,13.502083,6.592511,CMR,12,0,12,NF,,0,0,0,0,0,0,0,0,0,0,ceo,NF,0,308
3,174,,23.944998,7.158989,CAF,12,0,12,NF,,0,0,0,0,0,0,0,0,0,0,ceo,NF,0,309
4,158,,13.697197,8.09683,CMR,12,0,12,NF,,0,0,0,0,0,0,0,0,0,0,ceo,NF,0,310


In [10]:
ceo_phase_I['ISO'].value_counts(dropna=False)

ISO
COD    6411
CMR    2442
CAF    2325
GAB    1124
COG     686
EQG     127
Name: count, dtype: int64

In [11]:
ceo_phase_I_ISO = ceo_phase_I[ceo_phase_I['ISO'] == ISO]

In [12]:
# validation proportion
split = 0.3

### end of parameters

In [13]:
# number of validated points
len(ceo_phase_II)

6733

In [14]:
# number of validated points
len(ceo_phase_I_ISO)

2442

#### merge with phase I data (2015-2020)

In [15]:
data_frames =[ceo_phase_I_ISO, ceo_phase_II]
# Source file names corresponding to each data frame
source_file = ['phase_I', 'phase_II']  # Replace with your file names

# Merge data frames and add a new column for source file name
merged_ceo = pd.concat([df.assign(sampling=source_file) for df, source_file in zip(data_frames, source_file)], ignore_index=True)

In [16]:
# Print or use the merged data frame
merged_I_columns = merged_ceo.columns.tolist()
merged_I_columns

['plotid_orig',
 'CEO_PLOTID',
 'lon',
 'lat',
 'ISO',
 'CEO_Ref_Code',
 'Ref_FNF_2015',
 'Ref_LCover',
 'Ref_Change_Type_1520',
 'Ref_Year',
 'countDrivers',
 'InfraR',
 'Urb',
 'ArtAg',
 'IndAg',
 'ArtFor',
 'IndFor',
 'ArtMin',
 'IndMin',
 'Other',
 'Source',
 'Ref_Change_Year',
 'Change_Bin',
 'UniqueID',
 'sampling',
 'plotid',
 'sample_geom',
 'point_id',
 'Ref_Regeneration',
 'Ref_Change_1522',
 'Ref_Change_Type_1522',
 'ArtMine',
 'Other_Desc',
 'IndMine',
 'Ref_NFtype',
 'Ref_Ftype',
 'Commentaires',
 'collection',
 'interpreted',
 'Ref_Change_Year_1522',
 'Def2016',
 'Def2017',
 'Def2018',
 'Def2019',
 'Def2020',
 'Def2021',
 'Def2022',
 'Deg2016',
 'Deg2017',
 'Deg2018',
 'Deg2019',
 'Deg2020',
 'Deg2021',
 'Deg2022',
 'Defall',
 'Degall',
 'Stable',
 'NF']

In [17]:
column_names = merged_ceo.columns.tolist()
# Print the list of column names
print(column_names)

['plotid_orig', 'CEO_PLOTID', 'lon', 'lat', 'ISO', 'CEO_Ref_Code', 'Ref_FNF_2015', 'Ref_LCover', 'Ref_Change_Type_1520', 'Ref_Year', 'countDrivers', 'InfraR', 'Urb', 'ArtAg', 'IndAg', 'ArtFor', 'IndFor', 'ArtMin', 'IndMin', 'Other', 'Source', 'Ref_Change_Year', 'Change_Bin', 'UniqueID', 'sampling', 'plotid', 'sample_geom', 'point_id', 'Ref_Regeneration', 'Ref_Change_1522', 'Ref_Change_Type_1522', 'ArtMine', 'Other_Desc', 'IndMine', 'Ref_NFtype', 'Ref_Ftype', 'Commentaires', 'collection', 'interpreted', 'Ref_Change_Year_1522', 'Def2016', 'Def2017', 'Def2018', 'Def2019', 'Def2020', 'Def2021', 'Def2022', 'Deg2016', 'Deg2017', 'Deg2018', 'Deg2019', 'Deg2020', 'Deg2021', 'Deg2022', 'Defall', 'Degall', 'Stable', 'NF']


In [18]:
#drop columns
#merged_ceo = merged_ceo.drop(['OID'], axis=1)
#merged_ceo = merged_ceo.rename(columns={'Unnamed: 0': 'index'})

In [19]:
# Create a new column 'Stable' with initial value 0
merged_ceo['Stable_Forest'] = 0

# Set 'Stable' to 1 where either 'Ref_Change_Type_1520' or 'Ref_Change_Type_1522' is 'Stable'
merged_ceo.loc[(merged_ceo['Ref_Change_Type_1520'] == 'Stable') | (merged_ceo['Ref_Change_Type_1522'] == 'Stable'), 'Stable_Forest'] = 1

# Display value counts of 'Stable' column
print(merged_ceo['Stable_Forest'].value_counts(dropna=False))

Stable_Forest
0    5826
1    3349
Name: count, dtype: int64


In [20]:
# Create a new column 'Ref Change Type' with initial value 0
merged_ceo['Ref_Change_Type'] = 0

merged_ceo['Ref_Change_Type'] = merged_ceo['Ref_Change_Type_1520'].fillna(merged_ceo['Ref_Change_Type_1522'])

# Display the DataFrame
print(merged_ceo['Ref_Change_Type'].value_counts(dropna=False))

Ref_Change_Type
NF        4884
Stable    3349
Def        556
Deg        386
Name: count, dtype: int64


In [21]:
#check for NAs
merged_ceo['Ref_LCover'].value_counts(dropna=False)

Ref_LCover
12    2591
1     1761
15     881
13     656
9      647
16     638
3      600
11     488
4      313
17     192
2      121
8      101
14      65
7       51
10      50
18      20
Name: count, dtype: int64

In [22]:
#create a new column for forest:1, non-forest:2, water: 3
# Define the conditions
conditions = [
    (merged_ceo['Ref_LCover'] >= 1) & (merged_ceo['Ref_LCover'] <= 11),
    (merged_ceo['Ref_LCover'] >= 12) & (merged_ceo['Ref_LCover'] <= 17),
    (merged_ceo['Ref_LCover'] == 18)
]

# Define the values to assign for each condition
values = [1, 2, 3]

# Use numpy.select() to assign values based on conditions
merged_ceo['Ref_FNFW'] = np.select(conditions, values, default=np.nan)

# Print the DataFrame with the new column
merged_ceo.head()

Unnamed: 0,plotid_orig,CEO_PLOTID,lon,lat,ISO,CEO_Ref_Code,Ref_FNF_2015,Ref_LCover,Ref_Change_Type_1520,Ref_Year,countDrivers,InfraR,Urb,ArtAg,IndAg,ArtFor,IndFor,ArtMin,IndMin,Other,Source,Ref_Change_Year,Change_Bin,UniqueID,sampling,plotid,sample_geom,point_id,Ref_Regeneration,Ref_Change_1522,Ref_Change_Type_1522,ArtMine,Other_Desc,IndMine,Ref_NFtype,Ref_Ftype,Commentaires,collection,interpreted,Ref_Change_Year_1522,Def2016,Def2017,Def2018,Def2019,Def2020,Def2021,Def2022,Deg2016,Deg2017,Deg2018,Deg2019,Deg2020,Deg2021,Deg2022,Defall,Degall,Stable,NF,Stable_Forest,Ref_Change_Type,Ref_FNFW
0,173.0,,13.502083,6.592511,CMR,12.0,0,12,NF,,0.0,0,0,0.0,0,0,0,0.0,0.0,0,ceo,NF,0.0,308.0,phase_I,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0,NF,2.0
1,158.0,,13.697197,8.09683,CMR,12.0,0,12,NF,,0.0,0,0,0.0,0,0,0,0.0,0.0,0,ceo,NF,0.0,310.0,phase_I,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0,NF,2.0
2,211.0,,9.836957,4.558097,CMR,104.0,1,1,Deg,2020.0,3.0,1,0,1.0,0,1,0,0.0,0.0,0,ceo,Deg2020,1.0,108.0,phase_I,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0,Deg,1.0
3,4.0,,11.127297,3.66607,CMR,1.0,1,1,Stable,,0.0,0,0,0.0,0,0,0,0.0,0.0,0,ceo,Stable,0.0,135.0,phase_I,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,1,Stable,1.0
4,304.0,,13.264389,4.078935,CMR,1.0,1,1,Stable,,0.0,0,0,0.0,0,0,0,0.0,0.0,0,ceo,Stable,0.0,145.0,phase_I,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,1,Stable,1.0


In [23]:
merged_ceo['Ref_FNFW'].value_counts(dropna=False)

Ref_FNFW
2.0    5023
1.0    4132
3.0      20
Name: count, dtype: int64

#### split stable data into training and calibration for land cover classification

In [None]:
# Filter rows where 'Stable' equals 1
stable_rows = merged_ceo[(merged_ceo['Stable_Forest'] == 1) | (merged_ceo['NF'] == 1)]

In [None]:
# Define an empty DataFrame to store the split data
split_data = pd.DataFrame()

# Iterate over unique values in 'landcover' column
for landcover_value in stable_rows['Ref_LCover'].unique():
    # Filter rows for current landcover value
    landcover_rows = stable_rows[stable_rows['Ref_LCover'] == landcover_value]
    
    # Check if there is enough data to split
    if len(landcover_rows) > 1:
        # Split data into training and validation sets
        train_data, val_data = train_test_split(landcover_rows, test_size=split, random_state=42)
        
        # Add 'Split' column and assign 'train' or 'val' based on split
        train_data['split'] = 'train'
        val_data['split'] = 'val'
        
        # Concatenate data to the split_data DataFrame
        split_data = pd.concat([split_data, train_data, val_data], ignore_index=True)
    else:
        # Not enough data for splitting, assign all data as calibration data
        landcover_rows['split'] = 'calibration'
        split_data = pd.concat([split_data, landcover_rows], ignore_index=True)

split_data.head()

In [None]:
# Print summaries
split_data['split'].value_counts(dropna=False)

In [None]:
pd.pivot_table(split_data,values='point_id',index=['Ref_LCover'],columns=['split'],aggfunc="count")

#### Export

In [None]:
# export merged phase I and II data
esbae_all_ceo_data =  ceo_phase_I_II_out
merged_ceo.to_csv(all_ceo_data,index=False)

In [None]:
# export only stable data with training and validation split
split_data_train = split_data[(split_data['split'] == 'train')]
split_data.to_csv(ceo_phase_I_II_stable_out_train,index=False)

In [None]:
# export only stable data with training and validation split
split_data_val = split_data[(split_data['split'] == 'val')]
split_data_val.to_csv(ceo_phase_I_II_stable_out_val,index=False)