In [1]:
# System response 0.0
# Earth Engine client plus the vector-data helpers used by the AOI cell below.
import ee
import geopandas as gpd
from shapely.geometry import mapping

# Authenticate and initialize the Earth Engine client before any `ee` object
# is created further down in the notebook.
ee.Authenticate()
ee.Initialize()

In [2]:
# ----- User Input -----
# ----- Module 1 -----
# 1. AOI source
# Set UploadAOIVect to True to read the AOI from a local vector file
# (AOIVectPath); leave it False to read the AOI from an Earth Engine asset
# (AOIEePath).
UploadAOIVect = False

# Local vector file. Only read when UploadAOIVect is True, but always defined
# so that flipping the flag does not raise a NameError.
AOIVectPath = 'C:/Users/fmahezs/epistem/modules_notebook/data/aoi_pedam.shp'

# Earth Engine asset. Only read when UploadAOIVect is False.
# AOIEePath = 'projects/ee-fazamahezs/assets/KHG_sample2'
# AOIEePath = 'projects/ee-fazamahezs/assets/aoi_pedam'
AOIEePath = 'projects/ee-fazamahezs/assets/planning_unit_bungo'
# AOIEePath = 'projects/ee-fazamahezs/assets/Sumsel_AdmKabKotaF_dis'

# Define AOI
if UploadAOIVect:
    import json

    # Read the vector file
    gdf = gpd.read_file(AOIVectPath)

    # Coordinates sent to Earth Engine without an explicit projection are
    # interpreted as WGS84 lon/lat, so reproject when the file declares a CRS.
    if gdf.crs is not None:
        gdf = gdf.to_crs(epsg=4326)

    # gdf.to_json() returns a *string*; ee.FeatureCollection(<str>) would treat
    # that string as an asset ID. Parse it and build the collection from the
    # individual geometries instead. Attributes are not needed because only
    # the merged geometry is kept.
    geojson = gdf.to_json()
    aoi_features = [
        ee.Feature(ee.Geometry(feature['geometry']))
        for feature in json.loads(geojson)['features']
        if feature.get('geometry')  # skip rows with a null geometry
    ]
    AOI = ee.FeatureCollection(aoi_features).geometry()
    print(f"Loaded AOI from shapefile: {len(gdf)} features")

else:
    AOI = ee.FeatureCollection(AOIEePath).geometry()
    print(f"Loaded AOI from Earth Engine asset: {AOIEePath}")

    
# 2. Landsat image
# ClippedImage = 

Loaded AOI from Earth Engine asset: projects/ee-fazamahezs/assets/planning_unit_bungo


# Module 2 (Define Classification Scheme)

In [3]:
# Import module and functions
import pandas as pd
from epistemx.module_2 import LULCSchemeClass, SaveClassificationScheme, get_default_schemes

In [4]:
# ----- Data Input -----
# 1. Default scheme switch
#    True  -> use the default classification scheme
#    False -> use a user-supplied scheme (see options 2 and 3)
ReferenceDataSource = True

# 2. Source of the user-supplied scheme
#    True  -> define the classes manually (interactive)
#    False -> load the classes from the table given in LULCTablePath
ManualScheme = False

# 3. Class table used when ManualScheme is False.
#    Accepts either a local CSV path or an Earth Engine asset ID.
# LULCTablePath = "lc_tbl_KHG.csv"
LULCTablePath = "C:/Users/fmahezs/epistem/modules_notebook/data/lc_pedamaran.csv"

In [5]:
# ----- System response 2.1 -----
# Build LULCTable, the table of land cover/use classes, from one of three
# sources selected by the flags in the data-input cell:
#   ReferenceDataSource -> the "RESTORE+ Project" default scheme
#   ManualScheme        -> the interactive LULCSchemeClass widget
#   otherwise           -> LULCTablePath (local CSV or Earth Engine asset ID)
# Whatever the source, LULCTable ends up as a pandas DataFrame.

# Columns of the empty placeholder table used whenever loading fails.
LULC_FALLBACK_COLUMNS = ['ID', 'LULC_Type', 'color_palette']

# Initialize LULCTable with default value
LULCTable = pd.DataFrame()

if ReferenceDataSource:
    default_schemes = get_default_schemes()
    restore_scheme = default_schemes["RESTORE+ Project"]

    # Convert to DataFrame
    LULCTable = pd.DataFrame(restore_scheme)
    print(f"Loaded {len(LULCTable)} land cover/use classes from RESTORE+ default scheme")

elif ManualScheme:
    # ----- System response 2.1.b (Manual/Interactive) -----
    LULCClassifier = LULCSchemeClass()
    LULCClassifier.Display()

    try:
        LULCTable = LULCClassifier.GetDataframe()
        print(f"Loaded {len(LULCTable)} land cover/use classes from manual scheme")
    except Exception as e:
        print(f"Error getting data from manual scheme: {e}")
        LULCTable = pd.DataFrame(columns=LULC_FALLBACK_COLUMNS)

else:
    # ----- System response 2.1.a (Load from Path/Asset) -----
    try:
        if not isinstance(LULCTablePath, str):
            raise TypeError("LULCTablePath must be a string file path or an Earth Engine Asset ID.")

        if LULCTablePath.startswith(('users/', 'projects/', 'ft:')):
            # Fetch the asset's attribute rows client-side so that LULCTable is
            # a DataFrame here too; the rest of the notebook uses it as one
            # (`.empty`, `len(...)`, `landcover_df=`), not as an
            # ee.FeatureCollection.
            lulc_asset = ee.FeatureCollection(LULCTablePath)
            asset_rows = [
                feature.get('properties', {})
                for feature in lulc_asset.getInfo().get('features', [])
            ]
            LULCTable = pd.DataFrame(asset_rows)
            print(f"Loaded Earth Engine FeatureCollection from Asset ID: {LULCTablePath}")
        else:
            LULCTable = pd.read_csv(LULCTablePath)
            print(f"Loaded {len(LULCTable)} land cover/use classes from local CSV file: {LULCTablePath}")
    except Exception as e:
        print(f"Error loading LULC Table from path/asset {LULCTablePath}: {e}")
        # Leave a well-formed empty table in the kernel, then surface the error
        # so the cell visibly fails instead of continuing with no classes.
        LULCTable = pd.DataFrame(columns=LULC_FALLBACK_COLUMNS)
        raise

# Final validation check
if LULCTable is None or (hasattr(LULCTable, 'empty') and LULCTable.empty):
    print("Warning: LULCTable is empty or not properly defined!")
    LULCTable = pd.DataFrame(columns=LULC_FALLBACK_COLUMNS)

Loaded 17 land cover/use classes from RESTORE+ default scheme


In [6]:
# ----- System response 2.2 -----
# In manual mode the table is read from the widget again before it is shown
# (presumably to pick up edits made after the previous cell ran - confirm).
if ReferenceDataSource == False and ManualScheme == True:
    LULCTable = LULCClassifier.GetDataframe()

# Every mode shows the same heading followed by the current class table.
print("\nDefined Land cover/use classes:")
display(LULCTable)


Defined Land cover/use classes:


Unnamed: 0,ID,Class Name,Color Code
0,1,Undisturbed dry-land forest,#006400
1,2,Logged-over dry-land forest,#228B22
2,3,Undisturbed mangrove,#4169E1
3,4,Logged-over mangrove,#87CEEB
4,5,Undisturbed swamp forest,#2E8B57
5,6,Logged-over swamp forest,#8FBC8F
6,7,Agroforestry,#9ACD32
7,8,Plantation forest,#32CD32
8,9,Rubber monoculture,#8B4513
9,10,Oil palm monoculture,#FF8C00


# Module 3 (Define LULC Data Sample)

In [7]:
# Modul 3a 
# Import modules and functions
import ee
import pandas as pd
import geopandas as gpd
import ipysheet as sheet
from shapely.geometry import mapping
from epistemx.module_3a import InputCheck, SyncTrainData, SplitTrainData, LULCSamplingTool

In [8]:
# ----- Data Input -----
# 1. Training data source
#    True  -> upload your own training data shapefile
#    False -> collect samples on screen or use the default training data
UploadTrainData = True

# 2. Training data locations (used when UploadTrainData is True)
TrainVectPath = 'C:/Users/fmahezs/epistem/modules_notebook/data/Pedamaran_sample.shp'
TrainEePath = 'projects/ee-fazamahezs/assets/pedamaran_points'
# TrainEePath = 'projects/ee-fazamahezs/assets/Pedamaran_sample'

# Field of the training data that holds the class label
TrainField = 'LULC_Type'

# 3. Train/validation split
#    SplitTrain    -> whether to split the samples at all
#    TrainSplitPct -> split fraction handed to SplitTrainData.SplitProcess
SplitTrain = True
TrainSplitPct = 0.7

In [9]:
# ----- System response 3.1 -----
# Pick the class-table column index handed to SyncTrainData.ValidClass and,
# when the default scheme from module 2 is in use, point TrainEePath at the
# reference sample asset.
# NOTE(review): presumably index 0 is the class ID and index 1 the class name
# - confirm against SyncTrainData.ValidClass.
# NOTE: InputCheck.ValidateVariable('LULCTable', 'AOI') is currently disabled
# for both cases.
if ReferenceDataSource != True:
    class_col_index = 1
else:
    # Overrides any TrainEePath set in the data-input cell.
    TrainEePath = 'projects/ee-rg2icraf/assets/Indonesia_lulc_Sample'
    class_col_index = 0

In [10]:
# ----- System response 3.2 -----
# Load the training samples (reference asset or uploaded shapefile), run them
# through the SyncTrainData validation steps and print/display a summary.
# Produces: TrainDataDict, table_df, total_samples, insufficient_df.

# Initialize TrainDataDict
TrainDataDict = {
    'training_data': None,
    'landcover_df': LULCTable,
    'class_field': TrainField,
    'validation_results': {}
}

# Becomes True only when LoadTrainData returns without raising. The
# validation steps below are skipped otherwise, so a failed load is reported
# once instead of cascading into errors on a dict with no training data.
train_data_loaded = False

if ReferenceDataSource:
    # --- System response 3.2.a (reference samples from Earth Engine) ---
    try:
        TrainDataDict = SyncTrainData.LoadTrainData(
            landcover_df=LULCTable,
            aoi_geometry=AOI,
            training_shp_path=None,
            training_ee_path=TrainEePath
        )
        # The class field used for the reference samples is 'kelas'.
        TrainField = 'kelas'
        train_data_loaded = True
        print("Loaded training data for reference data source")
    except Exception as e:
        print(f"Error loading reference training data: {e}")

elif UploadTrainData:
    # --- System response 3.2.a (user-uploaded shapefile) ---
    try:
        TrainDataDict = SyncTrainData.LoadTrainData(
            landcover_df=LULCTable,
            aoi_geometry=AOI,
            training_shp_path=TrainVectPath,
            training_ee_path=None
        )
        train_data_loaded = True
        print("Loaded training data from uploaded file")
    except Exception as e:
        print(f"Error loading training data: {e}")

else:
    print("Using sampling tool for training data collection")

if train_data_loaded:
    # ----- System response 3.2.a -----
    # Set class field
    TrainDataDict = SyncTrainData.SetClassField(TrainDataDict, TrainField)

    # Validate classes
    TrainDataDict = SyncTrainData.ValidClass(TrainDataDict, class_col_index)

    # Check sample sufficiency
    TrainDataDict = SyncTrainData.CheckSufficiency(TrainDataDict, min_samples=20)

    # Filter by AOI
    TrainDataDict = SyncTrainData.FilterTrainAoi(TrainDataDict)

    # Create training data table
    table_df, total_samples, insufficient_df = SyncTrainData.TrainDataRaw(
        training_data=TrainDataDict.get('training_data'),
        landcover_df=TrainDataDict.get('landcover_df'),
        class_field=TrainDataDict.get('class_field')
    )

    # --- Print summary ---
    vr = TrainDataDict.get('validation_results', {})

    print("=" * 70)
    print("TRAINING DATA SUMMARY")
    print("=" * 70)
    print(f"Total training points loaded     : {vr.get('total_points', 'N/A')}")
    print(f"Points after class filtering     : {vr.get('points_after_class_filter', 'N/A')}")
    print(f"Valid points (inside AOI)        : {vr.get('valid_points', 'N/A')}")
    print(f"Invalid classes found            : {len(vr.get('invalid_classes', []))}")
    print(f"Points outside AOI               : {len(vr.get('outside_aoi', []))}")
    print("=" * 70)

    # --- Display the main table ---
    if table_df is not None and not table_df.empty:
        display_df = table_df.copy()
        if 'Percentage' in display_df.columns:
            # Format numeric percentages for display only; table_df keeps
            # its original values.
            display_df['Percentage'] = display_df['Percentage'].apply(
                lambda x: f"{x:.2f}%" if isinstance(x, (int, float)) else x
            )
        display(display_df)
    else:
        print("No valid training data available to display.")
else:
    if UploadTrainData or ReferenceDataSource:
        # A load was attempted but failed; the error was printed above.
        print("Training data could not be loaded; skipping validation and summary.")
    else:
        print("Training data will be collected using sampling tool")
    table_df = None
    total_samples = 0
    insufficient_df = None

Loaded training data for reference data source
TRAINING DATA SUMMARY
Total training points loaded     : 863
Points after class filtering     : 863
Valid points (inside AOI)        : 863
Invalid classes found            : 0
Points outside AOI               : 0


Unnamed: 0,ID,LULC_class,Sample_Count,Percentage,Status
0,1,Undisturbed dry-land forest,190,22.02%,Sufficient
1,2,Logged-over dry-land forest,74,8.57%,Sufficient
2,3,Undisturbed mangrove,0,0.00%,No Samples
3,4,Logged-over mangrove,0,0.00%,No Samples
4,5,Undisturbed swamp forest,0,0.00%,No Samples
5,6,Logged-over swamp forest,0,0.00%,No Samples
6,7,Agroforestry,157,18.19%,Sufficient
7,8,Plantation forest,0,0.00%,No Samples
8,9,Rubber monoculture,317,36.73%,Sufficient
9,10,Oil palm monoculture,60,6.95%,Sufficient


In [11]:
# ----- System response 3.2.b -----
# Report the classes with too few samples and expose the loaded samples as
# TrainDataRecap for the following cells.

# Initialize TrainDataRecap
TrainDataRecap = pd.DataFrame()

# Same condition the loading cell uses: training data exists when it was
# uploaded OR taken from the reference data source. Checking UploadTrainData
# alone would discard reference samples whenever UploadTrainData is False.
if UploadTrainData or ReferenceDataSource:
    if table_df is None:
        print("\nNo training data summary available; insufficiency check skipped.")
    elif insufficient_df is not None and not insufficient_df.empty:
        # Preview the insufficient classes
        print(f"\n{len(insufficient_df)} classes with insufficient samples:")
        insufficient_display = insufficient_df.copy()
        insufficient_display['Percentage'] = insufficient_display['Percentage'].apply(lambda x: f"{x:.2f}%")
        display(insufficient_display)
    else:
        print(f"\nAll classes have sufficient samples!")

    # Define table (the same assignment applies to every case above)
    TrainDataRecap = TrainDataDict.get('training_data', pd.DataFrame())
else:
    print("Training data recap will be defined from sampling tool")
    TrainDataRecap = pd.DataFrame(columns=['kelas', 'LULC_Type', 'latitude', 'longitude'])

# Final validation: 'training_data' may be present in the dict but set to None
if TrainDataRecap is None:
    print("Warning: TrainDataRecap is None, initializing empty DataFrame")
    TrainDataRecap = pd.DataFrame(columns=['kelas', 'LULC_Type', 'latitude', 'longitude'])


11 classes with insufficient samples:


Unnamed: 0,ID,LULC_class,Sample_Count,Percentage,Status
2,3,Undisturbed mangrove,0,0.00%,No Samples
3,4,Logged-over mangrove,0,0.00%,No Samples
4,5,Undisturbed swamp forest,0,0.00%,No Samples
5,6,Logged-over swamp forest,0,0.00%,No Samples
7,8,Plantation forest,0,0.00%,No Samples
10,11,Other monoculture,0,0.00%,No Samples
11,12,Grass/savanna,0,0.00%,No Samples
12,13,Shrub,2,0.23%,Insufficient
13,14,Cropland,0,0.00%,No Samples
15,16,Cleared land,16,1.85%,Insufficient


In [12]:
# ----- System response 3.3 -----
# Only runs when neither the reference data source nor an uploaded shapefile
# supplies the training samples: show the on-screen sampling tool and turn
# whatever it has collected into TrainDataRecap (one row per sampled point).
if not ReferenceDataSource and not UploadTrainData:

    # ----- System response 3.3.a -----
    tool = LULCSamplingTool(
        lulc_dataframe=LULCTable,
        aoi_ee_featurecollection=AOI
    )

    tool.Display()

    # Export training data
    # NOTE(review): this runs right after Display(), so it may export before
    # any point has been collected - confirm the intended order.
    tool.SaveTrainingData("results/training_points.csv")

    # Ensure we update TrainDataRecap from the sampling tool
    try:
        tool.UpdateTrainDataSampling()
        sampling_df = tool.TrainDataSampling

        # Check if we have any data
        if sampling_df.empty or sampling_df['Points'].sum() == 0:
            print("No training points collected yet. Please use the sampling tool to collect points first.")
            TrainDataRecap = pd.DataFrame(columns=['kelas', 'LULC_Type', 'latitude', 'longitude'])
        else:
            from shapely.geometry import Point

            # Convert sampling tool data to individual point records
            training_points = []
            skipped_pairs = 0

            for _, row in sampling_df.iterrows():
                coordinates = row['Coordinates']
                # A missing cell (e.g. NaN) is not a string and has no pairs
                if not (row['Points'] > 0 and isinstance(coordinates, str) and coordinates):
                    continue

                # Pairs are ';'-separated "(a, b)" items. Whitespace around
                # the separators is tolerated so that valid pairs are not
                # dropped over formatting differences.
                for coord_pair in coordinates.split(';'):
                    coord_text = coord_pair.strip().strip('()')
                    if not coord_text:
                        continue
                    try:
                        # assumes each pair is stored as (lat, lon) - TODO confirm
                        lat, lon = map(float, coord_text.split(','))
                    except ValueError:
                        skipped_pairs += 1
                        continue
                    training_points.append({
                        'kelas': row['ID'],
                        'LULC_Type': row['LULC_Type'],
                        'latitude': lat,
                        'longitude': lon
                    })

            if skipped_pairs:
                print(f"Skipped {skipped_pairs} malformed coordinate pair(s)")

            # Convert to GeoDataFrame
            if training_points:
                geometries = [Point(point['longitude'], point['latitude']) for point in training_points]
                TrainDataRecap = gpd.GeoDataFrame(training_points, geometry=geometries, crs='EPSG:4326')
                print(f"Successfully converted {len(TrainDataRecap)} training points from sampling tool")
            else:
                TrainDataRecap = pd.DataFrame(columns=['kelas', 'LULC_Type', 'latitude', 'longitude'])
    except Exception as e:
        print(f"Error updating training data from sampling tool: {e}")
        TrainDataRecap = pd.DataFrame(columns=['kelas', 'LULC_Type', 'latitude', 'longitude'])

In [13]:
# Rebuild TrainDataRecap from the sampling tool's current state. Only runs
# when neither the reference data source nor an uploaded shapefile is used.
# Requires `tool` to exist already in the kernel.
if not ReferenceDataSource and not UploadTrainData:
    # ----- System response 3.3.a -----
    # Ensure we have the latest data from the sampling tool
    tool.UpdateTrainDataSampling()
    sampling_df = tool.TrainDataSampling

    # Check if we have any data
    if sampling_df.empty or sampling_df['Points'].sum() == 0:
        print("No training points collected yet. Please use the sampling tool to collect points first.")
        TrainDataRecap = pd.DataFrame(columns=['kelas', 'LULC_Type', 'latitude', 'longitude'])
    else:
        from shapely.geometry import Point

        # Convert sampling tool data to individual point records
        training_points = []
        skipped_pairs = 0

        for _, row in sampling_df.iterrows():
            coordinates = row['Coordinates']
            # A missing cell (e.g. NaN) is not a string and has no pairs
            if not (row['Points'] > 0 and isinstance(coordinates, str) and coordinates):
                continue

            # Pairs are ';'-separated "(a, b)" items. Whitespace around the
            # separators is tolerated so that valid pairs are not dropped
            # over formatting differences.
            for coord_pair in coordinates.split(';'):
                coord_text = coord_pair.strip().strip('()')
                if not coord_text:
                    continue
                try:
                    # assumes each pair is stored as (lat, lon) - TODO confirm
                    lat, lon = map(float, coord_text.split(','))
                except ValueError:
                    skipped_pairs += 1
                    continue
                training_points.append({
                    'kelas': row['ID'],
                    'LULC_Type': row['LULC_Type'],
                    'latitude': lat,
                    'longitude': lon
                })

        if skipped_pairs:
            print(f"Skipped {skipped_pairs} malformed coordinate pair(s)")

        # Convert to GeoDataFrame
        if training_points:
            geometries = [Point(point['longitude'], point['latitude']) for point in training_points]
            TrainDataRecap = gpd.GeoDataFrame(training_points, geometry=geometries, crs='EPSG:4326')
            print(f"Successfully converted {len(TrainDataRecap)} training points")
        else:
            TrainDataRecap = pd.DataFrame(columns=['kelas', 'LULC_Type', 'latitude', 'longitude'])

In [14]:
# ----- Validation Check -----
# Report whether the two tables this cell relies on are usable.
print("Variable Status Check:")
lulc_table_ready = LULCTable is not None and not (hasattr(LULCTable, 'empty') and LULCTable.empty)
print(f"- LULCTable defined: {lulc_table_ready}")
print(f"- TrainDataRecap defined: {TrainDataRecap is not None}")
recap_sample_count = 0 if TrainDataRecap is None else len(TrainDataRecap)
print(f"- TrainDataRecap samples: {recap_sample_count}")

# Replace a missing/empty recap with an empty, correctly-shaped table so the
# code below can always call `.empty` on it.
if TrainDataRecap is None or getattr(TrainDataRecap, 'empty', False):
    print("WARNING: No training data available! Please check your data sources.")
    TrainDataRecap = pd.DataFrame(columns=['kelas', 'LULC_Type', 'latitude', 'longitude'])

# ----- System response 3.4 -----
# Count the samples per class and show the result as RecapTable.
print("Recapitulation of selected training data sample:")

if not TrainDataRecap.empty:
    # NOTE(review): grouping uses the first two columns by position; this
    # assumes they hold the class ID and the class label - confirm for
    # uploaded data.
    id_col, label_col = TrainDataRecap.columns[0], TrainDataRecap.columns[1]

    class_counts = TrainDataRecap.groupby([id_col, label_col]).size()
    RecapTable = class_counts.reset_index(name='Samples').rename(columns={id_col: 'ID'})
    RecapTable = RecapTable.sort_values('ID').reset_index(drop=True)
else:
    print("No training data available for recapitulation")
    RecapTable = pd.DataFrame(columns=['ID', 'LULC_Type', 'Samples'])

display(RecapTable)

Variable Status Check:
- LULCTable defined: True
- TrainDataRecap defined: True
- TrainDataRecap samples: 863
Recapitulation of selected training data sample:


Unnamed: 0,ID,LULC_Type,Samples
0,1,Undisturbed dry-land forest,190
1,2,Logged-over dry-land forest,74
2,7,Agroforestry,157
3,9,Rubber monoculture,317
4,10,Oil palm monoculture,60
5,13,Shrub,2
6,15,Settlement,47
7,16,Cleared land,16


In [None]:
# ----- System response 3.5 -----
# Split TrainDataRecap into training and validation sets (when SplitTrain is
# set), preview the result and plot the points over the AOI.
# Produces: TrainDataFinal, ValidDataFinal (None when no split), Map.
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt

# Split data process
if SplitTrain:

    # Split training data; the fixed random_state keeps the split repeatable
    TrainDataFinal, ValidDataFinal = SplitTrainData.SplitProcess(
        TrainDataRecap,
        TrainSplitPct=TrainSplitPct,
        random_state=123
    )

    # Print split statistics
    print(f"Split data summary")
    print(f" - Training data: {len(TrainDataFinal)} samples")
    print(f" - Validation data: {len(ValidDataFinal)} samples")
    print(f" - Total: {len(TrainDataRecap)} samples\n")

    # Display preview
    print(f"Training data head:")
    display(TrainDataFinal.head())
    print(f"Validation data head:")
    display(ValidDataFinal.head())

    # Plot the split data
    Map = SplitTrainData.PlotTrainValidInteractive(TrainDataFinal, AOI, ValidDataFinal)

else:
    print("Split not performed. Using entire dataset as training.")
    TrainDataFinal = TrainDataRecap
    ValidDataFinal = None

    # Display preview
    print(f"Training data head:")
    display(TrainDataFinal.head())

    # Plot the training data
    Map = SplitTrainData.PlotTrainValidInteractive(TrainDataFinal, AOI)

# Jupyter only renders the value of the last *top-level* expression of a
# cell; a bare `Map` nested inside the if/else above is evaluated and
# discarded. Keep it here, outside the branches, so the map is shown.
Map

Split data summary
 - Training data: 604 samples
 - Validation data: 259 samples
 - Total: 863 samples

Training data head:


Unnamed: 0,kelas,LULC_Type,geometry,LULC_Class_Mapped
507,9,Rubber monoculture,POINT (101.87293 -1.34415),Rubber monoculture
100,1,Undisturbed dry-land forest,POINT (101.53999 -1.78549),Undisturbed dry-land forest
686,9,Rubber monoculture,POINT (101.99584 -1.84453),Rubber monoculture
292,7,Agroforestry,POINT (101.80517 -1.6829),Agroforestry
846,15,Settlement,POINT (102.11198 -1.48062),Settlement


Validation data head:


Unnamed: 0,kelas,LULC_Type,geometry,LULC_Class_Mapped
765,10,Oil palm monoculture,POINT (101.7471 -1.60063),Oil palm monoculture
431,9,Rubber monoculture,POINT (102.16425 -1.67225),Rubber monoculture
838,15,Settlement,POINT (102.292 -1.61902),Settlement
804,15,Settlement,POINT (102.36459 -1.64418),Settlement
199,2,Logged-over dry-land forest,POINT (102.07037 -1.77581),Logged-over dry-land forest


Plotting training and validation data...
