In [1]:
from ismn.interface import ISMN_Interface
import matplotlib.pyplot as plt
%matplotlib inline
import pandas as pd
import os
import re
import numpy as np
import xarray as xr
import geopandas as gpd
from shapely.geometry import Point
from datetime import datetime
import warnings
warnings.filterwarnings('ignore')

In [2]:
# Location of the ISMN archive handed to ISMN_Interface. It is assembled with
# os.path.join so the same relative path resolves on every OS: a raw
# backslash string is a single file name outside Windows, which also breaks
# the os.path.split(path) call used later to derive the output folder.
path = os.path.join(
    'data',
    'africa',
    'Data_separate_files_header_20160614_20250614_11438_iyLN_20250614.zip',
)
ismn_data = ISMN_Interface(path, parallel=True)

# Iterating the networks collection yields the network names.
network_list = list(ismn_data.networks)
print('Data will be extracted for these station: ', network_list)

Using the existing ismn metadata in data\africa\python_metadata\Data_separate_files_header_20160614_20250614_11438_iyLN_20250614.csv to set up ISMN_Interface. 
If there are issues with the data reader, you can remove the metadata csv file to repeat metadata collection.
Data will be extracted for these station:  ['AMMA-CATCH', 'SD_DEM', 'TAHMO']


In [None]:
def get_static(sm, default_network='AMMA-CATCH'):
    """
    Collect the per-sensor static metadata of a soil moisture dataset.

    Parameters:
    - sm: xarray Dataset with a 'sensor' dimension. The 'latitude' and
      'longitude' variables are required; 'network', 'station', 'instrument',
      'elevation', 'depth_from', 'depth_to', 'clay_fraction', 'sand_fraction',
      'silt_fraction' and 'organic_carbon' are used when present.
    - default_network: str, label written to the 'network' column when the
      dataset has no 'network' variable. Defaults to the value that used to
      be hard-coded, so existing callers are unaffected.

    Returns:
    - static_df: pandas DataFrame with one row per sensor. Missing optional
      numeric fields are NaN; a missing station / instrument is replaced by
      'Station_<idx>' / 'Sensor_<idx>'.
    """
    # Dataset.sizes is the stable name -> length mapping; indexing
    # Dataset.dims by name is deprecated in xarray.
    n_sensors = sm.sizes['sensor']
    if n_sensors == 0:
        return pd.DataFrame([])

    available = sm.variables

    def values_of(name):
        # Materialise each variable once instead of once per sensor.
        return getattr(sm, name).values if name in available else None

    networks = values_of('network')
    stations = values_of('station')
    instruments = values_of('instrument')
    latitudes = sm.latitude.values
    longitudes = sm.longitude.values

    optional_floats = (
        'elevation', 'depth_from', 'depth_to', 'clay_fraction',
        'sand_fraction', 'silt_fraction', 'organic_carbon',
    )
    float_values = {name: values_of(name) for name in optional_floats}

    static_data = []
    for sensor_idx in range(n_sensors):
        sensor_data = {
            'sensor_id': sensor_idx,
            'network': networks[sensor_idx] if networks is not None else default_network,
            'station': stations[sensor_idx] if stations is not None else f'Station_{sensor_idx}',
            'sensor_name': str(instruments[sensor_idx]) if instruments is not None else f'Sensor_{sensor_idx}',
            'latitude': float(latitudes[sensor_idx]),
            'longitude': float(longitudes[sensor_idx]),
        }
        # Optional numeric fields keep their original column order.
        for name, values in float_values.items():
            sensor_data[name] = float(values[sensor_idx]) if values is not None else np.nan
        static_data.append(sensor_data)

    static_df = pd.DataFrame(static_data)

    return static_df


def get_sm_time_series(sm, statistic='mean'):
    """
    Aggregate soil moisture to daily values and return them as a wide table.

    Readings that are NaN or outside the [0, 1] range (which also covers the
    -9999 / -999 missing-value sentinels) are discarded *before* the daily
    aggregation. Filtering afterwards, as was done previously, let a single
    sentinel reading corrupt a whole day's mean/min/std and threw away every
    daily 'sum' larger than 1.

    Parameters:
    - sm: xarray Dataset holding a 'soil_moisture' variable with a
      'date_time' coordinate
    - statistic: str, one of ['mean', 'median', 'min', 'max', 'sum', 'std']

    Returns:
    - ts_df: pandas DataFrame with one column per day, labelled 'YYYY-MM-DD'.
      Days without any valid reading are NaN. Rows follow the remaining
      (non-time) dimension of 'soil_moisture'.

    Raises:
    - ValueError: if `statistic` is not one of the supported names
    """
    supported = ('mean', 'median', 'min', 'max', 'sum', 'std')
    if statistic not in supported:
        raise ValueError(f"Statistic '{statistic}' is not supported. Use 'mean', 'median', 'min', 'max', 'sum', or 'std'.")

    # Convert time to datetime
    sm_with_time = sm.assign_coords(date_time=pd.to_datetime(sm.date_time.values))

    # Cleaning: keep only readings inside [0, 1]; everything else becomes NaN
    # and is skipped by the NaN-aware reductions below.
    raw_sm = sm_with_time.soil_moisture
    valid_sm = raw_sm.where((raw_sm >= 0) & (raw_sm <= 1))

    daily_groups = valid_sm.resample(date_time='D')
    if statistic == 'sum':
        # min_count=1 keeps days without valid data as NaN instead of 0.
        daily_sm = daily_groups.sum(min_count=1)
    else:
        daily_sm = getattr(daily_groups, statistic)()

    # Time must be the last axis so that each column is one day.
    daily_sm = daily_sm.transpose(..., 'date_time')

    # Build DataFrame
    date_strings = pd.to_datetime(daily_sm.date_time.values).strftime('%Y-%m-%d')
    ts_df = pd.DataFrame(daily_sm.values, columns=date_strings)

    return ts_df



def export_gdf(gdf, output_path, file_format='geojson'):
    """
    Write a GeoDataFrame to disk in the requested format.

    Parameters:
    - gdf: GeoDataFrame to export
    - output_path: destination path without file extension; the extension is
      appended from the chosen format
    - file_format: 'geojson', 'shp', 'parquet', 'gpkg' or 'csv'
      (case-insensitive)

    Raises:
    - ValueError: if `file_format` is not one of the supported formats
    """
    fmt = file_format.lower()

    # Format -> driver name handed to GeoDataFrame.to_file. None marks the
    # formats that are written by their own dedicated method instead.
    file_drivers = {
        'geojson': 'GeoJSON',
        'shp': 'ESRI Shapefile',
        'parquet': None,
        'gpkg': 'GPKG',
        'csv': None,
    }
    supported_formats = list(file_drivers)

    if fmt not in file_drivers:
        raise ValueError(f"Unsupported format: {fmt}. Supported formats are: {supported_formats}")

    # Every supported format uses its own name as the file extension.
    full_path = f"{output_path}.{fmt}"
    driver = file_drivers[fmt]

    if driver is not None:
        gdf.to_file(full_path, driver=driver)
    elif fmt == 'parquet':
        gdf.to_parquet(full_path)
    else:
        gdf.to_csv(full_path)

    print(f"File successfully written to {full_path}\n{'-' * 50}")



In [None]:

export_format = 'csv'  #'geojson', 'shp', 'parquet', 'gpkg','csv'
stat_operator=['mean','max','min','std', 'median']

# Networks form the outer loop: loading a network and extracting its static
# metadata do not depend on the statistic, so they are done once per network
# rather than once per (statistic, network) pair.
for network in ismn_data.networks:
    print(f'Processing network: {network}')

    try:
        sm = ismn_data[network].to_xarray(variable='soil_moisture')

        if sm is None or len(sm.sensor)==0:
            print(f"No soil moisture data available for network: {network}. Skipping...\n{'-'*50}")
            continue

        # Extract static parameters
        static_df = get_static(sm)
        print('Dimention of static dataframe: ', static_df.shape)

    except Exception as e:
        # Best effort: a network that cannot be loaded must not stop the others.
        print(f"Error processing network {network}:{e}\n{'-'*50}")
        continue

    for stat in stat_operator:

        try:
            # Extract time series soil moisture for this statistic
            ts_df = get_sm_time_series(sm, statistic=stat)

            # Merge (both frames share the default positional index, one row per sensor)
            merged_df = pd.concat([static_df, ts_df], axis=1)
            print('Dimention of time series soil moisture dataframe: ',ts_df.shape)
            print('Dimention of merged dataframe: ', merged_df.shape)

            geometry = [Point(xy) for xy in zip(merged_df['longitude'], merged_df['latitude'])]
            gdf = gpd.GeoDataFrame(merged_df, geometry=geometry, crs='EPSG:4326')

            # Build output path without extension:
            # <archive folder>/extracted_data/<statistic>/<network>
            output_path = os.path.join(os.path.dirname(path), 'extracted_data', stat, f'{network}')
            os.makedirs(os.path.dirname(output_path), exist_ok=True)

            export_gdf(gdf, output_path, file_format=export_format)

        except Exception as e:
            # Best effort: one failing statistic must not stop the remaining ones.
            print(f"Error processing network {network} ({stat}):{e}\n{'-'*50}")

        
