In [1]:
import numpy as np
import pandas as pd
import xarray as xr
import geopandas as gpd
import matplotlib.pyplot as plt
import rasterio
import concurrent.futures

from netCDF4 import Dataset
from shapely.geometry import Point
from tqdm import tqdm
from shapely.vectorized import contains

import warnings
warnings.filterwarnings("ignore")
warnings.simplefilter("ignore", category=UserWarning)

In [2]:
# Basin selection table. `stat_num` holds the station ids and `source` the
# label that is later used to build each boundary shapefile name.
basin_info = pd.read_excel('../../Data/Basin_Selection/All_Selected_Basins.xlsx')
basin_list, source_list = basin_info['stat_num'], basin_info['source']

# AE

In [None]:
# Load the GLEAM v4.2a "E" fields for 1980-2022 (one NetCDF file per year)
# and stack them along the first (time) axis.
gleam_pattern = "F:/GLEAM/v42a/E/E_{year}_GLEAM_v4.2a_MO.nc"  # replace with your nc file path

yearly_AE = []
for year in range(1980, 2023):
    # The context manager closes each yearly file once its values are in memory.
    with xr.open_dataset(gleam_pattern.format(year=year)) as ds:
        if year == 1980:
            # The coordinate vectors are taken from the first file only.
            lon = ds["lon"].to_numpy()
            lat = ds["lat"].to_numpy()
        yearly_AE.append(ds["E"].to_numpy())
    print(f"Year {year} data loaded.")

# Concatenate once at the end: concatenating inside the loop re-copies the
# whole accumulated array for every additional year.
AE_GLEAM = np.concatenate(yearly_AE, axis=0)
del yearly_AE

time_series = pd.date_range(start='1980-01-01', end='2022-12-31', freq='MS')

# The basin loop pairs AE_GLEAM's first axis with `time_series`; fail early
# if the files do not provide exactly one step per month-start timestamp.
assert AE_GLEAM.shape[0] == len(time_series), (
    f"GLEAM time steps ({AE_GLEAM.shape[0]}) != expected months ({len(time_series)})"
)

In [None]:
# Basin-average time series of the GLEAM "E" field, one output file per basin.
# Uses the globals `lon`, `lat`, `AE_GLEAM` and `time_series` set by the
# GLEAM loading cell.
boundary_dir = "E:/2024_01_Global_WBM_Validation/Raw_Data/Boundary/"
for i in range(len(basin_list)):
    basin = str(basin_list[i])
    print(f"Processing No.{i+1}, {basin}...")
    source = source_list[i]
    # "ZHCN" boundary files carry no source prefix in their file name.
    if source == "ZHCN":
        shp_filepath = boundary_dir + basin + ".shp"
    else:
        shp_filepath = boundary_dir + source + "_" + basin + ".shp"
    # Read the basin boundary shapefile
    gdf = gpd.read_file(shp_filepath)  # replace with your shp file path
    polygon = gdf.geometry
    minx, miny, maxx, maxy = polygon.total_bounds
    # Padded bounding box of the basin (padding is asymmetric: 0.2 / 0.5 degrees)
    basin_left_bound = minx - 0.2
    basin_right_bound = maxx + 0.5
    basin_top_bound = maxy + 0.5
    basin_bottom_bound = miny - 0.2
    # Grid indices that fall inside the padded bounding box
    lon_loc = np.where((lon >= basin_left_bound) & (lon <= basin_right_bound))[0]
    lat_loc = np.where((lat >= basin_bottom_bound) & (lat <= basin_top_bound))[0]

    if len(lon_loc) == 0 or len(lat_loc) == 0:
        # No grid cell near this basin: write an all-NaN series instead of
        # crashing on lat_loc[0] / lon_loc[0] and aborting the whole run.
        AE_series = np.full(AE_GLEAM.shape[0], np.nan)
    else:
        temp_lon = lon[lon_loc]
        temp_lat = lat[lat_loc]
        temp_AE = AE_GLEAM[:, lat_loc[0]:lat_loc[-1]+1, lon_loc[0]:lon_loc[-1]+1]
        # Coordinates of every grid point in the window
        lon2d, lat2d = np.meshgrid(temp_lon, temp_lat)
        grid_x, grid_y = lon2d.ravel(), lat2d.ravel()
        # Vectorized point-in-polygon test (as in the Slope section) instead of
        # one GeoSeries.contains call per grid point; a point counts as inside
        # when any geometry of the shapefile contains it.
        mask = np.zeros(grid_x.shape, dtype=bool)
        for geom in polygon:
            if geom is None or geom.is_empty:
                continue
            mask |= contains(geom, grid_x, grid_y)
        mask = mask.reshape(lon2d.shape)  # back to the 2-D window shape
        # 1.0 inside the basin, NaN outside, so nanmean ignores outside cells
        new_mask = np.where(mask, 1.0, np.nan)
        # Spatial mean for every time step
        AE_series = np.nanmean(temp_AE * new_mask, axis=(1, 2))

    AE_filepath = f"../../Data/AE/AE_{basin}.txt"
    AE_df = pd.DataFrame({'Time': time_series, 'AE': AE_series}).set_index('Time')
    AE_df.to_csv(AE_filepath, sep="\t", float_format="%.3f", index=True, header=True)

# BFI

In [None]:
# Read the BFI raster together with its coordinate vectors.
ds = xr.open_dataset("../../Raw_Data/BFI.nc")
lon, lat = ds["longitude"].to_numpy(), ds["latitude"].to_numpy()
# Exchange the first two axes (same result as rot90 followed by a flip along
# axis 0). Presumably the file stores (longitude, latitude) and the basin loop
# needs [lat, lon] -- confirm against the file.
BFI = np.swapaxes(ds["BFI"].to_numpy(), 0, 1)
# Quick visual check of the orientation
plt.imshow(BFI)

In [None]:
# Basin-average BFI, one value per basin, written to a single table.
# Uses the globals `lon`, `lat` and `BFI` set by the BFI loading cell.
BFI_list = np.full(len(basin_list), np.nan)
boundary_dir = "E:/2024_01_Global_WBM_Validation/Raw_Data/Boundary/"
for i in range(len(basin_list)):
    basin = str(basin_list[i])
    print(f"Processing No.{i+1}, {basin}...")
    source = source_list[i]
    # "ZHCN" boundary files carry no source prefix in their file name.
    if source == "ZHCN":
        shp_filepath = boundary_dir + basin + ".shp"
    else:
        shp_filepath = boundary_dir + source + "_" + basin + ".shp"
    # Read the basin boundary shapefile
    gdf = gpd.read_file(shp_filepath)  # replace with your shp file path
    polygon = gdf.geometry
    minx, miny, maxx, maxy = polygon.total_bounds
    # Padded bounding box of the basin (padding is asymmetric: 0.2 / 0.5 degrees)
    basin_left_bound = minx - 0.2
    basin_right_bound = maxx + 0.5
    basin_top_bound = maxy + 0.5
    basin_bottom_bound = miny - 0.2
    # Grid indices that fall inside the padded bounding box
    lon_loc = np.where((lon >= basin_left_bound) & (lon <= basin_right_bound))[0]
    lat_loc = np.where((lat >= basin_bottom_bound) & (lat <= basin_top_bound))[0]
    if len(lon_loc) == 0 or len(lat_loc) == 0:
        # No grid cell near this basin: keep the pre-filled NaN instead of
        # crashing on lat_loc[0] / lon_loc[0] and aborting the whole run.
        continue
    temp_lon = lon[lon_loc]
    temp_lat = lat[lat_loc]
    temp_BFI = BFI[lat_loc[0]:lat_loc[-1]+1, lon_loc[0]:lon_loc[-1]+1]
    # Coordinates of every grid point in the window
    lon2d, lat2d = np.meshgrid(temp_lon, temp_lat)
    grid_x, grid_y = lon2d.ravel(), lat2d.ravel()
    # Vectorized point-in-polygon test (as in the Slope section) instead of
    # one GeoSeries.contains call per grid point; a point counts as inside
    # when any geometry of the shapefile contains it.
    mask = np.zeros(grid_x.shape, dtype=bool)
    for geom in polygon:
        if geom is None or geom.is_empty:
            continue
        mask |= contains(geom, grid_x, grid_y)
    mask = mask.reshape(lon2d.shape)  # back to the 2-D window shape
    # 1.0 inside the basin, NaN outside, so nanmean ignores outside cells
    new_mask = np.where(mask, 1.0, np.nan)

    BFI_list[i] = np.nanmean(temp_BFI * new_mask)

BFI_filepath = "../../Data/BFI/BFI.txt"
BFI_df = pd.DataFrame({'BFI': BFI_list}, index=basin_list)
BFI_df.to_csv(BFI_filepath, sep="\t", float_format="%.2f", index=True, header=True)

# Climate Zones

In [None]:
def get_mode(aa):
    """
    Return the most frequent value of an array, ignoring NaN entries.

    :param aa: numeric array (2-D in this notebook) that may contain NaN
    :return: the most frequent non-NaN value; when several values share the
             highest count the smallest of them is returned (np.unique sorts
             its output and argmax picks the first maximum). None when every
             entry is NaN.
    """
    # Boolean indexing both drops the NaNs and flattens the array.
    finite_values = aa[np.logical_not(np.isnan(aa))]
    if finite_values.size == 0:
        return None  # nothing but NaN
    uniques, tallies = np.unique(finite_values, return_counts=True)
    return uniques[tallies.argmax()]

In [None]:
# Read the climate-class raster together with its coordinate vectors.
ds = xr.open_dataset("../../Raw_Data/Climate.nc")
lon, lat = ds["longitude"].to_numpy(), ds["latitude"].to_numpy()
# Exchange the first two axes (same result as rot90 followed by a flip along
# axis 0). Presumably the file stores (longitude, latitude) and the basin
# processing needs [lat, lon] -- confirm against the file.
Climate = np.swapaxes(ds["Climate"].to_numpy(), 0, 1)

In [None]:
def process_basin(args):
    """
    Determine the most frequent climate class inside one basin.

    Reads the module-level `lon`, `lat` and `Climate` arrays, so the Climate
    loading cell must be the most recent raster-loading cell that was run.

    :param args: tuple (i, basin, source): position of the basin in
                 `basin_list`, station id as a string, and the source label
                 used in the boundary file name
    :return: tuple (i, value) where value is the most frequent non-NaN
             `Climate` value among grid points inside the basin boundary, or
             NaN when no grid point qualifies or when processing raises
    """
    i, basin, source = args
    boundary_dir = "E:/2024_01_Global_WBM_Validation/Raw_Data/Boundary/"
    try:
        # "ZHCN" boundary files carry no source prefix in their file name.
        if source == "ZHCN":
            shp_filepath = boundary_dir + basin + ".shp"
        else:
            shp_filepath = boundary_dir + source + "_" + basin + ".shp"

        # Read shapefile
        gdf = gpd.read_file(shp_filepath)
        polygon = gdf.geometry
        minx, miny, maxx, maxy = polygon.total_bounds

        # Padded bounding box of the basin (asymmetric: 0.2 / 0.5 degrees)
        basin_left_bound = minx - 0.2
        basin_right_bound = maxx + 0.5
        basin_top_bound = maxy + 0.5
        basin_bottom_bound = miny - 0.2

        # Grid indices that fall inside the padded bounding box
        lon_loc = np.where((lon >= basin_left_bound) & (lon <= basin_right_bound))[0]
        lat_loc = np.where((lat >= basin_bottom_bound) & (lat <= basin_top_bound))[0]

        if len(lon_loc) == 0 or len(lat_loc) == 0:
            return i, np.nan

        temp_lon = lon[lon_loc]
        temp_lat = lat[lat_loc]
        temp_Climate = Climate[lat_loc[0]:lat_loc[-1]+1, lon_loc[0]:lon_loc[-1]+1]

        # Coordinates of every grid point in the window
        lon2d, lat2d = np.meshgrid(temp_lon, temp_lat)
        grid_x, grid_y = lon2d.ravel(), lat2d.ravel()

        # Vectorized point-in-polygon test (as in the Slope section) instead
        # of one GeoSeries.contains call per grid point; a point counts as
        # inside when any geometry of the shapefile contains it.
        mask = np.zeros(grid_x.shape, dtype=bool)
        for geom in polygon:
            if geom is None or geom.is_empty:
                continue
            mask |= contains(geom, grid_x, grid_y)
        mask = mask.reshape(lon2d.shape)
        # 1.0 inside the basin, NaN outside, so get_mode ignores outside cells
        new_mask = np.where(mask, 1.0, np.nan)

        Climate_basin = get_mode(temp_Climate * new_mask)
        # get_mode returns None when every value is NaN; store NaN explicitly
        # because the result goes into a float array.
        if Climate_basin is None:
            Climate_basin = np.nan

        return i, Climate_basin
    except Exception as e:
        # Best effort: report the failing basin and continue with the others.
        print(f"Error processing basin {basin} (index {i+1}): {e}")
        return i, np.nan

# Initialize the result array
Climate_list = np.full(len(basin_list), np.nan)

# Prepare arguments for the process_basin function
args_list = [(i, str(basin_list[i]), source_list[i]) for i in range(len(basin_list))]

# Process basins in parallel with a progress bar
max_workers = 12  # Adjust based on your CPU
with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor:
    results = list(tqdm(executor.map(process_basin, args_list), total=len(args_list), desc="Processing basins"))

    # Update Climate_list with results
    for i, result in results:
        Climate_list[i] = result

# Save results
Climate_filepath = "../../Data/Climate.txt"
Climate_df = pd.DataFrame(Climate_list, columns=['Koppen'], index=basin_list)
Climate_df.to_csv(Climate_filepath, sep="\t", float_format="%.2f", index=True, header=True)

Processing basins: 100%|██████████| 2003/2003 [05:08<00:00,  6.50it/s]


# NDVI

In [None]:
# Read the NDVI stack and its coordinate vectors from the NetCDF file.
ds = xr.open_dataset("F:/NDVI/GIMMS/NDVI_Global_1982_2022/NDVI_Global_1982_2022.nc")
lon, lat, NDVI = (ds[name].to_numpy() for name in ("lon", "lat", "NDVI"))
# Month-start timestamps for 1982-2022; the basin loop pairs them with the
# first axis of NDVI.
time_series = pd.date_range(start='1982-01-01', end='2022-12-31', freq='MS')

In [None]:
# Basin-average NDVI time series, one output file per basin.
# Uses the globals `lon`, `lat`, `NDVI` and `time_series` set by the NDVI
# loading cell.
boundary_dir = "E:/2024_01_Global_WBM_Validation/Raw_Data/Boundary/"
for i in range(len(basin_list)):
    basin = str(basin_list[i])
    print(f"Processing No.{i+1}, {basin}...")
    source = source_list[i]
    # "ZHCN" boundary files carry no source prefix in their file name.
    if source == "ZHCN":
        shp_filepath = boundary_dir + basin + ".shp"
    else:
        shp_filepath = boundary_dir + source + "_" + basin + ".shp"
    # Read the basin boundary shapefile
    gdf = gpd.read_file(shp_filepath)  # replace with your shp file path
    polygon = gdf.geometry
    minx, miny, maxx, maxy = polygon.total_bounds
    # Padded bounding box of the basin (padding is asymmetric: 0.2 / 0.5 degrees)
    basin_left_bound = minx - 0.2
    basin_right_bound = maxx + 0.5
    basin_top_bound = maxy + 0.5
    basin_bottom_bound = miny - 0.2
    # Grid indices that fall inside the padded bounding box
    lon_loc = np.where((lon >= basin_left_bound) & (lon <= basin_right_bound))[0]
    lat_loc = np.where((lat >= basin_bottom_bound) & (lat <= basin_top_bound))[0]

    if len(lon_loc) == 0 or len(lat_loc) == 0:
        # No grid cell near this basin: write an all-NaN series instead of
        # crashing on lat_loc[0] / lon_loc[0] and aborting the whole run.
        NDVI_series = np.full(NDVI.shape[0], np.nan)
    else:
        temp_lon = lon[lon_loc]
        temp_lat = lat[lat_loc]
        temp_NDVI = NDVI[:, lat_loc[0]:lat_loc[-1]+1, lon_loc[0]:lon_loc[-1]+1]
        # Coordinates of every grid point in the window
        lon2d, lat2d = np.meshgrid(temp_lon, temp_lat)
        grid_x, grid_y = lon2d.ravel(), lat2d.ravel()
        # Vectorized point-in-polygon test (as in the Slope section) instead of
        # one GeoSeries.contains call per grid point; a point counts as inside
        # when any geometry of the shapefile contains it.
        mask = np.zeros(grid_x.shape, dtype=bool)
        for geom in polygon:
            if geom is None or geom.is_empty:
                continue
            mask |= contains(geom, grid_x, grid_y)
        mask = mask.reshape(lon2d.shape)  # back to the 2-D window shape
        # 1.0 inside the basin, NaN outside, so nanmean ignores outside cells
        new_mask = np.where(mask, 1.0, np.nan)
        # Spatial mean for every time step
        NDVI_series = np.nanmean(temp_NDVI * new_mask, axis=(1, 2))

    NDVI_filepath = f"../../Data/NDVI/NDVI_{basin}.txt"
    NDVI_df = pd.DataFrame({'Time': time_series, 'NDVI': NDVI_series}).set_index('Time')
    NDVI_df.to_csv(NDVI_filepath, sep="\t", float_format="%.3f", index=True, header=True)

# Location

In [3]:
# Centroid and bounding-box extent of every basin, all in the coordinate
# units of the shapefile (comments below assume degrees -- confirm the CRS).
# Columns: centroid lat, centroid lon, N-S extent, E-W extent, extent ratio.
Location_list = np.full((len(basin_list), 5), np.nan)
for i in range(len(basin_list)):
    basin = str(basin_list[i])
    # print(f"Processing No.{i+1}, {basin}...")
    source = source_list[i]
    # "ZHCN" boundary files carry no source prefix in their file name.
    shp_filepath = (
        "E:/2024_01_Global_WBM_Validation/Raw_Data/Boundary/" + basin + ".shp"
        if source == "ZHCN"
        else "E:/2024_01_Global_WBM_Validation/Raw_Data/Boundary/" + source + "_" + basin + ".shp"
    )
    # Read the basin boundary shapefile
    gdf = gpd.read_file(shp_filepath)  # replace with your shp file path
    polygon = gdf.geometry

    # Centroid of the first geometry in the file
    centroid = polygon.centroid.values[0]
    center_lon = centroid.x
    center_lat = centroid.y

    # Bounding box of the first geometry: [minx, miny, maxx, maxy]
    bounds = polygon.bounds.values[0]
    xmin, ymin, xmax, ymax = bounds
    width = xmax - xmin   # east-west extent
    length = ymax - ymin  # north-south extent
    # A zero-width box would divide by zero, so the ratio is left as NaN.
    if width > 0:
        aspect_ratio = length / width
    else:
        aspect_ratio = np.nan

    # Fill the whole row in one assignment
    Location_list[i, :] = (center_lat, center_lon, length, width, aspect_ratio)
# Location_filepath = f"../../Data/Location.txt"
Location_df = pd.DataFrame(
    Location_list,
    columns=['clat', 'clon', 'length', 'width', 'aspect_ratio'],
    index=basin_list,
)
# Location_df.to_csv(Location_filepath, sep="\t", float_format="%.2f", index=True, header=True)

In [4]:
Location_df

Unnamed: 0_level_0,clat,clon,length,width,aspect_ratio
stat_num,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
ZM_0000050,-9.555427,32.077578,1.176143,1.975659,0.595317
ZM_0000053,-9.939214,30.858557,0.897713,1.159387,0.774300
ZM_0000043,-11.195808,30.632132,4.431612,4.318066,1.026296
CD_0000003,-7.868349,28.867774,12.060986,10.134900,1.190045
CD_0000002,-6.967006,28.588987,13.538631,10.134900,1.335843
...,...,...,...,...,...
AU_0001056,-36.224709,147.689218,0.413707,0.396233,1.044101
AU_0001063,-36.516600,147.042458,0.839917,0.608175,1.381045
AU_0001087,-36.808596,146.811994,0.701920,0.666714,1.052805
AU_0001127,-37.267236,146.143028,0.818595,1.004970,0.814547


# Slope

In [None]:
# Path to the tif file
slope_path = '../../Raw_Data/Slope.tif'

# Read band 1 and the raster metadata; everything needed is in memory once
# the file is closed.
with rasterio.open(slope_path) as src:
    slope_data = src.read(1)  # Read the first band
    slope_meta = src.meta

height = slope_meta['height']
width = slope_meta['width']
transform = slope_meta['transform']

# Coordinate vectors from the affine transform, computed in one vectorized
# step instead of a Python loop per column / row. Only the offset and scale
# terms are used, so this assumes a transform without rotation.
# NOTE(review): transform[2] / transform[5] are the raster origin, so these
# look like pixel-edge rather than pixel-centre coordinates -- confirm whether
# a half-pixel shift is wanted.
lon = transform[2] + np.arange(width) * transform[0]
lat = transform[5] + np.arange(height) * transform[4]

# Print some basic information
print(f"Slope data shape: {slope_data.shape}")
print(f"Longitude array shape: {lon.shape}")
print(f"Latitude array shape: {lat.shape}")

# Values above 10000 are treated as missing. NaN cannot be stored in an
# integer array, so convert first if the band is not floating point.
if not np.issubdtype(slope_data.dtype, np.floating):
    slope_data = slope_data.astype(np.float32)
slope_data[slope_data > 10000] = np.nan

Slope data shape: (11907, 30836)
Longitude array shape: (30836,)
Latitude array shape: (11907,)


In [None]:
# Per-basin slope statistics. Columns: mean slope, then the percentage of
# valid in-basin cells in the classes <2, 2-5, 5-10, 10-15, 15-20 and >=20.
# Uses the globals `lon`, `lat` and `slope_data` set by the Slope loading cell.
Slope_list = np.full((len(basin_list), 7), np.nan)
boundary_dir = "E:/2024_01_Global_WBM_Validation/Raw_Data/Boundary/"
for i in range(len(basin_list)):
    basin = str(basin_list[i])
    print(f"Processing No.{i+1}, {basin}...")
    source = source_list[i]
    # "ZHCN" boundary files carry no source prefix in their file name.
    if source == "ZHCN":
        shp_filepath = boundary_dir + basin + ".shp"
    else:
        shp_filepath = boundary_dir + source + "_" + basin + ".shp"
    # Read the basin boundary shapefile
    gdf = gpd.read_file(shp_filepath)  # replace with your shp file path
    polygon = gdf.geometry
    minx, miny, maxx, maxy = polygon.total_bounds
    # Padded bounding box of the basin (padding is asymmetric: 0.2 / 0.5 degrees)
    basin_left_bound = minx - 0.2
    basin_right_bound = maxx + 0.5
    basin_top_bound = maxy + 0.5
    basin_bottom_bound = miny - 0.2
    # Grid indices that fall inside the padded bounding box
    lon_loc = np.where((lon >= basin_left_bound) & (lon <= basin_right_bound))[0]
    lat_loc = np.where((lat >= basin_bottom_bound) & (lat <= basin_top_bound))[0]
    if len(lon_loc) == 0 or len(lat_loc) == 0:
        # No raster cell near this basin: keep the pre-filled NaN row instead
        # of crashing on lat_loc[0] / lon_loc[0] and aborting the whole run.
        continue
    temp_lon = lon[lon_loc]
    temp_lat = lat[lat_loc]
    temp_Slope = slope_data[lat_loc[0]:lat_loc[-1]+1, lon_loc[0]:lon_loc[-1]+1]

    # Create meshgrid
    lon2d, lat2d = np.meshgrid(temp_lon, temp_lat)
    grid_x, grid_y = lon2d.ravel(), lat2d.ravel()

    # Vectorized point-in-polygon test; a point counts as inside when any
    # geometry of the shapefile contains it (not only the first one).
    mask = np.zeros(grid_x.shape, dtype=bool)
    for geom in polygon:
        if geom is None or geom.is_empty:
            continue
        mask |= contains(geom, grid_x, grid_y)
    mask = mask.reshape(lon2d.shape)

    # 1.0 inside the basin, NaN outside
    new_mask = np.where(mask, 1.0, np.nan)

    Slope_basin = temp_Slope * new_mask
    # Keep only in-basin cells with a valid slope value
    valid_slope = Slope_basin[~np.isnan(Slope_basin)]
    n_valid = len(valid_slope)

    if n_valid > 0:
        # Mean slope
        Slope_list[i, 0] = np.mean(valid_slope)
        # Percentage of cells in each slope class
        Slope_list[i, 1] = np.sum(valid_slope < 2) / n_valid * 100  # < 2
        Slope_list[i, 2] = np.sum((valid_slope >= 2) & (valid_slope < 5)) / n_valid * 100  # 2-5
        Slope_list[i, 3] = np.sum((valid_slope >= 5) & (valid_slope < 10)) / n_valid * 100  # 5-10
        Slope_list[i, 4] = np.sum((valid_slope >= 10) & (valid_slope < 15)) / n_valid * 100  # 10-15
        Slope_list[i, 5] = np.sum((valid_slope >= 15) & (valid_slope < 20)) / n_valid * 100  # 15-20
        Slope_list[i, 6] = np.sum(valid_slope >= 20) / n_valid * 100  # >= 20
Slope_filepath = "../../Data/Slope.txt"
Slope_df = pd.DataFrame(Slope_list, columns=['Ave', '<2', '2-5', '5-10', '10-15', '15-20', '>20'], index=basin_list)
Slope_df.to_csv(Slope_filepath, sep="\t", float_format="%.2f", index=True, header=True)

# Soil Texture

In [None]:
# Read the clay, sand and silt rasters. Each file re-binds `lon` / `lat`, so
# the coordinate vectors left in scope are those of the SILT file.
# For every raster the first two axes are exchanged (same result as rot90
# followed by a flip along axis 0); presumably the files store
# (longitude, latitude) and the basin processing needs [lat, lon] -- confirm.
ds = xr.open_dataset("../../Raw_Data/CLAY.nc")
lon, lat = ds["longitude"].to_numpy(), ds["latitude"].to_numpy()
Clay = np.swapaxes(ds["CLAY"].to_numpy(), 0, 1)

ds = xr.open_dataset("../../Raw_Data/SAND.nc")
lon, lat = ds["longitude"].to_numpy(), ds["latitude"].to_numpy()
Sand = np.swapaxes(ds["SAND"].to_numpy(), 0, 1)

ds = xr.open_dataset("../../Raw_Data/SILT.nc")
lon, lat = ds["longitude"].to_numpy(), ds["latitude"].to_numpy()
Silt = np.swapaxes(ds["SILT"].to_numpy(), 0, 1)

In [None]:
def process_basin(args):
    """
    Compute the basin-average clay, sand and silt values for one basin.

    Reads the module-level `lon`, `lat`, `Clay`, `Sand` and `Silt` arrays, so
    the Soil loading cell must be the most recent raster-loading cell that
    was run.

    :param args: tuple (i, basin, source): position of the basin in
                 `basin_list`, station id as a string, and the source label
                 used in the boundary file name
    :return: tuple (i, values) where values is a length-3 array
             [clay, sand, silt] of NaN-ignoring means over grid points inside
             the basin boundary; all NaN when no grid cell lies in the padded
             bounding box or when processing raises
    """
    i, basin, source = args
    boundary_dir = "E:/2024_01_Global_WBM_Validation/Raw_Data/Boundary/"
    try:
        # "ZHCN" boundary files carry no source prefix in their file name.
        if source == "ZHCN":
            shp_filepath = boundary_dir + basin + ".shp"
        else:
            shp_filepath = boundary_dir + source + "_" + basin + ".shp"

        # Read shapefile
        gdf = gpd.read_file(shp_filepath)
        polygon = gdf.geometry
        minx, miny, maxx, maxy = polygon.total_bounds

        # Padded bounding box of the basin (asymmetric: 0.2 / 0.5 degrees)
        basin_left_bound = minx - 0.2
        basin_right_bound = maxx + 0.5
        basin_top_bound = maxy + 0.5
        basin_bottom_bound = miny - 0.2

        # Grid indices that fall inside the padded bounding box
        lon_loc = np.where((lon >= basin_left_bound) & (lon <= basin_right_bound))[0]
        lat_loc = np.where((lat >= basin_bottom_bound) & (lat <= basin_top_bound))[0]

        if len(lon_loc) == 0 or len(lat_loc) == 0:
            return i, np.array([np.nan, np.nan, np.nan])

        temp_lon = lon[lon_loc]
        temp_lat = lat[lat_loc]
        temp_clay = Clay[lat_loc[0]:lat_loc[-1]+1, lon_loc[0]:lon_loc[-1]+1]
        temp_sand = Sand[lat_loc[0]:lat_loc[-1]+1, lon_loc[0]:lon_loc[-1]+1]
        temp_silt = Silt[lat_loc[0]:lat_loc[-1]+1, lon_loc[0]:lon_loc[-1]+1]

        # Coordinates of every grid point in the window
        lon2d, lat2d = np.meshgrid(temp_lon, temp_lat)
        grid_x, grid_y = lon2d.ravel(), lat2d.ravel()

        # Vectorized point-in-polygon test (as in the Slope section) instead
        # of one GeoSeries.contains call per grid point; a point counts as
        # inside when any geometry of the shapefile contains it.
        mask = np.zeros(grid_x.shape, dtype=bool)
        for geom in polygon:
            if geom is None or geom.is_empty:
                continue
            mask |= contains(geom, grid_x, grid_y)
        mask = mask.reshape(lon2d.shape)
        # 1.0 inside the basin, NaN outside, so nanmean ignores outside cells
        new_mask = np.where(mask, 1.0, np.nan)

        Clay_basin = np.nanmean(temp_clay * new_mask)
        Sand_basin = np.nanmean(temp_sand * new_mask)
        Silt_basin = np.nanmean(temp_silt * new_mask)

        return i, np.array([Clay_basin, Sand_basin, Silt_basin])
    except Exception as e:
        # Best effort: report the failing basin and continue with the others.
        print(f"Error processing basin {basin} (index {i+1}): {e}")
        return i, np.array([np.nan, np.nan, np.nan])

# Initialize the result array
Soil_list = np.full((len(basin_list), 3), np.nan)

# Prepare arguments for the process_basin function
args_list = [(i, str(basin_list[i]), source_list[i]) for i in range(len(basin_list))]

# Process basins in parallel with a progress bar
max_workers = 12  # Adjust based on your CPU
with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor:
    results = list(tqdm(executor.map(process_basin, args_list), total=len(args_list), desc="Processing basins"))

    # Update Soil_list with results
    for i, result in results:
        Soil_list[i] = result

# Save results
Soil_filepath = "../../Data/Soil.txt"
Soil_df = pd.DataFrame(Soil_list, columns=['Clay', 'Sand', 'Silt'], index=basin_list)
Soil_df.to_csv(Soil_filepath, sep="\t", float_format="%.2f", index=True, header=True)

Processing basins: 100%|██████████| 2003/2003 [05:24<00:00,  6.18it/s]


# TI

In [3]:
# Read the TI raster together with its coordinate vectors.
ds = xr.open_dataset("../../Raw_Data/TI.nc")
lon, lat = ds["longitude"].to_numpy(), ds["latitude"].to_numpy()
# Exchange the first two axes (same result as rot90 followed by a flip along
# axis 0). Presumably the file stores (longitude, latitude) and the basin
# processing needs [lat, lon] -- confirm against the file.
TI = np.swapaxes(ds["TI"].to_numpy(), 0, 1)

In [5]:
def process_basin(args):
    """
    Compute the basin-average TI value for one basin.

    Reads the module-level `lon`, `lat` and `TI` arrays, so the TI loading
    cell must be the most recent raster-loading cell that was run.

    :param args: tuple (i, basin, source): position of the basin in
                 `basin_list`, station id as a string, and the source label
                 used in the boundary file name
    :return: tuple (i, value) where value is the NaN-ignoring mean of `TI`
             over grid points inside the basin boundary, or scalar NaN when
             no grid cell lies in the padded bounding box or when processing
             raises
    """
    i, basin, source = args
    boundary_dir = "../../../2024_01_Global_WBM_Validation/Raw_Data/Boundary/"
    try:
        # "ZHCN" boundary files carry no source prefix in their file name.
        if source == "ZHCN":
            shp_filepath = boundary_dir + basin + ".shp"
        else:
            shp_filepath = boundary_dir + source + "_" + basin + ".shp"

        # Read shapefile
        gdf = gpd.read_file(shp_filepath)
        polygon = gdf.geometry
        minx, miny, maxx, maxy = polygon.total_bounds

        # Padded bounding box of the basin (asymmetric: 0.2 / 0.5 degrees)
        basin_left_bound = minx - 0.2
        basin_right_bound = maxx + 0.5
        basin_top_bound = maxy + 0.5
        basin_bottom_bound = miny - 0.2

        # Grid indices that fall inside the padded bounding box
        lon_loc = np.where((lon >= basin_left_bound) & (lon <= basin_right_bound))[0]
        lat_loc = np.where((lat >= basin_bottom_bound) & (lat <= basin_top_bound))[0]

        if len(lon_loc) == 0 or len(lat_loc) == 0:
            # Must be a scalar: the driver stores it in one slot of the 1-D
            # TI_list, and a 3-element array there raises ValueError.
            return i, np.nan

        temp_lon = lon[lon_loc]
        temp_lat = lat[lat_loc]
        temp_TI = TI[lat_loc[0]:lat_loc[-1]+1, lon_loc[0]:lon_loc[-1]+1]

        # Coordinates of every grid point in the window
        lon2d, lat2d = np.meshgrid(temp_lon, temp_lat)
        grid_x, grid_y = lon2d.ravel(), lat2d.ravel()

        # Vectorized point-in-polygon test (as in the Slope section) instead
        # of one GeoSeries.contains call per grid point; a point counts as
        # inside when any geometry of the shapefile contains it.
        mask = np.zeros(grid_x.shape, dtype=bool)
        for geom in polygon:
            if geom is None or geom.is_empty:
                continue
            mask |= contains(geom, grid_x, grid_y)
        mask = mask.reshape(lon2d.shape)
        # 1.0 inside the basin, NaN outside, so nanmean ignores outside cells
        new_mask = np.where(mask, 1.0, np.nan)

        TI_basin = np.nanmean(temp_TI * new_mask)

        return i, TI_basin
    except Exception as e:
        # Best effort: report the failing basin and continue with the others.
        print(f"Error processing basin {basin} (index {i+1}): {e}")
        # Scalar NaN for the same reason as the empty-grid case.
        return i, np.nan

# Initialize the result array
TI_list = np.full(len(basin_list), np.nan)

# Prepare arguments for the process_basin function
args_list = [(i, str(basin_list[i]), source_list[i]) for i in range(len(basin_list))]

# Process basins in parallel with a progress bar
max_workers = 12  # Adjust based on your CPU
with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor:
    results = list(tqdm(executor.map(process_basin, args_list), total=len(args_list), desc="Processing basins"))

    # Update TI_list with results
    for i, result in results:
        TI_list[i] = result

# Save results
TI_filepath = "../../Data/TI.txt"
TI_df = pd.DataFrame(TI_list, columns=['TI'], index=basin_list)
TI_df.to_csv(TI_filepath, sep="\t", float_format="%.2f", index=True, header=True)

Processing basins: 100%|██████████| 2003/2003 [05:20<00:00,  6.25it/s]


# Basin Properties

In [6]:
# Assemble one table with a row per basin from the property tables on disk.
# Columns are copied positionally (via to_numpy), so every input table must
# list the basins in the same order as `basin_list`.
# NOTE(review): most inputs are read from ../../Data/Properties/ while TI is
# read from ../../Data/, and the cells above write some of these tables to
# other folders -- confirm the files on disk are the intended ones.
Basin_Properties = pd.DataFrame({}, index=basin_list)

# Climate class
Climate = pd.read_csv("../../Data/Properties/Climate.txt", sep = '\t', header=0, index_col=0)
Basin_Properties['Climate'] = Climate['Koppen'].to_numpy()

# Soil properties
Soil = pd.read_csv("../../Data/Properties/Soil.txt", sep = '\t', header=0, index_col=0)
for soil_column in ('Clay', 'Silt', 'Sand'):
    Basin_Properties[soil_column] = Soil[soil_column].to_numpy()

# Basin location properties (output column <- column in Location.txt)
Location = pd.read_csv("../../Data/Properties/Location.txt", sep = '\t', header=0, index_col=0)
location_columns = (
    ('Longitude', 'clon'),
    ('Latitude', 'clat'),
    ('Length', 'length'),
    ('Width', 'width'),
    ('Asp_Rat', 'aspect_ratio'),
)
for target_column, source_column in location_columns:
    Basin_Properties[target_column] = Location[source_column].to_numpy()

# Mean slope
Slope = pd.read_csv("../../Data/Properties/Slope.txt", sep = '\t', header=0, index_col=0)
Basin_Properties['Slope'] = Slope['Ave'].to_numpy()

# Area, rounded to two decimals
Basin_Properties['Area'] = np.round(basin_info['area'].to_numpy(), 2)

# Baseflow index
BFI = pd.read_csv("../../Data/Properties/BFI.txt", sep = '\t', header=0, index_col=0)
Basin_Properties['BFI'] = BFI['BFI'].to_numpy()

# Topographic index
TI = pd.read_csv("../../Data/TI.txt", sep = '\t', header=0, index_col=0)
Basin_Properties['TI'] = TI['TI'].to_numpy()

# Long-term means of the per-basin time series
n_basins = len(basin_list)
PRE_list  = np.full(n_basins, np.nan)
TMP_list  = np.full(n_basins, np.nan)
PET_list  = np.full(n_basins, np.nan)
TMAX_list = np.full(n_basins, np.nan)
TMIN_list = np.full(n_basins, np.nan)
AE_list   = np.full(n_basins, np.nan)
NDVI_list = np.full(n_basins, np.nan)
for b in range(n_basins):
    basin = str(basin_list[b])
    print(f"Processing No.{b+1}, {basin}...")
    # CRU-based climate data
    HC_filepath = f"../../../2025_03_Hydrological_Models/Data/New_Hydro_Climatic/NHC_{basin}.txt"
    HC_data = pd.read_csv(HC_filepath, sep = '\t', header=0, index_col='Time', parse_dates=['Time'])
    pre_mean  = HC_data['PRE_CRU'].mean()
    tmp_mean  = HC_data['TMP_CRU'].mean()
    pet_mean  = HC_data['PET_CRU'].mean()
    tmax_mean = HC_data['TMAX_CRU'].mean()
    tmin_mean = HC_data['TMIN_CRU'].mean()
    # When TMAX has no valid mean, the mean temperature stands in for both
    # TMAX and TMIN.
    if np.isnan(tmax_mean):
        tmax_mean = tmp_mean
        tmin_mean = tmp_mean
    PRE_list[b], TMP_list[b], PET_list[b] = pre_mean, tmp_mean, pet_mean
    TMAX_list[b], TMIN_list[b] = tmax_mean, tmin_mean
    # GLEAM-based actual evapotranspiration
    AE_filepath = f"../../Data/AE/AE_{basin}.txt"
    AE_data = pd.read_csv(AE_filepath, sep = '\t', header=0, index_col='Time', parse_dates=['Time'])
    AE_list[b] = AE_data['AE'].mean()
    # GIMMS-based NDVI; presumably stored scaled by 10000 -- confirm
    NDVI_filepath = f"../../Data/NDVI/NDVI_{basin}.txt"
    NDVI_data = pd.read_csv(NDVI_filepath, sep = '\t', header=0, index_col='Time', parse_dates=['Time'])
    NDVI_list[b] = NDVI_data['NDVI'].mean()/10000

climate_means = (
    ('PRE', PRE_list),
    ('TMP', TMP_list),
    ('PET', PET_list),
    ('TMAX', TMAX_list),
    ('TMIN', TMIN_list),
    ('AE', AE_list),
    ('NDVI', NDVI_list),
)
for column_name, column_values in climate_means:
    Basin_Properties[column_name] = column_values

Basin_Properties.to_csv("../../Data/Properties/Basin_Properties.txt", sep = '\t', index = True, header=True)

Processing No.1, ZM_0000050...
Processing No.2, ZM_0000053...
Processing No.3, ZM_0000043...
Processing No.4, CD_0000003...
Processing No.5, CD_0000002...
Processing No.6, CD_0000006...
Processing No.7, CD_0000005...
Processing No.8, CF_0000010...
Processing No.9, ET_0000002...
Processing No.10, MZ_0000001...
Processing No.11, TZ_0000051...
Processing No.12, TZ_0000027...
Processing No.13, TZ_0000024...
Processing No.14, TZ_0000007...
Processing No.15, TZ_0000032...
Processing No.16, MW_0000014...
Processing No.17, MW_0000019...
Processing No.18, MW_0000020...
Processing No.19, ZM_0000004...
Processing No.20, ZM_0000042...
Processing No.21, ZM_0000003...
Processing No.22, ZM_0000029...
Processing No.23, ZM_0000025...
Processing No.24, ZM_0000016...
Processing No.25, ZM_0000040...
Processing No.26, NA_0000003...
Processing No.27, NA_0000004...
Processing No.28, ZW_0000049...
Processing No.29, ZW_0000064...
Processing No.30, TD_0000003...
Processing No.31, NG_0000004...
Processing No.32,

In [7]:
Basin_Properties

Unnamed: 0_level_0,Climate,Clay,Silt,Sand,Longitude,Latitude,Length,Width,Asp_Rat,Slope,Area,BFI,TI,PRE,TMP,PET,TMAX,TMIN,AE,NDVI
stat_num,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
ZM_0000050,1.0,33.32,23.68,30.92,32.08,-9.56,1.18,1.98,0.60,0.89,13722.66,0.70,11.06,89.178046,20.699529,109.215239,26.883955,14.660806,65.095465,0.561711
ZM_0000053,4.0,33.31,19.11,30.43,30.86,-9.94,0.90,1.16,0.77,0.82,6418.00,0.71,10.29,104.946810,20.694904,108.238962,27.107090,14.427384,71.779663,0.616881
ZM_0000043,4.0,32.28,23.44,32.36,30.63,-11.20,4.43,4.32,1.03,0.61,123072.00,0.69,11.02,96.684413,20.929631,108.055383,27.227104,14.674167,69.588950,0.588864
CD_0000003,1.0,32.49,20.78,36.20,28.87,-7.87,12.06,10.13,1.19,1.21,810440.00,0.72,10.74,93.779010,22.321537,107.987029,28.396209,16.313798,75.773421,0.551594
CD_0000002,1.0,33.80,20.23,37.05,28.59,-6.97,13.54,10.13,1.34,1.31,948500.00,0.72,10.68,97.301544,22.342193,106.165601,28.373367,16.519235,81.799050,0.590818
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
AU_0001056,5.0,33.92,29.22,36.86,147.69,-36.22,0.41,0.40,1.04,4.37,794.02,0.57,8.12,80.030553,12.675478,86.780225,18.785833,6.322712,61.816262,0.695044
AU_0001063,4.0,27.72,25.49,32.15,147.04,-36.52,0.84,0.61,1.38,5.78,1709.31,0.54,8.97,88.600232,12.563449,84.347077,19.037145,6.596161,60.339266,0.653257
AU_0001087,4.0,28.75,28.56,37.41,146.81,-36.81,0.70,0.67,1.05,5.47,2978.38,0.62,9.02,101.191414,12.084911,84.495178,17.942234,5.974187,67.914281,0.703940
AU_0001127,4.0,26.07,26.56,35.52,146.14,-37.27,0.82,1.00,0.81,4.39,3919.81,0.59,8.07,103.598238,12.217561,91.358668,17.391523,6.680997,64.081746,0.694438
