In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import rasterio
import os
import sys
import time
import xarray as xr
sys.path.append(os.path.abspath(os.path.join(os.getcwd(), '..')))

from Water_Blance_Model import mYWBMnlS, abcdnlS, DWBMnlS
from Rewrite_Func import nash_sutcliffe_efficiency, relative_error, kling_gupta_efficiency
from numba import float64, njit
from numba.experimental import jitclass
from netCDF4 import Dataset
from scipy.spatial.distance import mahalanobis
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import pairwise_distances
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR
from sklearn.multioutput import MultiOutputRegressor
from xgboost import XGBRegressor
from scipy.ndimage import median_filter
from concurrent.futures import ThreadPoolExecutor, as_completed

# 定义流域信息

In [2]:
# Load the table of selected basins.
basin_info = pd.read_excel('../../Data/Basin_Selection/All_Selected_Basins.xlsx')

# Station identifiers, normalised to strings.
basin_list = basin_info['stat_num'].astype(str)

# Calibration / validation period bounds: one Series per bound, in the row order of basin_info.
cali_start_list, cali_end_list, vali_start_list, vali_end_list = (
    basin_info[bound_column]
    for bound_column in ('cali_start', 'cali_end', 'vali_start', 'vali_end')
)

# 定义数据读取函数

In [3]:
# Data loading for the lumped models
def get_data_lumped(basin, basin_idx):
    """Load one basin's time series and split it into calibration and validation arrays.

    The tab-separated file ``NHC_{basin}.txt`` is read with its ``Time`` column parsed
    as the datetime index. The calibration and validation windows run from 1 January of
    the start value to 31 December of the end value, where the values are looked up in
    the module-level ``cali_*_list`` / ``vali_*_list`` Series under the key ``basin_idx``.

    Args:
        basin: Basin identifier used to build the input file name.
        basin_idx: Key used to look up this basin's period bounds.

    Returns:
        Tuple ``(x_cali, y_cali, x_vali, y_vali)``. The ``x`` arrays hold the columns
        ``PRE_CRU``, ``TMP_CRU``, ``PET_CRU`` (in that order) and the ``y`` arrays hold
        the ``RUN`` column, each restricted to the matching window.
    """
    forcing_columns = ['PRE_CRU', 'TMP_CRU', 'PET_CRU']

    series_table = pd.read_csv(
        f"../../../2025_03_Hydrological_Models/Data/New_Hydro_Climatic/NHC_{basin}.txt",
        sep='\t',
        header=0,
        index_col='Time',
        parse_dates=['Time'],
    )

    def _window(first, last):
        # Label-based slice on the datetime index; both end points are inclusive.
        begin = pd.to_datetime(f"{str(first)}-01-01")
        finish = pd.to_datetime(f"{str(last)}-12-31")
        return series_table.loc[begin:finish]

    calibration = _window(cali_start_list[basin_idx], cali_end_list[basin_idx])
    validation = _window(vali_start_list[basin_idx], vali_end_list[basin_idx])

    return (
        calibration[forcing_columns].to_numpy(),
        calibration['RUN'].to_numpy(),
        validation[forcing_columns].to_numpy(),
        validation['RUN'].to_numpy(),
    )

# 获取流域属性

In [4]:
# Basin attribute table, indexed by the 'stat_num' column.
Basin_Properties = pd.read_csv(
    "../../Data/Properties/Basin_Properties.txt",
    sep='\t',
    header=0,
    index_col='stat_num',
)

# Predictor matrix: one row per table row, one column per attribute, in the order listed here.
source_properties = Basin_Properties.loc[
    :,
    ['Climate', 'Clay', 'Silt', 'Sand', 'Slope', 'BFI', 'PRE', 'TMP', 'PET', 'TMAX', 'TMIN', 'AE', 'NDVI', 'TI'],
].to_numpy()

# 机器学习回归模型

In [5]:
def train_random_forest(basin_properties_scaled, params):
    """Fit a multi-output random-forest regressor and return it.

    Args:
        basin_properties_scaled: Feature matrix passed to ``fit`` as X.
        params: Target matrix passed to ``fit`` as y; ``MultiOutputRegressor``
            fits one ``RandomForestRegressor`` per target column.

    Returns:
        The fitted ``MultiOutputRegressor``.
    """
    # Base estimator with fixed hyper-parameters and a fixed seed.
    forest = RandomForestRegressor(
        n_estimators=100,
        max_depth=10,
        min_samples_split=5,
        min_samples_leaf=2,
        random_state=42,
        n_jobs=-1,
    )

    # fit() returns the wrapper itself, so the fitted model is returned directly.
    return MultiOutputRegressor(forest, n_jobs=-1).fit(basin_properties_scaled, params)

def train_svm(basin_properties_scaled, params):
    """Fit a multi-output support-vector regressor and return it.

    Args:
        basin_properties_scaled: Feature matrix passed to ``fit`` as X.
        params: Target matrix passed to ``fit`` as y; ``MultiOutputRegressor``
            fits one ``SVR`` per target column.

    Returns:
        The fitted ``MultiOutputRegressor``.
    """
    # RBF-kernel base estimator with fixed hyper-parameters.
    support_vector = SVR(kernel='rbf', C=100, epsilon=0.01, gamma=0.1)

    # fit() returns the wrapper itself, so the fitted model is returned directly.
    return MultiOutputRegressor(support_vector, n_jobs=-1).fit(basin_properties_scaled, params)

def train_xgboost(basin_properties_scaled, params):
    """Fit a multi-output XGBoost regressor and return it.

    Args:
        basin_properties_scaled: Feature matrix passed to ``fit`` as X.
        params: Target matrix passed to ``fit`` as y; ``MultiOutputRegressor``
            fits one ``XGBRegressor`` per target column.

    Returns:
        The fitted ``MultiOutputRegressor``.
    """
    # Base estimator with fixed hyper-parameters and a fixed seed.
    booster = XGBRegressor(
        n_estimators=100,
        learning_rate=0.1,
        max_depth=6,
        min_child_weight=3,
        gamma=0.1,
        colsample_bytree=0.8,
        subsample=0.8,
        reg_alpha=0.1,
        random_state=42,
        n_jobs=-1,
    )

    # fit() returns the wrapper itself, so the fitted model is returned directly.
    return MultiOutputRegressor(booster, n_jobs=-1).fit(basin_properties_scaled, params)

# 获取模型率定权重

In [9]:
# Weight columns r_w_YM, r_w_AM, r_w_DM from the weighted-average results file, as a 2-D array.
sim_results = (
    pd.read_csv(
        "../../Results/Weighted_Average/Weighted_Average_Results_GRA.txt",
        sep="\t",
        index_col='stat_num',
    )
    .loc[:, ['r_w_YM', 'r_w_AM', 'r_w_DM']]
    .to_numpy()
)

# 循环每个流域（当前单元格仅以 b = 49 的单个流域为例）

In [18]:
# Column names shared by the weight tables and the attribute lookup below.
weight_columns   = ['r_w_YM', 'r_w_AM', 'r_w_DM']
property_columns = ['Climate', 'Clay', 'Silt', 'Sand', 'Slope', 'BFI', 'PRE', 'TMP', 'PET', 'TMAX', 'TMIN', 'AE', 'NDVI', 'TI']

# One table of predicted weights per regressor; a row stays NaN until its basin is processed.
pred_weight_rf  = pd.DataFrame(index=basin_list, columns=weight_columns)
pred_weight_svr = pd.DataFrame(index=basin_list, columns=weight_columns)
pred_weight_xgb = pd.DataFrame(index=basin_list, columns=weight_columns)

# Leave-one-out for a single basin: hold out row b and train on every other row.
b = 49
basin = basin_list.iloc[b]  # positional lookup, consistent with the positional slicing below
print(f"Processing basin {basin} ({b+1}/{len(basin_list)})")

# NOTE(review): the hold-out is positional while the target attributes are looked up by
# label, so this assumes basin_info, Basin_Properties and the weights file list the basins
# in the same order - confirm.
X_train = np.vstack([source_properties[:b], source_properties[b+1:]])
y_train = np.vstack([sim_results[:b], sim_results[b+1:]])

# Standardise the features using statistics of the training basins only.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)

# Standardise the targets as well; y_scaler maps predictions back to weight space below.
y_scaler = StandardScaler()
y_train_scaled = y_scaler.fit_transform(y_train)

# Attributes of the held-out basin, scaled with the training-set scaler.
target_properties = Basin_Properties.loc[basin, property_columns].values.reshape(1, -1)
target_properties_scaled = scaler.transform(target_properties)

# Train the random-forest, SVR and XGBoost models.
# The models must be fitted on the standardised targets: their predictions are passed
# through y_scaler.inverse_transform, which is only valid for values in that space.
# (Fitting on the raw y_train and then inverse-transforming distorted the weights.)
rf_model  = train_random_forest(X_train_scaled, y_train_scaled)
svr_model = train_svm(X_train_scaled, y_train_scaled)
xgb_model = train_xgboost(X_train_scaled, y_train_scaled)

# Predictions for the held-out basin, still in standardised target space.
rf_pred_scaled  = rf_model.predict(target_properties_scaled)[0]
svr_pred_scaled = svr_model.predict(target_properties_scaled)[0]
xgb_pred_scaled = xgb_model.predict(target_properties_scaled)[0]

# Back to the original weight space; each result has shape (1, 3).
rf_pred  = y_scaler.inverse_transform(rf_pred_scaled.reshape(1, -1))
svr_pred = y_scaler.inverse_transform(svr_pred_scaled.reshape(1, -1))
xgb_pred = y_scaler.inverse_transform(xgb_pred_scaled.reshape(1, -1))

# Optional post-processing (currently disabled): clip negative weights to zero and
# renormalise so that the three weights sum to 1.
# rf_pred  = np.clip(rf_pred, 0, None)
# svr_pred = np.clip(svr_pred, 0, None)
# xgb_pred = np.clip(xgb_pred, 0, None)
# rf_pred  = rf_pred / np.sum(rf_pred)
# svr_pred = svr_pred / np.sum(svr_pred)
# xgb_pred = xgb_pred / np.sum(xgb_pred)

# Store the 1-D weight vector in the held-out basin's row.
pred_weight_rf.loc[basin]  = rf_pred[0]
pred_weight_svr.loc[basin] = svr_pred[0]
pred_weight_xgb.loc[basin] = xgb_pred[0]

Processing basin 1749550 (50/2003)


In [19]:
# Show the random-forest weight table; rows of basins that were not processed remain NaN.
pred_weight_rf

Unnamed: 0_level_0,r_w_YM,r_w_AM,r_w_DM
stat_num,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
ZM_0000050,,,
ZM_0000053,,,
ZM_0000043,,,
CD_0000003,,,
CD_0000002,,,
...,...,...,...
AU_0001056,,,
AU_0001063,,,
AU_0001087,,,
AU_0001127,,,
