In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from xgboost import XGBClassifier

# 定义流域信息

In [29]:
# Load the table of selected basins and pull out the columns used later on:
# the station id (as text) plus the calibration / validation period bounds.
basin_info = pd.read_excel('../../Data/Basin_Selection/All_Selected_Basins.xlsx')
basin_list = basin_info['stat_num'].astype(str)
cali_start_list, cali_end_list = basin_info['cali_start'], basin_info['cali_end']
vali_start_list, vali_end_list = basin_info['vali_start'], basin_info['vali_end']

# 获取流域属性

In [3]:
# Basin attribute table (tab-separated), indexed by the 'stat_num' column.
Basin_Properties = pd.read_csv(
    "../../Data/Properties/Basin_Properties.txt",
    sep='\t',
    header=0,
    index_col='stat_num',
)

# Predictor matrix as a plain NumPy array; rows keep the order of the file.
source_properties = Basin_Properties[
    [
        'Climate', 'Clay', 'Silt', 'Sand', 'Slope', 'BFI', 'PRE',
        'TMP', 'PET', 'TMAX', 'TMIN', 'AE', 'NDVI', 'TI',
    ]
].to_numpy()

In [4]:
# Keep only the three KGE columns from the weighted-average results table,
# indexed by the 'stat_num' column.
sim_results = pd.read_csv(
    "../../Results/Weighted_Average/Weighted_Average_Results.txt",
    sep="\t",
    index_col='stat_num',
)[['r_kge_YM', 'r_kge_AM', 'r_kge_DM']]

# Class label per basin: the integer code of the column holding the row maximum.
scale_codes = {'r_kge_YM': 0, 'r_kge_AM': 1, 'r_kge_DM': 2}
best_column = sim_results.idxmax(axis=1)
sim_results['best_scale'] = best_column.map(scale_codes)

In [42]:
# The four classifiers to compare, keyed by the name used for the result columns.
# SVM and KNN are wrapped in a pipeline that standardizes the features before
# the classifier step.
models = dict(
    RandomForest=RandomForestClassifier(n_estimators=200, random_state=42, n_jobs=-1),
    SVM=Pipeline(steps=[
        ("scaler", StandardScaler()),
        ("clf", SVC(kernel="rbf", probability=True, random_state=42)),
    ]),
    XGBoost=XGBClassifier(n_estimators=300, max_depth=10, learning_rate=0.01, random_state=42),
    KNN=Pipeline(steps=[
        ("scaler", StandardScaler()),
        ("clf", KNeighborsClassifier(n_neighbors=15)),
    ]),
)

In [43]:
# Leave-one-out transfer experiment: for every basin, fit each classifier on all
# OTHER basins and predict the best-scale class (the 'best_scale' label) for the
# held-out basin from its attributes.

feature_columns = ['Climate', 'Clay', 'Silt', 'Sand', 'Slope', 'BFI', 'PRE',
                   'TMP', 'PET', 'TMAX', 'TMIN', 'AE', 'NDVI', 'TI']

# Align features and labels to basin_list BY STATION ID. Dropping "row b" from
# tables that were loaded from three different files only removes the target
# basin if all three happen to share the same row order; otherwise the target
# leaks into its own training set and features get paired with the wrong labels.
# .loc with a list of labels raises KeyError if any basin is missing.
basin_ids = [str(station) for station in basin_list]
aligned_features = Basin_Properties.loc[basin_ids, feature_columns].to_numpy(dtype=float)
aligned_labels = sim_results.loc[basin_ids, 'best_scale'].to_numpy()

# Duplicate station ids in either table would silently add rows and break the
# one-row-per-basin alignment the hold-out below relies on.
if len(aligned_features) != len(basin_ids) or len(aligned_labels) != len(basin_ids):
    raise ValueError(
        "Basin_Properties / sim_results contain duplicate 'stat_num' entries; "
        "cannot align them one-to-one with basin_list."
    )

pred_best_models = pd.DataFrame(index=pd.Index(basin_ids, name='stat_num'),
                                columns=list(models))

for b, basin in enumerate(basin_ids):
    print(f"Processing basin {basin} ({b+1}/{len(basin_ids)})")

    # Training set: every basin except the one at position b.
    X_train = np.delete(aligned_features, b, axis=0)
    y_train = np.delete(aligned_labels, b, axis=0)

    # Fit the scaler on the training basins only, so the held-out basin does not
    # influence the scaling statistics. (The SVM and KNN pipelines standardize
    # again internally; this outer step is kept so all four models see the same
    # input as before.)
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)

    # Attributes of the held-out basin, kept 2-D with shape (1, n_features).
    target_properties = aligned_features[b:b + 1]
    target_properties_scaled = scaler.transform(target_properties)

    # Refit every model on this fold and store its prediction for the held-out basin.
    for model_name, model in models.items():
        trained_model = model.fit(X_train_scaled, y_train)
        pred_best_models.loc[basin, model_name] = trained_model.predict(target_properties_scaled)[0]

# Save the predictions (one row per basin, one column per classifier).
pred_best_models.to_csv("../../Results/Best_Model_Transplant/Best_Model_Transplant.txt", sep='\t', header=True, index_label='stat_num')

Processing basin ZM_0000050 (1/2003)
Processing basin ZM_0000053 (2/2003)
Processing basin ZM_0000043 (3/2003)
Processing basin CD_0000003 (4/2003)
Processing basin CD_0000002 (5/2003)
Processing basin CD_0000006 (6/2003)
Processing basin CD_0000005 (7/2003)
Processing basin CF_0000010 (8/2003)
Processing basin ET_0000002 (9/2003)
Processing basin MZ_0000001 (10/2003)
Processing basin TZ_0000051 (11/2003)
Processing basin TZ_0000027 (12/2003)
Processing basin TZ_0000024 (13/2003)
Processing basin TZ_0000007 (14/2003)
Processing basin TZ_0000032 (15/2003)
Processing basin MW_0000014 (16/2003)
Processing basin MW_0000019 (17/2003)
Processing basin MW_0000020 (18/2003)
Processing basin ZM_0000004 (19/2003)
Processing basin ZM_0000042 (20/2003)
Processing basin ZM_0000003 (21/2003)
Processing basin ZM_0000029 (22/2003)
Processing basin ZM_0000025 (23/2003)
Processing basin ZM_0000016 (24/2003)
Processing basin ZM_0000040 (25/2003)
Processing basin NA_0000003 (26/2003)
Processing basin NA_0