In [1]:
import pandas as pd
import os

# Split the turbine feature table into one CSV per cluster.
#
# Inputs (relative to the working directory):
#   - geo_cluster_turbines_3_clusters.csv: must contain 'GSRN' and 'Cluster'.
#   - turbine_prediction_selected_features_400turbines.csv: must contain 'GSRN'.
# Output: geo_cluster_3_cluster_split/cluster_<id>.csv, one file per distinct
# 'Cluster' value found after the join.

# === Step 1: Load the corresponding relationship between GSRN and cluster ===
# GSRN is read as text straight from the file. Letting pandas infer a numeric
# dtype and casting to str afterwards can alter the identifier (leading zeros
# dropped, float formatting when the column is parsed as float), which would
# make the join key differ between the two tables.
cluster_df = pd.read_csv(
    './geo_cluster_turbines_3_clusters.csv',
    dtype={'GSRN': str},
)

if 'GSRN' not in cluster_df.columns or 'Cluster' not in cluster_df.columns:
    # Message is in Chinese: "The CSV file must contain the 'GSRN' and
    # 'Cluster' columns."
    raise ValueError("CSV 文件必须包含 'GSRN' 和 'Cluster' 列。")

# === Step 2: Load feature data ===
features_df = pd.read_csv(
    './turbine_prediction_selected_features_400turbines.csv',
    dtype={'GSRN': str},
)

# Fail with a clear message instead of a bare KeyError further down.
if 'GSRN' not in features_df.columns:
    raise ValueError("Feature CSV must contain a 'GSRN' column.")

# === Step 3: Merge features and cluster information ===
# Inner join: only feature rows whose GSRN has a cluster assignment are kept.
merged_df = features_df.merge(cluster_df, on='GSRN', how='inner')

# Surface rows the inner join discarded so data loss is not silent.
unmatched_mask = ~features_df['GSRN'].isin(cluster_df['GSRN'])
if unmatched_mask.any():
    print(
        f"Warning: {int(unmatched_mask.sum())} feature rows "
        f"({features_df.loc[unmatched_mask, 'GSRN'].nunique()} distinct GSRN) "
        "have no cluster assignment and were dropped."
    )

# === Step 4: Output path preparation ===
output_dir = "geo_cluster_3_cluster_split"
os.makedirs(output_dir, exist_ok=True)

# === Step 5: Split by Cluster and save ===
for cluster_id, group_df in merged_df.groupby("Cluster"):
    output_path = os.path.join(output_dir, f"cluster_{cluster_id}.csv")
    group_df.to_csv(output_path, index=False)
    print(f"Saved: {output_path}")

Saved: geo_cluster_3_cluster_split/cluster_0.csv
Saved: geo_cluster_3_cluster_split/cluster_1.csv
Saved: geo_cluster_3_cluster_split/cluster_2.csv
