In [1]:
import os
import pickle  # 用于保存 'fitter' 对象
import traceback

import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler, LabelEncoder

In [2]:
# --- 1. Configuration ---

# Raw input data (long format: one row per image/target pair)
RAW_TRAIN_CSV = '/kaggle/input/csiro-biomass/train.csv'

# Output location and the three artifacts written into it
OUTPUT_DIR = "/kaggle/working/"
PROCESSED_TRAIN_CSV, SCALER_PATH, ENCODERS_PATH = (
    os.path.join(OUTPUT_DIR, artifact_name)
    for artifact_name in ('train_processed.csv', 'scaler.pkl', 'encoders.pkl')
)

# Column groups used by the processing function
TIME_FEATURE = 'Sampling_Date'
CAT_FEATURES = ['State', 'Species']
NUM_FEATURES = ['Pre_GSHH_NDVI', 'Height_Ave_cm']
TARGET_COLS = [
    'Dry_Green_g',
    'Dry_Dead_g',
    'Dry_Clover_g',
    'GDM_g',
    'Dry_Total_g',
]

In [3]:
# --- 2. Core processing function ---

def process_dataframe(csv_path, is_train=True, scaler=None, encoders=None):
    """
    Load a long-format CSV, reshape it to one row per image and build features.

    The file is read with ``pd.read_csv`` and pivoted so that every distinct
    ``target_name`` becomes its own column, indexed by ``image_path``. Metadata
    columns (TIME_FEATURE, CAT_FEATURES, NUM_FEATURES) are taken from the first
    row found for each ``image_path``.

    Parameters
    ----------
    csv_path : str
        Path of the CSV file to read.
    is_train : bool, default True
        True  -> fit a new StandardScaler and one LabelEncoder per categorical
                 column on this data, and add a ``log_<name>`` column
                 (``np.log1p``) for every name in TARGET_COLS.
        False -> apply the supplied ``scaler`` / ``encoders``; no ``log_``
                 columns are added. If the file has no ``target_name`` /
                 ``target`` columns the pivot step is skipped.
    scaler : fitted StandardScaler, optional
        Required when ``is_train`` is False; ignored and replaced otherwise.
    encoders : dict, optional
        Maps each name in CAT_FEATURES to a fitted LabelEncoder. Required when
        ``is_train`` is False; ignored and replaced otherwise.

    Returns
    -------
    ``(df, scaler, encoders)`` when ``is_train`` is True, otherwise ``df``.
    ``df`` is indexed by ``image_path``; NUM_FEATURES are overwritten with
    their scaled values and the columns ``Month``, ``month_sin``,
    ``month_cos`` and ``<col>_encoded`` (per categorical column) are added.

    Raises
    ------
    ValueError
        If ``is_train`` is False and ``scaler`` or ``encoders`` is None.
    """

    # --- A. Load and reshape ---
    df_long = pd.read_csv(csv_path)

    # Pivot long -> wide using image_path as the unique row id. Training data
    # must carry targets, so the pivot is always attempted there; for
    # inference data it is skipped when the target columns are absent instead
    # of failing with a KeyError.
    has_targets = {'target_name', 'target'}.issubset(df_long.columns)
    if is_train or has_targets:
        df_wide_targets = df_long.pivot(index='image_path', columns='target_name', values='target')
    else:
        df_wide_targets = None

    # Only metadata columns actually present in the file are selected here.
    # NOTE(review): the later steps still index TIME_FEATURE, NUM_FEATURES and
    # CAT_FEATURES unconditionally, so a file missing any of them fails there.
    meta_cols = ['image_path', TIME_FEATURE] + CAT_FEATURES + NUM_FEATURES
    available_meta_cols = [col for col in meta_cols if col in df_long.columns]
    df_meta = df_long[available_meta_cols].drop_duplicates(subset='image_path').set_index('image_path')

    # Work on an independent copy so the column assignments below never touch
    # a view of an intermediate frame.
    if df_wide_targets is None:
        df_out = df_meta.copy()
    else:
        df_out = df_meta.join(df_wide_targets).copy()

    # --- B. Targets (Y) - training data only ---
    if is_train:
        print("Processing targets (Log Transform)...")
        for col in TARGET_COLS:
            df_out[f'log_{col}'] = np.log1p(df_out[col])

    # --- C. Time feature (X_time) ---
    # The month is mapped onto a circle so that December and January end up
    # next to each other.
    print("Processing time features (Month sin/cos)...")
    df_out[TIME_FEATURE] = pd.to_datetime(df_out[TIME_FEATURE])
    df_out['Month'] = df_out[TIME_FEATURE].dt.month
    df_out['month_sin'] = np.sin(2 * np.pi * df_out['Month'] / 12)
    df_out['month_cos'] = np.cos(2 * np.pi * df_out['Month'] / 12)

    # --- D. Numeric features (X_num) ---
    print("Processing numeric features (StandardScaler)...")
    if is_train:
        # Training: create and fit a fresh scaler.
        scaler = StandardScaler()
        df_out[NUM_FEATURES] = scaler.fit_transform(df_out[NUM_FEATURES])
    else:
        # Inference: reuse the scaler fitted on the training data.
        if scaler is None:
            raise ValueError("Scaler must be provided for test data processing")
        df_out[NUM_FEATURES] = scaler.transform(df_out[NUM_FEATURES])

    # --- E. Categorical features (X_cat) ---
    print("Processing categorical features (LabelEncoder)...")
    if is_train:
        # Training: create and fit one encoder per categorical column.
        encoders = {}
        for col in CAT_FEATURES:
            le = LabelEncoder()
            df_out[f'{col}_encoded'] = le.fit_transform(df_out[col])
            encoders[col] = le
    else:
        # Inference: reuse the encoders fitted on the training data.
        if encoders is None:
            raise ValueError("Encoders must be provided for test data processing")
        for col in CAT_FEATURES:
            # .transform() raises if this data contains a label that was not
            # present when the encoder was fitted.
            df_out[f'{col}_encoded'] = encoders[col].transform(df_out[col])

    # --- F. Return ---
    if is_train:
        return df_out, scaler, encoders
    else:
        # NOTE: columns are returned in the order they were built; nothing
        # here reorders them to match the training frame, so callers that
        # depend on a fixed column order must reindex themselves.
        return df_out

In [4]:
# --- 3. Run the pipeline ---
# Processes the training CSV, then writes the processed frame, the fitted
# scaler and the fitted encoders to OUTPUT_DIR.

if __name__ == "__main__":
    try:
        # Make sure the output directory exists
        os.makedirs(OUTPUT_DIR, exist_ok=True)

        # --- Process the training data ---
        print("--- Processing Training Data ---")
        train_df_processed, fitted_scaler, fitted_encoders = process_dataframe(
            RAW_TRAIN_CSV,
            is_train=True
        )

        print(f"\nProcessed training data shape: {train_df_processed.shape}")

        # --- Save outputs ---
        print(f"Saving processed train CSV to: {PROCESSED_TRAIN_CSV}")
        # index=True keeps image_path (the frame's index) in the CSV
        train_df_processed.to_csv(PROCESSED_TRAIN_CSV, index=True)

        print(f"Saving scaler to: {SCALER_PATH}")
        with open(SCALER_PATH, 'wb') as f:
            pickle.dump(fitted_scaler, f)

        print(f"Saving encoders to: {ENCODERS_PATH}")
        with open(ENCODERS_PATH, 'wb') as f:
            pickle.dump(fitted_encoders, f)

        print("\n--- Data Processing Pipeline COMPLETED ---")

        print("\nProcessed Data Head:")
        print(train_df_processed.head())

    except FileNotFoundError:
        print(f"Error: Raw data file not found at {RAW_TRAIN_CSV}")
    except Exception as e:
        print(f"An error occurred: {e}")
        # str(e) alone is often not enough to diagnose a failure (a KeyError,
        # for instance, prints only the missing key), so also emit the
        # exception type and full traceback. The cell still does not re-raise.
        traceback.print_exc()

--- Processing Training Data ---
Processing targets (Log Transform)...
Processing time features (Month sin/cos)...
Processing numeric features (StandardScaler)...
Processing categorical features (LabelEncoder)...

Processed training data shape: (357, 20)
Saving processed train CSV to: /kaggle/working/train_processed.csv
Saving scaler to: /kaggle/working/scaler.pkl
Saving encoders to: /kaggle/working/encoders.pkl

--- Data Processing Pipeline COMPLETED ---

Processed Data Head:
                       Sampling_Date State            Species  Pre_GSHH_NDVI  \
image_path                                                                     
train/ID1011485656.jpg    2015-09-04   Tas    Ryegrass_Clover      -0.246319   
train/ID1012260530.jpg    2015-04-01   NSW            Lucerne      -0.707060   
train/ID1025234388.jpg    2015-09-01    WA  SubcloverDalkeith      -1.826004   
train/ID1028611175.jpg    2015-05-18   Tas           Ryegrass       0.016962   
train/ID1035947949.jpg    2015-09-11  