# Public Score 185091

MAE: 109640.8983, RMSE: 538318.4967, R²: 0.8215

# 修改 baseline 後還沒跑出結果
- 使用 TimeSeriesSplit 進行交叉驗證：這是專為時間序列資料設計的切分方式，每一折只用較早的資料訓練、用較晚的資料驗證（未來不能影響過去），以防止資料洩漏（data leakage）
- 新增價格/面積等「密度型衍生特徵」
- 處理稀有類別 + 平滑 Target Encoding
- 用 LightGBM 作為 HybridModel 的機器學習部分，並用 Optuna 自動搜尋最佳超參數
- Clip 房價 + Winsorize 特徵

In [50]:
"""
倫敦房價預測 - 混合模型（趨勢分析 + 機器學習）
使用時間序列特徵結合機器學習進行房價預測
"""

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.impute import SimpleImputer, KNNImputer
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, Normalizer, LabelEncoder, OneHotEncoder, OrdinalEncoder
from sklearn.base import BaseEstimator, TransformerMixin, RegressorMixin
from sklearn.pipeline import Pipeline
from statsmodels.tsa.deterministic import DeterministicProcess, CalendarFourier
from sklearn.linear_model import Ridge
from lightgbm import LGBMRegressor
from sklearn.svm import SVR
from sklearn.ensemble import RandomForestRegressor, VotingRegressor
from sklearn.neural_network import MLPRegressor
from xgboost import XGBRegressor
from sklearn.model_selection import GridSearchCV,TimeSeriesSplit
from sklearn.metrics import mean_absolute_error, r2_score, mean_squared_error
from catboost import CatBoostRegressor
import optuna
from lightgbm import LGBMRegressor
from sklearn.model_selection import TimeSeriesSplit, cross_val_score

def objective(trial, X, y, trend_cols, machine_cols, all_columns):
    """Optuna objective: cross-validated MAE of the hybrid pipeline.

    Samples LightGBM hyperparameters from ``trial``, plugs them into a
    ``CustomEncoder`` -> ``HybridModel`` pipeline (Ridge trend model with
    ``alpha=0.1``) and scores the pipeline with a 5-split
    ``TimeSeriesSplit``.

    Returns:
        The mean absolute error averaged over the folds (lower is better).
    """
    # The suggestion order is kept as n_estimators, max_depth,
    # learning_rate, subsample, colsample_bytree.
    lgbm_params = dict(
        device="gpu",
        n_estimators=trial.suggest_int("n_estimators", 1000, 6000, step=500),
        max_depth=trial.suggest_int("max_depth", 4, 12),
        learning_rate=trial.suggest_float("learning_rate", 0.01, 0.2, log=True),
        subsample=trial.suggest_float("subsample", 0.6, 1.0),
        colsample_bytree=trial.suggest_float("colsample_bytree", 0.6, 1.0),
        random_state=42,
        verbose=-1,
    )

    # Chain the encoder and the hybrid regressor into one estimator so the
    # encoder is re-fitted inside every CV fold.
    hybrid = HybridModel(
        trend_model=Ridge(alpha=0.1),
        machine_model=LGBMRegressor(**lgbm_params),
        trend_cols=trend_cols,
        machine_cols=machine_cols,
        all_columns=all_columns,
    )
    pipeline = Pipeline([('Encoder', CustomEncoder()), ('Model', hybrid)])

    fold_scores = cross_val_score(
        pipeline,
        X,
        y,
        cv=TimeSeriesSplit(n_splits=5),
        scoring='neg_mean_absolute_error',
        error_score='raise',  # surface failures instead of scoring NaN
    )
    # The scorer returns negated MAE; flip the sign back for minimisation.
    return -fold_scores.mean()

def create_time_features(data_list):
    """Add a monthly ``time`` period and a numeric ``time_numeric`` column.

    Every frame in ``data_list`` is modified in place. ``time`` is the
    monthly period built from ``sale_year`` / ``sale_month``;
    ``time_numeric`` is the number of days between the start of that month
    and a common origin.

    The origin is the earliest month of the FIRST frame (the training data
    by this file's convention), so the same calendar month maps to the same
    number in every frame. Measuring each frame from its own minimum would
    give train and test different scales.

    Args:
        data_list: list of DataFrames with ``sale_year`` and ``sale_month``
            columns; the first entry provides the time origin.

    Returns:
        The same ``data_list`` (frames mutated in place).
    """
    print("創建時間特徵...")
    for data in data_list:
        # Any day of the month works here; only the month survives the
        # conversion to a monthly period.
        month_mid = pd.to_datetime(dict(
            year=data['sale_year'],
            month=data['sale_month'],
            day=15
        ))
        data['time'] = month_mid.dt.to_period('M')

    if not data_list:
        return data_list

    # Shared origin so that time_numeric is comparable across frames.
    origin = data_list[0]['time'].min().to_timestamp()
    for data in data_list:
        data['time_numeric'] = (
            (data['time'].dt.to_timestamp() - origin) /
            np.timedelta64(1, 'D')
        )

    return data_list


def preprocess_address_features(data_list):
    """Derive ``street``, shorten ``postcode`` and drop ``country`` in place.

    For every frame in ``data_list``:
      * ``street`` becomes the last two space-separated words of the
        third-from-last comma-separated part of ``fullAddress``;
      * ``postcode`` is replaced by its second space-separated token;
      * the ``country`` column is removed.

    Returns:
        The same ``data_list`` (frames mutated in place).
    """
    print("處理地址特徵...")

    def _street_of(address):
        third_from_last = address.split(',')[-3]
        return ' '.join(third_from_last.split(' ')[-2:])

    def _second_token(postcode):
        return postcode.split(' ')[1]

    for frame in data_list:
        frame['street'] = frame['fullAddress'].map(_street_of)
        frame['postcode'] = frame['postcode'].map(_second_token)
        # Dropped because every row shares the same country (per the
        # original author's note).
        frame.drop(columns='country', inplace=True)

    return data_list

def engineer_address_features(data_list):
    """Extract text-based features from ``fullAddress`` (in place).

    Adds, for every frame in ``data_list``:
      * ``street_type``: first matching street-type word, else ``'unknown'``;
      * ``is_flat_or_apt``: 1 if the address mentions a flat-like keyword;
      * ``address_number``: first run of digits as float, 0 when absent;
      * ``address_length``: character length of the raw address;
      * ``has_<keyword>``: 1/0 flag per keyword.

    Returns:
        The same ``data_list`` (frames mutated in place).
    """
    print("進行高級地址特徵工程...")

    street_type_pattern = r'\b(road|street|avenue|lane|square|drive|court|place|gardens|mews)\b'
    dwelling_pattern = r'flat|apartment|unit|floor|level'
    keywords = ('mansion', 'penthouse', 'cottage', 'studio', 'garden',
                'park', 'view', 'river', 'new build')

    for frame in data_list:
        # All matching is done on a lower-cased copy of the address.
        lowered = frame['fullAddress'].str.lower()

        street_type = lowered.str.extract(street_type_pattern, expand=False)
        frame['street_type'] = street_type.fillna('unknown')

        frame['is_flat_or_apt'] = lowered.str.contains(dwelling_pattern).astype(int)

        # First digit sequence: possibly a house or flat number.
        first_number = lowered.str.extract(r'(\d+)')
        frame['address_number'] = first_number.astype(float).fillna(0)

        frame['address_length'] = frame['fullAddress'].str.len()

        for word in keywords:
            frame[f'has_{word}'] = lowered.str.contains(word).astype(int)

    return data_list

def haversine_distance(lat1, lon1, lat2, lon2):
    """Return the great-circle (haversine) distance in kilometres.

    Inputs are in degrees and may be scalars, NumPy arrays or pandas
    Series; the usual broadcasting rules apply.

    Args:
        lat1, lon1: latitude / longitude of the first point(s), degrees.
        lat2, lon2: latitude / longitude of the second point(s), degrees.

    Returns:
        Distance(s) in km, using an Earth radius of 6371 km.
    """
    R = 6371  # Earth radius (km)
    lat1, lon1, lat2, lon2 = map(np.radians, [lat1, lon1, lat2, lon2])
    dlat = lat2 - lat1
    dlon = lon2 - lon1
    a = np.sin(dlat / 2.0) ** 2 + np.cos(lat1) * np.cos(lat2) * np.sin(dlon / 2.0) ** 2
    # Round-off can push `a` marginally outside [0, 1] (e.g. near-antipodal
    # points), which would turn sqrt/arcsin into NaN; clamp it first.
    a = np.minimum(np.maximum(a, 0.0), 1.0)
    return R * 2 * np.arcsin(np.sqrt(a))

# ==================== 全新的、安全的目標編碼核心函式 ====================

def create_safe_target_encoding(train_df, test_df, column_to_encode, target_series):
    """Add out-of-fold target-encoding columns for one categorical column.

    Two columns are written to both frames (in place):
    ``<column>_mean_price`` and ``<column>_price_volatility`` (the
    per-category mean and standard deviation of the target).

    For the training frame the statistics are computed out-of-fold with a
    5-split ``TimeSeriesSplit``, so a row is only encoded with statistics
    from rows that come before it. Rows that never land in a validation
    fold, and categories unseen in the fitting part, fall back to the
    overall target mean / std. The test frame is encoded with statistics
    from the whole training set, with the same fallback.

    All alignment between ``train_df`` and ``target_series`` is positional
    (row i of one belongs to row i of the other). Index-based joins and
    ``.loc`` assignments are avoided on purpose: with a non-unique index
    (such as a monthly ``time`` index) they duplicate or mis-assign rows.

    Args:
        train_df: training features, ordered in time.
        test_df: test features.
        column_to_encode: name of the categorical column.
        target_series: target values, positionally aligned with ``train_df``.

    Returns:
        ``(train_df, test_df)`` with the two new columns added.
    """
    print(f"    -> 安全目標編碼: {column_to_encode}")

    mean_col = f'{column_to_encode}_mean_price'
    vol_col = f'{column_to_encode}_price_volatility'

    # Positional views: fresh 0..n-1 indexes so groupby/map never depend on
    # the caller's (possibly duplicated) index.
    keys = train_df[column_to_encode].reset_index(drop=True)
    target = pd.Series(np.asarray(target_series, dtype=float))
    global_mean = target.mean()
    global_std = target.std()

    oof_mean = np.full(len(train_df), np.nan)
    oof_vol = np.full(len(train_df), np.nan)

    tscv = TimeSeriesSplit(n_splits=5)
    for fit_idx, val_idx in tscv.split(train_df):
        # Statistics come from the earlier part only ...
        fold_stats = target.iloc[fit_idx].groupby(keys.iloc[fit_idx]).agg(['mean', 'std'])
        mean_map = fold_stats['mean'].to_dict()
        # A category seen once has an undefined std; treat it as 0.
        std_map = fold_stats['std'].fillna(0).to_dict()

        # ... and are applied to the later (validation) part.
        val_keys = keys.iloc[val_idx]
        oof_mean[val_idx] = val_keys.map(mean_map).to_numpy(dtype=float)
        oof_vol[val_idx] = val_keys.map(std_map).to_numpy(dtype=float)

    # Explicit assignment instead of fillna(inplace=True) on a column
    # selection, which is chained assignment and not guaranteed to write
    # back to the frame.
    train_df[mean_col] = np.where(np.isnan(oof_mean), global_mean, oof_mean)
    train_df[vol_col] = np.where(np.isnan(oof_vol), global_std, oof_vol)

    # Whole-training-set statistics are used for the test frame.
    global_stats = target.groupby(keys).agg(['mean', 'std'])
    global_mean_map = global_stats['mean'].to_dict()
    global_std_map = global_stats['std'].fillna(0).to_dict()

    test_df[mean_col] = test_df[column_to_encode].map(global_mean_map).fillna(global_mean)
    test_df[vol_col] = test_df[column_to_encode].map(global_std_map).fillna(global_std)

    return train_df, test_df

# ==================== 請用這個簡化版，替換舊的地理特徵函式 ====================

def engineer_geo_features(train_df, test_df):
    """Build purely geometric features from ``latitude`` / ``longitude``.

    Works on copies of both frames and adds:
      * ``lat_lon_ratio`` and ``lat_lon_product``;
      * ``dist_to_<landmark>``: haversine distance (km) to four fixed
        London landmarks;
      * ``geo_cluster_medium`` (k=20) and ``geo_cluster_fine`` (k=50):
        KMeans cluster ids, fitted on the training coordinates and applied
        to the test coordinates.

    No price information is used.

    Returns:
        ``(train_copy, test_copy)`` with the new columns.
    """
    from sklearn.cluster import KMeans
    print("-> 開始執行地理特徵工程（僅純幾何特徵）...")

    landmarks = {
        'city_center': (51.5074, -0.1278), 'canary_wharf': (51.5055, -0.0195),
        'westminster': (51.4994, -0.1244), 'heathrow': (51.4700, -0.4543)
    }

    train_out, test_out = train_df.copy(), test_df.copy()

    for frame in (train_out, test_out):
        lat, lon = frame['latitude'], frame['longitude']

        # Basic arithmetic combinations; the epsilon avoids division by zero.
        frame['lat_lon_ratio'] = lat / (lon + 1e-9)
        frame['lat_lon_product'] = lat * lon

        # Distance to each landmark.
        for name, (lm_lat, lm_lon) in landmarks.items():
            frame[f'dist_to_{name}'] = haversine_distance(lat, lon, lm_lat, lm_lon)

    # Clustering: fit on train only, then assign test points to the
    # learned centroids.
    train_xy = train_out[['latitude', 'longitude']].values
    test_xy = test_out[['latitude', 'longitude']].values

    for column, n_clusters in (('geo_cluster_medium', 20), ('geo_cluster_fine', 50)):
        clusterer = KMeans(n_clusters=n_clusters, random_state=42, n_init=10)
        train_out[column] = clusterer.fit_predict(train_xy)
        test_out[column] = clusterer.predict(test_xy)

    print("-> 純地理特徵工程完成。")
    return train_out, test_out

def impute_missing_values_with_strategy(data_list, column_name, strategy='most_frequent'):
    """Fill missing values of one column with a ``SimpleImputer``.

    The imputer is fitted on the first frame of ``data_list`` (the training
    data) and then applied to every frame, overwriting ``column_name`` in
    place.

    Returns:
        The same ``data_list`` (frames mutated in place).
    """
    print(f"填補 {column_name} 的缺失值（策略：{strategy}）...")

    column = [column_name]
    # Learn the fill value from the training frame only.
    fitted = SimpleImputer(strategy=strategy).fit(data_list[0][column])

    for frame in data_list:
        frame[column_name] = fitted.transform(frame[column]).ravel()

    return data_list


def impute_with_regression(data_list, target_column, feature_column):
    """Fill missing ``target_column`` values with a Ridge prediction.

    A ``Ridge`` model is fitted on the training rows (``data_list[0]``)
    where both columns are present, using ``feature_column`` as the single
    predictor. Its predictions replace the missing ``target_column`` values
    in the training frame and in the test frame (``data_list[1]``), in
    place.

    Returns:
        The same ``data_list``.
    """
    print(f"使用 {feature_column} 預測填補 {target_column} 的缺失值...")

    train_data, test_data = data_list[0], data_list[1]
    predictors = [feature_column]

    # Fit only on rows where both the predictor and the target are known.
    observed = train_data.dropna(subset=[target_column, feature_column])
    filler = Ridge()
    filler.fit(observed[predictors], observed[target_column])

    for frame in (train_data, test_data):
        gaps = frame[target_column].isna()
        if not gaps.any():
            continue
        frame.loc[gaps, target_column] = filler.predict(frame.loc[gaps, predictors])

    return data_list

def handle_missing_values(data_list):
    """Run every missing-value step in a fixed order.

    1. ``floorAreaSqM``: most-frequent value.
    2. ``bathrooms`` then ``bedrooms``: regression on ``floorAreaSqM``.
    3. ``livingRooms``, ``tenure``, ``propertyType``,
       ``currentEnergyRating``: most-frequent value.

    Returns:
        The ``data_list`` returned by the last imputation step.
    """
    print("開始處理缺失值...")

    # The floor area must be complete first: it is the predictor below.
    data_list = impute_missing_values_with_strategy(data_list, 'floorAreaSqM')

    for room_count in ('bathrooms', 'bedrooms'):
        data_list = impute_with_regression(data_list, room_count, 'floorAreaSqM')

    for column in ('livingRooms', 'tenure', 'propertyType', 'currentEnergyRating'):
        data_list = impute_missing_values_with_strategy(data_list, column)

    return data_list


def create_time_series_features(train_data, test_data):
    """Join deterministic trend / seasonal terms onto both frames.

    A ``DeterministicProcess`` is built on the unique index values of
    ``train_data`` (constant, seasonal dummies, polynomial trend of order
    12, plus quarterly ``CalendarFourier`` terms of order 4). Its in-sample
    terms are left-joined onto the training frame and its out-of-sample
    terms onto the test frame.

    NOTE(review): assumes both frames are indexed by monthly periods so
    that index differences expose ``.n`` (a number of months) -- confirm
    against the caller.

    Returns:
        ``(train_data, test_data, term_names)`` where ``term_names`` is the
        list of generated column names.
    """
    print("創建時間序列特徵...")

    quarterly_fourier = CalendarFourier(freq="QE", order=4)
    process = DeterministicProcess(
        index=train_data.index.unique(),
        constant=True,     # intercept column
        seasonal=True,     # seasonal dummies
        order=12,          # polynomial trend order
        drop=True,         # drop collinear columns
        additional_terms=[quarterly_fourier],
    )

    in_sample_terms = process.in_sample()
    train_data = train_data.join(in_sample_terms, how='left')

    # How far the test window lies beyond the last training period, and
    # how long it is.
    forecast_origin = train_data.index.max()
    first_test_period = test_data.index.min()
    forecast_lead = first_test_period - forecast_origin
    forecast_horizon = test_data.index.max() - first_test_period

    print(f"預測起點: {forecast_origin}")
    print(f"領先時間: {forecast_lead.n} 個月")
    print(f"預測範圍: {forecast_horizon.n} 個月")

    # Generate enough future steps to reach the last test period.
    n_steps = forecast_horizon.n + forecast_lead.n
    out_of_sample_terms = process.out_of_sample(steps=n_steps)
    test_data = test_data.join(out_of_sample_terms, how='left')
    test_data.index.name = 'time'

    return train_data, test_data, in_sample_terms.columns.tolist()


def create_additional_features(data_list):
    """Add room-count ratios and outcode cross features (in place).

    New columns per frame:
      * ``rooms`` = ``bedrooms`` + ``livingRooms``;
      * ``rooms_per_bedroom`` = ``rooms`` / max(``bedrooms``, 1);
      * ``bath_per_room`` = ``bathrooms`` / max(``rooms``, 1);
      * ``outcode_proptype`` = ``"<outcode>_<propertyType>"``;
      * ``outcode_tenure`` = ``"<outcode>_<tenure>"``.

    Returns:
        The same ``data_list`` (frames mutated in place).
    """
    print("創建額外特徵...")

    for frame in data_list:
        total_rooms = frame['bedrooms'] + frame['livingRooms']
        frame['rooms'] = total_rooms

        # Denominators are floored at 1 to avoid division by zero.
        frame['rooms_per_bedroom'] = total_rooms / frame['bedrooms'].clip(lower=1)
        frame['bath_per_room'] = frame['bathrooms'] / total_rooms.clip(lower=1)

        # Cross features, e.g. "SW1_Flat" for a flat in the SW1 outcode.
        outcode = frame['outcode'].astype(str)
        frame['outcode_proptype'] = outcode + "_" + frame['propertyType'].astype(str)
        frame['outcode_tenure'] = outcode + "_" + frame['tenure'].astype(str)

    return data_list

class CustomEncoder(BaseEstimator, TransformerMixin):
    """Encode categorical and coordinate columns for the hybrid model.

    ``fit`` learns, from the training fold only:
      * a plain (unsmoothed) per-category mean of the target for every
        column in ``TARGET_ENCODING_FEATURES``, plus a fallback value (the
        mean of those category means) for unseen categories;
      * 10 equal-width bin edges for ``latitude`` and ``longitude``;
      * an ordinal encoding of ``currentEnergyRating`` (G lowest, A highest,
        any other observed rating appended after A).

    ``transform`` returns a copy of ``X`` with those encodings applied and
    ``latitudeBins`` / ``longitudeBins`` added.
    """

    # Columns replaced by their per-category target mean. Defined once so
    # that fit() and transform() cannot drift apart.
    TARGET_ENCODING_FEATURES = (
        'street', 'postcode', 'outcode', 'tenure', 'propertyType',
        'street_type', 'outcode_proptype', 'outcode_tenure',
        # Geographic cluster ids.
        'geo_cluster_medium', 'geo_cluster_fine',
    )

    def __init__(self):
        self.target_mean_encoders = {}
        self.fallback_values = {}
        self.bin_encoders = {}
        self.ordinal_encoders = {}

    @staticmethod
    def _open_ended_bin_edges(values):
        """Return 10 equal-width bin edges with the outer edges at +/- inf.

        Opening the outer edges keeps the bin of every training value
        unchanged, while values outside the training range land in the
        first / last bin instead of becoming NaN.
        """
        edges = np.array(pd.cut(values, bins=10, retbins=True)[1], dtype=float)
        edges[0], edges[-1] = -np.inf, np.inf
        return edges

    def fit(self, X, y=None):
        """Learn the encodings from ``X`` and the target ``y``.

        Raises:
            ValueError: if ``y`` is None (target encoding needs the target).
        """
        if y is None:
            raise ValueError("CustomEncoder.fit requires y for target encoding")

        X_copy = X.copy()
        # Positional assignment: X and y are row-aligned, but their index
        # may be non-unique, which index-based alignment cannot handle.
        X_copy['price'] = np.asarray(y)

        # Plain target encoding (no rare-category handling, no smoothing).
        for feature in self.TARGET_ENCODING_FEATURES:
            self.target_mean_encoders[feature] = X_copy.groupby(feature)['price'].mean()
            self.fallback_values[feature] = self.target_mean_encoders[feature].mean()

        # Equal-width coordinate bins.
        self.bin_encoders['latitudeBins'] = self._open_ended_bin_edges(X_copy['latitude'])
        self.bin_encoders['longitudeBins'] = self._open_ended_bin_edges(X_copy['longitude'])

        energy_rating_order = [['G', 'F', 'E', 'D', 'C', 'B', 'A']]
        # Ratings present in the data but missing from the known scale are
        # appended so the encoder can be fitted on them.
        present_ratings = X_copy['currentEnergyRating'].unique()
        for r in present_ratings:
            if r not in energy_rating_order[0]:
                energy_rating_order[0].append(r)

        self.ordinal_encoders['currentEnergyRating'] = OrdinalEncoder(
            categories=energy_rating_order,
            handle_unknown='use_encoded_value',
            unknown_value=-1
        ).fit(X_copy[['currentEnergyRating']])

        return self

    def transform(self, X):
        """Return an encoded copy of ``X`` using the state learned in ``fit``."""
        X_transformed = X.copy()

        # Target encoding; unseen categories get the fallback value.
        for feature in self.TARGET_ENCODING_FEATURES:
            X_transformed[feature] = X_transformed[feature].map(self.target_mean_encoders[feature])
            X_transformed[feature] = X_transformed[feature].fillna(self.fallback_values[feature])

        # Coordinate bins as integer codes (labels=False).
        X_transformed['latitudeBins'] = pd.cut(X_transformed['latitude'], bins=self.bin_encoders['latitudeBins'], include_lowest=True, right=True, labels=False)
        X_transformed['longitudeBins'] = pd.cut(X_transformed['longitude'], bins=self.bin_encoders['longitudeBins'], include_lowest=True, right=True, labels=False)

        # Ordinal energy rating; ratings unknown at fit time become -1.
        X_transformed['currentEnergyRating'] = self.ordinal_encoders['currentEnergyRating'].transform(
            X_transformed[['currentEnergyRating']]
        )

        return X_transformed

class HybridModel(BaseEstimator, RegressorMixin):
    """Two-stage regressor working on ``log1p`` of the target.

    Stage 1 (``trend_model``) is fitted on the ``trend_cols`` columns;
    stage 2 (``machine_model``) is fitted on the ``machine_cols`` columns to
    predict what stage 1 left unexplained. Predictions are the sum of both
    stages, mapped back with ``expm1``.
    """

    def __init__(self, trend_model, machine_model, trend_cols, machine_cols, all_columns):
        self.trend_model = trend_model
        self.machine_model = machine_model
        self.trend_cols = trend_cols
        self.machine_cols = machine_cols
        self.all_columns = all_columns

    def _feature_views(self, X):
        """Return ``(trend_features, machine_features)`` for ``X``.

        Non-DataFrame input is wrapped in a DataFrame labelled with
        ``all_columns`` first.
        """
        frame = X if isinstance(X, pd.DataFrame) else pd.DataFrame(X, columns=self.all_columns)
        return frame[self.trend_cols], frame[self.machine_cols]

    def fit(self, X, y):
        """Fit the trend model, then the machine model on its residuals."""
        # Log-transform the target to stabilise its variance.
        log_target = np.log1p(y)
        trend_X, machine_X = self._feature_views(X)

        self.trend_model.fit(trend_X, log_target)
        residual = log_target - self.trend_model.predict(trend_X)
        self.machine_model.fit(machine_X, residual)

        return self

    def predict(self, X):
        """Predict on the original target scale."""
        trend_X, machine_X = self._feature_views(X)
        log_prediction = (
            self.trend_model.predict(trend_X)
            + self.machine_model.predict(machine_X)
        )
        # Undo the log1p applied in fit().
        return np.expm1(log_prediction)


def prepare_features(train_data, test_data, time_series_features):
    """Split features from the target and standardise the trend columns.

    ``price`` is separated from ``train_data``. The ``time_series_features``
    columns are standardised with a ``StandardScaler`` fitted on the
    training rows; the same scaling is written into ``test_data`` IN PLACE,
    so the caller's test frame is modified.

    Returns:
        ``(X_train, y_train, trend_features, machine_learning_features)``
        where ``trend_features`` is ``time_series_features`` itself and
        ``machine_learning_features`` is the fixed list of columns used by
        the residual model.
    """
    print("準備特徵集合...")

    # Columns for the residual (machine-learning) model, by origin.
    original_columns = [
        'street', 'postcode', 'outcode', 'latitudeBins', 'longitudeBins',
        'bathrooms', 'bedrooms', 'rooms', 'floorAreaSqM', 'livingRooms',
        'tenure', 'propertyType', 'currentEnergyRating',
        'rooms_per_bedroom', 'bath_per_room',
    ]
    cross_columns = ['outcode_proptype', 'outcode_tenure']
    address_columns = [
        'street_type', 'is_flat_or_apt', 'address_number', 'address_length',
        'has_mansion', 'has_penthouse', 'has_cottage', 'has_studio',
        'has_garden', 'has_park', 'has_view', 'has_river', 'has_new build',
    ]
    geo_columns = [
        'lat_lon_ratio', 'lat_lon_product',
        'dist_to_city_center', 'dist_to_canary_wharf', 'dist_to_westminster', 'dist_to_heathrow',
        'geo_cluster_medium', 'geo_cluster_fine',
    ]
    machine_learning_features = (
        original_columns + cross_columns + address_columns + geo_columns
    )

    y_train = train_data['price']
    X_train = train_data.drop(columns='price')

    # Scale statistics come from the training rows only.
    trend_scaler = StandardScaler().fit(X_train[time_series_features])
    X_train[time_series_features] = trend_scaler.transform(X_train[time_series_features])
    test_data[time_series_features] = trend_scaler.transform(test_data[time_series_features])

    return X_train, y_train, time_series_features, machine_learning_features


def create_and_tune_model(X_train, y_train, trend_features, machine_learning_features):
    """Grid-search the Ridge + XGBoost hybrid pipeline.

    Each candidate pipeline is tuned with ``GridSearchCV`` on a 5-split
    ``TimeSeriesSplit``, scored by negative MAE.

    Returns:
        Dict mapping model name to the best (refitted) estimator.
    """
    print("創建混合模型並進行超參數調優...")

    hybrid_pipeline = Pipeline([
        ('Encoder', CustomEncoder()),
        ('Model', HybridModel(
            trend_model=Ridge(),
            machine_model=XGBRegressor(),
            trend_cols=trend_features,
            machine_cols=machine_learning_features,
            all_columns=X_train.columns
        ))
    ])
    candidates = {'HybridModel': hybrid_pipeline}

    # Search space per candidate, keyed like `candidates`.
    search_spaces = {
        'HybridModel': {
            'Model__trend_model__alpha': [0.01, 0.1],
            'Model__machine_model__n_estimators': [300, 600],
            'Model__machine_model__max_depth': [4, 6, 8],
            'Model__machine_model__learning_rate': [0.01, 0.005, 0.1],
        }
    }

    best_models = {}
    for model_name, pipeline in candidates.items():
        print(f"調優 {model_name}...")
        search = GridSearchCV(
            estimator=pipeline,
            param_grid=search_spaces[model_name],
            cv=TimeSeriesSplit(n_splits=5),
            scoring='neg_mean_absolute_error',
            n_jobs=-1,
            verbose=2,
            error_score='raise',
        )
        search.fit(X_train, y_train)

        print(f"{model_name} 最佳參數: {search.best_params_}")
        print(f"{model_name} 最佳 MAE: {-search.best_score_:.4f}")

        best_models[model_name] = search.best_estimator_

    return best_models

def create_and_tune_model_with_gridsearch(X_train, y_train, trend_features, machine_learning_features):
    """Tune the hybrid pipeline with ``GridSearchCV`` and return the winner.

    The pipeline is ``CustomEncoder`` -> ``HybridModel`` with a Ridge trend
    model and an ``XGBRegressor`` residual model. Cross-validation uses a
    5-split ``TimeSeriesSplit``; scoring is negative MAE.

    Returns:
        The best estimator found by the search (refitted by GridSearchCV).
    """
    print("創建混合模型並使用 GridSearchCV 進行超參數調優...")

    residual_model = XGBRegressor(random_state=42, eval_metric='mae')
    hybrid = HybridModel(
        trend_model=Ridge(),
        machine_model=residual_model,
        trend_cols=trend_features,
        machine_cols=machine_learning_features,
        all_columns=X_train.columns,
    )
    model_pipeline = Pipeline([('Encoder', CustomEncoder()), ('Model', hybrid)])

    # Only the Ridge alpha and the XGBoost learning rate vary; tree count
    # and depth are pinned to a single value.
    param_grid = {
        'Model__trend_model__alpha': [0.01, 0.1],
        'Model__machine_model__n_estimators': [500],
        'Model__machine_model__max_depth': [9],
        'Model__machine_model__learning_rate': [0.01, 0.05, 0.1],
    }

    print("開始 GridSearchCV 調優...")
    search = GridSearchCV(
        estimator=model_pipeline,
        param_grid=param_grid,
        cv=TimeSeriesSplit(n_splits=5),  # time-ordered folds
        scoring='neg_mean_absolute_error',
        n_jobs=-1,
        verbose=2,
        error_score='raise',
    )
    search.fit(X_train, y_train)

    print(f"GridSearchCV 最佳參數: {search.best_params_}")
    print(f"GridSearchCV 最佳 MAE (CV): {-search.best_score_:.4f}")

    return search.best_estimator_

def create_ensemble_model(best_models, X_train, y_train):
    """Wrap the tuned hybrid model in a fitted ``VotingRegressor``.

    Only ``best_models['HybridModel']`` is used as a member of the
    ensemble.

    Returns:
        The fitted ``VotingRegressor``.
    """
    print("創建集成模型...")

    members = [('HybridModel', best_models['HybridModel'])]
    ensemble_model = VotingRegressor(estimators=members).fit(X_train, y_train)

    print(f"集成模型: {ensemble_model}")

    return ensemble_model


def evaluate_model(model, X_train, y_train):
    """Print and return MAE, RMSE and R^2 of ``model`` on the given data.

    The metrics are computed on the data passed in (the training set in
    this file), so they measure fit, not generalisation.

    Returns:
        ``(mae, rmse, r2)``.
    """
    print("評估模型性能...")

    train_predictions = model.predict(X_train)

    mae = mean_absolute_error(y_train, train_predictions)
    # The `squared=False` argument of mean_squared_error is deprecated and
    # no longer accepted by recent scikit-learn releases; taking the square
    # root of the MSE works on every version.
    rmse = float(np.sqrt(mean_squared_error(y_train, train_predictions)))
    r2 = r2_score(y_train, train_predictions)

    print(f"[訓練集] MAE: {mae:.4f}, RMSE: {rmse:.4f}, R²: {r2:.4f}")

    return mae, rmse, r2

def clip_outliers(train_df):
    """Cap ``price`` and ``floorAreaSqM`` at their upper IQR fence (in place).

    For each of the two columns the upper fence is ``Q3 + 1.5 * IQR``;
    values above it are replaced by the fence. Lower values are untouched.

    Returns:
        The same ``train_df`` (mutated in place).
    """
    print("Clip 價格與連續特徵異常值...")

    for column in ('price', 'floorAreaSqM'):
        lower_quartile, upper_quartile = train_df[column].quantile([0.25, 0.75])
        fence = upper_quartile + 1.5 * (upper_quartile - lower_quartile)
        train_df[column] = train_df[column].clip(upper=fence)

    return train_df
def run_optuna_tuning(X_train, y_train, trend_features, machine_features, all_columns, n_trials=10):
    """Tune the LightGBM residual model with Optuna and refit on all data.

    Runs ``n_trials`` trials of ``objective`` (minimising cross-validated
    MAE), then builds a fresh ``CustomEncoder`` -> ``HybridModel`` pipeline
    with the best parameters and fits it on the full training data.

    Args:
        X_train, y_train: training features and target.
        trend_features: columns for the Ridge trend model.
        machine_features: columns for the LightGBM residual model.
        all_columns: column names used when the input is not a DataFrame.
        n_trials: number of Optuna trials.

    Returns:
        The fitted final pipeline.
    """
    print("開始 Optuna 超參數調優...")

    study = optuna.create_study(direction='minimize')
    study.optimize(
        lambda trial: objective(trial, X_train, y_train, trend_features, machine_features, all_columns),
        n_trials=n_trials,
        show_progress_bar=True,
    )

    print(f"最佳參數: {study.best_params}")
    print(f"最佳 MAE: {study.best_value:.4f}")

    # `study.best_params` only holds the sampled values. The fixed settings
    # used during tuning (device, seed, verbosity) are repeated here so the
    # final model matches the configuration that was actually evaluated and
    # stays reproducible.
    best_params = study.best_params
    final_model = Pipeline([
        ('Encoder', CustomEncoder()),
        ('Model', HybridModel(
            trend_model=Ridge(alpha=0.1),
            machine_model=LGBMRegressor(
                device='gpu',
                random_state=42,
                verbose=-1,
                **best_params
            ),
            trend_cols=trend_features,
            machine_cols=machine_features,
            all_columns=all_columns
        ))
    ])

    final_model.fit(X_train, y_train)

    return final_model

def train_with_fixed_params(X_train, y_train, trend_features, machine_learning_features, best_params):
    """Fit the Ridge + XGBoost hybrid pipeline with given hyperparameters.

    ``best_params`` uses pipeline parameter names (for example
    ``'Model__machine_model__max_depth'``) and is applied with
    ``set_params`` before fitting.

    Returns:
        The fitted pipeline.
    """
    print("使用固定參數進行最終模型訓練...")
    print(f"使用參數: {best_params}")

    # Both sub-models start from defaults; the real settings are injected
    # by set_params() below.
    hybrid = HybridModel(
        trend_model=Ridge(),
        machine_model=XGBRegressor(random_state=42, eval_metric='mae'),
        trend_cols=trend_features,
        machine_cols=machine_learning_features,
        all_columns=X_train.columns,
    )
    model_pipeline = Pipeline([('Encoder', CustomEncoder()), ('Model', hybrid)])

    model_pipeline.set_params(**best_params)
    model_pipeline.fit(X_train, y_train)

    print("模型訓練完成。")
    return model_pipeline

In [6]:

def load_and_prepare_data():
    """Read the train and test CSV files from the Kaggle input directory.

    The test frame gets an all-NaN ``price`` column so both frames share
    the same columns.

    Returns:
        ``(train_df, test_df)``.
    """
    print("載入資料...")
    train_df, test_df = (
        pd.read_csv(csv_path)
        for csv_path in (
            '/kaggle/input/london-house-price/train.csv',
            '/kaggle/input/london-house-price/test.csv',
        )
    )

    # Placeholder target for the test rows.
    test_df['price'] = np.nan

    return train_df, test_df


def generate_submission(model, test_data):
    """Predict on ``test_data`` and write the submission CSV.

    The sample submission is loaded as a template, its ``price`` column is
    overwritten with the model's predictions (made on ``test_data`` without
    its ``price`` column), and the result is saved to
    ``submission_TimeSeriesSplit.csv`` in the working directory.
    """
    print("生成提交檔案...")

    template = pd.read_csv('/kaggle/input/london-house-price/sample_submission.csv')

    predictions = model.predict(test_data.drop(columns='price'))
    template['price'] = predictions

    template.to_csv('submission_TimeSeriesSplit.csv', index=False)
    print("提交檔案已儲存為 submission_TimeSeriesSplit.csv")

In [2]:
def main():
    """Run the whole pipeline: load, engineer features, train, submit.

    Steps, in order: load data, build the monthly time index, address
    features, missing-value imputation, deterministic time-series terms,
    ratio / cross features, geometric features, feature preparation,
    training with fixed hyperparameters, evaluation on the training data,
    and writing the submission file.
    """
    print("=== 倫敦房價預測 - 最終整合版 ===")

    # 1. Load data.
    train_df, test_df = load_and_prepare_data()

    # 2. Basic time features; the monthly period becomes the index.
    data_list = create_time_features([train_df, test_df])
    train_df, test_df = data_list[0], data_list[1]
    train_df = train_df.set_index('time')
    test_df = test_df.set_index('time')

    # 3. Address text features (basic, then extended).
    data_list = preprocess_address_features([train_df, test_df])
    data_list = engineer_address_features(data_list)
    train_df, test_df = data_list[0], data_list[1]

    # 4. Missing values.
    data_list = handle_missing_values([train_df, test_df])
    train_df, test_df = data_list[0], data_list[1]

    # 5. Deterministic trend / seasonal terms.
    train_df, test_df, time_series_features = create_time_series_features(train_df, test_df)

    # 6. Ratio and cross features.
    data_list = create_additional_features([train_df, test_df])
    train_df, test_df = data_list[0], data_list[1]

    # 7. Geometric features. engineer_geo_features takes exactly the two
    #    frames (it uses no price information); passing a third argument
    #    raised a TypeError.
    train_df, test_df = engineer_geo_features(train_df, test_df)

    # 8. Final feature preparation (called once, after all features exist).
    #    This also scales the trend columns of test_df in place.
    X_train, y_train, trend_features, machine_learning_features = prepare_features(
        train_df, test_df, time_series_features
    )

    # 9. Train with fixed hyperparameters.
    best_params = {
        'Model__machine_model__learning_rate': 0.01,
        'Model__machine_model__max_depth': 9,
        'Model__machine_model__n_estimators': 500,
        'Model__trend_model__alpha': 0.01
    }
    final_model = train_with_fixed_params(
        X_train, y_train, trend_features, machine_learning_features, best_params
    )

    # 10. Evaluate and write the submission.
    evaluate_model(final_model, X_train, y_train)
    generate_submission(final_model, test_df)

    print("=== 程序執行完成 ===")

# Run the main program when executed as a script
if __name__ == "__main__":
    main()

=== 倫敦房價預測 - 混合模型 ===


NameError: name 'load_and_prepare_data' is not defined