In [15]:
from pathlib import Path
import re
import warnings
from typing import List, Tuple, Dict, Optional

import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score, KFold
from sklearn.compose import ColumnTransformer, TransformedTargetRegressor
from sklearn.pipeline import Pipeline
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import OneHotEncoder, StandardScaler, PolynomialFeatures
from sklearn.impute import SimpleImputer
from sklearn.feature_selection import VarianceThreshold
from sklearn.linear_model import LinearRegression, Lasso, Ridge, ElasticNet
from sklearn.metrics import mean_absolute_error

In [24]:
# ================== 全局变量 ==================
DATA_DIR        = Path.cwd() 
TRAIN_PRICE     = DATA_DIR / "ruc_Class25Q2_train_price.csv"
TRAIN_RENT      = DATA_DIR / "ruc_Class25Q2_train_rent.csv"
TEST_PRICE = DATA_DIR / "ruc_Class25Q2_test_price.csv"
TEST_RENT  = DATA_DIR / "ruc_Class25Q2_test_rent.csv"

OUT_PRICE  = DATA_DIR / "pred_test_price.csv"   
OUT_RENT   = DATA_DIR / "pred_test_rent.csv"    

FAST_DEMO       = True        # True: 仍6折，但CV最多抽样 MAX_CV_ROWS 且参数网格较小；False: 全量搜索
MAX_CV_ROWS     = 30000       # FAST_DEMO 抽样上限
CV_FOLDS        = 6           
RANDOM_STATE    = 111         

USE_POLY        = False       # 是否加入多项式/交互项（默认关闭，避免维度膨胀）
LOW_CARD_MAX    = 12          # OHE 仅对基数<=12的分类变量
LOW_CORR_THR    = 0.05        # 与目标Spearman绝对相关度阈值，低于此阈值的数值列剔除
VIF_THRESH      = 15.0        # 多重共线VIF阈值
VAR_THR         = 0.01        # 方差过滤阈值
# ============================================

warnings.filterwarnings("ignore", category=FutureWarning)
warnings.filterwarnings("ignore", category=UserWarning)


In [17]:
# ------------------ 清洗 ------------------
def normalize_colnames(df: pd.DataFrame) -> pd.DataFrame:
    """列名标准化：小写、非字母数字转下划线，去首尾下划线"""
    mapping = {c: re.sub(r"[^0-9a-z]+", "_", str(c).strip().lower()).strip("_") for c in df.columns}
    return df.rename(columns=mapping)

def strip_strings(df: pd.DataFrame) -> pd.DataFrame:
    """字符串去空格；统一空值：'', '—', '无' 等替换为 NaN"""
    empties = {"", "null", "none", "nan", "na", "无", "不详", "未知", "—", "-", "— —"}
    for c in df.columns:
        if pd.api.types.is_object_dtype(df[c]):
            df[c] = df[c].astype(str).str.strip()
            df[c] = df[c].replace({v: np.nan for v in empties})
    return df

def parse_chinese_number(s: str) -> float | None:
    """解析带中文单位的数字，如 '2.3万', '5千', '120元/㎡' -> 规范化为float（单位按数值本身）"""
    if s is None or (not isinstance(s, str)):
        try: return float(s)
        except Exception: return None
    x = s.strip()
    if x == "": return None
    sign = -1.0 if x.startswith("-") else 1.0
    x = x.lstrip("+-").strip()
    x = re.sub(r"[,\s]", "", x)
    scale = 1.0
    # 规模单位转换
    if re.search(r"亿", x): scale = 1e8; x = x.replace("亿", "")
    elif re.search(r"万", x): scale = 1e4; x = x.replace("万", "")
    elif re.search(r"[千kK]", x): scale = 1e3; x = re.sub(r"[千kK]", "", x)
    # 去尾部单位/杂质
    x = re.sub(r"(元|块|人民币|rmb|￥|/.*|平方米|平米|㎡|m2|m²|平|套|层|间|户|年|月|天)$", "", x, flags=re.I)
    try:
        return sign * float(x) * scale
    except Exception:
        # 若仍失败，兜底提取首个数字
        m = re.search(r"([+-]?\d+(?:\.\d+)?)", x)
        return sign * float(m.group(1)) * scale if m else None

def parse_area_m2(s: str) -> float | None:
    """解析面积字符串，可识别区间 '80-90㎡' 取均值"""
    if not isinstance(s, str): return parse_chinese_number(s)
    # 先解析区间均值（80-90㎡）
    m = re.search(r"([+-]?\d+(?:\.\d+)?)\s*(?:-|~|至|到)\s*([+-]?\d+(?:\.\d+)?)", s)
    if m:
        a, b = parse_chinese_number(m.group(1)), parse_chinese_number(m.group(2))
        if a is not None and b is not None: return (a + b) / 2.0
    # 统一面积单位
    x = (s.strip().lower()
         .replace("平方米","㎡").replace("平米","㎡").replace("m2","㎡")
         .replace("m^2","㎡").replace("m²","㎡")).replace("㎡","")
    return parse_chinese_number(x)

def parse_floor_info(s: str) -> tuple[float | None, float | None, str | None]:
    """解析楼层信息：当前层数、总层数、楼层等级（高/中/低/顶/底）"""
    if not isinstance(s, str): return None, None, None
    x = s.strip()
    level = next((lvl for lvl in ["高层","中层","低层","顶层","底层"] if lvl in x), None)
    total = None
    m_total = re.search(r"共\s*([0-9]+)\s*层", x) or re.search(r"/\s*([0-9]+)\s*层", x)
    if m_total: total = float(m_total.group(1))
    cur = None
    m_cur = re.search(r"(?:第)?\s*([0-9]+)\s*层", x) or re.search(r"^\s*([0-9]+)\s*/", x)
    if m_cur:
        try: cur = float(m_cur.group(1))
        except Exception: cur = None
    return cur, total, level

def parse_layout_compact(s: str) -> dict:
    """户型解析为少量**数值**列（室/厅/厨/卫），避免OHE产生过多维度"""
    d = {"layout_rooms":0.0,"layout_halls":0.0,"layout_kitchens":0.0,"layout_baths":0.0}
    if not isinstance(s, str): return d
    for k, ch in [("layout_rooms","室"),("layout_halls","厅"),("layout_kitchens","厨"),("layout_baths","卫")]:
        m = re.search(r"(\d+)\s*"+ch, s); 
        if m: d[k] = float(m.group(1))
    return d

def parse_year_built(s: str) -> float | None:
    """解析建成年份（如 '1998年'）"""
    if not isinstance(s, str): return None
    m = re.search(r"(19\d{2}|20\d{2})\s*年", s)
    return float(m.group(1)) if m else None

def orientation_bucket(s: str) -> str | None:
    """朝向合并为少数类别：E/S/W/N/NE/NW/SE/SW/NS_Through/Other"""
    if not isinstance(s, str) or s.strip()=="":
        return None
    st = s.strip()
    if "南北" in st and "通" in st: return "NS_Through"
    for k in [("东南","SE"),("东北","NE"),("西南","SW"),("西北","NW")]:
        if k[0] in st: return k[1]
    if "东" in st: return "E"
    if "西" in st: return "W"
    if "南" in st: return "S"
    if "北" in st: return "N"
    return "Other"

def detect_cols(df: pd.DataFrame, keys: list[str]) -> Optional[str]:
    """根据关键字模糊匹配列名，返回命中的第一列名；找不到返回 None"""
    for key in keys:
        for c in df.columns:
            if key in c:
                return c
    return None

def clean_housing_df(df: pd.DataFrame, is_rent: bool) -> pd.DataFrame:
    """正则清洗：目标是输出少量数值/有限类别特征；避免生成大量二值列"""
    df = normalize_colnames(df)
    df = strip_strings(df)

    # 可能的关键列定位
    colmap: Dict[str, str] = {}
    colmap["layout"]    = detect_cols(df, ["layout","huxing","house_type","户型"])
    colmap["floor"]     = detect_cols(df, ["floor","楼层"])
    colmap["area"]      = detect_cols(df, ["area","mianji","面积"])
    colmap["unit_price"]= detect_cols(df, ["unit_price","danjia","单价","price_per_m2","price_sqm"])
    colmap["total_price"]=detect_cols(df, ["total_price","zongjia","总价","price"])
    colmap["orientation"]= detect_cols(df, ["orientation","chaoxiang","朝向"])
    colmap["decoration"]= detect_cols(df, ["decoration","zhuangxiu","装修"])
    colmap["year_built"]= detect_cols(df, ["year_built","build_year","建","竣工","年代"])
    colmap["elevator"]  = detect_cols(df, ["elevator","dianti","电梯"])
    colmap["subway"]    = detect_cols(df, ["subway","地铁","metro","rail"])
    colmap["property"]  = detect_cols(df, ["property","chanquan","产权"])

    # 户型四个数值列
    if colmap.get("layout"):
        parsed = df[colmap["layout"]].fillna("").apply(parse_layout_compact)
        for k in parsed.iloc[0].keys():
            df[k] = parsed.apply(lambda d: d.get(k, 0.0))

    # 楼层文本 -> 当前层数/总层数/等级 + 楼层比
    if colmap.get("floor"):
        parsed = df[colmap["floor"]].fillna("").apply(parse_floor_info)
        df["floor_number"]  = parsed.apply(lambda t: t[0])
        df["total_floors"]  = parsed.apply(lambda t: t[1])
        df["floor_level"]   = parsed.apply(lambda t: t[2])  # 少数等级类别
        with np.errstate(invalid="ignore", divide="ignore"):
            df["floor_ratio"] = df["floor_number"] / df["total_floors"]

    # 面积/单价/总价解析
    if colmap.get("area"):
        df["area_m2"] = df[colmap["area"]].apply(lambda x: parse_area_m2(x) if isinstance(x, str) else parse_area_m2(str(x)))
    if colmap.get("unit_price"):
        df["unit_price_per_m2"] = df[colmap["unit_price"]].apply(parse_chinese_number)
    if colmap.get("total_price"):
        df["total_price_yuan"] = df[colmap["total_price"]].apply(parse_chinese_number)

    # 朝向合并
    if colmap.get("orientation"):
        df["orientation_bucket"] = df[colmap["orientation"]].apply(orientation_bucket)

    # 装修映射到少数类别
    if colmap.get("decoration"):
        c = colmap["decoration"]
        df["decoration_std"] = None
        mapping = {"精装":"fine", "简装":"simple", "豪装":"luxury", "中装":"mid", "毛坯":"bare"}
        for zh, en in mapping.items():
            df.loc[df[c].astype(str).str.contains(zh, na=False), "decoration_std"] = en

    # 建成年份
    if colmap.get("year_built"):
        df["year_built_num"] = df[colmap["year_built"]].apply(parse_year_built)

    # 电梯：1/0/NaN
    if colmap.get("elevator"):
        s = df[colmap["elevator"]].astype(str)
        df["has_elevator"] = np.where(s.str.contains("无电梯|没电梯"), 0,
                               np.where(s.str.contains("有电梯|配电梯|电梯"), 1, np.nan))

    # 地铁距离，最终统一为米
    if colmap.get("subway"):
        s = df[colmap["subway"]].astype(str)
        km = s.str.extract(r"([0-9]+(?:\.\d+)?)\s*(?:km|公里)", expand=False)
        m  = s.str.extract(r"([0-9]+(?:\.\d+)?)\s*(?:m|米)", expand=False)
        df["subway_dist_m"] = np.where(km.notna(), km.astype(float)*1000, m.astype(float))

    # 产权年限
    if colmap.get("property"):
        s = df[colmap["property"]].astype(str)
        years = s.str.extract(r"([1-9][0-9]?)\s*年\s*产\s*权", expand=False)
        df["property_years"] = years.astype(float)

    # 基本合理性修正
    for c in ["area_m2","unit_price_per_m2","total_price_yuan","subway_dist_m"]:
        if c in df.columns: df.loc[df[c] < 0, c] = np.nan
    if "floor_number" in df.columns and "total_floors" in df.columns:
        df.loc[df["floor_number"] > df["total_floors"], "floor_number"] = np.nan

    return df


In [23]:
# ------------------ 防泄漏 & 特征工程 ------------------
LEAK_PATTERNS = [
    r"community.*price",                 
    r"avg.*price", r"mean.*price", r"median.*price",
    r"list.*price", r"quoted.*price",    
    r"target", r"y_true", r"label", r"ground.*truth",  
]
class LeakageDropper(BaseEstimator, TransformerMixin):
    """按列名正则匹配，删除疑似泄漏特征（在训练前置处理阶段）"""
    def __init__(self, target: str):
        self.target = target
        self.to_drop_: List[str] = []
    def fit(self, X: pd.DataFrame, y=None):
        cols = normalize_colnames(X).columns
        for c in cols:
            if c == self.target: continue
            for pat in LEAK_PATTERNS:
                if re.search(pat, c, flags=re.IGNORECASE):
                    self.to_drop_.append(c); break
        self.to_drop_ = sorted(set(self.to_drop_)); return self
    def transform(self, X: pd.DataFrame):
        return X.drop(columns=[c for c in self.to_drop_ if c in X.columns], errors="ignore")

class IQRClipper(BaseEstimator, TransformerMixin):
    """对数值特征按IQR区间截断，缓解极端值影响"""
    def __init__(self): self.bounds_=[]; self.n_features_=0
    @staticmethod
    def _as_2d(X):
        if isinstance(X, pd.DataFrame): return X.to_numpy(dtype=float)
        if isinstance(X, np.ndarray):
            X = X.astype(float, copy=False)
            return X if X.ndim==2 else X.reshape(-1,1)
        return np.asarray(X, dtype=float)
    def fit(self, X, y=None):
        A = self._as_2d(X); self.n_features_ = A.shape[1]; self.bounds_=[]
        for j in range(self.n_features_):
            col = A[:,j]; col = col[~np.isnan(col)]
            if col.size==0: self.bounds_.append((np.nan,np.nan)); continue
            q1,q3 = np.nanpercentile(col,[25,75]); iqr=q3-q1
            self.bounds_.append((q1-1.5*iqr, q3+1.5*iqr))
        return self
    def transform(self, X):
        A = self._as_2d(X).copy()
        for j,(lo,hi) in enumerate(self.bounds_):
            if not (np.isnan(lo) or np.isnan(hi)): A[:,j] = np.clip(A[:,j], lo, hi)
        return A

class AutoLogTransformer(BaseEstimator, TransformerMixin):
    """对右偏严重且非负的数值特征做 log1p 变换"""
    def __init__(self, skew_threshold: float=1.0): 
        self.skew_threshold=skew_threshold; self.log_mask_=None
    def fit(self, X, y=None):
        A = IQRClipper._as_2d(X); self.log_mask_ = np.zeros(A.shape[1], dtype=bool)
        for j in range(A.shape[1]):
            col = A[:,j]; col = col[~np.isnan(col)]
            if col.size==0: continue
            if np.nanmin(col) >= 0:
                sk = pd.Series(col).skew(skipna=True)
                if np.isfinite(sk) and sk > self.skew_threshold: self.log_mask_[j]=True
        return self
    def transform(self, X):
        A = IQRClipper._as_2d(X).copy()
        if self.log_mask_ is None: return A
        for j,flag in enumerate(self.log_mask_):
            if flag: A[:,j] = np.log1p(np.where(np.isnan(A[:,j]), np.nan, A[:,j]))
        return A

def identify_target(df: pd.DataFrame) -> str:
    """自动识别目标列：优先 'price' 其次 'rent'"""
    cols = normalize_colnames(df).columns
    if "price" in cols: return "price"
    if "rent"  in cols: return "rent"
    for c in cols:
        if c in ("y","target","label"): return c
    raise ValueError("无法识别目标列（price 或 rent）。")

def select_low_card_cats(X: pd.DataFrame, max_card=LOW_CARD_MAX) -> List[str]:
    """仅保留低基数分类变量用于 OHE，避免维度爆炸"""
    cats = [c for c in X.columns if not pd.api.types.is_numeric_dtype(X[c])]
    keep = []
    for c in cats:
        try:
            if X[c].nunique(dropna=True) <= max_card: keep.append(c)
        except Exception: pass
    return keep

def fast_corr_prefilter(X: pd.DataFrame, y: pd.Series, thr: float=LOW_CORR_THR) -> List[str]:
    """在训练集上用 Spearman 相关度筛掉与目标弱相关的数值特征"""
    num_cols = [c for c in X.columns if pd.api.types.is_numeric_dtype(X[c])]
    low_cols = []
    y_s = pd.to_numeric(y, errors="coerce")
    for c in num_cols:
        try:
            s = pd.to_numeric(X[c], errors="coerce")
            corr = s.corr(y_s, method="spearman")
            if (corr is None) or (not np.isfinite(corr)) or (abs(corr) < thr): low_cols.append(c)
        except Exception: continue
    return low_cols

def compute_high_vif_numeric(df: pd.DataFrame, thresh: float=VIF_THRESH) -> List[str]:
    """计算数值特征 VIF，剔除多重共线严重的列（、"""
    try:
        import statsmodels.api as sm
        from statsmodels.stats.outliers_influence import variance_inflation_factor
    except Exception:
        return []
    numeric_cols = [c for c in df.columns if pd.api.types.is_numeric_dtype(df[c]) and c not in ("price","rent")]
    if len(numeric_cols) < 2: return []
    X = df[numeric_cols].copy().replace([np.inf,-np.inf], np.nan)
    X = X.fillna(X.median(numeric_only=True))
    Xc = sm.add_constant(X)
    high = []
    for i in range(1, Xc.shape[1]):
        try:
            vif = variance_inflation_factor(Xc.values, i)
            if np.isfinite(vif) and vif >= thresh: high.append(X.columns[i-1])
        except Exception: pass
    return sorted(set(high))

def make_ohe(max_cats=12):
    """兼容不同 sklearn 版本的 OHE；限制最大类别数以控制维度"""
    try:
        return OneHotEncoder(handle_unknown="ignore", sparse_output=False, max_categories=max_cats)
    except TypeError:
        # 旧版本退化为不设 max_categories
        try:
            return OneHotEncoder(handle_unknown="ignore", sparse_output=False)
        except TypeError:
            # 极旧版本
            return OneHotEncoder(handle_unknown="ignore", sparse=False)

def build_preprocessor(X_train: pd.DataFrame, use_poly: bool) -> ColumnTransformer:
    """构建列式预处理：数值（缺失填充→IQR截断→按需log→标准化）+ 低基数分类（OHE），最后方差过滤"""
    num_cols = [c for c in X_train.columns if pd.api.types.is_numeric_dtype(X_train[c])]
    cat_cols = select_low_card_cats(X_train, max_card=LOW_CARD_MAX)

    num_steps = [("num_impute", SimpleImputer(strategy="median")),
                 ("num_clip", IQRClipper()),
                 ("num_log", AutoLogTransformer(skew_threshold=1.0))]
    if use_poly:
        num_steps.append(("num_poly", PolynomialFeatures(degree=2, include_bias=False)))
    num_steps.append(("num_scale", StandardScaler()))
    numeric_pipe = Pipeline(steps=num_steps)

    cat_pipe = Pipeline(steps=[("cat_impute", SimpleImputer(strategy="most_frequent")),
                               ("cat_ohe", make_ohe(max_cats=12))])

    pre = ColumnTransformer(transformers=[("num", numeric_pipe, num_cols),
                                          ("cat", cat_pipe, cat_cols)],
                            remainder="drop")
    # 方差过滤放在最后，进一步去除近似常量特征
    preproc = Pipeline(steps=[("pre_cols", pre),
                              ("variance_filter", VarianceThreshold(VAR_THR))])
    return preproc

In [20]:
# ------------------ 训练 & 汇报 ------------------
def _append_rmae_columns(df: pd.DataFrame, y_tr: pd.Series, y_te: pd.Series, cv_mae_col="Cross-validation") -> pd.DataFrame:
    """在指标表中追加 RMAE（=MAE / mean(y)）"""
    df = df.copy()
    y_tr_mean = float(np.nanmean(y_tr))
    y_te_mean = float(np.nanmean(y_te))
    if not np.isfinite(y_te_mean): y_te_mean = y_tr_mean
    df["In sample RMAE"]      = df["In sample"]       / y_tr_mean if y_tr_mean else np.nan
    df["Out of sample RMAE"]  = df["Out of sample"]   / y_te_mean if y_te_mean else np.nan
    df["CV RMAE"]             = df[cv_mae_col]        / y_tr_mean if y_tr_mean else np.nan
    cols = ["Metrics","In sample","In sample RMAE","Out of sample","Out of sample RMAE","Cross-validation","CV RMAE","Kaggle Score"]
    return df[cols]

def _pretty_print_block(title_cn: str, metrics_df: pd.DataFrame):
    """美观打印指标块（带中文标题与分隔线）"""
    line = "=" * 72
    print(line)
    print(title_cn)
    print(line)
    print(metrics_df.to_string(index=False))
    print()

def fit_and_report(df_raw: pd.DataFrame, dataset_name: str) -> pd.DataFrame:
    """对单个数据集执行：清洗→分割→筛特征→建模→评估→返回指标表"""
    print(f"\n========== [{dataset_name}] 清洗 ==========")
    cleaned = clean_housing_df(df_raw, is_rent=(dataset_name=="RENT"))

    # 识别目标列、去除缺失目标
    target = identify_target(cleaned)
    y_all_raw = pd.to_numeric(cleaned[target], errors="coerce")
    mask = y_all_raw.notna()
    cleaned = cleaned.loc[mask].reset_index(drop=True)
    y_all_raw = y_all_raw.loc[mask]

    X_all = cleaned.drop(columns=[target])

    # 防泄漏：按列名规则删除强疑似泄漏列
    leak_drp = LeakageDropper(target=target).fit(X_all, None)
    if leak_drp.to_drop_:
        print(f"[{dataset_name}] 删除疑似泄漏列: {leak_drp.to_drop_}")
        X_all = X_all.drop(columns=leak_drp.to_drop_, errors="ignore")

    # 80/20 切分train和validation
    X_tr, X_te, y_tr_raw, y_te_raw = train_test_split(X_all, y_all_raw, test_size=0.2, random_state=RANDOM_STATE)

    #  IQR 去异常值
    q1, q3 = np.nanpercentile(y_tr_raw, [25, 75])
    iqr = q3 - q1
    lo, hi = q1 - 1.5*iqr, q3 + 1.5*iqr
    keep = (y_tr_raw >= lo) & (y_tr_raw <= hi)
    X_tr, y_tr_raw = X_tr.loc[keep], y_tr_raw.loc[keep]
    print(f"[{dataset_name}] IQR 剔除目标极端值: {int((~keep).sum())}；保留训练样本: {X_tr.shape[0]}")
    print(f"[{dataset_name}] 验证集预测条数(Out-of-sample): {X_te.shape[0]}")

    # 低相关特征与高VIF共线特征
    low_corr = fast_corr_prefilter(X_tr, y_tr_raw, thr=LOW_CORR_THR)
    if low_corr:
        print(f"[{dataset_name}] 删除低相关数值列(|Spearman|<{LOW_CORR_THR}): {len(low_corr)} 列")
        X_tr = X_tr.drop(columns=low_corr, errors="ignore")
        X_te = X_te.drop(columns=low_corr, errors="ignore")

    high_vif = compute_high_vif_numeric(pd.concat([X_tr, y_tr_raw], axis=1), thresh=VIF_THRESH)
    if high_vif:
        print(f"[{dataset_name}] 删除高VIF(≥{VIF_THRESH})列: {high_vif}")
        X_tr = X_tr.drop(columns=high_vif, errors="ignore")
        X_te = X_te.drop(columns=high_vif, errors="ignore")

    # 预处理流水线+ 方差过滤
    preproc = build_preprocessor(X_tr, use_poly=USE_POLY)

    # FAST_DEMO 模式下限制参与CV的样本规模
    cv = KFold(n_splits=CV_FOLDS, shuffle=True, random_state=RANDOM_STATE)
    X_cv, y_cv_raw = X_tr, y_tr_raw
    if FAST_DEMO and X_tr.shape[0] > MAX_CV_ROWS:
        rng = np.random.RandomState(RANDOM_STATE)
        idx = rng.choice(np.arange(X_tr.shape[0]), size=MAX_CV_ROWS, replace=False)
        X_cv, y_cv_raw = X_tr.iloc[idx], y_tr_raw.iloc[idx]
        print(f"[{dataset_name}] CV样本: {X_cv.shape[0]} / {X_tr.shape[0]} (FAST_DEMO)")

    results = []

    # ---- OLS ----
    print(f"[{dataset_name}] 训练：OLS")
    ols_pipe = Pipeline([("pre", preproc), ("reg", LinearRegression())])
    ols_ttr  = TransformedTargetRegressor(regressor=ols_pipe, func=np.log1p, inverse_func=np.expm1)
    cv_mae_ols = -np.mean(cross_val_score(ols_ttr, X_cv, y_cv_raw, cv=cv, scoring="neg_mean_absolute_error", n_jobs=-1))
    ols_ttr.fit(X_tr, y_tr_raw)
    results.append({"Metrics":"OLS",
                    "In sample": mean_absolute_error(y_tr_raw, ols_ttr.predict(X_tr)),
                    "Out of sample": mean_absolute_error(y_te_raw, ols_ttr.predict(X_te)),
                    "Cross-validation": cv_mae_ols,
                    "Kaggle Score": "待提交"})

    # ---- LASSO ----
    print(f"[{dataset_name}] 训练：LASSO")
    lasso_pipe = Pipeline([("pre", preproc), ("reg", Lasso(max_iter=10000, random_state=RANDOM_STATE))])
    lasso_ttr  = TransformedTargetRegressor(regressor=lasso_pipe, func=np.log1p, inverse_func=np.expm1)
    # 覆盖较弱到中等的惩罚强度，以避免过度稀疏导致欠拟合
    lasso_grid = {
        "regressor__reg__alpha": (
            np.r_[np.logspace(-4, -1, 5), np.logspace(-1, 2, 8)] if FAST_DEMO
            else np.logspace(-4, 2, 20)
        )
    }
    lasso_cv   = GridSearchCV(lasso_ttr, lasso_grid, scoring="neg_mean_absolute_error", cv=cv, n_jobs=-1)
    lasso_cv.fit(X_cv, y_cv_raw)
    lasso_best = lasso_cv.best_estimator_
    results.append({"Metrics":"LASSO",
                    "In sample": mean_absolute_error(y_tr_raw, lasso_best.predict(X_tr)),
                    "Out of sample": mean_absolute_error(y_te_raw, lasso_best.predict(X_te)),
                    "Cross-validation": -lasso_cv.best_score_,
                    "Kaggle Score": "待提交"})

    # ---- Ridge ----
    print(f"[{dataset_name}] 训练：Ridge")
    ridge_pipe = Pipeline([("pre", preproc), ("reg", Ridge())])
    ridge_ttr  = TransformedTargetRegressor(regressor=ridge_pipe, func=np.log1p, inverse_func=np.expm1)
    ridge_grid = {"regressor__reg__alpha": (np.logspace(0, 4, 6) if FAST_DEMO else np.logspace(-2, 5, 12))} # 1~1e4
    ridge_cv   = GridSearchCV(ridge_ttr, ridge_grid, scoring="neg_mean_absolute_error", cv=cv, n_jobs=-1)
    ridge_cv.fit(X_cv, y_cv_raw)
    ridge_best = ridge_cv.best_estimator_
    results.append({"Metrics":"Ridge",
                    "In sample": mean_absolute_error(y_tr_raw, ridge_best.predict(X_tr)),
                    "Out of sample": mean_absolute_error(y_te_raw, ridge_best.predict(X_te)),
                    "Cross-validation": -ridge_cv.best_score_,
                    "Kaggle Score": "待提交"})

    # ---- Elastic Net ----
    print(f"[{dataset_name}] 训练：Elastic Net")
    enet_pipe = Pipeline([("pre", preproc), ("reg", ElasticNet(max_iter=10000, random_state=RANDOM_STATE))])
    enet_ttr  = TransformedTargetRegressor(regressor=enet_pipe, func=np.log1p, inverse_func=np.expm1)
    enet_grid = {
        # 1e-4 ~ 1e+2，覆盖弱到中等惩罚
        "regressor__reg__alpha": (
            np.r_[np.logspace(-4, -1, 5), np.logspace(-1, 2, 8)] if FAST_DEMO
            else np.logspace(-4, 2, 20)
        ),
        # 加入接近 Ridge 的比率，避免过度稀疏
        "regressor__reg__l1_ratio": ([0.05, 0.2, 0.4, 0.6, 0.8] if FAST_DEMO
                                     else [0.05, 0.1, 0.3, 0.5, 0.7, 0.9])
    }
    enet_cv   = GridSearchCV(enet_ttr, enet_grid, scoring="neg_mean_absolute_error", cv=cv, n_jobs=-1)
    enet_cv.fit(X_cv, y_cv_raw)
    enet_best = enet_cv.best_estimator_
    results.append({"Metrics":"Elastic Net",
                    "In sample": mean_absolute_error(y_tr_raw, enet_best.predict(X_tr)),
                    "Out of sample": mean_absolute_error(y_te_raw, enet_best.predict(X_te)),
                    "Cross-validation": -enet_cv.best_score_,
                    "Kaggle Score": "待提交"})

    # 最佳线性模型
    metrics = pd.DataFrame(results).sort_values("Cross-validation").reset_index(drop=True)
    best_row = metrics.loc[0].copy()
    best_row["Metrics"] = "Best Linear Model"
    metrics = pd.concat([metrics, pd.DataFrame([best_row])], ignore_index=True)

    # 追加 RMAE 列
    metrics = _append_rmae_columns(metrics, y_tr_raw, y_te_raw, cv_mae_col="Cross-validation")
    return metrics

def _get_id_series(df_raw: pd.DataFrame) -> pd.Series:
    """拿到ID列（优先 'ID'，其次 'id'；若都没有则回退为顺序编号）"""
    if "ID" in df_raw.columns: 
        return df_raw["ID"]
    if "id" in df_raw.columns:
        return df_raw["id"]
    return pd.Series(np.arange(len(df_raw)), name="ID")

In [21]:
def _ensure_required_columns_for_predict(fitted_ttr, df: pd.DataFrame) -> pd.DataFrame:
    """
    根据已拟合的 TransformedTargetRegressor 内部 ColumnTransformer，补齐预测所需但缺失的列。
    """
    try:
        pre_pipe = fitted_ttr.regressor_["pre"]
        coltrans = pre_pipe.named_steps["pre_cols"]
    except Exception:
        return df

    required_cols = []
    for name, trans, cols in getattr(coltrans, "transformers_", []):
        if cols is None:
            continue
        if isinstance(cols, list):
            required_cols.extend(cols)
        else:
            try:
                required_cols.extend(list(cols))
            except Exception:
                pass

    required_cols = [c for c in required_cols if isinstance(c, str)]
    missing = [c for c in required_cols if c not in df.columns]
    for c in missing:
        df[c] = np.nan
    return df

def predict_to_csv(best_estimator, test_csv: Path, out_csv: Path, is_rent: bool):
    """用训练好的最佳模型对测试集预测，并导出为两列(ID, Price)的CSV"""
    test_raw = pd.read_csv(test_csv, low_memory=False)
    ids = _get_id_series(test_raw)
    # 与训练一致的清洗；只做列转换，不删样本
    test_clean = clean_housing_df(test_raw, is_rent=is_rent)
    # 补齐训练所需但测试缺失的列
    test_clean = _ensure_required_columns_for_predict(best_estimator, test_clean)
    # 预测
    preds = best_estimator.predict(test_clean)
    pd.DataFrame({"ID": ids.values, "Price": preds}).to_csv(out_csv, index=False)

def train_best_estimator(df_raw: pd.DataFrame, dataset_name: str):
    """
    仅训练并返回“最佳线性模型”用于预测（与 fit_and_report 的模型与网格一致），
    保持与上方训练流程一致，按CV MAE 最小选择 OLS/LASSO/Ridge/ENet 中的最佳者。
    """
    # —— 以下基本复用在 fit_and_report 里的流程 ——
    cleaned = clean_housing_df(df_raw, is_rent=(dataset_name=="RENT"))
    target = identify_target(cleaned)
    y_all_raw = pd.to_numeric(cleaned[target], errors="coerce")
    mask = y_all_raw.notna()
    cleaned = cleaned.loc[mask].reset_index(drop=True)
    y_all_raw = y_all_raw.loc[mask]
    X_all = cleaned.drop(columns=[target])

    # 防泄漏
    leak_drp = LeakageDropper(target=target).fit(X_all, None)
    X_all = X_all.drop(columns=leak_drp.to_drop_, errors="ignore")

    # 切分
    X_tr, X_te, y_tr_raw, y_te_raw = train_test_split(X_all, y_all_raw, test_size=0.2, random_state=RANDOM_STATE)

    # IQR 去极端
    q1, q3 = np.nanpercentile(y_tr_raw, [25, 75]); iqr = q3 - q1
    lo, hi = q1 - 1.5*iqr, q3 + 1.5*iqr
    keep = (y_tr_raw >= lo) & (y_tr_raw <= hi)
    X_tr, y_tr_raw = X_tr.loc[keep], y_tr_raw.loc[keep]

    # 低相关 / 高VIF
    low_corr = fast_corr_prefilter(X_tr, y_tr_raw, thr=LOW_CORR_THR)
    X_tr = X_tr.drop(columns=low_corr, errors="ignore")
    X_te = X_te.drop(columns=low_corr, errors="ignore")
    high_vif = compute_high_vif_numeric(pd.concat([X_tr, y_tr_raw], axis=1), thresh=VIF_THRESH)
    X_tr = X_tr.drop(columns=high_vif, errors="ignore")
    X_te = X_te.drop(columns=high_vif, errors="ignore")

    # 预处理
    preproc = build_preprocessor(X_tr, use_poly=USE_POLY)

    # CV 与网格
    cv = KFold(n_splits=CV_FOLDS, shuffle=True, random_state=RANDOM_STATE)
    X_cv, y_cv_raw = X_tr, y_tr_raw
    if FAST_DEMO and X_tr.shape[0] > MAX_CV_ROWS:
        rng = np.random.RandomState(RANDOM_STATE)
        idx = rng.choice(np.arange(X_tr.shape[0]), size=MAX_CV_ROWS, replace=False)
        X_cv, y_cv_raw = X_tr.iloc[idx], y_tr_raw.iloc[idx]

    candidates = {}

    # OLS
    ols_pipe = Pipeline([("pre", preproc), ("reg", LinearRegression())])
    ols = TransformedTargetRegressor(regressor=ols_pipe, func=np.log1p, inverse_func=np.expm1)
    ols_cv_mae = -np.mean(cross_val_score(ols, X_cv, y_cv_raw, cv=cv, scoring="neg_mean_absolute_error", n_jobs=-1))
    ols.fit(X_tr, y_tr_raw)
    candidates["OLS"] = (ols_cv_mae, ols)

    # LASSO
    lasso_pipe = Pipeline([("pre", preproc), ("reg", Lasso(max_iter=10000, random_state=RANDOM_STATE))])
    lasso = TransformedTargetRegressor(regressor=lasso_pipe, func=np.log1p, inverse_func=np.expm1)
    lasso_grid = {"regressor__reg__alpha": (np.r_[np.logspace(-4, -1, 5), np.logspace(-1, 2, 8)] if FAST_DEMO else np.logspace(-4, 2, 20))}
    lasso_cv = GridSearchCV(lasso, lasso_grid, scoring="neg_mean_absolute_error", cv=cv, n_jobs=-1)
    lasso_cv.fit(X_cv, y_cv_raw)
    candidates["LASSO"] = (-lasso_cv.best_score_, lasso_cv.best_estimator_)

    # Ridge
    ridge_pipe = Pipeline([("pre", preproc), ("reg", Ridge())])
    ridge = TransformedTargetRegressor(regressor=ridge_pipe, func=np.log1p, inverse_func=np.expm1)
    ridge_grid = {"regressor__reg__alpha": (np.logspace(0, 4, 6) if FAST_DEMO else np.logspace(-2, 5, 12))}
    ridge_cv = GridSearchCV(ridge, ridge_grid, scoring="neg_mean_absolute_error", cv=cv, n_jobs=-1)
    ridge_cv.fit(X_cv, y_cv_raw)
    candidates["Ridge"] = (-ridge_cv.best_score_, ridge_cv.best_estimator_)

    # Elastic Net
    enet_pipe = Pipeline([("pre", preproc), ("reg", ElasticNet(max_iter=10000, random_state=RANDOM_STATE))])
    enet = TransformedTargetRegressor(regressor=enet_pipe, func=np.log1p, inverse_func=np.expm1)
    enet_grid = {
        "regressor__reg__alpha": (np.r_[np.logspace(-4, -1, 5), np.logspace(-1, 2, 8)] if FAST_DEMO else np.logspace(-4, 2, 20)),
        "regressor__reg__l1_ratio": ([0.05, 0.2, 0.4, 0.6, 0.8] if FAST_DEMO else [0.05, 0.1, 0.3, 0.5, 0.7, 0.9])
    }
    enet_cv = GridSearchCV(enet, enet_grid, scoring="neg_mean_absolute_error", cv=cv, n_jobs=-1)
    enet_cv.fit(X_cv, y_cv_raw)
    candidates["Elastic Net"] = (-enet_cv.best_score_, enet_cv.best_estimator_)

    # 选 CV MAE 最小者
    best_name = min(candidates.items(), key=lambda kv: kv[1][0])[0]
    best_est = candidates[best_name][1]
    # 以全量训练集拟合一次最佳模型
    return best_est

In [22]:
# ------------------ 主程序 ------------------
def main():
    """读取数据→分别训练 PRICE/RENT→打印两块指标表"""
    if not TRAIN_PRICE.exists() or not TRAIN_RENT.exists():
        raise FileNotFoundError(f"未找到训练数据：\n- {TRAIN_PRICE}\n- {TRAIN_RENT}\n请检查 DATA_DIR 或文件名。")

    price_train = pd.read_csv(TRAIN_PRICE, low_memory=False)
    rent_train  = pd.read_csv(TRAIN_RENT,  low_memory=False)

    price_metrics = fit_and_report(price_train, dataset_name="PRICE")
    rent_metrics  = fit_and_report(rent_train,  dataset_name="RENT")

    print("\n======== PRICE Metrics (MAE) ========")
    _pretty_print_block("价格  模型性能汇总：", price_metrics)
    print("======== RENT Metrics (MAE) ========")
    _pretty_print_block("租金  模型性能汇总：", rent_metrics)

    # ==== 导出 test 的两个 CSV ====
    price_best = train_best_estimator(price_train, dataset_name="PRICE")
    rent_best  = train_best_estimator(rent_train,  dataset_name="RENT")
    predict_to_csv(price_best, TEST_PRICE, OUT_PRICE, is_rent=False)
    predict_to_csv(rent_best,  TEST_RENT,  OUT_RENT,  is_rent=True)

    print(f"\n已导出：\n- {OUT_PRICE}\n- {OUT_RENT}\n")
    
if __name__ == "__main__":
    main()


[PRICE] IQR 剔除目标极端值: 6312；保留训练样本: 76784
[PRICE] 验证集预测条数(Out-of-sample): 20775
[PRICE] 删除高VIF(≥15.0)列: ['coord_x', 'coord_y', 'lat', 'lon']
[PRICE] CV样本: 30000 / 76784 (FAST_DEMO)
[PRICE] 训练：OLS
[PRICE] 训练：LASSO
[PRICE] 训练：Ridge
[PRICE] 训练：Elastic Net

[RENT] IQR 剔除目标极端值: 4392；保留训练样本: 74727
[RENT] 验证集预测条数(Out-of-sample): 19780
[RENT] 删除低相关数值列(|Spearman|<0.05): 1 列
[RENT] 删除高VIF(≥15.0)列: ['coord_x', 'lon']
[RENT] CV样本: 30000 / 74727 (FAST_DEMO)
[RENT] 训练：OLS
[RENT] 训练：LASSO
[RENT] 训练：Ridge
[RENT] 训练：Elastic Net

价格  模型性能汇总：
          Metrics     In sample  In sample RMAE  Out of sample  Out of sample RMAE  Cross-validation  CV RMAE Kaggle Score
            LASSO 264368.093750        0.154154  487137.596984            0.218064     265271.728779 0.154681          待提交
      Elastic Net 267149.100409        0.155776  482963.737019            0.216196     268083.842920 0.156321          待提交
            Ridge 304332.351388        0.177458  488157.436197            0.218521     300646.354675 0