## 欢迎进入 Notebook  

这里你可以编写代码，文档  

### 关于文件目录  


**project**：project 目录是本项目的工作空间，可以把将项目运行有关的所有文件放在这里，目录中文件的增、删、改操作都会被保留  


**input**：input 目录是数据集的挂载位置，所有挂载进项目的数据集都在这里，未挂载数据集时 input 目录被隐藏  


**temp**：temp 目录是临时磁盘空间，训练或分析过程中产生的不必要文件可以存放在这里，目录中的文件不会保存  


In [19]:
# 查看个人持久化工作区文件
!ls /home/mw/project/

outputs


In [20]:
# 查看当前挂载的数据集目录
!ls /home/mw/input/

folder1003


In [21]:
# Cell1 环境与依赖
"""
统一设置并行线程数与临时目录，避免小机器/在线环境爆线程或 /tmp 爆掉。
定义一个小工具函数 ensure()：检测→若缺失则静默安装→再导入依赖包，保证运行环境自
洽。
载入建模所需的科学计算栈与 scikit-learn 模块（含 Pipeline/预处理/线性模型/KMeans/网格
搜索等）。
（可选）导入文本特征用到的 TF-IDF 与 SVD。
关闭警告以净化日志，并定义一个简单的 log() 打印函数（带时间戳，立刻刷新）。
"""
# ================== Cell1 环境与依赖 ==================
import os, sys, time, json, warnings, re, importlib, subprocess
from dataclasses import dataclass
from typing import List, Tuple, Dict, Optional, Set

# 线程与临时目录
os.environ.setdefault("OPENBLAS_NUM_THREADS", "1")
os.environ.setdefault("OMP_NUM_THREADS", "1")
os.environ.setdefault("MKL_NUM_THREADS", "1")
os.environ.setdefault("NUMEXPR_NUM_THREADS", "1")
os.makedirs("/home/mw/project/outputs/joblib_tmp", exist_ok=True)
os.environ.setdefault("JOBLIB_TEMP_FOLDER", "/home/mw/project/outputs/joblib_tmp")

# 依赖自检与按需安装
def ensure(pkg, pip_name=None, version=None, optional=False):
    name = pip_name or pkg
    try:
        importlib.import_module(pkg)
        print(f"[deps] {pkg} OK")
    except Exception as e:
        if optional:
            print(f"[deps] {pkg} optional: {e}")
            return
        nv = f"{name}=={version}" if version else name
        print(f"[deps] installing {nv} ...")
        subprocess.check_call([sys.executable, "-m", "pip", "install", "-q", nv])
        importlib.import_module(pkg)
        print(f"[deps] {pkg} installed.")

ensure("numpy"); ensure("pandas"); ensure("joblib")
ensure("sklearn", pip_name="scikit-learn")
ensure("scipy")

import numpy as np
import pandas as pd
from joblib import dump
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.model_selection import train_test_split, KFold, GridSearchCV, cross_val_score
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler, FunctionTransformer, MaxAbsScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression, Ridge, ElasticNet
from sklearn.metrics import mean_absolute_error, make_scorer
from sklearn.cluster import KMeans
from sklearn.metrics import pairwise_distances

# 文本模块
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD

warnings.filterwarnings("ignore")
log = lambda m: print(time.strftime("[%H:%M:%S]"), m, flush=True)


[deps] numpy OK
[deps] pandas OK
[deps] joblib OK
[deps] sklearn OK
[deps] scipy OK


In [22]:
# Cell2 基础配置
@dataclass
class CFG:
    TRACK: str = "price"  # "price" or "rent"
    TARGET: str = "Price"
    TRAIN_PATH: str = ""
    TEST_PATH: str = ""
    TEST_SIZE: float = 0.2
    SEEDS: Tuple[int, ...] = (111, 222, 333)

    # 目标与裁剪
    USE_LOG_TARGET: bool = True
    WINSOR_LOWER: float = 0.01
    WINSOR_UPPER: float = 0.99

    # 类别/TE
    RARE_MIN_COUNT: int = 100
    HIGH_CARD_THRESH: int = 20
    TE_SMOOTH: int = 90

    # 地理聚类
    GEO_K: int = 80
    GEO_DIST_TOPK: int = 2

    # 文本（默认开）
    USE_TEXT_FEATURES: bool = True
    TEXT_COLS: Tuple[str, ...] = ("房屋优势","核心卖点","户型介绍","周边配套","交通出行","客户反馈")
    TEXT_SVD_N: int = 5
    TEXT_MAX_FEATURES: int = 10000
    TEXT_MIN_DF: int = 3

    # 交互
    ENABLE_INTERACTION_TE_AGE: bool = True
    ENABLE_INTERACTION_DIST_AREA: bool = True

    # 数值特征筛选（log域）
    FS_BY_CORR: bool = True
    MIN_ABS_CORR_Y: float = 0.02
    FS_BY_CORR_PAIR: bool = True
    MAX_ABS_CORR_FEAT: float = 0.999

    OUT_DIR: str = "/home/mw/project/outputs"
    SAVE_MODEL_PATH: str = ""
    PREDICT_CSV: str = ""
    METRICS_JSON: str = ""

def set_paths(cfg: CFG) -> CFG:
    if cfg.TRACK == "rent":
        cfg.TRAIN_PATH = "/home/mw/input/folder1003/ruc_Class25Q2_train_rent.csv"
        cfg.TEST_PATH  = "/home/mw/input/folder1003/ruc_Class25Q2_test_rent.csv"
    else:
        cfg.TRAIN_PATH = "/home/mw/input/folder1003/ruc_Class25Q2_train_price.csv"
        cfg.TEST_PATH  = "/home/mw/input/folder1003/ruc_Class25Q2_test_price.csv"
    os.makedirs(cfg.OUT_DIR, exist_ok=True)
    s = f"_{cfg.TRACK}"
    cfg.SAVE_MODEL_PATH = os.path.join(cfg.OUT_DIR, f"best_model{s}.joblib")
    cfg.PREDICT_CSV     = os.path.join(cfg.OUT_DIR, f"predictions{s}.csv")
    cfg.METRICS_JSON    = os.path.join(cfg.OUT_DIR, f"metrics{s}.json")
    print("[paths] OUT_DIR =", cfg.OUT_DIR)
    return cfg

In [23]:
# Cell3 工具
_num_pat = re.compile(r"[-+]?\d*\.?\d+")

def extract_numeric(s):
    if pd.isna(s): return np.nan
    m = _num_pat.findall(str(s))
    return float(m[0]) if m else np.nan

def winsorize_y(y, lower_q=0.01, upper_q=0.99):
    lo, hi = np.quantile(y, [lower_q, upper_q])
    return np.clip(y, lo, hi)

def _to_target(y, use_log):
    return np.log1p(y) if use_log else y

def _from_target(y, use_log):
    return np.expm1(y) if use_log else y

def numeric_postprocess(df: pd.DataFrame):
    # 物业费/面积
    if {"物 业 费_num","面积_num"}.issubset(df.columns):
        a = pd.to_numeric(df["面积_num"], errors="coerce").replace(0, np.nan)
        fee = pd.to_numeric(df["物 业 费_num"], errors="coerce")
        df["propfee_per_m2"] = (fee / a).replace([np.inf, -np.inf], np.nan)
    # 房龄
    if "年份" in df.columns:
        now_y = pd.Timestamp.now().year
        yrs = pd.to_numeric(df["年份"], errors="coerce")
        df["house_age"] = (now_y - yrs).clip(lower=0)
    return df

def load_raw(cfg: CFG):
    log("[1/10] 读取数据")
    tr = pd.read_csv(cfg.TRAIN_PATH, low_memory=False)
    te = pd.read_csv(cfg.TEST_PATH, low_memory=False)
    # 强转为数值
    for c in ["lon","lat","年份","容 积 率","停车位"]:
        if c in tr.columns: tr[c] = pd.to_numeric(tr[c], errors="coerce")
        if c in te.columns: te[c] = pd.to_numeric(te[c], errors="coerce")
    # 抽取字符串里的数值
    for c in ["建筑面积","套内面积","物 业 费","绿 化 率","面积","停车费用"]:
        if c in tr.columns: tr[c+"_num"] = tr[c].apply(extract_numeric)
        if c in te.columns: te[c+"_num"] = te[c].apply(extract_numeric)
    tr = numeric_postprocess(tr); te = numeric_postprocess(te)
    print(f"[train] shape={tr.shape}, dtypes summary: {tr.dtypes.value_counts().to_dict()}")
    print(f"[test]  shape={te.shape}, dtypes summary: {te.dtypes.value_counts().to_dict()}")
    return tr, te


# ================== 常用派生封装 ==================
def apply_common_feature_enrichment(df: pd.DataFrame) -> pd.DataFrame:
    df = parse_layout(df, "房屋户型")
    df = add_orientation_features(df, "房屋朝向")
    df = add_time_features(df, "交易时间", "上次交易")
    # 楼层解析（支持关键词+总层数）
    for floor_col in ["所在楼层", "楼层", "楼层情况"]:
        if floor_col in df.columns:
            df = add_floor_features_simple(df, col=floor_col)
            break
    return df

# ================== 户型 & 朝向 ==================
def parse_layout(df: pd.DataFrame, col="房屋户型"):
    if col not in df.columns: return df
    s = df[col].astype(str).fillna("")
    beds, halls, baths = [], [], []
    pats_bed = [r"(\d+)\s*(?:室|房|卧)"]
    pats_hall = [r"(\d+)\s*(?:厅)"]
    pats_bath = [r"(\d+)\s*(?:卫|厕|bath)"]
    for s_i in s:
        def _get(pats):
            for p in pats:
                m = re.search(p, s_i)
                if m: return int(m.group(1))
            return np.nan
        beds.append(_get(pats_bed)); halls.append(_get(pats_hall)); baths.append(_get(pats_bath))
    df["bedrooms"] = pd.to_numeric(beds, errors="coerce")
    df["livingrooms"] = pd.to_numeric(halls, errors="coerce")
    df["bathrooms"] = pd.to_numeric(baths, errors="coerce")
    return df

DIR2ANG = {"东":0,"东北":45,"北":90,"西北":135,"西":180,"西南":225,"南":270,"东南":315}

def orientation_sincos(text: str):
    if pd.isna(text) or not str(text).strip():
        return (np.nan, np.nan)
    t = str(text); hits=[]
    for k in ["东北","西北","西南","东南"]:
        if k in t: hits.append(DIR2ANG[k])
    for k in ["东","南","西","北"]:
        if k in t: hits.append(DIR2ANG[k])
    if not hits:
        if "南北" in t: hits = [DIR2ANG["南"], DIR2ANG["北"]]
        elif "东西" in t: hits = [DIR2ANG["东"], DIR2ANG["西"]]
        else: return (np.nan, np.nan)
    angs = np.deg2rad(np.array(hits, dtype=float))
    return (np.sin(angs).mean(), np.cos(angs).mean())

def add_orientation_features(df: pd.DataFrame, col="房屋朝向"):
    if col not in df.columns: return df
    sin_list, cos_list = [], []
    for v in df[col].astype(str).fillna(""):
        s, c = orientation_sincos(v)
        sin_list.append(s); cos_list.append(c)
    df["ori_sin"] = pd.to_numeric(sin_list, errors="coerce")
    df["ori_cos"] = pd.to_numeric(cos_list, errors="coerce")
    return df

# ================== 时间差 ==================
def add_time_features(df: pd.DataFrame, trade_col="交易时间", last_trade_col="上次交易"):
    today = pd.Timestamp.today().normalize()
    if trade_col in df.columns:
        dt = pd.to_datetime(df[trade_col], errors="coerce")
        df["days_since_trade"] = (today - dt).dt.days.astype("float")
        df["trade_month"] = dt.dt.month.astype("float")
    if last_trade_col in df.columns and "days_since_trade" in df.columns:
        dt2 = pd.to_datetime(df[last_trade_col], errors="coerce")
        df["days_since_last_trade"] = (today - dt2).dt.days.astype("float")
        df["interval_trade_last"] = (pd.to_numeric(df["days_since_last_trade"], errors="coerce")
                                     - pd.to_numeric(df["days_since_trade"], errors="coerce"))
    return df

# ================== 楼层解析（简化版） ==================
_re_total_simple = re.compile(r"共\s*(\d+)\s*层")

def _parse_total_simple(text: str):
    if not isinstance(text, str): return np.nan
    m = _re_total_simple.search(text)
    if not m: return np.nan
    tot = int(m.group(1))
    return tot if tot > 0 else np.nan

def _estimate_level_by_keyword(text: str, total: float):
    t = str(text)
    if '地下室' in t: return 0
    if '底层' in t: return 1
    if '顶层' in t and pd.notna(total): return int(total)
    if pd.notna(total):
        if '低楼层' in t: return max(1, int(round(0.2 * total)))
        if '中楼层' in t: return max(1, int(round(0.5 * total)))
        if '高楼层' in t: return max(1, int(round(0.8 * total)))
    return np.nan

def add_floor_features_simple(df: pd.DataFrame, col="所在楼层"):
    if col not in df.columns: return df
    s = df[col].astype(str)
    total = s.apply(_parse_total_simple)
    level = [_estimate_level_by_keyword(txt, tot) for txt, tot in zip(s, total)]
    level = pd.to_numeric(level, errors="coerce")
    total = pd.to_numeric(total, errors="coerce")
    level = np.where(np.isnan(level) | np.isnan(total), level, np.minimum(level, total))
    is_basement = s.str.contains("地下室", regex=False)
    is_top = (s.str.contains("顶层", regex=False)) | ((pd.Series(level)==pd.Series(total)) & pd.notna(level) & pd.notna(total))
    df["floor_total"] = total
    df["floor_level"] = level
    df["is_top_floor"] = is_top.astype(int)
    df["is_basement"] = is_basement.astype(int)
    return df


In [24]:
# Cell4

# ================== 分段线性 ==================
def add_piecewise(df: pd.DataFrame, col: str, cuts: List[float]):
    if col not in df.columns: return
    x = pd.to_numeric(df[col], errors="coerce")
    for t in cuts:
        df[f"{col}_gt_{t}"] = np.clip(x - t, 0, None)

def add_piecewise_map(df_list, piecewise_map: Dict[str, List[float]]):
    for df in df_list:
        for col, cuts in (piecewise_map or {}).items():
            if col in df.columns:
                add_piecewise(df, col, cuts)

# ================== 地理 KMeans ==================
def fit_geo_kmeans(full_train_df: pd.DataFrame, n_clusters=80, seed=111):
    if not {"lon","lat"}.issubset(full_train_df.columns): return None
    good = full_train_df[["lon","lat"]].dropna()
    if len(good) < max(2, n_clusters): return None
    km = KMeans(n_clusters=n_clusters, random_state=seed, n_init=10)
    km.fit(good.values)
    return km

def assign_geo_cluster(df: pd.DataFrame, km: Optional[KMeans]):
    if km is None or not {"lon","lat"}.issubset(df.columns): return df
    filler = df[["lon","lat"]].mean()
    lab = km.predict(df[["lon","lat"]].fillna(filler).values)
    df["geo_cluster"] = lab.astype("int64")
    return df

def add_geo_center_dists(df: pd.DataFrame, km: Optional[KMeans], topk=2):
    if km is None or not {"lon","lat"}.issubset(df.columns): return df
    centers = km.cluster_centers_
    pts = df[["lon","lat"]].fillna(df[["lon","lat"]].mean()).values
    d = pairwise_distances(pts, centers, metric="euclidean")
    ktop = min(topk, d.shape[1])
    idx = np.argpartition(d, kth=list(range(ktop)), axis=1)[:, :ktop]
    for k in range(idx.shape[1]):
        df[f"geo_dist_{k+1}"] = d[np.arange(len(d)), idx[:,k]]
    return df

# ================== 组内归一 & 中心化 ==================
def _clip_series(s: pd.Series, lo=None, hi=None):
    if lo is not None or hi is not None:
        s = s.clip(lower=lo, upper=hi)
    s = s.replace([np.inf, -np.inf], np.nan)
    return s.astype(float)

def add_group_norm(df_list: List[pd.DataFrame], base_df: pd.DataFrame, group_cols: List[str], num_cols: List[str], min_group_size:int=20):
    RATIO_CLIP = 30.0
    Z_CLIP = 15.0
    EPS_M = 1e-6; EPS_A = 1e-9
    for gcol in group_cols:
        if gcol not in base_df.columns: continue
        gsize = base_df.groupby(gcol).size()
        valid_groups = set(gsize[gsize >= min_group_size].index)
        if not valid_groups: continue
        g = base_df[base_df[gcol].isin(valid_groups)].groupby(gcol)
        for c in num_cols:
            if c not in base_df.columns: continue
            med = g[c].median()
            mad = g[c].apply(lambda x: np.nanmedian(np.abs(x - np.nanmedian(x))) + EPS_A)
            for df in df_list:
                if c in df.columns:
                    m = df[gcol].map(med)
                    a = df[gcol].map(mad)
                    m = m.where(df[gcol].isin(valid_groups), np.nan)
                    a = a.where(df[gcol].isin(valid_groups), np.nan)
                    denom_m = m.where(m.abs() >= EPS_M, np.nan)
                    ratio = df[c] / denom_m
                    z = (df[c] - m) / a
                    df[f"{c}_gmed_{gcol}"] = _clip_series(ratio, -RATIO_CLIP, RATIO_CLIP)
                    df[f"{c}_gz_{gcol}"] = _clip_series(z, -Z_CLIP, Z_CLIP)

def add_cat_num_centered(df_list: List[pd.DataFrame], base_df: pd.DataFrame, pairs: List[Tuple[str,str]], min_group_size:int=20):
    for cat, num in pairs:
        if cat not in base_df.columns or num not in base_df.columns: continue
        gsize = base_df.groupby(cat).size()
        valid = set(gsize[gsize >= min_group_size].index)
        means = base_df[base_df[cat].isin(valid)].groupby(cat)[num].mean()
        for df in df_list:
            if (cat in df.columns) and (num in df.columns):
                m = df[cat].map(means)
                df[f"{num}_by_{cat}_centered"] = (df[num] - m).astype(float)



In [25]:
# Cell5

# ================== 目标编码（含分层） ==================
def add_target_encoding_triplet(X_tr: pd.DataFrame, y_tr: np.ndarray, X_val: pd.DataFrame, X_te: pd.DataFrame, cols: List[str], n_splits=5, smooth=90, seed=111):
    X_tr = X_tr.copy(); X_val = X_val.copy(); X_te = X_te.copy()
    global_mean = float(np.nanmean(y_tr))
    kf = KFold(n_splits=max(2, n_splits), shuffle=True, random_state=seed)
    for col in cols:
        if col not in X_tr.columns: continue
        oof = pd.Series(index=X_tr.index, dtype=float)
        for tr_idx, vd_idx in kf.split(X_tr):
            tr_fold = X_tr.iloc[tr_idx]; y_fold = y_tr[tr_idx]
            dfm = pd.DataFrame({col: tr_fold[col].values, "y": y_fold})
            means = dfm.groupby(col)["y"].mean()
            cnts  = tr_fold[col].value_counts()
            sm = ((means * cnts) + global_mean * smooth) / (cnts + smooth)
            oof.iloc[vd_idx] = X_tr[col].iloc[vd_idx].map(sm).fillna(global_mean).values
        df_full = pd.DataFrame({col: X_tr[col].values, "y": y_tr})
        means_full = df_full.groupby(col)["y"].mean()
        cnts_full  = X_tr[col].value_counts()
        sm_full = ((means_full * cnts_full) + global_mean * smooth) / (cnts_full + smooth)
        X_tr[f"{col}__te"]  = oof.values
        X_val[f"{col}__te"] = X_val[col].map(sm_full).fillna(global_mean).values
        X_te[f"{col}__te"]  = X_te[col].map(sm_full).fillna(global_mean).values
    return X_tr, X_val, X_te

def _mode_parent_map(df: pd.DataFrame, child: str, parent: str) -> pd.Series:
    grp = df.groupby(child)[parent].agg(lambda s: s.mode().iat[0] if len(s.mode()) else s.iloc[0])
    return grp

def add_hier_target_encoding_triplet(X_tr: pd.DataFrame, y_tr: np.ndarray, X_val: pd.DataFrame, X_te: pd.DataFrame, pairs: List[Tuple[str,str]], smooth=90, seed=111):
    X_tr = X_tr.copy(); X_val = X_val.copy(); X_te = X_te.copy()
    global_mean = float(np.nanmean(y_tr))
    kf = KFold(n_splits=5, shuffle=True, random_state=seed)
    for child, parent in pairs:
        if child not in X_tr.columns or parent not in X_tr.columns: continue
        # OOF 编码
        oof = pd.Series(index=X_tr.index, dtype=float)
        for tr_idx, vd_idx in kf.split(X_tr):
            tr_fold = X_tr.iloc[tr_idx]; y_fold = y_tr[tr_idx]
            pm = _mode_parent_map(tr_fold[[child, parent]], child, parent)  # child -> parent
            prior_parent_mean = pd.DataFrame({parent: tr_fold[parent], "y": y_fold}).groupby(parent)["y"].mean()
            prior_by_child = pm.map(prior_parent_mean).fillna(global_mean)  # child -> prior
            dfm = pd.DataFrame({child: tr_fold[child], parent: tr_fold[parent], "y": y_fold})
            child_means = dfm.groupby(child)["y"].mean()
            child_cnts  = tr_fold[child].value_counts()
            prior_idxed = prior_by_child.reindex(child_means.index).fillna(global_mean)
            sm = ((child_means * child_cnts) + prior_idxed * smooth) / (child_cnts + smooth)
            oof.iloc[vd_idx] = X_tr[child].iloc[vd_idx].map(sm).fillna(global_mean).values
        # 全量映射
        pm_full = _mode_parent_map(X_tr[[child, parent]], child, parent)
        prior_parent_full = pd.DataFrame({parent: X_tr[parent], "y": y_tr}).groupby(parent)["y"].mean()
        prior_by_child_full = pm_full.map(prior_parent_full).fillna(global_mean)
        df_full = pd.DataFrame({child: X_tr[child], "y": y_tr})
        means_full = df_full.groupby(child)["y"].mean()
        cnts_full  = X_tr[child].value_counts()
        prior_full_idxed = prior_by_child_full.reindex(means_full.index).fillna(global_mean)
        sm_full = ((means_full * cnts_full) + prior_full_idxed * smooth) / (cnts_full + smooth)
        X_tr[f"{child}__te"]  = oof.values
        X_val[f"{child}__te"] = X_val[child].map(sm_full).fillna(global_mean).values
        X_te[f"{child}__te"]  = X_te[child].map(sm_full).fillna(global_mean).values
        log(f"[feat] hier-TE added: {child} (parent={parent})")
    return X_tr, X_val, X_te

# ================== 文本小容量特征 ==================
def build_text_svd_features(X_tr: pd.DataFrame, X_val: pd.DataFrame, X_te: pd.DataFrame, cfg: CFG):
    cols = [c for c in cfg.TEXT_COLS if c in X_tr.columns]
    if not cols: return X_tr, X_val, X_te

    def cat_text(df):
        arr = []
        for c in cols:
            s = df[c].astype(str).fillna("")
            arr.append(s)
        return [" ".join(x) for x in zip(*arr)]

    vect = TfidfVectorizer(ngram_range=(1,2), max_features=cfg.TEXT_MAX_FEATURES, min_df=cfg.TEXT_MIN_DF)
    tr_docs  = cat_text(X_tr);  val_docs = cat_text(X_val);  te_docs  = cat_text(X_te)
    Xtr_txt  = vect.fit_transform(tr_docs)
    n_feat   = Xtr_txt.shape[1]
    if n_feat == 0:
        # 回退
        X_tr["txt_len"] = pd.Series([len(d) for d in tr_docs], index=X_tr.index).astype(float)
        X_tr["txt_wc"]  = pd.Series([len(d.split()) for d in tr_docs], index=X_tr.index).astype(float)
        X_val["txt_len"]= pd.Series([len(d) for d in val_docs], index=X_val.index).astype(float)
        X_val["txt_wc"] = pd.Series([len(d.split()) for d in val_docs], index=X_val.index).astype(float)
        X_te["txt_len"] = pd.Series([len(d) for d in te_docs], index=X_te.index).astype(float)
        X_te["txt_wc"]  = pd.Series([len(d.split()) for d in te_docs], index=X_te.index).astype(float)
        log("[text] fallback -> added txt_len/txt_wc (no vocab)")
        return X_tr, X_val, X_te

    if n_feat == 1:
        X_tr["txt_tf1"]  = np.asarray(Xtr_txt.todense()).ravel().astype(float)
        X_val["txt_tf1"] = np.asarray(vect.transform(val_docs).todense()).ravel().astype(float)
        X_te["txt_tf1"]  = np.asarray(vect.transform(te_docs).todense()).ravel().astype(float)
        log("[text] fallback -> single TF-IDF feature (n_feat=1)")
        return X_tr, X_val, X_te

    n_comp = min(max(1, cfg.TEXT_SVD_N), n_feat - 1)
    svd = TruncatedSVD(n_components=n_comp, random_state=0)
    Xval_txt = vect.transform(val_docs)
    Xte_txt  = vect.transform(te_docs)
    Xtr_svd  = svd.fit_transform(Xtr_txt)
    Xval_svd = svd.transform(Xval_txt)
    Xte_svd  = svd.transform(Xte_txt)
    for i in range(n_comp):
        X_tr[f"txt_svd_{i+1}"]  = Xtr_svd[:, i].astype(float)
        X_val[f"txt_svd_{i+1}"] = Xval_svd[:, i].astype(float)
        X_te[f"txt_svd_{i+1}"]  = Xte_svd[:, i].astype(float)
    log(f"[text] added {n_comp} SVD dims from {n_feat} tfidf feats")
    return X_tr, X_val, X_te

# ================== 数值筛选 / 预处理 ==================
def corr_with_target(df: pd.DataFrame, y: np.ndarray, num_cols: List[str]) -> pd.Series:
    tmp = df[num_cols].copy(); tmp["_y"] = y
    return tmp.corr(numeric_only=True)["_y"].drop(index="_y")

def drop_low_target_corr(num_cols: List[str], corr_s: pd.Series, min_abs_corr=0.02) -> List[str]:
    keep = [c for c in num_cols if abs(corr_s.get(c, 0.0)) >= min_abs_corr]
    return keep if keep else num_cols

def drop_high_pair_corr(df: pd.DataFrame, cols: List[str], max_abs_corr=0.999) -> List[str]:
    if len(cols) <= 1: return cols
    cmat = df[cols].corr(numeric_only=True).abs()
    upper = cmat.where(np.triu(np.ones(cmat.shape), k=1).astype(bool))
    drop = set()
    for c in upper.columns:
        highs = [r for r in upper.index if (upper.loc[r, c] >= max_abs_corr) and (r not in drop)]
        if highs:
            drop.add(c)
    return [c for c in cols if c not in drop]

def numeric_fs_by_corr_pair(df_tr: pd.DataFrame, y_tr_log: np.ndarray, cfg: CFG) -> List[str]:
    num_cols = [c for c in df_tr.columns if pd.api.types.is_numeric_dtype(df_tr[c])]
    if not num_cols: return []
    log("[3/10] 数值特征筛选（log-y）")
    corr_s = corr_with_target(df_tr, y_tr_log, num_cols)
    keep = drop_low_target_corr(num_cols, corr_s, min_abs_corr=cfg.MIN_ABS_CORR_Y) if cfg.FS_BY_CORR else num_cols
    keep2 = drop_high_pair_corr(df_tr, keep, max_abs_corr=cfg.MAX_ABS_CORR_FEAT) if cfg.FS_BY_CORR_PAIR else keep
    log(f" kept: {len(keep2)} / {len(num_cols)}")
    return keep2



In [26]:
# Cell6
# --- to-string 变换器与 OHE 封装 ---
def _to_str_df(X):
    if hasattr(X, "astype"):
        try: return X.astype(str)
        except Exception: pass
    return pd.DataFrame(X).astype(str)

def _make_tostr_transformer():
    try:
        return FunctionTransformer(_to_str_df, validate=False, feature_names_out="one-to-one")
    except TypeError:
        return FunctionTransformer(_to_str_df, validate=False)

def _make_ohe(drop="if_binary"):
    try:
        return OneHotEncoder(handle_unknown="ignore", drop=drop, sparse_output=True)
    except TypeError:
        return OneHotEncoder(handle_unknown="ignore", drop=drop, sparse=True)

def build_preprocessor(df_tr, y_tr_log, cfg, num_keep, ohe_cols) -> ColumnTransformer:
    log("[4/10] 构建预处理器（稀疏友好 & 可并行pickle）")
    num_pipe = Pipeline([
        ("imp", SimpleImputer(strategy="median")),
        ("scaler", StandardScaler(with_mean=False)),
    ])
    tostr = _make_tostr_transformer()
    cat_pipe = Pipeline([
        ("imp", SimpleImputer(strategy="constant", fill_value="UNK")),
        ("tostr", tostr),
        ("ohe", _make_ohe("if_binary")),
    ])
    transformers = []
    if num_keep: transformers.append(("num", num_pipe, num_keep))
    if ohe_cols: transformers.append(("cat", cat_pipe, ohe_cols))
    pre = ColumnTransformer(
        transformers=transformers,
        remainder="drop",
        sparse_threshold=0.3,
        verbose_feature_names_out=False
    )
    log(f" blocks: num={len(num_keep)}, cats={len(ohe_cols)}")
    return pre

# ================== 类别列拆分 ==================
def split_categoricals_for_encoding(train_df: pd.DataFrame, high_card_thresh=20) -> Tuple[List[str], List[str]]:
    cats = [c for c in train_df.columns if train_df[c].dtype == "object"]
    exclude = set([
        "房屋优势","核心卖点","户型介绍","周边配套","交通出行","客户反馈",
        "物业办公电话","coord_x","coord_y","交易时间","上次交易"
    ])
    num_mapped = {c[:-4] for c in train_df.columns if c.endswith("_num")}
    exclude |= num_mapped
    cats = [c for c in cats if c not in exclude]
    high, low = [], []
    for c in cats:
        k = train_df[c].nunique(dropna=False)
        (high if k > high_card_thresh else low).append(c)
    for c in ["geo_cluster","环线位置","区县","环线"]:
        if (c in cats) and (c not in high):
            high.append(c)
        if c in low:
            low.remove(c)
    return high, low

# ================== winsorize 单列（按 train 分位） ==================
def winsorize_triplet_by_train_quantile(X_tr, X_val, X_te, col, lower=0.01, upper=0.99):
    if col not in X_tr.columns: return X_tr, X_val, X_te
    s = pd.to_numeric(X_tr[col], errors="coerce")
    lo, hi = s.quantile([lower, upper])
    for df in (X_tr, X_val, X_te):
        if col in df.columns:
            df[col] = pd.to_numeric(df[col], errors="coerce").clip(lower=lo, upper=hi)
    return X_tr, X_val, X_te
# ================== 轻量交互 ==================
def add_interactions(df_list: List[pd.DataFrame], cfg: CFG):
    for df in df_list:
        if cfg.ENABLE_INTERACTION_TE_AGE and ("环线位置__te" in df.columns) and ("house_age" in df.columns):
            df["ix_teXage"] = pd.to_numeric(df["环线位置__te"], errors="coerce") * pd.to_numeric(df["house_age"], errors="coerce")
        if cfg.ENABLE_INTERACTION_DIST_AREA and ("geo_dist_1" in df.columns) and ("面积_num" in df.columns):
            df["ix_distXarea"] = pd.to_numeric(df["geo_dist_1"], errors="coerce") * pd.to_numeric(df["面积_num"], errors="coerce")

# ================== 罕见类别合并（兜底实现） ==================
def build_rare_maps(df: pd.DataFrame, min_cnt: int = 100) -> Dict[str, Set[str]]:
    rare_map = {}
    for c in df.columns:
        if df[c].dtype == "object":
            vc = df[c].value_counts(dropna=False)
            rares = set(vc[vc < min_cnt].index.astype(str))
            if rares:
                rare_map[c] = rares
    return rare_map

def apply_rare_maps(df: pd.DataFrame, rare_map: Dict[str, Set[str]]) -> pd.DataFrame:
    if not rare_map: return df
    df = df.copy()
    for c, rares in rare_map.items():
        if c in df.columns:
            s = df[c].astype(str)
            df[c] = np.where(s.isin(rares), "OTHER", s)
    return df

# ================== 6折交叉验证（MAE） ==================
def compute_cv6_mae_log(estimator, X, y_log, seed=111):
    """
    log 空间 MAE（neg_mean_absolute_error），仅用于诊断/参考。
    """
    cv = KFold(n_splits=6, shuffle=True, random_state=seed)
    scores = cross_val_score(
        estimator, X, y_log,
        cv=cv,
        scoring="neg_mean_absolute_error",
        n_jobs=-1,
        error_score="raise"
    )
    return float(-scores.mean())

def compute_cv6_mae_orig(estimator, X, y_log, use_log=True, seed=111):
    """
    原始单位（价格）的 MAE：将 y_log / y_pred_log 反变换后计算 MAE。
    """
    def _mae_orig(y_true_log, y_pred_log):
        y_true = np.expm1(y_true_log) if use_log else y_true_log
        y_pred = np.expm1(y_pred_log) if use_log else y_pred_log
        y_med  = float(np.nanmedian(y_true))
        y_pred = np.where(np.isfinite(y_pred), y_pred, y_med)
        return mean_absolute_error(y_true, y_pred)
    scorer = make_scorer(_mae_orig, greater_is_better=False)
    cv = KFold(n_splits=6, shuffle=True, random_state=seed)
    scores = cross_val_score(estimator, X, y_log, cv=cv, scoring=scorer, n_jobs=-1, error_score="raise")
    return float(-scores.mean())

In [27]:
# Cell7
# ================== 单 seed 训练 ==================
def run_one_seed(cfg: CFG, seed: int):
    # 读取与基础加工
    tr, te = load_raw(cfg)
    rare_maps = build_rare_maps(tr, min_cnt=cfg.RARE_MIN_COUNT)
    tr = apply_rare_maps(tr, rare_maps); te = apply_rare_maps(te, rare_maps)
    tr = apply_common_feature_enrichment(tr); te = apply_common_feature_enrichment(te)

    km = fit_geo_kmeans(tr, n_clusters=cfg.GEO_K, seed=seed)
    if km is not None:
        log(f"[feat] geo KMeans fitted (k={cfg.GEO_K})")

    # 目标与切分
    y_raw = pd.to_numeric(tr[cfg.TARGET], errors="coerce").values
    y_raw = winsorize_y(y_raw, cfg.WINSOR_LOWER, cfg.WINSOR_UPPER)
    y_t   = _to_target(y_raw, cfg.USE_LOG_TARGET)
    X     = tr.drop(columns=[cfg.TARGET])
    X_te  = te.drop(columns=[cfg.TARGET], errors="ignore")

    log("[2/10] 训练/验证 80/20 划分")
    X_tr, X_val, y_tr, y_val = train_test_split(X, y_t, test_size=cfg.TEST_SIZE, random_state=seed)
    y_tr_orig = _from_target(y_tr, cfg.USE_LOG_TARGET)
    y_val_orig = _from_target(y_val, cfg.USE_LOG_TARGET)

    # 地理特征
    for df in (X_tr, X_val, X_te):
        assign_geo_cluster(df, km)
        add_geo_center_dists(df, km, topk=cfg.GEO_DIST_TOPK)

    # 分段、组内归一等增强
    add_piecewise_map([X_tr, X_val, X_te], {"面积_num":[60,90,120,150], "house_age":[10,20,30]})
    grp_cols = [c for c in ["geo_cluster","环线位置","区县"] if c in X_tr.columns]
    num_cols_for_group = [c for c in ["面积_num","建筑面积_num","套内面积_num","house_age","propfee_per_m2","days_since_trade"] if c in X_tr.columns]
    add_group_norm([X_tr, X_val, X_te], X_tr, grp_cols, num_cols_for_group, min_group_size=20)
    pairs = [(c,n) for (c,n) in [("环线位置","面积_num"), ("环线位置","propfee_per_m2")] if (c in X_tr.columns and n in X_tr.columns)]
    add_cat_num_centered([X_tr, X_val, X_te], X_tr, pairs, min_group_size=20)
    X_tr, X_val, X_te = winsorize_triplet_by_train_quantile(X_tr, X_val, X_te, "propfee_per_m2", 0.01, 0.99)

    # 目标编码（层级 + 普通）
    te_high, ohe_low = split_categoricals_for_encoding(X_tr, high_card_thresh=cfg.HIGH_CARD_THRESH)
    hier_pairs = [(c,p) for (c,p) in [("环线位置","环线"), ("区县","城市")] if (c in X_tr.columns and p in X_tr.columns)]
    if hier_pairs:
        X_tr, X_val, X_te = add_hier_target_encoding_triplet(X_tr, y_tr, X_val, X_te, hier_pairs, smooth=cfg.TE_SMOOTH, seed=seed)
    te_cols = [c for c in te_high if f"{c}__te" not in X_tr.columns]
    if te_cols:
        X_tr, X_val, X_te = add_target_encoding_triplet(X_tr, y_tr, X_val, X_te, cols=te_cols, n_splits=5, smooth=cfg.TE_SMOOTH, seed=seed)
    log(f"[feat] simple-TE added: {te_cols}")

    # 文本特征（若开启）
    if cfg.USE_TEXT_FEATURES:
        X_tr, X_val, X_te = build_text_svd_features(X_tr, X_val, X_te, cfg)

    # 轻交互 & 数值列筛选
    add_interactions([X_tr, X_val, X_te], cfg)
    num_keep = numeric_fs_by_corr_pair(X_tr, y_tr, cfg)
    ohe_cols = [c for c in ohe_low if c in X_tr.columns]

    # 预处理器
    preproc = build_preprocessor(X_tr, y_tr, cfg, num_keep, ohe_cols)

    # 模型 & 超参（L1/EN 范围下探）
    models = {
        "OLS": LinearRegression(),
        "Ridge": Ridge(),
        # 用 EN 退化为 Lasso（l1_ratio=1.0），统一接口
        "Lasso": ElasticNet(alpha=1e-5, l1_ratio=1.0, max_iter=8000, tol=1e-3, selection="cyclic", random_state=seed),
        "ElasticNet": ElasticNet(max_iter=8000, tol=1e-3, selection="cyclic", random_state=seed),
    }
    param_grids = {
        "Ridge": {"model__alpha": [1,10,30,60,120,200,300,450]},
        "Lasso": {"model__alpha": [1e-5, 2e-5, 5e-5, 1e-4, 2e-4, 3e-4, 5e-4, 8e-4, 1e-3],
                  "model__l1_ratio": [1.0]},
        "ElasticNet": {
            "model__alpha": [1e-5, 2e-5, 5e-5, 1e-4, 2e-4, 3e-4, 5e-4, 8e-4, 1e-3],
            "model__l1_ratio": [0.05, 0.1, 0.2, 0.35, 0.5],
        },
    }

    results, best_name, best_pipe, best_out_mae = {}, None, None, np.inf
    log_lo, log_hi = np.nanquantile(y_tr, [0.01, 0.99])
    y_med = float(np.nanmedian(y_tr_orig))
    log("[5/10] 训练与评估各模型")
    for name, mdl in models.items():
        log(f" -> {name}")
        base_pipe = Pipeline([
            ("pre", preproc),
            ("post_scale", MaxAbsScaler()),
            ("model", mdl)
        ])

        tuned = None
        pipe = None
        pipe_for_cv = None

        if name in param_grids:
            try:
                cv_folds = 3 if name in ("Lasso", "ElasticNet") else 5
                gs = GridSearchCV(
                    base_pipe, param_grids[name],
                    scoring="neg_mean_absolute_error",
                    cv=KFold(n_splits=cv_folds, shuffle=True, random_state=seed),
                    n_jobs=-1, refit=True, error_score="raise"
                )
                gs.fit(X_tr, y_tr)
                pipe = gs.best_estimator_
                tuned = gs.best_params_
                pipe_for_cv = gs.best_estimator_
            except Exception as e:
                log(f" [warn] gridsearch failed: {e}")
                pipe = base_pipe
                pipe.fit(X_tr, y_tr)
                pipe_for_cv = base_pipe
        else:
            pipe = base_pipe
            pipe.fit(X_tr, y_tr)
            pipe_for_cv = base_pipe

        # 还原预测并裁剪
        pred_log_in  = pipe.predict(X_tr)
        pred_log_out = pipe.predict(X_val)
        pred_in  = _from_target(np.clip(pred_log_in,  log_lo, log_hi), cfg.USE_LOG_TARGET)
        pred_out = _from_target(np.clip(pred_log_out, log_lo, log_hi), cfg.USE_LOG_TARGET)
        pred_in  = np.where(np.isfinite(pred_in),  pred_in,  y_med)
        pred_out = np.where(np.isfinite(pred_out), pred_out, y_med)

        in_mae  = float(mean_absolute_error(y_tr_orig,  pred_in))
        out_mae = float(mean_absolute_error(y_val_orig, pred_out))

        # cv6：同时给出 log 空间与原空间（价格）MAE
        cv6_mae_log  = compute_cv6_mae_log (pipe_for_cv, X_tr, y_tr, seed=seed)
        cv6_mae_orig = compute_cv6_mae_orig(pipe_for_cv, X_tr, y_tr, use_log=cfg.USE_LOG_TARGET, seed=seed)

        # 日志显示：in/out/cv6（原空间用整数），cv6_log 用小数
        log(f" in {in_mae:.0f} | out {out_mae:.0f} | cv6 {cv6_mae_orig:.0f} | cv6_log {cv6_mae_log:.4f} | tuned={tuned}")
        results[name] = {"in": in_mae, "out": out_mae, "cv6": cv6_mae_orig, "cv6_log": cv6_mae_log, "tuned": tuned}

        if out_mae < best_out_mae:
            best_out_mae = out_mae
            best_name, best_pipe = name, pipe

    log("[6/10] 保存指标(JSON)")
    with open(cfg.METRICS_JSON, "w", encoding="utf-8") as f:
        json.dump(results, f, indent=2, ensure_ascii=False)

    log(f"[7/10] 保存最优模型：{best_name}")
    dump(best_pipe, cfg.SAVE_MODEL_PATH)

    log("[8/10] 预测验证/测试集")
    pred_val_raw  = _from_target(np.clip(best_pipe.predict(X_val), log_lo, log_hi), cfg.USE_LOG_TARGET)
    pred_test_raw = _from_target(np.clip(best_pipe.predict(X_te),  log_lo, log_hi), cfg.USE_LOG_TARGET)
    pred_val_raw  = np.where(np.isfinite(pred_val_raw),  pred_val_raw,  y_med)
    pred_test_raw = np.where(np.isfinite(pred_test_raw), pred_test_raw, y_med)

    te_raw = pd.read_csv(cfg.TEST_PATH, low_memory=False)
    id_col = "ID" if "ID" in te_raw.columns else None
    te_ids = te_raw[id_col].values if id_col else np.arange(len(X_te))
    part = pd.DataFrame({"ID": te_ids, "Price": pred_test_raw})
    return part, results
    
# ================== 跨 seed 汇总 ==================
def _aggregate_results_across_seeds(per_seed_dicts: List[Dict]) -> pd.DataFrame:
    """
    per_seed_dicts: [results_seed1, results_seed2, ...]
    results_seedX: {"OLS":{"in":..,"out":..,"cv6":..}, "Ridge":{...}, ...}
    返回：DataFrame(columns=["model","in","out","cv6"])  # cv6 为原始单位 MAE
    """
    rows = []
    order = ["OLS", "Ridge", "Lasso", "ElasticNet"]
    for m in order:
        ins  = [d[m]["in"]  for d in per_seed_dicts if m in d]
        outs = [d[m]["out"] for d in per_seed_dicts if m in d]
        cv6s = [d[m].get("cv6", np.nan) for d in per_seed_dicts if m in d]
        if ins:
            rows.append({
                "model": m,
                "in":  float(np.nanmean(ins)),
                "out": float(np.nanmean(outs)),
                "cv6": float(np.nanmean(cv6s)),
            })
    return pd.DataFrame(rows)

# ================== 单轨道运行（返回轨道汇总表） ==================
def run_one_track(cfg: CFG):
    cfg = set_paths(cfg)
    preds = []
    seed_weights = []
    all_results = []

    for seed in cfg.SEEDS:
        log(f"[seed] = {seed}")
        part, res_dict = run_one_seed(cfg, seed)
        preds.append(part["Price"].values)
        best_out_mae = min(m["out"] for m in res_dict.values())
        seed_weights.append(1.0 / max(best_out_mae, 1e-9))
        all_results.append(res_dict)

    # 加权融合
    W = np.asarray(seed_weights, dtype=float)
    W = W / W.sum()
    P = np.vstack(preds)
    ens = (P.T @ W).astype(float)

    te = pd.read_csv(cfg.TEST_PATH, low_memory=False)
    id_col = "ID" if "ID" in te.columns else None
    out_df = pd.DataFrame({
        "ID": te[id_col].values if id_col else np.arange(len(te)),
        "Price": ens
    })

    # 轨道汇总表（cv6 为原始单位）
    summary_df = _aggregate_results_across_seeds(all_results)
    summary_df.insert(0, "track", cfg.TRACK)
    summary_path = os.path.join(cfg.OUT_DIR, f"metrics_summary_{cfg.TRACK}.csv")
    summary_df.to_csv(summary_path, index=False)
    log(f"[9/10] 该轨道汇总表已保存: {summary_path}")

    return out_df, cfg.OUT_DIR, summary_df

# ================== 合并提交（格式不变） ==================
def write_combined_csv(price_df: pd.DataFrame, rent_df: pd.DataFrame, out_path: str):
    os.makedirs(os.path.dirname(out_path), exist_ok=True)
    price_df.to_csv(out_path, index=False)                # 第一段 price 带表头
    rent_df.to_csv(out_path, index=False, header=False, mode="a")  # 第二段 rent 无表头
    log(f"[done] 合并预测 -> {out_path}")
# ================== 主流程 ==================
def run_tracks(tracks=("price","rent")):
    preds_by_track = {}
    out_dir_used = None
    all_tables = []

    for t in tracks:
        cfg = CFG(TRACK=t)
        cfg = set_paths(cfg)
        log(f"[10/10] 运行轨道 {t} | cores={os.cpu_count() or 4}")
        part, out_dir, one_table = run_one_track(cfg)
        preds_by_track[t] = part
        all_tables.append(one_table)
        if out_dir_used is None:
            out_dir_used = out_dir

    out_path = os.path.join(out_dir_used, "predictions_combined.csv")
    if ("price" in preds_by_track) and ("rent" in preds_by_track):
        write_combined_csv(preds_by_track["price"], preds_by_track["rent"], out_path)
    else:
        for k, v in preds_by_track.items():
            v.to_csv(os.path.join(out_dir_used, f"predictions_{k}.csv"), index=False)

    final_table = pd.concat(all_tables, ignore_index=True)
    final_csv = os.path.join(out_dir_used, "metrics_summary_all.csv")
    final_table.to_csv(final_csv, index=False)
    print(final_table)
    log(f"[done] OLS/Ridge/Lasso/EN — in/out/cv6(原始单位) 汇总 -> {final_csv}")

if __name__ == "__main__":
    run_tracks(tracks=("price","rent"))


[paths] OUT_DIR = /home/mw/project/outputs
[16:57:50] [10/10] 运行轨道 price | cores=64
[paths] OUT_DIR = /home/mw/project/outputs
[16:57:50] [seed] = 111
[16:57:50] [1/10] 读取数据
[train] shape=(103871, 61), dtypes summary: {dtype('O'): 41, dtype('float64'): 19, dtype('int64'): 1}
[test]  shape=(34017, 61), dtypes summary: {dtype('O'): 41, dtype('float64'): 18, dtype('int64'): 2}
[16:58:02] [feat] geo KMeans fitted (k=80)
[16:58:02] [2/10] 训练/验证 80/20 划分
[16:58:05] [feat] hier-TE added: 环线位置 (parent=环线)
[16:58:05] [feat] hier-TE added: 区县 (parent=城市)
[16:58:11] [feat] simple-TE added: ['房屋户型', '所在楼层', '房屋朝向', '梯户比例', '物业类别', '建筑年代', '开发商', '房屋总数', '楼栋总数', '物业公司', '产权描述', '燃气费', '供热费', '环线']
[16:58:13] [text] added 5 SVD dims from 240 tfidf feats
[16:58:13] [3/10] 数值特征筛选（log-y）
[16:58:16]  kept: 67 / 84
[16:58:16] [4/10] 构建预处理器（稀疏友好 & 可并行pickle）
[16:58:16]  blocks: num=67, cats=12
[16:58:16] [5/10] 训练与评估各模型
[16:58:16]  -> OLS
[16:58:38]  in 464205 | out 453407 | cv6 521330 | cv6_log 0.2136 | 

KeyboardInterrupt: 