## 欢迎进入 Notebook  

这里你可以编写代码，文档  

### 关于文件目录  


**project**：project 目录是本项目的工作空间，可以把将项目运行有关的所有文件放在这里，目录中文件的增、删、改操作都会被保留  


**input**：input 目录是数据集的挂载位置，所有挂载进项目的数据集都在这里，未挂载数据集时 input 目录被隐藏  


**temp**：temp 目录是临时磁盘空间，训练或分析过程中产生的不必要文件可以存放在这里，目录中的文件不会保存  


In [1]:
# 查看个人持久化工作区文件
!ls /home/mw/project/

images


In [2]:
# 查看当前挂载的数据集目录
!ls /home/mw/input/

hackathon255769


In [None]:
# 标准库导入
import os
import warnings
from pathlib import Path

# 第三方基础库
import numpy as np
import pandas as pd
import re

# sklearn 组件
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.feature_selection import SelectKBest, f_regression
from sklearn.impute import SimpleImputer
from sklearn.linear_model import ElasticNetCV, LassoCV, LinearRegression, RidgeCV
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, PolynomialFeatures, StandardScaler

In [1]:
DATA = Path("/home/mw/input/hackathon255769")
TRAIN_RENT  = DATA / "ruc_Class25Q2_train_rent.csv"
TEST_RENT   = DATA / "ruc_Class25Q2_test_rent.csv"
TRAIN_PRICE = DATA / "ruc_Class25Q2_train_price.csv"
TEST_PRICE  = DATA / "ruc_Class25Q2_test_price.csv"

df_train_rent  = pd.read_csv(TRAIN_RENT)
df_test_rent   = pd.read_csv(TEST_RENT)
df_train_price = pd.read_csv(TRAIN_PRICE)
df_test_price  = pd.read_csv(TEST_PRICE)

for name, df in {
    "train_rent": df_train_rent, "test_rent": df_test_rent,
    "train_price": df_train_price, "test_price": df_test_price
}.items():
    print(f"{name} shape={df.shape}")


  df_train_rent  = pd.read_csv(TRAIN_RENT)
  df_train_price = pd.read_csv(TRAIN_PRICE)


train_rent shape=(98899, 46)
test_rent shape=(9773, 46)
train_price shape=(103871, 55)
test_price shape=(34017, 55)


  df_test_price  = pd.read_csv(TEST_PRICE)


In [2]:
!pip install sklearn

Collecting sklearn
  Using cached sklearn-0.0.post12.tar.gz (2.6 kB)
  Preparing metadata (setup.py) ... [?25lerror
  [1;31merror[0m: [1msubprocess-exited-with-error[0m
  
  [31m×[0m [32mpython setup.py egg_info[0m did not run successfully.
  [31m│[0m exit code: [1;36m1[0m
  [31m╰─>[0m [31m[15 lines of output][0m
  [31m   [0m The 'sklearn' PyPI package is deprecated, use 'scikit-learn'
  [31m   [0m rather than 'sklearn' for pip commands.
  [31m   [0m 
  [31m   [0m Here is how to fix this error in the main use cases:
  [31m   [0m - use 'pip install scikit-learn' rather than 'pip install sklearn'
  [31m   [0m - replace 'sklearn' by 'scikit-learn' in your pip requirements files
  [31m   [0m   (requirements.txt, setup.py, setup.cfg, Pipfile, etc ...)
  [31m   [0m - if the 'sklearn' package is used by one of your dependencies,
  [31m   [0m   it would be great if you take some time to track which package uses
  [31m   [0m   'sklearn' instead of 'scikit-le

In [None]:
# ---------- 基础工具 ----------
def to_number(x):
    if pd.isna(x): return np.nan
    s = str(x).strip()
    if s.endswith("%"):
        try: return float(s.replace("%", "")) / 100
        except: return np.nan
    for sym in ["㎡", "元", "m³", "月", "／", "/", "每", " ", "栋", "户","年"]:
        s = s.replace(sym, "")
    s = s.strip()
    if "-" in s:
        try: return np.mean([float(i) for i in s.split("-") if i.strip()])
        except: return np.nan
    try: return float(s)
    except: return np.nan

def parse_chinese_number(s):
    if not isinstance(s, str) or s.strip() == '':
        return np.nan
    s = s.strip()
    cn_num = {'零':0,'一':1,'二':2,'两':2,'三':3,'四':4,'五':5,'六':6,'七':7,'八':8,'九':9}
    if re.match(r'^\d+$', s): return float(s)
    total = 0
    hundred, ten, num = 0, 0, 0
    if '百' in s:
        parts = s.split('百')
        hundred = cn_num.get(parts[0], 1) * 100
        s = parts[1] if len(parts) > 1 else ''
    if '十' in s:
        parts = s.split('十')
        left = parts[0]; right = parts[1] if len(parts) > 1 else ''
        ten = cn_num.get(left, 1) * 10
        num = cn_num.get(right, 0) if right else 0
        return hundred + ten + num
    for ch in s:
        num = num * 10 + cn_num.get(ch, 0)
    return hundred + num

# ---------- 各处理模块 ----------
class LeakRemover(BaseEstimator, TransformerMixin):
    def __init__(self, leak_cols): self.leak_cols = leak_cols
    def fit(self, X, y=None): return self
    def transform(self, X): return X.drop(columns=[c for c in self.leak_cols if c in X.columns], errors='ignore')
class BuildYearAverager(BaseEstimator, TransformerMixin):
    """
    将“建筑年代”列中如 '2011-2012年', '2005年', '1998-2000' 等格式转换为平均年份（float）。
    """
    def __init__(self, col='建筑年代'):
        self.col = col

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        X = X.copy()
        if self.col not in X.columns:
            return X

        def parse_year(s):
            if pd.isna(s): return np.nan
            s = str(s).strip().replace("年", "")
            # 提取所有4位数字
            years = re.findall(r'\d{4}', s)
            if not years:
                return np.nan
            years = [int(y) for y in years]
            # 若是区间如 2011-2012，则取平均
            if len(years) == 2:
                return np.mean(years)
            # 若只有单一年份
            elif len(years) == 1:
                return float(years[0])
            else:
                # 出现奇怪格式则取中位数
                return np.mean(years)

        X[self.col] = X[self.col].apply(parse_year)
        return X
class HouseLayoutExtractor(BaseEstimator, TransformerMixin):
    """提取房屋户型为室/厅/厨/卫"""
    def __init__(self, col='房屋户型'): self.col = col
    def fit(self, X, y=None): return self
    def transform(self, X):
        X = X.copy()
        def extract_layout(s):
            if pd.isna(s) or str(s).strip()=='':
                return (0,0,0,0)
            s = str(s)
            if '房间' in s:
                rooms = re.findall(r'(\d+)房间', s)
                baths = re.findall(r'(\d+)卫', s)
                return (int(rooms[0]) if rooms else 0, 0, 0, int(baths[0]) if baths else 0)
            rooms = re.findall(r'(\d+)室', s)
            halls = re.findall(r'(\d+)厅', s)
            kitchens = re.findall(r'(\d+)厨', s)
            baths = re.findall(r'(\d+)卫', s)
            return (int(rooms[0]) if rooms else 0,
                    int(halls[0]) if halls else 0,
                    int(kitchens[0]) if kitchens else 0,
                    int(baths[0]) if baths else 0)
        layout_df = pd.DataFrame(X[self.col].apply(extract_layout).tolist(),
                                 columns=['户型_室数','户型_厅数','户型_厨数','户型_卫数'], index=X.index)
        return pd.concat([X.drop(columns=[self.col], errors='ignore'), layout_df], axis=1)
class UnitCleaner(BaseEstimator, TransformerMixin):
    def __init__(self, cols): self.cols = cols
    def fit(self, X, y=None): return self
    def transform(self, X):
        X = X.copy()
        for c in self.cols:
            if c in X.columns:
                X[c] = X[c].apply(to_number)
        return X
class LadderRatioExtractor(BaseEstimator, TransformerMixin):
    def __init__(self, col='梯户比例'): self.col = col
    def fit(self, X, y=None): return self
    def transform(self, X):
        X = X.copy()
        def parse_ratio(v):
            if pd.isna(v) or str(v).strip()=='':
                return 0
            s = str(v)
            ladders = re.findall(r'([一二三四五六七八九十百零两\d]+)梯', s)
            units = re.findall(r'([一二三四五六七八九十百零两\d]+)户', s)
            if ladders and units:
                a = parse_chinese_number(ladders[0])
                b = parse_chinese_number(units[0])
                if not np.isnan(a) and not np.isnan(b) and b != 0:
                    return round(a / b, 3)
            return 0
        X[self.col] = X[self.col].apply(parse_ratio)
        return X
class ElevatorFlagger(BaseEstimator, TransformerMixin):
    def __init__(self, col='配备电梯'): self.col=col
    def fit(self, X, y=None): return self
    def transform(self, X):
        X=X.copy()
        if self.col in X.columns:
            X[self.col]=X[self.col].astype(str).apply(lambda v:1 if ('有' in v or '是' in v or v.strip()=='1') else 0)
        else: X[self.col]=0
        return X
class DynamicWinsorizer(BaseEstimator, TransformerMixin):
    def __init__(self, factor=1.5): self.factor=factor
    def fit(self,X,y=None):
        num=X.select_dtypes(include=[np.number]).columns.tolist()
        if 'Price' in num: num.remove('Price')
        self.bounds_={}
        for c in num:
            q1,q3=X[c].quantile(0.25),X[c].quantile(0.75)
            iqr=q3-q1
            self.bounds_[c]=(q1-self.factor*iqr,q3+self.factor*iqr)
        return self
    def transform(self,X):
        X=X.copy()
        for c,(l,u) in self.bounds_.items():
            X[c]=X[c].clip(l,u)
        return X
class DynamicMissingDropper(BaseEstimator, TransformerMixin):
    def __init__(self, threshold=0.6, exempt_cols=None):
        self.threshold = threshold
        self.exempt_cols = exempt_cols if exempt_cols is not None else []
    def fit(self,X,y=None):
        miss = X.isnull().mean()
        self.to_drop_ = [c for c in miss.index if miss[c] > self.threshold and c not in self.exempt_cols]
        return self
    def transform(self,X):
        return X.drop(columns=self.to_drop_, errors='ignore')
class ImputerTransformer(BaseEstimator, TransformerMixin):
    def __init__(self,num_cols,cat_cols):
        self.num_cols=num_cols; self.cat_cols=cat_cols
        self.num_imp=SimpleImputer(strategy='median')
        self.cat_imp=SimpleImputer(strategy='most_frequent')
    def fit(self,X,y=None):
        self.num_exist=[c for c in self.num_cols if c in X.columns]
        self.cat_exist=[c for c in self.cat_cols if c in X.columns]
        if self.num_exist:self.num_imp.fit(X[self.num_exist])
        if self.cat_exist:self.cat_imp.fit(X[self.cat_exist])
        return self
    def transform(self,X):
        X=X.copy()
        if self.num_exist:X[self.num_exist]=self.num_imp.transform(X[self.num_exist])
        if self.cat_exist:X[self.cat_exist]=self.cat_imp.transform(X[self.cat_exist])
        return X
class FloorExtractor(BaseEstimator, TransformerMixin):
    def __init__(self,col='所在楼层'):
        self.col=col
        self.map={'地下室':0,'底层':1,'低楼层':2,'中楼层':3,'高楼层':4,'顶层':5}
    def fit(self,X,y=None):return self
    def transform(self,X):
        X=X.copy()
        if self.col in X.columns:
            s=X[self.col].astype(str)
            X['总楼层']=s.str.extract(r'共(\d+)层')[0].astype(float)
            X['楼层类型']=s.str.extract(r'(地下室|底层|低楼层|中楼层|高楼层|顶层)')[0]
            X['楼层位置编码']=X['楼层类型'].map(self.map)
            X=X.drop(columns=[self.col])
        return X
class RingEncoder(BaseEstimator, TransformerMixin):
    def __init__(self, cols=['环线']):
        self.cols = cols
        self.mapping = {
            '内环内':1,'二环内':1,
            '内环至中环':2,'二至三环':2,
            '内环至外环':3,'三至四环':3,
            '中环至外环':4,'四至五环':4,'五至六环':4,
            '六环外':5,'外环外':5
        }
    def fit(self, X, y=None): return self
    def transform(self, X):
        X = X.copy()
        col_exist = None
        for c in self.cols:
            if c in X.columns:
                col_exist = c
                break
        if col_exist:
            X['环线_missing'] = X[col_exist].isna().astype(int)
            X['环线_num'] = X[col_exist].map(self.mapping)
            mean_val = np.nanmean(X['环线_num'])
            X['环线_num_filled'] = X['环线_num'].fillna(mean_val)
            X = X.drop(columns=[col_exist, '环线_num'])
        else:
            X['环线_missing'] = 1
            X['环线_num_filled'] = 0
        return X
class CategoricalEncoder(BaseEstimator, TransformerMixin):
    def __init__(self, cols):
        self.cols = cols
        try:
            self.enc = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
        except TypeError:
            self.enc = OneHotEncoder(handle_unknown='ignore', sparse=False)
    def fit(self,X,y=None): 
        self.exist=[c for c in self.cols if c in X.columns] 
        if self.exist: self.enc.fit(X[self.exist]) 
        return self 
    def transform(self,X): 
        X=X.copy() 
        if not getattr(self,'exist', None) or not self.exist: return X 
        arr = self.enc.transform(X[self.exist]) 
        cat_df = pd.DataFrame(arr, columns=self.enc.get_feature_names_out(self.exist), index=X.index) 
        return pd.concat([X.drop(columns=self.exist, errors='ignore'), cat_df], axis=1)      
class SelectiveStandardizer(BaseEstimator, TransformerMixin):
    def __init__(self):
        self.scaler=StandardScaler()
        self.exclude=[]
    def fit(self,X,y=None):
        num=X.select_dtypes(include=[np.number]).columns.tolist()
        self.cols_=[c for c in num if (c not in self.exclude and c!='Price' and '_' not in c)]
        if self.cols_:self.scaler.fit(X[self.cols_])
        return self
    def transform(self,X):
        X=X.copy()
        if hasattr(self,'cols_') and self.cols_:
            X[self.cols_]=self.scaler.transform(X[self.cols_])
        return X
class ColumnPruner(BaseEstimator, TransformerMixin):
    def __init__(self,drop_cols):self.drop_cols=drop_cols
    def fit(self,X,y=None):return self
    def transform(self,X):return X.drop(columns=[c for c in self.drop_cols if c in X.columns],errors='ignore')
class FinalImputer(BaseEstimator, TransformerMixin):
   
    def fit(self, X, y=None):
        self.num_cols_ = X.select_dtypes(include=[np.number]).columns.tolist()
        self.cat_cols_ = X.select_dtypes(exclude=[np.number]).columns.tolist()
        self.num_imp = SimpleImputer(strategy='median')
        self.cat_imp = SimpleImputer(strategy='most_frequent')
        if self.num_cols_:
            self.num_imp.fit(X[self.num_cols_])
        if self.cat_cols_:
            self.cat_imp.fit(X[self.cat_cols_])
        return self

    def transform(self, X):
        X = X.copy()
        if hasattr(self, 'num_cols_') and self.num_cols_:
            X[self.num_cols_] = self.num_imp.transform(X[self.num_cols_])
        if hasattr(self, 'cat_cols_') and self.cat_cols_:
            X[self.cat_cols_] = self.cat_imp.transform(X[self.cat_cols_])
        return X


# ---------- ✅ 最终组合 ----------
def FinalPricePipeline():
    leak_cols=['房屋优势','核心卖点','户型介绍','周边配套','交通出行','客户反馈',
               '上次交易','交易权属','产权描述','交易时间']
    num_cols=['建筑面积','套内面积','绿 化 率','容 积 率','物 业 费','燃气费','房屋总数','楼栋总数']
    num_impute=['容 积 率','绿 化 率','燃气费','物 业 费','房屋总数','楼栋总数']
    cat_impute=['供水','供电','供暖','建筑结构','建筑结构_comm']
    cat_cols=['建筑结构','建筑结构_comm','装修情况','房屋用途','产权所属','供水','供暖','供电','城市']
    drop_cols=['房屋朝向','物业类别','物业办公电话','物业公司','开发商','抵押信息',
               '别墅类型','房屋年限','coord_x','coord_y','环线位置','停车费用','楼层类型']

    return Pipeline([
        ('leakremover', LeakRemover(leak_cols)),
        ('ladder',LadderRatioExtractor()),
        ('build_year', BuildYearAverager()),         
        ('layout', HouseLayoutExtractor()),
        ('unit_clean', UnitCleaner(num_cols)),
        ('floor', FloorExtractor()),
        ('elevator', ElevatorFlagger()),
        ('ring', RingEncoder()),
        ('winsor', DynamicWinsorizer()),
        ('missing_drop', DynamicMissingDropper()),
        ('impute', ImputerTransformer(num_impute, cat_impute)),
        ('onehot', CategoricalEncoder(cat_cols)),
        ('scale', SelectiveStandardizer()),
        ('final_impute', FinalImputer()),            
        ('prune', ColumnPruner(drop_cols))
    ])




In [9]:
price_pipeline = FinalPricePipeline()

In [10]:
# ==========================================================
# 🧠 Quant Modeling: Housing Price Prediction (Stable Version)
# ==========================================================

import os, warnings, numpy as np, pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.linear_model import LinearRegression, RidgeCV, LassoCV, ElasticNetCV
from sklearn.feature_selection import SelectKBest, f_regression

warnings.filterwarnings("ignore")
os.environ["OMP_NUM_THREADS"] = "1"  # 限制线程数防止爆内存

# ==========================================================
# 1️⃣ 数据准备
# ==========================================================

# 删除无用列
df_test_price = df_test_price.drop(columns=["ID"], errors="ignore")

# 拆分目标与特征
y_price = df_train_price["Price"]
X_price_train = df_train_price.drop(columns=["Price"], errors="ignore")

print(f"原始训练集: {df_train_price.shape}, 测试集: {df_test_price.shape}")

# pipeline 清洗
clean_price = price_pipeline.fit_transform(X_price_train)
clean_price_test = price_pipeline.transform(df_test_price)

# 检查列数是否一致
if clean_price_test.shape[1] != clean_price.shape[1]:
    common_cols = [c for c in clean_price.columns if c in clean_price_test.columns]
    clean_price = clean_price[common_cols]
    clean_price_test = clean_price_test[common_cols]
    print(f"⚠️ 已对齐公共特征数: {len(common_cols)}")

# ==========================================================
# 2️⃣ 特征与目标设置
# ==========================================================

USE_LOG_TARGET = True  # 对价格取对数使分布更稳定

if USE_LOG_TARGET:
    y_all = np.log1p(y_price)
else:
    y_all = y_price

X_all = clean_price.copy()
X_test_all = clean_price_test.copy()

print(f"✅ 跳过多项式特征，当前特征数: {X_all.shape[1]}")

# ==========================================================
# 3️⃣ 数据划分 + 特征选择 + 标准化
# ==========================================================

X_tr, X_va, y_tr, y_va = train_test_split(X_all, y_all, test_size=0.2, random_state=111)
print(f"✅ 数据划分完成: X_train={X_tr.shape}, X_val={X_va.shape}")

# 特征筛选
selector = SelectKBest(f_regression, k='all')  # 保留所有显著特征
X_tr_sel = selector.fit_transform(X_tr, y_tr)
X_va_sel = selector.transform(X_va)
X_te_sel = selector.transform(X_test_all)
print(f"✅ 特征选择完成: {X_tr_sel.shape[1]} 个特征保留")

# 标准化
scaler = StandardScaler()
X_tr_sel = scaler.fit_transform(X_tr_sel)
X_va_sel = scaler.transform(X_va_sel)
X_te_sel = scaler.transform(X_te_sel)
print("✅ 特征已标准化")

# ==========================================================
# 4️⃣ 模型训练与评估
# ==========================================================

def evaluate_model(name, model, Xtr, ytr, Xva, yva, log_target=False):
    mdl = model.fit(Xtr, ytr)
    yhat_tr = mdl.predict(Xtr)
    yhat_va = mdl.predict(Xva)

    # 若目标为log，反变换
    if log_target:
        ytr_true, yva_true = np.expm1(ytr), np.expm1(yva)
        ytr_pred, yva_pred = np.expm1(yhat_tr), np.expm1(yhat_va)
    else:
        ytr_true, yva_true = ytr, yva
        ytr_pred, yva_pred = yhat_tr, yhat_va

    mae_tr = mean_absolute_error(ytr_true, ytr_pred)
    rmse_tr = np.sqrt(mean_squared_error(ytr_true, ytr_pred))
    r2_tr = r2_score(ytr_true, ytr_pred)
    mae_va = mean_absolute_error(yva_true, yva_pred)
    rmse_va = np.sqrt(mean_squared_error(yva_true, yva_pred))
    r2_va = r2_score(yva_true, yva_pred)

    print(f"\n[{name}]")
    print(f"In-sample:  MAE={mae_tr:,.2f}  RMSE={rmse_tr:,.2f}  R²={r2_tr:.4f}")
    print(f"Out-sample: MAE={mae_va:,.2f}  RMSE={rmse_va:,.2f}  R²={r2_va:.4f}")

    return {
        "Model": name,
        "MAE_train": mae_tr, "MAE_val": mae_va,
        "RMSE_train": rmse_tr, "RMSE_val": rmse_va,
        "R²_train": r2_tr, "R²_val": r2_va,
        "Estimator": mdl
    }

results = []
results.append(evaluate_model("OLS", LinearRegression(), X_tr_sel, y_tr, X_va_sel, y_va, USE_LOG_TARGET))
results.append(evaluate_model("RidgeCV", RidgeCV(alphas=np.logspace(-3,3,20), cv=5), X_tr_sel, y_tr, X_va_sel, y_va, USE_LOG_TARGET))
results.append(evaluate_model("LassoCV", LassoCV(alphas=np.logspace(-2,1,10), cv=3, n_jobs=1, max_iter=5000), X_tr_sel, y_tr, X_va_sel, y_va, USE_LOG_TARGET))
results.append(evaluate_model("ElasticNetCV", ElasticNetCV(alphas=np.logspace(-2,1,8), l1_ratio=[0.3,0.5,0.7], cv=3, n_jobs=1, max_iter=5000), X_tr_sel, y_tr, X_va_sel, y_va, USE_LOG_TARGET))

# ==========================================================
# 5️⃣ 汇总结果与预测输出
# ==========================================================

metrics_df = pd.DataFrame([{k:v for k,v in r.items() if k!="Estimator"} for r in results])
metrics_df = metrics_df.sort_values("MAE_val").reset_index(drop=True)
print("\n=== Summary (Validation Performance) ===")
print(metrics_df[["Model","MAE_val","RMSE_val","R²_val"]])

metrics_df.to_csv("metrics_price_summary.csv", index=False)
print("📁 已保存模型性能表 metrics_price_summary.csv")

# 使用验证集 MAE 最小的模型进行最终预测
best_model = results[metrics_df["MAE_val"].idxmin()]["Estimator"]
y_pred_test = best_model.predict(X_te_sel)
if USE_LOG_TARGET:
    y_pred_test = np.expm1(y_pred_test)

submission = pd.DataFrame({
    "ID": np.arange(len(y_pred_test)),
    "prediction": y_pred_test
})
submission.to_csv("prediction_price.csv", index=False, encoding="utf-8-sig")
print("📁 已保存预测结果 prediction_price.csv")

print("\n✅ 全流程完成（稳定版）！")


原始训练集: (103871, 55), 测试集: (34017, 54)
✅ 跳过多项式特征，当前特征数: 94
✅ 数据划分完成: X_train=(83096, 94), X_val=(20775, 94)
✅ 特征选择完成: 94 个特征保留
✅ 特征已标准化

[OLS]
In-sample:  MAE=696,085.29  RMSE=1,415,562.75  R²=0.6904
Out-sample: MAE=683,704.57  RMSE=1,407,717.67  R²=0.6799

[RidgeCV]
In-sample:  MAE=696,082.94  RMSE=1,415,577.90  R²=0.6904
Out-sample: MAE=683,721.66  RMSE=1,407,744.10  R²=0.6799

[LassoCV]
In-sample:  MAE=711,908.25  RMSE=1,475,670.73  R²=0.6635
Out-sample: MAE=700,014.50  RMSE=1,469,668.33  R²=0.6511

[ElasticNetCV]
In-sample:  MAE=700,865.65  RMSE=1,437,972.45  R²=0.6805
Out-sample: MAE=688,489.49  RMSE=1,430,171.64  R²=0.6696

=== Summary (Validation Performance) ===
          Model        MAE_val      RMSE_val    R²_val
0           OLS  683704.570439  1.407718e+06  0.679926
1       RidgeCV  683721.660834  1.407744e+06  0.679914
2  ElasticNetCV  688489.493239  1.430172e+06  0.669634
3       LassoCV  700014.499790  1.469668e+06  0.651135
📁 已保存模型性能表 metrics_price_summary.csv
📁 已保存预测结果 

In [None]:
# ---------- 基础工具 ----------
def to_number(x):
    if pd.isna(x): return np.nan
    s = str(x).strip()
    if s.endswith("%"):
        try: return float(s.replace("%",""))/100
        except: return np.nan
    for sym in ["㎡","元","m³","／","/","每","月","栋","户","元/月/㎡","元/㎡","层"," "]:
        s = s.replace(sym,"")
    s = s.strip()
    if "-" in s:
        try: return np.mean([float(i) for i in s.split("-") if i.strip()])
        except: return np.nan
    try: return float(s)
    except: return np.nan

# ---------- 各子模块 ----------
class LeakRemover(BaseEstimator, TransformerMixin):
    """删除描述/售后/明显无用列，避免信息泄露"""
    def __init__(self, leak_cols): self.leak_cols = leak_cols
    def fit(self, X, y=None): return self
    def transform(self, X):
        return X.drop(columns=[c for c in self.leak_cols if c in X.columns], errors='ignore')


class BuildYearAverager(BaseEstimator, TransformerMixin):
    def __init__(self, col='建筑年代'):
        self.col = col
    def fit(self, X, y=None):
        return self
    def transform(self, X):
        X = X.copy()
        if self.col not in X.columns:
            return X
        def parse_year(s):
            import re
            import numpy as np
            if pd.isna(s) or str(s).strip() == "":
                return np.nan
            s = str(s)
            s = re.sub(r"[－–—~～]", "-", s)
            s = re.sub(r"[^\d\-]", "", s)
            nums = re.findall(r"\d{4}", s)
            if len(nums) >= 2:
                return np.mean([float(nums[0]), float(nums[1])])
            elif len(nums) == 1:
                return float(nums[0])
            else:
                return np.nan
        X[self.col] = X[self.col].apply(parse_year)
        return X


class HouseLayoutExtractor(BaseEstimator, TransformerMixin):
    """'户型' → 户型_室/厅/卫（数值）"""
    def __init__(self, col='户型'): self.col = col
    def fit(self, X, y=None): return self
    def transform(self, X):
        X = X.copy()
        def extract_layout(s):
            if pd.isna(s) or str(s).strip()=='':
                return (0,0,0)
            s = str(s)
            rooms = re.findall(r'(\d+)室', s)
            halls = re.findall(r'(\d+)厅', s)
            baths = re.findall(r'(\d+)卫', s)
            return (int(rooms[0]) if rooms else 0,
                    int(halls[0]) if halls else 0,
                    int(baths[0]) if baths else 0)
        layout_df = pd.DataFrame(X[self.col].apply(extract_layout).tolist(),
                                 columns=['户型_室','户型_厅','户型_卫'], index=X.index)
        return pd.concat([X.drop(columns=[self.col], errors='ignore'), layout_df], axis=1)

class UnitCleaner(BaseEstimator, TransformerMixin):
    """带单位列 → 纯数值"""
    def __init__(self, cols): self.cols = cols
    def fit(self, X, y=None): return self
    def transform(self, X):
        X = X.copy()
        for c in self.cols:
            if c in X.columns:
                X[c] = X[c].apply(to_number)
        return X
class FloorExtractor(BaseEstimator, TransformerMixin):
    """兼容 '4/6层' 与 '中楼层 (共10层)' 两类格式"""
    def __init__(self, col='楼层'):
        self.col = col
        self.map={'地下室':0,'底层':1,'低楼层':2,'中楼层':3,'高楼层':4,'顶层':5}
    def fit(self, X, y=None): return self
    def transform(self, X):
        X=X.copy()
        if self.col in X.columns:
            s = X[self.col].astype(str)
            X['总楼层'] = s.str.extract(r'共(\d+)层')[0].astype(float)
            X['当前楼层'] = s.str.extract(r'(\d+)\s*/')[0].astype(float)
            m2 = s.str.extract(r'/\s*(\d+)')[0].astype(float)
            X['总楼层'] = X['总楼层'].fillna(m2)
            X['楼层类型'] = s.str.extract(r'(地下室|底层|低楼层|中楼层|高楼层|顶层)')[0]
            X['楼层位置编码'] = X['楼层类型'].map(self.map)
            X.drop(columns=[self.col], inplace=True)
        return X
class ElevatorFlagger(BaseEstimator, TransformerMixin):
    """电梯：有/是/1 → 1；其他或空 → 0"""
    def __init__(self, col='电梯'): self.col = col
    def fit(self, X, y=None): return self
    def transform(self, X):
        X=X.copy()
        if self.col in X.columns:
            X[self.col] = X[self.col].astype(str).apply(
                lambda v: 1 if ('有' in v or '是' in v or v.strip()=='1') else 0
            )
        else:
            X[self.col] = 0
        return X
class PaymentMonths(BaseEstimator, TransformerMixin):
    """付款方式 → 付款月数（数值），白名单外样本删除"""
    def __init__(self, pay_col='付款方式'):
        self.pay_col = pay_col
        self.mapping = {'月付价':1, '双月付价':2, '季付价':3, '半年付价':6, '年付价':12}
    def fit(self, X, y=None): return self
    def transform(self, X):
        X = X.copy()
        if self.pay_col in X.columns:
            mask = X[self.pay_col].isin(self.mapping.keys())
            X = X[mask].copy()
            X['付款月数'] = X[self.pay_col].map(self.mapping).astype(float)
        else:
            # 若缺列，无法构造该特征；保守起见设为 NaN，稍后由 imputer 处理或被删除
            X['付款月数'] = np.nan
        return X
class RingEncoder(BaseEstimator, TransformerMixin):
    """环线位置：数值化 + 缺失标记（更稳健）"""
    def __init__(self, col='环线位置'):
        self.col = col
        self.mapping={'内环内':1,'二环内':1,'内环至中环':2,'二至三环':2,
                      '内环至外环':3,'三至四环':3,'中环至外环':4,'四至五环':4,'五至六环':4,
                      '六环外':5,'外环外':5}
    def fit(self, X, y=None): return self
    def transform(self, X):
        X=X.copy()
        if self.col in X.columns:
            X['环线_missing'] = X[self.col].isna().astype(int)
            X['环线_num'] = X[self.col].map(self.mapping)
            mean_val = np.nanmean(X['环线_num'])
            X['环线_num_filled'] = X['环线_num'].fillna(mean_val)
            X.drop(columns=[self.col,'环线_num'], inplace=True)
        else:
            X['环线_missing'] = 1
            X['环线_num_filled'] = 0
        return X
class DynamicWinsorizer(BaseEstimator, TransformerMixin):
    """IQR 截尾；目标列与标志列不参与"""
    def __init__(self, factor=1.5): self.factor=factor
    def fit(self,X,y=None):
        num_cols = X.select_dtypes(include=[np.number]).columns.tolist()
        for skip in ['Price','付款月数']: 
            if skip in num_cols: num_cols.remove(skip)
        self.bounds_={}
        for c in num_cols:
            q1,q3 = X[c].quantile(0.25), X[c].quantile(0.75)
            iqr = q3 - q1
            self.bounds_[c] = (q1 - self.factor*iqr, q3 + self.factor*iqr)
        return self
    def transform(self,X):
        X = X.copy()
        for c,(l,u) in self.bounds_.items():
            if c in X.columns:
                X.loc[:, c] = X[c].clip(l, u)
        return X
class DynamicMissingDropper(BaseEstimator, TransformerMixin):
    """>60% 缺失的列删除，但豁免环线位置"""
    def __init__(self, threshold=0.6, exempt_cols=None):
        self.threshold=threshold
        self.exempt_cols = exempt_cols if exempt_cols else ['环线位置']
    def fit(self,X,y=None):
        miss = X.isnull().mean()
        self.to_drop_ = [c for c in miss.index if miss[c] > self.threshold and c not in self.exempt_cols]
        return self
    def transform(self,X):
        return X.drop(columns=self.to_drop_, errors='ignore')
class ImputerTransformer(BaseEstimator, TransformerMixin):
    """数值/分类缺失填补"""
    def __init__(self, num_cols, cat_cols):
        self.num_cols=num_cols; self.cat_cols=cat_cols
        self.num_imp=SimpleImputer(strategy='median')
        self.cat_imp=SimpleImputer(strategy='most_frequent')
    def fit(self,X,y=None):
        self.num_exist=[c for c in self.num_cols if c in X.columns]
        self.cat_exist=[c for c in self.cat_cols if c in X.columns]
        if self.num_exist: self.num_imp.fit(X[self.num_exist])
        if self.cat_exist: self.cat_imp.fit(X[self.cat_exist])
        return self
    def transform(self,X):
        X=X.copy()
        if self.num_exist: X[self.num_exist]=self.num_imp.transform(X[self.num_exist])
        if self.cat_exist: X[self.cat_exist]=self.cat_imp.transform(X[self.cat_exist])
        return X
class CategoricalEncoder(BaseEstimator, TransformerMixin):
    """对指定分类变量做 OneHot（付款方式不再进来！）"""
    def __init__(self, cols):
        self.cols=cols
        self.enc=OneHotEncoder(handle_unknown='ignore')
    def fit(self,X,y=None):
        self.exist=[c for c in self.cols if c in X.columns]
        if self.exist: self.enc.fit(X[self.exist])
        return self
    def transform(self,X):
        X=X.copy()
        if not getattr(self,'exist', None) or not self.exist: return X
        arr = self.enc.transform(X[self.exist]).toarray()
        cat_df = pd.DataFrame(arr, columns=self.enc.get_feature_names_out(self.exist), index=X.index)
        return pd.concat([X.drop(columns=self.exist, errors='ignore'), cat_df], axis=1)
class SelectiveStandardizer(BaseEstimator, TransformerMixin):
    """标准化（包含 '付款月数'，但排除坐标/地理标志/二值）"""
    def __init__(self):
        self.scaler=StandardScaler()
        self.exclude=['城市','区县','板块','lon','lat','coord_x','coord_y',
                      '电梯','环线_num_filled','环线_missing']
    def fit(self,X,y=None):
        num_cols = X.select_dtypes(include=[np.number]).columns.tolist()
        # 目标变量不能标准化
        self.cols_ = [c for c in num_cols if c not in self.exclude and c != 'Price']
        if self.cols_: self.scaler.fit(X[self.cols_])
        return self
    def transform(self,X):
        X=X.copy()
        if getattr(self,'cols_', None) and self.cols_:
            X[self.cols_] = self.scaler.transform(X[self.cols_])
        return X
class ColumnPruner(BaseEstimator, TransformerMixin):
    """删除无用/文本/潜在泄露列（含付款方式原始列）"""
    def __init__(self, drop_cols): self.drop_cols=drop_cols
    def fit(self,X,y=None): return self
    def transform(self,X):
        return X.drop(columns=[c for c in self.drop_cols if c in X.columns], errors='ignore')
class GlobalMedianImputer(BaseEstimator, TransformerMixin):
    """对所有数值列的缺失值进行中位数填补"""
    def fit(self, X, y=None):
        num_cols = X.select_dtypes(include=[np.number]).columns
        self.medians_ = X[num_cols].median()
        return self

    def transform(self, X):
        X = X.copy()
        for c in self.medians_.index:
            if c in X.columns:
                X[c] = X[c].fillna(self.medians_[c])
        return X


# ---------- 最终组合 ----------
def FinalRentPipeline():
    # 带单位的数值列
    num_cols = ['面积','绿 化 率','容 积 率','物 业 费','燃气费','供热费','房屋总数','楼栋总数','停车费用']
    # 需要数值填补的列（上面的一部分）
    num_impute = ['面积','绿 化 率','容 积 率','物 业 费','燃气费','停车费用','付款月数']
    # 需要众数填补的分类列（不含“付款方式”）
    cat_impute = ['装修','采暖','供水','供电','供暖','建筑结构','物业类别','租赁方式']
    # 进入 OneHot 的分类列
    cat_cols   = ['装修','燃气','采暖','供水','供电','供暖','建筑结构','租赁方式','产权描述','物业类别']
    # 直接丢弃的列
    drop_cols  = ['交易时间','车位','开发商','物业公司','物业办公电话',
                  '客户反馈','朝向','租期','配套设施','付款方式','用水','用电','楼层类型']  # 注意把原始“付款方式”删除

    return Pipeline([
        ('leak_remove', LeakRemover(drop_cols + ['配套设施','客户反馈','产权描述'])),
        ('layout', HouseLayoutExtractor()),
        ('unit_clean', UnitCleaner(num_cols)),
        ('floor', FloorExtractor()),
        ('elevator', ElevatorFlagger()),
        ('ring', RingEncoder()),
        ('paymonths', PaymentMonths()),
        ('winsor', DynamicWinsorizer()),
        ('missing_drop', DynamicMissingDropper(exempt_cols=['环线位置'])),
        ('impute', ImputerTransformer(num_impute, cat_impute)),   # 原有列级填补
        ('global_median', GlobalMedianImputer()),                 # ✅ 新增全局中位数填补
        ('onehot', CategoricalEncoder(cat_cols)),
        ('scale', SelectiveStandardizer()),
        ('prune', ColumnPruner(drop_cols))
    ])



In [30]:
# 1) 拟合并清洗训练集
rent_pipeline = FinalRentPipeline()
train_rent_clean = rent_pipeline.fit_transform(df_train_rent)


In [31]:
# ==========================================================
# 🏠 Rent Modeling: Price Prediction (Full Stable Version)
# ==========================================================

import os, warnings, numpy as np, pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.linear_model import LinearRegression, RidgeCV, LassoCV, ElasticNetCV
from sklearn.feature_selection import SelectKBest, f_regression

warnings.filterwarnings("ignore")

# ==========================================================
# 1️⃣ 数据准备与清洗
# ==========================================================

# --- 删除ID列 ---
df_train_rent = df_train_rent.drop(columns=["ID"], errors="ignore")
df_test_rent  = df_test_rent.drop(columns=["ID"], errors="ignore")

print(f"✅ 原始训练集: {df_train_rent.shape}, 测试集: {df_test_rent.shape}")

# --- 分离目标变量 Price ---
y_rent = df_train_rent["Price"].astype(float)
X_train_raw = df_train_rent.drop(columns=["Price"], errors="ignore")

# --- pipeline 清洗 ---
train_rent_clean = rent_pipeline.fit_transform(X_train_raw)
test_rent_clean  = rent_pipeline.transform(df_test_rent)

# --- 特征对齐 ---
if test_rent_clean.shape[1] != train_rent_clean.shape[1]:
    common_cols = [c for c in train_rent_clean.columns if c in test_rent_clean.columns]
    train_rent_clean = train_rent_clean[common_cols]
    test_rent_clean = test_rent_clean[common_cols]
    print(f"⚠️ 已对齐公共特征列数: {len(common_cols)}")

print(f"✅ 清洗后训练集形状: {train_rent_clean.shape}, 测试集形状: {test_rent_clean.shape}")

# ==========================================================
# 2️⃣ 特征与目标设置
# ==========================================================

# 此时 train_rent_clean 仅包含特征，不含 Price
X_rent = train_rent_clean.copy()

USE_LOG_TARGET = True
y_target = np.log1p(y_rent) if USE_LOG_TARGET else y_rent

print(f"✅ 特征与目标分离完成: X_rent={X_rent.shape}, y_rent={y_rent.shape}")

# ==========================================================
# 3️⃣ 数据划分与特征选择
# ==========================================================

X_train, X_valid, y_train, y_valid = train_test_split(
    X_rent, y_target, test_size=0.2, random_state=42
)
print(f"✅ 训练/验证集划分完成: X_train={X_train.shape}, X_valid={X_valid.shape}")

# --- 特征选择 ---
selector = SelectKBest(f_regression, k='all')  # 可调成具体数值，如 200
X_train_sel = selector.fit_transform(X_train, y_train)
X_valid_sel = selector.transform(X_valid)
print(f"✅ 特征筛选完成，共保留 {X_train_sel.shape[1]} 个特征")

# --- 标准化 ---
scaler = StandardScaler()
X_train_std = scaler.fit_transform(X_train_sel)
X_valid_std = scaler.transform(X_valid_sel)
print("✅ 特征已标准化")

# ==========================================================
# 4️⃣ 模型定义与评估函数
# ==========================================================

def evaluate_model(name, model, Xtr, ytr, Xva, yva, log_target=False):
    """通用模型评估函数"""
    model.fit(Xtr, ytr)
    ytr_pred, yva_pred = model.predict(Xtr), model.predict(Xva)

    # 反变换回原价格空间
    if log_target:
        ytr_true, yva_true = np.expm1(ytr), np.expm1(yva)
        ytr_pred, yva_pred = np.expm1(ytr_pred), np.expm1(yva_pred)
    else:
        ytr_true, yva_true = ytr, yva

    mae_tr = mean_absolute_error(ytr_true, ytr_pred)
    rmse_tr = np.sqrt(mean_squared_error(ytr_true, ytr_pred))
    r2_tr = r2_score(ytr_true, ytr_pred)

    mae_va = mean_absolute_error(yva_true, yva_pred)
    rmse_va = np.sqrt(mean_squared_error(yva_true, yva_pred))
    r2_va = r2_score(yva_true, yva_pred)

    print(f"\n[{name}]")
    print(f"In-sample:  MAE={mae_tr:,.2f}  RMSE={rmse_tr:,.2f}  R²={r2_tr:.4f}")
    print(f"Out-sample: MAE={mae_va:,.2f}  RMSE={rmse_va:,.2f}  R²={r2_va:.4f}")

    return {
        "Model": name,
        "MAE_train": mae_tr, "RMSE_train": rmse_tr, "R²_train": r2_tr,
        "MAE_val": mae_va, "RMSE_val": rmse_va, "R²_val": r2_va,
        "Estimator": model
    }

# ==========================================================
# 5️⃣ 模型训练与验证比较
# ==========================================================

models = [
    ("OLS", LinearRegression()),
    ("RidgeCV", RidgeCV(alphas=np.logspace(-3, 3, 20), cv=5)),
    ("LassoCV", LassoCV(alphas=np.logspace(-3, 1, 15), cv=5, max_iter=8000)),
    ("ElasticNetCV", ElasticNetCV(alphas=np.logspace(-3, 1, 10), 
                                  l1_ratio=[0.2,0.5,0.8], cv=5, max_iter=8000))
]

results = []
for name, model in models:
    res = evaluate_model(name, model, X_train_std, y_train, X_valid_std, y_valid, USE_LOG_TARGET)
    results.append(res)

# ==========================================================
# 6️⃣ 结果汇总与最优模型选择
# ==========================================================

metrics_df = pd.DataFrame([{k:v for k,v in r.items() if k!="Estimator"} for r in results])
metrics_df = metrics_df.sort_values("MAE_val").reset_index(drop=True)

print("\n=== 📊 Validation Summary ===")
print(metrics_df[["Model","MAE_val","RMSE_val","R²_val"]])

metrics_df.to_csv("metrics_rent_summary.csv", index=False)
print("📁 已保存模型性能表 metrics_rent_summary.csv")

# --- 选择验证集表现最好的模型 ---
best_idx = metrics_df["MAE_val"].idxmin()
best_model = results[best_idx]["Estimator"]
best_name  = results[best_idx]["Model"]
print(f"\n🏆 最优模型: {best_name}")

# ==========================================================
# 7️⃣ 最终预测（测试集）
# ==========================================================

X_test_sel = selector.transform(test_rent_clean)
X_test_std = scaler.transform(X_test_sel)
y_pred_test = best_model.predict(X_test_std)
if USE_LOG_TARGET:
    y_pred_test = np.expm1(y_pred_test)

submission = pd.DataFrame({
    "ID": np.arange(len(y_pred_test)),
    "prediction": y_pred_test
})
submission.to_csv("prediction_rent.csv", index=False, encoding="utf-8-sig")
print("📁 已保存预测结果 prediction_rent.csv")

print("\n✅ 全流程完成（租金预测稳定版）！")


✅ 原始训练集: (98899, 45), 测试集: (9773, 44)
✅ 清洗后训练集形状: (98899, 284), 测试集形状: (9773, 284)
✅ 特征与目标分离完成: X_rent=(98899, 284), y_rent=(98899,)
✅ 训练/验证集划分完成: X_train=(79119, 284), X_valid=(19780, 284)
✅ 特征筛选完成，共保留 284 个特征
✅ 特征已标准化

[OLS]
In-sample:  MAE=178,735.54  RMSE=359,871.56  R²=0.6590
Out-sample: MAE=181,432.78  RMSE=392,866.78  R²=0.6274

[RidgeCV]
In-sample:  MAE=178,685.22  RMSE=359,810.75  R²=0.6591
Out-sample: MAE=181,392.92  RMSE=392,856.79  R²=0.6274

[LassoCV]
In-sample:  MAE=179,334.76  RMSE=361,658.96  R²=0.6556
Out-sample: MAE=182,082.09  RMSE=396,108.64  R²=0.6213

[ElasticNetCV]
In-sample:  MAE=178,764.99  RMSE=360,215.70  R²=0.6583
Out-sample: MAE=181,439.36  RMSE=393,556.34  R²=0.6261

=== 📊 Validation Summary ===
          Model        MAE_val       RMSE_val    R²_val
0       RidgeCV  181392.920280  392856.788398  0.627449
1           OLS  181432.777267  392866.782101  0.627430
2  ElasticNetCV  181439.355580  393556.337383  0.626121
3       LassoCV  182082.091757  396108.64