# Mid-Term Final Version

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import re
import jieba
from sklearn.compose import ColumnTransformer, TransformedTargetRegressor
from sklearn.pipeline import Pipeline
from sklearn.model_selection import StratifiedShuffleSplit, cross_validate
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score, KFold
from sklearn.impute import SimpleImputer, KNNImputer
from sklearn.preprocessing import MultiLabelBinarizer, OneHotEncoder
from sklearn.linear_model import LinearRegression, RidgeCV, LassoCV, ElasticNetCV, Lasso, Ridge, ElasticNet
from sklearn.metrics import mean_absolute_error, r2_score, make_scorer
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sentence_transformers import SentenceTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from itertools import combinations
from sklearn.base import clone
import itertools
from itertools import chain
from joblib import Memory
from statsmodels.stats.outliers_influence import variance_inflation_factor
from category_encoders import TargetEncoder
from tqdm import tqdm
from scipy.sparse import save_npz, load_npz

plt.rcParams['font.sans-serif'] = ['Arial Unicode MS']  # 设置中文字体

In [None]:
# 导入训练集与测试集，使用深度拷贝保留原数据
rent_train_pd = pd.read_csv("/kaggle/input/python-ai-midterm/ruc_Class25Q2_train_rent.csv")
rent_train_raw = rent_train_pd.copy()

price_train_pd = pd.read_csv("/kaggle/input/python-ai-midterm/ruc_Class25Q2_train_price.csv")
price_train_raw = price_train_pd.copy()

rent_test_pd = pd.read_csv("/kaggle/input/python-ai-midterm/ruc_Class25Q2_test_rent.csv")
rent_test = rent_test_pd.copy()

price_test_pd = pd.read_csv("/kaggle/input/python-ai-midterm/ruc_Class25Q2_test_price.csv")
price_test = price_test_pd.copy()

## 1 Data Processing

In [None]:
clean_col_price = []
clean_col_rent = []

In [None]:
# 统计price缺失值
price_train_raw.isnull().sum().sort_values(ascending=False)
# 其中抵押信息、别墅类型的缺失值过多，物业办公电话、区县、板块_comm、环线位置数据信息与其他数据重复，直接删掉
clean_col_price = ["抵押信息", "别墅类型", "物业办公电话", "板块_comm", "coord_x", "coord_y","环线位置", "区县"]

In [None]:
# 统计缺失值
rent_train_raw.isnull().sum().sort_values(ascending=False)

In [None]:
clean_col_rent = ["物业办公电话", "coord_x", "coord_y"]   # 用于记录后面需要drop掉的数据，在数据处理结束、模型开始时统一删,避免每次都要重跑

In [None]:
# 合并已经生成的sentiment, 这样可以防止IQR筛选导致的行不匹配
col_emo_price = ["客户反馈"]
col_emo_rent = ["客户反馈"]
"""""
senti_price = pd.read_csv("/kaggle/input/sentiment/price_sentiment.csv")
senti_rent = pd.read_csv("/kaggle/input/sentiment/rent_sentiment.csv")

price_train = pd.concat([price_train, senti_price], axis=1)
rent_train = pd.concat([rent_train, senti_rent], axis=1)
"""""
clean_col_price.extend(col_emo_price)
clean_col_rent.extend(col_emo_rent)

### 1.1 初步清洗

#### 1.1.1 IQR去掉Price的离群值

In [None]:
# 使用IQR去掉极端值
# 如果直接使用IQR会删掉一万个样本，此处先取log再进行IQR处理
def IQR(df_raw):
    """Return a copy of ``df_raw`` without rows whose log-price is an IQR outlier.

    The fences are computed on ``log1p(Price)`` rather than on the raw price:
    Q1 - 1.5 * IQR and Q3 + 1.5 * IQR, both inclusive.

    Parameters
    ----------
    df_raw : pandas.DataFrame
        Must contain a ``"Price"`` column.

    Returns
    -------
    pandas.DataFrame
        A new frame (original index preserved); ``df_raw`` is left untouched.
    """
    log_price = np.log1p(df_raw["Price"])

    q1 = log_price.quantile(0.25)
    q3 = log_price.quantile(0.75)
    spread = q3 - q1

    # Tukey fences on the log scale
    low_fence = q1 - 1.5 * spread
    high_fence = q3 + 1.5 * spread

    inside = log_price.between(low_fence, high_fence)  # inclusive on both ends
    return df_raw[inside].copy()

price_train = IQR(price_train_raw)
rent_train = IQR(rent_train_raw)

In [None]:
price_train = IQR(price_train_raw)
rent_train = IQR(rent_train_raw)

#### 1.1.2 环线数据

In [None]:
# 转换环线数据，其中二至三环这种取平均数，内环是一环内，中环是三环左右，外环是六环以外
loop_map = {
    "内环内": 1,
    "二环内": 1.5,
    "内环至中环": 2,
    "二至三环": 2.5,
    "三至四环": 3.5,
    "四至五环": 4.5,
    "五至六环": 5.5,
    "六环外": 6.5,
    "内环至外环": 3,  #存疑
    "中环至外环": 4,  #存疑
    "外环外": 7
}

price_train["环线_num"] = price_train["环线"].map(loop_map)
price_test["环线_num"] = price_test["环线"].map(loop_map)

clean_col_price.append("环线")

rent_train["环线_num"] = rent_train["环线位置"].map(loop_map)
rent_test["环线_num"] = rent_test["环线位置"].map(loop_map)

clean_col_rent.append("环线位置")

price_train["环线_num"].value_counts()

#### 1.1.3 房屋户型

In [None]:
# 采用正则表达式拆分房屋户型数据为四个变量
def separate_room(s):
    """Split a layout string into (rooms, halls, kitchens, bathrooms).

    Each count is the integer that precedes the corresponding marker
    (室 / 房 / 房间, 厅, 厨, 卫). A marker that is absent yields NaN, and a
    missing input yields four NaNs.
    """
    if pd.isna(s):
        return np.nan, np.nan, np.nan, np.nan

    layout = str(s)
    patterns = (
        r"(\d+)\s*(?:室|房间?)",   # rooms: both "x室" and "x房间" spellings occur
        r"(\d+)\s*厅",
        r"(\d+)\s*厨",
        r"(\d+)\s*卫",
    )

    counts = []
    for pattern in patterns:
        found = re.search(pattern, layout)
        counts.append(np.nan if found is None else int(found.group(1)))
    return tuple(counts)

#第一个apply调用函数处理每一个单元格，返回一个tuple，再用series展开
price_train[["室","厅","厨","卫"]] = price_train["房屋户型"].apply(separate_room).apply(pd.Series)
price_test[["室","厅","厨","卫"]] = price_test["房屋户型"].apply(separate_room).apply(pd.Series)

rent_train[["室","厅","厨","卫"]] = rent_train["户型"].apply(separate_room).apply(pd.Series)
rent_test[["室","厅","厨","卫"]] = rent_test["户型"].apply(separate_room).apply(pd.Series)

clean_col_price.append("房屋户型")
clean_col_rent.append("户型")

In [None]:
price_train[["室","厅","厨","卫"]].head(5)

#### 1.1.4 所在楼层

##### Price

In [None]:
# 同样以正则表达式分开所在楼层类型以及楼层类型和总共楼层，后期可以加交叉项看看共同影响
def separate_floor(s):
    """Split a floor description into (floor type, total number of floors).

    The floor type is the text before the first ASCII "(" (surrounding
    whitespace before the bracket is dropped); the total is the integer in
    "共N层".

    Parameters
    ----------
    s : str or missing
        Raw cell value, e.g. "中楼层 (共18层)".

    Returns
    -------
    tuple
        ``(floor_type, total_floors)``. Either element is NaN when the
        corresponding part cannot be found; a missing input gives two NaNs.
    """
    if pd.isna(s):
        return np.nan, np.nan
    text = str(s)  # tolerate non-string cells

    type_match = re.search(r"^(.*?)\s*\(", text)
    total_match = re.search(r"共(\d+)层", text)

    floor_type = type_match.group(1) if type_match else np.nan
    # Only convert when the pattern matched: int(nan) would raise ValueError.
    total_floor = int(total_match.group(1)) if total_match else np.nan
    return floor_type, total_floor

price_train[["楼层类型", "总共楼层"]] = price_train["所在楼层"].apply(separate_floor).apply(pd.Series)
price_test[["楼层类型", "总共楼层"]] = price_test["所在楼层"].apply(separate_floor).apply(pd.Series)

clean_col_price.append("所在楼层")

In [None]:
price_train[["楼层类型", "总共楼层"]].head(5)

##### Rent

In [None]:
# 同样以正则表达式分开所在楼层类型以及楼层类型和总共楼层
# 把以下两种正则表达字符串编成一个匹配对象
pat_frac = re.compile(r'\s*(\d+)\s*/\s*(\d+)\s*层?')          # e.g. "12/18层" or "12/18"
# "楼层" is optional so that "地下室/5层" is recognised as well as "中楼层/25层".
pat_text = re.compile(r'\s*((?:地下室)|[低中高])(?:楼层?)?\s*/\s*(\d+)\s*层?')

level_ratio = {"地下室": 0, "低": 0.25, "中": 0.50, "高": 0.75}  # ratio assigned to each textual level

def separate_floor(s):
    """Convert a floor description into a relative height in [0, 1].

    Two formats are understood:

    * numeric  "current/total" (optionally followed by 层) -> current / total
    * textual  "<level>/total" where level is 地下室, 低, 中 or 高
      -> the value from ``level_ratio``

    Returns NaN for missing input, a total of 0, a ratio outside [0, 1],
    or any text that matches neither format.
    """
    if pd.isna(s):
        return np.nan
    s = str(s)

    # Case 1: purely numeric "current/total"
    m = pat_frac.match(s)
    if m:
        cur, tot = int(m.group(1)), int(m.group(2))
        if tot == 0:
            return np.nan
        r = cur / tot
        return r if 0 <= r <= 1 else np.nan  # current floor above the total is treated as invalid

    # Case 2: textual level + total, e.g. "低楼层/22层"
    m = pat_text.match(s)
    if m:
        if int(m.group(2)) == 0:
            return np.nan
        r = level_ratio.get(m.group(1), np.nan)
        return r if 0 <= r <= 1 else np.nan

    # Neither format matched: return NaN explicitly instead of an implicit None
    return np.nan


rent_train["楼层比例"] = rent_train["楼层"].apply(separate_floor)
rent_test["楼层比例"] = rent_test["楼层"].apply(separate_floor)

clean_col_rent.append("楼层")

In [None]:
rent_train["楼层比例"].head(5)

#### 1.1.5 建筑面积、套内面积、房屋总数、楼栋总数

In [None]:
# 建筑面积与套内面积统一使用正则表达式去掉单位
def separate_area(s):
    """Extract the first (possibly decimal) number from a cell as a float.

    Used to strip units from area / count columns. Missing input returns
    NaN. When the cell contains no number, "nan" is printed as a diagnostic
    and NaN is returned.
    """
    if pd.isna(s):
        return np.nan
    # str() so that cells pandas already parsed as int/float do not make
    # re.search raise TypeError.
    re_match = re.search(r"(\d+(?:\.\d+)?)", str(s))
    if re_match:
        return float(re_match.group(1))
    print("nan")    # diagnostic: flags values that could not be parsed
    return np.nan

price_train["建筑面积(㎡)"] = price_train["建筑面积"].apply(separate_area)
price_train["套内面积(㎡)"] = price_train["套内面积"].apply(separate_area)

price_test["建筑面积(㎡)"] = price_test["建筑面积"].apply(separate_area)
price_test["套内面积(㎡)"] = price_test["套内面积"].apply(separate_area)

price_train["房屋总数_num"] = price_train["房屋总数"].apply(separate_area)
price_train["楼栋总数_num"] = price_train["楼栋总数"].apply(separate_area)

price_test["房屋总数_num"] = price_test["房屋总数"].apply(separate_area)
price_test["楼栋总数_num"] = price_test["楼栋总数"].apply(separate_area)

rent_train["面积_num"] = rent_train["面积"].apply(separate_area)
rent_test["面积_num"] = rent_test["面积"].apply(separate_area)

rent_train["房屋总数_num"] = rent_train["房屋总数"].apply(separate_area)
rent_test["房屋总数_num"] = rent_test["房屋总数"].apply(separate_area)

rent_train["楼栋总数_num"] = rent_train["楼栋总数"].apply(separate_area)
rent_test["楼栋总数_num"] = rent_test["楼栋总数"].apply(separate_area)

clean_col_rent.extend(["面积", "房屋总数", "楼栋总数"])

clean_col_price.extend(["建筑面积", "套内面积", "房屋总数", "楼栋总数"])

In [None]:
rent_train["房屋总数_num"].head(5)

#### 1.1.6 房屋朝向

In [None]:
# 房屋朝向直接拆成八个不同变量，由于中国人较为偏好坐北朝南方向的房屋，后期拟加入交叉项
original_flags = {"北","东北","东","东南","南","西南","西","西北"}

def separate_flags(s):
    """Turn a whitespace-separated orientation string into eight 0/1 flags.

    The result is ordered 北, 东北, 东, 东南, 南, 西南, 西, 西北. A missing
    input gives eight NaNs. Tokens that are not one of the eight directions
    do not affect the result.
    """
    if pd.isna(s):
        return [np.nan] * 8   # one NaN per direction

    # Collapse every whitespace run to a single space, then tokenise
    normalized = re.sub(r"\s+", " ", s).strip()
    tokens = [tok for tok in normalized.split(" ") if tok]

    # Start with every direction switched off, then switch on what was listed
    flags = dict.fromkeys(original_flags, 0)
    flags.update(dict.fromkeys(tokens, 1))

    output_order = ("北", "东北", "东", "东南", "南", "西南", "西", "西北")
    return [flags[direction] for direction in output_order]

cols = ["朝向_北","朝向_东北","朝向_东","朝向_东南","朝向_南","朝向_西南","朝向_西","朝向_西北"]
price_train[cols] = price_train["房屋朝向"].apply(separate_flags).apply(pd.Series)
price_test[cols] = price_test["房屋朝向"].apply(separate_flags).apply(pd.Series)

clean_col_price.append("房屋朝向")

rent_train[cols] = rent_train["朝向"].apply(separate_flags).apply(pd.Series)
rent_test[cols] = rent_test["朝向"].apply(separate_flags).apply(pd.Series)

clean_col_rent.append("朝向")

#### 1.1.7 梯户比例

In [None]:
# 梯户比例分开梯与户后，再生成梯/户比率
CN = {"零":0,"一":1,"二":2,"两":2,"三":3,"四":4,"五":5,"六":6,"七":7,"八":8,"九":9}

def cn_to_int(s):
    """Convert a small Chinese numeral (or a string of decimal digits) to int.

    Handles single numerals from ``CN``, "十", and tens/units forms such as
    "十二" (12) or "二十三" (23). Decimal digit strings such as "12" are
    converted directly. Returns NaN when the text is not recognised.
    """
    s = str(s).strip()
    if s.isdecimal():
        # separate_tihu's regex also accepts Arabic digits
        return int(s)
    if s == "十":
        return 10
    if "十" in s:
        parts = s.split("十")   # text before "十" is the tens digit, text after is the units digit
        tens = CN.get(parts[0], 1) if parts[0] else 1   # bare "十x" means 1 * 10 + x
        units = CN.get(parts[1], 0) if len(parts) > 1 and parts[1] else 0
        return tens*10 + units
    return CN.get(s, np.nan)

def separate_tihu(text):
    """Parse an "X梯Y户" string into (elevators, households).

    Returns ``(nan, nan)`` when the input is missing, does not contain the
    "X梯Y户" pattern, or either number cannot be interpreted.
    """
    if pd.isna(text):
        return np.nan, np.nan
    t = re.search(r"([一二三四五六七八九十两\d]+)梯([一二三四五六七八九十两\d]+)户", str(text))
    if not t:
        return np.nan, np.nan
    elevators = cn_to_int(t.group(1))
    households = cn_to_int(t.group(2))
    # cn_to_int signals "unknown" with NaN; int(nan) would raise ValueError
    if pd.isna(elevators) or pd.isna(households):
        return np.nan, np.nan
    return int(elevators), int(households)

price_train[["梯数", "户数"]] = price_train["梯户比例"].apply(separate_tihu).apply(pd.Series)
price_train["梯户比"] = price_train["梯数"] / price_train["户数"]

price_test[["梯数", "户数"]] = price_test["梯户比例"].apply(separate_tihu).apply(pd.Series)
price_test["梯户比"] = price_test["梯数"] / price_test["户数"]

clean_col_price.append("梯户比例")

#### 1.1.8 交易时间&上次交易（创新点1: 以三角函数处理交易数据）

In [None]:
# 交易时间和上次交易时间分别提取，生成本次交易的时间来捕捉年份趋势，以及持有时间来捕捉炒房与否
def separate_time(df, group):
    """Add date-derived features to ``df`` in place and return it.

    Always added: ``交易_年`` and ``交易_月`` (float64), taken from ``交易时间``
    after it has been parsed to datetimes (unparseable values become NaT).

    Added only when ``group`` is not ``"rent"``:

    * ``交易_月_sin`` / ``交易_月_cos`` - the month mapped onto a circle so
      that December and January are neighbours (January is at angle 0);
    * ``持有天数`` - days between ``上次交易`` and ``交易时间``.

    Parameters
    ----------
    df : pandas.DataFrame
        Mutated in place; the same object is returned.
    group : str
        ``"rent"`` skips the price-only features.
    """
    # Invalid dates are coerced to NaT
    df["交易时间"] = pd.to_datetime(df["交易时间"], errors = "coerce")
    deal_time = df["交易时间"]

    # Cast to float so the columns can hold NaN and be fed to the models
    df["交易_年"] = deal_time.dt.year.astype("float64")
    df["交易_月"] = deal_time.dt.month.astype("float64")

    if group != "rent":
        df["上次交易"] = pd.to_datetime(df["上次交易"], errors = "coerce")

        # Cyclic encoding of the month
        angle = 2*np.pi*(df["交易_月"]-1)/12
        df["交易_月_sin"] = np.sin(angle)
        df["交易_月_cos"] = np.cos(angle)

        # Holding period in days
        held = df["交易时间"] - df["上次交易"]
        df["持有天数"] = held.dt.days.astype("float64")

    return df

price_train = separate_time(price_train, "price")
price_test = separate_time(price_test, "price")
clean_col_price.extend(["交易时间", "上次交易"])


rent_train = separate_time(rent_train, "rent")
rent_test = separate_time(rent_test, "rent")
clean_col_rent.extend(["交易时间"])

#### 1.1.9 房屋优势

In [None]:
# 房屋优势分词分存为不同01变量
advantage_cat = ["地铁", "装修", "房本满五年", "房本满两年"]
cols = [f"优势_{k}" for k in advantage_cat] # output column names, one per advantage
def separate_advantage(s):
    """Turn a "、"-separated advantage string into 0/1 flags.

    The result has one entry per item of ``advantage_cat``, in that order.
    A missing input gives one NaN per category. Tokens that are not in
    ``advantage_cat`` are ignored.
    """
    if pd.isna(s):
        return [np.nan] * len(advantage_cat)

    # Strip whitespace around each token so that "地铁、 装修" still matches;
    # empty tokens produced by leading/trailing "、" simply match nothing.
    tokens = {part.strip() for part in str(s).split("、")}
    return [1 if name in tokens else 0 for name in advantage_cat]

price_train[cols] = price_train["房屋优势"].apply(separate_advantage).apply(pd.Series)
price_test[cols] = price_test["房屋优势"].apply(separate_advantage).apply(pd.Series)

clean_col_price.append("房屋优势")

In [None]:
price_train[cols].head(5)   # 检查

#### 1.1.10 建筑年代

In [None]:
def separate_build_year(x):
    """Extract a construction year from a cell.

    * exactly two 4-digit numbers ("xxxx-xxxx年") -> their average (float)
    * otherwise, when at least one 4-digit number exists -> the first (int)
    * missing input or no 4-digit number -> NaN
    """
    if pd.isna(x):
        return np.nan
    years = re.findall(r'\d{4}', str(x).strip())
    if not years:
        # Nothing that looks like a year: years[0] would raise IndexError
        return np.nan
    if len(years) == 2:
        return (int(years[0]) + int(years[1])) / 2.0
    return int(years[0])

price_train["建筑年代_num"] = price_train["建筑年代"].apply(separate_build_year)
price_test["建筑年代_num"] = price_test["建筑年代"].apply(separate_build_year)

clean_col_price.append("建筑年代")

rent_train["建筑年代_num"] = rent_train["建筑年代"].apply(separate_build_year)
rent_test["建筑年代_num"] = rent_test["建筑年代"].apply(separate_build_year)

clean_col_rent.append("建筑年代")

#### 1.1.11 物业费、燃气费、供热费、绿化率

In [None]:
cols_12 = ["物 业 费", "燃气费", "供热费", "绿 化 率"]
def separate_fees(x):
    """Extract a numeric fee / rate from a text cell.

    A range written as "a-b" is reduced to its midpoint; otherwise the first
    number in the text is returned. Missing input, or text that contains no
    number at all, gives NaN.
    """
    if pd.isna(x):
        return np.nan
    text = str(x)  # tolerate cells that pandas parsed as numbers

    # Ranges first, so "1.5-2.5" is not read as the single value 1.5
    span = re.search(r"(\d+(?:\.\d+)?)\s*-\s*(\d+(?:\.\d+)?)", text)    # decimals are optional
    if span:
        return (float(span.group(1)) + float(span.group(2))) / 2.0

    single = re.search(r"(\d+(?:\.\d+)?)", text)
    # No digits at all: return NaN rather than failing on None.group
    return float(single.group(1)) if single else np.nan

price_train["物业费(元/月/㎡)"] = price_train["物 业 费"].apply(separate_fees)
price_train["燃气费(元/m³)"] = price_train["燃气费"].apply(separate_fees)
price_train["供热费(元/㎡)"] = price_train["供热费"].apply(separate_fees)
price_train["绿化率_num"] = price_train["绿 化 率"].apply(separate_fees) / 100

price_test["物业费(元/月/㎡)"] = price_test["物 业 费"].apply(separate_fees)
price_test["燃气费(元/m³)"] = price_test["燃气费"].apply(separate_fees)
price_test["供热费(元/㎡)"] = price_test["供热费"].apply(separate_fees)
price_test["绿化率_num"] = price_test["绿 化 率"].apply(separate_fees) / 100

clean_col_price.extend(cols_12)

rent_train["物业费(元/月/㎡)"] = rent_train["物 业 费"].apply(separate_fees)
rent_train["燃气费(元/m³)"] = rent_train["燃气费"].apply(separate_fees)
rent_train["供热费(元/㎡)"] = rent_train["供热费"].apply(separate_fees)
rent_train["绿化率_num"] = rent_train["绿 化 率"].apply(separate_fees) / 100

rent_test["物业费(元/月/㎡)"] = rent_test["物 业 费"].apply(separate_fees)
rent_test["燃气费(元/m³)"] = rent_test["燃气费"].apply(separate_fees)
rent_test["供热费(元/㎡)"] = rent_test["供热费"].apply(separate_fees)
rent_test["绿化率_num"] = rent_test["绿 化 率"].apply(separate_fees) / 100

clean_col_rent.extend(cols_12)

In [None]:
price_train["绿化率_num"].head(6)

#### 1.1.12 停车费用(这个再看看)

In [None]:
# —— 规则（把“暂无”等归为未知；免费单独识别）——
# Phrases meaning "parking is free". The bare "无" carries a negative
# lookbehind for "暂": without it, "无\b" also matches the tail of "暂无"
# and, because free_re is tested before nan_re, "暂无" would be reported
# as free instead of unknown.
free_re = re.compile(r'(免费|不收费|无固定车位不收费|没有停车费|无停车费|地上免费停车|目前免费|(?<!暂)无\b)')
# Phrases meaning "fee unknown"
nan_re  = re.compile(r'(暂无|未知|无法核实|无法获知|不详|待定)')

# Map the various dash / tilde characters to a plain '-'
DASHES = str.maketrans({'–':'-','—':'-','－':'-','~':'-','～':'-'})

def _values_from_span(text: str):
    """Return the numbers in ``text``: each "a-b" range as its midpoint, then every remaining single value."""
    t = re.sub(r'\s+', '', str(text)).translate(DASHES)
    vals = []
    # Ranges a-b -> midpoint
    for a, b in re.findall(r'(\d+(?:\.\d+)?)\s*-\s*(\d+(?:\.\d+)?)', t):
        vals.append((float(a) + float(b)) / 2.0)
    # Blank out the ranges already handled so their endpoints are not counted again
    t = re.sub(r'\d+(?:\.\d+)?\s*-\s*\d+(?:\.\d+)?', ' ', t)
    for v in re.findall(r'(\d+(?:\.\d+)?)', t):
        vals.append(float(v))
    return vals

def _mean_or_nan(vals):
    """Return the mean of ``vals`` as a float, or NaN when ``vals`` is empty."""
    return float(np.mean(vals)) if len(vals) else np.nan

def separate_parking_fees(x):
    """Parse a free-text parking-fee cell.

    Returns
    -------
    tuple
        ``(monthly_fee, hourly_fee, is_free)``

        * missing input or an "unknown" phrase -> ``(nan, nan, 0)``
        * a "free" phrase                      -> ``(0.0, 0.0, 1)``
        * otherwise the mean of all prices quoted per month (prices quoted
          per year are divided by 12 and pooled with them), the mean of all
          prices quoted per hour, and ``0``. Numbers with no recognisable
          unit are treated as monthly prices.
    """
    if pd.isna(x):
        return (np.nan, np.nan, 0)
    s = re.sub(r'\s+', '', str(x)).translate(DASHES)

    # Drop sale-price fragments, which are unrelated to the rental fee
    s = re.sub(r'售价[^，,；;]*', '', s)
    s = re.sub(r'\d+(?:\.\d+)?万[^，,；;]*', '', s)

    # Free / unknown
    if free_re.search(s):
        return (0.0, 0.0, 1)
    if nan_re.search(s):
        return (np.nan, np.nan, 0)

    # Price fragments followed (within 6 non-digit characters) by "月".
    # Several may exist (e.g. above/below ground); they are averaged.
    month_vals = []
    for span in re.findall(r'([\d\.\-–—－~～]+)\D{0,6}月', s):
        month_vals.extend(_values_from_span(span))

    # Yearly prices are converted to monthly (/12) and pooled with the monthly ones
    for span in re.findall(r'([\d\.\-–—－~～]+)\D{0,6}年', s):
        month_vals.extend(v / 12.0 for v in _values_from_span(span))

    # Hourly prices (小时 / 时 / h / H)
    hour_vals = []
    for span in re.findall(r'([\d\.\-–—－~～]+)\D{0,6}(?:小时|时|h|H)', s):
        hour_vals.extend(_values_from_span(span))

    # Numbers without any month/year/hour unit default to a monthly fee
    if not month_vals and not hour_vals:
        month_vals.extend(_values_from_span(s))

    return (_mean_or_nan(month_vals), _mean_or_nan(hour_vals), 0)

# 应用
col_parking = ["月租费用(元/月/位)", "时租费用(元/时/位)", "车位是否免费"]

price_train[col_parking] = price_train["停车费用"].apply(
    lambda x: pd.Series(separate_parking_fees(x), index=col_parking)
)
price_test[col_parking] = price_test["停车费用"].apply(
    lambda x: pd.Series(separate_parking_fees(x), index=col_parking)
)

rent_train[col_parking] = rent_train["停车费用"].apply(
    lambda x: pd.Series(separate_parking_fees(x), index=col_parking)
)
rent_test[col_parking] = rent_test["停车费用"].apply(
    lambda x: pd.Series(separate_parking_fees(x), index=col_parking)
)

clean_col_price.append("停车费用")
clean_col_rent.append("停车费用")

In [None]:
rent_train[col_parking].head(5)

#### 1.1.13 类别变量统一学习清理

##### 少类别：Onehot encoder

In [None]:
onehot_cols_price = ["建筑结构", "装修情况", "交易权属", "房屋用途", "房屋年限", "产权所属", "楼层类型", "配备电梯","城市"]
onehot_cols_rent = ["付款方式", "租赁方式", "电梯", "用水", "用电", "燃气", "采暖","装修", "车位", "城市"]

def onehot_transfer(onehot_cols, df_train, df_test):
    """One-hot encode ``onehot_cols`` and append the dummies to both frames.

    The encoder is fitted on ``df_train`` only and then applied to both
    frames. The first category of every column is dropped and categories
    unseen during fitting are ignored. The source columns are not removed
    here; the returned frames contain them alongside the new dummy columns.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(df_train, df_test)`` with the dummy columns concatenated on the right.
    """
    # Dense output so the result can be wrapped in a DataFrame directly
    encoder = OneHotEncoder(drop='first', handle_unknown='ignore', sparse_output=False)
    encoder.fit(df_train[onehot_cols])
    dummy_names = encoder.get_feature_names_out(onehot_cols)

    def with_dummies(frame):
        encoded = encoder.transform(frame[onehot_cols])
        dummies = pd.DataFrame(encoded, columns=dummy_names, index=frame.index)
        return pd.concat([frame, dummies], axis=1)

    return with_dummies(df_train), with_dummies(df_test)

# 保留城市
clean_col_price.extend(["建筑结构", "装修情况", "交易权属", "房屋用途", "房屋年限", "产权所属", "楼层类型", "配备电梯"])
clean_col_rent.extend(["付款方式", "租赁方式", "电梯", "用水", "用电", "燃气", "采暖","装修", "车位"])

price_train, price_test = onehot_transfer(onehot_cols_price, price_train, price_test)
rent_train, rent_test = onehot_transfer(onehot_cols_rent, rent_train, rent_test)

##### 多类别：Target encoding：将其转化为在目标变量上的平均值

In [None]:
multihot_cols = ["开发商", "物业公司", "板块", "区域"]

encoder = TargetEncoder(cols=multihot_cols)

price_train[multihot_cols] = encoder.fit_transform(price_train[multihot_cols], price_train['Price'])
price_test[multihot_cols] = encoder.transform(price_test[multihot_cols])

clean_col_price.extend(multihot_cols)

In [None]:
multihot_cols = ["开发商", "物业公司", "区县", "板块"]

encoder = TargetEncoder(cols=multihot_cols)

rent_train[multihot_cols] = encoder.fit_transform(rent_train[multihot_cols], rent_train['Price'])
rent_test[multihot_cols] = encoder.transform(rent_test[multihot_cols])

clean_col_rent.extend(multihot_cols)

#### 1.1.14 租期

In [None]:
# 预编译两类最常见的正则
_pat_range  = re.compile(r'(\d+(?:\.\d+)?)\s*[~～\-到至]\s*(\d+(?:\.\d+)?)(?:\s*(年|个?月|月))?')
_pat_single = re.compile(r'(\d+(?:\.\d+)?)\s*(年|个?月|月)')

def to_months(x):
    """Convert a lease-term description into a number of months.

    Tried in order:

    1. a range "a~b" (optionally followed by a unit) -> midpoint; when the
       range itself carries no unit, years are assumed if "年" appears
       anywhere in the text, months otherwise;
    2. a single "<number><unit>";
    3. the word "半年" -> 6.

    Values expressed in years are multiplied by 12. Missing or
    unrecognised input gives NaN.
    """
    if pd.isna(x):
        return np.nan
    text = str(x).strip()

    span = _pat_range.search(text)
    if span is not None:
        low, high, unit = span.groups()
        midpoint = (float(low) + float(high)) / 2
        unit = unit or ('年' if '年' in text else '月')
        if '年' in unit:
            return midpoint*12
        return midpoint

    single = _pat_single.search(text)
    if single is not None:
        amount = float(single.group(1))
        if '年' in single.group(2):
            return amount*12
        return amount

    return 6.0 if '半年' in text else np.nan

# 使用
rent_train['租期_num'] = rent_train['租期'].apply(to_months).astype('float64')
rent_test['租期_num'] = rent_test['租期'].apply(to_months).astype('float64')

clean_col_rent.append("租期")

### 1.2 依赖数据分布的清洗

#### 1.2.1 数据分组

In [None]:
# 设置自变量和因变量
X_price = price_train.drop("Price", axis = 1)
y_price = price_train["Price"]

X_rent = rent_train.drop("Price", axis = 1)
y_rent = rent_train["Price"]
# 分出测试组与验证组
X_train_price, X_val_price, y_train_price, y_val_price = train_test_split(X_price, y_price, test_size=0.2, random_state=111)
X_test_price = price_test

X_train_rent, X_val_rent, y_train_rent, y_val_rent = train_test_split(X_rent, y_rent, test_size=0.2, random_state=111)
X_test_rent = rent_test

#### 1.2.2 使用TF-IDF对非结构化特征进行处理

In [None]:
# 由于数据过多，此处准备采取合并多列使用一个向量器
col_tf_price = ["核心卖点", "户型介绍", "周边配套", "交通出行"]
"""""
texts_train = X_train_price[col_tf_price].fillna("").agg("。".join, axis=1)    # 把四列合为一列
texts_val = X_val_price[col_tf_price].fillna("").agg("。".join, axis=1)    # 把四列合为一列
texts_test= X_test_price[col_tf_price].fillna("").agg("。".join, axis=1)    # 把四列合为一列

def jieba_tokenizer(text):
    return jieba.lcut(text)

tfidf = TfidfVectorizer(
    tokenizer=jieba_tokenizer,   #使用jieba分词
    max_features=300,        # 选取前300个高频词（防止维度太高）
    stop_words=["的", "了", "是", "有", "和"],  # 中文停用词，可自定义
    ngram_range=(1, 2)       # 一元词和二元词（单词+短语）
)

# 保存向量器，
X_tfidf_train = tfidf.fit_transform(texts_train)
X_tfidf_val = tfidf.transform(texts_val)
X_tfidf_test = tfidf.transform(texts_test)
# 存储（TF-IDF为稀疏矩阵）
save_npz("X_tfidf_train.npz", X_tfidf_train)
save_npz("X_tfidf_val.npz", X_tfidf_val)
save_npz("X_tfidf_test.npz", X_tfidf_test)
"""""
clean_col_price.extend(col_tf_price)

In [None]:
"""""
# 读取
X_tfidf_train = load_npz("X_tfidf_train.npz")
X_tfidf_val = load_npz("X_tfidf_val.npz")
X_tfidf_test = load_npz("X_tfidf_test.npz")

# TF-IDF → DataFrame
X_tfidf_train_df = pd.DataFrame(X_tfidf_train.toarray(),columns=tfidf.get_feature_names_out(),index=X_train_price.index)
X_tfidf_val_df = pd.DataFrame(X_tfidf_val.toarray(),columns=tfidf.get_feature_names_out(),index=X_val_price.index)
X_tfidf_test_df  = pd.DataFrame(X_tfidf_test.toarray(),columns=tfidf.get_feature_names_out(),index=X_test_price.index)

# 拼接
X_train_price = pd.concat([X_train_price, X_tfidf_train_df], axis=1)
X_val_price   = pd.concat([X_val_price,   X_tfidf_val_df],   axis=1)
X_test_price  = pd.concat([X_test_price,  X_tfidf_test_df],  axis=1)
"""""

#### 1.2.3 多类别变量：采用MultiLabelBinarizer转换

In [None]:
# 对训练集进行fit和transform，就每一个传进去的col_name进行训练
def MLB_fit_transform(col_name, X_tr, X_va, X_te):
    """Multi-hot encode one multi-valued text column across the three splits.

    Each cell of ``col_name`` is split on "\\", "/" or "、" into labels. A
    ``MultiLabelBinarizer`` is fitted on the training split and applied to
    the validation and test splits. For every split the resulting
    ``{col_name}_{label}`` columns plus a ``{col_name}_missing`` indicator
    (1 where the original cell was missing) are appended on the right.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(X_tr, X_va, X_te)`` with the new columns added; the source column
        is kept.
    """
    mlb = MultiLabelBinarizer()

    def encode(frame, fit=False):
        column = frame[col_name]
        was_missing = column.isna().astype(int)  # remember which cells were missing
        # Missing cells become an empty label list
        labels = column.fillna('').astype(str).apply(lambda x: re.split(r'[\\/、]', x) if x else [])

        matrix = mlb.fit_transform(labels) if fit else mlb.transform(labels)

        label_cols = [f"{col_name}_{c}" for c in mlb.classes_]
        dummies = pd.DataFrame(matrix, columns = label_cols, index = frame.index)
        dummies[f"{col_name}_missing"] = was_missing
        return pd.concat([frame, dummies], axis = 1)

    X_tr = encode(X_tr, fit=True)
    return X_tr, encode(X_va), encode(X_te)


In [None]:
# 对需要的cols进行调用
multi_cols_price = ["物业类别", "建筑结构_comm", "产权描述", "供水", "供暖", "供电"]
for col in multi_cols_price:
    X_train_price, X_val_price, X_test_price = MLB_fit_transform(col, X_train_price, X_val_price, X_test_price)
clean_col_price.extend(multi_cols_price)

multi_cols_rent = ["配套设施", "物业类别", "建筑结构", "产权描述", "供水", "供暖", "供电"]
for col in multi_cols_rent:
    X_train_rent, X_val_rent, X_test_rent = MLB_fit_transform(col, X_train_rent, X_val_rent, X_test_rent)
clean_col_rent.extend(multi_cols_rent)

In [None]:
# 清理掉所有clean_col中包含的列
X_train_price = X_train_price.drop(columns = clean_col_price)
X_val_price = X_val_price.drop(columns = clean_col_price)
X_test_price = X_test_price.drop(columns = clean_col_price)

# 清理掉所有clean_col中包含的列
X_train_rent = X_train_rent.drop(columns = clean_col_rent)
X_val_rent = X_val_rent.drop(columns = clean_col_rent)
X_test_rent = X_test_rent.drop(columns = clean_col_rent)

In [None]:
non_num = X_train_price.select_dtypes(exclude=[np.number]).columns.tolist()
print("仍为非数值的列：", non_num[:50])
# 看看这些列的前几个取值
for c in non_num[:10]:
    print(c, X_train_price[c].dropna().unique()[:5])

## 2 Price Modeling

### 2.1 填充缺失值

In [None]:
X_train = X_train_price
X_val = X_val_price
X_test = X_test_price
y_train = y_train_price
y_val = y_val_price

In [None]:
id_series = X_test["ID"].copy()
X_test_pred = X_test.copy()
X_test_pred = X_test_pred.drop(columns = "ID")  # 把ID一列Drop掉防止干扰回归

In [None]:
# 区分数值列和0-1变量列
def classify_columns(df):
    """Split the columns of ``df`` into binary and non-binary ones.

    A column is binary when its non-missing values are all in {0, 1};
    a column with no non-missing values therefore also counts as binary.

    Returns
    -------
    tuple of list
        ``(bin_cols, num_cols)``, each in the frame's column order.
    """
    bin_cols, num_cols = [], []
    for col in df.columns:
        observed = set(df[col].dropna().unique())
        bucket = bin_cols if observed <= {0, 1} else num_cols
        bucket.append(col)
    return bin_cols, num_cols
bin_cols, num_cols = classify_columns(X_train)

In [None]:
# Column(s) used to stratify the group-wise imputation
group_cols = ["城市"]

# Share of missing values per non-binary column, measured on the training split
na_ratio = X_train[num_cols].isna().mean()

low_na  = na_ratio[na_ratio <= 0.30].index.tolist()               # <=30% missing: filled with the global training mean
mid_na  = na_ratio[(na_ratio > 0.30) & (na_ratio <= 0.70)].index.tolist()  # 30%-70% missing: filled with the per-group mean
high_na = na_ratio[na_ratio > 0.70].index.tolist()                # >70% missing: column is dropped

# Non-binary columns that survive the drop
num_cols_kept = [c for c in num_cols if c not in high_na]

# Work on copies, then drop the columns with more than 70% missing values
X_train_imp0 = X_train.copy()
X_val_imp0   = X_val.copy()
X_test_imp0  = X_test_pred.copy()   # test features without the ID column

X_train_imp0.drop(columns=high_na, inplace=True, errors="ignore")
X_val_imp0.drop(columns=high_na,   inplace=True, errors="ignore")
X_test_imp0.drop(columns=high_na,  inplace=True, errors="ignore")

# Columns with at most 30% missing values: fill with the column mean
if low_na:
    mean_map = X_train[low_na].mean()  # means come from the training split only
    X_train_imp0[low_na] = X_train_imp0[low_na].fillna(mean_map)
    X_val_imp0[low_na]   = X_val_imp0[low_na].fillna(mean_map)
    X_test_imp0[low_na]  = X_test_imp0[low_na].fillna(mean_map)

# Fill columns with 30%-70% missing values using group-wise means computed on the training split
def fill_by_group_mean(df_to_fill, df_ref, cols, group_cols):   # df_ref is the training split
    """Fill missing values in ``cols`` with group means taken from ``df_ref``.

    For every column in ``cols`` a missing value is replaced by the mean of
    that column within the row's ``group_cols`` group in ``df_ref``; values
    still missing afterwards (group absent from ``df_ref`` or group mean
    undefined) receive the overall ``df_ref`` mean. With an empty
    ``group_cols`` only the overall mean is used.

    Returns
    -------
    pandas.DataFrame
        A filled copy of ``df_to_fill``; when ``cols`` is empty the input
        object itself is returned unchanged.
    """
    if not cols:    # nothing to impute
        return df_to_fill

    filled = df_to_fill.copy()

    if not group_cols:
        # No grouping requested: plain global-mean imputation
        for c in cols:
            filled[c] = filled[c].fillna(df_ref[c].mean())
        return filled

    # One row of means per group (NaN group keys form their own group)
    group_means = df_ref[group_cols + cols].groupby(group_cols, dropna=False).mean()
    # Left join keeps the row order of `filled`, so the result aligns by position
    lookup = filled[group_cols].merge(group_means.reset_index(), on=group_cols, how="left")

    for c in cols:
        per_row_mean = lookup[c].values
        filled[c] = filled[c].where(filled[c].notna(), per_row_mean)
        filled[c] = filled[c].fillna(df_ref[c].mean())    # fall back to the global mean
    return filled
# Group-mean imputation for the mid-missingness columns; the reference is always the training split
X_train_imp0 = fill_by_group_mean(X_train_imp0, X_train, mid_na, group_cols)
X_val_imp0   = fill_by_group_mean(X_val_imp0,   X_train, mid_na, group_cols)
X_test_imp0  = fill_by_group_mean(X_test_imp0,  X_train, mid_na, group_cols)

# Binary columns are imputed with their most frequent value
# NOTE(review): bin_keep is only assigned inside this `if`; later cells that
# read it assume bin_cols is non-empty - confirm.
if bin_cols:
    # Share of missing values per binary column (training split)
    bin_na_ratio = X_train[bin_cols].isna().mean()
    # Binary columns with more than 70% missing values are dropped, the rest are kept
    bin_high_na = bin_na_ratio[bin_na_ratio > 0.70].index.tolist()
    bin_keep    = [c for c in bin_cols if c not in bin_high_na]
    print(f"删除缺失>70%的二值列: {bin_high_na}")

    # Drop those columns from all three splits
    X_train_imp0.drop(columns=bin_high_na, inplace=True, errors="ignore")
    X_val_imp0.drop(columns=bin_high_na,   inplace=True, errors="ignore")
    X_test_imp0.drop(columns=bin_high_na,  inplace=True, errors="ignore")

    # Mode imputation for the remaining binary columns (fitted on the training split)
    if bin_keep:
        mode_imputer = SimpleImputer(strategy="most_frequent")
        X_train_imp0[bin_keep] = mode_imputer.fit_transform(X_train_imp0[bin_keep])
        X_val_imp0[bin_keep]   = mode_imputer.transform(X_val_imp0[bin_keep])
        X_test_imp0[bin_keep]  = mode_imputer.transform(X_test_imp0[bin_keep])

In [None]:
# 将price取log

y_train_log = np.log1p(y_train) # 取log(1+y_train)，防止价格=0的时候报错
y_val_log = np.log1p(y_val)

# 数值型的填补pipeline
num_pipeline = Pipeline([
    ("scaler", StandardScaler())
])

bin_passthrough = "passthrough"

preprocessor = ColumnTransformer([
    ("num", num_pipeline, num_cols_kept),
    ("bin", bin_passthrough, bin_keep)
], remainder="drop")


# 拟合/变换
X_train_imp = preprocessor.fit_transform(X_train_imp0)
X_val_imp   = preprocessor.transform(X_val_imp0)
X_test_imp  = preprocessor.transform(X_test_imp0)

out_cols = preprocessor.get_feature_names_out()

X_train_prep = pd.DataFrame(X_train_imp, columns=out_cols, index=X_train.index)
X_val_prep   = pd.DataFrame(X_val_imp,   columns=out_cols, index=X_val.index)
X_test_prep  = pd.DataFrame(X_test_imp,  columns=out_cols, index=X_test_pred.index)

num_out_cols = [c for c in out_cols if c.startswith("num__")]
bin_out_cols = [c for c in out_cols if c.startswith("bin__")]

In [None]:
print(len(X_train_prep.columns))

### 2.2 选择需要使用的变量

In [None]:
# 生成数值型变量的二次项和交叉项
def square_interaction(df, num_out_cols = num_out_cols, bin_out_cols = bin_out_cols):
    """Append squared terms and pairwise products of the numeric columns.

    Also adds the hand-picked interaction ``南北通透`` (south-facing flag
    times north-facing flag). That column is written into ``df`` itself,
    so the caller's frame gains it as a side effect.

    Parameters
    ----------
    df : pandas.DataFrame
        Pre-processed features; modified in place (see above).
    num_out_cols : list of str
        Numeric columns to square and to cross pairwise.
    bin_out_cols : list of str
        Accepted but not used by this function.

    Returns
    -------
    pandas.DataFrame
        ``df`` (including ``南北通透``), the ``*_square`` columns, and the
        ``a*b`` product columns, concatenated in that order.
    """
    # Squared terms
    squares = df[num_out_cols] ** 2
    squares.columns = [f"{c}_square" for c in num_out_cols]

    # Pairwise products of every unordered pair of numeric columns
    products = {}
    for left, right in combinations(num_out_cols, 2):
        products[f"{left}*{right}"] = df[left].values * df[right].values
    crossed = pd.DataFrame(products, index = df.index)

    # Hand-picked interaction
    df["南北通透"] = df["bin__朝向_南"] * df["bin__朝向_北"]

    return pd.concat([df, squares, crossed], axis=1)

X_train_si = square_interaction(X_train_prep)
X_val_si = square_interaction(X_val_prep)
X_test_si = square_interaction(X_test_prep)

In [None]:
print(len(X_train_si.columns))

In [None]:
# Candidate features: every column of the expanded training matrix
pearson_cols = X_train_si.columns

# Pearson correlation of each feature with the log target, sorted by absolute value (descending)
corr_df = (
    X_train_si[pearson_cols].corrwith(y_train_log, method = "pearson").to_frame("pearson")
    .reset_index().rename(columns={"index": "feature"}).assign(abs_pearson=lambda d: d["pearson"].abs())
    .sort_values("abs_pearson", ascending=False)
)
print(corr_df.head(5))

# Selection threshold on |pearson|
corr_threhold = 0.1
preselected_cols = corr_df.loc[corr_df["abs_pearson"] >= corr_threhold, "feature"]
# Prints how many features passed the threshold
print(f"保留样本{len(preselected_cols)}个")

# Keep only the pre-selected columns in every split
X_train_final = X_train_si.loc[:, preselected_cols]
X_val_final = X_val_si.loc[:, preselected_cols]
X_test_final = X_test_si.loc[:, preselected_cols]

### 2.3 Modeling (记得还原对数！！！)

In [None]:
# 对y取对数（提升稳定性）
y_train_log = np.log1p(y_train)
y_val_log   = np.log1p(y_val)


# 定义模型
ols   = LinearRegression()
ridge = Ridge(alpha=0.015, random_state=111)
lasso = Lasso(alpha=0.00005, max_iter=500000, random_state=111)
enet  = ElasticNet(alpha=0.00005, l1_ratio=0.0005, max_iter=500000, random_state=111)

models = {'OLS': ols, 'Ridge': ridge, 'Lasso': lasso, 'ElasticNet': enet}

def safe_transfer(log_pred: np.ndarray) -> np.ndarray:
    """Map log1p-scale predictions back to the original scale.

    The input is cast to float64 before ``expm1`` is applied.
    """
    return np.expm1(log_pred.astype(np.float64))

rows = []
pred_dict = {}

# 交叉验证mae的打分器
def mae_on_original_scale(y_true_log, y_pred_log):
    """Mean absolute error after undoing the log1p transform on both inputs."""
    actual, predicted = np.expm1(y_true_log), np.expm1(y_pred_log)
    return mean_absolute_error(actual, predicted)

# Metric function behind the cross-validation relative-MAE scorer.
def rmae_on_original_scale(y_true_log, y_pred_log):
    """Return MAE divided by the mean true value, both on the expm1 scale."""
    actual = np.expm1(y_true_log)
    predicted = np.expm1(y_pred_log)
    return mean_absolute_error(actual, predicted) / np.mean(actual)

# Wrap the metric functions as scorers. greater_is_better=False makes
# scikit-learn negate the value, so the CV scores below come back negative.
mae_scorer = make_scorer(mae_on_original_scale, greater_is_better=False)
rmae_scorer = make_scorer(rmae_on_original_scale, greater_is_better=False)

# Six-fold cross-validation; rows are shuffled with a fixed seed before splitting.
cv = KFold(n_splits=6, shuffle=True, random_state=111)

for name, model in tqdm(models.items(), desc="Training models", ncols=100):

    model.fit(X_train_final, y_train_log)
    # Predictions are produced on the log scale and mapped back before scoring.
    y_train_pred = safe_transfer(model.predict(X_train_final))    # in-sample
    y_val_pred = safe_transfer(model.predict(X_val_final))        # out-of-sample
    y_test_pred = safe_transfer(model.predict(X_test_final))      # test set
    # MAE
    mae_train = mean_absolute_error(y_train, y_train_pred)
    mae_val = mean_absolute_error(y_val, y_val_pred)
    # Relative MAE = MAE / mean of the true values
    rmae_train = mae_train / np.mean(y_train)
    rmae_val = mae_val / np.mean(y_val)
    # R squared
    r2_train = r2_score(y_train, y_train_pred)
    r2_val = r2_score(y_val, y_val_pred)

    # Evaluate both scorers in a single cross-validation pass so each fold
    # is fitted once instead of once per metric. The scorers themselves map
    # the log-scale values back before computing the error.
    cv_results = cross_validate(
        clone(model),   # unfitted copy of the estimator
        X_train_final,
        y_train_log,
        scoring={"mae": mae_scorer, "rmae": rmae_scorer},
        cv=cv,
        n_jobs=-1,      # use all available CPUs
    )
    cv_mae_scores = cv_results["test_mae"]
    cv_rmae_scores = cv_results["test_rmae"]

    # Scores are negated by the scorer; flip the sign back.
    cv_mae = -cv_mae_scores.mean()
    cv_rmae = -cv_rmae_scores.mean()

    rows.append({
        "Metrics": name,
        "In sample (mae)": mae_train,
        "In sample (rmae)": rmae_train,
        "In sample (R2)": r2_train,
        "Out of sample (mae)": mae_val,
        "Out of sample (rmae)": rmae_val,
        "Out of sample (R2)": r2_val,
        "Cross-Validation (mae)": cv_mae,
        "Cross-Validation (rmae)": cv_rmae
    })
    pred_dict[f"{name}_pred"] = y_test_pred


# Collect the per-model metrics into one table.
report_df = pd.DataFrame(rows)
print(report_df)

In [None]:
pd.set_option('display.float_format', '{:.6f}'.format)   # show six decimal places
# Per-model test predictions side by side with the test-set ID column.
# (The frame is built once; an earlier duplicate construction was overwritten
# immediately and has been dropped.)
pred_df = pd.concat([pd.DataFrame(pred_dict, index=X_test.index), X_test[["ID"]]], axis=1)
pred_df.head(5)

In [None]:
# Write one submission workbook per model.
model_pred = ["OLS_pred", "Ridge_pred", "Lasso_pred", "ElasticNet_pred"]
for pre in model_pred:
    # Keep only the ID and this model's predictions, exposed as "Price".
    pred_df_renamed = pred_df[["ID", pre]].rename(columns={pre: "Price"})
    pred_df_renamed.to_excel(f"output_price_{pre}.xlsx", index=False)

## 3 Rent Modeling

### 3.1 填充缺失值并标准化

In [None]:
# Point the generic split names at the rent data for the rest of this section.
X_train, X_val, X_test = X_train_rent, X_val_rent, X_test_rent
y_train, y_val = y_train_rent, y_val_rent

In [None]:
# Keep the IDs aside and build a test frame without the ID column so it
# does not enter the regression.
id_series = X_test["ID"].copy()
X_test_pred = X_test.drop(columns="ID")

In [None]:
# Separate 0/1 indicator columns from all other columns.
def classify_columns(df):
    """Split the columns of ``df`` into binary and non-binary lists.

    A column counts as binary when its non-missing values are a subset of
    {0, 1}; a column with no non-missing values therefore also lands in the
    binary list. Returns ``(bin_cols, num_cols)`` in column order.
    """
    binary_values = {0, 1}
    bin_cols, num_cols = [], []
    for col in df.columns:
        observed = set(df[col].dropna().unique())
        bucket = bin_cols if observed <= binary_values else num_cols
        bucket.append(col)
    return bin_cols, num_cols
# Classify the training columns; the resulting lists drive the imputation below.
bin_cols, num_cols = classify_columns(X_train)

In [None]:
# Report non-binary columns whose dtype is not numeric. The check runs over
# num_cols (assigned just above) because low_na is only assigned in the
# following cell and is therefore undefined or stale at this point.
bad_cols = [c for c in num_cols if not pd.api.types.is_numeric_dtype(X_train[c])]
print("非数值列：", bad_cols)

In [None]:
# Column(s) used for the group-wise means.
group_cols = ["城市"]

# Share of missing values per non-binary column, measured on the training set.
na_ratio = X_train[num_cols].isna().mean()

# Bucket the columns by missing rate:
#   <= 30%      -> fill with the global training mean
#   30% - 70%   -> fill with the group mean
#   > 70%       -> drop the column
low_na = [col for col, ratio in na_ratio.items() if ratio <= 0.30]
mid_na = [col for col, ratio in na_ratio.items() if (ratio > 0.30) and (ratio <= 0.70)]
high_na = [col for col, ratio in na_ratio.items() if ratio > 0.70]

# Non-binary columns that survive the drop.
num_cols_kept = [c for c in num_cols if c not in high_na]

# Work on copies that no longer contain the > 70% missing columns.
X_train_imp0 = X_train.drop(columns=high_na, errors="ignore")
X_val_imp0 = X_val.drop(columns=high_na, errors="ignore")
X_test_imp0 = X_test_pred.drop(columns=high_na, errors="ignore")

# Fill the <= 30% missing columns with means computed on the training set only.
if low_na:
    mean_map = X_train[low_na].mean()
    for frame in (X_train_imp0, X_val_imp0, X_test_imp0):
        frame[low_na] = frame[low_na].fillna(mean_map)

# Fill the 30%-70% missing columns with group-wise means.
def fill_by_group_mean(df_to_fill, df_ref, cols, group_cols):
    """Fill missing values in ``cols`` using group means taken from ``df_ref``.

    Parameters
    ----------
    df_to_fill : DataFrame whose missing values are filled; it is not modified.
    df_ref : DataFrame the means are computed from (the training set at the
        call sites).
    cols : list of column names to fill. When empty, ``df_to_fill`` itself
        is returned unchanged.
    group_cols : list of grouping column names. When empty, only the global
        mean of ``df_ref`` is used.

    Returns
    -------
    A filled copy of ``df_to_fill``. Values still missing after the group
    lookup (group absent from ``df_ref`` or without data) get the global
    mean of the column in ``df_ref``.
    """
    if not cols:    # nothing falls into this missing-rate bucket
        return df_to_fill
    out = df_to_fill.copy()

    if group_cols:
        grp_mean = df_ref[group_cols + cols].groupby(group_cols, dropna=False).mean()
        # Left-join the group means onto the rows once; the result is in the
        # same row order as ``out``, so it is applied positionally below.
        mapped = out[group_cols].merge(
            grp_mean.reset_index(),
            on=group_cols,
            how="left"
        )
        for c in cols:
            out[c] = out[c].where(out[c].notna(), mapped[c].values)

    # Global-mean fallback (and the only step when no grouping is requested).
    for c in cols:
        out[c] = out[c].fillna(df_ref[c].mean())
    return out
# Group-mean fill for the 30%-70% missing columns; means always come from
# the training set.
X_train_imp0 = fill_by_group_mean(X_train_imp0, X_train, mid_na, group_cols)
X_val_imp0   = fill_by_group_mean(X_val_imp0,   X_train, mid_na, group_cols)
X_test_imp0  = fill_by_group_mean(X_test_imp0,  X_train, mid_na, group_cols)

# Binary columns are filled with the most frequent value.
# Bind both lists up front: bin_keep is read by the preprocessing step
# even when there are no binary columns at all.
bin_high_na = []
bin_keep = []
if bin_cols:
    # Missing rate per binary column on the training set.
    bin_na_ratio = X_train[bin_cols].isna().mean()
    # Columns above 70% missing are dropped, the rest are kept.
    bin_high_na = bin_na_ratio[bin_na_ratio > 0.70].index.tolist()
    bin_keep    = [c for c in bin_cols if c not in bin_high_na]
    print(f"删除缺失>70%的二值列: {bin_high_na}")

    # Drop those columns from every split.
    X_train_imp0.drop(columns=bin_high_na, inplace=True, errors="ignore")
    X_val_imp0.drop(columns=bin_high_na,   inplace=True, errors="ignore")
    X_test_imp0.drop(columns=bin_high_na,  inplace=True, errors="ignore")

    # Mode imputation, fitted on the training split only.
    if bin_keep:
        mode_imputer = SimpleImputer(strategy="most_frequent")
        X_train_imp0[bin_keep] = mode_imputer.fit_transform(X_train_imp0[bin_keep])
        X_val_imp0[bin_keep]   = mode_imputer.transform(X_val_imp0[bin_keep])
        X_test_imp0[bin_keep]  = mode_imputer.transform(X_test_imp0[bin_keep])

In [None]:
# Log-transform the target; log1p(y) stays finite when a target value is 0.
y_train_log, y_val_log = np.log1p(y_train), np.log1p(y_val)

# Numeric columns are standardised; missing values were already filled above.
num_pipeline = Pipeline(steps=[("scaler", StandardScaler())])

bin_passthrough = "passthrough"

# Scale the kept numeric columns, pass the kept binary columns through
# unchanged, and drop everything else.
preprocessor = ColumnTransformer(
    transformers=[
        ("num", num_pipeline, num_cols_kept),
        ("bin", bin_passthrough, bin_keep),
    ],
    remainder="drop",
)


# Fit on the training split, then apply the fitted transform to the others.
X_train_imp = preprocessor.fit_transform(X_train_imp0)
X_val_imp = preprocessor.transform(X_val_imp0)
X_test_imp = preprocessor.transform(X_test_imp0)

out_cols = preprocessor.get_feature_names_out()

# Rebuild labelled frames that keep the row index of their source split.
X_train_prep, X_val_prep, X_test_prep = (
    pd.DataFrame(matrix, columns=out_cols, index=source.index)
    for matrix, source in (
        (X_train_imp, X_train),
        (X_val_imp, X_val),
        (X_test_imp, X_test_pred),
    )
)

# Output names carry the transformer name as a prefix.
num_out_cols = [c for c in out_cols if c.startswith("num__")]
bin_out_cols = [c for c in out_cols if c.startswith("bin__")]

### 3.2 选择需要使用的变量

In [None]:
# Build squared and pairwise-interaction terms for the numeric columns.
def square_interaction(df, num_out_cols = num_out_cols, bin_out_cols = bin_out_cols):
    """Return ``df`` extended with squared, pairwise and one hand-picked feature.

    Parameters
    ----------
    df : DataFrame holding the columns in ``num_out_cols`` plus
        ``bin__朝向_南`` and ``bin__朝向_北``. It is left unmodified.
    num_out_cols : numeric column names; each gets a ``<name>_square``
        column and every unordered pair gets an ``<a>*<b>`` product column.
        The default is bound when the function is defined.
    bin_out_cols : accepted for interface compatibility; not used.

    Returns
    -------
    A new DataFrame: the input columns, the ``南北通透`` indicator (product
    of the two orientation columns), the squares, then the pairwise products.
    """
    # Squared terms.
    df_square = df[num_out_cols] ** 2
    df_square.columns = [f"{c}_square" for c in num_out_cols]
    # Pairwise products of the numeric columns.
    inter_df = pd.DataFrame(
        {f"{a}*{b}": df[a].values * df[b].values for a, b in combinations(num_out_cols, 2)},
        index = df.index
    )
    # Hand-picked interaction. Work on a copy so the caller's frame does not
    # silently gain a column.
    base = df.copy()
    base["南北通透"] = base["bin__朝向_南"] * base["bin__朝向_北"]

    out = pd.concat([base, df_square, inter_df], axis=1)    # assemble the wide table
    return out

# Expand every split with the squared and interaction features, in the
# order train, validation, test.
X_train_si, X_val_si, X_test_si = (
    square_interaction(frame)
    for frame in (X_train_prep, X_val_prep, X_test_prep)
)

In [None]:
# Number of feature columns after the expansion.
print(X_train_si.shape[1])

In [None]:
pearson_cols = X_train_si.columns

# Pearson correlation of every feature with the log target, ranked by
# absolute value (strongest first).
pearson_by_feature = X_train_si[pearson_cols].corrwith(y_train_log, method="pearson")
corr_df = pearson_by_feature.to_frame("pearson").reset_index()
corr_df = corr_df.rename(columns={"index": "feature"})
corr_df["abs_pearson"] = corr_df["pearson"].abs()
corr_df = corr_df.sort_values("abs_pearson", ascending=False)
print(corr_df.head(5))

# Keep only the features whose absolute correlation reaches the threshold.
corr_threhold = 0.08
strong_enough = corr_df["abs_pearson"] >= corr_threhold
preselected_cols = corr_df.loc[strong_enough, "feature"]
print(f"保留样本{len(preselected_cols)}个")

# Restrict all three splits to the preselected columns.
X_train_final, X_val_final, X_test_final = (
    frame.loc[:, preselected_cols]
    for frame in (X_train_si, X_val_si, X_test_si)
)

### 3.3 Modeling (记得还原对数！！！)

In [None]:
# Fit on the log1p-transformed target.
y_train_log, y_val_log = np.log1p(y_train), np.log1p(y_val)


# Candidate linear models with fixed regularisation settings.
ols = LinearRegression()
ridge = Ridge(
    alpha=0.015,
    random_state=111,
)
lasso = Lasso(
    alpha=0.00005,
    max_iter=500000,
    random_state=111,
)
enet = ElasticNet(
    alpha=0.00005,
    l1_ratio=0.0001,
    max_iter=700000,
    random_state=111,
)

# Display name -> estimator; iterated in this order by the training loop.
models = dict(OLS=ols, Ridge=ridge, Lasso=lasso, ElasticNet=enet)

def safe_transfer(log_pred: np.ndarray) -> np.ndarray:
    """Undo the log1p transform on an array of predictions.

    The input is cast to float64 and passed through ``np.expm1``.
    """
    return np.expm1(log_pred.astype(np.float64))

# One metrics row per model, and the test-set predictions keyed by model name.
rows, pred_dict = [], {}

# Metric function behind the cross-validation MAE scorer.
def mae_on_original_scale(y_true_log, y_pred_log):
    """Return the MAE after mapping both log1p-scale inputs back with expm1."""
    return mean_absolute_error(np.expm1(y_true_log), np.expm1(y_pred_log))

# Metric function behind the cross-validation relative-MAE scorer.
def rmae_on_original_scale(y_true_log, y_pred_log):
    """Return MAE divided by the mean true value, both on the expm1 scale."""
    actual = np.expm1(y_true_log)
    predicted = np.expm1(y_pred_log)
    return mean_absolute_error(actual, predicted) / np.mean(actual)

# Wrap the metric functions as scorers. greater_is_better=False makes
# scikit-learn negate the value, so the CV scores below come back negative.
mae_scorer = make_scorer(mae_on_original_scale, greater_is_better=False)
rmae_scorer = make_scorer(rmae_on_original_scale, greater_is_better=False)

# Six-fold cross-validation; rows are shuffled with a fixed seed before splitting.
cv = KFold(n_splits=6, shuffle=True, random_state=111)

for name, model in tqdm(models.items(), desc="Training models", ncols=100):

    model.fit(X_train_final, y_train_log)
    # Predictions are produced on the log scale and mapped back before scoring.
    y_train_pred = safe_transfer(model.predict(X_train_final))    # in-sample
    y_val_pred = safe_transfer(model.predict(X_val_final))        # out-of-sample
    y_test_pred = safe_transfer(model.predict(X_test_final))      # test set
    # MAE
    mae_train = mean_absolute_error(y_train, y_train_pred)
    mae_val = mean_absolute_error(y_val, y_val_pred)
    # Relative MAE = MAE / mean of the true values
    rmae_train = mae_train / np.mean(y_train)
    rmae_val = mae_val / np.mean(y_val)
    # R squared
    r2_train = r2_score(y_train, y_train_pred)
    r2_val = r2_score(y_val, y_val_pred)

    # Evaluate both scorers in a single cross-validation pass so each fold
    # is fitted once instead of once per metric. The scorers themselves map
    # the log-scale values back before computing the error.
    cv_results = cross_validate(
        clone(model),   # unfitted copy of the estimator
        X_train_final,
        y_train_log,
        scoring={"mae": mae_scorer, "rmae": rmae_scorer},
        cv=cv,
        n_jobs=-1,      # use all available CPUs
    )
    cv_mae_scores = cv_results["test_mae"]
    cv_rmae_scores = cv_results["test_rmae"]

    # Scores are negated by the scorer; flip the sign back.
    cv_mae = -cv_mae_scores.mean()
    cv_rmae = -cv_rmae_scores.mean()

    rows.append({
        "Metrics": name,
        "In sample (mae)": mae_train,
        "In sample (rmae)": rmae_train,
        "In sample (R2)": r2_train,
        "Out of sample (mae)": mae_val,
        "Out of sample (rmae)": rmae_val,
        "Out of sample (R2)": r2_val,
        "Cross-Validation (mae)": cv_mae,
        "Cross-Validation (rmae)": cv_rmae
    })
    pred_dict[f"{name}_pred"] = y_test_pred


# Collect the per-model metrics into one table.
report_df = pd.DataFrame(rows)
print(report_df)

In [None]:
# Print the regularisation settings of each model, to help with tuning.
def _hyperparameter(model, name):
    """Return ``model.<name>_`` when that attribute exists, else ``model.<name>``.

    The models defined in this section are plain Ridge/Lasso/ElasticNet
    instances, which only carry the constructor value (``alpha``,
    ``l1_ratio``); the trailing-underscore form is looked up first so the
    cell also works with estimators that expose a selected value there.
    """
    selected = f"{name}_"
    if hasattr(model, selected):
        return getattr(model, selected)
    return getattr(model, name)

print("Best alpha for Ridge:", _hyperparameter(ridge, "alpha"))
print("Best alpha for Lasso:", _hyperparameter(lasso, "alpha"))
print("Best alpha for ElasticNet:", _hyperparameter(enet, "alpha"))
print("Best l1_ratio for ElasticNet:", _hyperparameter(enet, "l1_ratio"))

In [None]:
pd.set_option('display.float_format', '{:.6f}'.format)   # show six decimal places
# Per-model test predictions side by side with the test-set ID column.
# (The frame is built once; an earlier duplicate construction was overwritten
# immediately and has been dropped.)
pred_df = pd.concat([pd.DataFrame(pred_dict, index=X_test.index), X_test[["ID"]]], axis=1)
pred_df.head(5)

In [None]:
# Write one rent workbook per model, then stack it under the matching price
# workbook to produce the submission CSV.
model_pred = ["OLS_pred", "Ridge_pred", "Lasso_pred", "ElasticNet_pred"]
for pre in model_pred:
    price_path = f"output_price_{pre}.xlsx"
    rent_path = f"output_rent_{pre}.xlsx"

    # Keep only the ID and this model's predictions, exposed as "Price".
    pred_df_renamed = pred_df[["ID", pre]].rename(columns={pre: "Price"})
    pred_df_renamed.to_excel(rent_path, index=False)

    # Read both workbooks back from disk; price rows come first.
    df1 = pd.read_excel(price_path, engine="openpyxl")
    df2 = pd.read_excel(rent_path, engine="openpyxl")

    df_combined = pd.concat([df1, df2], ignore_index=True)
    df_combined.to_csv(f"submit_{pre}.csv", index=False)