# 文本情感分析

In [None]:
from tqdm import tqdm
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch
import numpy as np

def compute_sentiment_scores(X, batch_size=32, device='cuda'):
    """
    Score the sentiment of Chinese customer-feedback texts.

    Parameters
    ----------
    X : pandas.DataFrame
        Frame whose first column holds the feedback texts; missing values
        are treated as empty strings.
    batch_size : int
        Number of rows examined per model forward pass.
    device : str
        Torch device to run on. When a CUDA device is requested but CUDA is
        not available, the computation falls back to the CPU.

    Returns
    -------
    numpy.ndarray
        Array of shape (len(X), 1). Each entry is the softmax probability of
        class 1 minus that of class 0 (treated here as positive minus
        negative), so it lies in [-1, 1]. Empty or whitespace-only texts
        always get 0.0.
    """
    if str(device).startswith("cuda") and not torch.cuda.is_available():
        device = "cpu"

    model_name = "uer/roberta-base-finetuned-jd-binary-chinese"
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForSequenceClassification.from_pretrained(model_name)

    model.to(device)
    model.eval()

    texts = X.iloc[:, 0].fillna("")
    # Every row starts at the neutral score; only non-empty texts are sent to
    # the model, so an empty text is never scored even inside a mixed batch.
    scores = np.zeros(len(texts), dtype=float)
    has_text = texts.str.strip().ne("").to_numpy()

    for start_idx in tqdm(range(0, len(texts), batch_size), desc="计算情感得分"):
        stop_idx = min(start_idx + batch_size, len(texts))
        positions = start_idx + np.flatnonzero(has_text[start_idx:stop_idx])
        if positions.size == 0:
            continue

        # The tokenizer expects a plain list of strings.
        batch_texts = texts.iloc[positions].tolist()
        inputs = tokenizer(batch_texts, return_tensors="pt", truncation=True, padding=True).to(device)

        with torch.no_grad():
            outputs = model(**inputs)
            probs = torch.softmax(outputs.logits, dim=1)

        # Sentiment score per text: P(class 1) - P(class 0).
        scores[positions] = (probs[:, 1] - probs[:, 0]).cpu().numpy()

    return scores.reshape(-1, 1)

In [None]:
import pandas as pd

# Load the raw train/test tables for the price task and the rent task.
test_price = pd.read_csv("ruc_Class25Q2_test_price.csv")
test_rent = pd.read_csv("ruc_Class25Q2_test_rent.csv")
train_price = pd.read_csv("ruc_Class25Q2_train_price.csv")
train_rent = pd.read_csv("ruc_Class25Q2_train_rent.csv")

In [None]:
# Overwrite the raw feedback text column of each table with its sentiment score.
train_price["客户反馈"] = compute_sentiment_scores(train_price[["客户反馈"]])
train_rent["客户反馈"] = compute_sentiment_scores(train_rent[["客户反馈"]])
test_price["客户反馈"] = compute_sentiment_scores(test_price[["客户反馈"]])
test_rent["客户反馈"] = compute_sentiment_scores(test_rent[["客户反馈"]])
# Save the processed tables (without the index column) for later reuse.
train_price.to_csv("processed_train_price.csv", index=False)
train_rent.to_csv("processed_train_rent.csv", index=False)
test_price.to_csv("processed_test_price.csv", index=False)
test_rent.to_csv("processed_test_rent.csv", index=False)

In [None]:
# Reload the tables whose feedback column already holds sentiment scores.
test_price = pd.read_csv("processed_test_price.csv")
test_rent = pd.read_csv("processed_test_rent.csv")
train_price = pd.read_csv("processed_train_price.csv")
train_rent = pd.read_csv("processed_train_rent.csv")

# 房价

In [None]:
import re
import numpy as np
from cn2an import cn2an
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer, TransformedTargetRegressor
from sklearn.preprocessing import OneHotEncoder, StandardScaler, FunctionTransformer
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LinearRegression, Lasso, Ridge
from sklearn.metrics import mean_absolute_error, r2_score
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.cluster import KMeans

In [None]:
def fill_ring_road_null(X):
    """Return the first column of X as strings, with missing values replaced by "无环线"."""
    first_column = X.iloc[:, [0]]
    return first_column.fillna("无环线").astype(str)

# Wrap the null-filling function so it can be used as a pipeline step.
ring_road_transformer = FunctionTransformer(fill_ring_road_null, validate=False)

# Fill missing ring-road labels, then one-hot encode them (first level dropped).
ring_road_pipeline = Pipeline([
    ("ring_road_transformer", ring_road_transformer),
    ("OneHot", OneHotEncoder(drop="first", handle_unknown="ignore"))
])

In [None]:
def parse_house_type(X):
    """
    Split a layout string from the first column of X into four integer counts.

    The digits preceding 室, 厅, 厨 and 卫 become the columns 室数, 厅数, 厨数
    and 卫数; a part that is absent (or a missing value) counts as 0.
    """
    layout = X.iloc[:, 0].fillna("")
    patterns = {
        "室数": r'(\d+)室',
        "厅数": r'(\d+)厅',
        "厨数": r'(\d+)厨',
        "卫数": r'(\d+)卫'
    }
    counts = {
        name: layout.str.extract(regex, expand=False).fillna(0).astype(int)
        for name, regex in patterns.items()
    }
    return pd.DataFrame(counts, index=X.index)

# Wrap the layout parser so it can be used as a pipeline step.
house_type_transformer = FunctionTransformer(parse_house_type, validate=False)

# Parse the room counts, standardize them, then impute with the most frequent value.
house_type_pipeline = Pipeline(steps=[
    ("parse", house_type_transformer),
    ("standardize", StandardScaler()),
    ("impute", SimpleImputer(strategy='most_frequent'))
])


In [None]:
def floor_calculator(X):
    """
    Parse floor strings shaped like "<position> (共<N>层)" from the first column.

    Returns a DataFrame with 楼层位置 (the position label mapped to a score
    between 0 and 1, NaN when the label is unknown or the string does not
    match) and 总层数 (the total number of floors as nullable integers).
    """
    raw = X.iloc[:, 0].fillna("")
    parts = raw.str.extract(r'^(.+?)\s*[（(]\s*共\s*(\d+)\s*层\s*[）)]$')

    # Position label -> relative height score.
    position_scores = {
        "地下室": 0,
        "底层": 0.2,
        "低楼层": 0.4,
        "中楼层": 0.6,
        "高楼层": 0.8,
        "顶层": 1.0,
    }
    total = parts[1].astype(float, errors='ignore').astype('Int64', errors='ignore')

    return pd.DataFrame({
        "楼层位置": parts[0].map(position_scores),
        "总层数": total
    })

# Wrap the floor parser so it can be used as a pipeline step.
floor_transformer = FunctionTransformer(floor_calculator, validate=False)

# Parse floor position and total floors, standardize, then mean-impute.
floor_pipeline = Pipeline(steps=[
    ("parse", floor_transformer),
    ("standardize", StandardScaler()),
    ("impute", SimpleImputer(strategy='mean'))
])


In [None]:
def extract_area(X):
    """Return the first number found in each value of X's first column as a one-column float frame."""
    text = X.iloc[:, 0].fillna("")
    matches = text.str.extract(r'(\d+\.?\d*)')
    # The pattern has a single group, so `matches` has exactly one column (0).
    return matches[[0]].astype(float)

# Wrap the area parser so it can be used as a pipeline step.
area_transformer = FunctionTransformer(extract_area, validate=False)

# Parse the area, standardize it, then fill missing values with the mean.
area_pipeline = Pipeline(steps=[
    ("parse", area_transformer),
    ("standardize", StandardScaler()),
    ("impute", SimpleImputer(strategy='mean'))  # fill missing numeric values with the mean
])


In [None]:
def parse_orientation(X):
    """
    Multi-hot encode the orientation strings in the first column of X.

    Each value is split on spaces, commas (ASCII or full-width), "、" and "/";
    every token that is one of the eight known orientations sets the matching
    column to 1. Missing values give an all-zero row.

    Rows are filled by position, so the result is correct even when X.index
    contains duplicate labels.
    """
    orientations = ["东", "南", "西", "北", "东南", "东北", "西南", "西北"]
    column_of = {name: pos for pos, name in enumerate(orientations)}
    flags = np.zeros((len(X), len(orientations)), dtype=np.int64)

    for row, val in enumerate(X.iloc[:, 0]):
        if pd.isna(val):
            continue
        for token in re.split(r'[ ,，、/]+', str(val)):
            col = column_of.get(token)
            if col is not None:
                flags[row, col] = 1

    return pd.DataFrame(flags, index=X.index, columns=orientations)

# Wrap the orientation encoder so it can be used as a pipeline step.
orientation_transformer = FunctionTransformer(parse_orientation, validate=False)

# Multi-hot encode the orientation, then impute with the most frequent value.
orientation_pipeline = Pipeline(steps=[
    ("parse", orientation_transformer),
    ("impute", SimpleImputer(strategy='most_frequent'))
])

In [None]:
def parse_elevator_house_ratio_cn2an(X):
    """
    Convert "<A>梯<B>户" strings in the first column of X to the ratio A / B.

    A and B are converted to numbers with cn2an (mode="smart"). The result is
    NaN when the value is missing or empty, does not match the pattern, cannot
    be converted, or B is 0. Returns a one-column DataFrame.
    """
    pattern = r'^(.+)梯(.+)户$'
    s = X.iloc[:, 0].fillna("")

    def calc_ratio(text):
        text = text.strip()
        if text == "":
            return np.nan
        m = re.match(pattern, text)
        if not m:
            return np.nan
        try:
            ladder_num = cn2an(m[1], mode="smart")
            house_num = cn2an(m[2], mode="smart")
        except Exception:
            # Best effort: an unparseable number becomes NaN. Unlike a bare
            # `except:`, this lets KeyboardInterrupt/SystemExit propagate.
            return np.nan
        if house_num == 0:
            return np.nan
        return ladder_num / house_num

    return pd.DataFrame(s.map(calc_ratio))

# Wrap the elevator/household ratio parser so it can be used as a pipeline step.
elevator_house_transformer = FunctionTransformer(parse_elevator_house_ratio_cn2an, validate=False)

# Compute the ratio, standardize it, then fill missing values with the mean.
elevator_house_pipeline = Pipeline([
    ("transform", elevator_house_transformer),
    ("standardize", StandardScaler()),
    ("imputer", SimpleImputer(strategy="mean"))
])


In [None]:
def fill_elevator_null(X):
    """Return the first column of X as strings, with missing values replaced by "无"."""
    first_column = X.iloc[:, [0]]
    return first_column.fillna("无").astype(str)

# Wrap the null-filling function so it can be used as a pipeline step.
elevator_transformer = FunctionTransformer(fill_elevator_null, validate=False)

# Fill missing elevator flags, then one-hot encode them (first level dropped).
elevator_pipeline = Pipeline([
    ("fill_null", elevator_transformer),
    ("OneHot", OneHotEncoder(drop="first", handle_unknown="ignore"))
])

In [None]:
def fill_villa_null(X):
    """Return the first column of X as strings, with missing values replaced by "非别墅"."""
    first_column = X.iloc[:, [0]]
    return first_column.fillna("非别墅").astype(str)

# Wrap the null-filling function so it can be used as a pipeline step.
villa_transformer = FunctionTransformer(fill_villa_null, validate=False)

# Fill missing villa types, then one-hot encode them (first level dropped).
villa_pipeline = Pipeline([
    ("fill_null", villa_transformer),
    ("OneHot", OneHotEncoder(drop="first", handle_unknown="ignore"))
])

In [None]:
def trading_date_to_timestamp(X):
    """
    Convert the dates in the first column of X to whole days since 1970-01-01.

    Values that cannot be parsed as dates become NaN. Returns a one-column
    DataFrame that keeps the original column name.
    """
    dates = pd.to_datetime(X.iloc[:, 0], errors='coerce')
    elapsed = dates - pd.Timestamp("1970-01-01")
    return elapsed.dt.days.to_frame()

# Wrap the date converter so it can be used as a pipeline step.
trading_date_transformer = FunctionTransformer(trading_date_to_timestamp, validate=False)

# Convert dates to day counts, standardize, then fill missing values with the mean.
trading_date_pipeline = Pipeline([
    ("transformer", trading_date_transformer),
    ("standardize", StandardScaler()),
    ("imputer", SimpleImputer(strategy="mean"))
])

In [None]:
def last_transaction_null_to_0(X):
    """Return a one-column 0/1 frame: 1 where the first column of X has a value, 0 where it is missing."""
    has_value = ~X.iloc[:, 0].isna()
    return has_value.astype(int).to_frame()

# Wrap the presence-indicator function so it can be used as a pipeline step.
last_transaction_transformer = FunctionTransformer(last_transaction_null_to_0, validate=False)

# Single-step pipeline: 1 if a previous transaction is recorded, else 0.
last_transaction_pipeline = Pipeline([
    ("last_transaction_transformer", last_transaction_transformer)
])

In [None]:
def extract_features(X):
    """
    Flag which selling-point keywords occur in the first column of X.

    Returns an integer DataFrame with one 0/1 column per keyword (装修, 地铁,
    房本满五年, 房本满两年); missing values give an all-zero row.

    Rows are filled by position, so the result is correct even when X.index
    contains duplicate labels.
    """
    features = ["装修", "地铁", "房本满五年", "房本满两年"]
    texts = X.iloc[:, 0].fillna("")

    flags = np.array(
        [[int(feat in val) for feat in features] for val in texts],
        dtype=np.int64,
    ).reshape(len(texts), len(features))  # keeps the 2-D shape for empty input

    return pd.DataFrame(flags, index=X.index, columns=features)

# Wrap the keyword-flag extractor so it can be used as a pipeline step.
features_transformer = FunctionTransformer(extract_features, validate=False)

# Single-step pipeline producing the 0/1 keyword flags.
features_pipeline = Pipeline([
    ("extract_features", features_transformer)
])

In [None]:
def merge_small_property_developers(X, min_count=5):
    """
    Replace rare categories in the first column of X with "其他".

    Missing values are first turned into empty strings. Any category that
    occurs fewer than `min_count` times in the data passed to this call is
    replaced; counts are recomputed on every call. Returns a one-column
    DataFrame that keeps the original column name.
    """
    column_name = X.columns[0]
    developers = X.iloc[:, 0].fillna("")

    frequency = developers.value_counts()
    rare = frequency.index[frequency < min_count]

    merged = developers.mask(developers.isin(rare), "其他")
    return merged.to_frame(name=column_name)

# Wrap the rare-category merger so it can be used as a pipeline step.
property_developers_transformer = FunctionTransformer(merge_small_property_developers, validate=False)

# Merge rare developers into one category, then one-hot encode (first level dropped).
property_developers_pipeline = Pipeline([
    ("property_developers", property_developers_transformer),
    ("OneHot", OneHotEncoder(drop="first", handle_unknown="ignore"))
])

In [None]:
def extract_numbers(X):
    """
    Extract the first number from every column of X.

    Each column is parsed independently: text values yield the first
    integer/decimal number they contain (NaN when there is none or the value
    is missing); columns that are already numeric are just cast to float.
    Returns a float DataFrame with the same index and column labels as X, so
    every column handed over by the ColumnTransformer is kept.
    """
    pattern = r'(\d+\.?\d*)'

    parsed = []
    for pos in range(X.shape[1]):
        column = X.iloc[:, pos]
        if pd.api.types.is_numeric_dtype(column):
            # The `.str` accessor is unavailable on numeric columns.
            parsed.append(column.astype(float))
        else:
            parsed.append(column.fillna("").str.extract(pattern)[0].astype(float))

    result = pd.concat(parsed, axis=1)
    result.columns = X.columns
    return result

# Wrap the number extractor so it can be used as a pipeline step.
number_transformer = FunctionTransformer(extract_numbers, validate=False)

# Extract numbers, standardize them, then fill missing values with the mean.
number_pipeline = Pipeline(steps=[
    ("parse", number_transformer),
    ("standardize", StandardScaler()),
    ("impute", SimpleImputer(strategy='mean'))  # fill missing numeric values with the mean
])

In [None]:
def parse_property_fee(X):
    """
    Convert property-fee strings in the first column of X to floats.

    The unit suffix "元/月/㎡" is stripped. A range "low-high" becomes the
    midpoint of the two bounds, a single number is used as is, and anything
    missing, empty or unparseable becomes NaN. Returns a one-column DataFrame
    indexed like X.
    """
    def to_fee(val):
        if not isinstance(val, str):
            return np.nan
        val = val.replace('元/月/㎡', '').strip()
        if val == '':
            return np.nan
        try:
            if '-' in val:
                # Range: use the midpoint. A malformed range raises ValueError
                # either from float() or from the two-name unpacking.
                low, high = map(float, val.split('-'))
                return (low + high) / 2
            return float(val)
        except ValueError:
            return np.nan

    # The selected column is a Series, so it is iterated directly.
    fees = [to_fee(val) for val in X.iloc[:, 0].fillna("")]
    return pd.DataFrame(fees, index=X.index)

# Wrap the property-fee parser so it can be used as a pipeline step.
property_fee_transformer = FunctionTransformer(parse_property_fee)

# Parse the fee, standardize it, then fill missing values with the mean.
property_fee_pipeline = Pipeline([
    ("transformer", property_fee_transformer),
    ("standardize", StandardScaler()),
    ("impute", SimpleImputer(strategy='mean'))
])

In [None]:
def null_to_0(X):
    """Return the first column of X as a one-column frame with missing values replaced by 0."""
    first_column = X.iloc[:, [0]]
    return first_column.fillna(0)

# Wrap the zero-filling function so it can be used as a pipeline step.
null_to_0_transformer = FunctionTransformer(null_to_0, validate=False)

# Replace missing values with 0, then standardize.
null_to_0_pipeline = Pipeline([
    ("null_to_0_transformer", null_to_0_transformer),
    ("standardize", StandardScaler())
])

In [None]:
num_features = ["lon", "lat", "容 积 率", "客户反馈"]  # numeric features

cat_features = ["建筑结构", "装修情况", "交易权属", "房屋用途", "房屋年限", "产权所属", "建筑结构_comm", "供水", "供暖", "供电"]  # one-hot encoded features

cluster_features = ["城市", "区域", "板块", "年份", "区县"]  # features fed to the clustering step

In [None]:
# Numeric features: standardize, then fill missing values with the mean.
num_pipeline = Pipeline(steps=[
    # ("clip", IQRClipper()),
    ("standardize", StandardScaler()),
    ("impute", SimpleImputer(strategy="mean"))
])

# Categorical features: most-frequent imputation, then dense one-hot encoding.
cat_pipeline = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("encoder", OneHotEncoder(drop="first", sparse_output=False, handle_unknown="ignore")),  # drop the first level to avoid collinearity
])

# Cluster features: standardize, mean-impute, then apply a 10-cluster KMeans step.
cluster_pipeline = Pipeline(steps=[
    ("standardize", StandardScaler()),
    ("impute", SimpleImputer(strategy="mean")),
    ("cluster", KMeans(n_clusters=10, random_state=42))
])


In [None]:
# Preprocessing for the price model: route each column group to its pipeline.
preprocessor = ColumnTransformer(
    transformers=[
        ("ring_road", ring_road_pipeline,["环线"]),
        ("house_type", house_type_pipeline, ["房屋户型"]),
        ("floor", floor_pipeline, ["所在楼层"]),
        ("area", area_pipeline, ["建筑面积"]),
        ("orientation", orientation_pipeline, ["房屋朝向"]),
        ("elevator_house", elevator_house_pipeline, ["梯户比例"]),
        ("elevator", elevator_pipeline, ["配备电梯"]),
        ("villa", villa_pipeline, ["别墅类型"]),
        ("trading_date", trading_date_pipeline, ["交易时间"]),
        ("last_transaction", last_transaction_pipeline, ["上次交易"]),
        ("features", features_pipeline, ["房屋优势"]),
        ("property_developers", property_developers_pipeline, ["开发商"]),
        ("number", number_pipeline, ["房屋总数", "楼栋总数", "绿 化 率", "燃气费", "供热费"]),
        ("parking_spot", null_to_0_pipeline, ["停车位"]),

        # Numeric features: standardized, missing values filled with the mean.
        ("num", num_pipeline, num_features),
        # Categorical features: most-frequent imputation, then one-hot encoding.
        ("cat", cat_pipeline, cat_features),
        # Cluster features: standardized, mean-imputed, then passed through KMeans.
        ("cluster", cluster_pipeline, cluster_features)
    ])

In [None]:
# Price model: separate the target column from the features.
X_price = train_price.drop("Price", axis=1)
y_price = train_price["Price"]

# Hold out 20% of the rows for validation (fixed seed for reproducibility).
X_train_price, X_valid_price, y_train_price, y_valid_price = train_test_split(
    X_price, y_price, test_size=0.2, random_state=111
)

In [None]:
# Full pipeline: preprocessing followed by an ordinary least squares model.
ols_pipeline = Pipeline(steps=[
    ("preprocessor", preprocessor),
    ("regressor", LinearRegression())
])

In [None]:
# Fit on log(target) and map predictions back with exp.
ols_model = TransformedTargetRegressor(
    regressor=ols_pipeline,
    transformer=FunctionTransformer(func=np.log, inverse_func=np.exp),
    check_inverse=False
)

In [None]:
# Fit on the training split (the pipeline preprocesses first, then trains the model).
ols_model.fit(X_train_price, y_train_price)

In [None]:
# In-sample MAE on the training split.
y_pred_train_price_ols = ols_model.predict(X_train_price)
mae_in_ols = mean_absolute_error(y_train_price, y_pred_train_price_ols)
print(f"样本内 MAE: {mae_in_ols:.4f}")

# Out-of-sample MAE on the validation split.
y_pred_valid_price_ols = ols_model.predict(X_valid_price)
mae_out_ols = mean_absolute_error(y_valid_price, y_pred_valid_price_ols)
print(f"样本外 MAE: {mae_out_ols:.4f}")

In [None]:
# 6-fold cross-validation of the OLS model on the full labelled price data.
# cross_val_score treats larger scores as better, so MAE is requested as
# "neg_mean_absolute_error".
cv_mae_ols = cross_val_score(
    ols_model, X_price, y_price,
    cv=6,
    scoring="neg_mean_absolute_error"
)

# Flip the sign back to a positive MAE.
mae_cv_ols = -cv_mae_ols.mean()
print(f"6折交叉验证 MAE 平均值: {mae_cv_ols:.4f}")
print(cv_mae_ols)

In [None]:
# Predict test prices with the OLS model (the ID column is not a feature).
X_test_price = test_price
y_pred_price_ols = ols_model.predict(X_test_price.drop("ID", axis=1))

# Pair each prediction with its ID.
result = pd.DataFrame({
    "ID": X_test_price["ID"],
    "Price": y_pred_price_ols
})

result.to_csv("OLS_price.csv", index=False)


In [None]:
# Full pipeline: preprocessing followed by a Lasso regression.
lasso_pipeline = Pipeline([
    ("preprocessor", preprocessor),
    ("regressor", Lasso(alpha=0.001, precompute=True, max_iter=100000, random_state=42))
])

In [None]:
# Fit on log(target) and map predictions back with exp.
lasso_model = TransformedTargetRegressor(
    regressor=lasso_pipeline,
    transformer=FunctionTransformer(func=np.log, inverse_func=np.exp),
    check_inverse=False
)

In [None]:
# Fit the Lasso model on the price training split.
lasso_model.fit(X_train_price, y_train_price)

In [None]:
# In-sample MAE on the training split.
y_pred_train_lasso = lasso_model.predict(X_train_price)
mae_in_lasso = mean_absolute_error(y_train_price, y_pred_train_lasso)
print(f"样本内 MAE: {mae_in_lasso:.4f}")

# Out-of-sample MAE on the validation split.
y_pred_valid_lasso = lasso_model.predict(X_valid_price)
mae_out_lasso  = mean_absolute_error(y_valid_price, y_pred_valid_lasso)
print(f"样本外 MAE: {mae_out_lasso:.4f}")

In [None]:
# 6-fold cross-validation of the Lasso model on the full labelled price data.
# cross_val_score treats larger scores as better, so MAE is requested as
# "neg_mean_absolute_error".
cv_mae_lasso = cross_val_score(
    lasso_model, X_price, y_price,
    cv=6,
    scoring="neg_mean_absolute_error"
)

# Flip the sign back to a positive MAE.
mae_cv_lasso = -cv_mae_lasso.mean()
print(f"6折交叉验证 MAE 平均值: {mae_cv_lasso:.4f}")
print(cv_mae_lasso)

In [None]:
# Predict test prices with the Lasso model (the ID column is not a feature).
X_test_price = test_price
y_pred_price_lasso = lasso_model.predict(X_test_price.drop("ID", axis=1))

# Pair each prediction with its ID.
result = pd.DataFrame({
    "ID": X_test_price["ID"],
    "Price": y_pred_price_lasso
})

result.to_csv("Lasso_price.csv", index=False)


In [None]:
# Full pipeline: preprocessing followed by a Ridge regression.
ridge_pipeline = Pipeline([
    ("preprocessor", preprocessor),
    ("regressor", Ridge(alpha=1, max_iter=100000, random_state=42))
])

In [None]:
# Fit on log(target) and map predictions back with exp.
ridge_model = TransformedTargetRegressor(
    regressor=ridge_pipeline,
    transformer=FunctionTransformer(func=np.log, inverse_func=np.exp),
    check_inverse=False
)

In [None]:
# Fit the Ridge model on the price training split.
ridge_model.fit(X_train_price, y_train_price)

In [None]:
# In-sample MAE on the training split.
y_pred_train_ridge = ridge_model.predict(X_train_price)
mae_in_ridge = mean_absolute_error(y_train_price, y_pred_train_ridge)
print(f"样本内 MAE: {mae_in_ridge:.4f}")

# Out-of-sample MAE on the validation split.
y_pred_valid_ridge = ridge_model.predict(X_valid_price)
mae_out_ridge  = mean_absolute_error(y_valid_price, y_pred_valid_ridge)
print(f"样本外 MAE: {mae_out_ridge:.4f}")

In [None]:
# 6-fold cross-validation of the Ridge model on the full labelled price data.
# cross_val_score treats larger scores as better, so MAE is requested as
# "neg_mean_absolute_error".
# The Ridge model is evaluated here, and the results are stored under
# ridge-specific names so the Lasso cross-validation results are kept.
cv_mae_ridge = cross_val_score(
    ridge_model, X_price, y_price,
    cv=6,
    scoring="neg_mean_absolute_error"
)

# Flip the sign back to a positive MAE.
mae_cv_ridge = -cv_mae_ridge.mean()
print(f"6折交叉验证 MAE 平均值: {mae_cv_ridge:.4f}")
print(cv_mae_ridge)

In [None]:
# Predict test prices with the Ridge model (the ID column is not a feature).
X_test_price = test_price
y_pred_price_ridge = ridge_model.predict(X_test_price.drop("ID", axis=1))

# Pair each prediction with its ID.
result = pd.DataFrame({
    "ID": X_test_price["ID"],
    "Price": y_pred_price_ridge
})

result.to_csv("Ridge_price.csv", index=False)


# 租金

In [None]:
def fill_ring_road_null(X):
    """Return the first column of X as strings, with missing values replaced by "无环线"."""
    first_column = X.iloc[:, [0]]
    return first_column.fillna("无环线").astype(str)

# Wrap the null-filling function so it can be used as a pipeline step.
ring_road_transformer = FunctionTransformer(fill_ring_road_null, validate=False)

# Fill missing ring-road labels, then one-hot encode them (first level dropped).
ring_road_pipeline = Pipeline([
    ("ring_road_transformer", ring_road_transformer),
    ("OneHot", OneHotEncoder(drop="first", handle_unknown="ignore"))
])

In [None]:
def parse_house_type(X):
    """
    Split a layout string from the first column of X into four integer counts.

    The digits preceding 室, 厅, 厨 and 卫 become the columns 室数, 厅数, 厨数
    and 卫数; a part that is absent (or a missing value) counts as 0.
    """
    layout = X.iloc[:, 0].fillna("")
    patterns = {
        "室数": r'(\d+)室',
        "厅数": r'(\d+)厅',
        "厨数": r'(\d+)厨',
        "卫数": r'(\d+)卫'
    }
    counts = {
        name: layout.str.extract(regex, expand=False).fillna(0).astype(int)
        for name, regex in patterns.items()
    }
    return pd.DataFrame(counts, index=X.index)

# Wrap the layout parser in a FunctionTransformer.
house_type_transformer = FunctionTransformer(parse_house_type, validate=False)

# Pipeline: parse the room counts, standardize, then impute with the most frequent value.
house_type_pipeline = Pipeline(steps=[
    ("parse", house_type_transformer),
    ("standardize", StandardScaler()),
    ("impute", SimpleImputer(strategy='most_frequent'))
])


In [None]:
def floor_calculator(X):
    """
    Parse floor strings such as "高楼层/18层" or "3/30层" from the first column.

    Returns a DataFrame with two columns:

    * 楼层位置 - a score between 0 and 1. A textual label (低/中/高楼层, 底层,
      顶层, 地下室) is looked up directly. A numeric floor is classified by
      floor / total floors: up to 1/3 is low, up to 2/3 is middle, above that
      is high. NaN when the string does not match or the total is not positive.
    * 总层数 - the total number of floors as nullable integers.
    """
    # Position label -> relative height score.
    pos_map = {"低楼层": 0.4, "中楼层": 0.6, "高楼层": 0.8, "底层": 0.2, "顶层": 1.0, "地下室": 0.0}

    s = X.iloc[:, 0].fillna("").astype(str)

    # Capture either a textual label or a numeric floor, then the total floors.
    extracted = s.str.extract(r'^(?:((?:低|中|高)楼层|底层|顶层|地下室)|(\d+))/?(\d+)层$')
    extracted.columns = ["text_floor", "num_floor", "total_floor"]

    num_floor = pd.to_numeric(extracted["num_floor"], errors="coerce")
    total_floor = pd.to_numeric(extracted["total_floor"], errors="coerce")

    # The captured label is already the complete pos_map key (e.g. "高楼层"),
    # so it is mapped as is; appending "楼层" again would never match.
    text_pos = extracted["text_floor"].map(pos_map).astype(float).to_numpy()

    # Numeric floors: classify by relative height. A non-positive total gives
    # NaN, and comparisons against NaN are False, so such rows stay NaN.
    ratio = (num_floor / total_floor.where(total_floor > 0)).to_numpy()
    ratio_pos = np.select(
        [ratio <= 1 / 3, ratio <= 2 / 3, ~np.isnan(ratio)],
        [pos_map["低楼层"], pos_map["中楼层"], pos_map["高楼层"]],
        default=np.nan,
    )

    # The regex alternation makes the text and numeric captures mutually exclusive.
    floor_pos = np.where(np.isnan(text_pos), ratio_pos, text_pos)

    return pd.DataFrame({
        "楼层位置": pd.Series(floor_pos, index=s.index),
        "总层数": total_floor.astype("Int64", errors="ignore")
    })

# Wrap the floor parser in a FunctionTransformer.
floor_transformer = FunctionTransformer(floor_calculator, validate=False)

# Pipeline: parse floor position and total floors, standardize, then mean-impute.
floor_pipeline = Pipeline(steps=[
    ("parse", floor_transformer),
    ("standardize", StandardScaler()),
    ("impute", SimpleImputer(strategy='mean'))
])


In [None]:
def extract_area(X):
    """Return the first number found in each value of X's first column as a one-column float frame."""
    text = X.iloc[:, 0].fillna("")
    matches = text.str.extract(r'(\d+\.?\d*)')
    # The pattern has a single group, so `matches` has exactly one column (0).
    return matches[[0]].astype(float)

# Wrap the area parser in a FunctionTransformer.
area_transformer = FunctionTransformer(extract_area, validate=False)

# Pipeline: parse the area, standardize, then mean-impute.
area_pipeline = Pipeline(steps=[
    ("parse", area_transformer),
    ("standardize", StandardScaler()),
    ("impute", SimpleImputer(strategy='mean'))  # fill missing numeric values with the mean
])


In [None]:
def parse_orientation(X):
    """
    Multi-hot encode the orientation strings in the first column of X.

    Each value is split on spaces, commas (ASCII or full-width), "、" and "/";
    every token that is one of the eight known orientations sets the matching
    column to 1. Missing values give an all-zero row.

    Rows are filled by position, so the result is correct even when X.index
    contains duplicate labels.
    """
    orientations = ["东", "南", "西", "北", "东南", "东北", "西南", "西北"]
    column_of = {name: pos for pos, name in enumerate(orientations)}
    flags = np.zeros((len(X), len(orientations)), dtype=np.int64)

    for row, val in enumerate(X.iloc[:, 0]):
        if pd.isna(val):
            continue
        for token in re.split(r'[ ,，、/]+', str(val)):
            col = column_of.get(token)
            if col is not None:
                flags[row, col] = 1

    return pd.DataFrame(flags, index=X.index, columns=orientations)

# Wrap the orientation encoder in a FunctionTransformer.
orientation_transformer = FunctionTransformer(parse_orientation, validate=False)

# Pipeline: multi-hot encode the orientation, then impute with the most frequent value.
orientation_pipeline = Pipeline(steps=[
    ("parse", orientation_transformer),
    ("impute", SimpleImputer(strategy='most_frequent'))
])

In [None]:
def parse_elevator_house_ratio_cn2an(X):
    """
    Convert "<A>梯<B>户" strings in the first column of X to the ratio A / B.

    A and B are converted to numbers with cn2an (mode="smart"). The result is
    NaN when the value is missing or empty, does not match the pattern, cannot
    be converted, or B is 0. Returns a one-column DataFrame.
    """
    pattern = r'^(.+)梯(.+)户$'
    s = X.iloc[:, 0].fillna("")

    def calc_ratio(text):
        text = text.strip()
        if text == "":
            return np.nan
        m = re.match(pattern, text)
        if not m:
            return np.nan
        try:
            ladder_num = cn2an(m[1], mode="smart")
            house_num = cn2an(m[2], mode="smart")
        except Exception:
            # Best effort: an unparseable number becomes NaN. Unlike a bare
            # `except:`, this lets KeyboardInterrupt/SystemExit propagate.
            return np.nan
        if house_num == 0:
            return np.nan
        return ladder_num / house_num

    return pd.DataFrame(s.map(calc_ratio))

# Wrap the elevator/household ratio parser in a FunctionTransformer.
elevator_house_transformer = FunctionTransformer(parse_elevator_house_ratio_cn2an, validate=False)

# Pipeline: compute the ratio, standardize, then mean-impute.
elevator_house_pipeline = Pipeline([
    ("transform", elevator_house_transformer),
    ("standardize", StandardScaler()),
    ("imputer", SimpleImputer(strategy="mean"))
])


In [None]:
def fill_elevator_null(X):
    """Return the first column of X as strings, with missing values replaced by "无"."""
    first_column = X.iloc[:, [0]]
    return first_column.fillna("无").astype(str)

# Define the FunctionTransformer around the null-filling function.
elevator_transformer = FunctionTransformer(fill_elevator_null, validate=False)

# Full pipeline: fill missing elevator flags, then one-hot encode (first level dropped).
elevator_pipeline = Pipeline([
    ("fill_null", elevator_transformer),
    ("OneHot", OneHotEncoder(drop="first", handle_unknown="ignore"))
])

In [None]:
def fill_villa_null(X):
    """Return the first column of X as strings, with missing values replaced by "非别墅"."""
    first_column = X.iloc[:, [0]]
    return first_column.fillna("非别墅").astype(str)

# Wrap the null-filling function so it can be used as a pipeline step.
villa_transformer = FunctionTransformer(fill_villa_null, validate=False)

# Fill missing villa types, then one-hot encode them (first level dropped).
villa_pipeline = Pipeline([
    ("fill_null", villa_transformer),
    ("OneHot", OneHotEncoder(drop="first", handle_unknown="ignore"))
])

In [None]:
def trading_date_to_timestamp(X):
    """
    Convert the dates in the first column of X to whole days since 1970-01-01.

    Values that cannot be parsed as dates become NaN. Returns a one-column
    DataFrame that keeps the original column name.
    """
    dates = pd.to_datetime(X.iloc[:, 0], errors='coerce')
    elapsed = dates - pd.Timestamp("1970-01-01")
    return elapsed.dt.days.to_frame()

# Define the transformer around the date converter.
trading_date_transformer = FunctionTransformer(trading_date_to_timestamp, validate=False)

# Build the pipeline: convert dates to day counts, standardize, then mean-impute.
trading_date_pipeline = Pipeline([
    ("transformer", trading_date_transformer),
    ("standardize", StandardScaler()),
    ("imputer", SimpleImputer(strategy="mean"))
])

In [None]:
def last_transaction_null_to_0(X):
    """Return a one-column 0/1 frame: 1 where the first column of X has a value, 0 where it is missing."""
    has_value = ~X.iloc[:, 0].isna()
    return has_value.astype(int).to_frame()

# Wrap the presence-indicator function so it can be used as a pipeline step.
last_transaction_transformer = FunctionTransformer(last_transaction_null_to_0, validate=False)

# Single-step pipeline: 1 if a previous transaction is recorded, else 0.
last_transaction_pipeline = Pipeline([
    ("last_transaction_transformer", last_transaction_transformer)
])

In [None]:
def extract_features(X):
    """
    Flag which selling-point keywords occur in the first column of X.

    Returns an integer DataFrame with one 0/1 column per keyword (装修, 地铁,
    房本满五年, 房本满两年); missing values give an all-zero row.

    Rows are filled by position, so the result is correct even when X.index
    contains duplicate labels.
    """
    features = ["装修", "地铁", "房本满五年", "房本满两年"]
    texts = X.iloc[:, 0].fillna("")

    flags = np.array(
        [[int(feat in val) for feat in features] for val in texts],
        dtype=np.int64,
    ).reshape(len(texts), len(features))  # keeps the 2-D shape for empty input

    return pd.DataFrame(flags, index=X.index, columns=features)

# Wrap the keyword-flag extractor so it can be used as a pipeline step.
features_transformer = FunctionTransformer(extract_features, validate=False)

# Single-step pipeline producing the 0/1 keyword flags.
features_pipeline = Pipeline([
    ("extract_features", features_transformer)
])

In [None]:
def merge_small_property_developers(X, min_count=5):
    """
    Replace rare categories in the first column of X with "其他".

    Missing values are first turned into empty strings. Any category that
    occurs fewer than `min_count` times in the data passed to this call is
    replaced; counts are recomputed on every call. Returns a one-column
    DataFrame that keeps the original column name.
    """
    column_name = X.columns[0]
    developers = X.iloc[:, 0].fillna("")

    # Count each category, then collect the ones below the threshold.
    frequency = developers.value_counts()
    rare = frequency.index[frequency < min_count]

    merged = developers.mask(developers.isin(rare), "其他")
    return merged.to_frame(name=column_name)

# Wrap the rare-category merger so it can be used as a pipeline step.
property_developers_transformer = FunctionTransformer(merge_small_property_developers, validate=False)

# Merge rare developers into one category, then one-hot encode (first level dropped).
property_developers_pipeline = Pipeline([
    ("property_developers", property_developers_transformer),
    ("OneHot", OneHotEncoder(drop="first", handle_unknown="ignore"))
])

In [None]:
def extract_numbers(X):
    """
    Extract the first number from every column of X.

    Used for the total-households, total-buildings, greening-rate, gas-fee and
    heating-fee columns. Each column is parsed independently: text values
    yield the first integer/decimal number they contain (NaN when there is
    none or the value is missing); columns that are already numeric are just
    cast to float. Returns a float DataFrame with the same index and column
    labels as X, so every column handed over by the ColumnTransformer is kept.
    """
    pattern = r'(\d+\.?\d*)'

    parsed = []
    for pos in range(X.shape[1]):
        column = X.iloc[:, pos]
        if pd.api.types.is_numeric_dtype(column):
            # The `.str` accessor is unavailable on numeric columns.
            parsed.append(column.astype(float))
        else:
            parsed.append(column.fillna("").str.extract(pattern)[0].astype(float))

    result = pd.concat(parsed, axis=1)
    result.columns = X.columns
    return result

# Wrap the number extractor in a FunctionTransformer.
number_transformer = FunctionTransformer(extract_numbers, validate=False)

# Pipeline: extract numbers, standardize, then mean-impute.
number_pipeline = Pipeline(steps=[
    ("parse", number_transformer),
    ("standardize", StandardScaler()),
    ("impute", SimpleImputer(strategy='mean')),  # fill missing numeric values with the mean
])
    

In [None]:
def parse_property_fee(X):
    """
    Convert property-fee strings in the first column of X to floats.

    The unit suffix "元/月/㎡" is stripped. A range "low-high" becomes the
    midpoint of the two bounds, a single number is used as is, and anything
    missing, empty or unparseable becomes NaN. Returns a one-column DataFrame
    indexed like X.
    """
    def to_fee(val):
        if not isinstance(val, str):
            return np.nan
        val = val.replace('元/月/㎡', '').strip()
        if val == '':
            return np.nan
        try:
            if '-' in val:
                # Range: use the midpoint. A malformed range raises ValueError
                # either from float() or from the two-name unpacking.
                low, high = map(float, val.split('-'))
                return (low + high) / 2
            return float(val)
        except ValueError:
            return np.nan

    # The selected column is a Series, so it is iterated directly.
    fees = [to_fee(val) for val in X.iloc[:, 0].fillna("")]
    return pd.DataFrame(fees, index=X.index)

# Wrap the property-fee parser so it can be used as a pipeline step.
property_fee_transformer = FunctionTransformer(parse_property_fee)

# Parse the fee, standardize it, then fill missing values with the mean.
property_fee_pipeline = Pipeline([
    ("transformer", property_fee_transformer),
    ("standardize", StandardScaler()),
    ("impute", SimpleImputer(strategy='mean'))
])

In [None]:
def null_to_0(X):
    """Return the first column of X as a one-column frame with missing values replaced by 0."""
    first_column = X.iloc[:, [0]]
    return first_column.fillna(0)

# Wrap the zero-filling function so it can be used as a pipeline step.
null_to_0_transformer = FunctionTransformer(null_to_0, validate=False)

# Replace missing values with 0, then standardize.
null_to_0_pipeline = Pipeline([
    ("null_to_0_transformer", null_to_0_transformer),
    ("standardize", StandardScaler())
])

In [None]:
def fill_null(X):
    """
    Return every column of X as strings, with missing values replaced by "无".

    All columns are processed, so each categorical column handed over by the
    ColumnTransformer reaches the encoder; for a single-column frame the
    result is that one column, filled and cast to str.
    """
    return X.fillna("无").astype(str)

# Define the FunctionTransformer around the null-filling function.
fill_null_transformer = FunctionTransformer(fill_null, validate=False)

# Full pipeline: fill missing values with "无", then one-hot encode (first level dropped).
fill_null_pipeline = Pipeline([
    ("fill_null", fill_null_transformer),
    ("OneHot", OneHotEncoder(drop="first", handle_unknown="ignore"))
])

In [None]:
def support_facilities(X):
    """Return the character length of each value in X's first column (0 for missing) as a one-column frame."""
    descriptions = X.iloc[:, 0].fillna("")
    return descriptions.str.len().to_frame()

# Wrap the length feature so it can be used as a pipeline step.
support_facilities_transformer = FunctionTransformer(support_facilities, validate=False)

# Use the length of the facilities string as a feature, then standardize it.
support_facilities_pipeline = Pipeline([
    ("support_facilities", support_facilities_transformer),
    ("standardize", StandardScaler())
])


In [None]:
def extract_building_age(X):
    """
    Convert construction-year strings in the first column of X to floats.

    The "年" suffix is removed. A range "low-high" becomes the midpoint of the
    two bounds, a single value is used as is, and anything missing, empty or
    unparseable becomes NaN. Returns a one-column DataFrame indexed like X.
    """
    s = X.iloc[:, 0].fillna("")

    def convert_age(val):
        if not isinstance(val, str):
            return np.nan

        val = val.replace('年', '').strip()
        if val == '':
            return np.nan

        try:
            if '-' in val:
                # Range: use the midpoint. A malformed range raises ValueError
                # either from float() or from the two-name unpacking.
                low, high = map(float, val.split('-'))
                return (low + high) / 2
            return float(val)
        except ValueError:
            # Only parsing failures are treated as missing.
            return np.nan

    result = s.apply(convert_age)
    return pd.DataFrame(result, index=X.index)

# Wrap the construction-year parser so it can be used as a pipeline step.
building_age_transformer = FunctionTransformer(extract_building_age, validate=False)

# Parse the year, standardize it, then fill missing values with the mean.
building_age_pipeline = Pipeline([
    ("transformer", building_age_transformer),
    ("standardize", StandardScaler()),
    ("impute", SimpleImputer(strategy='mean'))
])

In [None]:
def count_property_types(X):
    """
    Count the slash-separated property types in the first column of X.

    For each value the non-blank pieces between "/" are counted. Missing
    values and blank strings give NaN (not 0), so a later imputation step can
    fill them. The input frame is left untouched: a new DataFrame is returned
    whose first column holds the counts and whose remaining columns, if any,
    are carried over unchanged.
    """
    def process_single_value(value):
        if pd.isna(value) or str(value).strip() == "":
            return np.nan
        # Ignore empty pieces such as the one produced by "a//b".
        return len([t for t in str(value).split('/') if t.strip() != ""])

    counts = X.iloc[:, 0].apply(process_single_value).astype(float)
    # Build a new frame instead of assigning into X, which would modify the
    # caller's data.
    return pd.concat([counts, X.iloc[:, 1:]], axis=1)

# Build the FunctionTransformer around the property-type counter.
property_count_transformer = FunctionTransformer(count_property_types, validate=False)

# Count property types, standardize, then fill missing values with the median.
property_count_pipeline = Pipeline([
    ("property_count", property_count_transformer),
    ("standardize", StandardScaler()),
    ("impute", SimpleImputer(strategy='median'))
])

In [None]:
num_features = ["lon", "lat", "容 积 率", "客户反馈"]  # numeric features

cat_features = ["付款方式", "租赁方式", "用水", "用电", "租期", "物业类别", "产权描述", "建筑结构", "供水", "供暖", "供电"]  # one-hot encoded features

cluster_features = ["城市", "年份", "区县", "板块",]  # features fed to the clustering step

In [None]:
# Numeric features: fill missing values with the mean, then standardize.
num_pipeline = Pipeline(steps=[
    ("impute", SimpleImputer(strategy="mean")),
    ("standardize", StandardScaler())
])

# Categorical features: most-frequent imputation, then one-hot encoding.
cat_pipeline = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("encoder", OneHotEncoder(drop="first", handle_unknown="ignore")),  # drop the first level to avoid collinearity
])

# Cluster features: mean-impute, standardize, then apply a 10-cluster KMeans step.
cluster_pipeline = Pipeline(steps=[
    ("impute", SimpleImputer(strategy="mean")),
    ("standardize", StandardScaler()),
    ("cluster", KMeans(n_clusters=10, random_state=42))
])

In [None]:
# Preprocessing for the rent model: route each column group to its pipeline.
preprocessor = ColumnTransformer(
    transformers=[
        ("house_type", house_type_pipeline, ["户型"]),
        ("floor", floor_pipeline, ["楼层"]),
        ("area", area_pipeline, ["面积"]),
        ("orientation", orientation_pipeline, ["朝向"]),
        ("trading_date", trading_date_pipeline, ["交易时间"]),
        ("elevator", elevator_pipeline, ["电梯"]),
        ("fill_null", fill_null_pipeline, ["装修", "车位", "燃气", "采暖"]),
        ("support_facilities", support_facilities_pipeline, ["配套设施"]),
        ("ring_road", ring_road_pipeline, ["环线位置"]),
        ("property_developers", property_developers_pipeline, ["开发商"]),
        ("number", number_pipeline, ["房屋总数", "楼栋总数", "绿 化 率", "燃气费", "供热费"]),
        ("null_to_0", null_to_0_pipeline, ["停车位",]),
        ("property_type", property_count_pipeline, ["产权描述"]),


        # Numeric features: missing values filled with the mean, then standardized.
        ("num", num_pipeline, num_features),
        # Categorical features: most-frequent imputation, then one-hot encoding.
        ("cat", cat_pipeline, cat_features),
        # Cluster features: mean-imputed, standardized, then passed through KMeans.
        ("cluster", cluster_pipeline, cluster_features)
    ])

In [None]:
# Rent model: separate the target column from the features.
X_rent = train_rent.drop("Price", axis=1)
y_rent = train_rent["Price"]

# Hold out 20% of the rows for validation (fixed seed for reproducibility).
X_train_rent, X_valid_rent, y_train_rent, y_valid_rent = train_test_split(
    X_rent, y_rent, test_size=0.2, random_state=111
)

In [None]:
# Full pipeline: preprocessing followed by an ordinary least squares model.
ols_pipeline = Pipeline(steps=[
    ("preprocessor", preprocessor),
    ("regressor", LinearRegression())
])

In [None]:
# Fit on log(target) and map predictions back with exp.
ols_model = TransformedTargetRegressor(
    regressor=ols_pipeline,
    transformer=FunctionTransformer(func=np.log, inverse_func=np.exp),
    check_inverse=False
)

In [None]:
# Fit on the training split (the pipeline preprocesses first, then trains the model).
ols_model.fit(X_train_rent, y_train_rent)

In [None]:
# In-sample MAE on the training split.
y_pred_train_rent_ols = ols_model.predict(X_train_rent)
mae_in_ols = mean_absolute_error(y_train_rent, y_pred_train_rent_ols)
print(f"样本内 MAE: {mae_in_ols:.4f}")

# Out-of-sample MAE on the validation split.
y_pred_valid_rent_ols = ols_model.predict(X_valid_rent)
mae_out_ols = mean_absolute_error(y_valid_rent, y_pred_valid_rent_ols)
print(f"样本外 MAE: {mae_out_ols:.4f}")

In [None]:
# 6-fold cross-validation of the OLS model on the full labelled rent data.
# cross_val_score treats larger scores as better, so MAE is requested as
# "neg_mean_absolute_error".
cv_mae_ols = cross_val_score(
    ols_model, X_rent, y_rent,
    cv=6,
    scoring="neg_mean_absolute_error"
)

# Flip the sign back to a positive MAE.
mae_cv_ols = -cv_mae_ols.mean()
print(f"6折交叉验证 MAE 平均值: {mae_cv_ols:.4f}")
print(cv_mae_ols)

In [None]:
# Predict test rents with the OLS model (the ID column is not a feature).
X_test_rent = test_rent
y_pred_rent_ols = ols_model.predict(X_test_rent.drop("ID", axis=1))

# Pair each prediction with its ID.
result = pd.DataFrame({
    "ID": X_test_rent["ID"],
    "Price": y_pred_rent_ols
})

result.to_csv("OLS_rent.csv", index=False)


In [None]:
# Full pipeline: preprocessing followed by a Lasso regression.
lasso_pipeline = Pipeline([
    ("preprocessor", preprocessor),
    ("regressor", Lasso(alpha=0.001, precompute=True, max_iter=100000, random_state=42))
])

In [None]:
# Fit on log(target) and map predictions back with exp.
lasso_model = TransformedTargetRegressor(
    regressor=lasso_pipeline,
    transformer=FunctionTransformer(func=np.log, inverse_func=np.exp),
    check_inverse=False
)

In [None]:
# Fit the Lasso model on the rent training split.
lasso_model.fit(X_train_rent, y_train_rent)

In [None]:
# In-sample MAE on the training split.
y_pred_train_lasso = lasso_model.predict(X_train_rent)
mae_in_lasso = mean_absolute_error(y_train_rent, y_pred_train_lasso)
print(f"样本内 MAE: {mae_in_lasso:.4f}")

# Out-of-sample MAE on the validation split.
y_pred_valid_lasso = lasso_model.predict(X_valid_rent)
mae_out_lasso  = mean_absolute_error(y_valid_rent, y_pred_valid_lasso)
print(f"样本外 MAE: {mae_out_lasso:.4f}")

In [None]:
# 6-fold cross-validation of the Lasso model on the full labelled rent data.
# cross_val_score treats larger scores as better, so MAE is requested as
# "neg_mean_absolute_error".
cv_mae_lasso = cross_val_score(
    lasso_model, X_rent, y_rent,
    cv=6,
    scoring="neg_mean_absolute_error"
)

# Flip the sign back to a positive MAE.
mae_cv_lasso = -cv_mae_lasso.mean()
print(f"6折交叉验证 MAE 平均值: {mae_cv_lasso:.4f}")
print(cv_mae_lasso)

In [None]:
# Predict test rents with the Lasso model (the ID column is not a feature).
X_test_rent = test_rent
y_pred_rent_lasso = lasso_model.predict(X_test_rent.drop("ID", axis=1))

# Pair each prediction with its ID.
result = pd.DataFrame({
    "ID": X_test_rent["ID"],
    "Price": y_pred_rent_lasso
})

result.to_csv("Lasso_rent.csv", index=False)


In [None]:
# Full pipeline: preprocessing followed by a Ridge regression.
ridge_pipeline = Pipeline([
    ("preprocessor", preprocessor),
    ("regressor", Ridge(alpha=1, max_iter=100000, random_state=42))
])

In [None]:
# Fit on log(target) and map predictions back with exp.
ridge_model = TransformedTargetRegressor(
    regressor=ridge_pipeline,
    transformer=FunctionTransformer(func=np.log, inverse_func=np.exp),
    check_inverse=False
)

In [None]:
# Fit the Ridge model on the rent training split.
ridge_model.fit(X_train_rent, y_train_rent)

In [None]:
# In-sample MAE and R2 on the training split.
y_pred_train_ridge = ridge_model.predict(X_train_rent)
mae_in_ridge = mean_absolute_error(y_train_rent, y_pred_train_ridge)
print(f"样本内 MAE: {mae_in_ridge:.4f}")
r2_in_ridge = r2_score(y_train_rent, y_pred_train_ridge)
print(f"样本内 R2: {r2_in_ridge:.4f}")

# Out-of-sample MAE and R2 on the validation split.
y_pred_valid_ridge = ridge_model.predict(X_valid_rent)
mae_out_ridge  = mean_absolute_error(y_valid_rent, y_pred_valid_ridge)
print(f"样本外 MAE: {mae_out_ridge:.4f}")
r2_out_ridge = r2_score(y_valid_rent, y_pred_valid_ridge)
print(f"样本外 R2: {r2_out_ridge:.4f}")

In [None]:
# 6-fold cross-validation of the Ridge model on the full labelled rent data.
# cross_val_score treats larger scores as better, so MAE is requested as
# "neg_mean_absolute_error".
# The Ridge model is evaluated here, and the results are stored under
# ridge-specific names so the Lasso cross-validation results are kept.
cv_mae_ridge = cross_val_score(
    ridge_model, X_rent, y_rent,
    cv=6,
    scoring="neg_mean_absolute_error"
)

# Flip the sign back to a positive MAE.
mae_cv_ridge = -cv_mae_ridge.mean()
print(f"6折交叉验证 MAE 平均值: {mae_cv_ridge:.4f}")
print(cv_mae_ridge)

In [None]:
# Predict test rents with the Ridge model (the ID column is not a feature).
X_test_rent = test_rent
y_pred_rent_ridge = ridge_model.predict(X_test_rent.drop("ID", axis=1))

# Pair each prediction with its ID.
result = pd.DataFrame({
    "ID": X_test_rent["ID"],
    "Price": y_pred_rent_ridge
})

result.to_csv("Ridge_rent.csv", index=False)

# 结果

In [None]:
# Stack the OLS price and rent predictions (price rows first) into one file.
OLS_price = pd.read_csv( "OLS_price.csv")
OLS_rent = pd.read_csv( "OLS_rent.csv")

OLS = pd.concat([OLS_price, OLS_rent], axis=0, ignore_index=True)
OLS.to_csv("OLS.csv", index=False)

In [None]:
# Stack the Lasso price and rent predictions (price rows first) into one file.
Lasso_price = pd.read_csv("Lasso_price.csv")
Lasso_rent = pd.read_csv("Lasso_rent.csv")

Lasso = pd.concat([Lasso_price, Lasso_rent], axis=0, ignore_index=True)
Lasso.to_csv("Lasso.csv", index=False)

In [None]:
# Stack the Ridge price and rent predictions (price rows first) into one file.
Ridge_price = pd.read_csv("Ridge_price.csv")
Ridge_rent = pd.read_csv("Ridge_rent.csv")

Ridge = pd.concat([Ridge_price, Ridge_rent], axis=0, ignore_index=True)
Ridge.to_csv("Ridge.csv", index=False)