In [87]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, KFold, cross_val_score, GridSearchCV
from sklearn.preprocessing import StandardScaler, OneHotEncoder, OrdinalEncoder
from sklearn.impute import KNNImputer, SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LinearRegression, Lasso, Ridge, ElasticNet
from sklearn.metrics import mean_absolute_error, make_scorer
import lightgbm as lgb
import re
import warnings

In [89]:
# Suppress every Python warning for the remainder of the session.
warnings.filterwarnings('ignore')
# Never truncate columns when a DataFrame is displayed.
pd.set_option('display.max_columns', None)

In [91]:
def load_data(file_name):
    """Load a CSV file into a DataFrame, trying UTF-8 first and GBK second.

    UTF-8 is attempted first because it is the stricter codec: GBK-encoded
    Chinese text virtually never forms valid UTF-8, whereas UTF-8 bytes can
    often be decoded as GBK without any error and would then load silently
    as garbled text.

    Args:
        file_name: Path (or any object accepted by ``pd.read_csv``) of the CSV.

    Returns:
        The parsed DataFrame. If the file cannot be read with either
        encoding, or any other error occurs, the error is printed and an
        empty DataFrame is returned (best-effort contract).
    """
    decode_error = None
    for encoding in ('utf-8', 'gbk'):
        try:
            return pd.read_csv(file_name, encoding=encoding)
        except UnicodeDecodeError as e:
            # Wrong codec: remember the error and try the next candidate.
            decode_error = e
        except Exception as e:
            # Non-encoding failure (missing file, parse error, ...): a
            # different codec will not help, so stop here.
            print(f"Error loading {file_name}: {e}")
            return pd.DataFrame()
    # Neither codec could decode the file.
    print(f"Error loading {file_name}: {decode_error}")
    return pd.DataFrame()

def mae_on_original_scale(y_log_true, y_log_pred):
    """Return the mean absolute error measured on the original price scale.

    Both arguments are log1p-transformed values; each is mapped back with
    ``np.expm1`` before the error is computed.
    """
    return mean_absolute_error(np.expm1(y_log_true), np.expm1(y_log_pred))

# Scorer used for model selection: MAE on the original price scale.
# greater_is_better=False marks it as a loss, which is why callers negate
# the scores they get back from cross-validation.
mae_scorer = make_scorer(mae_on_original_scale, greater_is_better=False)
# Shared 6-fold cross-validation splitter (shuffled, fixed seed).
cv_6_folds = KFold(n_splits=6, shuffle=True, random_state=111)

def clean_numeric_text(text_str):
    """Extract a number from free text such as '45%', '1.8' or '1.5-2.5'.

    Rules, in priority order:
      * missing input, or no number in the text -> NaN
      * text contains '%'  -> first number divided by 100 ('45%' -> 0.45)
      * text contains '-' and at least two numbers -> mean of the first two
      * otherwise -> the first number

    Args:
        text_str: Raw cell value (string, number or missing).

    Returns:
        float, or NaN when nothing numeric can be extracted.
    """
    if pd.isnull(text_str):
        return np.nan
    text_str = str(text_str)

    # Match only well-formed decimals ("12", "1.8", ".5"). A loose character
    # class such as [\d\.]+ also matches "..." or "1.2.3", which makes
    # float() raise ValueError.
    numbers = [float(tok) for tok in re.findall(r'\d+(?:\.\d+)?|\.\d+', text_str)]
    if not numbers:
        return np.nan

    if '%' in text_str:
        return numbers[0] / 100.0
    if '-' in text_str and len(numbers) >= 2:
        # Treat "a-b" as a range and use its midpoint.
        return (numbers[0] + numbers[1]) / 2
    return numbers[0]

def clean_parking(parking_str):
    """Convert a parking-fee text to a number.

    Missing input yields NaN, any text containing '免费' (free) yields 0, and
    everything else is delegated to ``clean_numeric_text``.
    """
    if pd.isnull(parking_str):
        return np.nan
    is_free = '免费' in str(parking_str)
    return 0 if is_free else clean_numeric_text(parking_str)

## 房价 (Price) 预测任务

In [94]:
#Price 特征工程

def parse_floor_price(floor_str):
    """Parse a floor description such as '中楼层 (共23层)'.

    Args:
        floor_str: Raw cell value (string, number or missing).

    Returns:
        (level, total_floor) where ``level`` is 1/2/3 for a text containing
        '低'/'中'/'高' (checked in that order) and ``total_floor`` is the
        number captured from '共N层' as a float. Either element is NaN when
        it cannot be determined.
    """
    if pd.isnull(floor_str):
        return np.nan, np.nan
    # Coerce to text so a numeric cell does not raise TypeError in the
    # substring and regex checks below.
    floor_str = str(floor_str)

    floor_level_map = {'低': 1, '中': 2, '高': 3}
    level = np.nan
    for key, val in floor_level_map.items():
        if key in floor_str:
            level = val
            break

    total_floor_match = re.search(r'共(\d+)层', floor_str)
    total_floor = float(total_floor_match.group(1)) if total_floor_match else np.nan
    return level, total_floor

def parse_layout_price(layout_str):
    """Split a layout text such as '3室2厅1厨2卫' into four counts.

    Returns a list ``[rooms, halls, kitchens, baths]``. A unit that is not
    mentioned contributes 0; a missing input yields four NaNs.
    """
    if pd.isnull(layout_str):
        return [np.nan] * 4

    unit_patterns = (r'(\d+)室', r'(\d+)厅', r'(\d+)厨', r'(\d+)卫')
    counts = []
    for pattern in unit_patterns:
        hit = re.search(pattern, layout_str)
        counts.append(float(hit.group(1)) if hit else 0)
    return counts

def parse_area_price(area_str):
    """Extract the numeric area from a text such as '282.02㎡'.

    Args:
        area_str: Raw cell value (string, number or missing).

    Returns:
        The first decimal number found, as a float; NaN for missing input or
        when the text contains no number.
    """
    if pd.isnull(area_str):
        return np.nan
    # Match only well-formed decimals. A loose class like [\d\.]+ would also
    # match "..." or "1.2.3" and make float() raise ValueError.
    area_match = re.search(r'\d+(?:\.\d+)?|\.\d+', str(area_str))
    return float(area_match.group(0)) if area_match else np.nan

def parse_ratio_price(ratio_str):
    """Parse an elevator/household ratio such as '1梯2户', '1梯 2户' or '一梯二户'.

    Both Arabic digits and Chinese numerals up to two "digits" around '十'
    are understood ('十' -> 10, '十二' -> 12, '二十' -> 20, '三十二' -> 32).

    Args:
        ratio_str: Raw cell value (string or missing).

    Returns:
        (elevators, households). The first two numbers found are returned as
        floats. When only one number is present the household count defaults
        to 1; when the input is missing, empty or contains no number the
        result is (1, 1).
    """
    if pd.isna(ratio_str) or ratio_str == '':
        return 1, 1

    cn_digits = {'一': 1, '二': 2, '两': 2, '三': 3, '四': 4,
                 '五': 5, '六': 6, '七': 7, '八': 8, '九': 9}

    def _to_number(token):
        """Convert one Arabic or Chinese numeral token to a float."""
        if token[0] not in cn_digits and token[0] != '十':
            return float(token)  # plain Arabic digits
        if '十' in token:
            # '十' is positional (tens), so replacing it character-by-character
            # with "10" would turn '十二' into 102 and '二十' into 210.
            tens_part, _, ones_part = token.partition('十')
            tens = cn_digits[tens_part[-1]] if tens_part else 1
            ones = cn_digits.get(ones_part[:1], 0)
            return float(tens * 10 + ones)
        # No '十': read the characters as consecutive digits.
        return float(''.join(str(cn_digits[ch]) for ch in token))

    tokens = re.findall(r'\d+|[一二两三四五六七八九十]+', str(ratio_str))
    numbers = [_to_number(tok) for tok in tokens]

    if len(numbers) >= 2:
        return numbers[0], numbers[1]
    if len(numbers) == 1:
        return numbers[0], 1
    return 1, 1

def clean_year_price(year_str):
    """Turn a construction-year text such as '2002-2006年' into a single year.

    A text containing '-' and at least two 4-digit years yields the mean of
    the first two; otherwise the first 4-digit year is returned. Missing
    input, or text without a 4-digit run, yields NaN.
    """
    if pd.isnull(year_str):
        return np.nan

    text = str(year_str).replace('年', '')
    found = re.findall(r'(\d{4})', text)
    if not found:
        return np.nan
    if '-' in text and len(found) >= 2:
        first, second = float(found[0]), float(found[1])
        return (first + second) / 2
    return float(found[0])

In [96]:
def price_prediction(train_rent_df, data_dir='D:/ai finance/midterm'):
    """Train, evaluate and apply the sale-price (Price) models.

    Workflow: load the price train/test CSVs, engineer features on the
    concatenated frame, drop outlier rows from the training part, then fit
    OLS, Lasso, Ridge, ElasticNet and LightGBM on ``log1p(Price)``. Every MAE
    in the report is computed on the original price scale.

    For each model three figures are reported:
      * 'In sample' / 'out of sample': a copy of the model (with the selected
        hyper-parameters) is fit on the 80% training split only and scored on
        that split and on the untouched 20% hold-out split.
      * 'Cross-validation': 6-fold CV MAE on all cleaned training rows.
    Test-set predictions come from the model refit on all cleaned training
    rows.

    Args:
        train_rent_df: Rent training DataFrame with columns '板块' and
            'Price'. It is only read (never modified); the mean rent per
            '板块' is joined onto the price rows through '板块_comm'.
        data_dir: Directory containing the competition CSV files. The default
            is the location the notebook was written against.

    Returns:
        (report_df, all_predictions, test_ids):
            report_df - one row per model with the three MAE columns and a
                placeholder 'Kaggle Score' column;
            all_predictions - dict mapping 'OLS', 'Lasso', 'Ridge',
                'ElasticNet', 'LGBM' to arrays of predicted prices;
            test_ids - the 'ID' column of the test CSV.
    """
    # Local import: `clone` is not part of the notebook's import cell.
    from sklearn.base import clone

    # ---- Load ----
    train_price = load_data(f'{data_dir}/ruc_Class25Q2_train_price.csv')
    test_price = load_data(f'{data_dir}/ruc_Class25Q2_test_price.csv')

    # ---- Combine train and test so features are engineered once ----
    test_ids = test_price['ID']
    y_target = train_price['Price']
    y_target_log = np.log1p(y_target)

    train_price = train_price.drop('Price', axis=1)
    test_price = test_price.drop('ID', axis=1)
    train_price['source'] = 'train'
    test_price['source'] = 'test'

    data = pd.concat([train_price, test_price], ignore_index=True)

    # Columns the author classified as leaky or redundant; dropped if present.
    leaky_cols = [
        '房屋优势', '核心卖点', '户型介绍', '周边配套', '交通出行', '客户反馈',
        '物业办公电话', '产权描述', '开发商', '物业公司', '抵押信息', '上次交易',
        'coord_x', 'coord_y', '年份'
    ]
    cols_to_drop = [col for col in leaky_cols if col in data.columns]
    data_cleaned = data.drop(columns=cols_to_drop)

    # ---- Feature engineering ----
    print("Price-FE")
    df = data_cleaned.copy()

    def expand(series, parser, names):
        """Apply a multi-value parser row by row and return a float frame.

        Building one DataFrame from a list is far cheaper than creating a
        pd.Series per row via ``apply(lambda x: pd.Series(...))``.
        """
        parsed = [parser(value) for value in series]
        return pd.DataFrame(parsed, index=series.index, columns=names, dtype=float)

    layout_cols = ['Room', 'Hall', 'Kitchen', 'Bath']
    floor_cols = ['FloorLevel', 'TotalFloor']
    ratio_cols = ['ElevatorCount', 'HouseholdCount']
    df[layout_cols] = expand(df['房屋户型'], parse_layout_price, layout_cols)
    df[floor_cols] = expand(df['所在楼层'], parse_floor_price, floor_cols)
    df['BuildingArea'] = df['建筑面积'].apply(parse_area_price)
    df['InnerArea'] = df['套内面积'].apply(parse_area_price)
    df[ratio_cols] = expand(df['梯户比例'], parse_ratio_price, ratio_cols)

    df['TransactionTime'] = pd.to_datetime(df['交易时间'])
    df['TransactionYear'] = df['TransactionTime'].dt.year
    df['BuildingYear'] = df['建筑年代'].apply(clean_year_price)
    df['HouseAge'] = df['TransactionYear'] - df['BuildingYear']

    df['GreenRate'] = df['绿 化 率'].apply(clean_numeric_text)
    df['VolumeRate'] = df['容 积 率'].apply(clean_numeric_text)
    df['PropertyFee'] = df['物 业 费'].apply(clean_numeric_text)
    df['ParkingSpots'] = df['停车位'].apply(clean_numeric_text)
    df['ParkingFee'] = df['停车费用'].apply(clean_parking)

    # Interaction features
    df['TotalRooms'] = df['Room'] + df['Hall'] + df['Kitchen'] + df['Bath']
    df['InnerAreaRatio'] = df['InnerArea'] / (df['BuildingArea'] + 1e-6)

    # Cross-task feature: mean rent per '板块'. Work on a copy so the
    # caller's DataFrame is not modified as a side effect.
    rent_prices = train_rent_df[['板块', 'Price']].copy()
    rent_prices['Price'] = pd.to_numeric(rent_prices['Price'], errors='coerce')
    rent_agg = rent_prices.groupby('板块')['Price'].mean().reset_index()
    rent_agg = rent_agg.rename(columns={'Price': 'AvgRent_Community', '板块': '板块_comm'})
    df = df.merge(rent_agg, on='板块_comm', how='left')

    # ---- Drop raw text columns and classify the remaining features ----
    original_cols = [
        '房屋户型', '所在楼层', '建筑面积', '套内面积', '梯户比例', '交易时间',
        '建筑年代', '绿 化 率', '容 积 率', '物 业 费', '停车位', '停车费用',
        'TransactionTime'  # datetime column cannot be fed to the imputers
    ]
    df = df.drop(columns=[col for col in original_cols if col in df.columns])

    categorical_features = []
    numerical_features = []
    for col in df.columns:
        if col == 'source':
            continue
        # Rule of thumb: text columns and low-cardinality columns are categorical.
        if df[col].dtype == 'object' or df[col].nunique() < 50:
            categorical_features.append(col)
            # Label missing values BEFORE converting to text; astype(str)
            # first would turn NaN into the string 'nan' and make a later
            # fillna('Missing') a no-op.
            df[col] = df[col].map(
                lambda v: 'Missing' if pd.isnull(v) else str(v)
            ).astype('category')
        else:
            numerical_features.append(col)

    print(f"Price 数值特征: {len(numerical_features)}")
    print(f"Price 类别特征: {len(categorical_features)}")

    # ---- Outlier removal (training rows only) ----
    train_processed = df[df['source'] == 'train'].drop('source', axis=1)
    test_processed = df[df['source'] == 'test'].drop('source', axis=1)
    if len(train_processed) != len(y_target):
        raise ValueError("Training features and target have different lengths")

    Q1 = y_target.quantile(0.25)
    Q3 = y_target.quantile(0.75)
    IQR = Q3 - Q1
    price_mask = ((y_target >= (Q1 - 1.5 * IQR)) & (y_target <= (Q3 + 1.5 * IQR))).to_numpy()
    # Rows with a missing area are kept. A comparison against NaN is already
    # False, so calling fillna(True) on the comparison result cannot do that.
    area = train_processed['BuildingArea']
    area_mask = (area.isna() | ((area > 10) & (area < 1000))).to_numpy()
    # Positional (NumPy) masks: immune to any index mismatch between the
    # target Series and the feature frame.
    combined_mask = price_mask & area_mask

    train_processed = train_processed[combined_mask]
    y_target_log_clean = y_target_log[combined_mask]
    print(f"Price 移除异常值后，训练样本总数: {train_processed.shape[0]}")

    X = train_processed
    y = y_target_log_clean
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=111)

    # ---- Preprocessing pipelines ----
    print("Price-Model: 构建 Pipelines...")
    # Linear models: median imputation + scaling, mode imputation + one-hot.
    numeric_transformer_linear = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='median')),
        ('scaler', StandardScaler())
    ])
    categorical_transformer_linear = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='most_frequent')),
        ('onehot', OneHotEncoder(handle_unknown='ignore'))
    ])
    preprocessor_linear = ColumnTransformer(
        transformers=[
            ('num', numeric_transformer_linear, numerical_features),
            ('cat', categorical_transformer_linear, categorical_features)
        ], remainder='passthrough')

    # LightGBM: median imputation, mode imputation + ordinal codes.
    numeric_transformer_lgbm = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='median'))
    ])
    categorical_transformer_lgbm = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='most_frequent')),
        ('ordinal', OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1))
    ])
    preprocessor_lgbm = ColumnTransformer(
        transformers=[
            ('num', numeric_transformer_lgbm, numerical_features),
            ('cat', categorical_transformer_lgbm, categorical_features)
        ], remainder='passthrough')

    # ---- Training and evaluation ----
    results = {}
    all_predictions = {}

    def evaluate(report_label, prediction_key, estimator, param_grid=None, fit_params=None):
        """Score one model and store its report row and test predictions.

        ``fit_params`` are forwarded to the grid search and to the final /
        hold-out fits; they are not forwarded on the no-grid path's
        cross-validation (only OLS uses that path and it needs none).
        """
        fit_params = fit_params or {}
        if param_grid is None:
            cv_scores = -cross_val_score(estimator, X, y, cv=cv_6_folds,
                                         scoring=mae_scorer, n_jobs=-1)
            cv_mae = np.mean(cv_scores)
            final_model = clone(estimator).fit(X, y, **fit_params)
        else:
            search = GridSearchCV(estimator, param_grid, cv=cv_6_folds,
                                  scoring=mae_scorer, n_jobs=-1, verbose=0)
            search.fit(X, y, **fit_params)
            cv_mae = -search.best_score_
            final_model = search.best_estimator_  # refit on all of X

        # The hold-out figures must come from a model that never saw X_test:
        # final_model was fit on all of X (which contains X_test), so refit
        # an unfitted copy with the same hyper-parameters on X_train only.
        holdout_model = clone(final_model).fit(X_train, y_train, **fit_params)
        results[report_label] = {
            'In-sample MAE': mae_on_original_scale(y_train, holdout_model.predict(X_train)),
            'Out-of-sample MAE': mae_on_original_scale(y_test, holdout_model.predict(X_test)),
            '6-fold CV MAE': cv_mae
        }
        all_predictions[prediction_key] = np.expm1(final_model.predict(test_processed))

    # OLS
    print("Price-Model: 训练 OLS...")
    evaluate(
        'OLS (Price)', 'OLS',
        Pipeline(steps=[('preprocessor', preprocessor_linear), ('model', LinearRegression())])
    )

    # Lasso
    print("Price-Model: 训练 Lasso (GridSearch)...")
    evaluate(
        'Lasso (Price)', 'Lasso',
        Pipeline(steps=[('preprocessor', preprocessor_linear),
                        ('model', Lasso(random_state=111, max_iter=2000))]),
        param_grid={'model__alpha': [0.001, 0.01, 0.1]}
    )

    # Ridge
    print("Price-Model: 训练 Ridge (GridSearch)...")
    evaluate(
        'Ridge (Price)', 'Ridge',
        Pipeline(steps=[('preprocessor', preprocessor_linear),
                        ('model', Ridge(random_state=111))]),
        param_grid={'model__alpha': [0.1, 1.0, 10.0]}
    )

    # ElasticNet
    print("Price-Model: 训练 ElasticNet (GridSearch)...")
    evaluate(
        'Best Linear (ElasticNet-Price)', 'ElasticNet',
        Pipeline(steps=[('preprocessor', preprocessor_linear),
                        ('model', ElasticNet(random_state=111, max_iter=2000))]),
        param_grid={'model__alpha': [0.001, 0.01], 'model__l1_ratio': [0.1, 0.5, 0.9]}
    )

    # LightGBM. The preprocessor lives inside the pipeline so that every fit
    # (CV folds, hold-out, final) encodes the data it is trained on; fitting
    # the encoder separately on X_train and on X can assign different
    # ordinal codes to the same category.
    print("Price-Model: 训练 LightGBM (GridSearch)...")
    # The ColumnTransformer outputs numeric columns first, then categorical.
    cat_features_indices = list(range(len(numerical_features),
                                      len(numerical_features) + len(categorical_features)))
    evaluate(
        'Any Other Model (LGBM-Price)', 'LGBM',
        Pipeline(steps=[('preprocessor', preprocessor_lgbm),
                        ('model', lgb.LGBMRegressor(random_state=111, n_jobs=-1))]),
        param_grid={'model__n_estimators': [200, 500], 'model__learning_rate': [0.05, 0.1]},
        fit_params={'model__categorical_feature': cat_features_indices}
    )

    # ---- Report ----
    report_df = pd.DataFrame(results).T
    report_df = report_df.rename(columns={'In-sample MAE': 'In sample', 'Out-of-sample MAE': 'out of sample', '6-fold CV MAE': 'Cross-validation'})
    report_df['Kaggle Score'] = '[Submit]'

    print("房价 (Price) 预测任务完成。")
    return report_df, all_predictions, test_ids

## 租金 (Rent) 预测任务

In [100]:
# Rent特征工程
def parse_floor_rent(floor_str):
    """Parse a rent floor description such as '低楼层/18层' or '4/6层'.

    Args:
        floor_str: Raw cell value (string, number or missing).

    Returns:
        (level, total_floor, current_floor):
            level - 1/2/3 when the text contains '低'/'中'/'高' (checked in
                that order), else NaN;
            total_floor - the number captured from '/N层', else NaN;
            current_floor - the leading number before '/', else NaN.
    """
    if pd.isnull(floor_str):
        return np.nan, np.nan, np.nan
    # Coerce to text (numeric cells would raise TypeError below) and strip
    # surrounding whitespace so the '^' anchor still matches ' 4/6层'.
    floor_str = str(floor_str).strip()

    floor_level_map = {'低': 1, '中': 2, '高': 3}
    level = np.nan
    for key, val in floor_level_map.items():
        if key in floor_str:
            level = val
            break

    total_floor_match = re.search(r'/(\d+)层', floor_str)
    total_floor = float(total_floor_match.group(1)) if total_floor_match else np.nan

    current_floor_match = re.search(r'^(\d+)/', floor_str)
    current_floor = float(current_floor_match.group(1)) if current_floor_match else np.nan

    return level, total_floor, current_floor

def parse_layout_rent(layout_str):
    """Split a rent layout text such as '2室2厅1卫' into three counts.

    Returns a list ``[rooms, halls, baths]``. A unit that is not mentioned
    contributes 0; a missing input yields three NaNs.
    """
    if pd.isnull(layout_str):
        return [np.nan] * 3

    unit_patterns = (r'(\d+)室', r'(\d+)厅', r'(\d+)卫')
    counts = []
    for pattern in unit_patterns:
        hit = re.search(pattern, layout_str)
        counts.append(float(hit.group(1)) if hit else 0)
    return counts

def parse_area_rent(area_str):
    """Extract the numeric area from a rent text such as '86.94㎡'.

    Args:
        area_str: Raw cell value (string, number or missing).

    Returns:
        The first decimal number found, as a float; NaN for missing input or
        when the text contains no number.
    """
    if pd.isnull(area_str):
        return np.nan
    # Match only well-formed decimals. A loose class like [\d\.]+ would also
    # match "..." or "1.2.3" and make float() raise ValueError.
    area_match = re.search(r'\d+(?:\.\d+)?|\.\d+', str(area_str))
    return float(area_match.group(0)) if area_match else np.nan

def parse_facilities_rent(facilities_str):
    """Count the facilities listed in a '、'-separated text ('洗衣机、空调...').

    Args:
        facilities_str: Raw cell value (string or missing).

    Returns:
        Number of non-blank items; 0 for missing or empty input.
    """
    if pd.isnull(facilities_str):
        return 0
    # Ignore blank fragments: ''.split('、') yields [''] (length 1), and a
    # trailing separator would otherwise add a phantom facility.
    items = [item for item in str(facilities_str).split('、') if item.strip()]
    return len(items)

In [102]:
def rent_prediction(train_price_df, data_dir='D:/ai finance/midterm'):
    """Train, evaluate and apply the rent (Rent) models.

    Workflow: load the rent train/test CSVs, engineer features on the
    concatenated frame, drop outlier rows from the training part, then fit
    OLS, Lasso, Ridge, ElasticNet and LightGBM on ``log1p(Price)``. Every MAE
    in the report is computed on the original rent scale.

    For each model three figures are reported:
      * 'In sample' / 'out of sample': a copy of the model (with the selected
        hyper-parameters) is fit on the 80% training split only and scored on
        that split and on the untouched 20% hold-out split.
      * 'Cross-validation': 6-fold CV MAE on all cleaned training rows.
    Test-set predictions come from the model refit on all cleaned training
    rows.

    Args:
        train_price_df: Sale-price training DataFrame with columns
            '板块_comm' and 'Price'. It is only read (never modified); the
            mean price per '板块_comm' is joined onto the rent rows through
            '板块'.
        data_dir: Directory containing the competition CSV files. The default
            is the location the notebook was written against.

    Returns:
        (report_df, all_predictions, test_ids):
            report_df - one row per model with the three MAE columns and a
                placeholder 'Kaggle Score' column;
            all_predictions - dict mapping 'OLS', 'Lasso', 'Ridge',
                'ElasticNet', 'LGBM' to arrays of predicted rents;
            test_ids - the 'ID' column of the test CSV.
    """
    # Local import: `clone` is not part of the notebook's import cell.
    from sklearn.base import clone

    # ---- Load ----
    train_rent = load_data(f'{data_dir}/ruc_Class25Q2_train_rent.csv')
    test_rent = load_data(f'{data_dir}/ruc_Class25Q2_test_rent.csv')

    # ---- Combine train and test so features are engineered once ----
    test_ids = test_rent['ID']
    # The target must be numeric; rows whose Price cannot be parsed are dropped.
    train_rent['Price'] = pd.to_numeric(train_rent['Price'], errors='coerce')
    # Reset the index after dropping rows so the target stays positionally
    # aligned with the re-indexed feature frame built below.
    train_rent = train_rent.dropna(subset=['Price']).reset_index(drop=True)

    y_target = train_rent['Price']
    y_target_log = np.log1p(y_target)

    train_rent = train_rent.drop('Price', axis=1)
    test_rent = test_rent.drop('ID', axis=1)
    train_rent['source'] = 'train'
    test_rent['source'] = 'test'

    data = pd.concat([train_rent, test_rent], ignore_index=True)

    # Columns the author classified as leaky or redundant; dropped if present.
    leaky_cols = [
        '客户反馈', '物业办公电话', '产权描述', '开发商', '物业公司', 'coord_x', 'coord_y', '年份'
    ]
    cols_to_drop = [col for col in leaky_cols if col in data.columns]
    data_cleaned = data.drop(columns=cols_to_drop)

    # ---- Feature engineering ----
    print("Rent-FE")
    df = data_cleaned.copy()

    def expand(series, parser, names):
        """Apply a multi-value parser row by row and return a float frame.

        Building one DataFrame from a list is far cheaper than creating a
        pd.Series per row via ``apply(lambda x: pd.Series(...))``.
        """
        parsed = [parser(value) for value in series]
        return pd.DataFrame(parsed, index=series.index, columns=names, dtype=float)

    layout_cols = ['Room', 'Hall', 'Bath']
    floor_cols = ['FloorLevel', 'TotalFloor', 'CurrentFloor']
    df[layout_cols] = expand(df['户型'], parse_layout_rent, layout_cols)
    df[floor_cols] = expand(df['楼层'], parse_floor_rent, floor_cols)
    df['BuildingArea'] = df['面积'].apply(parse_area_rent)
    df['FacilitiesCount'] = df['配套设施'].apply(parse_facilities_rent)

    df['TransactionTime'] = pd.to_datetime(df['交易时间'])
    df['TransactionYear'] = df['TransactionTime'].dt.year
    df['TransactionMonth'] = df['TransactionTime'].dt.month
    df['BuildingYear'] = df['建筑年代'].apply(clean_year_price)  # same year format as the price data
    df['HouseAge'] = df['TransactionYear'] - df['BuildingYear']

    df['GreenRate'] = df['绿 化 率'].apply(clean_numeric_text)
    df['VolumeRate'] = df['容 积 率'].apply(clean_numeric_text)
    df['PropertyFee'] = df['物 业 费'].apply(clean_numeric_text)
    df['ParkingSpots'] = df['停车位'].apply(clean_numeric_text)
    df['ParkingFee'] = df['停车费用'].apply(clean_parking)

    # Interaction features
    df['TotalRooms'] = df['Room'] + df['Hall'] + df['Bath']
    df['FloorRatio'] = df['CurrentFloor'] / (df['TotalFloor'] + 1e-6)

    # Cross-task feature: mean sale price per '板块_comm'. Work on a copy so
    # the caller's DataFrame is not modified as a side effect.
    sale_prices = train_price_df[['板块_comm', 'Price']].copy()
    sale_prices['Price'] = pd.to_numeric(sale_prices['Price'], errors='coerce')
    price_agg = sale_prices.groupby('板块_comm')['Price'].mean().reset_index()
    price_agg = price_agg.rename(columns={'Price': 'AvgPrice_Community', '板块_comm': '板块'})
    df = df.merge(price_agg, on='板块', how='left')

    # ---- Drop raw text columns and classify the remaining features ----
    original_cols = [
        '户型', '楼层', '面积', '配套设施', '交易时间', '建筑年代',
        '绿 化 率', '容 积 率', '物 业 费', '停车位', '停车费用',
        'TransactionTime'  # datetime column cannot be fed to the imputers
    ]
    df = df.drop(columns=[col for col in original_cols if col in df.columns])

    categorical_features = []
    numerical_features = []
    for col in df.columns:
        if col == 'source':
            continue
        # '板块' and '区县' are stored as numbers in the rent data but are
        # identifiers, so they are always categorical. Otherwise: text
        # columns and low-cardinality columns are categorical.
        forced_categorical = col in ('板块', '区县')
        if forced_categorical or df[col].dtype == 'object' or df[col].nunique() < 50:
            categorical_features.append(col)
            # Label missing values BEFORE converting to text; astype(str)
            # first would turn NaN into the string 'nan' and make a later
            # fillna('Missing') a no-op.
            df[col] = df[col].map(
                lambda v: 'Missing' if pd.isnull(v) else str(v)
            ).astype('category')
        else:
            numerical_features.append(col)

    print(f"Rent 数值特征: {len(numerical_features)}")
    print(f"Rent 类别特征: {len(categorical_features)}")

    # ---- Outlier removal (training rows only) and split ----
    train_processed = df[df['source'] == 'train'].drop('source', axis=1)
    test_processed = df[df['source'] == 'test'].drop('source', axis=1)
    if len(train_processed) != len(y_target):
        raise ValueError("Training features and target have different lengths")

    Q1 = y_target.quantile(0.25)
    Q3 = y_target.quantile(0.75)
    IQR = Q3 - Q1
    price_mask = ((y_target >= (Q1 - 1.5 * IQR)) & (y_target <= (Q3 + 1.5 * IQR))).to_numpy()
    # Rows with a missing area are kept. A comparison against NaN is already
    # False, so calling fillna(True) on the comparison result cannot do that.
    area = train_processed['BuildingArea']
    area_mask = (area.isna() | ((area > 5) & (area < 500))).to_numpy()
    # Positional (NumPy) masks: immune to any index mismatch between the
    # target Series and the feature frame.
    combined_mask = price_mask & area_mask

    train_processed = train_processed[combined_mask]
    y_target_log_clean = y_target_log[combined_mask]
    print(f"Rent 移除异常值后，训练样本总数: {train_processed.shape[0]}")

    X = train_processed
    y = y_target_log_clean
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=111)

    # ---- Preprocessing pipelines ----
    print("Rent-Model: 构建 Pipelines...")
    # Linear models: median imputation + scaling, mode imputation + one-hot.
    numeric_transformer_linear = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='median')),
        ('scaler', StandardScaler())
    ])
    categorical_transformer_linear = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='most_frequent')),
        ('onehot', OneHotEncoder(handle_unknown='ignore'))
    ])
    preprocessor_linear = ColumnTransformer(
        transformers=[
            ('num', numeric_transformer_linear, numerical_features),
            ('cat', categorical_transformer_linear, categorical_features)
        ], remainder='passthrough')

    # LightGBM: median imputation, mode imputation + ordinal codes.
    numeric_transformer_lgbm = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='median'))
    ])
    categorical_transformer_lgbm = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='most_frequent')),
        ('ordinal', OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1))
    ])
    preprocessor_lgbm = ColumnTransformer(
        transformers=[
            ('num', numeric_transformer_lgbm, numerical_features),
            ('cat', categorical_transformer_lgbm, categorical_features)
        ], remainder='passthrough')

    # ---- Training and evaluation ----
    results = {}
    all_predictions = {}

    def evaluate(report_label, prediction_key, estimator, param_grid=None, fit_params=None):
        """Score one model and store its report row and test predictions.

        ``fit_params`` are forwarded to the grid search and to the final /
        hold-out fits; they are not forwarded on the no-grid path's
        cross-validation (only OLS uses that path and it needs none).
        """
        fit_params = fit_params or {}
        if param_grid is None:
            cv_scores = -cross_val_score(estimator, X, y, cv=cv_6_folds,
                                         scoring=mae_scorer, n_jobs=-1)
            cv_mae = np.mean(cv_scores)
            final_model = clone(estimator).fit(X, y, **fit_params)
        else:
            search = GridSearchCV(estimator, param_grid, cv=cv_6_folds,
                                  scoring=mae_scorer, n_jobs=-1, verbose=0)
            search.fit(X, y, **fit_params)
            cv_mae = -search.best_score_
            final_model = search.best_estimator_  # refit on all of X

        # The hold-out figures must come from a model that never saw X_test:
        # final_model was fit on all of X (which contains X_test), so refit
        # an unfitted copy with the same hyper-parameters on X_train only.
        holdout_model = clone(final_model).fit(X_train, y_train, **fit_params)
        results[report_label] = {
            'In-sample MAE': mae_on_original_scale(y_train, holdout_model.predict(X_train)),
            'Out-of-sample MAE': mae_on_original_scale(y_test, holdout_model.predict(X_test)),
            '6-fold CV MAE': cv_mae
        }
        all_predictions[prediction_key] = np.expm1(final_model.predict(test_processed))

    # OLS
    print("Rent-Model: 训练 OLS...")
    evaluate(
        'OLS (Rent)', 'OLS',
        Pipeline(steps=[('preprocessor', preprocessor_linear), ('model', LinearRegression())])
    )

    # Lasso
    print("Rent-Model: 训练 Lasso (GridSearch)...")
    evaluate(
        'Lasso (Rent)', 'Lasso',
        Pipeline(steps=[('preprocessor', preprocessor_linear),
                        ('model', Lasso(random_state=111, max_iter=2000))]),
        param_grid={'model__alpha': [0.001, 0.01]}
    )

    # Ridge
    print("Rent-Model: 训练 Ridge (GridSearch)...")
    evaluate(
        'Ridge (Rent)', 'Ridge',
        Pipeline(steps=[('preprocessor', preprocessor_linear),
                        ('model', Ridge(random_state=111))]),
        param_grid={'model__alpha': [0.1, 1.0]}
    )

    # ElasticNet
    print("Rent-Model: 训练 ElasticNet (GridSearch)...")
    evaluate(
        'Best Linear (ElasticNet-Rent)', 'ElasticNet',
        Pipeline(steps=[('preprocessor', preprocessor_linear),
                        ('model', ElasticNet(random_state=111, max_iter=2000))]),
        param_grid={'model__alpha': [0.001, 0.01], 'model__l1_ratio': [0.1, 0.5, 0.9]}
    )

    # LightGBM. The preprocessor lives inside the pipeline so that every fit
    # (CV folds, hold-out, final) encodes the data it is trained on; fitting
    # the encoder separately on X_train and on X can assign different
    # ordinal codes to the same category.
    print("Rent-Model: 训练 LightGBM (GridSearch)...")
    # The ColumnTransformer outputs numeric columns first, then categorical.
    cat_features_indices = list(range(len(numerical_features),
                                      len(numerical_features) + len(categorical_features)))
    evaluate(
        'Any Other Model (LGBM-Rent)', 'LGBM',
        Pipeline(steps=[('preprocessor', preprocessor_lgbm),
                        ('model', lgb.LGBMRegressor(random_state=111, n_jobs=-1))]),
        param_grid={'model__n_estimators': [200, 500], 'model__learning_rate': [0.05, 0.1]},
        fit_params={'model__categorical_feature': cat_features_indices}
    )

    # ---- Report ----
    report_df = pd.DataFrame(results).T
    report_df = report_df.rename(columns={'In-sample MAE': 'In sample', 'Out-of-sample MAE': 'out of sample', '6-fold CV MAE': 'Cross-validation'})
    report_df['Kaggle Score'] = '[Submit]'

    print("租金 (Rent) 预测任务完成。")
    return report_df, all_predictions, test_ids

In [106]:
def main(output_dir='D:/ai finance/midterm/output'):
    """Run both prediction tasks, print their reports and write submissions.

    For every model key returned by the price task, the price and rent
    predictions are stacked (price rows first) into one 'ID'/'Price' table
    and written to ``submission_combined_<model>.xlsx`` in ``output_dir``.

    Args:
        output_dir: Directory that receives the submission files. It is
            created if it does not exist. The default is the location the
            notebook was written against.
    """
    # Local import: pathlib is not part of the notebook's import cell.
    from pathlib import Path

    train_rent_base = load_data('D:/ai finance/midterm/ruc_Class25Q2_train_rent.csv')
    train_price_base = load_data('D:/ai finance/midterm/ruc_Class25Q2_train_price.csv')

    # Price task (uses the rent data for its cross-task feature)
    report_price, preds_price, ids_price = price_prediction(train_rent_base)

    # Rent task (uses the price data for its cross-task feature)
    report_rent, preds_rent, ids_rent = rent_prediction(train_price_base)

    print("\n\n" + "="*50)
    print("     最终报告: 房价 (Price) 模型性能 (MAE)")
    print("="*50)
    print(report_price.to_markdown(floatfmt=",.0f"))

    print("\n\n" + "="*50)
    print("     最终报告: 租金 (Rent) 模型性能 (MAE)")
    print("="*50)
    print(report_rent.to_markdown(floatfmt=",.0f"))

    # Without this, to_excel fails on a machine where the folder is missing.
    Path(output_dir).mkdir(parents=True, exist_ok=True)

    # One combined submission per model; both tasks return the same keys.
    model_keys = list(preds_price.keys())

    for model_name in model_keys:
        sub_price_df = pd.DataFrame({'ID': ids_price, 'Price': preds_price[model_name]})
        sub_rent_df = pd.DataFrame({'ID': ids_rent, 'Price': preds_rent[model_name]})
        # expm1 of a log-scale prediction can be slightly negative; floor at 0.
        sub_price_df['Price'] = sub_price_df['Price'].clip(lower=0)
        sub_rent_df['Price'] = sub_rent_df['Price'].clip(lower=0)
        submission_combined = pd.concat([sub_price_df, sub_rent_df], ignore_index=True)
        filename = f'{output_dir}/submission_combined_{model_name}.xlsx'
        submission_combined.to_excel(filename, index=False)
        print(f"已生成: {filename} (总行数: {len(submission_combined)})")

# Run the full workflow when this cell/script is executed directly.
if __name__ == "__main__":
    main()

Price-FE
Price 数值特征: 19
Price 类别特征: 29
Price 移除异常值后，训练样本总数: 96048
Price-Model: 构建 Pipelines...
Price-Model: 训练 OLS...
Price-Model: 训练 Lasso (GridSearch)...
Price-Model: 训练 Ridge (GridSearch)...
Price-Model: 训练 ElasticNet (GridSearch)...
Price-Model: 训练 LightGBM (GridSearch)...
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.008491 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 5929
[LightGBM] [Info] Number of data points in the train set: 96048, number of used features: 48
[LightGBM] [Info] Start training from score 14.127254
房价 (Price) 预测任务完成。
Rent-FE
Rent 数值特征: 13
Rent 类别特征: 33
Rent 移除异常值后，训练样本总数: 93365
Rent-Model: 构建 Pipelines...
Rent-Model: 训练 OLS...
Rent-Model: 训练 Lasso (GridSearch)...
Rent-Model: 训练 Ridge (GridSearch)...
Rent-Model: 训练 ElasticNet (GridSearch)...
Rent-Model: 训练 LightGBM (GridSearch)...
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.0178