In [14]:
import pandas as pd
import numpy as np
import re
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression, Lasso, Ridge, ElasticNet
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import mean_absolute_error, r2_score
import warnings
warnings.filterwarnings('ignore')

def safe_numeric_conversion(series):
    """
    Convert a text column whose values may carry unit suffixes into floats.

    The first number found in each cell is kept and the remaining text is
    ignored, so '123.7㎡' becomes 123.7 and '1731户' becomes 1731.0.
    A leading sign, comma thousands separators ('1,234.5') and bare
    fractions ('.5') are understood.

    Args:
        series: pandas Series. A Series whose dtype is not ``object`` is
            returned unchanged.

    Returns:
        A float Series with NaN wherever the cell was missing, blank, or
        contained no number; or the input itself when it was not of object
        dtype.

    Side effects:
        Prints the share of cells that yielded a number.
    """
    if series.dtype != 'object':
        return series

    # Integer part is either a comma-grouped number or a plain digit run,
    # followed by an optional fraction. The plain-digit alternative is what
    # keeps the fraction of values such as '12345.67' attached to the match.
    number_pattern = re.compile(
        r'[-+]?(?:\d{1,3}(?:,\d{3})+|\d+)(?:\.\d+)?|[-+]?\.\d+'
    )

    def extract_number(value):
        """Return the first number in *value* as a float, or NaN."""
        if pd.isna(value) or value is None:
            return np.nan

        text = str(value).strip()
        if text in ('', 'nan', 'None'):
            return np.nan

        match = number_pattern.search(text)
        if match is None:
            return np.nan

        try:
            # Strip thousands separators before handing the text to float().
            return float(match.group().replace(',', ''))
        except ValueError:
            return np.nan

    converted_series = series.apply(extract_number).astype(float)

    # Report how many cells were parsed; an empty column counts as 0%.
    total = len(series)
    success_rate = converted_series.notna().sum() / total if total else 0.0
    print(f"列转换成功率: {success_rate:.2%}")

    return converted_series

def enhanced_data_preprocessing(train_df, test_df):
    """
    Clean the raw train/test frames and classify feature columns by type.

    Steps:
      1. Drop columns that are missing in more than 70% of the training rows.
      2. Classify every remaining column except 'Price' and 'ID' as numeric
         or categorical, based on keywords in the column name and on the
         first ten non-null training values.
      3. Numeric candidates: parse with ``safe_numeric_conversion`` and fill
         missing values in both frames with the training median. A column
         whose processing raises is moved to the categorical list.
      4. Categorical candidates: cast to string and replace missing values
         with 'Unknown'.

    Args:
        train_df: training DataFrame (not modified).
        test_df: test DataFrame (not modified).

    Returns:
        Tuple ``(train_processed, test_processed, numerical_candidates,
        categorical_candidates)``.

    Side effects:
        Prints progress messages.
    """
    train_processed = train_df.copy()
    test_processed = test_df.copy()

    # 1. Drop columns that are mostly empty in the training data.
    missing_ratio = train_processed.isnull().sum() / len(train_processed)
    high_missing_cols = missing_ratio[missing_ratio > 0.7].index.tolist()
    if high_missing_cols:
        print(f"删除高缺失率列: {high_missing_cols}")
        train_processed = train_processed.drop(columns=high_missing_cols)
        # The test frame is not guaranteed to contain every training column.
        test_processed = test_processed.drop(columns=high_missing_cols, errors='ignore')

    # 2. Content- and name-based type classification.
    numeric_name_terms = ['面积', '价格', '费用', '数量', '数', '年', '月', '费', '价', '金额', '距离', '高度', '宽度', '长度']
    numerical_candidates = []
    categorical_candidates = []

    for col in train_processed.columns:
        if col == 'Price' or col == 'ID':
            continue

        sample_data = train_processed[col].dropna().head(10)
        col_lower = str(col).lower()
        is_likely_numeric = (
            any(term in col_lower for term in numeric_name_terms) or
            any(isinstance(x, (int, float)) and not isinstance(x, bool) for x in sample_data if pd.notna(x)) or
            any(str(x).replace('.', '').replace('-', '').isdigit() for x in sample_data if pd.notna(x) and str(x) != '')
        )

        if is_likely_numeric:
            numerical_candidates.append(col)
        else:
            categorical_candidates.append(col)

    print(f"数值型候选列: {len(numerical_candidates)}")
    print(f"分类变量候选列: {len(categorical_candidates)}")

    # 3. Numeric columns. Iterate over a snapshot: a failing column is removed
    #    from numerical_candidates inside the loop, and removing from the list
    #    being iterated would silently skip the following column.
    for col in list(numerical_candidates):
        print(f"处理数值列: {col}")
        try:
            train_values = safe_numeric_conversion(train_processed[col])
            test_values = safe_numeric_conversion(test_processed[col])

            # Fill gaps on either side with the training median. Plain
            # assignment is used because `df[col].fillna(..., inplace=True)`
            # acts on a temporary under pandas Copy-on-Write.
            if train_values.isnull().any() or test_values.isnull().any():
                median_val = train_values.median()
                train_values = train_values.fillna(median_val)
                test_values = test_values.fillna(median_val)

            train_processed[col] = train_values
            test_processed[col] = test_values

        except Exception as e:
            print(f"数值列 {col} 处理失败: {e}")
            # Conversion failed: treat the column as categorical instead.
            categorical_candidates.append(col)
            numerical_candidates.remove(col)

    # 4. Categorical columns: unify to strings with an explicit 'Unknown'.
    placeholder_map = {'nan': 'Unknown', 'None': 'Unknown'}

    def to_clean_text(column):
        """Cast *column* to str and label every missing value 'Unknown'."""
        return column.astype(str).replace(placeholder_map).fillna('Unknown')

    for col in categorical_candidates:
        if col not in train_processed.columns:
            continue
        try:
            train_processed[col] = to_clean_text(train_processed[col])
            test_processed[col] = to_clean_text(test_processed[col])
        except Exception as e:
            print(f"分类列 {col} 处理失败: {e}")

    return train_processed, test_processed, numerical_candidates, categorical_candidates

def advanced_categorical_encoding(X_train, X_test, y_train, categorical_cols, max_cardinality=50):
    """
    Replace categorical columns with numeric encodings learned on the
    training data.

    For every listed column present in ``X_train``:
      * ``<col>_freq``   - number of training rows sharing the value
                           (0 for values unseen in training);
      * ``<col>_target`` - only when the column has at most 10 distinct
                           values and ``y_train`` is given: per-category
                           target mean blended 5:1 with the global mean
                           (unseen values get the global mean);
      * ``<col>_hash``   - only when the column has more than 20 distinct
                           values: a bucket id in [0, 1000).

    The original categorical columns are removed from the result.

    Args:
        X_train: training features.
        X_test: test features.
        y_train: target Series aligned with ``X_train`` by index, or None
            to skip target encoding.
        categorical_cols: names of the columns to encode; names that are not
            present are ignored.
        max_cardinality: accepted for interface compatibility; not used by
            the current thresholds.

    Returns:
        Tuple ``(X_train_encoded, X_test_encoded)``.

    Side effects:
        Prints progress messages.
    """
    # Local import keeps the block self-contained. CRC32 is used because the
    # built-in hash() of a str is salted per interpreter process, which would
    # make the bucket feature differ from run to run.
    import zlib

    def stable_bucket(value):
        """Map *value* to a run-independent bucket in [0, 1000)."""
        return zlib.crc32(str(value).encode('utf-8')) % 1000

    X_train_encoded = X_train.copy()
    X_test_encoded = X_test.copy()

    print("开始高级分类变量编码...")

    for col in categorical_cols:
        if col not in X_train_encoded.columns:
            continue
        try:
            train_values = X_train_encoded[col].astype(str)
            test_values = X_test_encoded[col].astype(str)
            X_train_encoded[col] = train_values
            X_test_encoded[col] = test_values

            n_unique = train_values.nunique()
            print(f"编码列 {col} (唯一值数: {n_unique})")

            # Frequency encoding applies to every cardinality.
            freq_map = train_values.value_counts().to_dict()
            X_train_encoded[f'{col}_freq'] = train_values.map(freq_map)
            X_test_encoded[f'{col}_freq'] = test_values.map(lambda x: freq_map.get(x, 0))

            if n_unique <= 20:
                # Very low cardinality: add a smoothed target encoding.
                if n_unique <= 10 and y_train is not None:
                    target_mean = y_train.groupby(train_values).mean()
                    global_mean = y_train.mean()
                    # Built once; the per-row lookup below only reads it.
                    smooth_map = ((target_mean * 5 + global_mean * 1) / 6).to_dict()

                    X_train_encoded[f'{col}_target'] = train_values.map(smooth_map)
                    X_test_encoded[f'{col}_target'] = test_values.map(
                        lambda x: smooth_map.get(x, global_mean)
                    )
            else:
                # High cardinality: add a hashed bucket feature.
                X_train_encoded[f'{col}_hash'] = train_values.apply(stable_bucket)
                X_test_encoded[f'{col}_hash'] = test_values.apply(stable_bucket)

        except Exception as e:
            print(f"列 {col} 编码失败: {e}")
            continue

    # Drop the raw categorical columns; listed names that are absent are
    # tolerated here just as they are in the loop above.
    X_train_encoded = X_train_encoded.drop(columns=categorical_cols, errors='ignore')
    X_test_encoded = X_test_encoded.drop(columns=categorical_cols, errors='ignore')

    print(f"编码完成，特征数: {X_train_encoded.shape[1]}")
    return X_train_encoded, X_test_encoded

def robust_feature_engineering(X_train, X_test, y_train=None):
    """
    Add derived area and date features to copies of the train/test frames.

    * Every column whose name contains '面积' gets a ``<col>_分段`` feature:
      the index of the bin (0, 50], (50, 80], (80, 120], (120, 200],
      (200, inf) the value falls in; values outside all bins or missing
      map to 0. Object-dtype area columns are first parsed with
      ``safe_numeric_conversion``.
    * The first two columns whose name contains '时间', '日期', 'year' or
      'date' are parsed as dates (unparseable values become NaT), the
      column itself is replaced by the parsed dates, and ``<col>_年份`` /
      ``<col>_月份`` are added with 2023 / 6 as fallbacks. Columns that are
      already numeric are left untouched, because numbers would be read as
      nanoseconds since 1970 and yield constant features.

    Args:
        X_train: training features (not modified).
        X_test: test features (not modified).
        y_train: unused; accepted for interface compatibility.

    Returns:
        Tuple ``(X_train_fe, X_test_fe)``.

    Side effects:
        Prints progress and failure messages.
    """
    X_train_fe = X_train.copy()
    X_test_fe = X_test.copy()

    print("开始鲁棒特征工程...")

    # 1. Area binning.
    area_bins = [0, 50, 80, 120, 200, np.inf]
    area_cols = [col for col in X_train_fe.columns if '面积' in str(col)]
    for area_col in area_cols:
        try:
            # Parse each side independently so a text column in only one of
            # the frames is still converted.
            for frame in (X_train_fe, X_test_fe):
                if frame[area_col].dtype == 'object':
                    frame[area_col] = safe_numeric_conversion(frame[area_col])

            # Accept any integer/float width, not just the 64-bit ones.
            if X_train_fe[area_col].dtype.kind in 'iuf':
                # Compute both sides before assigning so the two frames
                # always end up with the same set of columns.
                train_bins = pd.cut(X_train_fe[area_col], bins=area_bins, labels=False).fillna(0)
                test_bins = pd.cut(X_test_fe[area_col], bins=area_bins, labels=False).fillna(0)
                X_train_fe[f'{area_col}_分段'] = train_bins
                X_test_fe[f'{area_col}_分段'] = test_bins

        except Exception as e:
            print(f"面积特征 {area_col} 处理失败: {e}")

    # 2. Date parts for at most two date-like columns.
    time_terms = ['时间', '日期', 'year', 'date']
    time_cols = [col for col in X_train_fe.columns if any(term in str(col) for term in time_terms)]
    for time_col in time_cols[:2]:
        # Numeric values are not date strings; see the docstring.
        if X_train_fe[time_col].dtype.kind in 'iuf':
            continue
        try:
            train_dates = pd.to_datetime(X_train_fe[time_col], errors='coerce')
            test_dates = pd.to_datetime(X_test_fe[time_col], errors='coerce')

            X_train_fe[time_col] = train_dates
            X_test_fe[time_col] = test_dates

            X_train_fe[f'{time_col}_年份'] = train_dates.dt.year.fillna(2023)
            X_train_fe[f'{time_col}_月份'] = train_dates.dt.month.fillna(6)

            X_test_fe[f'{time_col}_年份'] = test_dates.dt.year.fillna(2023)
            X_test_fe[f'{time_col}_月份'] = test_dates.dt.month.fillna(6)

        except Exception as e:
            print(f"时间特征 {time_col} 处理失败: {e}")

    return X_train_fe, X_test_fe

def safe_feature_preparation(X_train, y_train, X_test):
    """
    Turn encoded features and the raw target into standardized model inputs.

    Steps:
      1. Drop training rows whose target lies outside
         [q05 - 1.5 * (q95 - q05), q95 + 1.5 * (q95 - q05)].
      2. Coerce every feature to numeric; unparseable, missing and infinite
         values become 0.
      3. Keep features whose training variance exceeds 0.01 (all features
         if none qualify).
      4. Standardize the features with statistics from the filtered
         training rows, and apply the same transform to the test frame.
         Selected features absent from the test frame are added as 0.
      5. Transform the target with log1p and standardize it.

    Args:
        X_train: training features; must share its index with ``y_train``.
        y_train: target Series.
        X_test: test features.

    Returns:
        Tuple ``(X_train_std, y_std, X_test_std, target_scaler)`` where the
        two frames carry the selected feature names, ``y_std`` is a 1-D
        array, and ``target_scaler`` is the fitted StandardScaler needed to
        invert the target standardization.

    Side effects:
        Prints progress messages.
    """
    print("准备训练特征...")

    # 1. Lenient outlier filter built on the 5th/95th percentiles.
    low_q, high_q = y_train.quantile(0.05), y_train.quantile(0.95)
    spread = high_q - low_q
    lower, upper = low_q - 1.5 * spread, high_q + 1.5 * spread

    keep = (y_train >= lower) & (y_train <= upper)
    X_clean = X_train[keep]
    y_clean = y_train[keep]

    print(f"异常值过滤: {len(X_train)} -> {len(X_clean)} 条记录")

    def to_finite_numeric(frame):
        """Coerce *frame* to numeric and replace NaN / +-inf with 0."""
        numeric = frame.apply(pd.to_numeric, errors='coerce')
        # Infinite values would make the scaler raise, so treat them like
        # missing values.
        return numeric.replace([np.inf, -np.inf], np.nan).fillna(0)

    # 2. Force everything to finite numbers.
    X_clean = to_finite_numeric(X_clean)
    X_test_clean = to_finite_numeric(X_test)

    # 3. Simple variance-based feature selection.
    variance_threshold = 0.01
    variances = X_clean.var()
    selected_features = variances[variances > variance_threshold].index.tolist()
    if not selected_features:
        selected_features = X_clean.columns.tolist()

    print(f"特征选择后特征数: {len(selected_features)}")

    # A feature may exist only in the training frame (for example when an
    # upstream step failed for the test side). Add it as a constant 0 and
    # report it, rather than aborting with a KeyError.
    missing_in_test = [col for col in selected_features if col not in X_test_clean.columns]
    if missing_in_test:
        print(f"测试集缺少特征，已用0填充: {missing_in_test}")
        for col in missing_in_test:
            X_test_clean[col] = 0

    # 4. Standardize features using training statistics only.
    scaler = StandardScaler()
    X_train_std = scaler.fit_transform(X_clean[selected_features])
    X_test_std = scaler.transform(X_test_clean[selected_features])

    # 5. log1p then standardize the target.
    y_log = np.log1p(y_clean)
    target_scaler = StandardScaler()
    y_std = target_scaler.fit_transform(y_log.values.reshape(-1, 1)).flatten()

    return (pd.DataFrame(X_train_std, columns=selected_features),
            y_std,
            pd.DataFrame(X_test_std, columns=selected_features),
            target_scaler)

def create_text_table(model_results):
    """
    Render model evaluation metrics as a bordered plain-text table.

    Column widths are computed from the on-screen width of each cell, so
    the double-width CJK header stays aligned with the ASCII rows.

    Args:
        model_results: mapping of model name to a dict with the keys
            'in_sample_mae', 'out_sample_mae', 'cv_mae' and 'r2_score'.
            May be empty, in which case only the header is rendered.

    Returns:
        The table as a single string with lines joined by a newline and no
        trailing newline.
    """
    # Local import keeps the block self-contained.
    import unicodedata

    def display_width(text):
        """Return the number of terminal columns *text* occupies."""
        return sum(
            2 if unicodedata.east_asian_width(ch) in ('W', 'F') else 1
            for ch in text
        )

    def pad(text, width):
        """Left-justify *text* to *width* terminal columns."""
        return text + " " * max(width - display_width(text), 0)

    headers = ["模型", "In sample MAE", "Out of sample MAE", "Cross-validation MAE", "R² Score"]
    metric_keys = ('in_sample_mae', 'out_sample_mae', 'cv_mae', 'r2_score')

    rows = [
        [str(model_name)] + [f"{results[key]:.4f}" for key in metric_keys]
        for model_name, results in model_results.items()
    ]

    # Widest cell per column (header included) plus two columns of margin.
    col_widths = [
        max(display_width(cell) for cell in column) + 2
        for column in zip(headers, *rows)
    ]

    separator = "+" + "".join("-" * width + "+" for width in col_widths)

    def render(cells):
        """Format one table line: a leading space, then the padded cell."""
        return "|" + "".join(
            f" {pad(cell, width - 1)}|" for cell, width in zip(cells, col_widths)
        )

    lines = [separator, render(headers), separator]
    lines.extend(render(row) for row in rows)
    lines.append(separator)

    return "\n".join(lines)

def _evaluate_models(models, X_tr, X_val, y_tr, y_val):
    """
    Fit each candidate model and collect its evaluation metrics.

    Args:
        models: mapping of display name to an unfitted estimator.
        X_tr, y_tr: training split.
        X_val, y_val: hold-out split.

    Returns:
        Tuple ``(model_results, best_model)``. ``model_results`` maps each
        name to its in-sample MAE, hold-out MAE, 5-fold CV MAE and hold-out
        R². ``best_model`` is the estimator with the lowest hold-out MAE,
        or None when every model failed.
    """
    model_results = {}
    best_model = None
    best_score = float('inf')  # hold-out MAE, lower is better

    for name, model in models.items():
        try:
            model.fit(X_tr, y_tr)

            # In-sample error on the training split.
            in_sample_mae = mean_absolute_error(y_tr, model.predict(X_tr))

            # Out-of-sample error on the hold-out split.
            y_val_pred = model.predict(X_val)
            out_sample_mae = mean_absolute_error(y_val, y_val_pred)

            # The scorer returns negated MAE; flip the sign back.
            cv_scores = cross_val_score(
                model, X_tr, y_tr, cv=5, scoring='neg_mean_absolute_error'
            )
            cv_mae = -cv_scores.mean()

            r2 = r2_score(y_val, y_val_pred)

            model_results[name] = {
                'in_sample_mae': in_sample_mae,
                'out_sample_mae': out_sample_mae,
                'cv_mae': cv_mae,
                'r2_score': r2
            }

            if out_sample_mae < best_score:
                best_score = out_sample_mae
                best_model = model

            print(f"{name}模型评估完成")

        except Exception as e:
            print(f"模型 {name} 训练失败: {e}")
            # Report the failure as NaN. Placeholder numbers would be
            # indistinguishable from real measurements in the table.
            model_results[name] = {
                'in_sample_mae': float('nan'),
                'out_sample_mae': float('nan'),
                'cv_mae': float('nan'),
                'r2_score': float('nan')
            }

    return model_results, best_model


def main():
    """
    Run the full pipeline for the price and rent datasets.

    For each task: preprocess, engineer features, encode categoricals,
    standardize, split 80/20, evaluate OLS / LASSO / Ridge / ElasticNet,
    print the metrics table, refit the best model on all prepared training
    rows and predict the test set. A task that raises is reported and
    skipped.

    Returns:
        The combined submission DataFrame (columns 'ID' and 'Price', sorted
        by 'ID'), which is also written to
        'improved_submission_with_text_table.csv'; or None when no task
        produced predictions.
    """
    # Load the four CSV files from the working directory.
    datasets = {
        'price': {
            'train': pd.read_csv('ruc_Class25Q2_train_price.csv'),
            'test': pd.read_csv('ruc_Class25Q2_test_price.csv')
        },
        'rent': {
            'train': pd.read_csv('ruc_Class25Q2_train_rent.csv'),
            'test': pd.read_csv('ruc_Class25Q2_test_rent.csv')
        }
    }

    all_predictions = []
    all_tables = []

    for task_name, data in datasets.items():
        print(f"\n{'='*60}")
        print(f"处理 {task_name.upper()} 数据")
        print(f"{'='*60}")

        try:
            # 1. Preprocessing.
            train_processed, test_processed, numerical_cols, categorical_cols = enhanced_data_preprocessing(
                data['train'], data['test']
            )

            # Separate the target and the test identifiers from the features.
            y_train = train_processed['Price']
            X_train = train_processed.drop(columns=['Price'])

            if 'ID' in test_processed.columns:
                test_ids = test_processed['ID']
                X_test = test_processed.drop(columns=['ID'])
            else:
                test_ids = None
                X_test = test_processed

            print(f"预处理完成 - 特征数: {X_train.shape[1]}")

            # 2. Feature engineering.
            X_train_fe, X_test_fe = robust_feature_engineering(X_train, X_test, y_train)

            # 3. Categorical encoding.
            X_train_encoded, X_test_encoded = advanced_categorical_encoding(
                X_train_fe, X_test_fe, y_train, categorical_cols
            )

            # 4. Outlier filtering, feature selection and scaling.
            X_train_std, y_train_std, X_test_std, target_scaler = safe_feature_preparation(
                X_train_encoded, y_train, X_test_encoded
            )

            # 5. 80/20 train / hold-out split.
            X_tr, X_val, y_tr, y_val = train_test_split(
                X_train_std, y_train_std, test_size=0.2, random_state=42, shuffle=True
            )

            print(f"训练集大小: {X_tr.shape}, 测试集大小: {X_val.shape}")

            # 6. Candidate models.
            models = {
                'OLS': LinearRegression(),
                'LASSO': Lasso(alpha=0.1, random_state=42, max_iter=5000),
                'Ridge': Ridge(alpha=1.0, random_state=42, max_iter=5000),
                'ElasticNet': ElasticNet(alpha=0.1, l1_ratio=0.5, random_state=42, max_iter=5000)
            }

            # 7. Train and evaluate.
            model_results, best_model = _evaluate_models(models, X_tr, X_val, y_tr, y_val)

            # 8. Metrics table.
            results_table = create_text_table(model_results)
            print(f"\n{task_name.upper()} 模型评估结果:")
            print(results_table)
            all_tables.append((task_name, results_table))

            # 9. Refit the winner on all prepared rows and predict.
            if best_model is None:
                best_model = LinearRegression()

            best_model.fit(X_train_std, y_train_std)

            # Undo the target standardization, then the log1p transform.
            y_pred_scaled = best_model.predict(X_test_std)
            y_pred_original = target_scaler.inverse_transform(y_pred_scaled.reshape(-1, 1)).flatten()
            y_pred_original = np.expm1(y_pred_original)
            y_pred_clipped = np.clip(y_pred_original, 0, None)

            predictions_df = pd.DataFrame({
                'ID': test_ids,
                'Price': y_pred_clipped
            })
            all_predictions.append(predictions_df)

            print(f"✅ {task_name.upper()} 处理完成! 最佳模型: {best_model.__class__.__name__}")

        except Exception as e:
            print(f"❌ 处理 {task_name} 时出错: {e}")
            continue

    if not all_predictions:
        return None

    # Write the combined submission.
    final_submission = pd.concat(all_predictions, ignore_index=True)
    final_submission = final_submission.sort_values('ID')
    final_submission.to_csv('improved_submission_with_text_table.csv', index=False)
    print(f"\n最终提交文件已保存，包含 {len(final_submission)} 条预测")

    # Recap of every task's metrics table.
    print("\n" + "="*80)
    print("所有任务模型评估汇总")
    print("="*80)

    for task_name, table in all_tables:
        print(f"\n{task_name.upper()} 结果:")
        print(table)
        print("-"*80)

    return final_submission

# Run the pipeline only when executed as a script; the value returned by
# main() (the submission DataFrame, or None) is kept in a module-level name.
if __name__ == "__main__":
    final_result = main()


处理 PRICE 数据
删除高缺失率列: ['别墅类型', '抵押信息', '户型介绍', '环线位置', '供暖']
数值型候选列: 25
分类变量候选列: 24
处理数值列: 城市
处理数值列: 区域
处理数值列: 板块
处理数值列: 建筑面积
列转换成功率: 100.00%
列转换成功率: 100.00%
处理数值列: 套内面积
列转换成功率: 34.64%
列转换成功率: 29.15%
处理数值列: 交易时间
列转换成功率: 100.00%
列转换成功率: 100.00%
处理数值列: 上次交易
列转换成功率: 75.50%
列转换成功率: 78.71%
处理数值列: 房屋年限
列转换成功率: 0.00%
列转换成功率: 0.00%
处理数值列: lon
处理数值列: lat
处理数值列: 年份
处理数值列: 区县
处理数值列: 板块_comm
处理数值列: 建筑年代
列转换成功率: 66.21%
列转换成功率: 72.35%
处理数值列: 房屋总数
列转换成功率: 93.13%
列转换成功率: 89.08%
处理数值列: 楼栋总数
列转换成功率: 93.13%
列转换成功率: 89.08%
处理数值列: 容 积 率
处理数值列: 物 业 费
列转换成功率: 70.00%
列转换成功率: 75.04%
处理数值列: 物业办公电话
列转换成功率: 27.33%
列转换成功率: 29.08%
处理数值列: 燃气费
列转换成功率: 68.52%
列转换成功率: 73.08%
处理数值列: 供热费
列转换成功率: 30.21%
列转换成功率: 32.30%
处理数值列: 停车位
处理数值列: 停车费用
列转换成功率: 62.80%
列转换成功率: 66.53%
处理数值列: coord_x
处理数值列: coord_y
预处理完成 - 特征数: 49
开始鲁棒特征工程...
开始高级分类变量编码...
编码列 环线 (唯一值数: 12)
编码列 房屋户型 (唯一值数: 389)
编码列 所在楼层 (唯一值数: 258)
编码列 房屋朝向 (唯一值数: 174)
编码列 建筑结构 (唯一值数: 8)
编码列 装修情况 (唯一值数: 5)
编码列 梯户比例 (唯一值数: 328)
编码列 配备电梯 (唯一值数: 3)
编码列 交易权属 (唯一值数: 15)
编码列 房