# 房价预测项目 - 完整流程
本 Notebook 提供了租房和售房数据的完整机器学习流程，包括：
- 数据预处理
- 特征工程
- 模型训练
- 预测生成

## 1. 导入必要的库

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.linear_model import LinearRegression, Lasso, Ridge, ElasticNet
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.preprocessing import StandardScaler, PolynomialFeatures, LabelEncoder
from sklearn.feature_selection import SelectFromModel
import warnings
import joblib
import os
from datetime import datetime
import re
# Suppress every warning category for the remainder of the session.
warnings.filterwarnings('ignore')

# Confirm the imports worked and show the wall-clock time the run started.
print(
    "库导入成功！",
    f"开始时间: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}",
    sep="\n",
)

库导入成功！
开始时间: 2025-10-29 23:36:11


## 2. 配置文件路径

In [2]:
# Single mapping of every input CSV path and output directory used below.
CONFIG = dict(
    # Training data
    rent_train_file='data/ruc_Class25Q2_train_rent.csv',
    price_train_file='data/ruc_Class25Q2_train_price.csv',
    # Test data
    rent_test_file='data/ruc_Class25Q2_test_rent.csv',
    price_test_file='data/ruc_Class25Q2_test_price.csv',
    # Output directories
    rent_train_dir='rent_train',
    price_train_dir='price_train',
    rent_test_dir='rent_test',
    price_test_dir='price_test',
    rent_models_dir='rent_models',
    price_models_dir='price_models',
    predictions_dir='predictions',
)

# Create each output directory up front; the `*_dir` entries are visited in
# declaration order and existing directories are left untouched.
for dir_path in (path for key, path in CONFIG.items() if key.endswith('_dir')):
    os.makedirs(dir_path, exist_ok=True)

print(
    "文件路径配置完成！",
    f"租房训练数据: {CONFIG['rent_train_file']}",
    f"售房训练数据: {CONFIG['price_train_file']}",
    sep="\n",
)

文件路径配置完成！
租房训练数据: data/ruc_Class25Q2_train_rent.csv
售房训练数据: data/ruc_Class25Q2_train_price.csv


## 3. 数据预处理函数

In [3]:
class DataPreprocessor:
    """Load a raw CSV and turn it into numeric train/hold-out splits.

    The cleaning methods (`load_data`, `clean_column_names`,
    `handle_missing_values`, `remove_leakage_features`) mutate ``self.data``
    and return ``self`` so they can be chained.
    """

    # Encodings attempted, in this order, when reading a CSV.
    _ENCODINGS = ('utf-8', 'gbk', 'latin-1')

    def __init__(self, data_type='rent'):
        # Label used only in log output (e.g. 'rent' or 'price').
        self.data_type = data_type
        # The working DataFrame; populated by `load_data`.
        self.data = None

    def load_data(self, file_path):
        """Read ``file_path`` into ``self.data`` and print a short summary.

        Encodings in ``_ENCODINGS`` are tried in order. Only decoding
        failures trigger the next attempt; any other error (missing file,
        malformed CSV, ...) propagates immediately instead of being hidden.

        Returns:
            self, for chaining.

        Raises:
            UnicodeDecodeError: if no encoding can decode the file.
        """
        print(f"\n{'='*60}")
        print(f"加载{self.data_type}数据: {file_path}")
        print(f"{'='*60}")

        last_error = None
        for encoding in self._ENCODINGS:
            try:
                self.data = pd.read_csv(file_path, encoding=encoding)
                break
            except UnicodeDecodeError as err:
                last_error = err
        else:
            raise last_error

        print(f"数据形状: {self.data.shape}")
        print(f"数据列数: {len(self.data.columns)}")

        if 'Price' in self.data.columns:
            print(f"Price统计: 均值={self.data['Price'].mean():.2f}, 中位数={self.data['Price'].median():.2f}")

        return self

    def clean_column_names(self):
        """Strip all whitespace characters from every column name.

        Returns:
            self, for chaining.
        """
        print("\n清理列名...")
        self.data.columns = [re.sub(r'\s+', '', col) for col in self.data.columns]
        print(f"列名清理完成，共{len(self.data.columns)}列")
        return self

    def handle_missing_values(self):
        """Fill missing values: median for numeric, mode for object columns.

        Columns are reassigned (``df[col] = df[col].fillna(...)``) rather
        than filled with ``inplace=True`` on a column selection, because the
        latter is chained assignment and does not update the frame when
        pandas Copy-on-Write is active.

        Returns:
            self, for chaining.
        """
        print("\n处理缺失值...")

        # Numeric columns: fill with the column median.
        numeric_cols = self.data.select_dtypes(include=[np.number]).columns
        for col in numeric_cols:
            if self.data[col].isnull().any():
                self.data[col] = self.data[col].fillna(self.data[col].median())

        # Object columns: fill with the most frequent value, or a
        # placeholder when the column has no non-null value at all.
        categorical_cols = self.data.select_dtypes(include=['object']).columns
        for col in categorical_cols:
            if self.data[col].isnull().any():
                modes = self.data[col].mode()
                fill_value = modes.iloc[0] if not modes.empty else '未知'
                self.data[col] = self.data[col].fillna(fill_value)

        print(f"缺失值处理完成")
        return self

    def remove_leakage_features(self):
        """Drop the columns listed as leakage features, when present.

        Returns:
            self, for chaining.
        """
        print("\n移除数据泄露特征...")

        leakage_features = ['物业费', '停车费用', '燃气费', '供热费', '客户反馈',
                           '年份', '坐标X', '坐标Y']

        actual_leakage = [col for col in leakage_features if col in self.data.columns]
        if actual_leakage:
            self.data = self.data.drop(columns=actual_leakage)
            print(f"移除了{len(actual_leakage)}个泄露特征")

        return self

    def split_data(self, test_size=0.2, random_state=111):
        """Split ``self.data`` into shuffled train/hold-out parts.

        The target is the ``Price`` column; only numeric columns are kept as
        features.

        Args:
            test_size: fraction of rows placed in the hold-out part.
            random_state: seed passed to ``train_test_split``.

        Returns:
            ``(X_train, X_test, y_train, y_test)``, or four ``None`` values
            when the data has no ``Price`` column.
        """
        print(f"\n划分数据集 (test_size={test_size}, random_state={random_state})...")

        if 'Price' not in self.data.columns:
            print("错误: 数据中没有Price列")
            return None, None, None, None

        y = self.data['Price']
        X = self.data.drop(columns=['Price'])

        # Keep numeric features only.
        numeric_cols = X.select_dtypes(include=[np.number]).columns
        X = X[numeric_cols]

        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=test_size, random_state=random_state, shuffle=True
        )

        print(f"训练集: {X_train.shape}, 测试集: {X_test.shape}")
        print(f"特征数量: {X_train.shape[1]}")

        return X_train, X_test, y_train, y_test

    def save_processed_data(self, X_train, X_test, y_train, y_test, output_dir):
        """Write the four splits to ``processed_*.csv`` files in ``output_dir``.

        Targets are written through ``np.asarray`` so the ``Price`` column
        holds the values whatever the Series is named (or if an array is
        passed); building the frame with ``columns=['Price']`` from a Series
        with a different name would produce an all-NaN column.

        Returns:
            The four inputs, unchanged.
        """
        print(f"\n保存预处理数据到: {output_dir}")

        os.makedirs(output_dir, exist_ok=True)

        X_train.to_csv(f'{output_dir}/processed_X_train.csv', index=False, encoding='utf-8-sig')
        X_test.to_csv(f'{output_dir}/processed_X_test.csv', index=False, encoding='utf-8-sig')
        pd.DataFrame({'Price': np.asarray(y_train)}).to_csv(
            f'{output_dir}/processed_y_train.csv', index=False, encoding='utf-8-sig')
        pd.DataFrame({'Price': np.asarray(y_test)}).to_csv(
            f'{output_dir}/processed_y_test.csv', index=False, encoding='utf-8-sig')

        print("数据保存完成！")

        return X_train, X_test, y_train, y_test

print("数据预处理类定义完成！")

数据预处理类定义完成！


## 4. 特征工程函数

In [4]:
class FeatureEngineer:
    """Feature-engineering steps applied consistently to train and hold-out data.

    The constructor copies its inputs, so callers' frames are not modified.
    Every transformation method returns ``self`` so calls can be chained.
    Statistics (clip bounds, scaler parameters, Lasso selection) are always
    computed on the training part and then applied to the hold-out part.
    """

    def __init__(self, X_train, X_test, y_train, y_test):
        self.X_train = X_train.copy()
        self.X_test = X_test.copy()
        self.y_train = y_train.copy()
        self.y_test = y_test.copy()
        # Set by `standardize_features`; None until then.
        self.scaler = None

    def handle_outliers(self):
        """Clip each numeric column to ``[Q1 - 1.5*IQR, Q3 + 1.5*IQR]``.

        Quartiles come from the training data; the same bounds are applied
        to the hold-out data. Columns whose IQR is not positive are left
        untouched. A column that cannot be processed is reported and
        skipped instead of being silently ignored.

        Returns:
            self, for chaining.
        """
        print("\n处理异常值...")

        numeric_cols = self.X_train.select_dtypes(include=[np.number]).columns

        for col in numeric_cols:
            try:
                q1 = self.X_train[col].quantile(0.25)
                q3 = self.X_train[col].quantile(0.75)
                iqr = q3 - q1

                if iqr > 0:
                    lower_bound = q1 - 1.5 * iqr
                    upper_bound = q3 + 1.5 * iqr

                    self.X_train[col] = self.X_train[col].clip(lower=lower_bound, upper=upper_bound)
                    self.X_test[col] = self.X_test[col].clip(lower=lower_bound, upper=upper_bound)
            except (TypeError, ValueError, KeyError) as err:
                print(f"跳过列 {col}: {type(err).__name__}: {err}")
                continue

        print("异常值处理完成")
        return self

    def create_interaction_features(self):
        """Add pairwise products of the first five numeric columns.

        "First five" is purely positional (column order); no importance
        ranking is computed. New columns are named ``{col1}_x_{col2}``.
        Nothing is added when fewer than five numeric columns exist.

        Returns:
            self, for chaining.
        """
        print("\n创建交互特征...")

        numeric_cols = self.X_train.select_dtypes(include=[np.number]).columns

        if len(numeric_cols) >= 5:
            base_cols = numeric_cols[:5]
            created = 0

            for i, col1 in enumerate(base_cols):
                for col2 in base_cols[i+1:]:
                    interaction_name = f'{col1}_x_{col2}'
                    self.X_train[interaction_name] = self.X_train[col1] * self.X_train[col2]
                    self.X_test[interaction_name] = self.X_test[col1] * self.X_test[col2]
                    created += 1

            print(f"创建了{created}个交互特征")

        return self

    def standardize_features(self):
        """Standardize numeric columns with a ``StandardScaler``.

        The scaler is fit on the training data, applied to both parts, and
        kept in ``self.scaler``.

        Returns:
            self, for chaining.
        """
        print("\n标准化特征...")

        numeric_cols = self.X_train.select_dtypes(include=[np.number]).columns

        if len(numeric_cols) > 0:
            self.scaler = StandardScaler()
            self.X_train[numeric_cols] = self.scaler.fit_transform(self.X_train[numeric_cols])
            self.X_test[numeric_cols] = self.scaler.transform(self.X_test[numeric_cols])

            print(f"标准化了{len(numeric_cols)}个特征")

        return self

    def select_features(self, alpha=0.01):
        """Keep the columns selected by ``SelectFromModel`` on a fitted Lasso.

        This step is best-effort: if fitting fails, or nothing is selected,
        all features are kept and the reason is printed. The selection mask
        is computed first and then applied to both parts together, so a
        failure can never leave train and hold-out with different columns.

        Args:
            alpha: Lasso regularization strength.

        Returns:
            self, for chaining.
        """
        print(f"\n特征选择 (alpha={alpha})...")

        try:
            lasso = Lasso(alpha=alpha, random_state=111)
            lasso.fit(self.X_train, self.y_train)
            mask = SelectFromModel(lasso, prefit=True).get_support()
        except Exception as err:  # deliberate best-effort; report the cause
            print(f"特征选择失败，保留所有特征 ({type(err).__name__}: {err})")
            return self

        if not mask.any():
            # An empty design matrix would break every later step.
            print("特征选择未选中任何特征，保留所有特征")
            return self

        self.X_train = self.X_train.loc[:, mask].copy()
        self.X_test = self.X_test.loc[:, mask].copy()

        print(f"选择了{self.X_train.shape[1]}个特征")
        return self

    def get_processed_data(self):
        """Return ``(X_train, X_test, y_train, y_test)`` in their current state."""
        return self.X_train, self.X_test, self.y_train, self.y_test

    def save_feature_engineered_data(self, output_dir, prefix='advanced_feature_engineered'):
        """Write the engineered splits, the scaler and the column list.

        Files written to ``output_dir``: ``{prefix}_X_train.csv``,
        ``{prefix}_X_test.csv``, ``{prefix}_y_train.csv``,
        ``{prefix}_y_test.csv``, ``feature_scaler.pkl`` (only when a scaler
        was fit) and ``feature_columns.csv``.

        NOTE(review): the scaler is fit before `select_features`, so it may
        cover more columns than ``feature_columns.csv`` lists.
        """
        print(f"\n保存特征工程数据到: {output_dir}")

        os.makedirs(output_dir, exist_ok=True)

        self.X_train.to_csv(f'{output_dir}/{prefix}_X_train.csv', index=False, encoding='utf-8-sig')
        self.X_test.to_csv(f'{output_dir}/{prefix}_X_test.csv', index=False, encoding='utf-8-sig')
        # np.asarray keeps the values whatever the target Series is named.
        pd.DataFrame({'Price': np.asarray(self.y_train)}).to_csv(
            f'{output_dir}/{prefix}_y_train.csv', index=False, encoding='utf-8-sig')
        pd.DataFrame({'Price': np.asarray(self.y_test)}).to_csv(
            f'{output_dir}/{prefix}_y_test.csv', index=False, encoding='utf-8-sig')

        # Persist the scaler.
        if self.scaler is not None:
            joblib.dump(self.scaler, f'{output_dir}/feature_scaler.pkl')

        # Persist the final feature column names.
        pd.DataFrame({'columns': self.X_train.columns}).to_csv(
            f'{output_dir}/feature_columns.csv', index=False, encoding='utf-8-sig'
        )

        print("特征工程数据保存完成！")

print("特征工程类定义完成！")

特征工程类定义完成！


## 5. 模型训练函数

In [5]:
class ModelTrainer:
    """Fit OLS, Lasso and Ridge regressors and keep their evaluation results.

    Fitted models are stored in ``self.models`` and their metrics in
    ``self.results``, both keyed by 'OLS', 'LASSO' and 'Ridge'. The
    ``train_*`` methods and `select_best_model` return ``self`` so calls can
    be chained.
    """

    def __init__(self, X_train, X_test, y_train, y_test, data_type='rent'):
        # Remaining missing values in features AND targets are replaced by 0.
        self.X_train = X_train.fillna(0)
        self.X_test = X_test.fillna(0)
        self.y_train = y_train.fillna(0)
        self.y_test = y_test.fillna(0)
        self.data_type = data_type
        self.models = {}
        self.results = {}
        self.best_model = None
        self.best_model_name = None

    def calculate_metrics(self, y_true, y_pred):
        """Return ``{'mae', 'rmse', 'r2'}`` for the given predictions."""
        mae = mean_absolute_error(y_true, y_pred)
        rmse = np.sqrt(mean_squared_error(y_true, y_pred))
        r2 = r2_score(y_true, y_pred)
        return {'mae': mae, 'rmse': rmse, 'r2': r2}

    def _record(self, name, model, cv_mae, best_params=None):
        """Score a fitted model on both parts and store it under ``name``.

        Returns the ``(train_metrics, test_metrics)`` dictionaries.
        """
        train_metrics = self.calculate_metrics(self.y_train, model.predict(self.X_train))
        test_metrics = self.calculate_metrics(self.y_test, model.predict(self.X_test))

        entry = {
            'train_mae': train_metrics['mae'],
            'test_mae': test_metrics['mae'],
            'cv_mae': cv_mae,
            'train_r2': train_metrics['r2'],
            'test_r2': test_metrics['r2'],
        }
        if best_params is not None:
            entry['best_params'] = best_params

        self.models[name] = model
        self.results[name] = entry
        return train_metrics, test_metrics

    def _tune_alpha(self, estimator, alpha_range):
        """Run a 6-fold grid search over ``alpha`` scored by negative MAE."""
        search = GridSearchCV(estimator, {'alpha': list(alpha_range)}, cv=6,
                              scoring='neg_mean_absolute_error', n_jobs=-1)
        search.fit(self.X_train, self.y_train)
        return search

    def _train_regularized(self, name, label, estimator, alpha_range):
        """Tune ``alpha`` for ``estimator``, then record and report the winner."""
        search = self._tune_alpha(estimator, alpha_range)

        # The search already cross-validated the winning alpha with the same
        # 6 folds and scorer, so its best score is the CV MAE; running
        # cross_val_score again would only refit the model six more times.
        cv_mae = -search.best_score_

        train_metrics, test_metrics = self._record(
            name, search.best_estimator_, cv_mae, best_params=search.best_params_
        )

        print(f"{label} (alpha={search.best_params_['alpha']}) - Train MAE: {train_metrics['mae']:.2f}, Test MAE: {test_metrics['mae']:.2f}, CV MAE: {cv_mae:.2f}")
        return self

    def train_ols(self):
        """Fit ordinary least squares and record train/test/6-fold-CV MAE.

        Returns:
            self, for chaining.
        """
        print("\n训练OLS模型...")

        model = LinearRegression()
        model.fit(self.X_train, self.y_train)

        # Cross-validation on the training part only.
        cv_scores = cross_val_score(model, self.X_train, self.y_train,
                                    cv=6, scoring='neg_mean_absolute_error')
        cv_mae = -cv_scores.mean()

        train_metrics, test_metrics = self._record('OLS', model, cv_mae)

        print(f"OLS - Train MAE: {train_metrics['mae']:.2f}, Test MAE: {test_metrics['mae']:.2f}, CV MAE: {cv_mae:.2f}")
        return self

    def train_lasso(self, alpha_range=(0.001, 0.01, 0.1, 1, 10, 100)):
        """Grid-search Lasso's ``alpha`` and record the best estimator.

        Args:
            alpha_range: candidate ``alpha`` values (any iterable).

        Returns:
            self, for chaining.
        """
        print("\n训练Lasso模型...")
        return self._train_regularized(
            'LASSO', 'LASSO', Lasso(random_state=111, max_iter=10000), alpha_range
        )

    def train_ridge(self, alpha_range=(0.001, 0.01, 0.1, 1, 10, 100, 1000)):
        """Grid-search Ridge's ``alpha`` and record the best estimator.

        Args:
            alpha_range: candidate ``alpha`` values (any iterable).

        Returns:
            self, for chaining.
        """
        print("\n训练Ridge模型...")
        return self._train_regularized(
            'Ridge', 'Ridge', Ridge(random_state=111), alpha_range
        )

    def select_best_model(self, metric='test_mae'):
        """Pick the trained model with the lowest value of ``metric``.

        Args:
            metric: key of ``self.results[...]`` to minimise. The default,
                'test_mae', keeps the original behavior. Passing 'cv_mae'
                selects on cross-validation instead, which leaves the
                hold-out score untouched by model selection.

        Returns:
            self, for chaining.

        Raises:
            ValueError: if no model has been trained yet.
            KeyError: if ``metric`` is not a recorded result key.
        """
        print("\n选择最佳模型...")

        if not self.results:
            raise ValueError("no trained models to choose from; call a train_* method first")

        best_model_name = min(self.results, key=lambda k: self.results[k][metric])
        self.best_model = self.models[best_model_name]
        self.best_model_name = best_model_name

        print(f"最佳模型: {best_model_name}")
        print(f"Test MAE: {self.results[best_model_name]['test_mae']:.2f}")
        if metric != 'test_mae':
            print(f"{metric}: {self.results[best_model_name][metric]:.2f}")

        return self

    def save_models(self, output_dir):
        """Dump every fitted model, the best model and a performance report.

        Writes ``{name.lower()}_model.pkl`` per model, ``best_model.pkl``
        (only when a best model has been selected) and
        ``performance_report.csv`` into ``output_dir``.
        """
        print(f"\n保存模型到: {output_dir}")

        os.makedirs(output_dir, exist_ok=True)

        for model_name, model in self.models.items():
            filename = f"{output_dir}/{model_name.lower()}_model.pkl"
            joblib.dump(model, filename)

        # Persist the best model; skip it rather than pickling None when
        # `select_best_model` has not been called.
        if self.best_model is not None:
            joblib.dump(self.best_model, f'{output_dir}/best_model.pkl')

        # Build the performance report in a fixed model order.
        report_data = [
            {
                'Model': model_name,
                'Train MAE': f"{self.results[model_name]['train_mae']:.2f}",
                'Test MAE': f"{self.results[model_name]['test_mae']:.2f}",
                'CV MAE': f"{self.results[model_name]['cv_mae']:.2f}",
                'Train R²': f"{self.results[model_name]['train_r2']:.4f}",
                'Test R²': f"{self.results[model_name]['test_r2']:.4f}",
            }
            for model_name in ('OLS', 'LASSO', 'Ridge')
            if model_name in self.results
        ]

        report_df = pd.DataFrame(report_data)
        report_df.to_csv(f'{output_dir}/performance_report.csv', index=False, encoding='utf-8-sig')

        print("模型保存完成！")
        print("\n性能报告:")
        print(report_df.to_string(index=False))

print("模型训练类定义完成！")

模型训练类定义完成！


## 6. 预测生成函数

In [6]:
class PredictionGenerator:
    """Apply the saved models to a CSV and write prediction files.

    `generate_predictions` stores its result in ``self.rent_predictions`` or
    ``self.price_predictions``; `save_predictions` writes whatever has been
    stored so far.
    """

    # Encodings attempted, in this order, when reading a CSV.
    _ENCODINGS = ('utf-8', 'gbk', 'latin-1')
    # Separator used in interaction feature names of the form 'left_x_right'.
    _INTERACTION_SEP = '_x_'

    def __init__(self):
        # Each becomes {'predictions': {label: array}, 'ids': ids} once
        # `generate_predictions` has run for that data type.
        self.rent_predictions = {}
        self.price_predictions = {}

    @classmethod
    def _read_csv(cls, path):
        """Read a CSV, falling back to the next encoding only on decode errors."""
        last_error = None
        for encoding in cls._ENCODINGS:
            try:
                return pd.read_csv(path, encoding=encoding)
            except UnicodeDecodeError as err:
                last_error = err
        raise last_error

    @classmethod
    def _rebuild_interaction(cls, frame, name):
        """Return the product column for a 'left_x_right' name, else 0.

        Every occurrence of the separator is tried as the split point, so
        base column names that themselves contain it still resolve.
        """
        if not isinstance(name, str):
            return 0
        sep = cls._INTERACTION_SEP
        cut = name.find(sep)
        while cut != -1:
            left, right = name[:cut], name[cut + len(sep):]
            if left in frame.columns and right in frame.columns:
                return frame[left] * frame[right]
            cut = name.find(sep, cut + 1)
        return 0

    @classmethod
    def _ensure_columns(cls, frame, columns):
        """Return ``frame`` restricted to ``columns``, creating missing ones.

        A missing interaction column is recomputed from its two base
        columns; anything else that is missing is filled with 0.
        """
        for col in columns:
            if col not in frame.columns:
                frame[col] = cls._rebuild_interaction(frame, col)
        return frame[list(columns)]

    @staticmethod
    def _predict_non_negative(model, features):
        """Predict and floor the result at 0."""
        return np.maximum(model.predict(features), 0)

    def generate_predictions(self, test_file, model_dir, scaler_file, feature_columns_file, data_type='rent'):
        """Build the model input from ``test_file`` and run every saved model.

        Steps:
          1. Read the CSV and strip whitespace from column names, matching
             the cleaning applied to the training columns.
          2. Split off the ID column ('ID', 'id' or 'Id'); a RangeIndex is
             used when none exists.
          3. Align columns with the training features. Interaction columns
             named ``left_x_right`` are recomputed from their base columns
             rather than zero-filled.
          4. Fill missing values with 0 and apply the saved scaler. When the
             scaler records the columns it was fit on
             (``feature_names_in_``), it is applied to exactly those columns
             and the listed features are selected afterwards; this keeps
             working when feature selection dropped columns after scaling.
          5. Predict with ``ols``/``lasso``/``ridge`` and ``best_model``
             pickles found in ``model_dir``; predictions are floored at 0.

        NOTE(review): the outlier clipping bounds used during training are
        not persisted anywhere visible here, so they cannot be re-applied.

        Args:
            test_file: path of the CSV to predict on.
            model_dir: directory holding the ``*_model.pkl`` files.
            scaler_file: path of the pickled scaler (optional on disk).
            feature_columns_file: CSV with a ``columns`` column listing the
                training features (optional on disk).
            data_type: 'rent' stores into ``rent_predictions``; anything
                else stores into ``price_predictions``.

        Returns:
            ``(predictions, ids)`` where ``predictions`` maps a model label
            to its prediction array.
        """
        print(f"\n{'='*60}")
        print(f"生成{data_type}预测")
        print(f"{'='*60}")

        # Load the data to predict on.
        test_data = self._read_csv(test_file)
        test_data.columns = [re.sub(r'\s+', '', col) for col in test_data.columns]

        print(f"测试数据形状: {test_data.shape}")

        # Split off the identifier column.
        id_col = next((col for col in ('ID', 'id', 'Id') if col in test_data.columns), None)
        if id_col is not None:
            ids = test_data[id_col].copy()
            test_data = test_data.drop(columns=[id_col])
        else:
            ids = pd.RangeIndex(start=0, stop=len(test_data))

        # Load the training feature list, when available.
        feature_columns = None
        if os.path.exists(feature_columns_file):
            feature_columns = pd.read_csv(feature_columns_file)['columns'].tolist()

        # SECURITY: joblib.load unpickles arbitrary objects; only point this
        # at scaler/model files produced by this pipeline.
        scaler = joblib.load(scaler_file) if os.path.exists(scaler_file) else None
        scaler_columns = getattr(scaler, 'feature_names_in_', None)

        if scaler_columns is not None:
            # Scale in the scaler's own column space, then pick the features.
            scaler_columns = list(scaler_columns)
            scaler_input = self._ensure_columns(test_data, scaler_columns).fillna(0)
            test_data = pd.DataFrame(scaler.transform(scaler_input), columns=scaler_columns)
            if feature_columns is not None:
                test_data = test_data.reindex(columns=feature_columns, fill_value=0)
        else:
            if feature_columns is not None:
                test_data = self._ensure_columns(test_data, feature_columns)
            test_data = test_data.fillna(0)
            if scaler is not None:
                test_data = pd.DataFrame(
                    scaler.transform(test_data),
                    columns=test_data.columns
                )

        # Run each individual model that exists on disk.
        predictions = {}

        for model_name in ('ols', 'lasso', 'ridge'):
            model_file = f"{model_dir}/{model_name}_model.pkl"
            if not os.path.exists(model_file):
                continue

            print(f"\n加载{model_name.upper()}模型...")
            preds = self._predict_non_negative(joblib.load(model_file), test_data)

            predictions[model_name.upper()] = preds
            print(f"{model_name.upper()} - 预测范围: {preds.min():.2f} - {preds.max():.2f}")

        # Run the selected best model, when present.
        best_model_file = f"{model_dir}/best_model.pkl"
        if os.path.exists(best_model_file):
            predictions['Best_Model'] = self._predict_non_negative(
                joblib.load(best_model_file), test_data
            )

        bundle = {'predictions': predictions, 'ids': ids}
        if data_type == 'rent':
            self.rent_predictions = bundle
        else:
            self.price_predictions = bundle

        return predictions, ids

    @staticmethod
    def _write_per_model(bundle, target_dir, stem):
        """Write one ``{stem}_prediction_{model}.csv`` per stored model."""
        for model_name, preds in bundle['predictions'].items():
            pred_df = pd.DataFrame({'ID': bundle['ids'], 'Price': preds})
            pred_df.to_csv(f"{target_dir}/{stem}_prediction_{model_name}.csv",
                           index=False, encoding='utf-8-sig')

    def save_predictions(self, output_dir):
        """Write per-model CSVs and, when possible, the merged submission.

        Creates ``rent``, ``price`` and ``merged`` sub-directories under
        ``output_dir``. The merged file (rent rows followed by price rows,
        from the 'Best_Model' predictions) is written to both
        ``merged/merged_prediction.csv`` and ``kaggle_submission.csv``, and
        only when both data types have a 'Best_Model' prediction.
        """
        print(f"\n{'='*60}")
        print(f"保存预测文件")
        print(f"{'='*60}")

        os.makedirs(output_dir, exist_ok=True)
        os.makedirs(f"{output_dir}/rent", exist_ok=True)
        os.makedirs(f"{output_dir}/price", exist_ok=True)
        os.makedirs(f"{output_dir}/merged", exist_ok=True)

        # Rent predictions.
        if self.rent_predictions:
            self._write_per_model(self.rent_predictions, f"{output_dir}/rent", 'rent')
            print("租房预测文件保存完成")

        # Sale-price predictions.
        if self.price_predictions:
            self._write_per_model(self.price_predictions, f"{output_dir}/price", 'price')
            print("售房预测文件保存完成")

        # Merged output needs a Best_Model prediction for both data types.
        if not (self.rent_predictions and self.price_predictions):
            return
        rent_preds = self.rent_predictions['predictions']
        price_preds = self.price_predictions['predictions']
        if 'Best_Model' not in rent_preds or 'Best_Model' not in price_preds:
            return

        rent_df = pd.DataFrame({
            'ID': self.rent_predictions['ids'],
            'Price': rent_preds['Best_Model']
        })
        price_df = pd.DataFrame({
            'ID': self.price_predictions['ids'],
            'Price': price_preds['Best_Model']
        })

        merged_df = pd.concat([rent_df, price_df], ignore_index=True)
        merged_df.to_csv(f"{output_dir}/merged/merged_prediction.csv",
                         index=False, encoding='utf-8-sig')

        # Kaggle submission file (same content).
        merged_df.to_csv(f"{output_dir}/kaggle_submission.csv",
                         index=False, encoding='utf-8-sig')

        print("合并预测文件保存完成")
        print(f"总记录数: {len(merged_df)}")
        print(f"租房: {len(rent_df)}, 售房: {len(price_df)}")

print("预测生成类定义完成！")

预测生成类定义完成！


## 7. 执行完整流程

### 7.1 租房数据处理

In [7]:
# Banner for the rent pipeline.
print("\n" + "="*80, "开始处理租房数据", "="*80, sep="\n")

# 1. Preprocessing: load, clean, impute, drop leakage columns.
rent_preprocessor = DataPreprocessor(data_type='rent')
(
    rent_preprocessor
    .load_data(CONFIG['rent_train_file'])
    .clean_column_names()
    .handle_missing_values()
    .remove_leakage_features()
)

X_train_rent, X_test_rent, y_train_rent, y_test_rent = rent_preprocessor.split_data()
rent_preprocessor.save_processed_data(
    X_train_rent, X_test_rent, y_train_rent, y_test_rent,
    output_dir=CONFIG['rent_train_dir'],
)

# 2. Feature engineering: clip outliers, add interactions, scale, select.
rent_fe = FeatureEngineer(X_train_rent, X_test_rent, y_train_rent, y_test_rent)
(
    rent_fe
    .handle_outliers()
    .create_interaction_features()
    .standardize_features()
    .select_features(alpha=0.01)
)

X_train_rent_fe, X_test_rent_fe, y_train_rent_fe, y_test_rent_fe = rent_fe.get_processed_data()
rent_fe.save_feature_engineered_data(CONFIG['rent_train_dir'])

# 3. Model training: fit the three regressors, pick the best, persist all.
rent_trainer = ModelTrainer(
    X_train_rent_fe, X_test_rent_fe, y_train_rent_fe, y_test_rent_fe,
    data_type='rent',
)
(
    rent_trainer
    .train_ols()
    .train_lasso()
    .train_ridge()
    .select_best_model()
    .save_models(CONFIG['rent_models_dir'])
)

print("\n租房数据处理完成！")


开始处理租房数据

加载rent数据: data/ruc_Class25Q2_train_rent.csv
数据形状: (98899, 46)
数据列数: 46
Price统计: 均值=582908.98, 中位数=394936.89

清理列名...
列名清理完成，共46列

处理缺失值...
缺失值处理完成

移除数据泄露特征...
移除了6个泄露特征

划分数据集 (test_size=0.2, random_state=111)...
训练集: (79119, 9), 测试集: (19780, 9)
特征数量: 9

保存预处理数据到: rent_train
数据保存完成！

处理异常值...
异常值处理完成

创建交互特征...
创建了10个交互特征

标准化特征...
标准化了19个特征

特征选择 (alpha=0.01)...
选择了19个特征

保存特征工程数据到: rent_train
特征工程数据保存完成！

训练OLS模型...
OLS - Train MAE: 322906.73, Test MAE: 321405.31, CV MAE: 322960.86

训练Lasso模型...


  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = c

LASSO (alpha=100) - Train MAE: 322709.51, Test MAE: 321471.02, CV MAE: 322765.53

训练Ridge模型...
Ridge (alpha=100) - Train MAE: 321304.60, Test MAE: 320145.73, CV MAE: 321424.01

选择最佳模型...
最佳模型: Ridge
Test MAE: 320145.73

保存模型到: rent_models
模型保存完成！

性能报告:
Model Train MAE  Test MAE    CV MAE Train R² Test R²
  OLS 322906.73 321405.31 322960.86   0.1142  0.1182
LASSO 322709.51 321471.02 322765.53   0.1133  0.1168
Ridge 321304.60 320145.73 321424.01   0.1096  0.1128

租房数据处理完成！


### 7.2 售房数据处理

In [8]:
# Banner for the sale-price pipeline.
print("\n" + "="*80, "开始处理售房数据", "="*80, sep="\n")

# 1. Preprocessing: load, clean, impute, drop leakage columns.
price_preprocessor = DataPreprocessor(data_type='price')
(
    price_preprocessor
    .load_data(CONFIG['price_train_file'])
    .clean_column_names()
    .handle_missing_values()
    .remove_leakage_features()
)

X_train_price, X_test_price, y_train_price, y_test_price = price_preprocessor.split_data()
price_preprocessor.save_processed_data(
    X_train_price, X_test_price, y_train_price, y_test_price,
    output_dir=CONFIG['price_train_dir'],
)

# 2. Feature engineering: clip outliers, add interactions, scale, select.
price_fe = FeatureEngineer(X_train_price, X_test_price, y_train_price, y_test_price)
(
    price_fe
    .handle_outliers()
    .create_interaction_features()
    .standardize_features()
    .select_features(alpha=0.01)
)

X_train_price_fe, X_test_price_fe, y_train_price_fe, y_test_price_fe = price_fe.get_processed_data()
price_fe.save_feature_engineered_data(CONFIG['price_train_dir'])

# 3. Model training: fit the three regressors, pick the best, persist all.
price_trainer = ModelTrainer(
    X_train_price_fe, X_test_price_fe, y_train_price_fe, y_test_price_fe,
    data_type='price',
)
(
    price_trainer
    .train_ols()
    .train_lasso()
    .train_ridge()
    .select_best_model()
    .save_models(CONFIG['price_models_dir'])
)

print("\n售房数据处理完成！")


开始处理售房数据

加载price数据: data/ruc_Class25Q2_train_price.csv
数据形状: (103871, 55)
数据列数: 55
Price统计: 均值=2262366.07, 中位数=1479407.11

清理列名...
列名清理完成，共55列

处理缺失值...
缺失值处理完成

移除数据泄露特征...
移除了6个泄露特征

划分数据集 (test_size=0.2, random_state=111)...
训练集: (83096, 12), 测试集: (20775, 12)
特征数量: 12

保存预处理数据到: price_train
数据保存完成！

处理异常值...
异常值处理完成

创建交互特征...
创建了10个交互特征

标准化特征...
标准化了22个特征

特征选择 (alpha=0.01)...
特征选择失败，保留所有特征

保存特征工程数据到: price_train
特征工程数据保存完成！

训练OLS模型...
OLS - Train MAE: 1386950.85, Test MAE: 1358050.50, CV MAE: 1387223.07

训练Lasso模型...


  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = c

LASSO (alpha=0.001) - Train MAE: 1386950.79, Test MAE: 1358050.40, CV MAE: 1387223.02

训练Ridge模型...
Ridge (alpha=1) - Train MAE: 1386897.26, Test MAE: 1357967.29, CV MAE: 1387163.94

选择最佳模型...
最佳模型: Ridge
Test MAE: 1357967.29

保存模型到: price_models
模型保存完成！

性能报告:
Model  Train MAE   Test MAE     CV MAE Train R² Test R²
  OLS 1386950.85 1358050.50 1387223.07   0.1035  0.1036
LASSO 1386950.79 1358050.40 1387223.02   0.1035  0.1036
Ridge 1386897.26 1357967.29 1387163.94   0.1035  0.1036

售房数据处理完成！


### 7.3 生成预测文件

In [13]:
# Banner announcing the prediction-file stage.
print("\n" + "="*80, "生成预测文件", "="*80, sep="\n")

class PredictionGenerator:
    """Run saved models on test CSV files and export the predictions.

    Call ``generate_predictions`` once per data type, then
    ``save_predictions`` to write one CSV per model and, when both 'rent'
    and 'price' results are present, a combined submission file.
    """

    def __init__(self):
        # data_type -> {'df': untouched test frame that has an 'id' column,
        #               'predictions': {MODEL_NAME: 1-D array of predictions}}
        self.predictions = {}

    def generate_predictions(self, test_file, model_dir, scaler_file, feature_columns_file, data_type):
        """Predict on one test file with every model found in ``model_dir``.

        Args:
            test_file: Path of the test CSV.
            model_dir: Directory searched for ``<name>_model.pkl`` files.
            scaler_file: Path of the pickled scaler applied before predicting.
            feature_columns_file: CSV whose 'feature' column (or, failing
                that, first column) lists the feature names in model order.
            data_type: Key under which results are stored in
                ``self.predictions`` (``save_predictions`` looks for 'rent'
                and 'price').

        Raises:
            ValueError: If the scaled matrix still contains non-finite values
                after sanitising.
        """
        print(f"\n处理{data_type}数据集...")

        test_df = pd.read_csv(test_file)

        print(f"测试数据的列名: {test_df.columns.tolist()[:10]}...")  # first 10 columns only

        # Guarantee a lowercase 'id' column: reuse 'ID' / 'Id' when present,
        # otherwise fall back to the row position.
        if 'id' not in test_df.columns:
            source_col = next((c for c in ('ID', 'Id') if c in test_df.columns), None)
            if source_col is not None:
                test_df['id'] = test_df[source_col]
            else:
                test_df['id'] = range(len(test_df))
                print(f"⚠️ 未找到 ID 列，已自动生成: 0 到 {len(test_df)-1}")

        print(f"✓ ID 列已确认，共 {len(test_df)} 条记录")

        # Snapshot taken before feature engineering; used later for the ids.
        original_test_df = test_df.copy()

        test_df = self.feature_engineering(test_df)

        # Feature list: prefer a column named 'feature', else the first column.
        feature_df = pd.read_csv(feature_columns_file)
        print(f"特征列文件的列名: {feature_df.columns.tolist()}")
        if 'feature' in feature_df.columns:
            feature_columns = feature_df['feature'].tolist()
        else:
            feature_columns = feature_df.iloc[:, 0].tolist()

        print(f"加载了 {len(feature_columns)} 个特征")

        # Features absent from the test frame are zero-filled so the scaler
        # and models receive the expected columns. Report them instead of
        # hiding the mismatch: a long list here means the test features do
        # not line up with what the models were trained on.
        missing_features = [col for col in feature_columns if col not in test_df.columns]
        if missing_features:
            print(f"⚠️ 测试数据缺少 {len(missing_features)} 个特征，已用 0 填充: {missing_features}")
            for col in missing_features:
                test_df[col] = 0

        test_data = test_df[feature_columns].copy()

        # First clean-up pass (before scaling): median imputation per column,
        # with 0 as the fallback when a column is entirely NaN.
        print(f"缩放前 - 处理缺失值前: {test_data.isnull().sum().sum()} 个 NaN")

        for col in test_data.columns:
            if test_data[col].isnull().any():
                median_val = test_data[col].median()
                if pd.isna(median_val):
                    median_val = 0
                test_data[col] = test_data[col].fillna(median_val)

        test_data = test_data.replace([np.inf, -np.inf], 0)

        print(f"缩放前 - 处理缺失值后: {test_data.isnull().sum().sum()} 个 NaN")

        # NOTE(review): joblib.load unpickles arbitrary objects; only load
        # scaler/model files produced by this project.
        scaler = joblib.load(scaler_file)
        test_data_scaled = np.asarray(scaler.transform(test_data), dtype=float)

        # Second clean-up pass (after scaling): the scaler itself can emit
        # NaN or inf, so sanitise on ANY non-finite value, not only on NaN.
        nan_count = int(np.isnan(test_data_scaled).sum())
        inf_count = int(np.isinf(test_data_scaled).sum())
        if nan_count or inf_count:
            if nan_count:
                print(f"⚠️ 缩放后发现 NaN: {nan_count} 个")
            if inf_count:
                print(f"⚠️ 缩放后发现 inf: {inf_count} 个")
            test_data_scaled = np.nan_to_num(test_data_scaled, nan=0.0, posinf=0.0, neginf=0.0)
            print(f"缩放后 - 处理后的 NaN: {np.isnan(test_data_scaled).sum()} 个")

        # Explicit check rather than `assert`, which is removed under -O.
        if not np.isfinite(test_data_scaled).all():
            raise ValueError("数据中仍有 NaN 或 inf!")
        print(f"✓ 数据验证通过，shape: {test_data_scaled.shape}")

        predictions = {}
        model_files = {
            'ridge': f'{model_dir}/ridge_model.pkl',
            'lasso': f'{model_dir}/lasso_model.pkl',
            'elasticnet': f'{model_dir}/elasticnet_model.pkl',
            'xgboost': f'{model_dir}/xgboost_model.pkl',
            'lightgbm': f'{model_dir}/lightgbm_model.pkl'
        }

        for model_name, model_file in model_files.items():
            if not os.path.exists(model_file):
                print(f"警告: 找不到 {model_name} 模型文件")
                continue

            print(f"\n加载{model_name.upper()}模型...")
            model = joblib.load(model_file)

            # Clip at zero: negative predictions are replaced by 0.
            preds = np.maximum(model.predict(test_data_scaled), 0)

            predictions[model_name.upper()] = preds
            print(f"{model_name.upper()} 预测范围: [{preds.min():.2f}, {preds.max():.2f}]")

        if not predictions:
            print(f"⚠️ {model_dir} 中没有可用的模型，{data_type} 数据集没有预测结果")

        self.predictions[data_type] = {
            'df': original_test_df,
            'predictions': predictions
        }

        print(f"\n{data_type}数据集预测完成!")

    def feature_engineering(self, df):
        """Return a copy of ``df`` with derived features added.

        Each group of features is only created when its source columns exist,
        so a frame without any of them comes back unchanged apart from
        inf -> NaN replacement.

        NOTE(review): the frequency encoding, the centre used for
        ``distance_to_center`` and the category dropped by ``drop_first`` are
        all derived from the frame passed in (the test set here), not from
        training-time statistics. Confirm this matches how the models were
        trained.
        """
        df = df.copy()

        # Room-count features.
        if 'bedrooms' in df.columns and 'bathrooms' in df.columns:
            df['total_rooms'] = df['bedrooms'] + df['bathrooms']
            df['bed_bath_ratio'] = df['bedrooms'] / (df['bathrooms'] + 1)

        # Area features; +1 in the denominator avoids division by zero.
        if 'sqft' in df.columns:
            df['sqft_log'] = np.log1p(df['sqft'])
            if 'bedrooms' in df.columns:
                df['sqft_per_bedroom'] = df['sqft'] / (df['bedrooms'] + 1)

        # Geographic features.
        if 'latitude' in df.columns and 'longitude' in df.columns:
            df['lat_lon_ratio'] = df['latitude'] / (np.abs(df['longitude']) + 1)
            df['distance_to_center'] = np.sqrt(
                (df['latitude'] - df['latitude'].mean())**2 +
                (df['longitude'] - df['longitude'].mean())**2
            )

        # Frequency encoding. Values without a frequency (e.g. missing
        # categories) get 0. Assign the filled result back: an in-place
        # fillna on a selected column is chained assignment and does not
        # update the frame under pandas Copy-on-Write.
        categorical_features = ['state', 'type']
        for col in categorical_features:
            if col in df.columns:
                freq = df[col].value_counts(normalize=True)
                df[f'{col}_freq'] = df[col].map(freq).fillna(0)

        # One-hot encoding.
        if 'state' in df.columns:
            state_dummies = pd.get_dummies(df['state'], prefix='state', drop_first=True)
            df = pd.concat([df, state_dummies], axis=1)

        if 'type' in df.columns:
            type_dummies = pd.get_dummies(df['type'], prefix='type', drop_first=True)
            df = pd.concat([df, type_dummies], axis=1)

        # Turn inf into NaN so the caller's imputation step handles both.
        df = df.replace([np.inf, -np.inf], np.nan)

        return df

    def _save_model_predictions(self, output_dir, data_type, title):
        """Write one ``<data_type>_predictions_<model>.csv`` per stored model."""
        entry = self.predictions[data_type]
        print(f"\n保存{title}预测结果...")

        for model_name, preds in entry['predictions'].items():
            output_file = f"{output_dir}/{data_type}_predictions_{model_name.lower()}.csv"
            result_df = pd.DataFrame({
                'id': entry['df']['id'],
                data_type: preds
            })
            result_df.to_csv(output_file, index=False)
            print(f"✓ {output_file}")

    def save_predictions(self, output_dir):
        """Write per-model CSVs and, if possible, the combined submission.

        Args:
            output_dir: Directory for the output files; created if missing.

        The submission file ``kaggle_submission.csv`` stacks the rent rows
        (with ``price`` empty) on top of the price rows (with ``rent``
        empty), using the mean over all models as the prediction. It is only
        written when both data types have at least one model's predictions.
        """
        os.makedirs(output_dir, exist_ok=True)

        if 'rent' in self.predictions:
            self._save_model_predictions(output_dir, 'rent', "租房")

        if 'price' in self.predictions:
            self._save_model_predictions(output_dir, 'price', "售房")

        if 'rent' in self.predictions and 'price' in self.predictions:
            rent_preds = self.predictions['rent']
            price_preds = self.predictions['price']

            # Averaging an empty set of models would only produce NaN, so
            # skip the submission rather than write a meaningless file.
            if not rent_preds['predictions'] or not price_preds['predictions']:
                print("\n⚠️ 缺少模型预测结果，跳过Kaggle提交文件")
                return

            print(f"\n创建Kaggle提交文件...")

            # Simple ensemble: element-wise mean over all available models.
            rent_ensemble = np.mean(list(rent_preds['predictions'].values()), axis=0)
            price_ensemble = np.mean(list(price_preds['predictions'].values()), axis=0)

            submission_df = pd.concat([
                pd.DataFrame({'id': rent_preds['df']['id'], 'rent': rent_ensemble, 'price': np.nan}),
                pd.DataFrame({'id': price_preds['df']['id'], 'rent': np.nan, 'price': price_ensemble})
            ], ignore_index=True)

            submission_file = f"{output_dir}/kaggle_submission.csv"
            submission_df.to_csv(submission_file, index=False)
            print(f"✓ {submission_file}")
            print(f"\n提交文件统计:")
            print(f"  - 租房预测: {(~submission_df['rent'].isna()).sum()} 条")
            print(f"  - 售房预测: {(~submission_df['price'].isna()).sum()} 条")
            print(f"  - 总计: {len(submission_df)} 条")

predictor = PredictionGenerator()

# One entry per dataset, processed in order: rent first, then price.
prediction_jobs = (
    dict(
        test_file=CONFIG['rent_test_file'],
        model_dir=CONFIG['rent_models_dir'],
        scaler_file=f"{CONFIG['rent_train_dir']}/feature_scaler.pkl",
        feature_columns_file=f"{CONFIG['rent_train_dir']}/feature_columns.csv",
        data_type='rent',
    ),
    dict(
        test_file=CONFIG['price_test_file'],
        model_dir=CONFIG['price_models_dir'],
        scaler_file=f"{CONFIG['price_train_dir']}/feature_scaler.pkl",
        feature_columns_file=f"{CONFIG['price_train_dir']}/feature_columns.csv",
        data_type='price',
    ),
)
for job in prediction_jobs:
    predictor.generate_predictions(**job)

# Write every per-model file plus the combined submission.
predictor.save_predictions(CONFIG['predictions_dir'])

# Closing banner followed by a summary of where the artefacts were written.
print("\n" + "=" * 80, "所有处理完成!", "=" * 80, sep="\n")
print(
    "\n生成的文件:",
    f"- 租房模型: {CONFIG['rent_models_dir']}/",
    f"- 售房模型: {CONFIG['price_models_dir']}/",
    f"- 预测文件: {CONFIG['predictions_dir']}/",
    f"- Kaggle提交: {CONFIG['predictions_dir']}/kaggle_submission.csv",
    f"\n完成时间: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}",
    sep="\n",
)


生成预测文件

处理rent数据集...
测试数据的列名: ['ID', '城市', '户型', '装修', '楼层', '面积', '朝向', '交易时间', '付款方式', '租赁方式']...
✓ ID 列已确认，共 9773 条记录
特征列文件的列名: ['columns']
加载了 19 个特征
缩放前 - 处理缺失值前: 6822 个 NaN
缩放前 - 处理缺失值后: 0 个 NaN
✓ 数据验证通过，shape: (9773, 19)

加载RIDGE模型...
RIDGE 预测范围: [0.00, 835451.56]

加载LASSO模型...
LASSO 预测范围: [0.00, 1332440.17]
警告: 找不到 elasticnet 模型文件
警告: 找不到 xgboost 模型文件
警告: 找不到 lightgbm 模型文件

rent数据集预测完成!

处理price数据集...
测试数据的列名: ['ID', '城市', '区域', '板块', '环线', '房屋户型', '所在楼层', '建筑面积', '套内面积', '房屋朝向']...
✓ ID 列已确认，共 34017 条记录
特征列文件的列名: ['columns']
加载了 22 个特征
缩放前 - 处理缺失值前: 58562 个 NaN
缩放前 - 处理缺失值后: 0 个 NaN
⚠️ 缩放后发现 NaN: 170085 个
缩放后 - 处理后的 NaN: 0 个
✓ 数据验证通过，shape: (34017, 22)

加载RIDGE模型...
RIDGE 预测范围: [0.00, 31068031.67]

加载LASSO模型...
LASSO 预测范围: [0.00, 31742665.73]
警告: 找不到 elasticnet 模型文件
警告: 找不到 xgboost 模型文件
警告: 找不到 lightgbm 模型文件

price数据集预测完成!

保存租房预测结果...
✓ predictions/rent_predictions_ridge.csv
✓ predictions/rent_predictions_lasso.csv

保存售房预测结果...
✓ predictions/price_predictions_ridge.csv
✓ pre