In [2]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression, Lasso, Ridge, ElasticNet
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.preprocessing import PolynomialFeatures, StandardScaler
from sklearn.metrics import mean_absolute_error, mean_squared_error
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.feature_selection import SelectKBest, f_regression, RFE
import warnings
warnings.filterwarnings('ignore')
import time
import os

class LinearRegressionProject:
    def __init__(self):
        self.results = []
        self.models = {}
        self.best_model = None
        self.best_model_name = None
        
    def load_and_clean_data(self):
        """加载并清洗数据"""
        print("="*60)
        print("步骤1: 加载并清洗数据")
        print("="*60)
        
        try:
            # 加载训练集和测试集
            train_data = pd.read_csv("正常化数据_train_price.csv")
            test_data = pd.read_csv("正常化数据_test_price.csv")
            
            print(f"原始训练集形状: {train_data.shape}")
            print(f"原始测试集形状: {test_data.shape}")
            
            # 数据清洗函数
            def clean_dataset(df):
                df_clean = df.copy()
                
                # 处理无穷大值
                df_clean = df_clean.replace([np.inf, -np.inf], np.nan)
                
                # 处理超出float64范围的值
                float_max = np.finfo(np.float64).max
                float_min = np.finfo(np.float64).min
                
                for col in df_clean.select_dtypes(include=[np.number]).columns:
                    df_clean[col] = df_clean[col].apply(
                        lambda x: np.nan if x > float_max or x < float_min else x
                    )
                
                # 填充缺失值
                for col in df_clean.columns:
                    if df_clean[col].isnull().sum() > 0:
                        median_val = df_clean[col].median()
                        df_clean[col].fillna(median_val, inplace=True)
                
                return df_clean
            
            # 清洗数据
            train_data_clean = clean_dataset(train_data)
            test_data_clean = clean_dataset(test_data)
            
            print(f"清洗后训练集缺失值: {train_data_clean.isnull().sum().sum()}")
            print(f"清洗后测试集缺失值: {test_data_clean.isnull().sum().sum()}")
            
            # 检查必要的列
            if 'Price' not in train_data_clean.columns:
                raise KeyError("训练集中未找到'Price'列")
            if 'ID' not in test_data_clean.columns:
                raise KeyError("测试集中未找到'ID'列")
            
            return train_data_clean, test_data_clean
            
        except Exception as e:
            print(f"数据加载错误: {e}")
            return None, None
    
    def remove_outliers(self, df, target_col, n_std=3):
        """移除异常值"""
        if target_col not in df.columns:
            return df
        
        # 先处理缺失值
        df_clean = df.dropna(subset=[target_col])
        
        mean = df_clean[target_col].mean()
        std = df_clean[target_col].std()
        
        # 计算异常值边界
        lower_bound = mean - n_std * std
        upper_bound = mean + n_std * std
        
        # 过滤异常值
        filtered_df = df_clean[(df_clean[target_col] >= lower_bound) & 
                              (df_clean[target_col] <= upper_bound)]
        
        print(f"原始样本数: {len(df)}")
        print(f"移除异常值后样本数: {len(filtered_df)}")
        print(f"移除的异常值数量: {len(df) - len(filtered_df)}")
        
        return filtered_df
    
    def prepare_features(self, train_data, test_data):
        """Build model inputs: split off target/IDs, impute, and hold out a validation set.

        ``Price`` is taken as the target and removed from the training
        features; ``ID`` is removed from the test features and returned
        separately. A median imputer is fitted on the training features and
        applied to both feature sets, after which 20% of the training rows
        are held out for validation (``random_state=42``).

        Returns:
            tuple: ``(X_train, X_val, X_test, y_train, y_val, test_ids)``.
        """
        # Target and features from the training frame.
        X = train_data.drop('Price', axis=1)
        y = train_data['Price']

        # Features and identifiers from the test frame.
        X_test = test_data.drop('ID', axis=1)
        test_ids = test_data['ID']

        n_samples, n_features = X.shape
        print(f"特征维度: {n_features}")
        print(f"训练样本数: {n_samples}")

        # Median imputation; statistics come from the training features only.
        median_imputer = SimpleImputer(strategy='median')
        median_imputer.fit(X)

        # The imputer returns plain arrays; restore the labels and index.
        X = pd.DataFrame(median_imputer.transform(X),
                         columns=X.columns, index=X.index)
        X_test = pd.DataFrame(median_imputer.transform(X_test),
                              columns=X_test.columns, index=X_test.index)

        # 80/20 train/validation split with a fixed seed.
        X_train, X_val, y_train, y_val = train_test_split(
            X, y, test_size=0.2, random_state=42
        )

        for label, part in (("训练", X_train), ("验证", X_val), ("测试", X_test)):
            print(f"{label}特征形状: {part.shape}")

        return X_train, X_val, X_test, y_train, y_val, test_ids
    
    def create_polynomial_features(self, X_train, X_val, X_test, degree=2):
        """Expand the three feature sets with polynomial and interaction terms.

        The expansion (no bias column, powers and interactions up to
        ``degree``) is fitted on ``X_train`` and applied to all three sets.

        Returns:
            tuple: ``(X_train_poly, X_val_poly, X_test_poly, transformer)``.
            If anything raises, the inputs are returned unchanged with
            ``None`` in place of the transformer.
        """
        print(f"创建多项式特征 (degree={degree})...")

        try:
            expander = PolynomialFeatures(
                degree=degree, include_bias=False, interaction_only=False
            )
            expander.fit(X_train)

            train_out, val_out, test_out = (
                expander.transform(part) for part in (X_train, X_val, X_test)
            )

            print(f"多项式特征后训练集形状: {train_out.shape}")

            return train_out, val_out, test_out, expander

        except Exception as e:
            # Best-effort step: fall back to the untransformed features.
            print(f"创建多项式特征时出错: {e}")
            return X_train, X_val, X_test, None
    
    def feature_selection(self, X_train, X_val, X_test, y_train, k=50):
        """Keep the ``k`` features that score highest under ``f_regression``.

        The selector is fitted on ``(X_train, y_train)`` and applied to all
        three feature sets. ``k`` is capped at the number of available
        columns.

        Returns:
            tuple: ``(X_train_sel, X_val_sel, X_test_sel, selector)``. If
            anything raises, the inputs are returned unchanged with ``None``
            in place of the selector.
        """
        print("进行特征选择...")

        try:
            n_keep = min(k, X_train.shape[1])
            selector = SelectKBest(score_func=f_regression, k=n_keep)
            selector.fit(X_train, y_train)

            reduced = [selector.transform(part) for part in (X_train, X_val, X_test)]

            print(f"特征选择后训练集形状: {reduced[0].shape}")

            return reduced[0], reduced[1], reduced[2], selector

        except Exception as e:
            # Best-effort step: fall back to the full feature set.
            print(f"特征选择失败: {e}")
            return X_train, X_val, X_test, None
    
    def evaluate_model(self, model, X_train, X_val, y_train, y_val, model_name, cv_folds=6):
        """Report MAE/RMSE for a model in-sample, on the validation set and under k-fold CV.

        ``model.predict`` is called directly on the training and validation
        features, so ``model`` has to be fitted before it is passed in.

        Args:
            model: Fitted estimator to score.
            X_train, y_train: Training features and target (also used for CV).
            X_val, y_val: Validation features and target.
            model_name: Label used in the printed report and in the result.
            cv_folds: Number of cross-validation folds.

        Returns:
            tuple: ``(metrics, y_val_pred)`` where ``metrics`` is a dict with
            keys ``model_name``, ``in_sample_mae``, ``in_sample_rmse``,
            ``out_sample_mae``, ``out_sample_rmse``, ``cv_mae`` and
            ``cv_rmse``; ``(None, None)`` if any step raises.
        """
        print(f"\n评估 {model_name} 模型...")

        try:
            # Imported here because the file's top-level sklearn imports do
            # not include cross_validate.
            from sklearn.model_selection import cross_validate

            # In-sample fit quality.
            y_train_pred = model.predict(X_train)
            train_mae = mean_absolute_error(y_train, y_train_pred)
            train_rmse = np.sqrt(mean_squared_error(y_train, y_train_pred))

            # Hold-out (validation) quality.
            y_val_pred = model.predict(X_val)
            val_mae = mean_absolute_error(y_val, y_val_pred)
            val_rmse = np.sqrt(mean_squared_error(y_val, y_val_pred))

            print(f"进行{cv_folds}折交叉验证...")
            # One cross_validate call scores both metrics from the same fold
            # fits; two cross_val_score calls would refit every fold twice.
            cv_out = cross_validate(
                model, X_train, y_train,
                scoring=('neg_mean_absolute_error', 'neg_mean_squared_error'),
                cv=cv_folds, n_jobs=1,
            )
            # The scorers are negated errors; flip the sign back.
            cv_mae = (-cv_out['test_neg_mean_absolute_error']).mean()
            # RMSE is taken per fold and then averaged.
            cv_rmse = np.sqrt(-cv_out['test_neg_mean_squared_error']).mean()

            results = {
                'model_name': model_name,
                'in_sample_mae': train_mae,
                'in_sample_rmse': train_rmse,
                'out_sample_mae': val_mae,
                'out_sample_rmse': val_rmse,
                'cv_mae': cv_mae,
                'cv_rmse': cv_rmse
            }

            print(f"{model_name} 性能:")
            print(f"  样本内 MAE: {train_mae:.4f}, RMSE: {train_rmse:.4f}")
            print(f"  样本外 MAE: {val_mae:.4f}, RMSE: {val_rmse:.4f}")
            print(f"  {cv_folds}折交叉验证 MAE: {cv_mae:.4f}, RMSE: {cv_rmse:.4f}")

            return results, y_val_pred

        except Exception as e:
            # Callers treat (None, None) as "this model could not be scored".
            print(f"评估模型 {model_name} 时出错: {e}")
            return None, None
    
    def train_ols(self, X_train, X_val, y_train, y_val):
        """Fit an ordinary least squares model, evaluate it and record it.

        The fitted estimator is stored under ``self.models['OLS']`` and, when
        evaluation yields metrics, they are appended to ``self.results``.

        Returns:
            dict | None: The metrics dict, or ``None`` when fitting raises or
            evaluation produces no metrics.
        """
        rule = "="*50
        print("\n" + rule)
        print("训练 OLS 模型")
        print(rule)

        try:
            estimator = LinearRegression()

            fit_started = time.time()
            estimator.fit(X_train, y_train)
            training_time = time.time() - fit_started

            print(f"OLS训练完成，耗时: {training_time:.2f}秒")

            metrics, _ = self.evaluate_model(
                estimator, X_train, X_val, y_train, y_val, "OLS"
            )
            # The model is registered even when evaluation returned nothing.
            self.models['OLS'] = estimator

            if not metrics:
                return None

            self.results.append(metrics)
            return metrics

        except Exception as e:
            print(f"OLS训练失败: {e}")
            return None
    
    def train_regularized_model(self, model_type, X_train, X_val, y_train, y_val):
        """训练正则化模型(Lasso/Ridge/ElasticNet)并进行超参数调优"""
        print("\n" + "="*50)
        print(f"训练 {model_type} 模型")
        print("="*50)
        
        # 根据模型类型设置参数
        if model_type == "LASSO":
            model_class = Lasso
            param_grid = {
                'regressor__alpha': [0.001, 0.01, 0.1, 1, 10, 100, 1000],
                'regressor__max_iter': [1000, 5000]
            }
        elif model_type == "Ridge":
            model_class = Ridge
            param_grid = {
                'regressor__alpha': [0.001, 0.01, 0.1, 1, 10, 100, 1000],
                'regressor__max_iter': [1000, 5000]
            }
        elif model_type == "ElasticNet":
            model_class = ElasticNet
            param_grid = {
                'regressor__alpha': [0.001, 0.01, 0.1, 1, 10],
                'regressor__l1_ratio': [0.1, 0.3, 0.5, 0.7, 0.9],
                'regressor__max_iter': [1000, 5000]
            }
        else:
            print(f"未知模型类型: {model_type}")
            return None
        
        try:
            # 创建管道
            model_pipe = Pipeline([
                ('scaler', StandardScaler()),
                ('regressor', model_class(random_state=42))
            ])
            
            # 使用GridSearchCV进行超参数调优 - 根据图片要求
            print(f"开始超参数调优，参数网格: {param_grid}")
            print(f"开始时间: {time.strftime('%Y-%m-%d %H:%M:%S')}")
            
            # 计算总拟合次数用于进度估计
            total_fits = 1
            for param, values in param_grid.items():
                total_fits *= len(values)
            total_fits *= 6  # 6折交叉验证
            
            print(f"预计总拟合次数: {total_fits}")
            
            # 设置详细的进度显示
            grid_search = GridSearchCV(
                model_pipe, param_grid, cv=6,  # 6折交叉验证
                scoring='neg_mean_absolute_error', 
                n_jobs=1,  # 单进程以便更好显示进度
                verbose=10,  # 最详细的进度输出
                return_train_score=True
            )
            
            start_time = time.time()
            grid_search.fit(X_train, y_train)
            end_time = time.time()
            
            training_time = end_time - start_time
            print(f"{model_type}训练完成!")
            print(f"总耗时: {training_time:.2f}秒 ({training_time/60:.2f}分钟)")
            print(f"最佳参数: {grid_search.best_params_}")
            print(f"最佳分数(MAE): {-grid_search.best_score_:.4f}")
            
            # 保存最佳模型
            best_model = grid_search.best_estimator_
            self.models[model_type] = best_model
            
            # 评估模型
            results, _ = self.evaluate_model(best_model, X_train, X_val, y_train, y_val, model_type)
            
            if results:
                self.results.append(results)
                return results
                
        except Exception as e:
            print(f"{model_type}训练失败: {e}")
            # 尝试使用默认参数
            try:
                print("尝试使用默认参数训练...")
                default_model = Pipeline([
                    ('scaler', StandardScaler()),
                    ('regressor', model_class(alpha=1.0, random_state=42))
                ])
                default_model.fit(X_train, y_train)
                self.models[model_type] = default_model
                
                results, _ = self.evaluate_model(default_model, X_train, X_val, y_train, y_val, model_type)
                if results:
                    self.results.append(results)
                    return results
            except Exception as e2:
                print(f"默认参数训练也失败: {e2}")
        
        return None
    
    def find_best_model(self):
        """根据验证集性能选择最佳模型"""
        if not self.results:
            print("没有可用的模型结果")
            return None
        
        # 过滤有效结果
        valid_results = [r for r in self.results if r is not None and not np.isnan(r['out_sample_mae'])]
        
        if not valid_results:
            print("没有有效的模型结果")
            return None
        
        # 根据样本外MAE选择最佳模型
        best_result = min(valid_results, key=lambda x: x['out_sample_mae'])
        self.best_model_name = best_result['model_name']
        self.best_model = self.models.get(self.best_model_name)
        
        print(f"\n最佳模型: {self.best_model_name}")
        print(f"最佳模型验证集MAE: {best_result['out_sample_mae']:.4f}")
        
        # 添加最佳模型行到结果中
        best_row = {
            'model_name': 'Best Linear Model',
            'in_sample_mae': best_result['in_sample_mae'],
            'in_sample_rmse': best_result['in_sample_rmse'],
            'out_sample_mae': best_result['out_sample_mae'],
            'out_sample_rmse': best_result['out_sample_rmse'],
            'cv_mae': best_result['cv_mae'],
            'cv_rmse': best_result['cv_rmse']
        }
        
        self.results.append(best_row)
        
        return best_result
    
    def create_results_table(self, total_predictions):
        """创建符合图片要求的性能表格"""
        print("\n" + "="*80)
        print("模型性能汇总表 (按照图片要求格式)")
        print("="*80)
        
        # 创建表格数据
        table_data = []
        
        for result in self.results:
            if result is None:
                continue
                
            # 计算Kaggle Score (MAE * 100，取整)
            kaggle_score = int(result['out_sample_mae'] * 100)
            
            row = {
                'Metrics': result['model_name'],
                'In sample': f"{result['in_sample_mae']:.4f}",
                'out of sample': f"{result['out_sample_mae']:.4f}",
                'Cross-validation': f"{result['cv_mae']:.4f}",
                'Kaggle Score': f"{kaggle_score}"
            }
            table_data.append(row)
        
        # 创建DataFrame
        results_df = pd.DataFrame(table_data)
        
        # 显示表格
        print(results_df.to_string(index=False))
        
        # 报告去除异常值后的总预测数量
        print(f"\n去除异常值后的总预测数量: {total_predictions}")
        
        return results_df
    
    def save_predictions(self, X_test, test_ids):
        """保存预测结果到CSV文件"""
        print("\n" + "="*60)
        print("保存预测结果")
        print("="*60)
        
        for model_name, model in self.models.items():
            try:
                predictions = model.predict(X_test)
                
                result_df = pd.DataFrame({
                    'ID': test_ids,
                    'predict_price': predictions
                })
                
                filename = f"{model_name}_predict_price.csv"
                result_df.to_csv(filename, index=False)
                print(f"{model_name}预测结果已保存到: {filename}")
                
            except Exception as e:
                print(f"保存{model_name}预测结果失败: {e}")
        
        # 保存最佳模型预测结果
        if self.best_model:
            try:
                predictions = self.best_model.predict(X_test)
                
                result_df = pd.DataFrame({
                    'ID': test_ids,
                    'predict_price': predictions
                })
                
                filename = "Best_Linear_Model_predict_price.csv"
                result_df.to_csv(filename, index=False)
                print(f"最佳模型预测结果已保存到: {filename}")
                
            except Exception as e:
                print(f"保存最佳模型预测结果失败: {e}")
    
    def run(self):
        """主运行函数"""
        print("开始线性回归建模任务")
        print("任务要求:")
        print("- 使用线性模型: OLS, Lasso, Ridge, ElasticNet")
        print("- 特征工程: 添加非线性特征和交互项")
        print("- 超参数调优: 使用GridSearchCV")
        print("- 6折交叉验证评估")
        print("- 使用MAE和RMSE报告性能")
        print("- 输出符合图片要求的表格")
        
        start_time = time.time()
        
        # 1. 加载并清洗数据
        train_data, test_data = self.load_and_clean_data()
        if train_data is None or test_data is None:
            print("数据加载失败，程序退出")
            return
        
        # 2. 移除异常值
        print("\n" + "="*60)
        print("步骤2: 处理异常值")
        print("="*60)
        train_data_clean = self.remove_outliers(train_data, 'Price')
        total_predictions = len(train_data_clean)
        
        # 3. 准备特征
        print("\n" + "="*60)
        print("步骤3: 准备特征")
        print("="*60)
        X_train, X_val, X_test, y_train, y_val, test_ids = self.prepare_features(train_data_clean, test_data)
        
        # 4. 特征工程 - 根据图片要求添加非线性特征和交互项
        print("\n" + "="*60)
        print("步骤4: 特征工程")
        print("="*60)
        X_train_poly, X_val_poly, X_test_poly, poly_transformer = self.create_polynomial_features(
            X_train, X_val, X_test, degree=2
        )
        
        # 5. 特征选择 - 根据图片要求增删特征
        print("\n" + "="*60)
        print("步骤5: 特征选择")
        print("="*60)
        X_train_final, X_val_final, X_test_final, selector = self.feature_selection(
            X_train_poly, X_val_poly, X_test_poly, y_train, k=100
        )
        
        # 6. 训练模型
        print("\n" + "="*60)
        print("步骤6: 训练模型")
        print("="*60)
        
        # 训练OLS模型
        self.train_ols(X_train_final, X_val_final, y_train, y_val)
        
        # 训练正则化模型
        for model_type in ["LASSO", "Ridge", "ElasticNet"]:
            self.train_regularized_model(model_type, X_train_final, X_val_final, y_train, y_val)
        
        # 7. 选择最佳模型
        print("\n" + "="*60)
        print("步骤7: 选择最佳模型")
        print("="*60)
        self.find_best_model()
        
        # 8. 创建结果表格
        print("\n" + "="*60)
        print("步骤8: 生成性能表格")
        print("="*60)
        results_table = self.create_results_table(total_predictions)
        
        # 保存结果表格
        results_table.to_csv('model_performance_summary.csv', index=False)
        print("\n模型性能汇总表已保存到: model_performance_summary.csv")
        
        # 9. 保存预测结果
        print("\n" + "="*60)
        print("步骤9: 保存预测结果")
        print("="*60)
        self.save_predictions(X_test_final, test_ids)
        
        # 计算总运行时间
        total_time = time.time() - start_time
        print(f"\n任务完成! 总运行时间: {total_time:.2f}秒 ({total_time/60:.2f}分钟)")
        
        return results_table

# Run the main program: build the project and execute the full workflow.
if __name__ == "__main__":
    project = LinearRegressionProject()
    results = project.run()

开始线性回归建模任务
任务要求:
- 使用线性模型: OLS, Lasso, Ridge, ElasticNet
- 特征工程: 添加非线性特征和交互项
- 超参数调优: 使用GridSearchCV
- 6折交叉验证评估
- 使用MAE和RMSE报告性能
- 输出符合图片要求的表格
步骤1: 加载并清洗数据
原始训练集形状: (103871, 72)
原始测试集形状: (34017, 72)
清洗后训练集缺失值: 0
清洗后测试集缺失值: 0

步骤2: 处理异常值
原始样本数: 103871
移除异常值后样本数: 101844
移除的异常值数量: 2027

步骤3: 准备特征
特征维度: 71
训练样本数: 101844
训练特征形状: (81475, 71)
验证特征形状: (20369, 71)
测试特征形状: (34017, 71)

步骤4: 特征工程
创建多项式特征 (degree=2)...
多项式特征后训练集形状: (81475, 2627)

步骤5: 特征选择
进行特征选择...
特征选择后训练集形状: (81475, 100)

步骤6: 训练模型

训练 OLS 模型
OLS训练完成，耗时: 0.31秒

评估 OLS 模型...
进行6折交叉验证...
OLS 性能:
  样本内 MAE: 764748.0128, RMSE: 1144693.3356
  样本外 MAE: 779055.6001, RMSE: 1163470.0911
  6折交叉验证 MAE: 766108.3478, RMSE: 1146726.9363

训练 LASSO 模型
开始超参数调优，参数网格: {'regressor__alpha': [0.001, 0.01, 0.1, 1, 10, 100, 1000], 'regressor__max_iter': [1000, 5000]}
开始时间: 2025-10-28 20:29:29
预计总拟合次数: 84
Fitting 6 folds for each of 14 candidates, totalling 84 fits
[CV 1/6; 1/14] START regressor__alpha=0.001, regressor__max_iter=1000...........
[CV 1/6