In [1]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score
import statsmodels.api as sm

In [2]:
# Dataset identifier: it names both the input file ("<data_batch>_all_data.csv")
# and the target column. It used to be defined only in a commented-out line,
# so a fresh kernel failed here with NameError. A value already present in the
# namespace (e.g. set by an earlier cell or an external driver) is kept;
# otherwise fall back to "price", the dependent-variable name shown in this
# notebook's recorded OLS summary.
if "data_batch" not in globals():
    data_batch = "price"

# Load the CSV file for this batch.
data = pd.read_csv(f"{data_batch}_all_data.csv")

# Feature matrix and target vector.
X = data[['space', 'location']]  # independent variables: house area and district
y = data[data_batch]  # dependent variable: the column named after the batch (house price)

# Add a constant column so the OLS fit estimates an intercept.
X_with_const = sm.add_constant(X)

# Fit with statsmodels to obtain detailed statistics, including p-values.
model_sm = sm.OLS(y, X_with_const).fit()

# Fit the same regression with scikit-learn; this model is used for prediction.
model_sk = LinearRegression()
model_sk.fit(X, y)

LinearRegression()

In [3]:
# In-sample predictions from the scikit-learn model, stored back on the
# original DataFrame in a column named "pre_<data_batch>".
predictions = model_sk.predict(X)
data[f"pre_{data_batch}"] = predictions

In [4]:
# Write the DataFrame (including the prediction column) to a new CSV file,
# without the index column.
result_csv = f"{data_batch}_linear_result.csv"
data.to_csv(result_csv, index=False)

In [5]:
# Report goodness-of-fit figures from the statsmodels fit, followed by the
# full coefficient / significance summary table.
banner = "=" * 50
print(banner, "线性回归模型评估结果", banner, sep="\n")
print(f"R方 (R²): {model_sm.rsquared:.4f}")
print(f"调整后R方 (Adjusted R²): {model_sm.rsquared_adj:.4f}")
print("\n系数估计和显著性检验:")
print(model_sm.summary())

线性回归模型评估结果
R方 (R²): 0.7537
调整后R方 (Adjusted R²): 0.7536

系数估计和显著性检验:
                            OLS Regression Results                            
Dep. Variable:                  price   R-squared:                       0.754
Model:                            OLS   Adj. R-squared:                  0.754
Method:                 Least Squares   F-statistic:                     4314.
Date:                Wed, 15 Oct 2025   Prob (F-statistic):               0.00
Time:                        19:09:29   Log-Likelihood:                -46155.
No. Observations:                2822   AIC:                         9.232e+04
Df Residuals:                    2819   BIC:                         9.233e+04
Df Model:                           2                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
---------------------------------------------------------------

In [6]:
# Print the p-value of every estimated coefficient, one per line.
# The p-values are paired with the design-matrix column names by iteration
# rather than looked up with `model_sm.pvalues[i]`: an integer key on a
# label-indexed pandas Series relies on a positional fallback that recent
# pandas versions deprecate.
print("\n各个特征的p值:")
labelled_p_values = zip(X_with_const.columns, model_sm.pvalues)
for position, (feature, p_value) in enumerate(labelled_p_values):
    if position == 0:
        # The first column of X_with_const is treated as the intercept term.
        print(f"常数项 (截距): p值 = {p_value:.6f}")
    else:
        print(f"{feature}: p值 = {p_value:.6f}")


各个特征的p值:
常数项 (截距): p值 = 0.000094
space: p值 = 0.000000
location: p值 = 0.000000


In [7]:
# Cross-check: recompute R² with scikit-learn from the stored predictions,
# as a verification of the statsmodels figure.
r2_sklearn = r2_score(y_true=y, y_pred=predictions)
print(f"\nSklearn计算的R方 (验证): {r2_sklearn:.4f}")


Sklearn计算的R方 (验证): 0.7537
