In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score
import scipy.stats as stats
import statsmodels.api as sm

In [2]:
# Load the input CSV.
# `data_batch` is both the file-name prefix and the name of the target column.
# It has to be defined in this cell: leaving it commented out makes the notebook
# fail with NameError on a fresh kernel (Restart Kernel -> Run All).
# Change the value below to analyse a different batch.
data_batch = "Rent"
# Replace the path below with your actual file path if the CSV lives elsewhere.
df = pd.read_csv(f"{data_batch}_all_data.csv")

# Split the frame into the two predictors and the response column.
X = df[['space', 'location']]
y = df[data_batch]

In [3]:
# Expand the two raw predictors into a full degree-2 basis: the original
# columns, their squares, and their cross term (no bias column is generated).
poly = PolynomialFeatures(
    degree=2,
    interaction_only=False,
    include_bias=False,
)
X_poly = poly.fit_transform(X)

# Recover readable names for the generated columns and show them.
feature_names = poly.get_feature_names_out(['space', 'location'])
print("生成的特征项:", feature_names)

# Fit a linear regression on the expanded features: linear in the expanded
# space, but able to capture non-linear effects of the original predictors.
model = LinearRegression()
model.fit(X_poly, y)

生成的特征项: ['space' 'location' 'space^2' 'space location' 'location^2']


LinearRegression()

In [4]:
# In-sample predictions from the fitted polynomial model.
y_pred = model.predict(X_poly)

# Append the predictions to the loaded frame under a "pre_<target>" column.
pred_col = f"pre_{data_batch}"
df[pred_col] = y_pred

# Keep only the four columns of interest: target, both predictors, prediction.
export_cols = [data_batch, "space", "location", pred_col]
result_df = df[export_cols]

# Write the result table to a new CSV file, without the index column.
result_df.to_csv(f"{data_batch}_nonlinear_result.csv", index=False)

In [5]:
# Refit the same design with statsmodels to obtain inferential statistics
# (standard errors, p-values); an intercept column is added first.
X_with_const = sm.add_constant(X_poly)
model_sm = sm.OLS(y, X_with_const).fit()

# Sample size and number of regressors (the intercept is not counted).
n = len(y)
p = X_poly.shape[1]

# R-squared of the scikit-learn predictions, and its adjusted counterpart
# which penalises the number of regressors.
r_squared = r2_score(y, y_pred)
adj_r_squared = 1 - (1 - r_squared) * (n - 1) / (n - p - 1)

# Root-mean-squared error of the in-sample predictions.
residuals = y - y_pred
rmse = np.sqrt(np.mean(residuals ** 2))

In [6]:
# Print the goodness-of-fit metrics under a banner header.
print("\n" + "=" * 50, "模型评估指标", "=" * 50, sep="\n")
print(
    f"R方 (R-squared): {r_squared:.6f}",
    f"调整R方 (Adjusted R-squared): {adj_r_squared:.6f}",
    f"均方根误差 (RMSE): {rmse:.6f}",
    f"样本数量: {n}",
    f"特征数量: {p}",
    sep="\n",
)


模型评估指标
R方 (R-squared): 0.970007
调整R方 (Adjusted R-squared): 0.969936
均方根误差 (RMSE): 7152.601716
样本数量: 2119
特征数量: 5


In [7]:
# Print the full statsmodels regression summary (includes per-term p-values)
# under a banner header.
print("\n" + "=" * 50, "详细统计摘要", "=" * 50, sep="\n")
print(model_sm.summary())


详细统计摘要
                            OLS Regression Results                            
Dep. Variable:                   Rent   R-squared:                       0.970
Model:                            OLS   Adj. R-squared:                  0.970
Method:                 Least Squares   F-statistic:                 1.367e+04
Date:                Wed, 15 Oct 2025   Prob (F-statistic):               0.00
Time:                        20:05:30   Log-Likelihood:                -21813.
No. Observations:                2119   AIC:                         4.364e+04
Df Residuals:                    2113   BIC:                         4.367e+04
Df Model:                           5                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const      -8649.2999   1434.848     -6.028 

In [8]:
# Print every fitted coefficient with its p-value and significance stars
# (*** p < 0.001, ** p < 0.01, * p < 0.05).
print("\n" + "="*50)
print("特征系数")
print("="*50)

# The intercept comes first, followed by the polynomial terms in design order.
term_names = ['常数项'] + list(feature_names)

# Work on plain positional arrays. Because `y` is a pandas Series, statsmodels
# is expected to return `params` / `pvalues` as label-indexed Series
# ("const", "x1", ...); integer lookups such as `params[i]` on such a Series
# rely on pandas' deprecated positional fallback. np.asarray works whether the
# results are Series or ndarrays.
coef_values = np.asarray(model_sm.params)
p_values = np.asarray(model_sm.pvalues)

for name, coef, p_value in zip(term_names, coef_values, p_values):
    significance = "***" if p_value < 0.001 else "**" if p_value < 0.01 else "*" if p_value < 0.05 else ""
    print(f"{name:15}: {coef:10.6f} (p值: {p_value:.6f}) {significance}")


特征系数
常数项            : -8649.299869 (p值: 0.000000) ***
space          : 177.363015 (p值: 0.000000) ***
location       : 5461.443754 (p值: 0.000309) ***
space^2        :   0.013370 (p值: 0.000000) ***
space location : -65.569608 (p值: 0.000000) ***
location^2     : -516.229146 (p值: 0.186674) 


In [9]:
# Show the dimensions of the loaded frame and of the exported result table.
# Note: by this point `df` already carries the prediction column that was
# appended to it before export, so its shape is not that of the raw CSV.
print(
    f"原始数据形状: {df.shape}",
    f"结果数据形状: {result_df.shape}",
    sep="\n",
)

原始数据形状: (2119, 4)
结果数据形状: (2119, 4)
