In [42]:
import pandas as pd  
import numpy as np
from sklearn.preprocessing import PolynomialFeatures, StandardScaler
from sklearn.linear_model import LassoCV
from statsmodels.stats.outliers_influence import variance_inflation_factor
from scipy import stats
from sklearn.model_selection import train_test_split  
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.linear_model import LinearRegression, Lasso, Ridge, ElasticNet
from sklearn.metrics import mean_absolute_error, mean_squared_error
from math import sqrt
from sklearn.covariance import EllipticEnvelope  

In [43]:
# 1. 数据处理
data_rent =r"D:\人工智能\Python exam\ruc_Class25Q2_train_rent.csv"
data_house=r"D:\人工智能\Python exam\ruc_Class25Q2_train_price.csv"
df1 = pd.read_csv(data_rent, dtype=str)
df2 = pd.read_csv(data_house, dtype=str)

#print("===== df1 原始数据基本信息 =====")
#print(df1.info()) 
#print("\n===== df2 原始数据基本信息 =====")
#print(df2.info())

non_null_counts_df1 = df1.notnull().sum()
keep_cols_df1 = non_null_counts_df1[non_null_counts_df1 > 90000].index.tolist()
df1 = df1[keep_cols_df1]
#print("\n===== df1 筛选后数据基本信息 =====")
#print(df1.info())

non_null_counts_df2 = df2.notnull().sum()
keep_cols_df2 = non_null_counts_df2[non_null_counts_df2 > 90000].index.tolist()
df2 = df2[keep_cols_df2]
#print("\n===== df2 筛选后数据基本信息 =====")
#print(df2.info())

df1 = df1.dropna()  
df2 = df2.dropna()

df1 = df1.drop_duplicates()  
df2 = df2.drop_duplicates() 
#print(df1.info())
#print(df2.info())

numeric_cols = ['Price', '面积', 'lon', 'lat', 'coord_x', 'coord_y','城市','房屋总数','楼栋总数']

for col in numeric_cols:
    if col == 'Price':
        df1[col] = df1[col].str.replace('¥', '').str.replace(',', '').astype(float)
        df1[col] = df1[col]/1000000
    elif col == '面积':
        df1[col] = df1[col].str.replace('㎡', '').astype(float)
    elif col == '房屋总数':        
        df1[col] = df1[col].str.replace('户', '').astype(float)
    elif col == '楼栋总数':        
        df1[col] = df1[col].str.replace('栋', '').astype(float)
    else:
        df1[col] = pd.to_numeric(df1[col], errors='coerce')
df1['户均楼栋房屋数'] = df1['房屋总数'] / df1['楼栋总数']
'''
print("===== 数值列转换后信息 =====")
print(df1[numeric_cols].info())
print(df1[numeric_cols].head()) 

print("\n数据统计描述：")
print(df1.describe())  
print("\n前5行数据：")
print(df1.head()) 
'''
#数据结论房价的两级分化较大，存在大量高价和低价的极值，相比之下租户对房屋的面积刚需较为稳定，经纬度说明跨度大但集中于核心区域

numeric_cols = ['Price', '建筑面积', 'lon', 'lat', 'coord_x', 'coord_y','城市','区域','房屋总数','楼栋总数']

for col in numeric_cols:
    if col == 'Price':
        df2[col] = df2[col].str.replace('¥', '').str.replace(',', '').astype(float)
        df2[col] = df2[col]/1000000
    elif col == '建筑面积':
        df2[col] = df2[col].str.replace('㎡', '').astype(float)
    elif col == '房屋总数':        
        df2[col] = df2[col].str.replace('户', '').astype(float)
    elif col == '楼栋总数':        
        df2[col] = df2[col].str.replace('栋', '').astype(float)
    else:
        df2[col] = pd.to_numeric(df2[col], errors='coerce')
df2['户均楼栋房屋数'] = df2['房屋总数'] / df2['楼栋总数']
'''
print("===== 数值列转换后信息 =====")
print(df2[numeric_cols].info())
print(df2[numeric_cols].head()) 

print("\n数据统计描述：")
print(df2.describe())  
print("\n前5行数据：")
print(df2.head()) 
'''

'\nprint("===== 数值列转换后信息 =====")\nprint(df2[numeric_cols].info())\nprint(df2[numeric_cols].head()) \n\nprint("\n数据统计描述：")\nprint(df2.describe())  \nprint("\n前5行数据：")\nprint(df2.head()) \n'

In [44]:
# 确定特征（X）和目标变量（y）
X_rent = df1.drop(columns=['Price']) 
y_rent = df1['Price'] 

X_train_rent, X_test_rent, y_train_rent, y_test_rent = train_test_split(
    X_rent, y_rent,
    test_size=0.3,  
    random_state=111  
)

# 确定特征（X）和目标变量（y）
X_house = df2.drop(columns=['Price']) 
y_house = df2['Price']  

X_train_house, X_test_house, y_train_house, y_test_house = train_test_split(
    X_house, y_house,
    test_size=0.3,
    random_state=111
)
df1 = df1.loc[:, ~df1.columns.duplicated()]  
df2 = df2.loc[:, ~df2.columns.duplicated()] 
'''
print("数据类型：", type(df1['面积']))
print("维度：", df1['面积'].ndim)
print("形状：", df1['面积'].shape)
print(df1['面积'].head())
'''

'\nprint("数据类型：", type(df1[\'面积\']))\nprint("维度：", df1[\'面积\'].ndim)\nprint("形状：", df1[\'面积\'].shape)\nprint(df1[\'面积\'].head())\n'

In [45]:
# 1. 对数转换
df1['log_Price'] = np.log(df1['Price'])  # 租金的对数转换
df1['log_面积'] = np.log(df1['面积'])    # 面积的对数转换
numeric_cols = df1.select_dtypes(include=['float64', 'int64']).columns.tolist()

# 2. VIF（方差膨胀因子）：筛选VIF<10的特征（VIF>10表示多重共线性严重）
def calculate_vif(df, features):
    vif_df = pd.DataFrame()
    vif_df['Feature'] = features
    vif_df['VIF'] = [
        variance_inflation_factor(df[features].values, i) 
        for i in range(len(features))
    ]
    return vif_df

numeric_features = df1.select_dtypes(include=['float64', 'int64']).columns.tolist()
vif_df = calculate_vif(df1, numeric_features)
vif_features = vif_df[vif_df['VIF'] < 10]['Feature'].tolist()
# 打印VIF分析结果
print("VIF分析结果：")
print(vif_df)
final_features = vif_features
print("最终筛选的特征：", final_features)


VIF分析结果：
      Feature           VIF
0          城市  8.889415e+00
1       Price  6.845030e+00
2          面积  2.659083e+01
3         lon  3.503129e+07
4         lat  3.399903e+06
5        房屋总数  2.915501e+00
6        楼栋总数  1.684154e+00
7     coord_x  3.503175e+07
8     coord_y  3.400464e+06
9     户均楼栋房屋数  2.657980e+00
10  log_Price  6.574477e+00
11     log_面积  2.354105e+02
最终筛选的特征： ['城市', 'Price', '房屋总数', '楼栋总数', '户均楼栋房屋数', 'log_Price']


In [46]:
#OLS
# 提取筛选后的特征 + 目标变量
df1_selected = df1[['面积','城市','户均楼栋房屋数','log_Price']]

X = df1_selected.drop(columns=['log_Price'])
y = df1_selected['log_Price']
X = X.fillna(X.median())

X_rent = df1_selected.drop(columns=['log_Price']) 
y_rent = df1_selected['log_Price']  

X_train_rent, X_test_rent, y_train_rent, y_test_rent = train_test_split(
    X_rent, y_rent,
    test_size=0.3, 
    random_state=111  
)
ols = LinearRegression()
ols.fit(X_train_rent, y_train_rent)
def evaluate_model(y_true, y_pred):
    """计算MAE和RMSE指标"""
    mae = mean_absolute_error(y_true, y_pred)
    rmse = np.sqrt(mean_squared_error(y_true, y_pred))
    return mae, rmse
# 样本内/外预测
y_train_pred_ols = ols.predict(X_train_rent)
y_test_pred_ols = ols.predict(X_test_rent)

y_train_pred_ols = np.exp(y_train_pred_ols)
y_test_pred_ols = np.exp(y_test_pred_ols)
y_train_rent = np.exp(y_train_rent)
y_test_rent = np.exp(y_test_rent)

ols_train_mae, ols_train_rmse = evaluate_model(y_train_rent, y_train_pred_ols)
ols_test_mae, ols_test_rmse = evaluate_model(y_test_rent, y_test_pred_ols)
y = np.exp(y)

cv_scores_ols = cross_val_score(ols, X, y, cv=6, scoring='neg_mean_absolute_error')
ols_cv_mae = -cv_scores_ols.mean()
# 打印结果
print("OLS样本内MAE：", ols_train_mae)
print("OLS样本外MAE：", ols_test_mae)
print("OLS交叉验证MAE：", ols_cv_mae)

OLS样本内MAE： 0.32350140452246473
OLS样本外MAE： 0.32340367228042216
OLS交叉验证MAE： 0.4465350925894815


In [47]:
#LASSO
df1_selected = df1[['面积','城市','户均楼栋房屋数','log_Price']]
X = df1_selected.drop(columns=['log_Price'])
y = df1_selected['log_Price']
X = X.fillna(X.median())
X_rent = df1_selected.drop(columns=['log_Price'])  
y_rent = df1_selected['log_Price']  
X_train_rent, X_test_rent, y_train_rent, y_test_rent = train_test_split(
    X_rent, y_rent,
    test_size=0.3,  
    random_state=111  
)
lasso = Lasso(random_state=111)
lasso_params = {'alpha': [0.001, 0.01, 0.1, 1, 10]}
lasso_grid = GridSearchCV(
    lasso, lasso_params, cv=6, scoring='neg_mean_absolute_error'
)
lasso_grid.fit(X_train_rent, y_train_rent)
best_lasso = lasso_grid.best_estimator_

y_train_pred_lasso = best_lasso.predict(X_train_rent)
y_test_pred_lasso = best_lasso.predict(X_test_rent)

y_train_pred_lasso = np.exp(y_train_pred_lasso)
y_test_pred_lasso = np.exp(y_test_pred_lasso)
y_train_rent = np.exp(y_train_rent)
y_test_rent = np.exp(y_test_rent)
lasso_train_mae, lasso_train_rmse = evaluate_model(y_train_rent, y_train_pred_lasso)
lasso_test_mae, lasso_test_rmse = evaluate_model(y_test_rent, y_test_pred_lasso)

y = np.exp(y)
cv_scores_lasso = cross_val_score(best_lasso, X, y, cv=6, scoring='neg_mean_absolute_error')
lasso_cv_mae = -cv_scores_lasso.mean()
# 打印最佳超参数
print("Lasso最佳超参数：", lasso_grid.best_params_)

lasso_results = pd.DataFrame({
    "指标类型": ["样本内MAE", "样本内RMSE", "样本外MAE", "样本外RMSE", "交叉验证MAE"],
    "数值": [
        lasso_train_mae,
        lasso_train_rmse,
        lasso_test_mae,
        lasso_test_rmse,
        lasso_cv_mae
    ]
})

print("\nLasso模型性能指标：")
print(lasso_results)


Lasso最佳超参数： {'alpha': 0.001}

Lasso模型性能指标：
      指标类型        数值
0   样本内MAE  0.323505
1  样本内RMSE  0.554931
2   样本外MAE  0.323406
3  样本外RMSE  0.547671
4  交叉验证MAE  0.446438


In [48]:
#ELASTIC
df1_selected = df1[['面积','城市','户均楼栋房屋数','log_Price']]
X = df1_selected.drop(columns=['log_Price'])
y = df1_selected['log_Price']
X = X.fillna(X.median())
X_rent = df1_selected.drop(columns=['log_Price'])  
y_rent = df1_selected['log_Price'] 
X_train_rent, X_test_rent, y_train_rent, y_test_rent = train_test_split(
    X_rent, y_rent,
    test_size=0.3,  
    random_state=111  
)
elastic = ElasticNet(random_state=111)
elastic_params = {
    'alpha': [0.001, 0.01, 0.1, 1, 10],
    'l1_ratio': [0.1, 0.3, 0.5, 0.7, 0.9]
}
elastic_grid = GridSearchCV(
    elastic, elastic_params, cv=6, scoring='neg_mean_absolute_error'
)
elastic_grid.fit(X_train_rent, y_train_rent)
best_elastic = elastic_grid.best_estimator_

y_train_pred_elastic = best_elastic.predict(X_train_rent)
y_test_pred_elastic = best_elastic.predict(X_test_rent)
y_train_pred_elastic = np.exp(y_train_pred_elastic)
y_test_pred_elastic = np.exp(y_test_pred_elastic)
y_train_rent = np.exp(y_train_rent)
y_test_rent = np.exp(y_test_rent)
elastic_train_mae, elastic_train_rmse = evaluate_model(y_train_rent, y_train_pred_elastic)
elastic_test_mae, elastic_test_rmse = evaluate_model(y_test_rent, y_test_pred_elastic)

y = np.exp(y)
cv_scores_elastic = cross_val_score(best_elastic, X, y, cv=6, scoring='neg_mean_absolute_error')
elastic_cv_mae = -cv_scores_elastic.mean()
# 打印最佳超参数
print("弹性网络最佳超参数：", elastic_grid.best_params_)

elastic_results = pd.DataFrame({
    "指标类型": ["样本内MAE", "样本内RMSE", "样本外MAE", "样本外RMSE", "交叉验证MAE"],
    "数值": [
        elastic_train_mae,
        elastic_train_rmse,
        elastic_test_mae,
        elastic_test_rmse,
        elastic_cv_mae
    ]
})

print("\n弹性网络模型性能指标：")
print(elastic_results)


弹性网络最佳超参数： {'alpha': 0.001, 'l1_ratio': 0.1}

弹性网络模型性能指标：
      指标类型        数值
0   样本内MAE  0.323502
1  样本内RMSE  0.554917
2   样本外MAE  0.323404
3  样本外RMSE  0.547656
4  交叉验证MAE  0.446521


In [49]:
#RIDGE
df1_selected = df1[['面积','城市','户均楼栋房屋数','log_Price']]
X = df1_selected.drop(columns=['log_Price'])
y = df1_selected['log_Price']
X = X.fillna(X.median())
X_rent = df1_selected.drop(columns=['log_Price'])  
y_rent = df1_selected['log_Price'] 
X_train_rent, X_test_rent, y_train_rent, y_test_rent = train_test_split(
    X_rent, y_rent,
    test_size=0.3,  
    random_state=111 
)
ridge = Ridge(random_state=111)
ridge_params = {'alpha': [0.001, 0.01, 0.1, 1, 10]}
ridge_grid = GridSearchCV(
    ridge, ridge_params, cv=6, scoring='neg_mean_absolute_error'
)
ridge_grid.fit(X_train_rent, y_train_rent)
best_ridge = ridge_grid.best_estimator_

y_train_pred_ridge = best_ridge.predict(X_train_rent)
y_test_pred_ridge = best_ridge.predict(X_test_rent)
y_train_pred_ridge = np.exp(y_train_pred_ridge)
y_test_pred_ridge = np.exp(y_test_pred_ridge)
y_train_rent = np.exp(y_train_rent)
y_test_rent = np.exp(y_test_rent)
ridge_train_mae, ridge_train_rmse = evaluate_model(y_train_rent, y_train_pred_ridge)
ridge_test_mae, ridge_test_rmse = evaluate_model(y_test_rent, y_test_pred_ridge)

y = np.exp(y)
cv_scores_ridge = cross_val_score(best_ridge, X, y, cv=6, scoring='neg_mean_absolute_error')
ridge_cv_mae = -cv_scores_ridge.mean()


In [50]:

# 结果字典
model_results = {
    'OLS': {
        'In_sample_MAE': ols_train_mae,
        'Out_of_sample_MAE': ols_test_mae,
        'In_sample_RMSE': ols_train_rmse,
        'Out_of_sample_RMSE': ols_test_rmse,
        'Cross_validation_MAE': ols_cv_mae
    },
    'Lasso': {
        'In_sample_MAE': lasso_train_mae,
        'Out_of_sample_MAE': lasso_test_mae,
        'In_sample_RMSE': lasso_train_rmse,
        'Out_of_sample_RMSE': lasso_test_rmse,
        'Cross_validation_MAE': lasso_cv_mae,
        'Best_params': lasso_grid.best_params_
    },
    'Ridge': {
        'In_sample_MAE': ridge_train_mae,
        'Out_of_sample_MAE': ridge_test_mae,
        'In_sample_RMSE': ridge_train_rmse,
        'Out_of_sample_RMSE': ridge_test_rmse,
        'Cross_validation_MAE': ridge_cv_mae,
        'Best_params': ridge_grid.best_params_
    },
    'ElasticNet': {
        'In_sample_MAE': elastic_train_mae,
        'Out_of_sample_MAE': elastic_test_mae,
        'In_sample_RMSE': elastic_train_rmse,
        'Out_of_sample_RMSE': elastic_test_rmse,
        'Cross_validation_MAE': elastic_cv_mae,
        'Best_params': elastic_grid.best_params_
    }
}

results_df = pd.DataFrame(model_results).T
print("模型性能评估结果：")
print(results_df)


模型性能评估结果：
           In_sample_MAE Out_of_sample_MAE In_sample_RMSE Out_of_sample_RMSE  \
OLS             0.323501          0.323404       0.554915           0.547654   
Lasso           0.323505          0.323406       0.554931           0.547671   
Ridge           0.323501          0.323404       0.554915           0.547654   
ElasticNet      0.323502          0.323404       0.554917           0.547656   

           Cross_validation_MAE                        Best_params  
OLS                    0.446535                                NaN  
Lasso                  0.446438                   {'alpha': 0.001}  
Ridge                  0.446535                   {'alpha': 0.001}  
ElasticNet             0.446521  {'alpha': 0.001, 'l1_ratio': 0.1}  


In [51]:
# 异常值检测（基于椭圆包络法，假设污染率5%）
outlier_detector = EllipticEnvelope(contamination=0.05)
outlier_labels = outlier_detector.fit_predict(X)
normal_indices = outlier_labels == 1  # 正常样本的索引

# 统计移除异常值后的样本数
X_normal = X[normal_indices]
y_normal = y[normal_indices]
print(f"移除异常值后的预测总数：{len(X_normal)}")

移除异常值后的预测总数：86027


In [52]:
#用训练好的模型去预测Price
#处理数据
data_test_rent =r"D:\人工智能\Python exam\ruc_Class25Q2_test_rent.csv"
df1 = pd.read_csv(data_test_rent, dtype=str)
#print("===== df1 原始数据基本信息 =====")
#print(df1.info())  # 查看列名、数据类型、缺失值情况
non_null_counts_df1 = df1.notnull().sum()
keep_cols_df1 = non_null_counts_df1[non_null_counts_df1 > 8000 ].index.tolist()
df1 = df1[keep_cols_df1]
#print("\n===== df1 筛选后数据基本信息 =====")
#print(df1.info())
numeric_cols = [ '面积', 'lon', 'lat', 'coord_x', 'coord_y','城市','房屋总数','楼栋总数']
for col in numeric_cols:
    if col == '面积':
        df1[col] = df1[col].str.replace('㎡', '').astype(float)
    elif col == '房屋总数':        
        df1[col] = df1[col].str.replace('户', '').astype(float)    
    elif col == '楼栋总数':        
        df1[col] = df1[col].str.replace('栋', '').astype(float)
    else:
        df1[col] = pd.to_numeric(df1[col], errors='coerce')
df1[numeric_cols] = df1[numeric_cols].fillna(df1[numeric_cols].mean())
df1['户均楼栋房屋数'] = df1['房屋总数'] / df1['楼栋总数']
df1[numeric_cols] = df1[numeric_cols].fillna(df1[numeric_cols].mean())
df1['面积'] = df1['面积'].replace(0, 0.1) 
df1['log_面积'] = np.log10(df1['面积']) 
'''
print("===== 数值列转换后信息 =====")
print(df1[numeric_cols].info())
print(df1[numeric_cols].head()) 

print("\n数据统计描述：")
print(df1.describe())  
print("\n前5行数据：")
print(df1.head()) 
'''
df1['log_Price'] = 0
df1_selected = df1[['面积','城市','log_Price','户均楼栋房屋数']] 

X = df1_selected.drop(columns=['log_Price']) 
y = df1_selected['log_Price']  
train_features = X.columns.tolist()  

missing_features = [f for f in train_features if f not in df1.columns]
if missing_features:
    raise ValueError(f"新数据缺少必要特征：{missing_features}")

new_X = df1[train_features].copy()
train_medians = X.median()  
new_X = new_X.fillna(train_medians)
new_X = new_X.replace([np.inf, -np.inf], train_medians.max())

#y_pred = best_elastic.predict(new_X) 
#y_pred = best_lasso.predict(new_X) 
y_pred = best_ridge.predict(new_X) 
#y_pred = ols.predict(new_X) 


### 步骤3：整理预测结果
new_data_with_pred = df1.copy()
new_data_with_pred['预测Price'] = np.exp(y_pred) * 10000000
#new_data_with_pred['预测Price'] = new_data_with_pred['预测Price'].apply(lambda x: 700000 if x < 300000 else x)

print("预测结果：")
print(new_data_with_pred[['预测Price'] + train_features])  

# 若需要保存到Excel
new_data_with_pred.to_csv("预测结果.csv", index=False)

预测结果：
           预测Price      面积  城市     户均楼栋房屋数
0     5.009367e+06   86.94   1   82.666667
1     3.746008e+06   72.60  10   53.733333
2     5.248551e+06   98.00   3  102.076923
3     5.600716e+06   98.97   0  140.000000
4     9.346431e+06  170.53   3   59.800000
...            ...     ...  ..         ...
9768  3.094342e+06   25.60   0  337.222222
9769  5.018242e+06   91.84   3   39.076923
9770  3.294479e+06   43.00   3  587.000000
9771  9.756412e+06  176.00   3   66.088554
9772  5.021241e+06   94.50   4   48.689655

[9773 rows x 4 columns]


In [53]:
#house price
df2['log_Price'] = np.log(df2['Price']) 
df2['log_建筑面积'] = np.log(df2['建筑面积'])  
numeric_cols = df2.select_dtypes(include=['float64', 'int64']).columns.tolist()
print(numeric_cols)

def calculate_vif(df, features):
    vif_df = pd.DataFrame()
    vif_df['Feature'] = features
    vif_df['VIF'] = [
        variance_inflation_factor(df[features].values, i) 
        for i in range(len(features))
    ]
    return vif_df

numeric_features = df2.select_dtypes(include=['float64', 'int64']).columns.tolist()
vif_df = calculate_vif(df2, numeric_features)
vif_features = vif_df[vif_df['VIF'] < 10]['Feature'].tolist()
print("VIF分析结果：")
print(vif_df)

final_features = vif_features
print("最终筛选的特征：", final_features)
df2_selected = df2[final_features + ['Price']]


['城市', '区域', 'Price', '建筑面积', 'lon', 'lat', '房屋总数', '楼栋总数', 'coord_x', 'coord_y', '户均楼栋房屋数', 'log_Price', 'log_建筑面积']
VIF分析结果：
      Feature           VIF
0          城市  5.907560e+00
1          区域  5.049513e+00
2       Price  7.117582e+00
3        建筑面积  4.041688e+01
4         lon  2.559693e+07
5         lat  2.988490e+06
6        房屋总数  2.796649e+00
7        楼栋总数  1.729097e+00
8     coord_x  2.560072e+07
9     coord_y  2.989345e+06
10    户均楼栋房屋数  2.713511e+00
11  log_Price  4.796068e+00
12   log_建筑面积  4.953987e+02
最终筛选的特征： ['城市', '区域', 'Price', '房屋总数', '楼栋总数', '户均楼栋房屋数', 'log_Price']


In [54]:
#OLS
df2_selected = df2[['城市','区域','建筑面积','户均楼栋房屋数','Price']]
X = df2_selected.drop(columns=['Price'])
y = df2_selected['Price']
X = X.fillna(X.median())
X_house = df2_selected.drop(columns=['Price'])  
y_house = df2_selected['Price']  

X_train_house, X_test_house, y_train_house, y_test_house = train_test_split(
    X_house, y_house,
    test_size=0.3,
    random_state=111
)
ols = LinearRegression()
ols.fit(X_train_house, y_train_house)
def evaluate_model(y_true, y_pred):
    """计算MAE和RMSE指标"""
    mae = mean_absolute_error(y_true, y_pred)
    rmse = np.sqrt(mean_squared_error(y_true, y_pred))
    return mae, rmse
y_train_pred_ols = ols.predict(X_train_house)
y_test_pred_ols = ols.predict(X_test_house)
ols_train_mae, ols_train_rmse = evaluate_model(y_train_house, y_train_pred_ols)
ols_test_mae, ols_test_rmse = evaluate_model(y_test_house, y_test_pred_ols)
cv_scores_ols = cross_val_score(ols, X, y, cv=6, scoring='neg_mean_absolute_error')
ols_cv_mae = -cv_scores_ols.mean()
print("OLS样本内MAE：", ols_train_mae)
print("OLS样本外MAE：", ols_test_mae)
print("OLS交叉验证MAE：", ols_cv_mae)

OLS样本内MAE： 1.4736276219932154
OLS样本外MAE： 1.4617386289426613
OLS交叉验证MAE： 1.9156902564450469


In [55]:
#LASSO
lasso = Lasso(random_state=111)
lasso_params = {'alpha': [0.001, 0.01, 0.1, 1, 10]}
lasso_grid = GridSearchCV(
    lasso, lasso_params, cv=6, scoring='neg_mean_absolute_error'
)
lasso_grid.fit(X_train_house, y_train_house)
best_lasso = lasso_grid.best_estimator_
y_train_pred_lasso = best_lasso.predict(X_train_house)
y_test_pred_lasso = best_lasso.predict(X_test_house)
lasso_train_mae, lasso_train_rmse = evaluate_model(y_train_house, y_train_pred_lasso)
lasso_test_mae, lasso_test_rmse = evaluate_model(y_test_house, y_test_pred_lasso)
cv_scores_lasso = cross_val_score(best_lasso, X, y, cv=6, scoring='neg_mean_absolute_error')
lasso_cv_mae = -cv_scores_lasso.mean()
print("Lasso最佳超参数：", lasso_grid.best_params_)

lasso_results = pd.DataFrame({
    "指标类型": ["样本内MAE", "样本内RMSE", "样本外MAE", "样本外RMSE", "交叉验证MAE"],
    "数值": [
        lasso_train_mae,
        lasso_train_rmse,
        lasso_test_mae,
        lasso_test_rmse,
        lasso_cv_mae
    ]
})
print("\nLasso模型性能指标：")
print(lasso_results)


Lasso最佳超参数： {'alpha': 0.1}

Lasso模型性能指标：
      指标类型        数值
0   样本内MAE  1.470816
1  样本内RMSE  2.187584
2   样本外MAE  1.458851
3  样本外RMSE  2.194934
4  交叉验证MAE  1.876846


In [56]:
#ELASTIC
df2_selected = df2[['城市','区域','建筑面积','户均楼栋房屋数','Price']]
X = df2_selected.drop(columns=['Price'])
y = df2_selected['Price']
X = X.fillna(X.median())
X_house = df2_selected.drop(columns=['Price']) 
y_house = df2_selected['Price']  
X_train_house, X_test_house, y_train_house, y_test_house = train_test_split(
    X_house, y_house,
    test_size=0.3,
    random_state=111
)
elastic = ElasticNet(random_state=111)
elastic_params = {
    'alpha': [0.001, 0.01, 0.1, 1, 10],
    'l1_ratio': [0.1, 0.3, 0.5, 0.7, 0.9]
}
elastic_grid = GridSearchCV(
    elastic, elastic_params, cv=6, scoring='neg_mean_absolute_error'
)
elastic_grid.fit(X_train_house, y_train_house)
best_elastic = elastic_grid.best_estimator_
y_train_pred_elastic = best_elastic.predict(X_train_house)
y_test_pred_elastic = best_elastic.predict(X_test_house)
elastic_train_mae, elastic_train_rmse = evaluate_model(y_train_house, y_train_pred_elastic)
elastic_test_mae, elastic_test_rmse = evaluate_model(y_test_house, y_test_pred_elastic)
cv_scores_elastic = cross_val_score(best_elastic, X, y, cv=6, scoring='neg_mean_absolute_error')
elastic_cv_mae = -cv_scores_elastic.mean()
print("弹性网络最佳超参数：", elastic_grid.best_params_)
elastic_results = pd.DataFrame({
    "指标类型": ["样本内MAE", "样本内RMSE", "样本外MAE", "样本外RMSE", "交叉验证MAE"],
    "数值": [
        elastic_train_mae,
        elastic_train_rmse,
        elastic_test_mae,
        elastic_test_rmse,
        elastic_cv_mae
    ]
})

print("\n弹性网络模型性能指标：")
print(elastic_results)


弹性网络最佳超参数： {'alpha': 1, 'l1_ratio': 0.5}

弹性网络模型性能指标：
      指标类型        数值
0   样本内MAE  1.465081
1  样本内RMSE  2.193523
2   样本外MAE  1.452573
3  样本外RMSE  2.199176
4  交叉验证MAE  1.713611


In [57]:
#RIDGE
ridge = Ridge(random_state=111)
ridge_params = {'alpha': [0.001, 0.01, 0.1, 1, 10]}
ridge_grid = GridSearchCV(
    ridge, ridge_params, cv=6, scoring='neg_mean_absolute_error'
)
ridge_grid.fit(X_train_house, y_train_house)
best_ridge = ridge_grid.best_estimator_

y_train_pred_ridge = best_ridge.predict(X_train_house)
y_test_pred_ridge = best_ridge.predict(X_test_house)
ridge_train_mae, ridge_train_rmse = evaluate_model(y_train_house, y_train_pred_ridge)
ridge_test_mae, ridge_test_rmse = evaluate_model(y_test_house, y_test_pred_ridge)

cv_scores_ridge = cross_val_score(best_ridge, X, y, cv=6, scoring='neg_mean_absolute_error')
ridge_cv_mae = -cv_scores_ridge.mean()


In [58]:
'''
# 结果字典
model_results = {
    'OLS': {
        'In_sample_MAE': ols_train_mae,
        'Out_of_sample_MAE': ols_test_mae,
        'In_sample_RMSE': ols_train_rmse,
        'Out_of_sample_RMSE': ols_test_rmse,
        'Cross_validation_MAE': ols_cv_mae
    },
    'Lasso': {
        'In_sample_MAE': lasso_train_mae,
        'Out_of_sample_MAE': lasso_test_mae,
        'In_sample_RMSE': lasso_train_rmse,
        'Out_of_sample_RMSE': lasso_test_rmse,
        'Cross_validation_MAE': lasso_cv_mae,
        'Best_params': lasso_grid.best_params_
    },
    'Ridge': {
        'In_sample_MAE': ridge_train_mae,
        'Out_of_sample_MAE': ridge_test_mae,
        'In_sample_RMSE': ridge_train_rmse,
        'Out_of_sample_RMSE': ridge_test_rmse,
        'Cross_validation_MAE': ridge_cv_mae,
        'Best_params': ridge_grid.best_params_
    },
    'ElasticNet': {
        'In_sample_MAE': elastic_train_mae,
        'Out_of_sample_MAE': elastic_test_mae,
        'In_sample_RMSE': elastic_train_rmse,
        'Out_of_sample_RMSE': elastic_test_rmse,
        'Cross_validation_MAE': elastic_cv_mae,
        'Best_params': elastic_grid.best_params_
    }
}

results_df = pd.DataFrame(model_results).T
print("模型性能评估结果：")
print(results_df)
'''

'\n# 结果字典\nmodel_results = {\n    \'OLS\': {\n        \'In_sample_MAE\': ols_train_mae,\n        \'Out_of_sample_MAE\': ols_test_mae,\n        \'In_sample_RMSE\': ols_train_rmse,\n        \'Out_of_sample_RMSE\': ols_test_rmse,\n        \'Cross_validation_MAE\': ols_cv_mae\n    },\n    \'Lasso\': {\n        \'In_sample_MAE\': lasso_train_mae,\n        \'Out_of_sample_MAE\': lasso_test_mae,\n        \'In_sample_RMSE\': lasso_train_rmse,\n        \'Out_of_sample_RMSE\': lasso_test_rmse,\n        \'Cross_validation_MAE\': lasso_cv_mae,\n        \'Best_params\': lasso_grid.best_params_\n    },\n    \'Ridge\': {\n        \'In_sample_MAE\': ridge_train_mae,\n        \'Out_of_sample_MAE\': ridge_test_mae,\n        \'In_sample_RMSE\': ridge_train_rmse,\n        \'Out_of_sample_RMSE\': ridge_test_rmse,\n        \'Cross_validation_MAE\': ridge_cv_mae,\n        \'Best_params\': ridge_grid.best_params_\n    },\n    \'ElasticNet\': {\n        \'In_sample_MAE\': elastic_train_mae,\n        \'Out_of_s

In [59]:
# 异常值检测
outlier_detector = EllipticEnvelope(contamination=0.05)
outlier_labels = outlier_detector.fit_predict(X)
normal_indices = outlier_labels == 1 
X_normal = X[normal_indices]
y_normal = y[normal_indices]
print(f"移除异常值后的预测总数：{len(X_normal)}")

移除异常值后的预测总数：81252


In [60]:
#用训练好的模型去预测Price
data_test_rent =r"D:\人工智能\Python exam\ruc_Class25Q2_test_price.csv"
df2 = pd.read_csv(data_test_rent, dtype=str)
#print("===== df2 原始数据基本信息 =====")
#print(df2.info())  
non_null_counts_df2 = df2.notnull().sum()
keep_cols_df2 = non_null_counts_df2[non_null_counts_df2 > 30000 ].index.tolist()
df2 = df2[keep_cols_df2]
#print("\n===== df2 筛选后数据基本信息 =====")
#print(df2.info())
numeric_cols = [ '建筑面积', 'lon', 'lat', 'coord_x', 'coord_y','城市','区域','房屋总数','楼栋总数']
for col in numeric_cols:
    if col == '建筑面积':
        df2[col] = df2[col].str.replace('㎡', '').astype(float)
    elif col == '房屋总数':        
        df2[col] = df2[col].str.replace('户', '').astype(float)        
    elif col == '楼栋总数':        
        df2[col] = df2[col].str.replace('栋', '').astype(float)
    else:
        df2[col] = pd.to_numeric(df2[col], errors='coerce')
df2[numeric_cols] = df2[numeric_cols].fillna(df2[numeric_cols].mean())
df2['户均楼栋房屋数'] = df2['房屋总数'] / df2['楼栋总数']
df2[numeric_cols] = df2[numeric_cols].fillna(df2[numeric_cols].mean())
df2['建筑面积'] = df2['建筑面积'].replace(0, 0.1)  
df2['log_建筑面积'] = np.log10(df2['建筑面积'])  
'''
print("===== 数值列转换后信息 =====")
print(df2[numeric_cols].info())
print(df2[numeric_cols].head()) 

print("\n数据统计描述：")
print(df2.describe())  
print("\n前5行数据：")
print(df2.head())  
'''
print("模型训练时的特征列：", X.columns.tolist())
df2['Price'] = 0
df2_selected = df2[['城市','区域','建筑面积','户均楼栋房屋数','Price']] 
X = df2_selected.drop(columns=['Price'])  
y = df2_selected['Price']  
train_features = X.columns.tolist() 

missing_features = [f for f in train_features if f not in df2.columns]
if missing_features:
    raise ValueError(f"新数据缺少必要特征：{missing_features}")

new_X = df2[train_features].copy()
train_medians = X.median() 
new_X = new_X.fillna(train_medians)
new_X = new_X.replace([np.inf, -np.inf], train_medians.max())

#y_pred = best_lasso.predict(new_X) 
#y_pred = best_elastic.predict(new_X) 
y_pred = best_ridge.predict(new_X) # 未缩放时
#y_pred = ols.predict(new_X)  # 缩放过时

new_data_with_pred = df2.copy()
new_data_with_pred['预测Price'] = y_pred * 1000000
#new_data_with_pred['预测Price'] = new_data_with_pred['预测Price'].apply(lambda x: 700000 if x < 500000 else x)

print("预测结果：")
print(new_data_with_pred[['预测Price'] + train_features])  
new_data_with_pred.to_csv("预测结果2.csv", index=False)

模型训练时的特征列： ['城市', '区域', '建筑面积', '户均楼栋房屋数']
预测结果：
            预测Price  城市     区域    建筑面积     户均楼栋房屋数
0      8.878317e+06   0  109.0  282.02  152.666667
1      2.255942e+06   0   28.0   88.42  150.652174
2      5.754067e+06   0  123.0  175.52   72.000000
3      3.165152e+06   0   65.0  106.13   35.555556
4      3.849665e+06   0  109.0  116.80   37.777778
...             ...  ..    ...     ...         ...
34012  2.891249e+06  11   87.0  132.00   73.666667
34013  9.586425e+05  11   84.0   69.30   29.937500
34014  1.703493e+06  11  106.0   88.10   53.555556
34015  1.700411e+06  11  106.0   88.00   53.555556
34016  1.577137e+06  11  106.0   84.00   53.555556

[34017 rows x 5 columns]


In [61]:
#合并
df1 = pd.read_csv(r"D:\人工智能\lecture-python-programming.notebooks-main\预测结果.csv")  
df2 = pd.read_csv(r"D:\人工智能\lecture-python-programming.notebooks-main\预测结果2.csv")  
extracted_df1 = df1[['ID', '预测Price']].copy()
extracted_df2 = df2[['ID', '预测Price']].copy()
combined_df = pd.concat([extracted_df2, extracted_df1], axis=0, ignore_index=True)
print("提取的两列数据：")
print(combined_df.head())  
combined_df.to_csv("合并预测结果.csv", index=False)  

提取的两列数据：
        ID       预测Price
0  1000000  8.878317e+06
1  1000001  2.255942e+06
2  1000002  5.754067e+06
3  1000003  3.165152e+06
4  1000004  3.849665e+06
