# Homework3-3 / Homework3-4 / Homework3-5
本 Notebook 完成数据研究（Data Research）与两轮建模（基础与增强），并输出图表与关键指标。

In [9]:
# === 加载与准备数据 ===
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
#中文字体
plt.rcParams['font.sans-serif'] = ['SimHei']
plt.rcParams['axes.unicode_minus'] = False

# 读取标准化数据（来自前一步清洗输出）
sales = pd.read_csv('data_standardized/standardized_sales_all.csv')
rent = pd.read_csv('data_standardized/standardized_rent_all.csv')

# 计算价格/租金单价（兜底补齐）
sales['price_per_m2'] = sales['unit_price_yuan_per_m2']
sales.loc[sales['price_per_m2'].isna(), 'price_per_m2'] = sales['total_price_yuan'] / sales['area_m2']
rent['rent_per_m2'] = rent['rent_per_m2']
rent.loc[rent['rent_per_m2'].isna(), 'rent_per_m2'] = rent['rent_month_yuan'] / rent['area_m2']

# 基本清洗：去除不合理或缺失
sales = sales[(sales['area_m2']>0) & (sales['price_per_m2']>0)].copy()
rent = rent[(rent['area_m2']>0) & (rent['rent_per_m2']>0)].copy()

print('Sales blocks:', sales['block'].unique())
print('Rent blocks:', rent['block'].unique())


Sales blocks: ['Wanliu' 'Suzhouqiao' 'Beitaipingzhuang' 'Shijicheng']
Rent blocks: ['Wanliu' 'Suzhouqiao' 'Beitaipingzhuang' 'Shijicheng']


In [10]:
# === Homework3-3: Data Research ===
# 数据描述函数
def describe_by_block(df, value_col):
    g = df.groupby('block')[value_col]
    desc = g.agg(['count','mean','median','std']).rename(columns={
        'count':'样本量','mean':'均值','median':'中位数','std':'标准差'
    })
    return desc

# 离群值检测（IQR 方法）
def outlier_share_by_block(df, value_col):
    shares = {}
    for b, sub in df.groupby('block'):
        x = sub[value_col].dropna()
        if x.empty:
            shares[b] = np.nan
            continue
        q1, q3 = np.percentile(x, [25, 75])
        iqr = q3 - q1
        lower, upper = q1 - 1.5*iqr, q3 + 1.5*iqr
        shares[b] = ((x < lower) | (x > upper)).mean()
    return pd.Series(shares, name=f'{value_col}_outlier_share')

sales_desc = describe_by_block(sales, 'price_per_m2')
rent_desc = describe_by_block(rent, 'rent_per_m2')
sales_out = outlier_share_by_block(sales, 'price_per_m2')
rent_out = outlier_share_by_block(rent, 'rent_per_m2')

print('各板块销售单价描述:')
print(sales_desc)
print('各板块租金单价描述:')
print(rent_desc)
print('各板块销售离群值占比:')
print(sales_out)
print('各板块租金离群值占比:')
print(rent_out)

# 价格-租金中位数比值（Figure A）
med_price = sales.groupby('block')['price_per_m2'].median()
med_rent = rent.groupby('block')['rent_per_m2'].median()
ratio_A = (med_price / med_rent).dropna().rename('median_price_to_rent')

os.makedirs('output', exist_ok=True)
ratio_A.to_csv('output/figureA_ratio_research.csv', header=True, encoding='utf-8-sig')

plt.figure(figsize=(8,4))
ratio_A.sort_values(ascending=False).plot(kind='bar', color='#4C78A8')
plt.axhline(200, color='red', linestyle='--', label='Global fair value ≈ 200')
plt.title('Figure A: 各板块价格-租金中位数比值（Data Research）')
plt.ylabel('Median Price-to-Rent Ratio')
plt.legend()
plt.tight_layout()
plt.savefig('output/Figure_A_median_price_to_rent_ratio.png', dpi=150)
plt.close()
print('Figure A saved to output/Figure_A_median_price_to_rent_ratio.png')


各板块销售单价描述:
                   样本量             均值       中位数           标准差
block                                                        
Beitaipingzhuang   194   86937.963918   89640.5  22907.273379
Shijicheng        1200  113945.005833  110303.0  18859.731515
Suzhouqiao         311   80176.752412   80723.0  13524.613300
Wanliu            1175  154800.230638  141983.0  44988.348296
各板块租金单价描述:
                   样本量          均值         中位数        标准差
block                                                    
Beitaipingzhuang   249  140.267889  132.075472  32.936136
Shijicheng        1200  120.343626  108.843537  33.344278
Suzhouqiao         305  136.070882  126.732673  40.587086
Wanliu            2400  178.127109  151.898734  76.339066
各板块销售离群值占比:
Beitaipingzhuang    0.103093
Shijicheng          0.019167
Suzhouqiao          0.022508
Wanliu              0.112340
Name: price_per_m2_outlier_share, dtype: float64
各板块租金离群值占比:
Beitaipingzhuang    0.068273
Shijicheng          0.028333
Suzhouqiao 

In [11]:
# === Homework3-4: Data Science Modeling（基础模型） ===
from sklearn.linear_model import LinearRegression

# 特征：面积 + 板块哑变量（以 block 近似 location_i）
X_sales = pd.get_dummies(sales[['area_m2','block']], columns=['block'], drop_first=True)
y_sales = sales['price_per_m2']
X_rent = pd.get_dummies(rent[['area_m2','block']], columns=['block'], drop_first=True)
y_rent = rent['rent_per_m2']

m1 = LinearRegression().fit(X_sales, y_sales)
m2 = LinearRegression().fit(X_rent, y_rent)

sales['pred_price_per_m2_m1'] = m1.predict(X_sales)
rent['pred_rent_per_m2_m2'] = m2.predict(X_rent)

r2_m1 = m1.score(X_sales, y_sales)
r2_m2 = m2.score(X_rent, y_rent)
print(f'Model 1 R² (price/m2): {r2_m1:.3f}')
print(f'Model 2 R² (rent/m2): {r2_m2:.3f}')

# 各板块预测中位数与比值（Figure B）
med_pred_price = sales.groupby('block')['pred_price_per_m2_m1'].median()
med_pred_rent = rent.groupby('block')['pred_rent_per_m2_m2'].median()
ratio_B = (med_pred_price / med_pred_rent).dropna().rename('median_price_to_rent_pred_basic')
ratio_B.to_csv('output/figureB_ratio_model_basic.csv', header=True, encoding='utf-8-sig')

plt.figure(figsize=(8,4))
ratio_B.sort_values(ascending=False).plot(kind='bar', color='#F58518')
plt.axhline(200, color='red', linestyle='--', label='Global fair value ≈ 200')
plt.title('Figure B: 各板块价格-租金中位数比值（Model 1/2 预测）')
plt.ylabel('Median Price-to-Rent Ratio (Predicted)')
plt.legend()
plt.tight_layout()
plt.savefig('output/Figure_B_median_price_to_rent_ratio_model_basic.png', dpi=150)
plt.close()
print('Figure B saved to output/Figure_B_median_price_to_rent_ratio_model_basic.png')


Model 1 R² (price/m2): 0.580
Model 2 R² (rent/m2): 0.269


Figure B saved to output/Figure_B_median_price_to_rent_ratio_model_basic.png


In [12]:
# === Homework3-5: Data Science Modeling Pro Max（增强模型） ===
# 非线性与交互：area_m2, area_m2^2, log(area_m2) 与 block 交互
def build_plus_features(df):
    df2 = df.copy()
    df2['log_area'] = np.log(df2['area_m2'])
    df2['area_sq'] = df2['area_m2']**2
    dummies = pd.get_dummies(df2['block'], prefix='block', drop_first=True)
    # 与面积交互
    inter_area = dummies.mul(df2['area_m2'], axis=0)
    inter_log = dummies.mul(df2['log_area'], axis=0)
    inter_sq = dummies.mul(df2['area_sq'], axis=0)
    X = pd.concat([df2[['area_m2','log_area','area_sq']], dummies, inter_area.add_prefix('ia_'), inter_log.add_prefix('il_'), inter_sq.add_prefix('is_')], axis=1)
    return X

X_sales_plus = build_plus_features(sales)
y_sales = sales['price_per_m2']
X_rent_plus = build_plus_features(rent)
y_rent = rent['rent_per_m2']

m1_plus = LinearRegression().fit(X_sales_plus, y_sales)
m2_plus = LinearRegression().fit(X_rent_plus, y_rent)

sales['pred_price_per_m2_m1_plus'] = m1_plus.predict(X_sales_plus)
rent['pred_rent_per_m2_m2_plus'] = m2_plus.predict(X_rent_plus)

r2_m1_plus = m1_plus.score(X_sales_plus, y_sales)
r2_m2_plus = m2_plus.score(X_rent_plus, y_rent)
print(f'Model 1+ R² (price/m2): {r2_m1_plus:.3f}')
print(f'Model 2+ R² (rent/m2): {r2_m2_plus:.3f}')
print('R² 对比: Model1 ->', round(r2_m1,3), 'vs Model1+ ->', round(r2_m1_plus,3))
print('R² 对比: Model2 ->', round(r2_m2,3), 'vs Model2+ ->', round(r2_m2_plus,3))

# 保存 R² 对比并绘制图表
r2_df = pd.DataFrame({
    'Model': ['Model 1 (price/m2)', 'Model 2 (rent/m2)', 'Model 1+ (price/m2)', 'Model 2+ (rent/m2)'],
    'R2': [r2_m1, r2_m2, r2_m1_plus, r2_m2_plus]
})
os.makedirs('output', exist_ok=True)
r2_df.to_csv('output/r2_compare_models.csv', index=False, encoding='utf-8-sig')

plt.figure(figsize=(7.5,4))
sns.barplot(x='Model', y='R2', data=r2_df, palette=['#4C78A8','#F58518','#54A24B','#E45756'])
plt.ylim(0, max(r2_df['R2'].max()*1.1, 1))
plt.title('R² Comparison: Basic vs Plus Models')
plt.ylabel('R²')
plt.xlabel('')
plt.xticks(rotation=20)
plt.tight_layout()
plt.savefig('output/Figure_R2_Comparison.png', dpi=150)
plt.close()
print('R² 对比表已保存: output/r2_compare_models.csv图已保存: output/Figure_R2_Comparison.png')

# 各板块预测中位数与比值（Figure C）
med_pred_price_plus = sales.groupby('block')['pred_price_per_m2_m1_plus'].median()
med_pred_rent_plus = rent.groupby('block')['pred_rent_per_m2_m2_plus'].median()
ratio_C = (med_pred_price_plus / med_pred_rent_plus).dropna().rename('median_price_to_rent_pred_plus')
ratio_C.to_csv('output/figureC_ratio_model_plus.csv', header=True, encoding='utf-8-sig')

plt.figure(figsize=(8,4))
ratio_C.sort_values(ascending=False).plot(kind='bar', color='#54A24B')
plt.axhline(200, color='red', linestyle='--', label='Global fair value ≈ 200')
plt.title('Figure C: 各板块价格-租金中位数比值（Model 1+/2+ 预测）')
plt.ylabel('Median Price-to-Rent Ratio (Predicted, Plus)')
plt.legend()
plt.tight_layout()
plt.savefig('output/Figure_C_median_price_to_rent_ratio_model_plus.png', dpi=150)
plt.close()
print('Figure C saved to output/Figure_C_median_price_to_rent_ratio_model_plus.png')

# 三种方法结果对比汇总
compare = pd.concat([ratio_A.rename('DataResearch'), ratio_B.rename('ModelBasic'), ratio_C.rename('ModelPlus')], axis=1)
compare.to_csv('output/ratio_compare_all_methods.csv', encoding='utf-8-sig')
print('三方法比值对比（按板块）:')
print(compare)

# 样本量提示：样本量过少的板块比值参考意义较弱
n_sales = sales.groupby('block').size().rename('n_sales')
n_rent = rent.groupby('block').size().rename('n_rent')
size_info = pd.concat([n_sales, n_rent], axis=1)
size_info.to_csv('output/sample_size_by_block.csv', encoding='utf-8-sig')
print('各板块样本量（销售/租房）：')
print(size_info)


Model 1+ R² (price/m2): 0.652
Model 2+ R² (rent/m2): 0.357
R² 对比: Model1 -> 0.58 vs Model1+ -> 0.652
R² 对比: Model2 -> 0.269 vs Model2+ -> 0.357


R² 对比表已保存: output/r2_compare_models.csv图已保存: output/Figure_R2_Comparison.png



Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect.

  sns.barplot(x='Model', y='R2', data=r2_df, palette=['#4C78A8','#F58518','#54A24B','#E45756'])
  plt.tight_layout()
  plt.savefig('output/Figure_R2_Comparison.png', dpi=150)


Figure C saved to output/Figure_C_median_price_to_rent_ratio_model_plus.png
三方法比值对比（按板块）:
                  DataResearch  ModelBasic   ModelPlus
block                                                 
Beitaipingzhuang    678.706643  582.256654  634.953485
Shijicheng         1013.408812  932.241572  968.005291
Suzhouqiao          636.954922  559.970489  619.909088
Wanliu              934.721417  868.059941  824.572165
各板块样本量（销售/租房）：
                  n_sales  n_rent
block                            
Beitaipingzhuang      194     249
Shijicheng           1200    1200
Suzhouqiao            311     305
Wanliu               1175    2400
