In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import OneHotEncoder, PolynomialFeatures
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import r2_score
import os


# Make sure the output directory exists before anything is written to it.
_results_dir_present = os.path.exists('results')
if not _results_dir_present:
    os.makedirs('results')
    print(" 'results' folder created for outputs")

# Data loading and validation
def _standardize_columns(df, candidate_mappings, value_col, label):
    """Rename *df* to the canonical schema and clean its columns.

    Args:
        df: Raw frame as read from CSV.
        candidate_mappings: List of ``{source_name: canonical_name}`` dicts;
            the first one whose source names are all present in *df* is used.
        value_col: Canonical name of the target column
            (``'price_per_sqm'`` or ``'rent_per_sqm'``).
        label: Short dataset name used in messages.

    Returns:
        A new frame with columns ``Block``, ``area``, *value_col* and
        ``area_squared``; rows whose ``area`` or *value_col* is missing or
        non-numeric are removed.

    Raises:
        ValueError: If none of the candidate mappings matches *df*.
    """
    mapping = next(
        (m for m in candidate_mappings if all(col in df.columns for col in m)),
        None,
    )
    if mapping is None:
        expected = [list(m) for m in candidate_mappings]
        raise ValueError(
            f"{label} data is missing required columns; expected one of "
            f"{expected}, found {list(df.columns)}"
        )

    df = df.rename(columns=mapping)[['Block', 'area', value_col]].copy()

    # Normalise block labels so that stray whitespace does not split a block.
    df['Block'] = df['Block'].astype(str).str.strip()

    # Coerce BEFORE deriving features: squaring a text-typed column raises.
    numeric_cols = ['area', value_col]
    for col in numeric_cols:
        df[col] = pd.to_numeric(df[col], errors='coerce')

    # Rows that failed coercion would make the regression fit fail on NaN.
    valid = df[numeric_cols].notna().all(axis=1)
    dropped = int((~valid).sum())
    if dropped:
        print(f"- Dropped {dropped} {label} rows with missing or non-numeric values")
        df = df[valid].reset_index(drop=True)

    # Non-linear feature: squared area.
    df['area_squared'] = df['area'] ** 2
    return df


def load_data():
    """Load and clean the housing-price and rent-price CSV files.

    Reads ``Housing_Price.csv`` and ``Rent_Price.csv`` from the current
    working directory, maps their columns to the canonical names, coerces the
    numeric columns, drops rows with invalid numbers and adds ``area_squared``.

    Returns:
        ``(housing_df, rent_df)`` with columns
        ``Block, area, price_per_sqm, area_squared`` and
        ``Block, area, rent_per_sqm, area_squared`` respectively.

    Raises:
        FileNotFoundError: If either CSV file is missing.
        ValueError: If a file lacks the required columns.
    """
    possible_housing_mappings = [
        {'Block': 'Block', '面积': 'area', '单价': 'price_per_sqm'},
    ]
    possible_rent_mappings = [
        {'Block': 'Block', '面积': 'area', '每平米月租': 'rent_per_sqm'},
    ]

    try:
        print(" Loading data files...")
        housing_df = pd.read_csv('Housing_Price.csv')
        rent_df = pd.read_csv('Rent_Price.csv')

        housing_df = _standardize_columns(
            housing_df, possible_housing_mappings, 'price_per_sqm', 'housing')
        rent_df = _standardize_columns(
            rent_df, possible_rent_mappings, 'rent_per_sqm', 'rent')

        # Report the distinct blocks found in the housing data.
        unique_blocks = sorted(housing_df['Block'].unique())
        print(f"- Unique blocks found: {unique_blocks} ({len(unique_blocks)} types)")

        return housing_df, rent_df

    except FileNotFoundError as e:
        print(f" File not found: {str(e)}")
        print("Please ensure 'Housing_Price.csv' and 'Rent_Price.csv' are in the same directory")
        raise
    except Exception as e:
        print(f" Data processing error: {str(e)}")
        raise

# Train the regression models (baseline and enhanced variants)
def _build_model(enhanced):
    """Return a fresh, unfitted pipeline: one-hot Block + area -> OLS.

    A new ColumnTransformer is built on every call. Pipeline.fit fits its
    steps in place, so sharing one preprocessor between two pipelines would
    let the later fit overwrite the encoder state used by the earlier model.

    Args:
        enhanced: When true, ``area`` is expanded with
            ``PolynomialFeatures(degree=2)`` (area and area squared);
            otherwise it is passed through unchanged.
    """
    if enhanced:
        numeric_step = PolynomialFeatures(degree=2, include_bias=False, interaction_only=False)
    else:
        numeric_step = 'passthrough'

    preprocessor = ColumnTransformer(
        transformers=[
            ('categorical', OneHotEncoder(drop='first', sparse_output=False), ['Block']),
            ('numerical', numeric_step, ['area']),
        ]
    )
    return Pipeline(steps=[
        ('preprocessor', preprocessor),
        ('regressor', LinearRegression()),
    ])


def _fit_and_score(model, X, y):
    """Fit *model* on (X, y) and return it with its in-sample R²."""
    model.fit(X, y)
    return model, r2_score(y, model.predict(X))


def train_regression_models(housing_df, rent_df):
    """Fit baseline and enhanced linear models for price and rent.

    Each model regresses the per-square-metre value on ``Block`` (one-hot,
    first level dropped) and ``area``; the enhanced variants additionally
    include the squared area.

    Args:
        housing_df: Frame with ``Block``, ``area`` and ``price_per_sqm``.
        rent_df: Frame with ``Block``, ``area`` and ``rent_per_sqm``.

    Returns:
        ``(model_price, model_price_plus, model_rent, model_rent_plus,
        r2_price, r2_price_plus, r2_rent, r2_rent_plus)``; the R² values are
        computed on the training data.

    Raises:
        ValueError: If either frame has no rows.
    """
    print("\n===== Training Regression Models =====")

    # Sanity-check the sample sizes.
    if len(housing_df) < 5:
        print(f"Warning: Small housing dataset ({len(housing_df)} samples)")
    if len(rent_df) < 5:
        print(f"Warning: Small rent dataset ({len(rent_df)} samples)")

    # ----------------------
    # Model 1: baseline price model
    # ----------------------
    X1 = housing_df[['Block', 'area']]
    y1 = housing_df['price_per_sqm']

    if X1.shape[0] == 0:
        raise ValueError("No features available for price model training")

    model_price, r2_price = _fit_and_score(_build_model(enhanced=False), X1, y1)
    print(f" Model 1 (Original Price) trained. R²: {r2_price:.4f}")

    # ----------------------
    # Model 1+: enhanced price model (adds squared area)
    # ----------------------
    model_price_plus, r2_price_plus = _fit_and_score(_build_model(enhanced=True), X1, y1)
    print(f" Model 1+ (Enhanced Price) trained. R²: {r2_price_plus:.4f}")

    # ----------------------
    # Model 2: baseline rent model
    # ----------------------
    X2 = rent_df[['Block', 'area']]
    y2 = rent_df['rent_per_sqm']

    if X2.shape[0] == 0:
        raise ValueError("No features available for rent model training")

    model_rent, r2_rent = _fit_and_score(_build_model(enhanced=False), X2, y2)
    print(f" Model 2 (Original Rent) trained. R²: {r2_rent:.4f}")

    # ----------------------
    # Model 2+: enhanced rent model (adds squared area)
    # ----------------------
    model_rent_plus, r2_rent_plus = _fit_and_score(_build_model(enhanced=True), X2, y2)
    print(f" Model 2+ (Enhanced Rent) trained. R²: {r2_rent_plus:.4f}")

    # Side-by-side comparison.
    print("\n===== Model R² Comparison =====")
    print(f"Price models: Original {r2_price:.4f} vs Enhanced {r2_price_plus:.4f}")
    print(f"Rent models: Original {r2_rent:.4f} vs Enhanced {r2_rent_plus:.4f}")

    # NOTE(review): these are in-sample R² values of nested least-squares
    # models, so the enhanced score cannot be meaningfully lower; treat the
    # messages below as descriptive and confirm with held-out data.
    if r2_price_plus > r2_price:
        print("\nEnhanced price model performs better - indicates non-linear relationships in data")
    else:
        print("\nOriginal price model performs better - suggests linear relationships are sufficient")

    if r2_rent_plus > r2_rent:
        print("Enhanced rent model performs better - indicates non-linear relationships in data")
    else:
        print("Original rent model performs better - suggests linear relationships are sufficient")

    return model_price, model_price_plus, model_rent, model_rent_plus, r2_price, r2_price_plus, r2_rent, r2_rent_plus

# Predictions and price-to-rent ratio calculation
def predict_and_calculate_ratio(housing_df, rent_df, model_price, model_price_plus, model_rent, model_rent_plus):
    """Predict prices and rents and derive price-to-rent ratios per block.

    Prediction columns are written onto ``housing_df`` and ``rent_df`` in
    place. Within each block the housing and rent rows are paired by
    position, truncated to the shorter of the two.

    Args:
        housing_df: Frame with ``Block``, ``area`` and ``price_per_sqm``.
        rent_df: Frame with ``Block``, ``area`` and ``rent_per_sqm``.
        model_price, model_price_plus: Fitted price predictors.
        model_rent, model_rent_plus: Fitted rent predictors.

    Returns:
        ``(combined_df, merged_original)``: the model-based ratios, and the
        ratios of observed price to observed rent for every housing/rent row
        pair that shares a block.

    Raises:
        ValueError: If no block has rows in both datasets, or no paired row
            has strictly positive predicted rent from both rent models.
    """
    print("\n===== Predictions & Price-to-Rent Ratio Calculation =====")

    feature_cols = ['Block', 'area']

    # Baseline model predictions.
    housing_df['predicted_price_original'] = model_price.predict(housing_df[feature_cols])
    rent_df['predicted_rent_original'] = model_rent.predict(rent_df[feature_cols])

    # Enhanced model predictions.
    housing_df['predicted_price_enhanced'] = model_price_plus.predict(housing_df[feature_cols])
    rent_df['predicted_rent_enhanced'] = model_rent_plus.predict(rent_df[feature_cols])

    # Observed (non-predicted) ratios: inner join on Block.
    observed = housing_df[['Block', 'price_per_sqm']].merge(
        rent_df[['Block', 'rent_per_sqm']],
        on='Block',
        suffixes=('_price', '_rent'),
    )
    observed = observed.loc[observed['rent_per_sqm'] > 0]
    observed = observed.assign(
        price_rent_ratio_actual=(observed['price_per_sqm'] / observed['rent_per_sqm']).round(1)
    )

    # Pair predictions block by block.
    price_cols = ('predicted_price_original', 'predicted_price_enhanced')
    rent_cols = ('predicted_rent_original', 'predicted_rent_enhanced')
    per_block = []
    for block_name in housing_df['Block'].unique():
        sale_rows = housing_df.loc[housing_df['Block'] == block_name]
        lease_rows = rent_df.loc[rent_df['Block'] == block_name]

        n_pairs = min(len(sale_rows), len(lease_rows))
        if n_pairs == 0:
            continue

        columns = {'Block': block_name}
        # Interleave price/rent so the column order is
        # price_original, rent_original, price_enhanced, rent_enhanced.
        for price_col, rent_col in zip(price_cols, rent_cols):
            columns[price_col] = sale_rows[price_col].head(n_pairs).to_numpy()
            columns[rent_col] = lease_rows[rent_col].head(n_pairs).to_numpy()
        per_block.append(pd.DataFrame(columns))

    if len(per_block) == 0:
        raise ValueError("No matching data between housing and rent datasets")

    paired = pd.concat(per_block, ignore_index=True)

    # Keep only rows where both rent predictions are strictly positive.
    rent_is_positive = paired['predicted_rent_original'].gt(0) & paired['predicted_rent_enhanced'].gt(0)
    paired = paired.loc[rent_is_positive]

    if paired.empty:
        raise ValueError("No valid data after filtering for rent > 0")

    # Ratios for both model variants.
    paired = paired.assign(
        price_rent_ratio_original=(paired['predicted_price_original'] / paired['predicted_rent_original']).round(1),
        price_rent_ratio_enhanced=(paired['predicted_price_enhanced'] / paired['predicted_rent_enhanced']).round(1),
    )

    print(f" Calculated ratios for {len(paired)} data points")
    return paired, observed

# Plotting
def plot_figures(combined_df, merged_original):
    """Plot the per-block median price-to-rent ratio of the enhanced model.

    Saves the bar chart to ``results/figureC_enhanced_ratio.png`` and returns
    a per-block comparison of the median ratios from the baseline model, the
    enhanced model and the observed data.

    Args:
        combined_df: Frame with ``Block``, ``price_rent_ratio_original`` and
            ``price_rent_ratio_enhanced``.
        merged_original: Frame with ``Block`` and ``price_rent_ratio_actual``.

    Returns:
        Frame with columns ``Block``, ``Original Model``, ``Enhanced Model``
        and ``Actual Data``.
    """
    print("\n===== Generating Figures =====")

    def block_median(frame, ratio_col):
        # Median of one ratio column per block, as a two-column frame.
        return frame.groupby('Block')[ratio_col].median().reset_index()

    median_ratios_original = block_median(combined_df, 'price_rent_ratio_original')
    median_ratios_enhanced = block_median(combined_df, 'price_rent_ratio_enhanced')
    median_ratios_actual = block_median(merged_original, 'price_rent_ratio_actual')

    # Side-by-side table of the three methods; blocks without observed
    # ratios are kept (left join).
    comparison_df = (
        median_ratios_original
        .merge(median_ratios_enhanced, on='Block')
        .merge(median_ratios_actual, on='Block', how='left')
    )
    comparison_df.columns = ['Block', 'Original Model', 'Enhanced Model', 'Actual Data']

    # Figure C: ratios from the enhanced model.
    plt.figure(figsize=(10, 6))
    ax = sns.barplot(
        data=median_ratios_enhanced,
        x='Block',
        y='price_rent_ratio_enhanced',
        hue='Block',
        palette='Set3',
        legend=False,
    )

    # Reference line drawn at a ratio of 200 months.
    ax.axhline(y=200, color='red', linestyle='--', linewidth=2, label='Global Fair Value (200 months)')
    ax.set_title('Median Price-to-Rent Ratio by Block (Enhanced Model)', fontsize=16)
    ax.set_xlabel('Block', fontsize=14)
    ax.set_ylabel('Price-to-Rent Ratio (Months)', fontsize=14)
    plt.setp(ax.get_xticklabels(), rotation=45)
    ax.legend()

    # Value label just above each bar.
    for bar in ax.patches:
        height = bar.get_height()
        ax.annotate(
            f'{height:.1f}',
            (bar.get_x() + bar.get_width()/2, height),
            ha='center',
            va='bottom',
            xytext=(0, 5),
            textcoords='offset points',
            fontsize=11,
        )

    plt.tight_layout()
    plt.savefig('results/figureC_enhanced_ratio.png', dpi=300, bbox_inches='tight')
    plt.close()

    print(" Figure C (Enhanced Model) saved to 'results/figureC_enhanced_ratio.png'")

    return comparison_df

# Entry point
def main():
    """Run the pipeline: load data, train models, compute ratios, plot.

    Any exception raised by a step is reported on stdout and not re-raised.
    """
    print("===== Regression Models & Price-to-Rent Ratio Analysis =====")
    try:
        # Step 1: load the data.
        housing_data, rent_data = load_data()

        # Step 2: train the baseline and enhanced models; the first four
        # returned items are the fitted models, the rest are R² scores.
        trained = train_regression_models(housing_data, rent_data)
        models, _scores = trained[:4], trained[4:]

        # Step 3: predict and compute the ratios.
        combined_df, merged_original = predict_and_calculate_ratio(
            housing_data, rent_data, *models
        )

        # Step 4: generate the figure.
        plot_figures(combined_df, merged_original)

    except Exception as e:
        print(f"\n Analysis failed: {e!s}")
        print("Please check your data files and try again")


if __name__ == "__main__":
    main()


✅ 'results' folder created for outputs
===== Regression Models & Price-to-Rent Ratio Analysis =====
📂 Loading data files...
- Unique blocks found: ['dachang', 'majuqiao', 'yanjiao', 'yizhuang'] (4 types)

===== Training Regression Models =====
 Model 1 (Original Price) trained. R²: 0.7884
 Model 1+ (Enhanced Price) trained. R²: 0.7898
 Model 2 (Original Rent) trained. R²: 0.6839
 Model 2+ (Enhanced Rent) trained. R²: 0.6901

===== Model R² Comparison =====
Price models: Original 0.7884 vs Enhanced 0.7898
Rent models: Original 0.6839 vs Enhanced 0.6901

Enhanced price model performs better - indicates non-linear relationships in data
Enhanced rent model performs better - indicates non-linear relationships in data

===== Predictions & Price-to-Rent Ratio Calculation =====
 Calculated ratios for 4311 data points

===== Generating Figures =====
 Figure B (Comparison) saved to 'results/figureB_comparison.png'
 Figure C (Enhanced Model) saved to 'results/figureC_enhanced_ratio.png'

===== Sa