In [12]:
import pandas as pd
import numpy as np

def clean_xiahuayuan_data(df):
    """Strip unit text from the Xiahuayuan resale listings and return numbers.

    Args:
        df: DataFrame containing the columns '房屋面积' (area, '㎡' is
            removed), '总价' (total price, '万' is removed) and '单价'
            (unit price, the first number found in the text is used).

    Returns:
        A new DataFrame with the numeric columns '面积', '总价' and '单价'
        (in that order) on the same index as ``df``. Values that cannot be
        parsed become NaN. ``df`` itself is not modified.

    Raises:
        KeyError: if one of the required input columns is missing.
    """
    def _strip_unit(column, unit):
        # regex=False keeps the unit a literal string on every pandas version.
        text = column.astype(str).str.replace(unit, '', regex=False)
        return pd.to_numeric(text.str.strip(), errors='coerce')

    # Drop thousands separators first, then capture the first number
    # including an optional decimal part. A bare r'(\d+)' would turn
    # '9,208.3元/㎡' into 9 and '3061.5' into 3061.
    unit_price_text = (
        df['单价']
        .astype(str)
        .str.replace(',', '', regex=False)
        .str.extract(r'(\d+(?:\.\d+)?)', expand=False)
    )

    return pd.DataFrame({
        '面积': _strip_unit(df['房屋面积'], '㎡'),
        '总价': _strip_unit(df['总价'], '万'),
        '单价': pd.to_numeric(unit_price_text, errors='coerce'),
    })

def clean_zu_xiahuayuan_data(df):
    """Turn the Xiahuayuan rental listings into plain numeric columns.

    Reads '面积' (with '㎡' removed) and '月租金' (with '元/月' removed) from
    ``df`` and returns a new DataFrame holding the numeric columns '面积'
    and '租金', in that order. Unparseable values become NaN; ``df`` is
    left untouched.
    """
    def _numeric_without(column, unit):
        # Remove the unit text, trim whitespace, then coerce to a number.
        stripped = column.astype(str).str.replace(unit, '').str.strip()
        return pd.to_numeric(stripped, errors='coerce')

    return pd.DataFrame({
        '面积': _numeric_without(df['面积'], '㎡'),
        '租金': _numeric_without(df['月租金'], '元/月'),
    })

# Load the resale ("二手房") listings of every district.
print("正在读取二手房数据...")

# Header mapping applied to the Huailai and Qiaoxi frames when merging.
_esf_header_map = {'面积(㎡)': '面积', '总价(万)': '总价', '均价(元/㎡)': '单价'}

esf_huailai = pd.read_excel('hebei_esf_huailai_data.xlsx')
esf_qiaoxi = pd.read_excel('hebei_esf_qiaoxi_data.xlsx')

# The Xiahuayuan sheet goes through the unit-stripping cleaner first.
esf_xiahuayuan = pd.read_excel(
    'hebei_esf_xiahuayuan_data.xlsx',
    sheet_name='张家口下花园二手房',
)
esf_xiahuayuan_cleaned = clean_xiahuayuan_data(esf_xiahuayuan)

# NOTE(review): 'esf' -> unit price and 'esf_m2' -> total price looks like it
# could be swapped; confirm against the workbook before relying on it.
esf_zhangbei = pd.read_excel('二手房数据_Edge.xlsx').rename(
    columns={'esf': '单价', 'm2': '面积', 'esf_m2': '总价'}
)

# Add one 0/1 indicator column per district; only the frame's own district is 1.
for _frame, _home in (
    (esf_huailai, '怀来'),
    (esf_qiaoxi, '桥西'),
    (esf_xiahuayuan_cleaned, '下花园'),
    (esf_zhangbei, '张北'),
):
    for _district in ('桥西', '下花园', '怀来', '张北'):
        _frame[_district] = int(_district == _home)

# Stack the four districts into a single table with a fresh index.
esf_combined = pd.concat(
    [
        esf_huailai.rename(columns=_esf_header_map),
        esf_qiaoxi.rename(columns=_esf_header_map),
        esf_xiahuayuan_cleaned,
        esf_zhangbei,
    ],
    ignore_index=True,
)

# Load the rental ("租房") listings of every district.
print("正在读取租房数据...")

# Header mapping applied to the Huailai and Qiaoxi frames when merging.
_zu_header_map = {'面积(㎡)': '面积', '租金(元/月)': '租金'}

zu_huailai = pd.read_excel('hebei_zu_huailai_data.xlsx')
zu_qiaoxi = pd.read_excel('hebei_zu_qiaoxi_data.xlsx')

# The Xiahuayuan sheet goes through the unit-stripping cleaner first.
# NOTE(review): the file name is spelled 'xiahuyuan' here (unlike the resale
# file); left unchanged because it has to match the file on disk.
zu_xiahuayuan = pd.read_excel(
    'hebei_zu_xiahuyuan_data.xlsx',
    sheet_name='租房信息',
)
zu_xiahuayuan_cleaned = clean_zu_xiahuayuan_data(zu_xiahuayuan)

zu_zhangbei = pd.read_excel('租房数据.xlsx').rename(
    columns={'rent': '租金', 'm2': '面积'}
)

# Add one 0/1 indicator column per district; only the frame's own district is 1.
for _frame, _home in (
    (zu_huailai, '怀来'),
    (zu_qiaoxi, '桥西'),
    (zu_xiahuayuan_cleaned, '下花园'),
    (zu_zhangbei, '张北'),
):
    for _district in ('桥西', '下花园', '怀来', '张北'):
        _frame[_district] = int(_district == _home)

# Stack the four districts into a single table with a fresh index.
zu_combined = pd.concat(
    [
        zu_huailai.rename(columns=_zu_header_map),
        zu_qiaoxi.rename(columns=_zu_header_map),
        zu_xiahuayuan_cleaned,
        zu_zhangbei,
    ],
    ignore_index=True,
)

# Write both merged tables to Excel, without the index column.
print("正在保存结果...")
for _table, _target in (
    (esf_combined, 'combined_esf_data.xlsx'),
    (zu_combined, 'combined_zu_data.xlsx'),
):
    _table.to_excel(_target, index=False)

# Report the row counts of the merged tables.
print("数据融合完成！")
print(f"二手房数据总行数: {len(esf_combined)}")
print(f"租房数据总行数: {len(zu_combined)}")

# Show the first rows of each table; heading and frame go on separate lines.
print("\n二手房数据预览:", esf_combined.head(), sep="\n")
print("\n租房数据预览:", zu_combined.head(), sep="\n")

正在读取二手房数据...
正在读取租房数据...
正在保存结果...
数据融合完成！
二手房数据总行数: 3010
租房数据总行数: 1147

二手房数据预览:
      面积     总价     单价  桥西  下花园  怀来  张北
0   49.0   15.0   3061   0    0   1   0
1  110.0  110.0  10000   0    0   1   0
2  118.0   45.0   3813   0    0   1   0
3   78.0   55.0   7051   0    0   1   0
4    9.0  150.0   9560   0    0   1   0

租房数据预览:
    面积    租金  桥西  下花园  怀来  张北
0   40   950   0    0   1   0
1  104  1000   0    0   1   0
2   87  1200   0    0   1   0
3   78  2000   0    0   1   0
4  175  3000   0    0   1   0
