In [1]:
import pandas as pd
import os

# Directory that holds the raw per-block listing exports.
_RAW_DIR = 'output'


def _load_and_report(filename, reader):
    """Read one raw listing file from ``_RAW_DIR`` and print its column index.

    ``reader`` is the pandas loader matching the file type (``pd.read_csv`` or
    ``pd.read_excel``). The path is built with ``os.path.join`` instead of a
    hard-coded backslash so the notebook also runs on non-Windows systems;
    on Windows the resulting path is the same string as before.
    """
    frame = reader(os.path.join(_RAW_DIR, filename))
    # Printing the columns exposes each file's schema, which differs per block.
    print(frame.columns)
    return frame


data_wanliu_sales = _load_and_report('beijing_haidian_wanliu_sales.csv', pd.read_csv)
data_wanliu_rent = _load_and_report('beijing_haidian_wanliu_rent.csv', pd.read_csv)
data_suzhouqiao_sales = _load_and_report('Beijing_Suzhouqiao_Sales.csv', pd.read_csv)
data_suzhouqiao_rent = _load_and_report('Beijing_Suzhouqiao_Rent.csv', pd.read_csv)
data_beitaipingzhuang_sales = _load_and_report('Beijing_Beitaipingzhuang_Sales.csv', pd.read_csv)
data_beitaipingzhuang_rent = _load_and_report('Beijing_Beitaipingzhuang_Rent.csv', pd.read_csv)
data_shijicheng_sales = _load_and_report('Beijing_Shijicheng_Sales.xlsx', pd.read_excel)
data_shijicheng_rent = _load_and_report('Beijing_Shijicheng_Rent.xlsx', pd.read_excel)


Index(['source', 'title', 'total_price_yuan', 'unit_price_yuan_per_m2',
       'area_m2', 'rooms', 'location', 'city', 'district', 'block', 'page'],
      dtype='object')
Index(['source', 'title', 'rent_month_yuan', 'area_m2', 'location', 'city',
       'district', 'block', 'page'],
      dtype='object')
Index(['面积(㎡)', '总价', '单价(元/㎡)'], dtype='object')
Index(['面积(㎡)', '总价(元/月)'], dtype='object')
Index(['total_price', 'unit_price', 'area'], dtype='object')
Index(['total_rent_price', 'area'], dtype='object')
Index(['标题', '面积(㎡)', '总价(万)', '单价(元/㎡)', '页码'], dtype='object')
Index(['面积(㎡)', '月租(元/月)', '页码', '租金单价(元/㎡/月)'], dtype='object')


In [None]:
# 统一列名并只保留需要用的列，生成标准化输出
import os, re

def to_num(x):
    """Parse a price-like value into a float.

    Args:
        x: ``None``, a Python ``int``/``float``, or any object whose ``str()``
            contains a number (e.g. ``'1,234.5'``, ``'350万'``, ``'2亿'``).

    Returns:
        ``None`` when ``x`` is ``None`` or its text contains no digits.
        ``float(x)`` for ``int``/``float`` input (NaN passes through as NaN).
        Otherwise the first number found in the text, multiplied by
        100,000,000 if the text contains ``亿`` or by 10,000 if it contains
        ``万``. Only the first number is parsed, so a compound such as
        ``'1亿2000万'`` yields 1e8.
    """
    if x is None:
        return None
    if isinstance(x, (int, float)):
        return float(x)
    # Drop ASCII thousands separators before searching for the number.
    s = str(x).replace(',', '').strip()
    # A minus sign counts as a sign only when it does not directly follow a
    # word character or a dot, so range/date separators ('100-200') are not
    # mistaken for a negative number.
    m = re.search(r'((?:(?<![\w.])-)?[0-9]+(?:\.[0-9]+)?)', s)
    if not m:
        return None
    v = float(m.group(1))
    if '亿' in s:
        v *= 100000000
    elif '万' in s:
        v *= 10000
    return v

def unify_sales_df(df, meta):
    """Map one raw sales listing table onto the standardized sales schema.

    Recognized source columns (first match wins):
        title:  ``title``, ``标题``
        area:   ``area_m2``, ``面积(㎡)``, ``area``
        unit price: ``unit_price_yuan_per_m2``, ``单价(元/㎡)``, ``unit_price``
        total price: ``total_price_yuan``; ``总价(万)`` (multiplied by 10000);
            ``总价`` (parsed with ``to_num``); ``total_price``
        page:   ``page``, ``页码``

    Args:
        df: Raw listings DataFrame for a single block.
        meta: Dict providing ``city`` (default ``'Beijing'``), ``district``
            (default ``'Haidian'``), ``block`` (default ``None``) and
            ``source`` (default ``'fang'``).

    Returns:
        A new DataFrame sharing ``df``'s index, with columns ``city``,
        ``district``, ``block``, ``source``, ``title``, ``area_m2``,
        ``total_price_yuan``, ``unit_price_yuan_per_m2``, ``page``.
        A numeric column with no source is all-NaN float rather than
        object-dtype ``None``, so concatenated results keep numeric dtypes.
    """
    def numeric_column(*candidates):
        # First matching source column coerced to numbers; an all-NaN float
        # column when none of the candidates exists.
        for name in candidates:
            if name in df.columns:
                return pd.to_numeric(df[name], errors='coerce')
        return pd.Series(float('nan'), index=df.index, dtype='float64')

    # Start from df's index so scalar fills cover every input row even when
    # no Series has been assigned yet.
    out = pd.DataFrame(index=df.index)

    # Title
    title_source = next((c for c in ('title', '标题') if c in df.columns), None)
    out['title'] = df[title_source] if title_source is not None else None

    # Area
    out['area_m2'] = numeric_column('area_m2', '面积(㎡)', 'area')

    # Unit price
    out['unit_price_yuan_per_m2'] = numeric_column(
        'unit_price_yuan_per_m2', '单价(元/㎡)', 'unit_price')

    # Total price
    if 'total_price_yuan' in df.columns:
        total = pd.to_numeric(df['total_price_yuan'], errors='coerce')
    elif '总价(万)' in df.columns:
        # Column is expressed in units of 10,000 yuan.
        total = pd.to_numeric(df['总价(万)'], errors='coerce') * 10000
    elif '总价' in df.columns:
        # Free-form values; to_num applies the 万 multiplier only when the
        # unit character is present in the text.
        # NOTE(review): bare numbers here are taken as yuan - confirm the unit.
        total = pd.Series([to_num(v) for v in df['总价']],
                          index=df.index, dtype='float64')
    elif 'total_price' in df.columns:
        # NOTE(review): taken as-is, presumably already in yuan - confirm.
        total = pd.to_numeric(df['total_price'], errors='coerce')
    else:
        total = pd.Series(float('nan'), index=df.index, dtype='float64')
    out['total_price_yuan'] = total

    # Back-fill a missing total price from unit price * area when both exist.
    mask = (out['total_price_yuan'].isna()
            & out['unit_price_yuan_per_m2'].notna()
            & out['area_m2'].notna())
    out.loc[mask, 'total_price_yuan'] = (
        out.loc[mask, 'unit_price_yuan_per_m2'] * out.loc[mask, 'area_m2'])

    # Page number
    out['page'] = numeric_column('page', '页码')

    # Metadata
    out['city'] = meta.get('city', 'Beijing')
    out['district'] = meta.get('district', 'Haidian')
    out['block'] = meta.get('block')
    out['source'] = meta.get('source', 'fang')

    # Column order
    out = out[['city', 'district', 'block', 'source', 'title', 'area_m2',
               'total_price_yuan', 'unit_price_yuan_per_m2', 'page']]
    return out

def unify_rent_df(df, meta):
    """Map one raw rental listing table onto the standardized rent schema.

    Recognized source columns (first match wins):
        title:  ``title``, ``标题``
        area:   ``area_m2``, ``面积(㎡)``, ``area``
        monthly rent: ``rent_month_yuan``, ``总价(元/月)``, ``月租(元/月)``,
            ``total_rent_price``
        rent per m2: ``rent_per_m2``, ``租金单价(元/㎡/月)``
        page:   ``page``, ``页码``

    Args:
        df: Raw rental listings DataFrame for a single block.
        meta: Dict providing ``city`` (default ``'Beijing'``), ``district``
            (default ``'Haidian'``), ``block`` (default ``None``) and
            ``source`` (default ``'fang'``).

    Returns:
        A new DataFrame sharing ``df``'s index, with columns ``city``,
        ``district``, ``block``, ``source``, ``title``, ``area_m2``,
        ``rent_month_yuan``, ``rent_per_m2``, ``page``. A numeric column with
        no source is all-NaN float rather than object-dtype ``None``.
        ``rent_per_m2`` is back-filled as rent / area only where the area is
        positive, so a zero area yields NaN instead of ``inf``.
    """
    def numeric_column(*candidates):
        # First matching source column coerced to numbers; an all-NaN float
        # column when none of the candidates exists.
        for name in candidates:
            if name in df.columns:
                return pd.to_numeric(df[name], errors='coerce')
        return pd.Series(float('nan'), index=df.index, dtype='float64')

    # Start from df's index so scalar fills cover every input row even when
    # no Series has been assigned yet.
    out = pd.DataFrame(index=df.index)

    # Title
    title_source = next((c for c in ('title', '标题') if c in df.columns), None)
    out['title'] = df[title_source] if title_source is not None else None

    # Area
    out['area_m2'] = numeric_column('area_m2', '面积(㎡)', 'area')

    # Monthly rent (yuan / month)
    out['rent_month_yuan'] = numeric_column(
        'rent_month_yuan', '总价(元/月)', '月租(元/月)', 'total_rent_price')

    # Rent per square metre (yuan / m2 / month)
    out['rent_per_m2'] = numeric_column('rent_per_m2', '租金单价(元/㎡/月)')

    # Back-fill a missing per-m2 rent from monthly rent / area. Comparisons
    # with NaN are False, so `> 0` also excludes rows with a missing area.
    mask = (out['rent_per_m2'].isna()
            & out['rent_month_yuan'].notna()
            & (out['area_m2'] > 0))
    out.loc[mask, 'rent_per_m2'] = (
        out.loc[mask, 'rent_month_yuan'] / out.loc[mask, 'area_m2'])

    # Page number
    out['page'] = numeric_column('page', '页码')

    # Metadata
    out['city'] = meta.get('city', 'Beijing')
    out['district'] = meta.get('district', 'Haidian')
    out['block'] = meta.get('block')
    out['source'] = meta.get('source', 'fang')

    # Column order
    out = out[['city', 'district', 'block', 'source', 'title', 'area_m2',
               'rent_month_yuan', 'rent_per_m2', 'page']]
    return out

# Standardize every block's tables; all four blocks share the same city,
# district and source, only the block name differs.
_block_tables = (
    ('Wanliu', data_wanliu_sales, data_wanliu_rent),
    ('Suzhouqiao', data_suzhouqiao_sales, data_suzhouqiao_rent),
    ('Beitaipingzhuang', data_beitaipingzhuang_sales, data_beitaipingzhuang_rent),
    ('Shijicheng', data_shijicheng_sales, data_shijicheng_rent),
)


def _meta_for(block_name):
    """Build the metadata dict attached to every row of one block."""
    return {'city': 'Beijing', 'district': 'Haidian', 'block': block_name, 'source': 'fang'}


sales_frames = [
    unify_sales_df(raw_sales, _meta_for(block_name))
    for block_name, raw_sales, _ in _block_tables
]
rent_frames = [
    unify_rent_df(raw_rent, _meta_for(block_name))
    for block_name, _, raw_rent in _block_tables
]

# Stack the per-block frames into one table each, renumbering the rows.
sales_clean = pd.concat(sales_frames, ignore_index=True)
rent_clean = pd.concat(rent_frames, ignore_index=True)

# Write the standardized files.
_out_dir = 'data_standardized'
os.makedirs(_out_dir, exist_ok=True)
sales_path = os.path.join(_out_dir, 'standardized_sales_all.csv')
rent_path = os.path.join(_out_dir, 'standardized_rent_all.csv')
for _frame, _target in ((sales_clean, sales_path), (rent_clean, rent_path)):
    # utf-8-sig writes a BOM at the start of the file.
    _frame.to_csv(_target, index=False, encoding='utf-8-sig')

print('Sales standardized columns:', list(sales_clean.columns))
print('Rent standardized columns:', list(rent_clean.columns))
print('Saved:', sales_path, '|', rent_path)


Sales standardized columns: ['city', 'district', 'block', 'source', 'title', 'area_m2', 'total_price_yuan', 'unit_price_yuan_per_m2', 'page']
Rent standardized columns: ['city', 'district', 'block', 'source', 'title', 'area_m2', 'rent_month_yuan', 'rent_per_m2', 'page']
Saved: output\standardized_sales_all.csv | output\standardized_rent_all.csv
