In [1]:
import pandas as pd
import numpy as np

# Input file paths, resolved relative to the current working directory.
# BUSINESS_JSON is read below as line-delimited JSON; ACS_CSV is read with pd.read_csv.
BUSINESS_JSON = "yelp_academic_dataset_business.json"
ACS_CSV       = "acs_phx_2023.csv"

In [3]:
# Read business.json (JSON Lines: one business record per line).
business = pd.read_json(BUSINESS_JSON, lines=True)

# Columns that later modelling steps may need; everything else is dropped
# to save memory.
cols_keep = [
    'business_id', 'name', 'city', 'state', 'postal_code',
    'stars', 'review_count',
    'attributes', 'categories',
    'latitude', 'longitude'
]

# Select the Arizona rows and the needed columns in a single .loc call and copy
# once. Subsetting columns *after* the copy (the previous approach) yields a
# frame that classic pandas flags as a possible copy, so the column assignments
# in later cells can emit SettingWithCopyWarning; it also copied every column
# of the wide frame before discarding most of them.
biz_az = business.loc[business['state'] == 'AZ', cols_keep].copy()

print("AZ 样本量：", len(biz_az))
biz_az.head()


AZ 样本量： 9912


Unnamed: 0,business_id,name,city,state,postal_code,stars,review_count,attributes,categories,latitude,longitude
2,tUFrWirKiKi_TAnsVWINQQ,Target,Tucson,AZ,85711,3.5,22,"{'BikeParking': 'True', 'BusinessAcceptsCredit...","Department Stores, Shopping, Fashion, Home & G...",32.223236,-110.880452
16,rBmpy_Y1UbBx8ggHlyb7hA,Arizona Truck Outfitters,Tucson,AZ,85705,4.5,10,"{'DriveThru': 'False', 'BusinessAcceptsCreditC...","Automotive, Auto Parts & Supplies, Auto Custom...",32.229872,-110.972342
45,wghnIlMb_i5U46HMBGx9ig,China Dragon Restaurant,Tucson,AZ,85746,3.0,23,"{'WiFi': ''no'', 'BusinessParking': '{'garage'...","Restaurants, Chinese",32.132305,-110.999985
56,txyXRytGjwOXvS8s4sc-WA,Smoothie King,Tucson,AZ,85713,3.0,29,"{'RestaurantsPriceRange2': '2', 'BusinessParki...","Vitamins & Supplements, Ice Cream & Frozen Yog...",32.186794,-110.954765
67,eYxGFkxo6m3SYGVTh5m2nQ,Big Boyz Toyz Motorcycle Rentals,Tucson,AZ,85712,4.5,8,,"Towing, Hotels & Travel, Automotive, Motorcycl...",32.250324,-110.903655


In [5]:
# Coerce postal_code to text so the .str accessor can be used on every row.
postal_text = biz_az['postal_code'].astype(str)
biz_az['postal_code'] = postal_text

# zip5 = the first run of five digits found in the postal code (NaN when there
# is none); zfill then left-pads to width 5.
biz_az['zip5'] = postal_text.str.extract(r'(\d{5})', expand=False).str.zfill(5)

# Discard businesses for which no zip5 could be extracted.
has_zip5 = biz_az['zip5'].notna()
biz_az = biz_az[has_zip5].copy()

# log(1 + review_count), with negative counts floored at zero first.
nonneg_reviews = biz_az['review_count'].clip(lower=0)
biz_az['log_review_count'] = np.log1p(nonneg_reviews)

# Binary flag: 1 when the business is rated 4.0 stars or higher, else 0.
biz_az['high_rating'] = biz_az['stars'].ge(4.0).astype(int)

preview_cols = ['business_id', 'city', 'stars', 'review_count', 'log_review_count', 'zip5']
print("清理 zip 后样本量：", len(biz_az))
biz_az[preview_cols].head()


清理 zip 后样本量： 9907


Unnamed: 0,business_id,city,stars,review_count,log_review_count,zip5
2,tUFrWirKiKi_TAnsVWINQQ,Tucson,3.5,22,3.135494,85711
16,rBmpy_Y1UbBx8ggHlyb7hA,Tucson,4.5,10,2.397895,85705
45,wghnIlMb_i5U46HMBGx9ig,Tucson,3.0,23,3.178054,85746
56,txyXRytGjwOXvS8s4sc-WA,Tucson,3.0,29,3.401197,85713
67,eYxGFkxo6m3SYGVTh5m2nQ,Tucson,4.5,8,2.197225,85712


In [7]:
# --- Extract the price tier from the attributes mapping ---
def extract_price(attrs):
    """Return the 'RestaurantsPriceRange2' attribute as a float.

    Parameters
    ----------
    attrs : object
        Expected to be a dict of business attributes; any other type is
        treated as "no attributes".

    Returns
    -------
    float
        The price value converted with ``float()``, or ``np.nan`` when
        ``attrs`` is not a dict, the key is absent or ``None``, or the value
        cannot be converted (``ValueError`` / ``TypeError``).
    """
    raw = attrs.get('RestaurantsPriceRange2') if isinstance(attrs, dict) else None
    if raw is None:
        return np.nan
    try:
        price = float(raw)
    except (ValueError, TypeError):
        # Non-numeric text (e.g. the literal string 'None') or an
        # unsupported type: treat as missing.
        price = np.nan
    return price

# Numeric price parsed from each attributes entry (NaN when unavailable).
biz_az['price'] = biz_az['attributes'].map(extract_price)

# Flag the rows whose price is missing, then fill those gaps with the median
# of the observed prices.
price_is_missing = biz_az['price'].isna()
biz_az['price_missing'] = price_is_missing.astype(int)
price_median = biz_az['price'].median()
biz_az['price_filled'] = biz_az['price'].fillna(price_median)

# is_restaurant indicator
def is_restaurant_from_cat(cats):
    """Return 1 if ``cats`` is a string containing 'Restaurants', else 0.

    The check is a plain substring test on the raw categories text;
    non-string input (e.g. ``None`` or NaN) yields 0.
    """
    return 1 if isinstance(cats, str) and 'Restaurants' in cats else 0

# Element-wise 0/1 restaurant flag derived from the categories text.
biz_az['is_restaurant'] = biz_az['categories'].map(is_restaurant_from_cat)

# main_category: simple approach -- take the first label listed in categories
def get_main_category(cats):
    """Return the first non-empty, stripped label of a comma-separated string.

    Returns 'Unknown' when ``cats`` is not a string or contains no non-blank
    label.
    """
    if isinstance(cats, str):
        for piece in cats.split(','):
            label = piece.strip()
            if label:
                return label
    return 'Unknown'

# Raw main category = first label of each business's categories string.
main_raw = biz_az['categories'].apply(get_main_category)
biz_az['main_category_raw'] = main_raw

# Keep the K most frequent raw categories; every other one is pooled into "Other".
K = 15
top_cats = main_raw.value_counts().nlargest(K).index
in_top_k = main_raw.isin(top_cats)
biz_az['main_category'] = np.where(in_top_k, main_raw, 'Other')

preview_cols = ['stars', 'price', 'price_missing', 'price_filled',
                'is_restaurant', 'main_category']
biz_az[preview_cols].head()


Unnamed: 0,stars,price,price_missing,price_filled,is_restaurant,main_category
2,3.5,2.0,0,2.0,0,Other
16,4.5,,1,2.0,0,Automotive
45,3.0,2.0,0,2.0,1,Restaurants
56,3.0,2.0,0,2.0,0,Other
67,4.5,,1,2.0,0,Other


In [9]:
# Load the ACS table.
acs = pd.read_csv(ACS_CSV)

# Locate the ZIP column. Fail fast with the configuration hint instead of
# printing it and then crashing later with an unrelated KeyError on 'zip5'.
zip_col = next((c for c in ('zip', 'ZCTA5', 'zip5') if c in acs.columns), None)
if zip_col is None:
    raise ValueError(
        "请在 acs 表中确认 ZIP 列名（比如 'zip' 或 'ZCTA5'），然后在这里改。"
    )

# Remember which rows actually carry a ZIP before the column is stringified
# (astype(str) would turn a missing value into the text 'nan').
acs_has_zip = acs[zip_col].notna()

# Normalise ZIPs to 5-character strings. A column that contains missing values
# is parsed as float, so '85711.0' must lose its '.0' suffix before padding,
# otherwise it can never match a business zip5.
acs['zip5'] = (
    acs[zip_col].astype(str)
    .str.replace(r'\.0$', '', regex=True)
    .str.zfill(5)
)

acs_cols_keep = ['zip5', 'median_income', 'pct_college', 'total_pop']
print("ACS 可用列：", acs.columns.tolist())

# Rows without a ZIP can never match a business; dropping them keeps them from
# tripping the uniqueness check below.
acs_small = acs.loc[acs_has_zip, acs_cols_keep].copy()

# Left join keeps every Yelp business. validate='m:1' makes pandas raise
# MergeError if a zip5 appears more than once in the ACS table, instead of
# silently duplicating business rows.
df = biz_az.merge(acs_small, on='zip5', how='left', validate='m:1')

print("合并后样本量：", len(df))
df[['business_id', 'city', 'zip5', 'median_income', 'pct_college', 'total_pop']].head()


ACS 可用列： ['zip', 'median_income', 'pct_college', 'total_pop', 'zip5']
合并后样本量： 9907


Unnamed: 0,business_id,city,zip5,median_income,pct_college,total_pop
0,tUFrWirKiKi_TAnsVWINQQ,Tucson,85711,52358.0,30.5,41201.0
1,rBmpy_Y1UbBx8ggHlyb7hA,Tucson,85705,36606.0,20.1,54853.0
2,wghnIlMb_i5U46HMBGx9ig,Tucson,85746,68506.0,17.8,44030.0
3,txyXRytGjwOXvS8s4sc-WA,Tucson,85713,50264.0,20.6,45615.0
4,eYxGFkxo6m3SYGVTh5m2nQ,Tucson,85712,48185.0,37.2,31964.0


In [11]:
# Write the merged business + ACS table to disk (row index omitted), then
# report the output path and the columns that were written.
OUTPUT_CSV = "az_business_acs_clean.csv"
df.to_csv(OUTPUT_CSV, index=False)

print("已保存到：", OUTPUT_CSV)
print("列名：", df.columns.tolist(), sep="\n")


已保存到： az_business_acs_clean.csv
列名：
['business_id', 'name', 'city', 'state', 'postal_code', 'stars', 'review_count', 'attributes', 'categories', 'latitude', 'longitude', 'zip5', 'log_review_count', 'high_rating', 'price', 'price_missing', 'price_filled', 'is_restaurant', 'main_category_raw', 'main_category', 'median_income', 'pct_college', 'total_pop']
