In [None]:
import json, numpy as np, pandas as pd
from ucimlrepo import fetch_ucirepo

adult = fetch_ucirepo(id=2)
df = adult.data.features.assign(income=adult.data.targets.squeeze())
df = df.replace("?", np.nan).fillna("0")

domain = {}

skip = {"fnlwgt", "capital-gain", "capital-loss", "age", "hours-per-week"}
for col in df.columns:
    if col in skip:
        continue
    domain[col] = sorted(df[col].astype(str).unique().tolist())

domain["age_bin"] = sorted(pd.cut(
    df["age"].astype(int),
    bins=[17,25,35,50,65, df["age"].astype(int).max()+1],
    labels=False, include_lowest=True
).unique().tolist())

domain["hours_bin"] = sorted(pd.cut(
    df["hours-per-week"].astype(int),
    bins=[0,20,40,60,80, df["hours-per-week"].astype(int).max()+1],
    labels=False, include_lowest=True
).unique().tolist())

domain["income"] = ["<=50K", ">50K"]

with open("domain.json","w",encoding="utf-8") as f:
    json.dump(domain, f, indent=2, ensure_ascii=False)

for k, v in domain.items():
    print(f"{k} ({len(v)}): {v[:5]} …")


workclass (9): ['0', 'Federal-gov', 'Local-gov', 'Never-worked', 'Private'] …
education (16): ['10th', '11th', '12th', '1st-4th', '5th-6th'] …
education-num (16): ['1', '10', '11', '12', '13'] …
marital-status (7): ['Divorced', 'Married-AF-spouse', 'Married-civ-spouse', 'Married-spouse-absent', 'Never-married'] …
occupation (15): ['0', 'Adm-clerical', 'Armed-Forces', 'Craft-repair', 'Exec-managerial'] …
relationship (6): ['Husband', 'Not-in-family', 'Other-relative', 'Own-child', 'Unmarried'] …
race (5): ['Amer-Indian-Eskimo', 'Asian-Pac-Islander', 'Black', 'Other', 'White'] …
sex (2): ['Female', 'Male'] …
native-country (42): ['0', 'Cambodia', 'Canada', 'China', 'Columbia'] …
income (2): ['<=50K', '>50K'] …
age_bin (5): [0, 1, 2, 3, 4] …
hours_bin (5): [0, 1, 2, 3, 4] …


In [4]:
import json
import pandas as pd

with open("domain.json", encoding="utf-8") as f:
    domain = json.load(f)

from ucimlrepo import fetch_ucirepo
adult = fetch_ucirepo(id=2)
df = adult.data.features.assign(income=adult.data.targets.squeeze())
df = df.replace("?", pd.NA).fillna("0")

df["age_bin"] = pd.cut(
    df["age"].astype(int),
    bins=[17,25,35,50,65, df["age"].astype(int).max()+1],
    labels=False, include_lowest=True
)

df["hours_bin"] = pd.cut(
    df["hours-per-week"].astype(int),
    bins=[0,20,40,60,80, df["hours-per-week"].astype(int).max()+1],
    labels=False, include_lowest=True
)

df_code = df.copy()

for col, vals in domain.items():
    mapping = { str(v): i for i, v in enumerate(vals) }
    df_code[col] = df_code[col].astype(str).map(mapping).fillna(0).astype(int)

print(df_code[ list(domain.keys()) ].head())

df_code = df_code.drop(columns=["age", "hours-per-week"] + 
                       [c for c in df.columns if c in domain and c not in df_code.columns])
df_code.to_csv("adults_discrete.csv", index=False)



   workclass  education  education-num  marital-status  occupation  \
0          7          9              4               4           1   
1          6          9              4               2           4   
2          4         11             15               0           6   
3          4          1             13               2           6   
4          4          9              4               2          10   

   relationship  race  sex  native-country  income  age_bin  hours_bin  
0             1     4    1              39       0        2          1  
1             0     4    1              39       0        2          0  
2             1     4    1              39       0        2          1  
3             0     2    1              39       0        3          1  
4             5     2    0               5       0        1          1  


In [None]:
from mbi import Domain, Dataset, FactoredInference

data_path = 'adults_discrete.csv'
domain_path = 'domain.json'
output_path = 'synthetic_adults_simplified.csv'
epsilon = 1.0
delta = 1e-5 
num_inference_iterations = 500

print("步驟一：載入資料與設定")
with open(domain_path, 'r', encoding='utf-8') as f:
    domain_dict = json.load(f)
    domain_shape_dict = {k: len(v) for k, v in domain_dict.items()}

try:
    df_original_for_cols = pd.read_csv(data_path, nrows=1) # 讀取一行以獲取欄位
    column_order = list(df_original_for_cols.columns)
    # 確保 domain_shape_dict 的鍵符合 column_order
    ordered_domain_shape = {col: domain_shape_dict[col] for col in column_order if col in domain_shape_dict}
    original_domain = Domain.fromdict(ordered_domain_shape)
except Exception as e:
    print(f"讀取 CSV 欄位或創建 Domain 時出錯: {e}")
    print("請確保 CSV 檔案存在且格式正確，且 domain.json 包含所有欄位。")
    exit()


# 3. 載入預處理過的 CSV 資料集
try:
    df = pd.read_csv(data_path)
    # 確保欄位是整數
    for col in original_domain.attrs:
         if col in df.columns:
              df[col] = df[col].astype(int)
         else:
              print(f"警告：欄位 '{col}' 在 domain 中但不在 CSV 中。")

    # 確保 DataFrame 欄位順序與 Domain 一致
    df = df[original_domain.attrs]

except FileNotFoundError:
    print(f"錯誤：找不到資料集檔案 {data_path}")
    exit()
except Exception as e:
    print(f"讀取或處理 CSV 時出錯: {e}")
    exit()

data = Dataset(df, original_domain)
print(f"資料集載入完成，共 {len(df)} 筆記錄，{len(original_domain.attrs)} 個欄位。")

# 5. 定義要測量的邊際分佈 (Marginals)
# 第一輪：所有單一欄位 (1-way marginals)
round1_marginals = list(original_domain.attrs)


步驟一：載入資料與設定


NameError: name 'json' is not defined