In [None]:
# =============================================================================
# t-Closeness Implementation 
# =============================================================================

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import time
import os

# -----------------------
# Age Generalization (level 0: original ranges; level >= 1: 0-30 / 30-60 / >60)
# -----------------------
def generalize_age(age_series: pd.Series, level: int) -> pd.Series:
    """Generalize age-range labels such as ``"[20-30)"`` to a coarser level.

    Args:
        age_series: Series of age-range labels of the form ``low-high``,
            optionally wrapped in brackets/parentheses and whitespace.
        level: ``0`` keeps the range but normalises it to ``"low-high"``;
            any other value maps the range to ``"0-30"``, ``"30-60"`` or
            ``">60"`` based on its upper bound.

    Returns:
        A new Series of the same length. Missing values, missing-value
        tokens and labels that cannot be parsed as ``low-high`` are returned
        unchanged (so already-generalised labels like ``">60"`` pass through).
    """
    missing_tokens = {"MISSING", "NULL", "?", ""}

    def bin_age(val):
        if pd.isna(val) or str(val).upper() in missing_tokens:
            return val
        # Strip surrounding whitespace as well as brackets, consistent with
        # generalize_age_to_3bins used by the utility evaluation cells.
        s = str(val).strip(" []()")
        try:
            low, high = map(int, s.split("-", 1))
        except ValueError:
            # Non-numeric part or no "-" separator: leave the label as is.
            return val
        if level == 0:
            return f"{low}-{high}"
        if high <= 30:
            return "0-30"
        if high <= 60:
            return "30-60"
        return ">60"

    return age_series.map(bin_age)

# -----------------------
# EMD
# -----------------------
def emd_1d(p: np.ndarray, q: np.ndarray) -> float:
    """Return the summed absolute gap between the running totals of p and q.

    Both inputs are accumulated with ``np.cumsum``; the result is the sum of
    the element-wise absolute differences of those cumulative arrays. No
    normalisation is applied.
    """
    running_p = np.cumsum(p)
    running_q = np.cumsum(q)
    gap = np.abs(running_p - running_q)
    return float(gap.sum())

# -----------------------
# t-Closeness Enforcer
# -----------------------
def _group_max_emd(group, sens_cols, global_dist):
    """Largest EMD between one group's sensitive distributions and the global ones."""
    return max(
        emd_1d(
            global_dist[s][1],
            group[s].value_counts(normalize=True)
                    .reindex(global_dist[s][0], fill_value=0)
                    .to_numpy(),
        )
        for s in sens_cols
    )


def _table_max_emd(df, qi_cols, sens_cols, global_dist):
    """Largest per-group EMD over all QI groups of df (0.0 when there are no groups)."""
    return max(
        (
            _group_max_emd(group, sens_cols, global_dist)
            for _, group in df.groupby(qi_cols)
        ),
        default=0.0,
    )


def enforce_t_closeness(df: pd.DataFrame, qi_cols, sens_cols, t_threshold, max_level=1):
    """Generalize ``age`` and suppress groups until every QI group meets ``t_threshold``.

    The global distribution of each sensitive column is computed once on the
    input. The ``age`` column is then generalized level by level (0..max_level)
    until the worst group EMD is within the threshold; any group still above
    the threshold afterwards is dropped entirely.

    Args:
        df: Input table; must contain an ``age`` column plus ``qi_cols`` and
            ``sens_cols``. It is not modified.
        qi_cols: Quasi-identifier columns used to form groups.
        sens_cols: Sensitive columns whose per-group distribution is compared
            to the global one.
        t_threshold: Maximum allowed EMD per group.
        max_level: Highest age generalization level to try.

    Returns:
        ``(anonymized_df, stats)`` where ``anonymized_df`` has a fresh
        0..n-1 index and ``stats`` has the keys ``age_level``,
        ``final_max_emd``, ``dropped_rows`` and ``final_rows``.
    """
    # Work on a copy with unique row labels: the output index is reset anyway,
    # and unique labels guarantee that dropping a group's labels removes only
    # that group's rows.
    df = df.copy().reset_index(drop=True)

    global_dist = {}
    for s in sens_cols:
        p = df[s].value_counts(normalize=True).sort_index()
        global_dist[s] = (p.index.to_list(), p.to_numpy())

    final_level = 0
    for lvl in range(max_level + 1):
        df['age'] = generalize_age(df['age'], lvl)
        # Record the level actually applied, even when no level satisfies the
        # threshold (previously 0 was reported although the data had been
        # generalized up to max_level).
        final_level = lvl
        if _table_max_emd(df, qi_cols, sens_cols, global_dist) <= t_threshold:
            break

    # The reference distribution is fixed, so removing violating groups does
    # not change the EMD of the remaining ones: a single pass is sufficient.
    # NOTE(review): groupby uses pandas' default dropna behaviour, so rows
    # with NaN in a QI column appear in no group and are neither checked nor
    # dropped - confirm this is intended.
    to_drop = [
        idx
        for _, group in df.groupby(qi_cols)
        if _group_max_emd(group, sens_cols, global_dist) > t_threshold
        for idx in group.index
    ]
    df = df.drop(index=to_drop)
    dropped_rows = len(to_drop)

    final_emd = _table_max_emd(df, qi_cols, sens_cols, global_dist)

    stats = {
        'age_level': final_level,
        'final_max_emd': final_emd,
        'dropped_rows': dropped_rows,
        'final_rows': len(df)
    }

    return df.reset_index(drop=True), stats

# -----------------------
# Batch Runner
# -----------------------
def run_t_closeness_pipeline():
    """Run t-closeness on every available dataset for each threshold.

    For each dataset file that exists, the table is anonymized once per
    threshold, the result is written to ``diabetic_data_<label>_t<t>.csv``
    and a timing/quality row is collected. The collected rows are saved to
    ``t_closeness_performance_summary.csv``, printed, and returned as a
    DataFrame. Datasets whose file is absent are reported and skipped.
    """
    source_files = dict(
        [
            ('full', 'diabetic_data_final.csv'),
            ('25k', 'diabetic_data_25k.csv'),
            ('50k', 'diabetic_data_50k.csv'),
            ('75k', 'diabetic_data_75k.csv'),
        ]
    )
    thresholds = [0.4, 0.7, 1.0]
    quasi_identifiers = ['age', 'race']
    sensitive_attrs = ['diagnoses_1']

    rows = []
    for tag, csv_path in source_files.items():
        if os.path.exists(csv_path):
            raw = pd.read_csv(csv_path)
            for threshold in thresholds:
                print(f"\nProcessing: {tag} | t={threshold}")
                started = time.perf_counter()
                anonymized, info = enforce_t_closeness(
                    raw, quasi_identifiers, sensitive_attrs, threshold, max_level=1
                )
                finished = time.perf_counter()

                target = f"diabetic_data_{tag}_t{threshold}.csv"
                anonymized.to_csv(target, index=False)

                worst_emd = info['final_max_emd']
                row = {
                    'Dataset': tag,
                    't': threshold,
                    'Original_Rows': len(raw),
                    'Final_Rows': info['final_rows'],
                    'Dropped_Rows': info['dropped_rows'],
                    'Age_Generalization_Level': info['age_level'],
                    'Final_Max_EMD': round(worst_emd, 4),
                    'Time_sec': round(finished - started, 2),
                    'Achieved': worst_emd <= threshold,
                }
                rows.append(row)
                print(f"Saved: {target} | Final EMD: {worst_emd:.4f}")
        else:
            print(f"Missing: {csv_path}")

    summary_df = pd.DataFrame(rows)
    summary_df.to_csv("t_closeness_performance_summary.csv", index=False)
    print("\nSummary saved to: t_closeness_performance_summary.csv")
    print(summary_df)
    return summary_df


# -----------------------
# Run
# -----------------------
if __name__ == "__main__":
    df_result = run_t_closeness_pipeline()
    if not df_result.empty:
        # plot_summary is not defined in this cell; look it up at run time so
        # a missing plotting helper does not raise NameError after the whole
        # pipeline has already finished and written its outputs.
        _plot_fn = globals().get("plot_summary")
        if callable(_plot_fn):
            _plot_fn(df_result)
        else:
            print("plot_summary is not defined; skipping summary plot.")

In [None]:
# -------------------------------------------------------------
# Simple Query Utility Evaluation – t-Closeness
# -------------------------------------------------------------
import pandas as pd
import numpy as np
from IPython.display import display

def classify_utility(rel_err_pct: float) -> str:
    """Map a relative error (in percent) to a coarse utility label.

    Below 5 is "Good", below 15 is "Moderate", anything else (including a
    value that compares false against both limits, such as NaN) is "Poor".
    """
    if rel_err_pct < 5:
        return "Good"
    if rel_err_pct < 15:
        return "Moderate"
    return "Poor"

def generalize_age_to_3bins(age_series: pd.Series) -> pd.Series:
    """Map ``low-high`` age labels to the buckets ``0-30`` / ``30-60`` / ``>60``.

    Args:
        age_series: Series of age labels such as ``"[20-30)"`` or ``"20-30"``.

    Returns:
        A new Series where each parseable label is replaced by the bucket
        chosen from its upper bound. Missing values, missing-value tokens and
        labels that are not of the form ``low-high`` (e.g. ``">60"``) are
        returned unchanged.
    """
    missing_tokens = {"?", "UNKNOWN", "NULL"}

    def to_bucket(val):
        if pd.isna(val) or str(val).strip().upper() in missing_tokens:
            return val
        s = str(val).strip(" []()")
        try:
            _, high = map(int, s.split("-", 1))
        except ValueError:
            # Non-numeric part or no "-" separator: keep the label as is.
            return val
        if high <= 30:
            return "0-30"
        if high <= 60:
            return "30-60"
        return ">60"

    return age_series.map(to_bucket)

# -------------------------------------------------------------
# Define raw slices and t values
# -------------------------------------------------------------
# Slice label -> raw CSV file, in evaluation order.
slices = dict(
    [
        ("full", "diabetic_data_final.csv"),
        ("25k", "diabetic_data_25k.csv"),
        ("50k", "diabetic_data_50k.csv"),
        ("75k", "diabetic_data_75k.csv"),
    ]
)

# t thresholds used to build the anonymized file names.
t_values = [0.4, 0.7, 1.0]

# -------------------------------------------------------------
# Simple Queries
# -------------------------------------------------------------
def q_age_dist(df):
    """Count rows per 3-bin age bucket, ordered by bucket label."""
    with_bins = df.assign(age3=generalize_age_to_3bins(df["age"]))
    bucket_counts = with_bins["age3"].value_counts()
    return bucket_counts.sort_index()

def q_race_dist(df):
    """Count rows per ``race`` value, ordered by race label."""
    race_counts = df["race"].value_counts()
    return race_counts.sort_index()

def q_gender_admtype(df):
    """Count rows per (gender, admission_type) pair, ordered by that pair."""
    pair_groups = df.groupby(["gender", "admission_type"])
    pair_sizes = pair_groups.size()
    return pair_sizes.sort_index()

def q_avg_meds_by_age(df):
    """Mean ``num_medications`` per 3-bin age bucket, ordered by bucket label."""
    with_bins = df.assign(age3=generalize_age_to_3bins(df["age"]))
    meds_by_bucket = with_bins.groupby("age3")["num_medications"]
    return meds_by_bucket.mean().sort_index()

def q_readmit_rate_by_race(df):
    """Percentage of rows with ``readmitted == "<30"`` per race, ordered by race."""

    def pct_early(flags):
        # Share of "<30" entries within one race group, as a percentage.
        return (flags == "<30").sum() / len(flags) * 100

    readmit_by_race = df.groupby("race")["readmitted"]
    return readmit_by_race.apply(pct_early).sort_index()

# Display label -> query function; evaluated in this insertion order.
queries = {}
queries["Age-3bin counts"] = q_age_dist
queries["Race counts"] = q_race_dist
queries["Gender×AdmType counts"] = q_gender_admtype
queries["Avg #meds by age3bin"] = q_avg_meds_by_age
queries["Readmit(<30%) by race"] = q_readmit_rate_by_race

# -------------------------------------------------------------
# Precompute raw query results
# -------------------------------------------------------------
import os  # local to this cell: needed for the file-existence checks below

# Run every query on each raw slice. Slices whose raw file is absent are
# reported and skipped, mirroring the anonymization pipeline, which skips the
# same files and therefore never writes anonymized outputs for them.
orig_results = {}
for slice_label, path in slices.items():
    if not os.path.exists(path):
        print(f"Missing: {path}")
        continue
    df_raw = pd.read_csv(path, keep_default_na=False)
    orig_results[slice_label] = {
        qname: fn(df_raw) for qname, fn in queries.items()
    }

# -------------------------------------------------------------
# Compute & Compare Utility
# -------------------------------------------------------------
records = []
for slice_label in orig_results:
    for t in t_values:
        anon_path = f"diabetic_data_{slice_label}_t{t}.csv"
        if not os.path.exists(anon_path):
            print(f"Missing: {anon_path}")
            continue
        df_anon = pd.read_csv(anon_path, keep_default_na=False)

        for qname, fn in queries.items():
            orig_ser = orig_results[slice_label][qname]
            anon_ser = fn(df_anon)

            # Align on the raw result's index; groups absent from the
            # anonymized data count as 0.
            comp = pd.DataFrame({
                "orig": orig_ser,
                "anon": anon_ser.reindex(orig_ser.index).fillna(0)
            })

            # Relative error is undefined where the raw value is 0; those
            # entries become NaN and are excluded from the mean.
            comp["rel_err_pct"] = np.where(
                comp["orig"] == 0,
                np.nan,
                (comp["anon"] - comp["orig"]).abs() / comp["orig"] * 100
            )

            mean_err = comp["rel_err_pct"].dropna().mean()
            util     = classify_utility(mean_err)

            records.append({
                "Slice":     slice_label,
                "Query":     qname,
                "t":         t,
                "RelErr(%)": round(mean_err, 2),
                "Utility":   util
            })

# -------------------------------------------------------------
# Final Summary Table & Export
# -------------------------------------------------------------
summary_df = pd.DataFrame(records)

if summary_df.empty:
    # With no records there are no "RelErr(%)" / "Utility" columns for
    # pivot_table to aggregate, so skip the pivot and the export.
    print("No utility records were produced; nothing to summarise.")
    summary_pivot = summary_df
else:
    # One row per (Slice, Query); one column per (metric, t) combination.
    summary_pivot = (
        summary_df
        .pivot_table(index=["Slice", "Query"], columns="t", values=["RelErr(%)", "Utility"], aggfunc="first")
        .round(2)
    )

    summary_pivot.columns.name = None
    summary_pivot = summary_pivot.reset_index()

    display(summary_pivot)
    summary_pivot.to_csv("t_closeness_simple_query_utility_summary.csv", index=False)

In [None]:
# -------------------------------------------------------------
# Complex Query Utility Evaluation – t-Closeness
# -------------------------------------------------------------
import pandas as pd
import numpy as np
from IPython.display import display

def classify_utility(rel_err_pct: float) -> str:
    """Return "Good", "Moderate" or "Poor" for a relative error in percent.

    The first upper limit that the error is strictly below decides the label
    (5 -> "Good", 15 -> "Moderate"); if neither comparison holds, the label
    is "Poor".
    """
    for upper_limit, label in ((5, "Good"), (15, "Moderate")):
        if rel_err_pct < upper_limit:
            return label
    return "Poor"

def generalize_age_to_3bins(age_series: pd.Series) -> pd.Series:
    """Map ``low-high`` age labels to the buckets ``0-30`` / ``30-60`` / ``>60``.

    Args:
        age_series: Series of age labels such as ``"[20-30)"`` or ``"20-30"``.

    Returns:
        A new Series where each parseable label is replaced by the bucket
        chosen from its upper bound. Missing values, missing-value tokens and
        labels that are not of the form ``low-high`` (e.g. ``">60"``) are
        returned unchanged.
    """
    missing_tokens = {"?", "UNKNOWN", "NULL"}

    def to_bucket(val):
        if pd.isna(val) or str(val).strip().upper() in missing_tokens:
            return val
        s = str(val).strip(" []()")
        try:
            _, high = map(int, s.split("-", 1))
        except ValueError:
            # Non-numeric part or no "-" separator: keep the label as is.
            return val
        if high <= 30:
            return "0-30"
        if high <= 60:
            return "30-60"
        return ">60"

    return age_series.map(to_bucket)

# -------------------------------------------------------------
# Define input datasets and t values
# -------------------------------------------------------------
# Slice label -> raw CSV file, in evaluation order.
slices = dict(
    [
        ("full", "diabetic_data_final.csv"),
        ("25k", "diabetic_data_25k.csv"),
        ("50k", "diabetic_data_50k.csv"),
        ("75k", "diabetic_data_75k.csv"),
    ]
)

# t thresholds used to build the anonymized file names.
t_values = [0.4, 0.7, 1.0]

# -------------------------------------------------------------
# Define Complex Queries
# -------------------------------------------------------------
def q_lab_by_age_adm(df):
    """Mean ``num_lab_procedures`` per (3-bin age bucket, admission_type), index-sorted."""
    with_bins = df.assign(age3=generalize_age_to_3bins(df["age"]))
    labs_by_cell = with_bins.groupby(["age3", "admission_type"])["num_lab_procedures"]
    return labs_by_cell.mean().sort_index()

def q_readmit_by_diag1_gender(df):
    """Percentage of ``readmitted == "<30"`` rows per (diagnoses_1, gender), index-sorted."""

    def pct_early(flags):
        # Share of "<30" entries within one group, as a percentage.
        return (flags == "<30").sum() / len(flags) * 100

    readmit_by_cell = df.groupby(["diagnoses_1", "gender"])["readmitted"]
    return readmit_by_cell.apply(pct_early).sort_index()

def q_avg_meds_by_diag2_gender(df):
    """Mean ``num_medications`` per (diagnoses_2, gender), index-sorted."""
    meds_by_cell = df.groupby(["diagnoses_2", "gender"])["num_medications"]
    cell_means = meds_by_cell.mean()
    return cell_means.sort_index()

def q_avg_meds_by_race_admtype(df):
    """Mean ``num_medications`` per (race, admission_type), index-sorted."""
    meds_by_cell = df.groupby(["race", "admission_type"])["num_medications"]
    cell_means = meds_by_cell.mean()
    return cell_means.sort_index()

def q_avg_labs_by_diag2_admtype(df):
    """Mean ``num_lab_procedures`` per (diagnoses_2, admission_type), index-sorted."""
    labs_by_cell = df.groupby(["diagnoses_2", "admission_type"])["num_lab_procedures"]
    cell_means = labs_by_cell.mean()
    return cell_means.sort_index()

# Display label -> query function; evaluated in this insertion order.
queries = {}
queries["Avg labs by age³×AdmType"] = q_lab_by_age_adm
queries["Readmit% by diag1×Gender"] = q_readmit_by_diag1_gender
queries["Avg meds by diag2×Gender"] = q_avg_meds_by_diag2_gender
queries["Avg meds by Race×AdmType"] = q_avg_meds_by_race_admtype
queries["Avg labs by diag2×AdmType"] = q_avg_labs_by_diag2_admtype

# -------------------------------------------------------------
# Compute original query results
# -------------------------------------------------------------
import os  # local to this cell: needed for the file-existence checks below

# Run every query on each raw slice. Slices whose raw file is absent are
# reported and skipped, mirroring the anonymization pipeline, which skips the
# same files and therefore never writes anonymized outputs for them.
orig_results = {}
for slice_label, path in slices.items():
    if not os.path.exists(path):
        print(f"Missing: {path}")
        continue
    df_raw = pd.read_csv(path, keep_default_na=False)
    orig_results[slice_label] = {
        qname: fn(df_raw) for qname, fn in queries.items()
    }

# -------------------------------------------------------------
# Evaluate utility from t-closeness anonymized datasets
# -------------------------------------------------------------
records = []
for slice_label in orig_results:
    for t in t_values:
        anon_path = f"diabetic_data_{slice_label}_t{t}.csv"
        if not os.path.exists(anon_path):
            print(f"Missing: {anon_path}")
            continue
        df_anon = pd.read_csv(anon_path, keep_default_na=False)

        for qname, fn in queries.items():
            orig_ser = orig_results[slice_label][qname]
            anon_ser = fn(df_anon)

            # Align on the raw result's index; groups absent from the
            # anonymized data count as 0.
            comp = pd.DataFrame({
                "orig": orig_ser,
                "anon": anon_ser.reindex(orig_ser.index).fillna(0)
            })

            # Relative error is undefined where the raw value is 0; those
            # entries become NaN and are excluded from the mean.
            comp["rel_err_pct"] = np.where(
                comp["orig"] == 0,
                np.nan,
                (comp["anon"] - comp["orig"]).abs() / comp["orig"] * 100
            )

            mean_err = comp["rel_err_pct"].dropna().mean()
            util     = classify_utility(mean_err)

            records.append({
                "Slice":     slice_label,
                "Query":     qname,
                "t":         t,
                "RelErr(%)": round(mean_err, 2),
                "Utility":   util
            })

# -------------------------------------------------------------
# Final Summary Table & Save Output
# -------------------------------------------------------------
summary_df = pd.DataFrame(records)

if summary_df.empty:
    # With no records there are no "RelErr(%)" / "Utility" columns for
    # pivot_table to aggregate, so skip the pivot and the export.
    print("No utility records were produced; nothing to summarise.")
    summary_pivot = summary_df
else:
    # One row per (Slice, Query); one column per (metric, t) combination.
    summary_pivot = (
        summary_df
        .pivot_table(index=["Slice", "Query"], columns="t", values=["RelErr(%)", "Utility"], aggfunc="first")
        .round(2)
    )

    summary_pivot.columns.name = None
    summary_pivot = summary_pivot.reset_index()

    display(summary_pivot)
    summary_pivot.to_csv("t_closeness_complex_query_utility_summary.csv", index=False)