In [None]:
# ------------------------------------------------------------
# ℓ-Diversity Implementation 
# ------------------------------------------------------------

import pandas as pd
import time
import matplotlib.pyplot as plt
import numpy as np
from typing import Tuple, Dict, List
import logging

# Configure logging for detailed process tracking
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)

# -----------------------------------------------------------------------------
# 1) Generalization Functions (Following Machanavajjhala et al.'s Methodology)
# -----------------------------------------------------------------------------

def generalize_age_to_3bins(age_series: pd.Series) -> pd.Series:
    """
    Collapse age ranges written as "[low-high)" into three coarse buckets.

    Each value is classified by the upper bound of its range:
    upper bound <= 30 gives "0-30", upper bound <= 60 gives "30-60",
    anything larger gives ">60".

    Values are returned unchanged when they are NaN, when they equal
    "MISSING" or "NULL" (case-insensitive, surrounding whitespace ignored),
    when they contain no "-", or when the two sides of the "-" are not
    integers.  The bucket labels themselves therefore pass through with the
    same label, so applying the function twice gives the same result.

    Args:
        age_series: Pandas Series containing age data

    Returns:
        Series of the same length holding "0-30", "30-60", ">60" or the
        untouched original value.
    """
    logger.info("Applying age generalization for ℓ-diversity enforcement")

    missing_tokens = {"MISSING", "NULL"}

    def _to_bin(raw):
        # Missing markers are kept verbatim rather than bucketed.
        if pd.isna(raw) or str(raw).strip().upper() in missing_tokens:
            return raw

        bounds = str(raw).strip(" []()")
        if "-" not in bounds:
            # No range separator: nothing to generalize.
            return raw

        try:
            _lower, upper = (int(part) for part in bounds.split("-", 1))
        except (ValueError, TypeError):
            return raw

        # Only the upper bound decides the bucket.
        if upper > 60:
            return ">60"
        return "0-30" if upper <= 30 else "30-60"

    binned = age_series.map(_to_bin)
    n_before, n_after = age_series.nunique(), binned.nunique()

    logger.info(f"Age generalization complete. Diversity reduced from {n_before} to {n_after} categories")
    return binned

# -----------------------------------------------------------------------------
# 2) Core ℓ-Diversity Algorithm via Record Suppression (Machanavajjhala et al.'s Method)
# -----------------------------------------------------------------------------

def check_l_diversity_compliance(df: pd.DataFrame, quasi_identifiers: List[str], 
                               sensitive_attributes: List[str], l: int) -> Tuple[bool, Dict]:
    """
    Check whether every equivalence class holds at least ``l`` distinct
    values of each sensitive attribute.

    An equivalence class is one group of ``df.groupby(quasi_identifiers)``.
    Distinct values are counted with ``nunique(dropna=True)``, so NaN does not
    count as a sensitive value.  Compliance is decided from the number of
    violating classes; a table with no equivalence classes at all (e.g. an
    empty DataFrame) is therefore compliant, because no class can violate
    the requirement.

    Args:
        df: DataFrame to check
        quasi_identifiers: List of quasi-identifier column names
        sensitive_attributes: List of sensitive attribute column names
        l: Minimum diversity requirement

    Returns:
        Tuple of (is_l_diverse, compliance_stats).  ``compliance_stats`` maps
        each sensitive attribute to a dict with the keys ``min_diversity``
        (NaN when there are no classes), ``violations``, ``total_groups``
        and ``compliant``.
    """
    logger.info(f"Checking ℓ-diversity compliance for ℓ={l}")

    compliance_stats = {}

    for sens_attr in sensitive_attributes:
        # Count distinct values per equivalence class for each sensitive attribute
        diversity_counts = df.groupby(quasi_identifiers)[sens_attr].nunique(dropna=True)
        total_groups = len(diversity_counts)
        violations = int((diversity_counts < l).sum())
        # NaN when there are no groups; kept only for reporting.
        min_diversity = diversity_counts.min()

        compliance_stats[sens_attr] = {
            'min_diversity': min_diversity,
            'violations': violations,
            'total_groups': total_groups,
            # Derived from the violation count so that "no classes" is
            # compliant instead of failing on a NaN comparison.
            'compliant': violations == 0
        }

        logger.info(f"ℓ-diversity check for {sens_attr}: min_diversity={min_diversity}, "
                   f"violations={violations}/{total_groups}")

    # Overall compliance requires all sensitive attributes to be ℓ-diverse
    overall_compliant = all(stats['compliant'] for stats in compliance_stats.values())

    logger.info(f"Overall ℓ-diversity compliance: {overall_compliant}")
    return overall_compliant, compliance_stats

def enforce_l_diversity_by_dropping(df: pd.DataFrame, l: int,
                                  quasi_identifiers: List[str] = ['age', 'race', 'gender'],
                                  sensitive_attributes: List[str] = ['diagnoses_1', 'diagnoses_2']) -> Tuple[pd.DataFrame, int]:
    """
    Enforce ℓ-diversity by dropping entire records in violating equivalence classes.

    Algorithm Steps:
    1. Work on a copy of ``df``; if it has an ``age`` column, generalize it
       with ``generalize_age_to_3bins``.
    2. For every sensitive attribute, count the distinct non-NaN values in
       each equivalence class (one group of the quasi-identifier columns).
    3. Remove every record whose class has fewer than ``l`` distinct values
       for at least one sensitive attribute.
    4. Re-check the surviving records with ``check_l_diversity_compliance``.

    Rows are selected by position, so the input may carry any index.  Rows
    whose quasi-identifiers contain NaN receive no class count (groupby skips
    NaN keys by default) and are therefore never dropped.

    Args:
        df: Input DataFrame (not modified)
        l: Diversity parameter (minimum distinct values required)
        quasi_identifiers: List of quasi-identifier columns.  The default
            list is never mutated by this function.
        sensitive_attributes: List of sensitive attribute columns.  The
            default list is never mutated by this function.

    Returns:
        Tuple of (pruned_df, dropped_count).  When records were dropped the
        result has a fresh 0..n-1 index; when nothing was dropped the working
        copy is returned with the input's index.  If every record had to be
        suppressed, an empty DataFrame is returned (an empty table has no
        class that could violate ℓ-diversity).

    Raises:
        ValueError: if the surviving, non-empty table still fails the final
            ℓ-diversity check.
    """
    logger.info(f"Starting ℓ-diversity enforcement by record dropping with ℓ={l}")
    logger.info(f"Original dataset size: {len(df)} records")

    # Step 1: Create working copy and apply generalization
    df_work = df.copy()

    if 'age' in df_work.columns:
        original_age_diversity = df_work['age'].nunique()
        df_work['age'] = generalize_age_to_3bins(df_work['age'])
        generalized_age_diversity = df_work['age'].nunique()
        logger.info(f"Age generalized from {original_age_diversity} to {generalized_age_diversity} categories")

    # Step 2: Identify violating equivalence classes
    logger.info("Identifying equivalence classes that violate ℓ-diversity")

    qi_values = df_work[quasi_identifiers]
    # Positional mask: True for rows that sit in a violating class.
    violating = np.zeros(len(df_work), dtype=bool)

    for sens_attr in sensitive_attributes:
        distinct_col = f"{sens_attr}_distinct"
        diversity_stats = (
            df_work.groupby(quasi_identifiers)[sens_attr]
            .nunique(dropna=True)
            .rename(distinct_col)
            .reset_index()
        )
        # A left merge keeps the left row order and, because each class
        # appears once on the right, the left row count, so the result lines
        # up with df_work by position.
        per_row = qi_values.merge(diversity_stats, on=quasi_identifiers, how='left')[distinct_col]
        violating |= (per_row < l).to_numpy()

    # Step 3: Drop records in violating equivalence classes
    dropped_count = int(violating.sum())

    if dropped_count > 0:
        logger.info(f"Dropping {dropped_count} records in equivalence classes with <{l}-diversity")
        df_pruned = df_work.loc[~violating].reset_index(drop=True)
    else:
        logger.info(f"No records to drop; all equivalence classes already ≥{l}-diverse")
        df_pruned = df_work

    if len(df_pruned) == 0:
        # Nothing is left to verify; report it instead of raising a
        # misleading "algorithm failed" error.
        logger.warning(f"No records remain after enforcing ℓ={l}; returning an empty dataset")
        return df_pruned, dropped_count

    # Step 4: Final verification
    logger.info("Performing final ℓ-diversity verification")
    final_compliant, final_stats = check_l_diversity_compliance(
        df_pruned, quasi_identifiers, sensitive_attributes, l
    )

    if not final_compliant:
        logger.error("ALGORITHM ERROR: Final result does not satisfy ℓ-diversity")
        raise ValueError(f"ℓ-diversity algorithm failed to achieve ℓ={l}")

    # Report final diversity statistics
    final_mins = []
    for sens_attr in sensitive_attributes:
        min_diversity = final_stats[sens_attr]['min_diversity']
        final_mins.append(min_diversity)
        logger.info(f"Final min diversity for {sens_attr}: {min_diversity}")

    if final_mins:
        overall_min_diversity = min(final_mins)
        logger.info(f"Overall minimum diversity achieved: {overall_min_diversity} (required: {l})")

    return df_pruned, dropped_count

def apply_machanavajjhala_l_diversity_dropping(df: pd.DataFrame, l: int,
                                             quasi_identifiers: List[str] = ['age', 'race', 'gender'],
                                             sensitive_attributes: List[str] = ['diagnoses_1', 'diagnoses_2']) -> Tuple[pd.DataFrame, Dict]:
    """
    Run ℓ-diversity enforcement by record dropping, shuffle the result and
    collect summary statistics.

    The anonymization itself is delegated to
    ``enforce_l_diversity_by_dropping``.  The surviving records are shuffled
    with a fixed seed (``random_state=123``), so the output order is
    reproducible.

    Args:
        df: Input DataFrame
        l: Diversity parameter
        quasi_identifiers: List of quasi-identifier columns
        sensitive_attributes: List of sensitive attribute columns

    Returns:
        Tuple of (anonymized_df, statistics_dict).  The statistics contain:
        ``original_records``, ``l_value``, ``records_dropped``,
        ``final_records``, ``records_retained_pct`` (0.0 for an empty input),
        ``algorithm_steps``, ``final_diversity_stats`` (min/max/avg distinct
        values per equivalence class for each sensitive attribute) and
        ``generalization_applied``.  The latter has an ``age`` entry only
        when an ``age`` column exists; it compares the number of distinct age
        values in the input with the number left in the anonymized output.
    """
    logger.info(f"Starting Machanavajjhala ℓ-diversity algorithm (record dropping) with ℓ={l}")

    original_records = len(df)

    # Initialize statistics tracking
    stats = {
        'original_records': original_records,
        'l_value': l,
        'generalization_applied': {},
        'records_dropped': 0,
        'final_records': 0,
        'algorithm_steps': []
    }

    stats['algorithm_steps'].append("Step 1: Started ℓ-diversity enforcement by record dropping")

    # Apply the core algorithm
    df_anonymized, dropped_count = enforce_l_diversity_by_dropping(
        df, l, quasi_identifiers, sensitive_attributes
    )

    # Shuffle records so the output order does not mirror the input order
    logger.info("Shuffling records to prevent positional attacks")
    if len(df_anonymized) > 0:
        df_final = df_anonymized.sample(frac=1, random_state=123).reset_index(drop=True)
    else:
        # Nothing to shuffle.
        df_final = df_anonymized.reset_index(drop=True)

    # Finalize statistics
    stats['records_dropped'] = dropped_count
    stats['final_records'] = len(df_final)
    stats['records_retained_pct'] = (
        (stats['final_records'] / original_records) * 100 if original_records else 0.0
    )

    # Measure the age generalization instead of assuming fixed bin counts.
    if 'age' in df.columns and 'age' in df_final.columns:
        original_age_diversity = df['age'].nunique()
        # Distinct age values left in the released table (after
        # generalization and suppression).
        generalized_age_diversity = df_final['age'].nunique()
        stats['generalization_applied']['age'] = {
            'original_diversity': original_age_diversity,
            'generalized_diversity': generalized_age_diversity,
            'reduction_ratio': (
                generalized_age_diversity / original_age_diversity
                if original_age_diversity else 0.0
            )
        }

    # Calculate final diversity statistics
    final_diversity_stats = {}
    for sens_attr in sensitive_attributes:
        diversity_per_class = df_final.groupby(quasi_identifiers)[sens_attr].nunique(dropna=True)
        final_diversity_stats[sens_attr] = {
            'min_diversity': diversity_per_class.min(),
            'max_diversity': diversity_per_class.max(),
            'avg_diversity': diversity_per_class.mean()
        }

    stats['final_diversity_stats'] = final_diversity_stats
    stats['algorithm_steps'].append(f"Step 2: Final dataset with {len(df_final)} records")

    logger.info("ℓ-diversity algorithm completed successfully")
    logger.info(f"Records retained: {stats['final_records']}/{stats['original_records']} "
                f"({stats['records_retained_pct']:.1f}%)")
    logger.info(f"Records dropped: {stats['records_dropped']}")

    return df_final, stats

# -----------------------------------------------------------------------------
# 3) Batch Processing and Performance Analysis
# -----------------------------------------------------------------------------

def process_all_datasets_l_diversity_dropping(datasets: Dict[str, str], l_values: List[int]) -> pd.DataFrame:
    """
    Run the record-dropping ℓ-diversity algorithm for every dataset and ℓ value.

    For each (dataset, ℓ) pair the dataset is anonymized with
    ``apply_machanavajjhala_l_diversity_dropping`` (default quasi-identifiers
    and sensitive attributes), timed with ``time.perf_counter`` and written
    to ``diabetic_data_{label}_l{l}_dropping.csv``.  A dataset that cannot be
    loaded is skipped; a failing (dataset, ℓ) pair is logged and skipped, so
    one failure does not stop the batch.

    When at least one pair succeeded, the collected results are also saved
    to ``l_diversity_dropping_performance_summary.csv``.

    Args:
        datasets: Dictionary mapping dataset labels to file paths
        l_values: List of ℓ values to test

    Returns:
        DataFrame with one row per successful (dataset, ℓ) pair containing
        timing, retention and diversity statistics.
    """
    logger.info("Starting batch processing of all datasets for ℓ-diversity (record dropping)")

    results = []
    total_combinations = len(datasets) * len(l_values)
    current_combination = 0

    for dataset_label, file_path in datasets.items():
        logger.info(f"\n{'='*60}")
        logger.info(f"Processing Dataset: {dataset_label} ({file_path})")
        logger.info(f"{'='*60}")

        # Load dataset; keep_default_na=False keeps markers such as "?" or
        # empty cells as plain strings instead of NaN.
        try:
            df_original = pd.read_csv(file_path, keep_default_na=False)
            logger.info(f"Loaded dataset: {len(df_original)} records, {len(df_original.columns)} columns")
        except Exception as e:
            logger.error(f"Failed to load {file_path}: {e}")
            continue

        for l in l_values:
            current_combination += 1
            logger.info(f"\n--- Processing combination {current_combination}/{total_combinations}: {dataset_label} with ℓ={l} ---")

            # Measure execution time
            start_time = time.perf_counter()

            try:
                df_anonymized, stats = apply_machanavajjhala_l_diversity_dropping(df_original, l)

                execution_time = time.perf_counter() - start_time

                # Save anonymized dataset
                output_filename = f"diabetic_data_{dataset_label}_l{l}_dropping.csv"
                df_anonymized.to_csv(output_filename, index=False)
                logger.info(f"Saved anonymized dataset: {output_filename}")

                # Collect results
                original_records = stats['original_records']
                result_record = {
                    'dataset': dataset_label,
                    'l_value': l,
                    'execution_time_seconds': execution_time,
                    'original_records': original_records,
                    'final_records': stats['final_records'],
                    'records_retained_pct': stats['records_retained_pct'],
                    'records_dropped': stats['records_dropped'],
                    'drop_percentage': (
                        (stats['records_dropped'] / original_records) * 100
                        if original_records else 0.0
                    )
                }

                # Add generalization statistics
                if 'age' in stats['generalization_applied']:
                    age_stats = stats['generalization_applied']['age']
                    result_record['age_diversity_reduction'] = age_stats['reduction_ratio']

                # Add final diversity statistics for whichever sensitive
                # attributes the algorithm reported.
                for sens_attr, diversity_stats in stats['final_diversity_stats'].items():
                    result_record[f'{sens_attr}_min_diversity'] = diversity_stats['min_diversity']
                    result_record[f'{sens_attr}_avg_diversity'] = diversity_stats['avg_diversity']

                results.append(result_record)

                logger.info(f"✓ SUCCESS: {dataset_label} with ℓ={l} completed successfully")
                logger.info(f"  - Execution time: {execution_time:.2f} seconds")
                logger.info(f"  - Data retention: {stats['records_retained_pct']:.1f}%")
                logger.info(f"  - Records dropped: {stats['records_dropped']}")

            except Exception as e:
                # Batch boundary: record the failure and move on to the next pair.
                logger.error(f"✗ FAILED: Error processing ℓ={l} for {dataset_label}")
                logger.error(f"  - Error details: {str(e)}")

    results_df = pd.DataFrame(results)
    logger.info("\nBatch processing completed.")
    logger.info(f"Successfully processed: {len(results_df)}/{total_combinations} combinations")

    # Save performance summary (must happen before returning).
    if not results_df.empty:
        summary_filename = 'l_diversity_dropping_performance_summary.csv'
        results_df.to_csv(summary_filename, index=False)
        logger.info(f"Performance summary saved: {summary_filename}")

    return results_df

# -----------------------------------------------------------------------------
# 4) Main Execution
# -----------------------------------------------------------------------------

if __name__ == "__main__":
    import os

    logger.info("Starting Machanavajjhala ℓ-Diversity Implementation (Record Dropping)")
    logger.info("Based on: Machanavajjhala, A., et al. (2007). ℓ-diversity: Privacy beyond k-anonymity.")

    # Define datasets and parameters
    datasets = {
        'full': 'diabetic_data_final.csv',
        '25k': 'diabetic_data_25k.csv', 
        '50k': 'diabetic_data_50k.csv',
        '75k': 'diabetic_data_75k.csv'
    }

    l_values = [2, 3, 4]

    print(f"Expected to process: {len(datasets)} datasets × {len(l_values)} l-values = {len(datasets) * len(l_values)} combinations")
    print(f"Datasets: {list(datasets.keys())}")
    print(f"L-values: {l_values}")

    # Process all combinations
    results_df = process_all_datasets_l_diversity_dropping(datasets, l_values)

    # Verify that one output file exists per (dataset, ℓ) combination
    print("\nChecking generated output files:")
    expected_files = []
    generated_count = 0
    for dataset_label in datasets:
        for l in l_values:
            filename = f"diabetic_data_{dataset_label}_l{l}_dropping.csv"
            expected_files.append(filename)
            exists = os.path.exists(filename)
            if exists:
                generated_count += 1
            print(f"  {filename}: {'✓ EXISTS' if exists else '✗ MISSING'}")

    print(f"\nExpected output files: {len(expected_files)}")
    print(f"Generated files: {generated_count}")

    # The summary is printed only when at least one combination succeeded;
    # otherwise the problem is reported.
    if not results_df.empty:
        # Print summary statistics
        print("\n" + "="*80)
        print("MACHANAVAJJHALA ℓ-DIVERSITY ALGORITHM (RECORD DROPPING) - EXECUTION SUMMARY")
        print("="*80)
        print(results_df.to_string(index=False))
        print("="*80)

        logger.info("All ℓ-diversity processing completed successfully")
    else:
        logger.error("No results generated - check dataset files and parameters")

In [None]:
# -------------------------------------------------------------
# Simple Query Utility Evaluation – ℓ-Diversity
# -------------------------------------------------------------

import pandas as pd
import numpy as np
from IPython.display import display

def classify_utility(rel_err_pct: float) -> str:
    """
    Map a mean relative error (in percent) to a utility label.

    Returns "Good" below 5, "Moderate" from 5 up to (but excluding) 15 and
    "Poor" from 15 upwards.  A missing error (NaN/None), which occurs when no
    group could be compared, yields "Undefined" instead of being reported
    as "Poor".
    """
    if pd.isna(rel_err_pct):
        return "Undefined"
    if rel_err_pct < 5:
        return "Good"
    elif rel_err_pct < 15:
        return "Moderate"
    else:
        return "Poor"

def generalize_age_to_3bins(age_series: pd.Series) -> pd.Series:
    """
    Collapse age ranges written as "[low-high)" into "0-30", "30-60" or ">60".

    The bucket is chosen from the upper bound of the range (<= 30, <= 60,
    otherwise).  NaN, the markers "?", "UNKNOWN" and "NULL"
    (case-insensitive) and any value that is not an integer "low-high" range
    are returned unchanged, so already generalized labels pass through
    with the same label.
    """
    missing_tokens = {"?", "UNKNOWN", "NULL"}

    def to_bucket(val):
        if pd.isna(val) or str(val).strip().upper() in missing_tokens:
            return val
        s = str(val).strip(" []()")
        try:
            _low, high = map(int, s.split("-", 1))
        except ValueError:
            # Not an integer "low-high" range (covers both a failed int()
            # conversion and a missing "-" separator).
            return val
        if high <= 30:
            return "0-30"
        if high <= 60:
            return "30-60"
        return ">60"

    return age_series.map(to_bucket)

# -------------------------------------------------------------
# Dataset Slices and ℓ-values
# -------------------------------------------------------------
# Raw input file for every dataset slice, keyed by the slice label that also
# appears in the anonymized output file names.
slices = dict([
    ("full", "diabetic_data_final.csv"),
    ("25k", "diabetic_data_25k.csv"),
    ("50k", "diabetic_data_50k.csv"),
    ("75k", "diabetic_data_75k.csv"),
])

# ℓ values whose anonymized outputs are evaluated.
ls = list(range(2, 5))

# -------------------------------------------------------------
# Simple Query Definitions
# -------------------------------------------------------------
def q_age_dist(df):
    """Count records per 3-bin age group, ordered by bucket label."""
    # assign() works on a copy, so the caller's frame is left untouched.
    with_bins = df.assign(age3=generalize_age_to_3bins(df['age']))
    bucket_counts = with_bins['age3'].value_counts()
    return bucket_counts.sort_index()

def q_race_dist(df):
    """Count records per ``race`` value, ordered by race label."""
    race_counts = df["race"].value_counts()
    return race_counts.sort_index()

def q_gender_admtype(df):
    """Count records per (gender, admission_type) pair, ordered by that pair."""
    pair_sizes = df.groupby(["gender", "admission_type"]).size()
    return pair_sizes.sort_index()

def q_avg_meds_by_age(df):
    """Average ``num_medications`` per 3-bin age group, ordered by bucket label."""
    # assign() works on a copy, so the caller's frame is left untouched.
    with_bins = df.assign(age3=generalize_age_to_3bins(df['age']))
    mean_meds = with_bins.groupby("age3")["num_medications"].mean()
    return mean_meds.sort_index()

def q_readmit_rate_by_race(df):
    """Percentage of records per race whose ``readmitted`` value is "<30"."""

    def pct_early(flags):
        # Share of rows in this race group marked "<30", as a percentage.
        return (flags == "<30").sum() / len(flags) * 100

    rates = df.groupby("race")["readmitted"].apply(pct_early)
    return rates.sort_index()

# Registry of the simple queries: display name -> function that takes a
# DataFrame and returns a Series of per-group results.
queries = dict([
    ("Age-3bin counts", q_age_dist),
    ("Race counts", q_race_dist),
    ("Gender×AdmType counts", q_gender_admtype),
    ("Avg #meds by age3bin", q_avg_meds_by_age),
    ("Readmit(<30%) by race", q_readmit_rate_by_race),
])

# -------------------------------------------------------------
# Run and Compare Queries
# -------------------------------------------------------------
# Baseline: evaluate every query once on each raw slice.  A slice whose raw
# file is missing is skipped instead of aborting the whole evaluation.
orig_results = {}
for slice_label, path in slices.items():
    try:
        df_raw = pd.read_csv(path, keep_default_na=False)
    except FileNotFoundError:
        print(f"Skipping slice '{slice_label}': raw file not found ({path})")
        continue
    orig_results[slice_label] = {
        qname: fn(df_raw) for qname, fn in queries.items()
    }

# Compare every anonymized output against its baseline.
records = []
for slice_label in orig_results:
    for l in ls:
        anon_path = f"diabetic_data_{slice_label}_l{l}_dropping.csv"
        try:
            df_anon = pd.read_csv(anon_path, keep_default_na=False)
        except FileNotFoundError:
            # The anonymization step may have failed for this combination.
            print(f"Skipping {slice_label}, ℓ={l}: anonymized file not found ({anon_path})")
            continue

        for qname, fn in queries.items():
            orig_ser = orig_results[slice_label][qname]
            anon_ser = fn(df_anon)

            # Align on the baseline groups; a group that vanished from the
            # anonymized data counts as 0.
            comp = pd.DataFrame({
                "orig": orig_ser,
                "anon": anon_ser.reindex(orig_ser.index).fillna(0)
            })

            # Relative error is undefined where the baseline is 0.
            comp["rel_err_pct"] = np.where(
                comp["orig"] == 0,
                np.nan,
                (comp["anon"] - comp["orig"]).abs() / comp["orig"] * 100
            )

            mean_err = comp["rel_err_pct"].dropna().mean()
            util     = classify_utility(mean_err)

            records.append({
                "Slice":     slice_label,
                "Query":     qname,
                "ℓ":         l,
                "RelErr(%)": round(mean_err, 2),
                "Utility":   util
            })

if not records:
    # Fail here with a clear message rather than later inside the pivot.
    raise FileNotFoundError(
        "No raw/anonymized dataset pair could be evaluated; "
        "run the ℓ-diversity anonymization step first"
    )

# -------------------------------------------------------------
# Create Summary Table and Export
# -------------------------------------------------------------
# One row per (Slice, Query); one column per (metric, ℓ) combination.
summary_df = pd.DataFrame(records)
summary_pivot = (
    summary_df
    .pivot_table(index=["Slice", "Query"], columns="ℓ", values=["RelErr(%)", "Utility"], aggfunc="first")
    .round(2)
)

# The pivot has multi-level columns, where assigning `.columns.name` does not
# touch the level names; clear the level names explicitly so the "ℓ" label
# disappears from the displayed header.
summary_pivot.columns = summary_pivot.columns.set_names(
    [None] * summary_pivot.columns.nlevels
)
summary_pivot = summary_pivot.reset_index()

# Display nicely
display(summary_pivot)

# Save as CSV
summary_pivot.to_csv("l_diversity_simple_query_utility_summary.csv", index=False)

In [None]:
# -------------------------------------------------------------
# Complex Query Utility Evaluation – ℓ-Diversity
# -------------------------------------------------------------

import pandas as pd
import numpy as np
from IPython.display import display

def classify_utility(rel_err_pct: float) -> str:
    """
    Translate a mean relative error (percent) into a utility label.

    Below 5 is "Good", below 15 is "Moderate", everything else is "Poor".
    A missing value (NaN/None), produced when no group was comparable, is
    labelled "Undefined" rather than silently falling through to "Poor".
    """
    if pd.isna(rel_err_pct):
        return "Undefined"
    if rel_err_pct < 5:
        return "Good"
    if rel_err_pct < 15:
        return "Moderate"
    return "Poor"

def generalize_age_to_3bins(age_series: pd.Series) -> pd.Series:
    """
    Map "[low-high)" age ranges onto the buckets "0-30", "30-60" and ">60".

    Only the upper bound decides the bucket.  NaN, the markers "?",
    "UNKNOWN" and "NULL" (any case) and every value that cannot be read as an
    integer "low-high" range are passed through unchanged.
    """
    skip_markers = {"?", "UNKNOWN", "NULL"}

    def to_bucket(val):
        if pd.isna(val) or str(val).strip().upper() in skip_markers:
            return val
        s = str(val).strip(" []()")
        try:
            _low, high = map(int, s.split("-", 1))
        except ValueError:
            # Raised for non-numeric bounds and for values without "-".
            return val
        if high <= 30:
            return "0-30"
        if high <= 60:
            return "30-60"
        return ">60"

    return age_series.map(to_bucket)

# -------------------------------------------------------------
# Dataset Slices and ℓ-values
# -------------------------------------------------------------
# Raw input file per dataset slice; every file follows the pattern
# "diabetic_data_<stem>.csv".
slices = {
    label: f"diabetic_data_{stem}.csv"
    for label, stem in (("full", "final"), ("25k", "25k"), ("50k", "50k"), ("75k", "75k"))
}

# ℓ values whose anonymized outputs are evaluated.
ls = [*range(2, 5)]

# -------------------------------------------------------------
# Complex Query Definitions
# -------------------------------------------------------------
def q_lab_by_age_adm(df):
    """Average ``num_lab_procedures`` per (3-bin age group, admission_type)."""
    # assign() works on a copy, so the caller's frame is left untouched.
    with_bins = df.assign(age3=generalize_age_to_3bins(df['age']))
    mean_labs = with_bins.groupby(['age3', 'admission_type'])['num_lab_procedures'].mean()
    return mean_labs.sort_index()

def q_readmit_by_diag1_gender(df):
    """Percentage of "<30" readmissions per (diagnoses_1, gender) group."""

    def pct_early(flags):
        # Mean of the boolean match, scaled to percent.
        return (flags == '<30').mean() * 100

    grouped = df.groupby(['diagnoses_1', 'gender'])['readmitted']
    return grouped.apply(pct_early).sort_index()

def q_avg_meds_by_diag2_gender(df):
    """Average ``num_medications`` per (diagnoses_2, gender) group."""
    mean_meds = df.groupby(['diagnoses_2', 'gender'])['num_medications'].mean()
    return mean_meds.sort_index()

def q_avg_meds_by_race_admtype(df):
    """Average ``num_medications`` per (race, admission_type) group."""
    mean_meds = df.groupby(['race', 'admission_type'])['num_medications'].mean()
    return mean_meds.sort_index()

def q_avg_labs_by_diag2_admtype(df):
    """Average ``num_lab_procedures`` per (diagnoses_2, admission_type) group."""
    mean_labs = df.groupby(['diagnoses_2', 'admission_type'])['num_lab_procedures'].mean()
    return mean_labs.sort_index()

# Registry of the complex queries: display name -> function that takes a
# DataFrame and returns a Series of per-group results.
complex_queries = dict(zip(
    (
        "Avg labs by age³×AdmType",
        "Readmit% by diag1×Gender",
        "Avg meds by diag2×Gender",
        "Avg meds by Race×AdmType",
        "Avg labs by diag2×AdmType",
    ),
    (
        q_lab_by_age_adm,
        q_readmit_by_diag1_gender,
        q_avg_meds_by_diag2_gender,
        q_avg_meds_by_race_admtype,
        q_avg_labs_by_diag2_admtype,
    ),
))

# -------------------------------------------------------------
# Run and Compare Queries
# -------------------------------------------------------------
# Baseline: evaluate every complex query once on each raw slice.  A slice
# whose raw file is missing is skipped instead of aborting the evaluation.
orig_results = {}
for slice_label, path in slices.items():
    try:
        df_raw = pd.read_csv(path, keep_default_na=False)
    except FileNotFoundError:
        print(f"Skipping slice '{slice_label}': raw file not found ({path})")
        continue
    orig_results[slice_label] = {
        qname: fn(df_raw) for qname, fn in complex_queries.items()
    }

# Compare every anonymized output against its baseline.
records = []
for slice_label in orig_results:
    for l in ls:
        anon_path = f"diabetic_data_{slice_label}_l{l}_dropping.csv"
        try:
            df_anon = pd.read_csv(anon_path, keep_default_na=False)
        except FileNotFoundError:
            # The anonymization step may have failed for this combination.
            print(f"Skipping {slice_label}, ℓ={l}: anonymized file not found ({anon_path})")
            continue

        for qname, fn in complex_queries.items():
            orig_ser = orig_results[slice_label][qname]
            anon_ser = fn(df_anon)

            # Align on the baseline groups; a group that vanished from the
            # anonymized data counts as 0.
            comp = pd.DataFrame({
                "orig": orig_ser,
                "anon": anon_ser.reindex(orig_ser.index).fillna(0)
            })

            # Relative error is undefined where the baseline is 0.
            comp["rel_err_pct"] = np.where(
                comp["orig"] == 0,
                np.nan,
                (comp["anon"] - comp["orig"]).abs() / comp["orig"] * 100
            )

            mean_err = comp["rel_err_pct"].dropna().mean()
            util     = classify_utility(mean_err)

            records.append({
                "Slice":     slice_label,
                "Query":     qname,
                "ℓ":         l,
                "RelErr(%)": round(mean_err, 2),
                "Utility":   util
            })

if not records:
    # Fail here with a clear message rather than later inside the pivot.
    raise FileNotFoundError(
        "No raw/anonymized dataset pair could be evaluated; "
        "run the ℓ-diversity anonymization step first"
    )

# -------------------------------------------------------------
# Create Summary Table and Export
# -------------------------------------------------------------
# One row per (Slice, Query); one column per (metric, ℓ) combination.
summary_df = pd.DataFrame(records)
summary_pivot = (
    summary_df
    .pivot_table(index=["Slice", "Query"], columns="ℓ", values=["RelErr(%)", "Utility"], aggfunc="first")
    .round(2)
)

# The pivot has multi-level columns, where assigning `.columns.name` does not
# touch the level names; clear the level names explicitly so the "ℓ" label
# disappears from the displayed header.
summary_pivot.columns = summary_pivot.columns.set_names(
    [None] * summary_pivot.columns.nlevels
)
summary_pivot = summary_pivot.reset_index()

display(summary_pivot)
summary_pivot.to_csv("l_diversity_complex_query_utility_summary.csv", index=False)