In [1]:
#Microaggregation

import pandas as pd
import numpy as np

In [2]:
# Load dataset
df = pd.read_csv("adult.csv")

In [3]:
# Normalize column names (fixes dashes, spaces, and case issues)
df.columns = df.columns.str.strip().str.lower().str.replace("-", "_")

# Select numerical columns for microaggregation
numerical_columns = ["age", "hours_per_week", "capital_gain", "capital_loss"]

# Define group size for microaggregation (k-anonymity parameter)
k = 5  # Ensures each group has at least 5 records

In [4]:
# Display original dataset before perturbation (first 5 rows)
print("Original Dataset (First 5 rows):")
print(df[df.columns.tolist()].head(100))

Original Dataset (First 5 rows):
    age         workclass  fnlwgt     education  education_num  \
0    39         State-gov   77516     Bachelors             13   
1    50  Self-emp-not-inc   83311     Bachelors             13   
2    38           Private  215646       HS-grad              9   
3    53           Private  234721          11th              7   
4    28           Private  338409     Bachelors             13   
..  ...               ...     ...           ...            ...   
95   29         Local-gov  115585  Some-college             10   
96   48  Self-emp-not-inc  191277     Doctorate             16   
97   37           Private  202683  Some-college             10   
98   48           Private  171095    Assoc-acdm             12   
99   32       Federal-gov  249409       HS-grad              9   

        marital_status         occupation   relationship   race     sex  \
0        Never-married       Adm-clerical  Not-in-family  White    Male   
1   Married-civ-spouse  

In [5]:
# Function to apply microaggregation to a numerical column
def microaggregate(series, k):
    sorted_series = series.sort_values().reset_index()
    grouped_values = np.array(sorted_series[series.name])  # Extract values

    # Apply microaggregation (group mean)
    for i in range(0, len(grouped_values), k):
        grouped_values[i:i + k] = np.mean(grouped_values[i:i + k])

    # Assign back to original DataFrame
    series_aggregated = pd.Series(grouped_values, index=sorted_series["index"])
    return series_aggregated.sort_index()


In [6]:
# Apply microaggregation to selected numerical columns
for col in numerical_columns:
    df[col + "_microaggregated"] = microaggregate(df[col], k)

#  Identify affected rows (where values changed)
df["affected"] = False
for col in numerical_columns:
    df["affected"] |= df[col] != df[col + "_microaggregated"]

# Extract affected rows
affected_rows = df[df["affected"]]

In [7]:
# Display first few affected rows
print("\n🔹 Affected Rows (First 10 rows where values changed):")
print(affected_rows[numerical_columns + [col + "_microaggregated" for col in numerical_columns]].head(10))


🔹 Affected Rows (First 10 rows where values changed):
    age  hours_per_week  capital_gain  capital_loss  age_microaggregated  \
7    52              45             0             0                   52   
8    31              50         14084             0                   30   
9    42              40          5178             0                   42   
12   23              30             0             0                   23   
23   43              40             0          2042                   43   
27   54              60             0             0                   53   
29   49              40             0             0                   49   
30   23              52             0             0                   23   
31   20              44             0             0                   19   
35   48              40             0             0                   47   

    hours_per_week_microaggregated  capital_gain_microaggregated  \
7                               44      

In [8]:
# Save the microaggregated dataset
df.to_csv("adult_microaggregated.csv", index=False)

print("\n✅ Microaggregation Applied. File saved as 'adult_microaggregated.csv'.")


✅ Microaggregation Applied. File saved as 'adult_microaggregated.csv'.
