In [1]:
# Paths
DATA_PATH = "/Users/andrewrodriguez/Desktop/compsci1050/anonymity_and_ethics/reduced_qi_filled.csv"
OUT_RECORD_SUPP = "/Users/andrewrodriguez/Desktop/compsci1050/anonymity_and_ethics/de-identified_data/record_suppression.csv"
OUT_COLUMN_SUPP = "/Users/andrewrodriguez/Desktop/compsci1050/anonymity_and_ethics/de-identified_data/column_suppression.csv"
OUT_GEN_STAGE1 = "/Users/andrewrodriguez/Desktop/compsci1050/anonymity_and_ethics/de-identified_data/generalized_stage1.csv"
OUT_GEN_STAGE2 = "/Users/andrewrodriguez/Desktop/compsci1050/anonymity_and_ethics/de-identified_data/generalized_stage2.csv"
OUT_GEN_TOPSTAR = "/Users/andrewrodriguez/Desktop/compsci1050/anonymity_and_ethics/de-identified_data/generalized_stage3_topstar.csv"
OUT_COMBINATION = "/Users/andrewrodriguez/Desktop/compsci1050/anonymity_and_ethics/de-identified_data/combination.csv"


In [2]:
import pandas as pd
import numpy as np

df = pd.read_csv(DATA_PATH)
print("Rows:", len(df))
print("Columns:", list(df.columns))
df.head(3)

Rows: 199999
Columns: ['course_id', 'user_id', 'cc_by_ip', 'city', 'postalCode', 'LoE', 'YoB', 'gender', 'nforum_posts', 'nforum_votes', 'nforum_endorsed', 'nforum_threads', 'nforum_comments', 'nforum_pinned', 'nforum_events']


Unnamed: 0,course_id,user_id,cc_by_ip,city,postalCode,LoE,YoB,gender,nforum_posts,nforum_votes,nforum_endorsed,nforum_threads,nforum_comments,nforum_pinned,nforum_events
0,HarvardX/PH525.1x/1T2018,29940,US,Austin,78713.0,,,,0,0,0,0,0,0,0
1,HarvardX/PH525.1x/1T2018,37095,BD,Dhaka,,b,1991.0,m,0,0,0,0,0,0,0
2,HarvardX/PH525.1x/1T2018,45634,CO,Medellín,,m,1982.0,m,0,0,0,0,0,0,0


In [3]:
# so user_id and course_id are idenitifiers here
identifiers = ["user_id", "course_id"]
qi_cols = [c for c in df.columns if c not in identifiers]

print("Identifiers:", identifiers)
print("QI columns ({}):".format(len(qi_cols)), qi_cols)

Identifiers: ['user_id', 'course_id']
QI columns (13): ['cc_by_ip', 'city', 'postalCode', 'LoE', 'YoB', 'gender', 'nforum_posts', 'nforum_votes', 'nforum_endorsed', 'nforum_threads', 'nforum_comments', 'nforum_pinned', 'nforum_events']


In [4]:
# Okay lets check our baseline k anon
# to do so im gonna make a helper
from typing import List, Tuple

def k_anonymity_level(data: pd.DataFrame, qis: List[str]) -> int:
    # So the level of k anonmyity is data grouped by quasi ids
    return int(data.groupby(qis, dropna=False).size().min())

k0 = k_anonymity_level(df, qi_cols)
print("Baseline k-anonymity:", k0)

Baseline k-anonymity: 1


In [5]:
def record_suppression_k(data: pd.DataFrame, qis: List[str], k: int = 5) -> Tuple[pd.DataFrame, int]:
    # Okay so for record supression we only keep rows of size k
    # where k is 5
    sizes = data.groupby(qis, dropna=False).size().rename("size")
    tmp = data.join(sizes, on=qis)
    kept = tmp[tmp["size"] >= k].drop(columns=["size"])
    deleted = len(tmp) - len(kept)
    return kept, deleted


In [6]:
# Now we want to record supress to k = 5
rs_df, rs_deleted = record_suppression_k(df, qi_cols, k=5)
rs_k = k_anonymity_level(rs_df, qi_cols)
rs_df.to_csv(OUT_RECORD_SUPP, index=False)

print({
    "rows_source": len(df),
    "deleted_rows": rs_deleted,
    "rows_kept": len(rs_df),
    "k_after": rs_k,
    "file": OUT_RECORD_SUPP
})

{'rows_source': 199999, 'deleted_rows': 150286, 'rows_kept': 49713, 'k_after': 5, 'file': '/Users/andrewrodriguez/Desktop/compsci1050/anonymity_and_ethics/de-identified_data/record_suppression.csv'}


In [7]:
def greedy_column_suppression_until_k(data, identifiers, target_k=5, count_na_as_value=True, verbose=False):
    """
    Greedy column suppression that drops the QI with the HIGHEST per-column cardinality
    (most unique values) each step. Ties are broken by the k achieved after dropping
    the column, then by resulting #groups (fewer is better), then by column name.
    """
    # Start with all QIs
    current_qis = [c for c in data.columns if c not in identifiers]
    dropped = []

    # Current k
    current_k = k_anonymity_level(data, current_qis)
    if verbose:
        print(f"Start: k={current_k}, QIs={len(current_qis)}")

    # How to count uniques (count NaN as its own value or not)
    nunique_kwargs = {"dropna": not count_na_as_value}

    # Precompute per-column cardinalities (they don’t change as we drop other columns)
    cardinality = {c: data[c].nunique(**nunique_kwargs) for c in current_qis}

    while current_k < target_k and current_qis:
        # Find the maximum cardinality among remaining QIs
        max_card = max(cardinality[c] for c in current_qis)
        candidates = [c for c in current_qis if cardinality[c] == max_card]

        # Tie-break among candidates: pick the one whose removal yields the highest k,
        # then the fewest groups, then lexicographically.
        best_col = None
        best_k = -1
        best_groups = None
        best_name = None

        for c in candidates:
            trial_qis = [x for x in current_qis if x != c]
            kval = k_anonymity_level(data, trial_qis)
            ng = (data.groupby(trial_qis, dropna=False).ngroups if trial_qis else 1)

            if (kval > best_k) or \
               (kval == best_k and (best_groups is None or ng < best_groups)) or \
               (kval == best_k and ng == best_groups and (best_name is None or c < best_name)):
                best_k = kval
                best_groups = ng
                best_col = c
                best_name = c

        # Drop the chosen column
        current_qis.remove(best_col)
        dropped.append(best_col)
        current_k = best_k
        cardinality.pop(best_col, None)

        if verbose:
            print(f"Dropped '{best_col}' (card={max_card}) -> k={current_k}, QIs left={len(current_qis)}")

    return dropped, current_k, current_qis

In [8]:
dropped_cols, final_k, remaining_qis = greedy_column_suppression_until_k(
    df, identifiers, target_k=5, count_na_as_value=True, verbose=True
)

# Save if you hit k >= 5
if final_k >= 5:
    keep_cols = identifiers + remaining_qis
    cs_df = df[keep_cols].copy()
    cs_df.to_csv(OUT_COLUMN_SUPP, index=False)

Start: k=1, QIs=13
Dropped 'postalCode' (card=18489) -> k=1, QIs left=12
Dropped 'city' (card=13276) -> k=1, QIs left=11
Dropped 'nforum_events' (card=645) -> k=1, QIs left=10
Dropped 'cc_by_ip' (card=218) -> k=1, QIs left=9
Dropped 'YoB' (card=124) -> k=1, QIs left=8
Dropped 'nforum_posts' (card=110) -> k=1, QIs left=7
Dropped 'nforum_comments' (card=104) -> k=1, QIs left=6
Dropped 'nforum_votes' (card=91) -> k=1, QIs left=5
Dropped 'nforum_threads' (card=55) -> k=1, QIs left=4
Dropped 'LoE' (card=12) -> k=1, QIs left=3
Dropped 'nforum_endorsed' (card=11) -> k=1, QIs left=2
Dropped 'nforum_pinned' (card=9) -> k=978, QIs left=1


In [9]:
# Generalization utilities
def bin_counts(x):
    xi = pd.to_numeric(x, errors="coerce")
    bins = [-np.inf, 0, 2, 5, 10, 20, np.inf]
    labels = ["0", "1-2", "3-5", "6-10", "11-20", "21+"]
    return pd.cut(xi.fillna(-np.inf), bins=bins, labels=labels, include_lowest=True).astype(str)

def coarser_bin_counts(x):
    xi = pd.to_numeric(x, errors="coerce")
    bins = [-np.inf, 0, 5, 20, np.inf]
    labels = ["0", "1-5", "6-20", "21+"]
    return pd.cut(xi.fillna(-np.inf), bins=bins, labels=labels, include_lowest=True).astype(str)

def yob_to_decade(x):
    xi = pd.to_numeric(x, errors="coerce")
    decade = (xi // 10) * 10
    return decade.fillna(-1).astype(int).astype(str).replace({"-1": "Unknown"})

def yob_to_20yr(x):
    xi = pd.to_numeric(x, errors="coerce")
    band = (xi // 20) * 20
    return band.fillna(-1).astype(int).astype(str).replace({"-1": "Unknown"})

def postal3(x):
    return x.astype(str).str[:3].replace({"nan": "Unknown", "": "Unknown"})

def postal2(x):
    return x.astype(str).str[:2].replace({"nan": "Unknown", "": "Unknown"})

def generalize_stage1(data: pd.DataFrame) -> pd.DataFrame:
    g = data.copy()
    lower = {c.lower(): c for c in g.columns}
    # YoB -> decade
    for key in ["year of birth", "ear of birth", "yob"]:
        if key in lower:
            g[lower[key]] = yob_to_decade(g[lower[key]])
            break
    # postal -> first 3
    if "postalcode" in lower:
        g[lower["postalcode"]] = postal3(g[lower["postalcode"]])
    # Level of education -> buckets
    if "level of education" in lower or "loe" in lower:
        loe_col = lower.get("level of education", lower.get("loe"))
        loe = g[loe_col].astype(str).str.lower()
        def map_loe(s):
            if any(k in s for k in ["less than", "primary", "elementary", "secondary or less", "middle", "high school", "secondary"]):
                return "≤Secondary"
            if "associate" in s:
                return "Associate"
            if "bachelor" in s or "college" in s:
                return "Bachelor"
            if any(k in s for k in ["master", "graduate", "professional"]):
                return "Master/Prof"
            if any(k in s for k in ["doctor", "phd", "doctoral"]):
                return "Doctoral"
            if any(k in s for k in ["na", "unknown", "nan", "none"]):
                return "Unknown"
            return s.title()
        g[loe_col] = loe.map(map_loe)
    # Forum counters -> binned
    count_like = [c for c in g.columns if ("number of " in c.lower()) or ("events within the forum" in c.lower()) or ("nforum" in c.lower())]
    for c in count_like:
        g[c] = bin_counts(g[c])
    return g

def generalize_stage2(data: pd.DataFrame) -> pd.DataFrame:
    g = generalize_stage1(data)
    lower = {c.lower(): c for c in g.columns}
    # YoB -> 20-year
    for key in ["year of birth", "ear of birth", "yob"]:
        if key in lower:
            g[lower[key]] = yob_to_20yr(g[lower[key]])
            break
    # postal -> first 2
    if "postalcode" in lower:
        g[lower["postalcode"]] = postal2(g[lower["postalcode"]])
    # city -> first letter
    if "city" in lower:
        g[lower["city"]] = g[lower["city"]].astype(str).str[:1].replace({"": "U", "nan": "U"})
    # Forum counters -> coarser
    count_like = [c for c in g.columns if ("number of " in c.lower()) or ("events within the forum" in c.lower()) or ("nforum" in c.lower())]
    for c in count_like:
        g[c] = coarser_bin_counts(g[c])
    return g



In [10]:

# Stage 1
g1 = generalize_stage1(df)
g1_qis = [c for c in g1.columns if c not in identifiers]
g1_k = k_anonymity_level(g1, g1_qis)
g1.to_csv(OUT_GEN_STAGE1, index=False)

# Stage 2
g2 = generalize_stage2(df)
g2_qis = [c for c in g2.columns if c not in identifiers]
g2_k = k_anonymity_level(g2, g2_qis)
g2.to_csv(OUT_GEN_STAGE2, index=False)

print({"Stage1_k": g1_k, "Stage2_k": g2_k, "files": [OUT_GEN_STAGE1, OUT_GEN_STAGE2]})

# Top-level '*' demo (not recommended for utility, but shows pure generalization can achieve k)
g3 = df.copy()
for c in g3.columns:
    if c not in identifiers:
        g3[c] = "*"
g3_qis = [c for c in g3.columns if c not in identifiers]
g3_k = k_anonymity_level(g3, g3_qis)
g3.to_csv(OUT_GEN_TOPSTAR, index=False)
print({"TopStar_k": g3_k, "file": OUT_GEN_TOPSTAR})

{'Stage1_k': 1, 'Stage2_k': 1, 'files': ['/Users/andrewrodriguez/Desktop/compsci1050/anonymity_and_ethics/de-identified_data/generalized_stage1.csv', '/Users/andrewrodriguez/Desktop/compsci1050/anonymity_and_ethics/de-identified_data/generalized_stage2.csv']}
{'TopStar_k': 199999, 'file': '/Users/andrewrodriguez/Desktop/compsci1050/anonymity_and_ethics/de-identified_data/generalized_stage3_topstar.csv'}


In [11]:

# Choose Stage 1 if it helps; otherwise Stage 2
base_gen = g1 if g1_k >= 5 else g2
base_qis = [c for c in base_gen.columns if c not in identifiers]
combo_df, combo_deleted = record_suppression_k(base_gen, base_qis, k=5)
combo_k = k_anonymity_level(combo_df, base_qis)
combo_df.to_csv(OUT_COMBINATION, index=False)

print({
    "rows_source": len(df),
    "deleted_rows": combo_deleted,
    "rows_kept": len(combo_df),
    "k_after": combo_k,
    "file": OUT_COMBINATION
})

{'rows_source': 199999, 'deleted_rows': 68207, 'rows_kept': 131792, 'k_after': 5, 'file': '/Users/andrewrodriguez/Desktop/compsci1050/anonymity_and_ethics/de-identified_data/combination.csv'}
