In [1]:
import calendar
import numpy as np
import pandas as pd
import scipy as sp
from random_address import real_random_address, real_random_address_by_state

def sample_cdf(cdf, sample_size):
    def _helper(cdf, prob):
        for i, p in enumerate(cdf):
            if prob < p:
                return i
    return [_helper(cdf, prob) for prob in np.random.random(size=sample_size)]

def sample_list(list, col):
    return list.iloc[np.random.randint(0, len(list.values))][col]

class SimData:
    def __init__(self, population_size, curr_year):
        self.curr_year = curr_year
        self.population_size = population_size

        census_df = pd.read_csv("./qid/census.csv")
        occupations_df = pd.read_csv("./qid/occupations.csv")
        marital_statuses_df = pd.read_csv("./qid/marital.csv")
        ethnicities_df = pd.read_csv("./qid/ethnicity.csv")

        tot = census_df["men"]+census_df["woman"]
        age_pdf = tot / sum(tot)
        age_cdf = [sum(age_pdf[:i+1]) for i in range(len(age_pdf.values))]
        gender_p = census_df["men"] / tot

        self.ages = sample_cdf(age_cdf, self.population_size)
        self.genders = ["Male" if np.random.random() < gender_p[age] else "Female" for age in self.ages]

        self.bday = [curr_year-age for age in self.ages]

        addresses = [real_random_address_by_state('CA') for _ in range(population_size)]
        self.street = [add.get("address1","") for add in addresses]
        self.city = [add.get("city","") for add in addresses]
        self.state = [add.get("state","") for add in addresses]
        self.postalCode = [add.get("postalCode","") for add in addresses]

        self.occupations = [sample_list(occupations_df, "Occupations") for _ in range(population_size)]
        self.marital_statuses = [sample_list(marital_statuses_df, "Status") for _ in range(population_size)]
        self.ethnicities = [sample_list(ethnicities_df, "Ethnicity") for _ in range(population_size)]

        d = {
            "rid": [i for i in range(population_size)],
            "age": self.ages,
            "gender": self.genders,
            "birthday": self.bday,
            "street": self.street,
            "city": self.city,
            "state": self.state,
            "postalcode": self.postalCode,
            "occupation": self.occupations,
            "marital_status": self.marital_statuses,
            "ethnicity": self.ethnicities 
        }

        self.data = pd.DataFrame(data=d)

    def add_quasiid(self, idname, func):
        self.data[idname] = self.data.apply(lambda x: func(x), axis=1)

    def add_datum(self, data):
        for k, v in data.items():
            self.data[k] = v
        
    def trainset(self, sample_size, columns):
        self.cols = columns
        return self.data[columns].sample(sample_size, replace=True)

    def population(self):
        return self.data[self.cols]

class privacy:
    def __init__(self):
        pass

    def idr(self, syndata, population, qids, svs):
        pass

In [2]:
class BMI:
    def __init__(self):
        bmi_df = pd.read_csv("./data/bmi.csv")
        male = bmi_df[bmi_df["Gender"]=="Male"]
        male_bmi = male["Weight"] / (male["Height"]/100)**2

        female = bmi_df[bmi_df["Gender"]=="Female"]
        female_bmi = female["Weight"] / (female["Height"]/100)**2

        self.avg = {
            "Male": np.average(list(male_bmi.values)),
            "Female": np.average(list(female_bmi.values))
        }

        self.std = {
            "Male": np.std(list(male_bmi.values)),
            "Female": np.std(list(female_bmi.values))
        }

        print(self.avg)
        print(self.std)

    def get(self, gender):
        ep = np.random.randn()
        return round(self.avg[gender] + self.std[gender]*ep,0)

In [3]:
class Diabetes:
    def __init__(self):
        self.diabetes_df = pd.read_csv("./data/diabetes.csv")
        self.age = self.diabetes_df["Age"]
        self.bmi = self.diabetes_df["BMI"]
        print(max(self.age), min(self.age))

    def sample(self, age, bmi):
        dist = np.sqrt((self.age-age)**2 + (self.bmi-bmi)**2)
        return self.diabetes_df.iloc[dist.argmin()]
    
    def get_samples(self, data, cols):
        vs = [self.sample(r["age"], r["bmi"]) for _, r in data.iterrows()]
        return {
            c: [v[c] for v in vs]
            for c in cols
        }

In [4]:
simulated = SimData(3000, 2024)
bmi = BMI()
diabetes = Diabetes()

# Remove OOD ages
simulated.data = simulated.data.drop(simulated.data[simulated.data.age > 81].index)
simulated.data = simulated.data.drop(simulated.data[simulated.data.age < 21].index)

simulated.add_quasiid("bmi", lambda x: bmi.get(x["gender"]))
diabetes_dict = diabetes.get_samples(simulated.data, ["Pregnancies", "Glucose", "BloodPressure", "SkinThickness", "Insulin", "DiabetesPedigreeFunction"])
simulated.add_datum(diabetes_dict)

display(simulated.data.head(3))

{'Male': 38.15161387716939, 'Female': 37.394104198501026}
{'Male': 14.052211556576497, 'Female': 13.844188544367285}
81 21


Unnamed: 0,rid,age,gender,birthday,street,city,state,postalcode,occupation,marital_status,ethnicity,bmi,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,DiabetesPedigreeFunction
0,0,50,Male,1974,775 47th Street,Oakland,CA,94609,Radio Operator,single,Arapaho,36.0,11.0,138.0,74.0,26.0,144.0,0.557
2,2,63,Female,1961,6716 South Mariposa Lane,Dublin,CA,94568,Insurance Staff,widowed,Honduran,35.0,2.0,197.0,70.0,99.0,0.0,0.575
3,3,50,Female,1974,1797 Pasatiempo Drive,Chico,CA,95928,Farmer,not married,Venezuelan,27.0,6.0,125.0,78.0,31.0,0.0,0.565


In [5]:
from sdv.metadata import Metadata
from sdv.single_table import CTGANSynthesizer

data = simulated.trainset(
    1000, 
    [
        "age", 
        "gender", 
        "city", 
        "marital_status", 
        "bmi", 
        "Pregnancies", 
        "Glucose", 
        "BloodPressure", 
        "SkinThickness", 
        "Insulin", 
        "DiabetesPedigreeFunction"
    ]
)

metadata = Metadata.detect_from_dataframe(
    data=data,
    table_name='modified_diabetes')

synthesizer = CTGANSynthesizer(
    metadata,
    epochs=500
)
synthesizer.fit(data)

synthetic_data = synthesizer.sample(num_rows=1000)

  return torch._C._cuda_getDeviceCount() > 0


In [6]:
display(synthetic_data)

Unnamed: 0,age,gender,city,marital_status,bmi,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,DiabetesPedigreeFunction
0,66,Male,Masonhaven,divorced,63.0,0.0,159.0,122.0,0.0,17.0,0.476
1,34,Male,Smithfurt,separated,78.0,7.0,136.0,79.0,39.0,1.0,0.741
2,54,Female,Toddville,separated,56.0,0.0,104.0,75.0,0.0,4.0,0.149
3,70,Male,Robertfurt,widowed,55.0,11.0,90.0,78.0,0.0,0.0,0.292
4,24,Female,Port Luisfort,single,47.0,0.0,161.0,71.0,43.0,16.0,0.306
...,...,...,...,...,...,...,...,...,...,...,...
995,31,Male,West Leslie,single,2.0,6.0,173.0,70.0,0.0,16.0,1.126
996,56,Male,Port James,divorced,58.0,2.0,98.0,75.0,1.0,17.0,0.702
997,40,Male,East Davidhaven,separated,51.0,2.0,156.0,10.0,0.0,389.0,0.078
998,36,Female,Lake Scott,separated,37.0,13.0,158.0,85.0,54.0,306.0,0.530


In [7]:
from sdv.evaluation.single_table import evaluate_quality

population = simulated.population()

quality_report = evaluate_quality(
    real_data=population,
    synthetic_data=synthetic_data,
    metadata=metadata)

quality_report = evaluate_quality(
    real_data=data,
    synthetic_data=synthetic_data,
    metadata=metadata)

Generating report ...

(1/2) Evaluating Column Shapes: |██████████| 11/11 [00:00<00:00, 337.49it/s]|
Column Shapes Score: 84.33%

(2/2) Evaluating Column Pair Trends: |██████████| 55/55 [00:00<00:00, 376.64it/s]|
Column Pair Trends Score: 87.42%

Overall Score (Average): 85.87%

Generating report ...

(1/2) Evaluating Column Shapes: |██████████| 11/11 [00:00<00:00, 1560.54it/s]|
Column Shapes Score: 84.18%

(2/2) Evaluating Column Pair Trends: |██████████| 55/55 [00:00<00:00, 393.39it/s]|
Column Pair Trends Score: 86.9%

Overall Score (Average): 85.54%



In [47]:
from collections import Counter
from sklearn.metrics.pairwise import cosine_similarity
from scipy.spatial.distance import cdist

# Quasi ID
QI = [
        "gender", 
        "marital_status", 
        "age",
        "bmi",
        "Pregnancies"
]

# Quasi ID + Sensitive variables
qi_sv = [
        "gender", 
        "marital_status", 
        "age",
        "bmi",
        "Pregnancies"
]


# population or data
df_baseline = population # Real Dataset
df_synthetic_baseline = synthetic_data # Synthetic Dataset

def combinations(df):
    col = QI + ['key', 'count']
    ldf = df[QI].values.tolist()
    ldf = ["_".join([str((v)) for v in r]) for r in ldf]
    cldf = Counter(ldf)
    ldf = [k.split("_") + [k, v] for k, v in list(cldf.items())]
    fdf = pd.DataFrame(ldf, columns=col).sort_values(by=['count'], ascending=False)
    return fdf

rdf = combinations(df_baseline)
sdf = combinations(df_synthetic_baseline)

sd = set(sdf['key'].tolist())
rd = set(rdf['key'].tolist())

sd_dict = {}
for v in sd:
    v = v.split("_")
    k = v[0]+"_"+v[1]
    if k not in sd_dict:
        sd_dict[k] = []
    sd_dict[k].append([int(v[2]), float(v[3]), float(v[4])])

rd_dict = {}
for v in rd:
    v = v.split("_")
    k = v[0]+"_"+v[1]
    if k not in rd_dict:
        rd_dict[k] = []
    rd_dict[k].append([int(v[2]), float(v[3]), float(v[4])])

test_u = []
for k,v in sd_dict.items():
    if k in rd_dict:
        w = rd_dict[k]
        
        v = np.array(v)
        w = np.array(w)

        d = cdist(w,v)
        d = np.where(d <= 1, 1, 0)
        w_add = d.sum(axis=1)
        v_add = d.sum(axis=0)
        
        for i, c in enumerate(w_add):
            if c > 0:
                n = k + "_" + str(int(w[i][0])) + "_" + str(w[i][1]) + "_" + str((w[i][2]))
                test_u.append(n)
        
        for i, c in enumerate(v_add):
            if c > 0:
                n = k + "_" + str(int(v[i][0])) + "_" + str(v[i][1]) + "_" + str((v[i][2]))
                test_u.append(n)

test_u = set(test_u)
u = set.intersection(sd,rd)
print(u)
print(test_u)

ss = sdf.sum()
rs = rdf.sum()

def add(df):
    c = df.copy()
    ldf = df[qi_sv].values.tolist()
    ldf = ["_".join([str(v) for v in r]) in u for r in ldf] # in u or test_u
    c['ls_rs'] = np.where(ldf, 1, 0)

    ldf = df[QI].values.tolist()
    ldf = ["_".join([str(v) for v in r]) for r in ldf]
    cldf = Counter(ldf)
    c['QI'] = ldf
    c['QI_count'] = [cldf[r] for r in c['QI']]
    return c

rdf = add(df_baseline)
sdf = add(df_synthetic_baseline)

sr = (1/sdf['QI_count']).mul(sdf['ls_rs']/rs['count']).sum()
rr = (1/rdf['QI_count']).mul(rdf['ls_rs']/ss['count']).sum()

print(sr,rr)

{'Male_widowed_23_40.0_1.0', 'Male_widowed_58_60.0_0.0', 'Male_living common-Law_26_48.0_0.0', 'Male_widowed_37_44.0_8.0'}
{'Male_separated_81_36.0_9.0', 'Female_separated_43_38.0_8.0', 'Female_widowed_67_45.0_5.0', 'Male_living common-Law_41_14.0_3.0', 'Male_living common-Law_27_60.0_0.0', 'Male_not married_21_49.0_0.0', 'Male_not married_27_35.0_3.0', 'Male_married_36_58.0_11.0', 'Male_widowed_30_47.0_0.0', 'Male_living common-Law_44_26.0_10.0', 'Female_married_65_31.0_1.0', 'Female_separated_43_37.0_8.0', 'Male_living common-Law_45_26.0_10.0', 'Female_married_56_47.0_0.0', 'Male_widowed_23_40.0_1.0', 'Male_not married_21_48.0_0.0', 'Male_not married_25_43.0_0.0', 'Male_not married_26_43.0_0.0', 'Male_married_53_60.0_0.0', 'Female_divorced_68_40.0_5.0', 'Male_married_53_59.0_0.0', 'Male_widowed_66_49.0_0.0', 'Male_living common-Law_27_59.0_0.0', 'Male_separated_81_36.0_8.0', 'Female_separated_26_43.0_0.0', 'Male_not married_27_35.0_2.0', 'Female_married_38_51.0_6.0', 'Female_single_6

In [9]:
from privacy import syntheticCombinations
cdata = syntheticCombinations(df_synthetic_baseline, combinations(df_baseline))

def risk(combinations, threshold):
    copy = combinations[['key','count','syn_count']].copy()
    copy['include'] = np.where((copy['count'] >= threshold) & (copy['syn_count'] >= threshold), 1, 0)
    ss = int(copy['syn_count'].mul(copy['syn_count'] >= threshold).sum())
    rs = int(copy['count'].mul(copy['count'] >= threshold).sum())
    sr = (1/copy['syn_count']).mul(copy['include']/ss).sum()
    rr = (1/copy['count']).mul(copy['include']/rs).sum()
    return sr, rr


print(risk(cdata, 5))

Unnamed: 0,age,gender,city,marital_status,bmi,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,DiabetesPedigreeFunction,key,syn_count
0,66,Male,Masonhaven,divorced,63.0,0.0,159.0,122.0,0.0,17.0,0.476,66_Male_Masonhaven_divorced_63.0_0.0_159.0_122...,1
671,21,Male,Alexanderside,living common-Law,47.0,7.0,149.0,0.0,0.0,0.0,0.29,21_Male_Alexanderside_living common-Law_47.0_7...,1
658,80,Female,East Douglasside,divorced,78.0,1.0,196.0,114.0,0.0,13.0,0.727,80_Female_East Douglasside_divorced_78.0_1.0_1...,1
659,74,Male,Fernandezborough,widowed,58.0,16.0,129.0,91.0,6.0,20.0,0.167,74_Male_Fernandezborough_widowed_58.0_16.0_129...,1
660,26,Male,Amandafort,divorced,48.0,12.0,155.0,4.0,41.0,12.0,1.372,26_Male_Amandafort_divorced_48.0_12.0_155.0_4....,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...
338,24,Male,South Amberborough,living common-Law,78.0,5.0,147.0,95.0,2.0,21.0,1.099,24_Male_South Amberborough_living common-Law_7...,1
339,37,Male,North Thomas,widowed,51.0,0.0,129.0,82.0,1.0,177.0,0.434,37_Male_North Thomas_widowed_51.0_0.0_129.0_82...,1
340,53,Female,Hallmouth,not married,41.0,10.0,169.0,0.0,19.0,0.0,0.306,53_Female_Hallmouth_not married_41.0_10.0_169....,1
341,48,Female,Proctorburgh,not married,64.0,4.0,156.0,67.0,13.0,155.0,0.138,48_Female_Proctorburgh_not married_64.0_4.0_15...,1


Unnamed: 0,gender,marital_status,age,bmi,key,count
1079,Male,single,28,33.0,Male_single_28_33.0,2
1041,Female,single,46,31.0,Female_single_46_31.0,2
298,Male,divorced,37,31.0,Male_divorced_37_31.0,2
289,Female,widowed,22,44.0,Female_widowed_22_44.0,2
264,Female,married,49,35.0,Female_married_49_35.0,2
...,...,...,...,...,...,...
732,Male,living common-Law,46,39.0,Male_living common-Law_46_39.0,1
731,Male,married,36,38.0,Male_married_36_38.0,1
730,Male,single,67,29.0,Male_single_67_29.0,1
729,Female,married,71,40.0,Female_married_71_40.0,1


(0.0, 0.0)
