In [1]:
import calendar
import numpy as np
import pandas as pd
import scipy as sp
from random_address import real_random_address, real_random_address_by_state

def sample_cdf(cdf, sample_size):
    """Draw bucket indices by inverse-transform sampling from a discrete CDF.

    Args:
        cdf: Non-decreasing sequence of cumulative probabilities, one per bucket.
        sample_size: Number of indices to draw.

    Returns:
        A list of ``sample_size`` Python ints. Each one is the first index ``i``
        for which the uniform draw is strictly below ``cdf[i]``.

    Raises:
        ValueError: If ``cdf`` is empty.
    """
    cdf_values = np.asarray(cdf, dtype=float)
    if cdf_values.size == 0:
        raise ValueError("cdf must contain at least one value")

    draws = np.random.random(size=sample_size)
    # side="right" yields the first index whose CDF value is strictly greater
    # than the draw, i.e. the same bucket a linear `prob < p` scan would pick.
    indices = np.searchsorted(cdf_values, draws, side="right")
    # A CDF whose last entry is slightly below 1.0 (float round-off or an
    # unnormalised input) would otherwise leave some draws without a bucket;
    # those draws belong to the final bucket.
    indices = np.minimum(indices, cdf_values.size - 1)
    return indices.tolist()

def sample_list(list, col):
    """Return the value of column ``col`` from one uniformly chosen row.

    Args:
        list: A pandas DataFrame to pick a row from (the name shadows the
            builtin but is kept for callers).
        col: Label of the column to read from the chosen row.
    """
    row_count = len(list.values)
    # randint(n) draws from [0, n), i.e. a valid positional row index.
    chosen = np.random.randint(row_count)
    picked_row = list.iloc[chosen]
    return picked_row[col]

class SimData:
    """Simulated population table assembled from census and lookup CSV files.

    The constructor samples one row per simulated person and stores the result
    in ``self.data`` (a pandas DataFrame). Extra columns can be attached with
    ``add_quasiid`` / ``add_datum``; ``trainset`` draws a training sample and
    ``population`` returns the matching view of the full table.
    """

    def __init__(self, population_size, curr_year):
        """Build the simulated population.

        Args:
            population_size: Number of simulated people (rows) to generate.
            curr_year: Reference year; the "birthday" column is
                ``curr_year - age``.
        """
        self.curr_year = curr_year
        self.population_size = population_size
        # Columns chosen by the most recent trainset() call; None until then,
        # in which case population() returns every column.
        self.cols = None

        census_df = pd.read_csv("./qid/census.csv")
        occupations_df = pd.read_csv("./qid/occupations.csv")
        marital_statuses_df = pd.read_csv("./qid/marital.csv")
        ethnicities_df = pd.read_csv("./qid/ethnicity.csv")

        # Per-row head count, normalised into a probability mass function and
        # accumulated into a CDF in a single pass.
        tot = census_df["men"] + census_df["woman"]
        age_pdf = tot / tot.sum()
        age_cdf = age_pdf.cumsum().tolist()
        if age_cdf:
            # Float round-off can leave the final value just under 1.0; pin it
            # so every uniform draw in [0, 1) lands in some bucket.
            age_cdf[-1] = 1.0
        gender_p = census_df["men"] / tot

        # NOTE(review): the sampled value is the position of a census row and
        # is used directly as the age - presumably row i of census.csv
        # describes age i; confirm against the file.
        self.ages = sample_cdf(age_cdf, self.population_size)
        self.genders = [
            "Male" if np.random.random() < gender_p.iloc[age] else "Female"
            for age in self.ages
        ]

        # Despite the name this holds a birth year, not a full date.
        self.bday = [curr_year - age for age in self.ages]

        addresses = [real_random_address_by_state('CA') for _ in range(population_size)]
        self.street = [add.get("address1", "") for add in addresses]
        self.city = [add.get("city", "") for add in addresses]
        self.state = [add.get("state", "") for add in addresses]
        self.postalCode = [add.get("postalCode", "") for add in addresses]

        self.occupations = [sample_list(occupations_df, "Occupations") for _ in range(population_size)]
        self.marital_statuses = [sample_list(marital_statuses_df, "Status") for _ in range(population_size)]
        self.ethnicities = [sample_list(ethnicities_df, "Ethnicity") for _ in range(population_size)]

        d = {
            "rid": list(range(population_size)),
            "age": self.ages,
            "gender": self.genders,
            "birthday": self.bday,
            "street": self.street,
            "city": self.city,
            "state": self.state,
            "postalcode": self.postalCode,
            "occupation": self.occupations,
            "marital_status": self.marital_statuses,
            "ethnicity": self.ethnicities
        }

        self.data = pd.DataFrame(data=d)

    def add_quasiid(self, idname, func):
        """Add column ``idname`` computed by calling ``func`` on every row."""
        self.data[idname] = self.data.apply(func, axis=1)

    def add_datum(self, data):
        """Add one column per ``data`` item (key is the column name, value is the column values)."""
        for k, v in data.items():
            self.data[k] = v

    def trainset(self, sample_size, columns):
        """Return ``sample_size`` rows of ``columns``, drawn with replacement.

        The column selection is remembered so that ``population()`` returns
        the same columns.
        """
        self.cols = columns
        return self.data[columns].sample(sample_size, replace=True)

    def population(self):
        """Return the full table restricted to the last ``trainset`` columns.

        If ``trainset`` has not been called yet, a copy of the table with all
        columns is returned instead of failing on a missing attribute.
        """
        cols = getattr(self, "cols", None)
        if cols is None:
            return self.data.copy()
        return self.data[cols]

class privacy:
    """Placeholder for privacy metrics; nothing is implemented yet."""

    def __init__(self):
        """No state is set up."""
        return None

    def idr(self, syndata, population, qids, svs):
        """Not implemented: accepts the four arguments and returns None."""
        return None

In [2]:
class BMI:
    """Per-gender BMI sampler fitted from ``./data/bmi.csv``.

    The constructor computes the mean and (population, ddof=0) standard
    deviation of Weight / (Height / 100) ** 2 for the "Male" and "Female"
    rows, stores them in ``self.avg`` / ``self.std`` and prints both dicts.
    """

    def __init__(self):
        records = pd.read_csv("./data/bmi.csv")

        self.avg = {}
        self.std = {}
        for label in ("Male", "Female"):
            subset = records[records["Gender"] == label]
            # Height is divided by 100 before squaring (presumably cm -> m).
            index_values = list((subset["Weight"] / (subset["Height"]/100)**2).values)
            self.avg[label] = np.average(index_values)
            self.std[label] = np.std(index_values)

        print(self.avg)
        print(self.std)

    def get(self, gender):
        """Draw one value from a normal distribution with the stored mean/std for ``gender``."""
        noise = np.random.randn()
        centre = self.avg[gender]
        spread = self.std[gender]
        return centre + spread*noise

In [3]:
class Diabetes:
    """Nearest-neighbour lookup into ``./data/diabetes.csv`` by age and BMI."""

    def __init__(self):
        """Load the CSV, keep its "Age" and "BMI" columns and print max/min age."""
        self.diabetes_df = pd.read_csv("./data/diabetes.csv")
        self.age = self.diabetes_df["Age"]
        self.bmi = self.diabetes_df["BMI"]
        oldest = max(self.age)
        youngest = min(self.age)
        print(oldest, youngest)

    def sample(self, age, bmi):
        """Return the dataset row closest to ``(age, bmi)``.

        Closeness is the plain Euclidean distance over the two unscaled
        columns; the first row with the smallest distance is returned.
        """
        age_gap = self.age - age
        bmi_gap = self.bmi - bmi
        distances = np.sqrt(age_gap**2 + bmi_gap**2)
        nearest = distances.argmin()
        return self.diabetes_df.iloc[nearest]

    def get_samples(self, data, cols):
        """Look up the nearest dataset row for every row of ``data``.

        Args:
            data: DataFrame with "age" and "bmi" columns.
            cols: Dataset column names to extract from each matched row.

        Returns:
            Dict mapping each name in ``cols`` to a list with one value per
            row of ``data``, in row order.
        """
        matches = []
        for _, person in data.iterrows():
            matches.append(self.sample(person["age"], person["bmi"]))

        picked = {}
        for column in cols:
            picked[column] = [match[column] for match in matches]
        return picked

In [4]:
# Build a 3000-person population with 2024 as the reference year, plus the
# BMI sampler and the diabetes lookup table.
simulated = SimData(3000, 2024)
bmi = BMI()
diabetes = Diabetes()

# Remove OOD ages: keep only 21..81, the age range of the diabetes data
# (this cell's saved output prints "81 21" from Diabetes()).
too_old = simulated.data.age > 81
too_young = simulated.data.age < 21
simulated.data = simulated.data.drop(index=simulated.data.index[too_old | too_young])

# Attach a sampled BMI per person, then copy clinical columns from the
# nearest (age, bmi) row of the diabetes data.
simulated.add_quasiid("bmi", lambda x: bmi.get(x["gender"]))
DIABETES_COLUMNS = [
    "Pregnancies",
    "Glucose",
    "BloodPressure",
    "SkinThickness",
    "Insulin",
    "DiabetesPedigreeFunction",
]
diabetes_dict = diabetes.get_samples(simulated.data, DIABETES_COLUMNS)
simulated.add_datum(diabetes_dict)

display(simulated.data.head(3))

{'Male': 38.15161387716939, 'Female': 37.394104198501026}
{'Male': 14.052211556576497, 'Female': 13.844188544367285}
81 21


Unnamed: 0,rid,age,gender,birthday,street,city,state,postalcode,occupation,marital_status,ethnicity,bmi,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,DiabetesPedigreeFunction
1,1,36,Female,1988,11150 Sun Valley Drive,Oakland,CA,94605,Accountant,divorced,Saudi Arabian,31.938954,8.0,179.0,72.0,42.0,130.0,0.719
2,2,39,Male,1985,5110 East Shoshone Avenue,Orange,CA,92867,Shipping Officer,married,Paraguayan,18.589619,3.0,150.0,76.0,0.0,0.0,0.207
3,3,47,Male,1977,2064 West Columbia Way,Hanford,CA,93230,Ornithologist,divorced,Three Affiliated Tribes of North Dakota,20.626611,9.0,120.0,72.0,22.0,56.0,0.733


In [5]:
from sdv.metadata import Metadata
from sdv.single_table import CTGANSynthesizer

# Columns used for training; trainset() also remembers them so that
# simulated.population() later returns the same columns.
TRAIN_COLUMNS = [
    "age",
    "gender",
    "city",
    "marital_status",
    "bmi",
    "Pregnancies",
    "Glucose",
    "BloodPressure",
    "SkinThickness",
    "Insulin",
    "DiabetesPedigreeFunction",
]

# 1000 training rows, drawn with replacement from the simulated population.
data = simulated.trainset(1000, TRAIN_COLUMNS)

# NOTE(review): the synthetic "city" values in the saved output do not look
# like the training cities - presumably metadata detection did not treat the
# column as categorical; confirm the detected sdtype.
metadata = Metadata.detect_from_dataframe(data=data, table_name='modified_diabetes')

synthesizer = CTGANSynthesizer(metadata, epochs=500)
synthesizer.fit(data)

synthetic_data = synthesizer.sample(num_rows=1000)

  return torch._C._cuda_getDeviceCount() > 0


In [6]:
# Render the synthetic table sampled from the fitted synthesizer.
display(synthetic_data)

Unnamed: 0,age,gender,city,marital_status,bmi,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,DiabetesPedigreeFunction
0,81,Female,Warrenmouth,married,51.890060,15.0,144.0,71.0,1.0,172.0,0.084
1,50,Female,Andreaside,widowed,32.252331,1.0,144.0,79.0,26.0,14.0,0.416
2,56,Female,Port Brittany,widowed,14.094233,2.0,132.0,90.0,50.0,21.0,1.237
3,81,Female,Mcclurestad,divorced,16.819039,1.0,164.0,80.0,2.0,0.0,0.757
4,58,Male,Brookefort,single,43.440536,1.0,96.0,69.0,21.0,0.0,0.095
...,...,...,...,...,...,...,...,...,...,...,...
995,25,Female,Ronaldtown,single,29.930918,3.0,124.0,70.0,4.0,10.0,0.874
996,46,Female,North Julie,married,59.053656,11.0,159.0,98.0,0.0,264.0,0.549
997,56,Female,North Heather,widowed,28.762278,0.0,187.0,48.0,41.0,207.0,0.484
998,68,Male,Port Michaelburgh,married,46.678737,1.0,82.0,74.0,0.0,109.0,0.084


In [9]:
from sdv.evaluation.single_table import evaluate_quality

# Full simulated table, restricted to the columns chosen for training.
population = simulated.population()

# Report 1: synthetic data vs. the whole simulated population. It gets its
# own name so the second evaluation does not overwrite it.
quality_report_population = evaluate_quality(
    real_data=population,
    synthetic_data=synthetic_data,
    metadata=metadata)

# Report 2: synthetic data vs. the sample the synthesizer was trained on.
# `quality_report` keeps referring to this report, as before.
quality_report = evaluate_quality(
    real_data=data,
    synthetic_data=synthetic_data,
    metadata=metadata)

Generating report ...

(1/2) Evaluating Column Shapes: |██████████| 11/11 [00:00<00:00, 284.52it/s]|
Column Shapes Score: 81.25%

(2/2) Evaluating Column Pair Trends: |██████████| 55/55 [00:00<00:00, 386.65it/s]|
Column Pair Trends Score: 82.25%

Overall Score (Average): 81.75%

Generating report ...

(1/2) Evaluating Column Shapes: |██████████| 11/11 [00:00<00:00, 1530.31it/s]|
Column Shapes Score: 80.78%

(2/2) Evaluating Column Pair Trends: |██████████| 55/55 [00:00<00:00, 399.09it/s]|
Column Pair Trends Score: 83.8%

Overall Score (Average): 82.29%



In [26]:
from collections import Counter

# Quasi ID: columns whose combined values form the matching key.
QI = ["gender", "marital_status"]

# Quasi ID + Sensitive variables.
# NOTE(review): currently identical to QI - no sensitive variable is listed.
qi_sv = ["gender", "marital_status"]

# Real dataset (simulated population) and synthetic dataset under comparison.
df_baseline = population
df_synthetic_baseline = synthetic_data

def combinations(df, qi=None):
    """Count how often each combination of quasi-identifier values occurs.

    Args:
        df: DataFrame containing the quasi-identifier columns.
        qi: Quasi-identifier column names; defaults to the module-level ``QI``.

    Returns:
        DataFrame with one row per distinct combination: the quasi-identifier
        values (as strings), ``key`` (the values joined with "_") and
        ``count``, sorted by ``count`` in descending order.
    """
    if qi is None:
        qi = QI
    qi = list(qi)

    # Count on tuples of the stringified values. Rebuilding the values by
    # splitting the joined key on "_" breaks as soon as a value itself
    # contains an underscore (e.g. "never_married").
    records = df[qi].values.tolist()
    tally = Counter(tuple(str(v) for v in record) for record in records)

    rows = [list(combo) + ["_".join(combo), n] for combo, n in tally.items()]
    fdf = pd.DataFrame(rows, columns=qi + ['key', 'count'])
    return fdf.sort_values(by=['count'], ascending=False)

# Combination counts for the real (rdf) and synthetic (sdf) datasets.
rdf = combinations(df_baseline)
sdf = combinations(df_synthetic_baseline)

# Keys present in each dataset, and the keys both have in common.
sd = set(sdf['key'].tolist())
rd = set(rdf['key'].tolist())
u = sd & rd

# Only the 'count' totals are read later (ss['count'] / rs['count']), so sum
# just that column instead of reducing the string columns as well.
ss = sdf[['count']].sum()
rs = rdf[['count']].sum()

def add(df):
    """Return a copy of ``df`` with matching columns added.

    Added columns:
        ls_rs: 1 when the row's ``qi_sv`` values, joined with "_", are in the
            module-level set ``u``; otherwise 0.
        QI: the row's ``QI`` values joined with "_".
        QI_count: number of rows in ``df`` that share the same ``QI`` key.

    NOTE(review): ``u`` is built from ``QI`` keys elsewhere in this cell, so
    the ``ls_rs`` test can only match while ``qi_sv`` and ``QI`` list the same
    columns - confirm that is intended.
    """
    def _joined_keys(columns):
        # One "_"-joined string per row, built from the stringified values.
        return ["_".join([str(value) for value in record])
                for record in df[columns].values.tolist()]

    annotated = df.copy()

    shared_flags = [key in u for key in _joined_keys(qi_sv)]
    annotated['ls_rs'] = np.where(shared_flags, 1, 0)

    qi_keys = _joined_keys(QI)
    tally = Counter(qi_keys)
    annotated['QI'] = qi_keys
    annotated['QI_count'] = [tally[key] for key in qi_keys]
    return annotated

# From here on rdf / sdf are the row-level frames annotated by add(); rs / ss
# still hold the totals computed earlier from the combination tables.
rdf = add(df_baseline)
sdf = add(df_synthetic_baseline)

# sr: over synthetic rows, sum of (1 / QI_count) * (ls_rs / total real count).
sr = (
    (1/sdf['QI_count']) * (sdf['ls_rs']/rs['count'])
).sum()
# rr: over real rows, sum of (1 / QI_count) * (ls_rs / total synthetic count).
rr = (
    (1/rdf['QI_count']) * (rdf['ls_rs']/ss['count'])
).sum()

print(sr,rr)

0.00641319285387082 0.014
