In [6]:
import json
from time import time
import pandas as pd
from pathlib import Path
from tempfile import TemporaryDirectory
from dpmm.pipelines import MSTPipeline

# Utility function

In [10]:
def get_size(folder_path):
    to_visit = [Path(folder_path)]
    total_size = 0
    while len(to_visit) > 0:
        current_folder = to_visit.pop(0)
        files = list(current_folder.glob("*"))
        
        for f in files:
            if f.is_dir():
                to_visit.append(f)
            else:
                total_size += f.lstat().st_size / (1028 ** 2) # Add size in MegaBytes

    return total_size

# Load data

In [11]:
wine_dir = Path().parent / "wine"

df = pd.read_pickle(wine_dir / "wine.pkl.gz")
with (wine_dir / "wine_bounds.json").open("r") as f:
    domain = json.load(f)


df.head()

Unnamed: 0,type,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
4192,white,6.3,0.21,0.29,11.7,0.048,49.0,147.0,0.99482,3.22,0.38,10.8,0
2157,white,5.9,0.21,0.24,12.1,0.044,53.0,165.0,0.9969,3.25,0.39,9.5,0
631,red,10.4,0.28,0.54,2.7,0.105,5.0,19.0,0.9988,3.25,0.63,9.5,0
3410,white,7.6,0.38,0.2,3.4,0.046,9.0,116.0,0.9944,3.15,0.41,9.4,0
3117,white,8.4,0.23,0.49,7.8,0.035,22.0,95.0,0.9935,3.04,0.34,12.0,1


# Build pipeline

In [None]:
pipeline = MSTPipeline(
    epsilon=1,  # Privacy budget for generate model
    proc_epsilon=0.1,  # Privacy budget for data processing 
    delta=1e-5,  # Delta Setting in (eps,  delta) differential privacy
)

# Fit step

In [13]:
print("Fitting MST pipeline...", end='\r')
start_time = time()
# The number of rows to generate
pipeline.fit(df, domain)
fit_time = time()

print(f"Fitting MST pipeline - Took {fit_time - start_time:.2f} seconds")

Fitting MST pipeline - Took 15.50 seconds


# Serialisation & Deserialisation

In [14]:

with TemporaryDirectory() as temp_dir:
    temp_dir = Path(temp_dir)
    print("Storing to {temp_dir} ...", end="\r")
    pipeline.store(temp_dir)
    store_time = time()
    model_size = get_size(temp_dir)
    print(f"Stored to {temp_dir} - {model_size:.02f} Mb - took {store_time - fit_time:.02f} seconds")

    print("Reloading Pipeline", end="\r")
    pipeline = MSTPipeline.load(temp_dir)
    reload_time = time()
    print(f"Pipeline Reloaded - took {reload_time - store_time:.02f} seconds")

Stored to /tmp/tmpuyngcmp0 - 2.52 Mb - took 0.04 seconds
Pipeline Reloaded - took 0.39 seconds


# Generation step

In [16]:
print("Generating synthetic data...", end="\r")
# Generate synthetic data
synth_df = pipeline.generate(df.shape[0])
gen_time = time()

print(f"Generating synthetic data - Took {gen_time - reload_time:.2f} seconds")

display(synth_df.head())

Generating synthetic data - Took 5.86 seconds


Unnamed: 0,type,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,white,6.431133,0.194061,0.363098,0.913746,0.035178,112.024583,99.812872,0.990381,3.305188,0.372627,11.468028,1
1,white,6.413428,0.216144,0.312437,1.379739,0.03657,34.424491,105.555298,0.99312,3.18418,0.552937,12.150351,1
2,white,7.777567,0.465012,0.256546,2.315051,0.112173,9.408577,49.778692,0.994643,3.262033,0.590493,12.1117,1
3,white,6.334822,0.325774,0.239294,0.930108,0.072153,34.84534,131.632663,0.996706,3.063701,0.622852,10.556665,1
4,red,11.717141,0.330618,0.209156,2.221701,0.153539,12.881318,7.360696,0.996591,3.155991,0.451228,11.431277,1
