In [10]:
import json
from time import time
import numpy as np
import pandas as pd
from pathlib import Path
from dpmm.pipelines import MSTPipeline

# Generate example data

In [11]:
# Column specification for the demo table. Columns with "lower"/"upper"
# bounds are filled with values drawn uniformly between those bounds;
# columns with "categories" are filled by sampling from that list.
domain = {
    "datetime_col": {"lower": pd.to_datetime("01/01/2024", format="%d/%m/%Y"), "upper": pd.to_datetime("31/12/2025", format="%d/%m/%Y")},
    "timedelta_col": {"lower": pd.to_timedelta("0 days"), "upper": pd.to_timedelta("365 days")},
    "int_col": {"lower": -100, "upper": 100},
    "float_col": {"lower": 0.0, "upper": 1.0},
    "category_col": {"categories": list(range(10))}
}

NROWS = 10_000
# Fixed seed and a dedicated generator so that Restart & Run All rebuilds
# exactly the same random table instead of depending on global NumPy state.
SEED = 0
rng = np.random.default_rng(SEED)

df = pd.DataFrame(index=np.arange(NROWS))

for col, col_domain in domain.items():
    if "lower" in col_domain:
        # Rescale U(0, 1) draws onto the column's [lower, upper) range; the same
        # arithmetic works for numbers, Timestamps and Timedeltas.
        df[col] = rng.uniform(0, 1, size=NROWS) * (col_domain["upper"] - col_domain["lower"]) + col_domain["lower"]
        if col == "int_col":
            # The draws are floats; snap them to whole numbers for the integer column.
            df[col] = df[col].round().astype(int)
    else:
        # Sample with replacement from the allowed values, then store as a pandas categorical.
        df[col] = rng.choice(col_domain["categories"], replace=True, size=NROWS)
        df[col] = df[col].astype("category")


df.head()

Unnamed: 0,datetime_col,timedelta_col,int_col,float_col,category_col
0,2025-02-21 14:07:52.063903856,341 days 23:01:00.217958832,90,0.276895,7
1,2024-04-25 15:20:19.132008694,84 days 14:13:29.889610772,0,0.500879,8
2,2024-01-06 23:57:05.974915018,36 days 02:26:37.856744218,-75,0.89338,0
3,2024-10-23 11:49:33.916036136,350 days 23:38:25.245704796,69,0.670218,1
4,2024-11-05 10:11:17.239320760,161 days 02:05:59.745130012,-25,0.716258,6


# Build pipeline

In [None]:
# Configure the MST synthesis pipeline and its differential-privacy parameters.
pipeline = MSTPipeline(
    # Delta term of the (epsilon, delta) differential-privacy guarantee.
    delta=1e-5,
    # Privacy budget spent on the data-processing step.
    proc_epsilon=0.1,
    # Privacy budget spent on the generative model.
    epsilon=1,
)

# Fit step

In [13]:
# Show a transient status line; the trailing carriage return lets the
# final message overwrite it once fitting is done.
print("Fitting MST pipeline...", end="\r")

start_time = time()
# Fit the pipeline on the generated frame, using the column spec in `domain`.
pipeline.fit(df, domain)
fit_time = time()  # also read by the generation cell further down

fit_seconds = fit_time - start_time
print(f"Fitting MST pipeline - Took {fit_seconds:.2f} seconds")

Fitting MST pipeline - Took 6.12 seconds


# Generate step

In [14]:
print("Generating synthetic data...", end="\r")
# Start a timer local to this cell. Measuring from `fit_time` (set at the end
# of the fit cell) would also count any idle time between running the two
# cells and would make this cell depend on state left by the previous one.
gen_start = time()
# Generate as many synthetic rows as the source frame has.
synth_df = pipeline.generate(df.shape[0])
gen_time = time()

print(f"Generating synthetic data - Took {gen_time - gen_start:.2f} seconds")

synth_df.head()

Generating synthetic data - Took 0.04 seconds


Unnamed: 0,datetime_col,timedelta_col,int_col,float_col,category_col
0,2025-03-12 21:21:39.606684,100 days 19:34:00.043399812,-71,0.606442,2
1,2025-10-28 20:40:13.734340,288 days 08:14:00.947903175,85,0.063074,7
2,2024-09-22 19:48:13.521290,184 days 01:28:56.579526384,-13,0.153094,4
3,2024-08-06 23:22:44.034462,342 days 03:36:14.654889502,75,0.776069,6
4,2024-03-21 03:23:23.283484,44 days 15:49:22.174719843,-33,0.988548,5
