In [18]:
from complex_pipeline import *
from gen_single_report import gen_single_report
from report_utils import get_multi_metadata

### Load Configs
A JSON config file controls the hyperparameters for ClavaDDPM.

In [19]:
# Path to the JSON config for this experiment (MovieLens).
config_path = 'configs/movie_lens.json'
# load_configs returns the parsed config dict and a directory path that the
# later pipeline stages receive as `save_dir`.
configs, save_dir = load_configs(config_path)
# Display the loaded configuration as the cell output.
configs

{'general': {'data_dir': 'complex_data/movie_lens/preprocessed/train',
  'exp_name': 'movie_lens_train',
  'workspace_dir': 'clavaDDPM_workspace/movie_lens',
  'sample_prefix': '',
  'test_data_dir': 'complex_data/movie_lens/preprocessed/test'},
 'clustering': {'parent_scale': 1.0,
  'num_clusters': 50,
  'clustering_method': 'both'},
 'diffusion': {'d_layers': [512, 1024, 1024, 1024, 1024, 512],
  'dropout': 0.0,
  'num_timesteps': 2000,
  'model_type': 'mlp',
  'iterations': 200000,
  'batch_size': 4096,
  'lr': 0.0006,
  'gaussian_loss_type': 'mse',
  'weight_decay': 1e-05,
  'scheduler': 'cosine'},
 'classifier': {'d_layers': [128, 256, 512, 1024, 512, 256, 128],
  'lr': 0.0001,
  'dim_t': 128,
  'batch_size': 4096,
  'iterations': 20000},
 'sampling': {'batch_size': 20000, 'classifier_scale': 1.0},
 'matching': {'num_matching_clusters': 1,
  'matching_batch_size': 1000,
  'unique_matching': True,
  'no_matching': False}}

### Load Data
Datasets must already be stored on disk, and their locations must be specified in the config file (`data_dir` and `test_data_dir` under `general`).

In [20]:
# Load every table of the training split from the directory named in the config.
# Returns the per-table dict, the parent/child relation order, and dataset-level
# metadata (`dataset_meta` is not used again in the visible cells).
tables, relation_order, dataset_meta = load_multi_table(configs['general']['data_dir'])

actor (81895, 2) (81895, 2)
Numerical (81895, 1)
Categorical (81895, 1)
Processing actor Successfully!
actor
Total 81895
Train 81895
Num 1
Cat 1
director (2041, 2) (2041, 2)
Numerical (2041, 2)
Categorical (2041, 0)
Processing director Successfully!
director
Total 2041
Train 2041
Num 2
Cat 0
user (6039, 3) (6039, 3)
Numerical (6039, 1)
Categorical (6039, 2)
Processing user Successfully!
user
Total 6039
Train 6039
Num 1
Cat 2
movie (3650, 4) (3650, 4)
Numerical (3650, 2)
Categorical (3650, 2)
Processing movie Successfully!
movie
Total 3650
Train 3650
Num 2
Cat 2
movie2actor (134122, 1) (134122, 1)
Numerical (134122, 1)
Categorical (134122, 0)
Processing movie2actor Successfully!
movie2actor
Total 134122
Train 134122
Num 1
Cat 0
movie2director (3943, 1) (3943, 1)
Numerical (3943, 0)
Categorical (3943, 1)
Processing movie2director Successfully!
movie2director
Total 3943
Train 3943
Num 0
Cat 1
rating (896543, 1) (896543, 1)
Numerical (896543, 1)
Categorical (896543, 0)
Processing rating Su

In [21]:
# `tables` is a dictionary keyed by table name, one entry per table
# of the multi-table dataset.
tables.keys()

dict_keys(['actor', 'director', 'user', 'movie', 'movie2actor', 'movie2director', 'rating'])

In [22]:
# Each entry in `tables` is itself a dictionary holding that table's data and
# metadata; 'actor' is inspected here as an example.
tables['actor'].keys()

dict_keys(['df', 'domain', 'children', 'parents', 'original_cols', 'original_df', 'info'])

In [23]:
# `relation_order` is the topological order of the multi-table dataset, given as
# [parent, child] pairs; a parent of None marks a table that has no parent.
relation_order

[[None, 'actor'],
 [None, 'director'],
 [None, 'movie'],
 [None, 'user'],
 ['actor', 'movie2actor'],
 ['movie', 'movie2actor'],
 ['movie', 'movie2director'],
 ['director', 'movie2director'],
 ['movie', 'rating'],
 ['user', 'rating']]

In [None]:
# Build a metadata object from the tables and their relation order, then render it.
# NOTE(review): this cell has no execution count in the saved notebook, so it was
# not run in the recorded session; `multi_meta` is not used by later visible cells.
multi_meta = get_multi_metadata(tables, relation_order)
multi_meta.visualize()

### Clustering

In [24]:
# Performs clustering on the multi-table dataset.
# Rebinds `tables` to the augmented tables returned by the call, and also
# returns the group size distributions used later during synthesizing.
# NOTE: because `tables` is overwritten here, cells below see the augmented
# version rather than the freshly loaded one.
tables, all_group_lengths_prob_dicts = clava_clustering(tables, relation_order, save_dir, configs)

Clustering checkpoint found, loading...


### Training
Trains all the diffusion models and classifiers.

In [27]:
# Train the models for every [parent, child] pair in `relation_order`.
# Rebinds `tables` and returns `models`, which the synthesizing step consumes.
tables, models = clava_training(tables, relation_order, save_dir, configs)

None -> actor checkpoint found, loading...
None -> director checkpoint found, loading...
None -> movie checkpoint found, loading...
None -> user checkpoint found, loading...
actor -> movie2actor checkpoint found, loading...
movie -> movie2actor checkpoint found, loading...
movie -> movie2director checkpoint found, loading...
director -> movie2director checkpoint found, loading...
movie -> rating checkpoint found, loading...
user -> rating checkpoint found, loading...


### Synthesizing

In [30]:
# Produce the synthetic tables from the trained models and the group size
# distributions. The call also returns the time spent on synthesizing and on
# matching. The sample scale comes from the optional 'debug' config section
# and falls back to 1 when that section is absent.
cleaned_tables, synthesizing_time_spent, matching_time_spent = clava_synthesizing(
    tables,
    relation_order,
    save_dir,
    all_group_lengths_prob_dicts,
    models,
    configs,
    sample_scale=configs['debug']['sample_scale'] if 'debug' in configs else 1,
)

Synthetic tables found, loading...


### Multi-table Evaluation

In [31]:
# Run the multi-table evaluation on the synthetic tables (`cleaned_tables`)
# and keep the returned report object.
report = clava_eval(tables, save_dir, configs, relation_order, cleaned_tables)

generating multi-table report for clavaDDPM_workspace/movie_lens
actor (81895, 2) (81895, 2)
Numerical (81895, 1)
Categorical (81895, 1)
Processing actor Successfully!
actor
Total 81895
Train 81895
Num 1
Cat 1
director (2041, 2) (2041, 2)
Numerical (2041, 2)
Categorical (2041, 0)
Processing director Successfully!
director
Total 2041
Train 2041
Num 2
Cat 0
user (6039, 3) (6039, 3)
Numerical (6039, 1)
Categorical (6039, 2)
Processing user Successfully!
user
Total 6039
Train 6039
Num 1
Cat 2
movie (3650, 4) (3650, 4)
Numerical (3650, 2)
Categorical (3650, 2)
Processing movie Successfully!
movie
Total 3650
Train 3650
Num 2
Cat 2
movie2actor (134122, 1) (134122, 1)
Numerical (134122, 1)
Categorical (134122, 0)
Processing movie2actor Successfully!
movie2actor
Total 134122
Train 134122
Num 1
Cat 0
movie2director (3943, 1) (3943, 1)
Numerical (3943, 0)
Categorical (3943, 1)
Processing movie2director Successfully!
movie2director
Total 3943
Train 3943
Num 0
Cat 1
rating (896543, 1) (896543, 1)
N

### Single table metrics

In [32]:
# Generate one single-table quality report per table, comparing the synthetic
# table against the real training table (and the held-out test table if any).

# The test split is optional: only load it when a directory is configured.
# Previously the loader was called unconditionally, so a missing test
# directory was passed to the loader before the `is not None` check in the
# loop could ever take effect.
test_data_dir = configs['general'].get('test_data_dir')
if test_data_dir is not None:
    test_tables, _, _ = load_multi_table(test_data_dir)
else:
    test_tables = None

# Reload the real tables from disk instead of reusing `tables`, which has been
# rebound by the clustering and training cells above.
real_tables, _, _ = load_multi_table(configs['general']['data_dir'])

for table_name in tables.keys():
    print(f'Generating report for {table_name}')
    real_data = real_tables[table_name]['df']
    syn_data = cleaned_tables[table_name]
    domain_dict = real_tables[table_name]['domain']

    # `test_tables` is None when no test split is configured; the report
    # function then receives test_data=None, as in the original fallback.
    test_data = test_tables[table_name]['df'] if test_tables is not None else None

    gen_single_report(
        real_data,
        syn_data,
        domain_dict,
        table_name,
        save_dir,
        alpha_beta_sample_size=200_000,
        test_data=test_data,
    )

actor (16795, 2) (16795, 2)
Numerical (16795, 1)
Categorical (16795, 1)
Processing actor Successfully!
actor
Total 16795
Train 16795
Num 1
Cat 1
director (160, 2) (160, 2)
Numerical (160, 2)
Categorical (160, 0)
Processing director Successfully!
director
Total 160
Train 160
Num 2
Cat 0
user (0, 3) (0, 3)
Numerical (0, 1)
Categorical (0, 2)
Processing user Successfully!
user
Total 0
Train 0
Num 1
Cat 2
movie (182, 4) (182, 4)
Numerical (182, 2)
Categorical (182, 2)
Processing movie Successfully!
movie
Total 182
Train 182
Num 2
Cat 2
movie2actor (4227, 1) (4227, 1)
Numerical (4227, 1)
Categorical (4227, 0)
Processing movie2actor Successfully!
movie2actor
Total 4227
Train 4227
Num 1
Cat 0
movie2director (198, 1) (198, 1)
Numerical (198, 0)
Categorical (198, 1)
Processing movie2director Successfully!
movie2director
Total 198
Train 198
Num 0
Cat 1
rating (99616, 1) (99616, 1)
Numerical (99616, 1)
Categorical (99616, 0)
Processing rating Successfully!
rating
Total 99616
Train 99616
Num 1
Cat