In [1]:
import numpy as np
import pandas as pd
import torch
from scipy.linalg import toeplitz
from scipy.stats import norm

import matplotlib.pyplot as plt
from tableone import TableOne
from sksurv.nonparametric import kaplan_meier_estimator

from utils import data_processing, visualization
from utils.simulations import *
from execute import surv_hivae, surv_gan, surv_vae
from sksurv.nonparametric import kaplan_meier_estimator

import os
import uuid
import datetime
import json
import sys
from utils.metrics import fit_cox_model, general_metrics

from synthcity.utils.constants import DEVICE
print('Device :', DEVICE)

  from .autonotebook import tqdm as notebook_tqdm


                  variable OMP_PATH to the location of the header before importing keopscore or pykeops,
                  e.g. using os.environ: import os; os.environ['OMP_PATH'] = '/path/to/omp/header'
Device : cpu


In [2]:
def prepare_dataset_dirs(dataset_name):
    base_path = os.path.join("./dataset", dataset_name)
    os.makedirs(base_path, exist_ok=True)
    os.makedirs(os.path.join(base_path, "optuna_results"), exist_ok=True)
    return base_path

def adjust_feat_types_for_generator(generator_name, feat_types_dict):
    feat_types_dict_ext = [dict(ft) for ft in feat_types_dict]  # deep copy
    for d in feat_types_dict_ext:
        if d['name'] == "survcens":
            if generator_name == "HI-VAE_weibull" or generator_name == "HI-VAE_weibull_prior":
                d["type"] = 'surv_weibull'
            elif generator_name == "HI-VAE_lognormal":
                d["type"] = 'surv'
            else:
                d["type"] = 'surv_piecewise'
    return feat_types_dict_ext

In [3]:
 # Simulation parameters
n_samples = 600
n_features_bytype = 6
n_active_features = 3 
treatment_effect = 0.
p_treated = 0.5
shape_T = 2.
shape_C = 2.
scale_C = 2.5
scale_C_indep = 3.9
feature_types_list = ["real", "cat"]
independent = True
data_types_create = True


metric_optuna = "survival_km_distance"
dataset_name = "Simulations_6_indep"
base_path = prepare_dataset_dirs(dataset_name)

In [4]:
# generators_sel = ["HI-VAE_weibull", "HI-VAE_piecewise", "Surv-GAN", "Surv-VAE", "HI-VAE_weibull_prior", "HI-VAE_piecewise_prior"]
generators_sel = ["HI-VAE_weibull", "HI-VAE_piecewise"]

In [5]:
generators_dict = {"HI-VAE_weibull" : surv_hivae,
                    "HI-VAE_piecewise" : surv_hivae,
                    "HI-VAE_lognormal" : surv_hivae,
                    "Surv-GAN" : surv_gan,
                    "Surv-VAE" : surv_vae, 
                    "HI-VAE_weibull_prior" : surv_hivae, 
                    "HI-VAE_piecewise_prior" : surv_hivae}

# BEST PARAMETERS
best_params_dict = {}
name_config = "simu_N{}_nfeat{}_t{}".format(n_samples, n_features_bytype, int(treatment_effect))
n_trials = 150
for generator_name in generators_sel:
    best_params_file = os.path.join(base_path, "optuna_results", "best_params_{}_ntrials{}_{}_{}.json".format(name_config, n_trials, metric_optuna, generator_name))
    with open(best_params_file, "r") as f:
        best_params_dict[generator_name] = json.load(f)


In [6]:
n_generated_dataset = 1
n_simulated_dataset = 20

In [9]:
df_gen_control_dict = {generator_name: [] for generator_name in generators_sel}
latent_s = {generator_name: [] for generator_name in generators_sel}
latent_z = {generator_name: [] for generator_name in generators_sel}
latent_y = {generator_name: [] for generator_name in generators_sel}

In [None]:
for i in range(n_simulated_dataset):

    seed = i

    control, treated, types = simulation(treatment_effect, n_samples, independent, feature_types_list,
                                        n_features_bytype, n_active_features, p_treated, shape_T,
                                        shape_C, scale_C, scale_C_indep, data_types_create, seed=seed)
    control = control.drop(columns='treatment')

    data_file_control = os.path.join(f"./dataset/{dataset_name}", "data_control.csv")
    feat_types_file_control = os.path.join(f"./dataset/{dataset_name}", "data_types_control.csv")
    control.to_csv(data_file_control, index=False, header=False)
    types.to_csv(feat_types_file_control, index=False)

    # Load and process control data
    df_init_control_encoded, feat_types_dict, miss_mask_control, true_miss_mask_control, _ = data_processing.read_data(
            data_file_control, feat_types_file_control, miss_file="Missing.csv", true_miss_file=None)
    data_init_control_encoded = torch.from_numpy(df_init_control_encoded.values)
    data_init_control = data_processing.discrete_variables_transformation(data_init_control_encoded, feat_types_dict)

    # Format control data into DataFrame
    fnames = types['name'][:-1].tolist() + ["time", "censor"]
    df_init_control = pd.DataFrame(data_init_control.numpy(), columns=fnames)
    df_init_control["treatment"] = 0

    # df_gen_control_dict ={}
    # For each generator, perform the data generation with the best params
    for generator_name in generators_sel:
        best_params = best_params_dict[generator_name]
        if generator_name in ["HI-VAE_lognormal", "HI-VAE_weibull", "HI-VAE_piecewise", "HI-VAE_weibull_prior", "HI-VAE_piecewise_prior"]:
            if generator_name in ["HI-VAE_weibull_prior", "HI-VAE_piecewise_prior"]:
                gen_from_prior = True
            else:
                gen_from_prior = False
            feat_types_dict_ext = adjust_feat_types_for_generator(generator_name, feat_types_dict)
            data_gen_control, s_total, z_total, y_total = generators_dict[generator_name].run(df_init_control_encoded, miss_mask_control, 
                                                                    true_miss_mask_control, feat_types_dict_ext, 
                                                                    n_generated_dataset, params=best_params, epochs=10000, gen_from_prior=gen_from_prior, return_latent_vectors=True)
        # else:
        #     data_gen_control = generators_dict[generator_name].run(data_init_control, columns=fnames, 
        #                                                             target_column="censor", time_to_event_column="time", 
        #                                                             n_generated_dataset=n_generated_dataset, 
        #                                                             params=best_params)

        latent_s[generator_name].append(s_total)
        latent_z[generator_name].append(z_total)
        latent_y[generator_name].append(y_total)

        list_df_gen_control = []
        for i in range(n_generated_dataset):
            df_gen_control = pd.DataFrame(data_gen_control[i].numpy(), columns=fnames)
            df_gen_control["treatment"] = 0
            list_df_gen_control.append(df_gen_control)
        # df_gen_control_dict[generator_name] = list_df_gen_control

        df_gen_control_dict[generator_name].append(list_df_gen_control)


Epoch: [ 0]  time: 0.0825, ELBO_train: -15.98717213, KL_z: 2.46765232, KL_s: 0.10257244, reconstruction loss: -13.41694736
Epoch: [100]  time: 0.7724, ELBO_train: -12.11521721, KL_z: 1.24780822, KL_s: 0.03155565, reconstruction loss: -10.83585334
Epoch: [200]  time: 1.4572, ELBO_train: -11.68682480, KL_z: 1.32029927, KL_s: 0.01734877, reconstruction loss: -10.34917676
Epoch: [300]  time: 2.1342, ELBO_train: -11.76412106, KL_z: 1.42126477, KL_s: 0.01514816, reconstruction loss: -10.32770813
Epoch: [400]  time: 2.8198, ELBO_train: -11.64529896, KL_z: 1.40081096, KL_s: 0.01474285, reconstruction loss: -10.22974515
Epoch: [500]  time: 3.4625, ELBO_train: -11.63064003, KL_z: 1.52212512, KL_s: 0.01466656, reconstruction loss: -10.09384835
Epoch: [600]  time: 4.1046, ELBO_train: -11.55868340, KL_z: 1.56701541, KL_s: 0.01482153, reconstruction loss: -9.97684646
Epoch: [700]  time: 4.7618, ELBO_train: -11.33791542, KL_z: 1.62374008, KL_s: 0.01600027, reconstruction loss: -9.69817507
Epoch: [800

KeyError: 'HI-VAE_weibull'

In [8]:
print(s_total.shape, z_total.shape, y_total.shape)

torch.Size([300, 150]) torch.Size([300, 60]) torch.Size([300, 2275])
