In [1]:
import torch
from torch import nn

# Toy tensor dimensions for the scratch experiments in this notebook:
# rows per batch, input feature width, latent width.
batch_size, inp_size, lat_size = 2, 5, 2


In [None]:
# Scratch check of a reconstruction term: element-wise squared error between
# the tanh-squashed "reconstruction" and the target, summed over the last
# (feature) axis so that one value per row remains.
x = torch.randn(batch_size, inp_size)
recon_x = torch.randn(batch_size, inp_size)
log_scale = torch.zeros(inp_size)  # not referenced by the expression below

eq = nn.functional.mse_loss(
    input=torch.tanh(recon_x),
    target=x,
    reduction="none",  # keep per-element errors; the reduction is the sum below
).sum(dim=-1)
eq

In [2]:
from uqvae.quality_dataset import JanusDataset, preprocess_janus_dataset
# Path is relative to the notebook's working directory.
filename = "../data/fulldataset.csv"
# Returns the preprocessed table together with the names of its discrete
# columns; later cells index the table with that list.
# NOTE(review): the meaning of n_classes_allowed is defined in
# uqvae.quality_dataset, not here — confirm before changing the value.
dataset, discrete_columns = preprocess_janus_dataset(filename, n_classes_allowed=2)

# Disabled: wrapping the table in a JanusDataset. While this line stays
# commented out, `dataset` is whatever preprocess_janus_dataset returned.
# dataset = JanusDataset(dataset, discrete_columns, preprocess_dataset=False)


In [4]:
# Display only the discrete columns of the preprocessed table.
dataset[discrete_columns]

Unnamed: 0,data_021,data_025,data_033,data_059,data_060,data_061,data_062,data_064,data_065,data_066,...,data_128,data_129,data_130,data_134,data_135,data_191,data_192,data_193,data_194,data_195
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
15159,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
15160,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
15161,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
15162,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0


In [3]:
from ctgan.data_transformer import DataTransformer

# Reload and preprocess the table so this cell does not depend on the state
# left behind by the earlier loading cell.
filename = "../data/fulldataset.csv"
dataset, discrete_columns = preprocess_janus_dataset(filename, n_classes_allowed=2)
# Everything except the last two columns is fed to the transformer.
df_x = dataset.iloc[:, :-2]
print("Transforming Janus data...", end=" ")
transformer = DataTransformer()
print("Fit...", end=" ")
# The transformer is fitted on a 50-row subsample only. The subsample is drawn
# with a fixed seed so that the same rows are used on every run; an unseeded
# draw made the fitted transformer (and therefore `n_features`) a moving
# target between kernel restarts.
# NOTE(review): a 50-row sample may not contain every category of every
# discrete column — confirm that this is acceptable for the full transform.
tmp = df_x.sample(50, axis=0, random_state=0)
transformer.fit(tmp, discrete_columns)
print("Transform...", end=" ")
# Transform the full table (not just the subsample) and cast to float32
# before handing the array to torch.
x = transformer.transform(df_x).astype("float32")
print("Done.")
x = torch.from_numpy(x)
# Width of the transformed representation as reported by the fitted transformer.
n_features = transformer.output_dimensions


Transforming Janus data... Fit... Transform... Done.


In [4]:
# Map the transformed data back through the fitted transformer.
inv_x = transformer.inverse_transform(x)
# Give the round-tripped frame the same row labels as the preprocessed table.
inv_x.index = dataset.index

In [31]:
# import pandas as pd
# x = pd.DataFrame(torch.randn(10, 1)).astype("category")
# x.info()

# NOTE(review): the saved output of this cell is True, which matches the
# commented-out experiment above (x as a categorical DataFrame) rather than
# the torch tensor that the data-transform cell binds to x. This looks like
# leftover out-of-order kernel state — confirm, and consider removing the cell.
x[0].dtype == "category"


True

In [16]:
# Summary statistics of data_001 after the transform / inverse-transform round trip.
inv_x["data_001"].describe()

count    14972.000000
mean         0.000061
std          0.999773
min         -5.709252
25%         -0.628358
50%          0.049094
75%          0.726546
max          4.791261
Name: data_001, dtype: float64

In [17]:
# Summary statistics of data_001 in the preprocessed table, the counterpart of
# the inv_x summary for the same column.
dataset["data_001"].describe()


count    1.497200e+04
mean    -2.038310e-09
std      1.000033e+00
min     -5.709252e+00
25%     -6.283584e-01
50%      4.909404e-02
75%      7.265465e-01
max      4.791261e+00
Name: data_001, dtype: float64

In [None]:
# for col in discrete_columns:
#     print(col, len(dataset[col].unique()))

In [None]:
from uqvae.models import VAE, Encoder, Decoder

# Use the GPU when one is available, otherwise fall back to the CPU. A
# hard-coded "cuda" map_location makes torch.load fail on CPU-only machines.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# NOTE(security): torch.load unpickles the file, which can execute arbitrary
# code. Only load checkpoints that come from a trusted source.
ckpt = torch.load("../results/logs/VAE/best_vae.pkl", map_location=device)

# Rebuild the model from the configuration stored next to the weights, then
# restore the weights themselves.
vae = VAE(
    encoder=Encoder,
    decoder=Decoder,
    configs=ckpt["configs"],
)

vae.load_state_dict(ckpt["model"])

# `ckpt` is rebound to the TVAE checkpoint here; from this point on it no
# longer refers to the VAE checkpoint loaded above.
ckpt = torch.load("../results/logs/TVAE/best_vae.pkl", map_location=device)
tvae = VAE(
    encoder=Encoder,
    decoder=Decoder,
    configs=ckpt["configs"],
)

tvae.load_state_dict(ckpt["model"])


In [None]:
# When run after the model-loading cell, `ckpt` is the checkpoint that cell
# loaded last (the TVAE one), so these are the TVAE run's configs.
configs = ckpt["configs"]
configs


In [None]:
with torch.no_grad():
    # NOTE(review): this needs `dataset` to expose a tensor attribute `x`. In
    # the loading cells the JanusDataset wrapper is commented out, so confirm
    # which object `dataset` is bound to when this cell runs.
    real = dataset.x[:1000].to("cpu")

    vae.decoder.eval()
    tvae.decoder.eval()
    # Draw the latent codes from the standard-normal prior N(0, I): a zero
    # mean and `mean + 1` == 1 as standard deviation. Using torch.ones for the
    # mean shifted the samples to N(1, 2^2), i.e. away from the prior.
    # NOTE(review): assumes the models were trained against the usual N(0, I)
    # prior — confirm against uqvae.models.
    mean = torch.zeros(size=(1000, configs["embedding_dim"]))
    z = torch.normal(mean=mean, std=mean + 1).to("cpu")
    # Both decoders receive the same z. The latent width comes from `configs`,
    # so both models must share that embedding_dim.
    # `log_scale` is overwritten by the second call and not used in this cell.
    fake_vae, log_scale = vae.decoder(z)
    fake_tvae, log_scale = tvae.decoder(z)


In [None]:
from sdv.metadata.single_table import SingleTableMetadata
from sdmetrics.reports.single_table import QualityReport
import pandas as pd
import warnings

# Installs a process-wide "ignore" filter: every warning is suppressed from
# here on, not only those raised in this cell.
warnings.filterwarnings("ignore")

# Wrap the real sample and both synthetic samples in DataFrames that carry
# identical column and row labels.
real = pd.DataFrame(real)
shared_labels = {"columns": real.columns, "index": real.index}
fake_vae = pd.DataFrame(fake_vae, **shared_labels)
fake_tvae = pd.DataFrame(fake_tvae, **shared_labels)

# Infer the column types from the real data, then score the VAE samples.
metadata = SingleTableMetadata()
metadata.detect_from_dataframe(real)
report = QualityReport()
report.generate(real, fake_vae, metadata.to_dict())

In [None]:
from sdmetrics.reports import utils

# Compare real and VAE-generated data for the column labelled 0.
fig = utils.get_column_plot(
    real_data=real,
    synthetic_data=fake_vae,
    column_name=0,
    metadata=metadata.to_dict()
)

fig.show()

In [None]:
new_metadata = metadata.to_dict()

# Mark the known discrete columns as categorical in the detected metadata.
# The loop variable is deliberately not named `x`: that name is bound to the
# transformed-data tensor earlier in the notebook and must not be overwritten.
# NOTE(review): this only has an effect if the metadata column keys are the
# same labels as the entries of `discrete_columns` — confirm that they match
# for the frames built from the decoder outputs.
for column_name in new_metadata["columns"]:
    if column_name in discrete_columns:
        new_metadata["columns"][column_name]["sdtype"] = "categorical"
# load_from_dict builds and returns a metadata object rather than updating the
# one it is called on, so `metadata` has to be rebound to the result for the
# override to take effect. SingleTableMetadata is imported in the cell that
# first creates `metadata`.
metadata = SingleTableMetadata.load_from_dict(new_metadata)

# Score the TVAE samples against the real data with the adjusted metadata.
report = QualityReport()
report.generate(real, fake_tvae, metadata.to_dict())

In [None]:
from sdmetrics.reports import utils

# Compare real and TVAE-generated data for the column labelled 0.
fig = utils.get_column_plot(
    real_data=real,
    synthetic_data=fake_tvae,
    column_name=0,
    metadata=metadata.to_dict()
)

fig.show()