In [532]:
"""TVAE module."""

import numpy as np
import pandas as pd
import torch
from torch.nn import Linear, Module, Parameter, ReLU, Sequential, GELU
from torch.nn.functional import cross_entropy
from torch.optim import Adam
from torch.utils.data import DataLoader, TensorDataset
from tqdm import tqdm

from ctgan.data_transformer import DataTransformer
from ctgan.synthesizers.base import BaseSynthesizer, random_state


class Encoder(Module):
    """Encoder for the TVAE.

    Args:
        data_dim (int):
            Dimensions of the data.
        compress_dims (tuple or list of ints):
            Size of each hidden layer.
        embedding_dim (int):
            Size of the output vector.
    """

    def __init__(self, data_dim, compress_dims, embedding_dim):
        super(Encoder, self).__init__()
        dim = data_dim
        seq = []
        for item in list(compress_dims):
            seq += [Linear(dim, item), GELU()]
            dim = item

        self.seq = Sequential(*seq)
        self.fc1 = Linear(dim, embedding_dim)
        self.fc2 = Linear(dim, embedding_dim)

    def forward(self, input_):
        """Encode the passed `input_`."""
        feature = self.seq(input_)
        mu = self.fc1(feature)
        logvar = self.fc2(feature)
        std = torch.exp(0.5 * logvar)
        return mu, std, logvar


class Decoder(Module):
    """Decoder for the TVAE.

    Args:
        embedding_dim (int):
            Size of the input vector.
        decompress_dims (tuple or list of ints):
            Size of each hidden layer.
        data_dim (int):
            Dimensions of the data.
    """

    def __init__(self, embedding_dim, decompress_dims, data_dim):
        super(Decoder, self).__init__()
        dim = embedding_dim
        seq = []
        for item in list(decompress_dims):
            seq += [Linear(dim, item), GELU()]
            dim = item

        seq.append(Linear(dim, data_dim))
        self.seq = Sequential(*seq)
        self.sigma = Parameter(torch.ones(data_dim) * 0.1)

    def forward(self, input_):
        """Decode the passed `input_`."""
        return self.seq(input_), self.sigma


def _loss_function(recon_x, x, sigmas, mu, logvar, output_info, factor):
    st = 0
    loss = []
    for column_info in output_info:
        for span_info in column_info:
            if span_info.activation_fn != 'softmax':
                ed = st + span_info.dim
                std = sigmas[st]
                eq = x[:, st] - torch.tanh(recon_x[:, st])
                loss.append((eq**2 / 2 / (std**2)).sum())
                loss.append(torch.log(std) * x.size()[0])
                st = ed

            else:
                ed = st + span_info.dim
                loss.append(
                    cross_entropy(
                        recon_x[:, st:ed], torch.argmax(x[:, st:ed], dim=-1), reduction='sum'
                    )
                )
                st = ed

    assert st == recon_x.size()[1]
    KLD = -0.5 * torch.sum(1 + logvar - mu**2 - logvar.exp())
    return sum(loss) * factor / x.size()[0], KLD / x.size()[0]


class TVAE(BaseSynthesizer):
    """TVAE."""

    def __init__(
        self,
        embedding_dim=32,
        compress_dims=(32, 1024),
        decompress_dims=(1024, 32),
        l2scale=1e-5,
        batch_size=500,
        epochs=300,
        loss_factor=2,
        cuda=False,
        verbose=True,
    ):
        self.embedding_dim = embedding_dim
        self.compress_dims = compress_dims
        self.decompress_dims = decompress_dims

        self.l2scale = l2scale
        self.batch_size = batch_size
        self.loss_factor = loss_factor
        self.epochs = epochs
        self.loss_values = pd.DataFrame(columns=['Epoch', 'Batch', 'Loss'])
        self.verbose = verbose

        if not cuda or not torch.cuda.is_available():
            device = 'cpu'
        elif isinstance(cuda, str):
            device = cuda
        else:
            device = 'cuda'

        self._device = torch.device(device)

    @random_state
    def fit(self, train_data, discrete_columns=()):
        """Fit the TVAE Synthesizer models to the training data.

        Args:
            train_data (numpy.ndarray or pandas.DataFrame):
                Training Data. It must be a 2-dimensional numpy array or a pandas.DataFrame.
            discrete_columns (list-like):
                List of discrete columns to be used to generate the Conditional
                Vector. If ``train_data`` is a Numpy array, this list should
                contain the integer indices of the columns. Otherwise, if it is
                a ``pandas.DataFrame``, this list should contain the column names.
        """
        self.transformer = DataTransformer()
        self.transformer.fit(train_data, discrete_columns)
        train_data = self.transformer.transform(train_data)
        dataset = TensorDataset(torch.from_numpy(train_data.astype('float32')).to(self._device))
        loader = DataLoader(dataset, batch_size=self.batch_size, shuffle=True, drop_last=False)

        data_dim = self.transformer.output_dimensions
        self.encoder = Encoder(data_dim, self.compress_dims, self.embedding_dim).to(self._device)
        self.decoder = Decoder(self.embedding_dim, self.decompress_dims, data_dim).to(self._device)
        optimizerAE = Adam(
            list(self.encoder.parameters()) + list(self.decoder.parameters()), weight_decay=self.l2scale
        )

        self.loss_values = pd.DataFrame(columns=['Epoch', 'Batch', 'Loss'])
        iterator = tqdm(range(self.epochs), disable=(not self.verbose))
        if self.verbose:
            iterator_description = 'Loss: {loss:.3f}'
            iterator.set_description(iterator_description.format(loss=0))

        for i in iterator:
            loss_values = []
            batch = []
            for id_, data in enumerate(loader):
                optimizerAE.zero_grad()
                real = data[0].to(self._device)
                mu, std, logvar = self.encoder(real)
                eps = torch.randn_like(std)
                emb = eps * std + mu
                rec, sigmas = self.decoder(emb)
                loss_1, loss_2 = _loss_function(
                    rec,
                    real,
                    sigmas,
                    mu,
                    logvar,
                    self.transformer.output_info_list,
                    self.loss_factor,
                )
                loss = loss_1 + loss_2
                loss.backward()
                optimizerAE.step()
                self.decoder.sigma.data.clamp_(0.01, 1.0)

                batch.append(id_)
                loss_values.append(loss.detach().cpu().item())

            epoch_loss_df = pd.DataFrame({
                'Epoch': [i] * len(batch),
                'Batch': batch,
                'Loss': loss_values,
            })
            if not self.loss_values.empty:
                self.loss_values = pd.concat([self.loss_values, epoch_loss_df]).reset_index(
                    drop=True
                )
            else:
                self.loss_values = epoch_loss_df

            if self.verbose:
                iterator.set_description(
                    iterator_description.format(loss=loss.detach().cpu().item())
                )

    @random_state
    def sample(self, samples, path="../data/external/bank.csv"):
        """Sample data similar to the training data.

        Args:
            samples (int):
                Number of rows to sample.

        Returns:
            numpy.ndarray or pandas.DataFrame
        """
        self.decoder.eval()

        steps = samples // self.batch_size + 1
        data = []
        for _ in range(steps):
            embedding_data = pd.read_csv(path, sep=",")
            fake, sigmas = self.decoder(torch.tensor(embedding_data.to_numpy(), dtype=torch.float32))
            fake = torch.tanh(fake)
            data.append(fake.detach().cpu().numpy())

        data = np.concatenate(data, axis=0)
        data = data[:samples]
        return self.transformer.inverse_transform(data, sigmas.detach().cpu().numpy())

    def generate_latents(self, data):
        """Generate latent vectors from the input data using the trained encoder.

        Args:
            data (numpy.ndarray or pandas.DataFrame):
                Input data to encode. Must match the format used for training.

        Returns:
            numpy.ndarray: Latent vectors generated by the encoder.
        """
        self.encoder.eval()
        data = self.transformer.transform(data)
        data_tensor = torch.from_numpy(data.astype('float32')).to(self._device)
        
        with torch.no_grad():  # Disable gradient calculations
            mu, std, _ = self.encoder(data_tensor)  # Get mean and standard deviation from encoder
            eps = torch.randn_like(std)  # Sample from a standard normal distribution
            emb = eps * std + mu  # Reparameterization trick to sample from the latent space

        return emb.cpu().numpy() 
    
    def set_device(self, device):
        """Set the `device` to be used ('GPU' or 'CPU)."""
        self._device = device
        self.decoder.to(self._device)

In [533]:
def categorical_column_indices(metadata_dict):
    categorical_indices = []
    columns = metadata_dict.get('columns', {})
    column_names = list(columns.keys())[:-1]  # Exclude the last key
    for index, column_name in enumerate(column_names):
        column_data = columns[column_name]
        if column_data.get('sdtype') == 'categorical':
            categorical_indices.append(index)
    return categorical_indices

In [534]:
def generate_and_save_latent(model, source="../data/interim/bank_no_label.csv", path="../data/processed/bank,.csv"):
    DATA_PATH = source
    df = pd.read_csv(DATA_PATH, sep=",")
    actual_data = df #.iloc[:, :-1]
    # outcomes = df.iloc[:, -1]

    latents = []
    metadata = SingleTableMetadata()
    meta = metadata.detect_from_csv(source)

    discrete_columns = categorical_column_indices(metadata.to_dict())
    print(discrete_columns)
    model.fit(actual_data, discrete_columns)
    latents = model.generate_latents(actual_data)
    # unbatched_latent = torch.cat(latents, dim=0)

    latents_df = pd.DataFrame(latents) #(unbatched_latent)
    # outcomes_df = pd.DataFrame(outcomes)
    # Save DataFrame to a CSV file
    # data_with_outcomes = pd.concat([latents_df, outcomes_df], axis=1)

    # data_with_outcomes.to_csv(path, index=False)
    latents_df.to_csv(path, index=False)

In [535]:
def split_columns(df: pd.DataFrame):
    """
    Split the input DataFrame into two parts:
    - all columns except the last one
    - the last column

    Args:
    df (pd.DataFrame): The input DataFrame.

    Returns:
    pd.DataFrame, pd.Series: A DataFrame with all columns except the last one, and the last column as a Series.
    """
    # All columns except the last one
    all_except_last = df.iloc[:, :-1]

    # The last column
    last_column = df.iloc[:, -1]
    all_except_last.to_csv("../data/interim/bank_no_label.csv", index=False)
    return all_except_last, last_column

In [536]:
data = pd.read_csv("../data/raw/bank.csv", sep=",")
all_n_last, label = split_columns(data)
all_n_last

Unnamed: 0,ID,Age,Experience,Income,ZIP Code,Family,CCAvg,Education,Mortgage,Personal Loan,Securities Account,CD Account,Online
0,1,25,1,49,91107,4,1.6,1,0,0,1,0,0
1,2,45,19,34,90089,3,1.5,1,0,0,1,0,0
2,3,39,15,11,94720,1,1.0,1,0,0,0,0,0
3,4,35,9,100,94112,1,2.7,2,0,0,0,0,0
4,5,35,8,45,91330,4,1.0,2,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
4995,4996,29,3,40,92697,1,1.9,3,0,0,0,0,1
4996,4997,30,4,15,92037,4,0.4,1,85,0,0,0,1
4997,4998,63,39,24,93023,2,0.3,3,0,0,0,0,0
4998,4999,65,40,49,90034,3,0.5,2,0,0,0,0,1


In [537]:
data = pd.read_csv("../data/raw/bank.csv", sep=",")
all_n_last, label = split_columns(data)
no_label_data = pd.read_csv("../data/interim/bank_no_label.csv", sep=",")
model = TVAE(embedding_dim=no_label_data.shape[1], compress_dims=(32,1024), decompress_dims=(1024,32))
generate_and_save_latent(model)

[5, 7, 9, 10, 11]


Loss: -31.020:   8%|▊         | 25/300 [01:02<11:32,  2.52s/it]


KeyboardInterrupt: 

In [499]:
latent_check = pd.read_csv("../data/processed/bank,.csv")
combined_df = pd.concat([latent_check, label], axis=1)
combined_df.to_csv("../data/processed/bank.csv")

In [477]:
gen_data = np.load("../tabddpm/exp/bank_latent/X_num_unnorm.npy")
df = pd.DataFrame(gen_data)
df.to_csv("../data/external/bank.csv")

In [507]:
gen_df = model.sample(5000, path="../data/external/bank.csv")
gen_df

Unnamed: 0,ID,Age,Experience,Income,ZIP Code,Family,CCAvg,Education,Mortgage,Personal Loan,Securities Account,CD Account,Online
0,948,25,4,108,94898,2,9.285551,1,1,0,0,0,1
1,227,53,21,66,94732,1,0.500388,1,-1,0,0,0,1
2,4594,39,14,41,94634,4,0.546272,2,-2,0,0,0,1
3,-64,32,14,104,95201,2,2.772759,1,1,0,0,0,1
4,818,34,10,33,94617,4,2.412155,2,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...
4995,-71,29,1,70,94815,1,1.645613,1,0,0,0,0,1
4996,447,30,4,40,94814,4,1.389405,1,-1,0,0,0,1
4997,1021,63,38,44,94428,1,1.226829,1,0,0,0,0,1
4998,3254,65,36,30,94825,1,2.168205,3,0,0,0,0,1


In [544]:
real_df = pd.read_csv("../data/raw/bank.csv", sep=",")
real_df = real_df.iloc[:, :-1]
real_df

Unnamed: 0,ID,Age,Experience,Income,ZIP Code,Family,CCAvg,Education,Mortgage,Personal Loan,Securities Account,CD Account,Online
0,1,25,1,49,91107,4,1.6,1,0,0,1,0,0
1,2,45,19,34,90089,3,1.5,1,0,0,1,0,0
2,3,39,15,11,94720,1,1.0,1,0,0,0,0,0
3,4,35,9,100,94112,1,2.7,2,0,0,0,0,0
4,5,35,8,45,91330,4,1.0,2,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
4995,4996,29,3,40,92697,1,1.9,3,0,0,0,0,1
4996,4997,30,4,15,92037,4,0.4,1,85,0,0,0,1
4997,4998,63,39,24,93023,2,0.3,3,0,0,0,0,0
4998,4999,65,40,49,90034,3,0.5,2,0,0,0,0,1


## Resemblance

In [458]:
from scipy.stats import pearsonr, ks_2samp
from scipy.spatial.distance import jensenshannon
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error
from xgboost import XGBClassifier

In [545]:
def compute_categorical_similarity(col_real, col_synthetic):
    # Compute Theil's U for categorical features
    p_real = pd.Series(col_real).value_counts(normalize=True)
    p_synthetic = pd.Series(col_synthetic).value_counts(normalize=True)
    u = (p_real * np.log(p_real / p_synthetic)).sum()
    return 1 - u
def column_similarity(real_data, synthetic_data):
    similarities = []
    for col_real, col_synthetic in zip(real_data, synthetic_data):
        correlation, _ = pearsonr(col_real, col_synthetic)
        similarity = correlation
        similarities.append(similarity)
    return np.mean(similarities)
def correlation_similarity(real_data, synthetic_data):
    real_corr = np.corrcoef(real_data, rowvar=False)
    synthetic_corr = np.corrcoef(synthetic_data, rowvar=False)
    print(synthetic_corr)
    correlation, _ = pearsonr(real_corr.flatten(), synthetic_corr.flatten())
    return correlation
def jensen_shannon_similarity(real_data, synthetic_data):
    similarities = []
    for col_real, col_synthetic in zip(real_data.T, synthetic_data.T):
        # Compute probability distributions and Jensen-Shannon divergence
        p_real = np.histogram(col_real, bins=10, density=True)[0]
        p_synthetic = np.histogram(col_synthetic, bins=10, density=True)[0]
        similarity = 1 - jensenshannon(p_real, p_synthetic)
        similarities.append(similarity)
    return np.mean(similarities)
def kolmogorov_smirnov_similarity(real_data, synthetic_data):
    similarities = []
    for col_real, col_synthetic in zip(real_data.T, synthetic_data.T):
        # Compute cumulative distributions and Kolmogorov-Smirnov distance
        similarity, _ = ks_2samp(col_real, col_synthetic)
        similarity = 1 - similarity
        similarities.append(similarity)
    return np.mean(similarities)
def propensity_mean_absolute_similarity(real_data, synthetic_data):
    # Train XGBoost classifier to discriminate between real and synthetic samples
    X = np.vstack([real_data, synthetic_data])
    y = np.concatenate([np.ones(len(real_data)), np.zeros(len(synthetic_data))])
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    classifier = XGBClassifier()
    classifier.fit(X_train, y_train)
    # Compute mean absolute error of classifier probabilities
    y_pred_proba = classifier.predict_proba(X_test)[:, 1]
    error = mean_absolute_error(y_test, y_pred_proba)
    return 1 - error

In [612]:
def resemblance_measure(real_data, synthetic_data):
    resemblance_score = (
        column_similarity(real_data, synthetic_data) +
        # correlation_similarity(real_data, synthetic_data) +
        jensen_shannon_similarity(real_data, synthetic_data) +
        kolmogorov_smirnov_similarity(real_data, synthetic_data) +
        propensity_mean_absolute_similarity(real_data, synthetic_data)
    ) / 4
    print("Resemblance Score:", resemblance_score)

In [461]:
resemblance_measure(real_df.to_numpy(), gen_df.to_numpy())

Resemblance Score: 0.8636330978384504


In [417]:
correlation_similarity(real_df.to_numpy(), gen_df.to_numpy())

[[ 1.00000000e+00 -8.47259459e-03 -8.32575989e-03 -1.76947432e-02
   1.34315402e-02 -1.67972432e-02 -2.46751718e-02  2.14632093e-02
  -1.39199177e-02 -2.48011655e-02 -1.69723327e-02 -6.90940312e-03
  -2.52841007e-03  1.70282321e-02]
 [-8.47259459e-03  1.00000000e+00  9.94214857e-01 -5.52686182e-02
  -2.92162874e-02 -4.64176636e-02 -5.20121791e-02  4.13343834e-02
  -1.25385869e-02 -7.72561717e-03 -4.36242223e-04  8.04255215e-03
   1.37024021e-02  7.68103676e-03]
 [-8.32575989e-03  9.94214857e-01  1.00000000e+00 -4.65741777e-02
  -2.86255480e-02 -5.25631471e-02 -5.00765106e-02  1.31518129e-02
  -1.05815524e-02 -7.41309808e-03 -1.23213441e-03  1.03533312e-02
   1.38978996e-02  8.96744734e-03]
 [-1.76947432e-02 -5.52686182e-02 -4.65741777e-02  1.00000000e+00
  -1.64098117e-02 -1.57500785e-01  6.45983670e-01 -1.87524257e-01
   2.06806228e-01  5.02462292e-01 -2.61649670e-03  1.69738080e-01
   1.42059193e-02 -2.38500775e-03]
 [ 1.34315402e-02 -2.92162874e-02 -2.86255480e-02 -1.64098117e-02
  

  c /= stddev[:, None]
  c /= stddev[None, :]


ValueError: array must not contain infs or NaNs

In [525]:
latent_check = pd.read_csv("../data/processed/bank.csv")
latent_check

FileNotFoundError: [Errno 2] No such file or directory: '../data/processed/bank.csv'

In [555]:
df = pd.read_csv("../data/external/bank_synth_latent.csv")
df.shape

(5000, 14)

In [526]:
test_data = np.load("../data/external/bank_latent/X_num_unnorm.npy")
test_data.shape

(5000, 14)

In [531]:
old_data = np.load("../tabddpm/exp/bank_latent/X_num_unnorm.npy")
old_data.shape

(5000, 14)

In [549]:
df = pd.DataFrame(test_data)
df.to_csv("../data/external/bank_synth_latent.csv", index=False)

In [556]:
df = pd.read_csv("../data/external/bank_synth_data.csv")
df

Unnamed: 0,ID,Age,Experience,Income,ZIP Code,Family,CCAvg,Education,Mortgage,Personal Loan,Securities Account,CD Account,Online
0,4006,61,8,59,94581,4,0.447093,2,2,0,0,0,1
1,4515,30,4,55,94887,4,1.961978,1,-1,0,0,0,1
2,375,26,4,37,94973,1,1.497103,1,0,0,0,0,1
3,4369,27,5,22,94688,4,1.623934,1,-1,0,0,0,1
4,3364,64,43,70,94521,1,1.069062,2,2,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...
4995,599,67,33,32,94907,1,0.237404,1,0,0,0,0,1
4996,218,27,2,88,94300,1,1.295534,1,1,0,0,0,1
4997,389,28,5,159,95089,1,3.944582,1,1,0,0,0,1
4998,305,32,19,45,94981,1,2.461338,1,-1,0,0,0,1


In [557]:
gen_data_ddpm = df.iloc[:,:]
gen_data_ddpm

Unnamed: 0,ID,Age,Experience,Income,ZIP Code,Family,CCAvg,Education,Mortgage,Personal Loan,Securities Account,CD Account,Online
0,4006,61,8,59,94581,4,0.447093,2,2,0,0,0,1
1,4515,30,4,55,94887,4,1.961978,1,-1,0,0,0,1
2,375,26,4,37,94973,1,1.497103,1,0,0,0,0,1
3,4369,27,5,22,94688,4,1.623934,1,-1,0,0,0,1
4,3364,64,43,70,94521,1,1.069062,2,2,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...
4995,599,67,33,32,94907,1,0.237404,1,0,0,0,0,1
4996,218,27,2,88,94300,1,1.295534,1,1,0,0,0,1
4997,389,28,5,159,95089,1,3.944582,1,1,0,0,0,1
4998,305,32,19,45,94981,1,2.461338,1,-1,0,0,0,1


In [558]:
resemblance_measure(real_df.to_numpy(), gen_data_ddpm.to_numpy())

Resemblance Score: 0.8454741915814525


In [563]:
diabetes = pd.read_csv("../data/raw/diabetes.csv")
diabetes = diabetes.iloc[:, :-1]

In [564]:
diabetes_syn_data = pd.read_csv("../data/external/diabetes_synth_data.csv")
diabetes_syn_data

Unnamed: 0,preg,plas,pres,skin,insu,mass,pedi,age
0,5,186,84,29,-1,42.828688,0.192437,36
1,3,116,77,27,-1,29.701258,0.333740,23
2,-1,94,43,43,48,39.462009,0.339307,21
3,8,124,80,33,-5,33.126051,0.248642,32
4,7,114,99,39,-2,42.775687,0.102719,29
...,...,...,...,...,...,...,...,...
4995,-1,135,72,35,135,34.182429,0.540444,22
4996,1,105,93,38,67,35.791426,0.165645,26
4997,4,175,62,34,489,45.298299,1.490308,67
4998,2,153,71,30,109,34.485610,0.245672,23


In [566]:
resemblance_measure(diabetes.to_numpy(), diabetes_syn_data.to_numpy())

Resemblance Score: 0.7609724990916054


In [599]:
abalone = pd.read_csv("../data/raw/abalone.csv")
abalone = abalone #.iloc[:, :-1]
# one_hot_encoded = pd.get_dummies(abalone.iloc[:, 0], prefix='Gender')
# abalone_encoded = pd.concat([abalone, one_hot_encoded], axis=1)
# abalone_encoded.drop(abalone.columns[0], axis=1, inplace=True)
# abalone_encoded
abalone['rings'].unique()


array([15,  7,  9, 10,  8, 20, 16, 19, 14, 11, 12, 18, 13,  5,  4,  6, 21,
       17, 22,  1,  3, 26, 23, 29,  2, 27, 25, 24])

In [610]:
cardio = pd.read_csv("../data/raw/cardio.csv", sep=";")
cardio = cardio.iloc[:, :-1]
cardio

Unnamed: 0,id,age,gender,height,weight,ap_hi,ap_lo,cholesterol,gluc,smoke,alco,active
0,0,18393,2,168,62.0,110,80,1,1,0,0,1
1,1,20228,1,156,85.0,140,90,3,1,0,0,1
2,2,18857,1,165,64.0,130,70,3,1,0,0,0
3,3,17623,2,169,82.0,150,100,1,1,0,0,1
4,4,17474,1,156,56.0,100,60,1,1,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...
69995,99993,19240,2,168,76.0,120,80,1,1,1,0,1
69996,99995,22601,1,158,126.0,140,90,2,2,0,0,1
69997,99996,19066,2,183,105.0,180,90,3,1,0,1,0
69998,99998,22431,1,163,72.0,135,80,1,2,0,0,0


In [606]:
adult = pd.read_csv("../data/raw/adult.csv", sep=",")
adult

Unnamed: 0,age,workclass,fnlwgt,education,educational-num,marital-status,occupation,relationship,race,gender,capital-gain,capital-loss,hours-per-week,native-country,income
0,25,Private,226802,11th,7,Never-married,Machine-op-inspct,Own-child,Black,Male,0,0,40,United-States,<=50K
1,38,Private,89814,HS-grad,9,Married-civ-spouse,Farming-fishing,Husband,White,Male,0,0,50,United-States,<=50K
2,28,Local-gov,336951,Assoc-acdm,12,Married-civ-spouse,Protective-serv,Husband,White,Male,0,0,40,United-States,>50K
3,44,Private,160323,Some-college,10,Married-civ-spouse,Machine-op-inspct,Husband,Black,Male,7688,0,40,United-States,>50K
4,18,?,103497,Some-college,10,Never-married,?,Own-child,White,Female,0,0,30,United-States,<=50K
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
48837,27,Private,257302,Assoc-acdm,12,Married-civ-spouse,Tech-support,Wife,White,Female,0,0,38,United-States,<=50K
48838,40,Private,154374,HS-grad,9,Married-civ-spouse,Machine-op-inspct,Husband,White,Male,0,0,40,United-States,>50K
48839,58,Private,151910,HS-grad,9,Widowed,Adm-clerical,Unmarried,White,Female,0,0,40,United-States,<=50K
48840,22,Private,201490,HS-grad,9,Never-married,Adm-clerical,Own-child,White,Male,0,0,20,United-States,<=50K


In [607]:
cardio_syn_data = pd.read_csv("../data/external/cardio_synth_data.csv")
cardio_syn_data

Unnamed: 0,id,age,gender,height,weight,ap_hi,ap_lo,cholesterol,gluc,smoke,alco,active
0,90696,20949,1,168,71.082749,140,87,1,1,0,0,1
1,84308,14992,2,168,57.407730,107,46,1,1,1,0,1
2,88020,21718,1,155,87.968905,150,86,3,3,0,0,1
3,67963,22163,1,166,70.150322,90,66,2,1,0,0,1
4,75375,14988,1,172,64.707230,110,69,1,1,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...
4995,79850,21552,2,166,73.555163,140,82,1,1,1,1,1
4996,90389,21467,2,169,63.210992,120,82,1,1,0,0,1
4997,91294,21224,1,168,71.300427,120,80,3,3,0,0,1
4998,95000,20537,2,165,88.403584,139,98,1,1,0,0,1


In [613]:
resemblance_measure(cardio.to_numpy(), cardio_syn_data.to_numpy())

Resemblance Score: 0.7024534547666803


In [639]:
churn = pd.read_csv("../data/raw/churn.csv")
churn = churn.iloc[:, :-1].drop('Surname', axis=1)
churn

Unnamed: 0,RowNumber,CustomerId,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary
0,1,15634602,619,France,Female,42,2,0.00,1,1,1,101348.88
1,2,15647311,608,Spain,Female,41,1,83807.86,1,0,1,112542.58
2,3,15619304,502,France,Female,42,8,159660.80,3,1,0,113931.57
3,4,15701354,699,France,Female,39,1,0.00,2,0,0,93826.63
4,5,15737888,850,Spain,Female,43,2,125510.82,1,1,1,79084.10
...,...,...,...,...,...,...,...,...,...,...,...,...
9995,9996,15606229,771,France,Male,39,5,0.00,2,1,0,96270.64
9996,9997,15569892,516,France,Male,35,10,57369.61,1,1,1,101699.77
9997,9998,15584532,709,France,Female,36,7,0.00,1,0,1,42085.58
9998,9999,15682355,772,Germany,Male,42,3,75075.31,2,1,0,92888.52


In [640]:
churn_syn_data = pd.read_csv("../data/external/churn_synth_data.csv")
churn_syn_data = churn_syn_data.drop('Surname', axis =1)
churn_syn_data

Unnamed: 0,RowNumber,CustomerId,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary
0,3968,15638646,649,Germany,Female,57,1,114497.866492,1,1,0,22330.427556
1,9187,15684548,666,France,Male,35,2,-265.657410,2,1,1,5950.815255
2,410,15762745,627,France,Male,39,9,-229.755239,2,1,0,193584.845336
3,7849,15623489,643,France,Female,41,0,-433.180487,1,1,1,190581.986865
4,5442,15583392,647,Germany,Male,34,1,141393.370762,1,1,1,185953.142922
...,...,...,...,...,...,...,...,...,...,...,...,...
4995,10089,15700046,713,France,Male,33,8,105554.114390,2,1,1,198484.580331
4996,9710,15569976,672,Germany,Male,39,10,131597.139904,1,1,1,5636.497466
4997,7045,15606641,636,Germany,Male,37,9,127552.991434,1,1,0,18879.491197
4998,9629,15810218,699,France,Male,29,8,-315.250353,2,1,1,2957.614788


In [655]:
churn_encoded = pd.get_dummies(churn, columns=['Gender', 'Geography'], dtype=int)
churn_encoded =  churn_encoded.iloc[:, :-1]

churn_syn_encoded = pd.get_dummies(churn_syn_data, columns=['Gender', 'Geography'], dtype=int)

In [656]:
resemblance_measure(churn_encoded.to_numpy(), churn_syn_encoded.to_numpy())

Resemblance Score: 0.8627583260012426


In [661]:
from sdv.metadata import SingleTableMetadata

metadata = SingleTableMetadata()
metadata.detect_from_csv("../data/raw/covtype.csv")




In [662]:
metadata

{
    "columns": {
        "Elevation": {
            "sdtype": "numerical"
        },
        "Aspect": {
            "sdtype": "numerical"
        },
        "Slope": {
            "sdtype": "numerical"
        },
        "Horizontal_Distance_To_Hydrology": {
            "sdtype": "numerical"
        },
        "Vertical_Distance_To_Hydrology": {
            "sdtype": "numerical"
        },
        "Horizontal_Distance_To_Roadways": {
            "sdtype": "numerical"
        },
        "Hillshade_9am": {
            "sdtype": "numerical"
        },
        "Hillshade_Noon": {
            "sdtype": "numerical"
        },
        "Hillshade_3pm": {
            "sdtype": "numerical"
        },
        "Horizontal_Distance_To_Fire_Points": {
            "sdtype": "numerical"
        },
        "Wilderness_Area1": {
            "sdtype": "categorical"
        },
        "Wilderness_Area2": {
            "sdtype": "categorical"
        },
        "Wilderness_Area3": {
            "sdtype"

In [660]:
covertype = pd.read_csv("../data/raw/covtype.csv")
# covertype
covertype.iloc[:, -1].unique()

array([5, 2, 1, 7, 3, 6, 4])

In [666]:
heloc = pd.read_csv("../data/raw/heloc.csv")
heloc

Unnamed: 0,ExternalRiskEstimate,MSinceOldestTradeOpen,MSinceMostRecentTradeOpen,AverageMInFile,NumSatisfactoryTrades,NumTrades60Ever2DerogPubRec,NumTrades90Ever2DerogPubRec,PercentTradesNeverDelq,MSinceMostRecentDelq,MaxDelq2PublicRecLast12M,...,MSinceMostRecentInqexcl7days,NumInqLast6M,NumInqLast6Mexcl7days,NetFractionRevolvingBurden,NetFractionInstallBurden,NumRevolvingTradesWBalance,NumInstallTradesWBalance,NumBank2NatlTradesWHighUtilization,PercentTradesWBalance,RiskPerformance
0,55,144,4,84,20,3,0,83,2,3,...,0,0,0,33,-8,8,1,1,69,Bad
1,61,58,15,41,2,4,4,100,-7,0,...,0,0,0,0,-8,0,-8,-8,0,Bad
2,67,66,5,24,9,0,0,100,-7,7,...,0,4,4,53,66,4,2,1,86,Bad
3,66,169,1,73,28,1,1,93,76,6,...,0,5,4,72,83,6,4,3,91,Bad
4,81,333,27,132,12,0,0,100,-7,7,...,0,1,1,51,89,3,1,0,80,Bad
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10454,73,131,5,57,21,0,0,95,80,6,...,7,0,0,26,-8,5,2,0,100,Good
10455,65,147,39,68,11,0,0,92,28,6,...,1,1,1,86,53,2,2,1,80,Bad
10456,74,129,6,64,18,1,1,100,-7,6,...,3,4,4,6,-8,5,-8,0,56,Bad
10457,72,234,12,113,42,2,2,96,35,6,...,6,0,0,19,-8,4,1,0,38,Bad


In [665]:
col_to_move = 'RiskPerformance'
df = pd.read_csv("../data/raw/heloc.csv")
df = df[[col for col in df.columns if col != col_to_move] + [col_to_move]]

df.to_csv('../data/raw/heloc.csv', index=False)

In [None]:
train_intrusion_data = pd.read_csv("../data/raw/intrusion.csv")

In [688]:
train_intrusion_data = pd.read_csv("../data/raw/intrusion.csv")
train_intrusion_data
# train_intrusion_data_en = pd.get_dummies(train_intrusion_data, columns = ['protocol_type', 'service','flag'])
# train_intrusion_data_en

Unnamed: 0,duration,protocol_type,service,flag,src_bytes,dst_bytes,land,wrong_fragment,urgent,hot,...,dst_host_srv_count,dst_host_same_srv_rate,dst_host_diff_srv_rate,dst_host_same_src_port_rate,dst_host_srv_diff_host_rate,dst_host_serror_rate,dst_host_srv_serror_rate,dst_host_rerror_rate,dst_host_srv_rerror_rate,class
0,0,tcp,ftp_data,SF,491,0,0,0,0,0,...,25,0.17,0.03,0.17,0.00,0.00,0.00,0.05,0.00,normal
1,0,udp,other,SF,146,0,0,0,0,0,...,1,0.00,0.60,0.88,0.00,0.00,0.00,0.00,0.00,normal
2,0,tcp,private,S0,0,0,0,0,0,0,...,26,0.10,0.05,0.00,0.00,1.00,1.00,0.00,0.00,anomaly
3,0,tcp,http,SF,232,8153,0,0,0,0,...,255,1.00,0.00,0.03,0.04,0.03,0.01,0.00,0.01,normal
4,0,tcp,http,SF,199,420,0,0,0,0,...,255,1.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,normal
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
25187,0,tcp,exec,RSTO,0,0,0,0,0,0,...,7,0.03,0.06,0.00,0.00,0.00,0.00,1.00,1.00,anomaly
25188,0,tcp,ftp_data,SF,334,0,0,0,0,0,...,39,1.00,0.00,1.00,0.18,0.00,0.00,0.00,0.00,anomaly
25189,0,tcp,private,REJ,0,0,0,0,0,0,...,13,0.05,0.07,0.00,0.00,0.00,0.00,1.00,1.00,anomaly
25190,0,tcp,nnsp,S0,0,0,0,0,0,0,...,20,0.08,0.06,0.00,0.00,1.00,1.00,0.00,0.00,anomaly


In [687]:
test_intrusion_data = pd.read_csv("../data/raw/Test_data.csv")
test_intrusion_data
# test_intrusion_data_en = pd.get_dummies(test_intrusion_data, columns = ['protocol_type', 'service','flag'])
# test_intrusion_data_en

Unnamed: 0,duration,protocol_type,service,flag,src_bytes,dst_bytes,land,wrong_fragment,urgent,hot,...,dst_host_count,dst_host_srv_count,dst_host_same_srv_rate,dst_host_diff_srv_rate,dst_host_same_src_port_rate,dst_host_srv_diff_host_rate,dst_host_serror_rate,dst_host_srv_serror_rate,dst_host_rerror_rate,dst_host_srv_rerror_rate
0,0,tcp,private,REJ,0,0,0,0,0,0,...,255,10,0.04,0.06,0.00,0.00,0.00,0.0,1.00,1.00
1,0,tcp,private,REJ,0,0,0,0,0,0,...,255,1,0.00,0.06,0.00,0.00,0.00,0.0,1.00,1.00
2,2,tcp,ftp_data,SF,12983,0,0,0,0,0,...,134,86,0.61,0.04,0.61,0.02,0.00,0.0,0.00,0.00
3,0,icmp,eco_i,SF,20,0,0,0,0,0,...,3,57,1.00,0.00,1.00,0.28,0.00,0.0,0.00,0.00
4,1,tcp,telnet,RSTO,0,15,0,0,0,0,...,29,86,0.31,0.17,0.03,0.02,0.00,0.0,0.83,0.71
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
22539,0,tcp,smtp,SF,794,333,0,0,0,0,...,100,141,0.72,0.06,0.01,0.01,0.01,0.0,0.00,0.00
22540,0,tcp,http,SF,317,938,0,0,0,0,...,197,255,1.00,0.00,0.01,0.01,0.01,0.0,0.00,0.00
22541,0,tcp,http,SF,54540,8314,0,0,0,2,...,255,255,1.00,0.00,0.00,0.00,0.00,0.0,0.07,0.07
22542,0,udp,domain_u,SF,42,42,0,0,0,0,...,255,252,0.99,0.01,0.00,0.00,0.00,0.0,0.00,0.00


In [682]:
gen_intrusion = pd.read_csv("../data/external/intrusion_synth_data.csv")
gen_intrusion
gen_intrusion_encoded = pd.get_dummies(gen_intrusion, columns = ['protocol_type', 'service','flag'])
gen_intrusion_encoded

Unnamed: 0,duration,src_bytes,dst_bytes,land,wrong_fragment,urgent,hot,num_failed_logins,logged_in,num_compromised,...,service_smtp,service_telnet,service_time,service_urp_i,flag_REJ,flag_RSTO,flag_RSTR,flag_S0,flag_SF,flag_SH
0,0,-174,2715,0,0,0,0,0,1,0,...,False,False,False,False,False,False,False,False,True,False
1,-1,-132,-102,0,0,0,0,0,0,0,...,False,False,False,False,False,False,False,False,True,False
2,1,845,-144,0,0,0,0,0,0,0,...,False,False,False,False,False,False,False,False,True,False
3,-1,234,43,0,0,0,0,0,0,0,...,False,False,False,False,False,False,False,False,True,False
4,-2,741,-270,0,0,0,0,0,0,0,...,False,False,False,False,False,False,False,True,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4995,1,921,331,0,0,0,0,0,1,0,...,False,False,False,False,False,False,False,False,True,False
4996,-1,1302,-229,0,0,0,0,0,0,0,...,False,False,False,False,False,False,False,True,False,False
4997,8,2211,1570,0,0,0,0,0,1,0,...,False,True,False,False,False,False,False,False,True,False
4998,1,1135,168,0,0,0,0,0,0,0,...,False,False,False,False,False,False,False,False,True,False


In [684]:
resemblance_measure(test_intrusion_data_en.to_numpy(), gen_intrusion_encoded.to_numpy())

ValueError: x and y must have the same length.