In [1]:
# %pip install "sdv<1.0"   # this notebook uses the legacy sdv.tabular API, which SDV 1.0 removed

In [2]:
import numpy as np
import pandas as pd
from datetime import datetime as dt
import re
import time
from sdv.tabular import GaussianCopula, CTGAN, CopulaGAN, TVAE
from sdv.constraints import Range, ScalarRange, Inequality, ScalarInequality, create_custom_constraint
from copy import deepcopy
import warnings
from itertools import combinations

In [3]:
# Silence every warning for the rest of the session.
# NOTE(review): this also hides pandas/SDV deprecation warnings; consider
# narrowing the filter to the specific categories that are known to be noisy.
warnings.filterwarnings("ignore")

In [4]:
# Load the raw table and discard the 'Name' column straight away.
data = pd.read_csv('data.csv').drop(columns=['Name'])
data.head()

Unnamed: 0,Date_of_birth,Opening_date,Expiry_date,Credit_limit,Statement_balance,Available_credit
0,1949-08-17,2008-09-02,2013-09-02,38000.0,29297.69,8702.31
1,1938-07-04,2019-09-08,2024-09-08,19500.0,15628.5,3871.5
2,1927-03-28,2019-04-23,2024-04-23,11500.0,7388.34,4111.66
3,1936-01-17,2021-10-27,2026-10-27,68500.0,32060.39,36439.61
4,1960-09-05,2010-10-02,2015-10-02,72000.0,56148.13,15851.87


In [5]:
# Strip the dashes from the ISO date strings with vectorised string ops
# instead of calling a Python lambda once per row.
opening_digits = data['Opening_date'].str.replace('-', '', regex=False)
expiry_digits = data['Expiry_date'].str.replace('-', '', regex=False)

# Client_id is the opening digits followed by the expiry digits (16 digits),
# so it is cast to an explicit 64-bit integer.
data['Client_id'] = (opening_digits + expiry_digits).astype('int64')
data['Opening'] = opening_digits.astype('int64')
data['Expiry'] = expiry_digits.astype('int64')

# Put Client_id first; .copy() makes the result an independent frame so later
# column assignments do not write into a slice of the original.
data = data[['Client_id', 'Date_of_birth', 'Opening_date', 'Expiry_date', 'Credit_limit', 'Statement_balance', 'Available_credit', 'Opening', 'Expiry']].copy()

data.head()

Unnamed: 0,Client_id,Date_of_birth,Opening_date,Expiry_date,Credit_limit,Statement_balance,Available_credit,Opening,Expiry
0,2008090220130902,1949-08-17,2008-09-02,2013-09-02,38000.0,29297.69,8702.31,20080902,20130902
1,2019090820240908,1938-07-04,2019-09-08,2024-09-08,19500.0,15628.5,3871.5,20190908,20240908
2,2019042320240423,1927-03-28,2019-04-23,2024-04-23,11500.0,7388.34,4111.66,20190423,20240423
3,2021102720261027,1936-01-17,2021-10-27,2026-10-27,68500.0,32060.39,36439.61,20211027,20261027
4,2010100220151002,1960-09-05,2010-10-02,2015-10-02,72000.0,56148.13,15851.87,20101002,20151002


In [6]:
# Inspect the column dtypes before any preprocessing is applied.
data.dtypes

Client_id              int64
Date_of_birth         object
Opening_date          object
Expiry_date           object
Credit_limit         float64
Statement_balance    float64
Available_credit     float64
Opening                int64
Expiry                 int64
dtype: object

In [7]:
def is_valid_arithmetic_equality(column_names, data):
    """
    Row-wise check that ``first - second == third``.

    Parameters:
        column_names: sequence
            Three column labels ``[first, second, third]``.
        data: Pandas DataFrame
            Table holding those columns.

    Returns:
        Boolean Pandas Series, True where the equality holds exactly.
    """
    minuend = data[column_names[0]]
    subtrahend = data[column_names[1]]
    expected = data[column_names[2]]
    return minuend - subtrahend == expected
    
# Wrap the validity check in an SDV custom constraint class.
# NOTE(review): not referenced anywhere else in the visible notebook.
arithmetic_equality_constraint = create_custom_constraint(is_valid_fn = is_valid_arithmetic_equality)

def is_valid_arithmetic_inequality(column_names, data):
    """
    Row-wise check that ``high >= low + offset``.

    Parameters:
        column_names: sequence
            ``[high, low, offset]`` - two column labels followed by the
            constant offset (a number, not a column label).
        data: Pandas DataFrame
            Table holding the two columns.

    Returns:
        Boolean Pandas Series, True where the inequality holds.
    """
    high_label = column_names[0]
    low_label = column_names[1]
    offset = column_names[2]
    lower_bound = data[low_label] + offset
    return data[high_label] >= lower_bound

# Wrap the validity check in an SDV custom constraint class; it is instantiated
# with column_names=[high, low, offset] in SD_generator.create_constraints.
arithmetic_inequality_constraint = create_custom_constraint(is_valid_fn = is_valid_arithmetic_inequality)

def is_valid_inclusive(column_names, data):
    """
    Row-wise check that the digits of one column appear inside another column
    at a fixed offset.

    Parameters:
        column_names: sequence
            ``[container, contained, start]`` - two column labels followed by
            the zero-based digit offset at which ``contained`` must start.
        data: Pandas DataFrame
            Table holding the two columns. Values are truncated to integers
            before being compared as text.

    Returns:
        Boolean Pandas Series indexed like ``data``, True where the digits of
        ``contained`` equal the slice of ``container`` beginning at ``start``.
    """
    start = column_names[2]
    # Cast column-wise so large integer ids never pass through float, which is
    # what happens when iterrows() builds a mixed-dtype row.
    container = data[column_names[0]].astype('int64').astype(str)
    contained = data[column_names[1]].astype('int64').astype(str)
    matches = [
        outer[start:start + len(inner)] == inner
        for outer, inner in zip(container, contained)
    ]
    # Reuse data.index so the result can be used directly as a boolean mask
    # on ``data``; an explicit dtype keeps the empty case boolean.
    return pd.Series(matches, index=data.index, dtype=bool)

# Wrap the validity check in an SDV custom constraint class.
# NOTE(review): not referenced anywhere else in the visible notebook.
inclusive_constraint = create_custom_constraint(is_valid_fn = is_valid_inclusive)

In [8]:
class SD_generator():
    """
    Detect deterministic relationships between the columns of a dataset, turn
    them into SDV constraints, train SDV tabular models and sample synthetic
    data.

    Attributes
    ----------
    data : Pandas DataFrame
        The input dataset. The object is stored by reference, not copied, so
        preprocess() modifies the caller's frame.

    inequality_threshold : float
        Fraction of rows that must satisfy "A > B" for the pair to be recorded.

    arithmetic_equality_threshold : float
        Fraction of rows that must satisfy "A = B + C" for it to be recorded.

    inclusive_threshold : float
        Fraction of rows in which one column's digits must contain another's.

    inequality_dict : dictionary
        ``{A: [B, ...]}`` where A is greater than each B, after relationships
        implied by transitivity have been removed.

    temp_dict : dictionary
        The same mapping as inequality_dict before the transitive pruning;
        detect_arithmetic_equality() searches it.

    arithmetic_equality_dict : dictionary
        ``{A: [[B, C], ...]}`` for relationships "A = B + C".

    arithmetic_inequality_dict : dictionary
        ``{A: [[B, X], ...]}`` for relationships "A >= B + X".

    inclusive_dict : dictionary
        ``{A: [[B, start], ...]}`` where the digits of B appear inside the
        digits of A, beginning at offset ``start``.

    inequality_runtime, arithmetic_equality_runtime,
    arithmetic_inequality_runtime, inclusive_runtime : float
        Wall-clock seconds spent in the corresponding detection method.

    arithmetic_equality_flag : bool
        If True, generate() recomputes A as B + C on the sampled rows.

    inclusive_flag : bool
        If True, generate() rewrites the digits of A from B on the sampled rows.

    constraints : list
        Constraint objects passed to the SDV models.

    models : dictionary
        Trained SDV models keyed by model name.

    Methods
    -------
    preprocess():
        Convert date-string columns to float day offsets and drop rows with
        missing values.

    detect_inequality():
        Detect "A > B" relationships between two numeric columns.

    detect_arithmetic_equality():
        Detect "A = B + C" relationships among three columns.

    detect_arithmetic_inequality():
        Detect "A >= B + X" relationships between two columns.

    detect_inclusive():
        Detect columns whose digits are contained in another column.

    create_constraints():
        Build the constraint list used for model training.

    apply_model():
        Train one SDV model with the constraints and store it in ``models``.

    generate():
        Sample synthetic rows from a previously trained model.
    """
    def __init__(self, data, inequality_threshold, arithmetic_equality_threshold, inclusive_threshold,
                 inequality_dict=None, inequality_runtime=0,
                 arithmetic_equality_dict=None, arithmetic_equality_runtime=0, arithmetic_equality_flag=False,
                 arithmetic_inequality_dict=None, arithmetic_inequality_runtime=0,
                 inclusive_dict=None, inclusive_runtime=0, inclusive_flag=False,
                 constraints=None, models=None, temp_dict=None):
        """
        Store the dataset, the detection thresholds and the result containers.

        Parameters
        ----------
            data : Pandas DataFrame
                The input dataset (kept by reference).

            inequality_threshold, arithmetic_equality_threshold, inclusive_threshold : float
                Cut-off fractions used by the detection methods.

            inequality_dict, arithmetic_equality_dict, arithmetic_inequality_dict,
            inclusive_dict, models, temp_dict : dictionary, optional
                Pre-filled containers. When omitted (None) every instance gets
                its own new empty dictionary; a ``{}`` default in the signature
                would be one object shared by all instances.

            constraints : list, optional
                Pre-filled constraint list; a new empty list when omitted.

            inequality_runtime, arithmetic_equality_runtime,
            arithmetic_inequality_runtime, inclusive_runtime : float
                Initial runtime values.

            arithmetic_equality_flag, inclusive_flag : bool
                Initial values of the post-processing switches used by generate().
        """
        self.data = data
        self.inequality_threshold = inequality_threshold
        self.arithmetic_equality_threshold = arithmetic_equality_threshold
        self.inclusive_threshold = inclusive_threshold
        self.inequality_dict = {} if inequality_dict is None else inequality_dict
        self.inequality_runtime = inequality_runtime
        self.arithmetic_equality_dict = {} if arithmetic_equality_dict is None else arithmetic_equality_dict
        self.arithmetic_equality_runtime = arithmetic_equality_runtime
        self.arithmetic_equality_flag = arithmetic_equality_flag
        self.arithmetic_inequality_dict = {} if arithmetic_inequality_dict is None else arithmetic_inequality_dict
        self.arithmetic_inequality_runtime = arithmetic_inequality_runtime
        self.inclusive_dict = {} if inclusive_dict is None else inclusive_dict
        self.inclusive_runtime = inclusive_runtime
        self.inclusive_flag = inclusive_flag
        self.constraints = [] if constraints is None else constraints
        self.models = {} if models is None else models
        self.temp_dict = {} if temp_dict is None else temp_dict

    @staticmethod
    def _is_number_column(column):
        """Return True for integer/float columns of any width (booleans excluded)."""
        return pd.api.types.is_numeric_dtype(column) and not pd.api.types.is_bool_dtype(column)

    @staticmethod
    def _report(relations, runtime):
        """Print how many relationships a detector stored and how long it took."""
        print(sum(len(found) for found in relations.values()), "relationships detected")
        print("Execution Time:", round(runtime, 4), "seconds")

    def preprocess(self):
        """
        Convert date columns to float day offsets from 1900-01-01, then drop
        every row that has a missing value.

        A column is treated as a date column when it holds text and all of its
        non-blank entries look like ``YYYY-MM-DD`` and parse as real dates.
        Blank or missing entries become NaN and are removed by the final dropna.
        Columns that do not qualify are left untouched.

        Returns:
            None.

        Output:
            Running finished message with execution time.
        """
        st = time.time()

        ref_dt = pd.Timestamp('1900-01-01')

        for col in self.data.columns:
            column = self.data[col]
            if not (pd.api.types.is_object_dtype(column) or pd.api.types.is_string_dtype(column)):
                continue

            text = column[column.notna()].astype(str).str.strip()
            filled = text[text != ""]
            # Inspect every value, not only the first row, before deciding.
            if filled.empty or not filled.str.match(r'^[0-9]{4}-[0-9]{2}-[0-9]{2}$').all():
                continue

            try:
                parsed = pd.to_datetime(filled, format="%Y-%m-%d")
            except ValueError:
                # Right shape but not a real date (e.g. month 13): keep as text.
                continue

            days = (parsed - ref_dt) / np.timedelta64(1, 'D')
            # reindex restores the blank/missing rows as NaN.
            self.data[col] = days.reindex(column.index).astype(float)

        # Operate on the instance's frame, not on a notebook-level global.
        self.data.dropna(axis=0, inplace=True)

        et = time.time()
        elapsed_time = et - st
        print("Date types reformatted and missing values handled successfully!\nExecution Time:"
              , round(elapsed_time, 4), "seconds")

    def detect_inequality(self):
        """
        Detect "A > B" relationships between every pair of numeric columns and
        store them in inequality_dict (key greater than its values).

        For a pair (A, B) the share of rows with A > B is computed. If it
        reaches inequality_threshold, B is recorded under A; otherwise, if the
        share of the remaining rows (A <= B) reaches the threshold, A is
        recorded under B. The unpruned result is kept in temp_dict; in
        inequality_dict a relationship "A > C" is dropped when some B exists
        with "A > B" and "B > C".

        Does nothing if inequality_dict is already populated.

        Returns:
            None.

        Output:
            Number of relationships detected with execution time.
        """
        if self.inequality_dict:
            return

        st = time.time()

        numeric_columns = [col for col in self.data.columns if self._is_number_column(self.data[col])]
        for first, second in combinations(numeric_columns, 2):
            # Mean of a boolean column is the fraction of True rows.
            ratio = (self.data[first] > self.data[second]).mean()

            if ratio >= self.inequality_threshold:
                self.inequality_dict.setdefault(first, []).append(second)
            elif (1 - ratio) >= self.inequality_threshold:
                self.inequality_dict.setdefault(second, []).append(first)

        # Prune against a frozen copy so that the outcome does not depend on
        # the order in which the keys are processed.
        full_relation = deepcopy(self.inequality_dict)
        self.temp_dict = full_relation
        for key, lowers in full_relation.items():
            implied = {lowest for middle in lowers for lowest in full_relation.get(middle, [])}
            self.inequality_dict[key] = [low for low in lowers if low not in implied]

        et = time.time()
        self.inequality_runtime = et - st

        self._report(self.inequality_dict, self.inequality_runtime)

    def detect_arithmetic_equality(self):
        """
        Detect relationships "A = B + C" among three columns.

        Candidates come from temp_dict: for every key A, each pair (B, C) of
        columns recorded below A is tested. The comparison is exact, so
        floating-point rounding in the data counts as a mismatch.

        Does nothing if arithmetic_equality_dict is already populated.

        Returns:
            None.

        Output:
            Number of relationships detected with execution time.
        """
        if self.arithmetic_equality_dict:
            return

        st = time.time()

        for key, lowers in self.temp_dict.items():
            for first, second in combinations(lowers, 2):
                ratio = (self.data[key] == self.data[first] + self.data[second]).mean()
                if ratio >= self.arithmetic_equality_threshold:
                    self.arithmetic_equality_dict.setdefault(key, []).append([first, second])

        et = time.time()
        self.arithmetic_equality_runtime = et - st

        self._report(self.arithmetic_equality_dict, self.arithmetic_equality_runtime)

    def detect_arithmetic_inequality(self):
        """
        For every relationship in inequality_dict, record the smallest observed
        gap X so that "A >= B + X" holds for all rows of the data.

        Does nothing if arithmetic_inequality_dict is already populated.

        Returns:
            None.

        Output:
            Number of relationships detected with execution time.
        """
        if self.arithmetic_inequality_dict:
            return

        st = time.time()

        for key, lowers in self.inequality_dict.items():
            for value in lowers:
                smallest_gap = float((self.data[key] - self.data[value]).min())
                self.arithmetic_inequality_dict.setdefault(key, []).append([value, smallest_gap])

        et = time.time()
        self.arithmetic_inequality_runtime = et - st

        self._report(self.arithmetic_inequality_dict, self.arithmetic_inequality_runtime)

    def detect_inclusive(self):
        """
        Detect pairs of numeric columns where the digits of one column appear
        inside the digits of the other.

        Values are truncated to integers and compared as text, the same way
        generate() rewrites them. Within a pair, the column with more digits in
        the first row is the container. The stored offset is the position of
        the match in the first row that contains one.
        NOTE(review): one offset is assumed to be valid for every row.

        Does nothing if inclusive_dict is already populated.

        Returns:
            None.

        Output:
            Number of relationships detected with execution time.
        """
        if self.inclusive_dict:
            return

        st = time.time()

        if len(self.data.index) > 0:
            digits = {
                col: self.data[col].astype('int64').astype(str)
                for col in self.data.columns
                if self._is_number_column(self.data[col])
            }

            for first, second in combinations(list(digits), 2):
                # Positional access: label 0 may be gone after dropna().
                first_len = len(digits[first].iloc[0])
                second_len = len(digits[second].iloc[0])
                if first_len == second_len:
                    continue
                key, value = (first, second) if first_len > second_len else (second, first)

                offsets = [outer.find(inner) for outer, inner in zip(digits[key], digits[value])]
                hits = [offset for offset in offsets if offset >= 0]
                ratio = len(hits) / len(offsets)

                if hits and ratio >= self.inclusive_threshold:
                    self.inclusive_dict.setdefault(key, []).append([value, hits[0]])

        et = time.time()
        self.inclusive_runtime = et - st

        self._report(self.inclusive_dict, self.inclusive_runtime)

    def create_constraints(self, inequality=False,
                           arithmetic_equality=False,
                           arithmetic_inequality=False,
                           inclusive=False):
        """
        Create constraints for synthetic data generation model training.

        Constraint objects are appended to ``constraints``; calling this method
        again with the same options appends them again.

        Parameters:
            inequality: bool, default=False
                If True, add an Inequality constraint for every entry of
                inequality_dict.

            arithmetic_equality: bool, default=False
                If True, set arithmetic_equality_flag so that generate()
                enforces "A = B + C" on the sampled rows.

            arithmetic_inequality: bool, default=False
                If True, add a custom constraint for every "A >= B + X" entry
                of arithmetic_inequality_dict.

            inclusive: bool, default=False
                If True, set inclusive_flag so that generate() enforces the
                inclusive relationships on the sampled rows.

        Returns:
            None.

        Output:
            Running finished message with execution time.
        """
        st = time.time()

        if inequality:
            for key, lowers in self.inequality_dict.items():
                for value in lowers:
                    self.constraints.append(Inequality(low_column_name=value, high_column_name=key))

        if arithmetic_equality:
            self.arithmetic_equality_flag = True

        if arithmetic_inequality:
            for key, gaps in self.arithmetic_inequality_dict.items():
                for value, gap in gaps:
                    # The scalar gap travels as the third "column name".
                    self.constraints.append(arithmetic_inequality_constraint(column_names=[key, value, gap]))

        if inclusive:
            self.inclusive_flag = True

        et = time.time()
        elapsed_time = et - st

        print("Constrainsts created successfully!\nExecution Time:"
              , round(elapsed_time, 4), "seconds")

    def apply_model(self, model_name=None):
        """
        Train a specific model in SDV with constraints.
        Store the trained model in dictionary "models".

        Parameters:
            model_name: string
                The name of the models in SDV.
                Eg. "GaussianCopula", "CTGAN", "CopulaGAN" and "TVAE".

        Returns:
            None.
        """
        st = time.time()

        if model_name is None:
            print("No input for model name!")
            return None

        model_classes = {
            'GaussianCopula': GaussianCopula,
            'CTGAN': CTGAN,
            'CopulaGAN': CopulaGAN,
            'TVAE': TVAE,
        }
        model_class = model_classes.get(model_name) if isinstance(model_name, str) else None
        if model_class is None:
            print("Wrong model name!")
            return None

        model = model_class(constraints=self.constraints)
        model.fit(self.data)
        self.models[model_name] = model

        et = time.time()
        elapsed_time = et - st

        # The GaussianCopula message has always been shorter than the others.
        if model_name == 'GaussianCopula':
            print("Execution Time:", round(elapsed_time, 4), "seconds")
        else:
            print(f"Execution Time for training {model_name}:", round(elapsed_time, 4), "seconds")

    def generate(self, model_name=None, num_rows=0):
        """
        Generate synthetic data of a specific number of rows with a specific
        pre-trained model.

        After sampling, the arithmetic-equality and inclusive relationships are
        enforced on the sampled rows when the corresponding flags are set.

        Parameters:
            model_name: string
                The name of the models in SDV.
                Eg. "GaussianCopula", "CTGAN", "CopulaGAN" and "TVAE".

            num_rows: integer
                Number of rows needed to generate; must be greater than 0.

        Returns:
            Synthetic data in DataFrame type, or None (after printing a
            message) when the arguments are invalid or the model has not been
            trained with apply_model().
        """
        st = time.time()

        if model_name is None:
            print("No input for model name!")
            return None
        if isinstance(num_rows, bool) or not isinstance(num_rows, (int, np.integer)) or num_rows <= 0:
            print("Number of rows has to be integer and greater than 0!")
            return None
        if model_name not in ["GaussianCopula", "CTGAN", "CopulaGAN", "TVAE"]:
            print("Wrong model name!\nAccepted model name: GaussianCopula, CTGAN, CopulaGAN and TVAE.")
            return None
        if model_name not in self.models:
            print(f"{model_name} model has not been trained yet! Run apply_model(\"{model_name}\") first.")
            return None

        syn_data = self.models[model_name].sample(num_rows=int(num_rows))

        if self.arithmetic_equality_flag:
            for key, pairs in self.arithmetic_equality_dict.items():
                # Only the first detected pair is used to rebuild the key column.
                first, second = pairs[0]
                syn_data[key] = syn_data[first] + syn_data[second]

        if self.inclusive_flag:
            for key, contained in self.inclusive_dict.items():
                key_digits = syn_data[key].astype('int64').astype(str).tolist()
                for value, start in contained:
                    value_digits = syn_data[value].astype('int64').astype(str).tolist()
                    # Overwrite exactly len(part) digits beginning at `start`;
                    # the width comes from the contained column's own value.
                    key_digits = [
                        whole[:start] + part + whole[start + len(part):]
                        for whole, part in zip(key_digits, value_digits)
                    ]
                syn_data[key] = [int(whole) for whole in key_digits]

        et = time.time()
        elapsed_time = et - st
        print(f"Synthetic data generated successfully with {model_name} model!\nExecution Time:", round(elapsed_time, 4), "seconds")

        return syn_data
    
            
            

In [9]:
# Each threshold is the fraction of rows that must satisfy a relationship for
# it to be recorded. The generator keeps a reference to `data` (no copy is made).
d = SD_generator(data, inequality_threshold=0.95, arithmetic_equality_threshold=0.95, inclusive_threshold=1.0)

In [10]:
# Convert the date strings to float day offsets from 1900-01-01 and drop rows with missing values.
d.preprocess()

Date types reformatted and missing values handled successfully!
Execution Time: 0.0997 seconds


In [11]:
# Check the dtypes again after preprocessing.
d.data.dtypes

Client_id              int64
Date_of_birth        float64
Opening_date         float64
Expiry_date          float64
Credit_limit         float64
Statement_balance    float64
Available_credit     float64
Opening                int64
Expiry                 int64
dtype: object

In [12]:
# Find column pairs where one column exceeds the other in at least
# inequality_threshold of the rows (each key is greater than its values).
d.detect_inequality()
d.inequality_dict

8 relationships detected
Execution Time: 0.6006 seconds


{'Client_id': ['Expiry'],
 'Opening_date': ['Date_of_birth'],
 'Expiry_date': ['Opening_date'],
 'Opening': ['Expiry_date', 'Credit_limit'],
 'Expiry': ['Opening'],
 'Credit_limit': ['Statement_balance', 'Available_credit']}

In [13]:
# Among the columns recorded below each key, look for pairs that sum to the key (A = B + C).
d.detect_arithmetic_equality()
d.arithmetic_equality_dict

1 relationships detected
Execution Time: 1.4553 seconds


{'Credit_limit': [['Statement_balance', 'Available_credit']]}

In [14]:
# For each detected inequality, record the smallest gap X so that A >= B + X.
d.detect_arithmetic_inequality()
d.arithmetic_inequality_dict

8 relationships detected
Execution Time: 0.6194 seconds


{'Client_id': [['Expiry', 2000010600000000.0]],
 'Opening_date': [['Date_of_birth', 6599.0]],
 'Expiry_date': [['Opening_date', 1826.0]],
 'Opening': [['Expiry_date', 19961750.0], ['Credit_limit', 19900301.0]],
 'Expiry': [['Opening', 49999.0]],
 'Credit_limit': [['Statement_balance', 3.9400000000023283],
  ['Available_credit', 7.260000000000218]]}

In [15]:
# Find columns whose digits appear inside another column's digits, together with the digit offset.
d.detect_inclusive()
d.inclusive_dict

2 relationships detected
Execution Time: 0.5797 seconds


{'Client_id': [['Opening', 0], ['Expiry', 8]]}

In [16]:
# Inequality and arithmetic-inequality relationships become constraint objects;
# the arithmetic-equality and inclusive options only set flags that generate()
# applies to the sampled rows.
d.create_constraints(inequality=True, arithmetic_equality=True, arithmetic_inequality=True, inclusive=True)

Constrainsts created successfully!
Execution Time: 0.0002 seconds


In [17]:
# List the constraint objects that will be passed to the models.
d.constraints

[<sdv.constraints.tabular.Inequality at 0x7fc7629abaf0>,
 <sdv.constraints.tabular.Inequality at 0x7fc7629abd00>,
 <sdv.constraints.tabular.Inequality at 0x7fc7629abbe0>,
 <sdv.constraints.tabular.Inequality at 0x7fc7629ab9d0>,
 <sdv.constraints.tabular.Inequality at 0x7fc7629abac0>,
 <sdv.constraints.tabular.Inequality at 0x7fc7629ab9a0>,
 <sdv.constraints.tabular.Inequality at 0x7fc7629ab4f0>,
 <sdv.constraints.tabular.Inequality at 0x7fc7629ab2b0>,
 <sdv.constraints.tabular.create_custom_constraint.<locals>.CustomConstraint at 0x7fc7629c0b80>,
 <sdv.constraints.tabular.create_custom_constraint.<locals>.CustomConstraint at 0x7fc7629c0df0>,
 <sdv.constraints.tabular.create_custom_constraint.<locals>.CustomConstraint at 0x7fc7629c06a0>,
 <sdv.constraints.tabular.create_custom_constraint.<locals>.CustomConstraint at 0x7fc7629c0220>,
 <sdv.constraints.tabular.create_custom_constraint.<locals>.CustomConstraint at 0x7fc7629c0c40>,
 <sdv.constraints.tabular.create_custom_constraint.<locals>

In [18]:
# Train each supported model with the same constraint list; trained models are stored in d.models.
d.apply_model("GaussianCopula")
d.apply_model("CTGAN")
d.apply_model("CopulaGAN")
d.apply_model("TVAE")

Execution Time: 0.2118 seconds
Execution Time for training CTGAN: 50.8866 seconds
Execution Time for training CopulaGAN: 52.6716 seconds
Execution Time for training TVAE: 19.1781 seconds


In [19]:
# Show the trained models, keyed by model name.
d.models

{'GaussianCopula': <sdv.tabular.copulas.GaussianCopula at 0x7fc762a1cf10>,
 'CTGAN': <sdv.tabular.ctgan.CTGAN at 0x7fc7629c07c0>,
 'CopulaGAN': <sdv.tabular.copulagan.CopulaGAN at 0x7fc762abda30>,
 'TVAE': <sdv.tabular.ctgan.TVAE at 0x7fc762ab2400>}

In [20]:
# Sample 200 synthetic rows from the trained GaussianCopula model.
syn_data1 = d.generate("GaussianCopula", num_rows=200)
syn_data1.head()

Sampling rows: 100%|█████████████████████████| 200/200 [00:00<00:00, 813.58it/s]

Synthetic data generated successfully with GaussianCopula model!
Execution Time: 0.2939 seconds





Unnamed: 0,Client_id,Date_of_birth,Opening_date,Expiry_date,Credit_limit,Statement_balance,Available_credit,Opening,Expiry
0,2011879120256891,9832.0,43641.271044,46053.0,65921.974475,16818.11,49103.864475,20118791,20256891
1,2011782220262255,21617.0,41627.237784,46383.0,57452.23051,56722.89,729.34051,20117822,20262255
2,2011812220248568,4225.0,39424.397528,45884.0,40422.358478,31145.71,9276.648478,20118122,20248568
3,2011030120183377,1230.0,37596.610848,43255.0,52815.511842,4184.5,48631.011842,20110301,20183377
4,2011748420227645,11147.0,42654.979703,45408.0,58361.66559,24232.61,34129.05559,20117484,20227645


In [21]:
# Sample 200 synthetic rows from the trained CTGAN model.
syn_data2 = d.generate("CTGAN", num_rows=200)
syn_data2.head()

Sampling rows: 100%|█████████████████████████| 200/200 [00:00<00:00, 672.59it/s]

Synthetic data generated successfully with CTGAN model!
Execution Time: 0.3438 seconds





Unnamed: 0,Client_id,Date_of_birth,Opening_date,Expiry_date,Credit_limit,Statement_balance,Available_credit,Opening,Expiry
0,2001652220238918,27558.0,37261.986385,42896.0,37567.825466,26741.4,10826.425466,20016522,20238918
1,2004094420103757,15393.0,30797.57972,44639.0,3325.38,3321.44,3.94,20040944,20103757
2,2008598520196065,20004.0,29233.27782,44089.0,25775.459403,6849.17,18926.289403,20085985,20196065
3,2014388520236654,9749.0,24516.243006,45599.0,15066.17,15062.23,3.94,20143885,20236654
4,2013115520234938,21692.0,30006.896415,46532.0,39612.428196,2233.79,37378.638196,20131155,20234938


In [22]:
# Sample 200 synthetic rows from the trained CopulaGAN model.
syn_data3 = d.generate("CopulaGAN", num_rows=200)
syn_data3.head()

Sampling rows: 100%|█████████████████████████| 200/200 [00:00<00:00, 390.27it/s]


Synthetic data generated successfully with CopulaGAN model!
Execution Time: 0.5645 seconds


Unnamed: 0,Client_id,Date_of_birth,Opening_date,Expiry_date,Credit_limit,Statement_balance,Available_credit,Opening,Expiry
0,2005904920238238,22772.0,41737.853676,46653.0,59289.59927,2385.51,56904.08927,20059049,20238238
1,2008405020217398,25757.0,39419.628496,46323.0,120648.23583,62495.33,58152.90583,20084050,20217398
2,2018813020270244,18534.0,41427.449409,45600.0,139993.835381,46069.44,93924.395381,20188130,20270244
3,2007345920177711,22613.0,40701.523879,46746.0,125195.170214,54823.0,70372.170214,20073459,20177711
4,2010424020215324,10542.0,20555.341066,39116.0,50155.637824,26405.15,23750.487824,20104240,20215324


In [23]:
# Sample 200 synthetic rows from the trained TVAE model. The result gets its
# own name so that the CopulaGAN sample in syn_data3 is not overwritten.
syn_data4 = d.generate("TVAE", num_rows=200)
syn_data4.head()

Sampling rows: 100%|████████████████████████| 200/200 [00:00<00:00, 1007.93it/s]


Synthetic data generated successfully with TVAE model!
Execution Time: 0.2491 seconds


Unnamed: 0,Client_id,Date_of_birth,Opening_date,Expiry_date,Credit_limit,Statement_balance,Available_credit,Opening,Expiry
0,2008930620143421,26564.0,39109.831808,42319.0,95051.289558,2621.52,92429.769558,20089306,20143421
1,2003333420090478,26520.0,35656.355004,39910.0,6952.164604,3701.64,3250.524604,20033334,20090478
2,2016098420227387,21570.0,41085.431649,44898.0,77071.989322,1870.25,75201.739322,20160984,20227387
3,2013861020212826,19731.0,39353.967976,43965.0,4714.39,4710.45,3.94,20138610,20212826
4,2019900820264600,33158.0,44196.414855,46281.0,80780.012763,49820.97,30959.042763,20199008,20264600
