In [1]:
# pip install sdv

In [2]:
import numpy as np
import pandas as pd
from datetime import datetime as dt
import re
import time
from sdv.tabular import GaussianCopula, CTGAN, CopulaGAN, TVAE
from sdv.constraints import Range, ScalarRange, Inequality, ScalarInequality, create_custom_constraint
from copy import deepcopy
import warnings

In [3]:
warnings.filterwarnings("ignore")

In [4]:
data = pd.read_csv('data.csv')
data.drop(columns=['Name'], axis=1, inplace=True)
data.head()

Unnamed: 0,Date_of_birth,Opening_date,Expiry_date,Credit_limit,Statement_balance,Available_credit
0,1949-08-17,2008-09-02,2013-09-02,38000.0,29297.69,8702.31
1,1938-07-04,2019-09-08,2024-09-08,19500.0,15628.5,3871.5
2,1927-03-28,2019-04-23,2024-04-23,11500.0,7388.34,4111.66
3,1936-01-17,2021-10-27,2026-10-27,68500.0,32060.39,36439.61
4,1960-09-05,2010-10-02,2015-10-02,72000.0,56148.13,15851.87


In [5]:
data['Opening'] = data['Opening_date'].apply(lambda x: x.replace('-', ''))
data['Expiry'] = data['Expiry_date'].apply(lambda x: x.replace('-', ''))
data['Client_id'] = data['Opening'] + data['Expiry']
data['Client_id'] = data['Client_id'].apply(lambda x: int(x))
data['Opening'] = data['Opening'].apply(lambda x: int(x))
data['Expiry'] = data['Expiry'].apply(lambda x: int(x))

data = data[['Client_id', 'Date_of_birth', 'Opening_date', 'Expiry_date', 'Credit_limit', 'Statement_balance', 'Available_credit', 'Opening', 'Expiry']]

data.head()

Unnamed: 0,Client_id,Date_of_birth,Opening_date,Expiry_date,Credit_limit,Statement_balance,Available_credit,Opening,Expiry
0,2008090220130902,1949-08-17,2008-09-02,2013-09-02,38000.0,29297.69,8702.31,20080902,20130902
1,2019090820240908,1938-07-04,2019-09-08,2024-09-08,19500.0,15628.5,3871.5,20190908,20240908
2,2019042320240423,1927-03-28,2019-04-23,2024-04-23,11500.0,7388.34,4111.66,20190423,20240423
3,2021102720261027,1936-01-17,2021-10-27,2026-10-27,68500.0,32060.39,36439.61,20211027,20261027
4,2010100220151002,1960-09-05,2010-10-02,2015-10-02,72000.0,56148.13,15851.87,20101002,20151002


In [6]:
data.dtypes

Client_id              int64
Date_of_birth         object
Opening_date          object
Expiry_date           object
Credit_limit         float64
Statement_balance    float64
Available_credit     float64
Opening                int64
Expiry                 int64
dtype: object

In [7]:
def is_valid_arithmetic_equality(column_names, data):
    is_equal = [data[column_names[0]] - data[column_names[1]] == data[column_names[2]]][0]
    return is_equal
    
arithmetic_equality_constraint = create_custom_constraint(is_valid_fn = is_valid_arithmetic_equality)

def is_valid_arithmetic_inequality(column_names, data):
    is_equal = [data[column_names[0]] >= data[column_names[1]] + column_names[2]][0]
    return is_equal

arithmetic_inequality_constraint = create_custom_constraint(is_valid_fn = is_valid_arithmetic_inequality)

def is_valid_inclusive(column_names, data):
    is_equal = []
    for index, row in data.iterrows():
        is_equal.append(str(int(row[column_names[0]]))[column_names[2]:column_names[2]+len(str(int(row[column_names[1]])))] == str(int(row[column_names[1]])))
    is_equal = pd.Series(i for i in is_equal)
    return is_equal

inclusive_constraint = create_custom_constraint(is_valid_fn = is_valid_inclusive)

In [8]:
class SD_generator():
    """
    A class to detect deterministic relationships between two/three columns from a given dataset.
    Create constraints with detected relationships.
    Apply models in SDV and generate synthetic data.
    
    ...

    Attributes
    ----------
    data : Pandas DataFrame
        an input dataset in Pandas DataFrame format
        
    threshold : float
        a cut-off percentage for detection functions to confirm the deterministic relationships
        
    inequality_dict : dictionary
        stores inequality deterministic relationships;
        in which the key is greater than its values
    
    inequality_runtime : float
        a variable to store the runtime of inequality detection function
    
    arithmetic_equality_dict : dictionary
        stores deterministic relationships like "A = B + C" among three colomns
        
    arithmetic_equality_runtime : float
        a variable to store the runtime of arithmetic equality detection function
    
    arithmetic_equality_flag : bool
        a bolean variable; if True, apply arithmetic equality constraints to generate synthetic data
    
    arithmetic_inequality_dict : dictionary
        stores deterministic relationships like "A >= B + X" between two colomns
    
    arithmetic_inequality_runtime : float
        a variable to store the runtime of arithmetic inequality detection function
    
    inclusive_dict : dictionary
        stores relationships that a column contains another column and the starting index 
    
    inclusive_runtime : float
        a variable to store the runtime of inclusive detection function
    
    inclusive_flag : bool
        a bolean variable; if True, apply inclusive constraints to generate synthetic data
    
    constraints : list
        a list containing constraints for synthetic data generation model training
        
    models : dictionary
        an empty dictionary to store SDV models that are ready to generate synthetic data
        
    Methods
    -------
    preprocess():
        Change the dtpyes of date columns to float and 
        drop the rows of the input dataframe which have missing values.
    
    detect_inequality():
        Detect the inequality deterministic relationship between two colomns.
    
    detect_arithmetic_equality():
        Detect the deterministic relationships like "A = B + C" among three colomns.
        
    detect_arithmetic_inequality():
        Detect the deterministic relationships like "A >= B + X" between two colomns.
    
    detect_inclusive():
        Detect the inclusive relationships that a column contains another column and the starting index.
        
    create_constraints():
        Create constraints for synthetic data generation model training.
        
    apply_model():
        Train a specific model in SDV with constraints.
        Store the trained model in dictionary "models".
       
    generate():
        Generate synthetic data of a specific number of rows with a specific pre-trainned model.
    """
    def __init__(self, data, threshold, 
                 inequality_dict={}, inequality_runtime=0,
                 arithmetic_equality_dict={}, arithmetic_equality_runtime=0, arithmetic_equality_flag=False,
                 arithmetic_inequality_dict={}, arithmetic_inequality_runtime=0,
                 inclusive_dict={}, inclusive_runtime=0, inclusive_flag=False,
                 constraints=[], models={}):
        """
        Constructs all the necessary attributes for the person object.

        Parameters
        ----------
            dataframe : Pandas DataFrame
                an input dataset in Pandas DataFrame format
                
            threshold : float
                a cut-off percentage for detection functions to confirm the deterministic relationships
                
            inequality_dict : dictionary
                an empty dictionary to store inequality deterministic relationships
            
            inequality_runtime : float
                a variable to store the runtime of inequality detection function
    
            arithmetic_equality_dict : dictionary
                an empty dictionary to store deterministic relationships among three columns like "A = B + C"
                
            arithmetic_equality_runtime : float
                a variable to store the runtime of arithmetic equality detection function
            
            arithmetic_equality_flag : bool
                a bolean variable; if True, apply arithmetic equality constraints to generate synthetic data
            
            arithmetic_inequality : dictionary
                an empty dictionary to store deterministic relationships between two columnslike "A >= B + X"
                
            arithmetic_inequality_runtime : float
                a variable to store the runtime of arithmetic inequality detection function
            
            inclusive_flag : bool
                a bolean variable; if True, apply inclusive constraints to generate synthetic data
            
            constarints : list
                an empty list to store contraints for synthetic data generation model training
            
            models : dictionary
                an empty dictionary to store SDV models that are ready to generate synthetic data
        """    
        self.data = data
        self.threshold = threshold
        self.inequality_dict = inequality_dict
        self.inequality_runtime = inequality_runtime
        self.arithmetic_equality_dict = arithmetic_equality_dict
        self.arithmetic_equality_runtime = arithmetic_equality_runtime
        self.arithmetic_equality_flag = arithmetic_equality_flag
        self.arithmetic_inequality_dict = arithmetic_inequality_dict 
        self.arithmetic_inequality_runtime = arithmetic_inequality_runtime
        self.inclusive_dict = inclusive_dict
        self.inclusive_runtime = inclusive_runtime
        self.inclusive_flag = inclusive_flag
        self.constraints = constraints
        self.models = models
        
    def preprocess(self):
        """
        Change date columns to float format;
        Handle missing values of the input dataframe;
        Drop the rows with missing values.
        
        Returns:
            None.
        
        Output:
            Running finished message with execution time.
        """
        st = time.time()
        
        data = self.data
        
        ref_dt = pd.Timestamp('1900-01-01')

        str2date = lambda x: dt.strptime(x, "%Y-%m-%d") - ref_dt if x.replace(" ", "") else np.nan
        
        for col in data.columns:
            
            try:
                re.match('^[0-9]{4}\-[0-9]{2}\-[0-9]{2}$', data[col][0])
                
                self.data[col] = self.data[col].apply(str2date)
                self.data[col] = (self.data[col] / np.timedelta64(1, 'D')).astype(float)
                
            except:
                pass
            
        data.dropna(axis=0, inplace=True)
        self.data = data

        et = time.time()
        elapsed_time = et - st
        print("Date types reformatted and missing values handled successfully!\nExecution Time:"
              , round(elapsed_time, 4), "seconds")

    def detect_inequality(self):
        """
        Detect the inequality deterministic relationship between colomns;
        Update the inequality_dictionary of the class object.
        
        Returns:
            None.
        
        Output:
            Number of relationships detected with execution time.
        """
        st = time.time()
        
        data = self.data
        
    # Looping through all combinitions of columns
        for i in range(len(data.columns)):
            for j in range(i+1, len(data.columns)):
                
                # Extract the column pairs in float or int data types
                if (data[data.columns[i]].dtypes in ['int', 'float'] and data[data.columns[j]].dtypes in ['int', 'float']):
                    diff = []
                    count = 0
                    for rownum, row in data.iterrows():
                        diff.append(row[i] - row[j])
                    for num in diff:
                        if num < 0:
                            count+=1 
                    if float(count)/len(diff) >= self.threshold :
                        if data.columns[j] in self.inequality_dict.keys():
                            self.inequality_dict[data.columns[j]].append(data.columns[i])
                        else:
                            self.inequality_dict[data.columns[j]] = []
                            self.inequality_dict[data.columns[j]].append(data.columns[i])
                    elif float(len(diff) - count)/len(diff) >= self.threshold:
                        if data.columns[i] in self.inequality_dict.keys():
                            self.inequality_dict[data.columns[i]].append(data.columns[j])
                        else:
                            self.inequality_dict[data.columns[i]] = []
                            self.inequality_dict[data.columns[i]].append(data.columns[j])
        
        # Merge duplicates records and remove redundant relationships
        for key in self.inequality_dict:
            for i in range(len(self.inequality_dict[key])):
                str_1 = self.inequality_dict[key][i]
                for j in range(i, len(self.inequality_dict[key])):
                    str_2 = self.inequality_dict[key][j]
                    if ((str_1 in self.inequality_dict) and (str_2 in self.inequality_dict[str_1])):
                        self.inequality_dict[key][j] = 'N/A' 
                    elif ((str_2 in self.inequality_dict) and (str_1 in self.inequality_dict[str_2])):
                        self.inequality_dict[key][i] = 'N/A'
        for key in self.inequality_dict:
            self.inequality_dict[key] = [i for i in self.inequality_dict[key] if i != 'N/A']
        
        et = time.time()
        self.inequality_runtime = et - st
        
        num = 0
        for key in self.inequality_dict:
            num += len(self.inequality_dict[key])
        print(num, "relationships detected")
        print("Execution Time:", round(self.inequality_runtime, 4), "seconds")

    def detect_arithmetic_equality(self):
        """
        Detect the deterministic relationships "A = B + C" among three columns;
        Based on the dictionary of inequality deterministic relationships.
        
        Returns:
            None.
        
        Output:
            Number of relationships detected with execution time.    
        """
        st = time.time()
        
        data = self.data
        
        for key in self.inequality_dict:
            # Check if the len(the list of values of the key) is >= 2 
            if len(self.inequality_dict[key]) >= 2:
                for i in range(len(self.inequality_dict[key])):
                    for j in range(i+1, len(self.inequality_dict[key])):
                        count = 0
                        for index, row in data.iterrows():
                            if (row[key] == row[self.inequality_dict[key][i]] + row[self.inequality_dict[key][j]]):
                                count += 1
                        if float(count) / len(data.index) >= self.threshold:
                            if key in self.arithmetic_equality_dict.keys():
                                self.arithmetic_equality_dict[key].append([self.inequality_dict[key][i], self.inequality_dict[key][j]])
                            else:
                                self.arithmetic_equality_dict[key] = []
                                self.arithmetic_equality_dict[key].append([self.inequality_dict[key][i], self.inequality_dict[key][j]])

        
        et = time.time()
        self.arithmetic_equality_runtime = et - st
        
        num = 0
        for key in self.arithmetic_equality_dict:
            num += len(self.arithmetic_equality_dict[key])
        print(num, "relationships detected")
        print("Execution Time:", round(self.arithmetic_equality_runtime, 4), "seconds")
    
    def detect_arithmetic_inequality(self):
        """
        Detect the deterministic relationships "A >= B + X" between two columns;
        Based on the dictionary of inequality deterministic relationships.
        
        Returns:
            None.
        
        Output:
            Number of relationships detected with execution time. 
        """
        st = time.time()
        
        data = self.data
        
        for key in self.inequality_dict:
            for value in self.inequality_dict[key]:
                diff = []
                for index, row in data.iterrows():
                    diff.append(row[key] - row[value])
                    
                if key in self.arithmetic_inequality_dict.keys():
                    self.arithmetic_inequality_dict[key].append([value, min(diff)])
                else:
                    self.arithmetic_inequality_dict[key] = []
                    self.arithmetic_inequality_dict[key].append([value, min(diff)])
            
        et = time.time()
        self.arithmetic_inequality_runtime = et - st
        
        num = 0
        for key in self.arithmetic_inequality_dict:
            num += len(self.arithmetic_inequality_dict[key])
        print(num, "relationships detected")
        print("Execution Time:", round(self.arithmetic_inequality_runtime, 4), "seconds")
    
    def detect_inclusive(self):
        """
        Detect the inclusive relationships between two columns
        
        Returns:
            None.
        
        Output:
            Number of relationships detected with execution time. 
        """
        st = time.time()
        
        data_length = len(self.data.index)
        for i in range(len(self.data.columns)):
            col_1 = self.data.columns[i]
            for j in range(i+1, len(self.data.columns)):
                col_2 = self.data.columns[j]
                count = 0
                key = ''
                value = ''
                index = 0
                    
                for ind, row in self.data.iterrows():
                    if len(str(int(row[col_1]))) > len(str(int(row[col_2]))):
                        key = col_1
                        value = col_2
                        for k in range(len(str(int(row[col_1]))) - len(str(int(row[col_2]))) + 1):
                            if str(int(row[col_1]))[k:k+len(str(int(row[col_2])))] == str(int(row[col_2])):
                                index = k
                                count += 1
                            
                    else:
                        key = col_2
                        value = col_1
                        for k in range(len(str(int(row[col_2]))) - len(str(int(row[col_1]))) + 1):
                            if str(int(row[col_2]))[k:k+len(str(int(row[col_1])))] == str(int(row[col_1])):
                                index = k
                                count += 1

                if count/float(data_length) >= 0.95:
                    if key in self.inclusive_dict.keys():
                        self.inclusive_dict[key].append([value, index])
                    else:
                        self.inclusive_dict[key] = []
                        self.inclusive_dict[key].append([value, index])
    
        et = time.time()
        self.inclusive_runtime = et - st
        
        num = 0
        for key in self.inclusive_dict:
            num += len(self.inclusive_dict[key])
        print(num, "relationships detected")
        print("Execution Time:", round(self.inclusive_runtime, 4), "seconds")
    
    def create_constraints(self, inequality=False, 
                           arithmetic_equality=False, 
                           arithmetic_inequality=False, 
                           inclusive=False):
        """
        Create constraints for synthetic data generation model training.
        
        Parameters:
            inequality: bool, default=False
                If True, create constraints for inequality deterministic relationships.
            
            arithmetic_equality: bool, default=False
                If True, create constraints for deterministic relationships like "A = B + C".
            
            arithmetic_inequality: bool, default=False
                If True, create constraints for deterministic relationships like "A >= B + X".
            
            inclusive: bool, default=False
                If True, create constraints for inclusive relationships.
        
        Returns:
            None.
        
        Output:
            Running finished message with execution time.
        """
        st = time.time()
        
        if inequality:
            for key in self.inequality_dict:
                for value in self.inequality_dict[key]:
                    self.constraints.append(Inequality(low_column_name=value, high_column_name=key))
        
        if arithmetic_equality:
            self.arithmetic_equality_flag=True
#             for key in self.arithmetic_equality_dict:
#                 for value_list in self.arithmetic_equality_dict[key]:
#                     columns = [key, value_list[0], value_list[1]]
#                     cons = arithmetic_equality_constraint(column_names=columns)
#                     self.constraints.append(cons)
                    
        if arithmetic_inequality:
            for key in self.arithmetic_inequality_dict:
                for value_list in self.arithmetic_inequality_dict[key]:
                    columns = [key, value_list[0], value_list[1]]
                    cons = arithmetic_inequality_constraint(column_names=columns)
                    self.constraints.append(cons)
        
        if inclusive:
            self.inclusive_flag=True
#             for key in self.inclusive_dict:
#                 for value_list in self.inclusive_dict[key]:
#                     columns = [key, value_list[0], value_list[1]]
#                     cons = inclusive_constraint(column_names=columns)
#                     self.constraints.append(cons)
        
        et = time.time()
        elapsed_time = et - st
        
        print("Constrainsts created successfully!\nExecution Time:"
              , round(elapsed_time, 4), "seconds")
    
    def apply_model(self, model_name=None):
        """
        Train a specific model in SDV with constraints.
        Store the trained model in dictionary "models".
        
        Parameters:
            model_name: string
                The name of the models in SDV.
                Eg. "GaussianCopula", "CTGAN", "CopulaGAN" and "TVAE".
        
        Returns:
            None.
        """
        st = time.time()
        
        if model_name == None:
            print("No input for model name!")
            return None
        
        elif model_name == 'GaussianCopula':
            model = GaussianCopula(constraints=self.constraints)
            model.fit(self.data)
            self.models['GaussianCopula'] = model
            
            et = time.time()
            elapsed_time = et - st
        
            print("Execution Time:", round(elapsed_time, 4), "seconds")
            
            
        elif model_name == 'CTGAN':
            model = CTGAN(constraints=self.constraints)
            model.fit(self.data)
            self.models['CTGAN'] = model
            
            et = time.time()
            elapsed_time = et - st
        
            print(f"Execution Time for training {model_name}:", round(elapsed_time, 4), "seconds")
            
        elif model_name == 'CopulaGAN':
            model = CopulaGAN(constraints=self.constraints)
            model.fit(self.data)
            self.models['CopulaGAN'] = model
            
            et = time.time()
            elapsed_time = et - st
        
            print(f"Execution Time for training {model_name}:", round(elapsed_time, 4), "seconds")
            
        elif model_name == 'TVAE':
            model = TVAE(constraints=self.constraints)
            model.fit(self.data)
            self.models['TVAE'] = model
            
            et = time.time()
            elapsed_time = et - st
        
            print(f"Execution Time for training {model_name}:", round(elapsed_time, 4), "seconds")
            
        else:
            print("Wrong model name!")
            
    def generate(self, model_name=None, num_rows=0):
        """
        Generate synthetic data of a specific number of rows with a specific pre-trainned model.
        
        Parameters:
            model_name: string
                The name of the models in SDV.
                Eg. "GaussianCopula", "CTGAN", "CopulaGAN" and "TVAE".
        
            num_rows: integer
                Number of rows needed to generate.
                
        Returns:
            Synthetic data in DataFrame type.
        """
        st = time.time()
        
        if model_name == None:
            print("No input for model name!")
            return None
        elif (num_rows == 0) or (type(num_rows) != int):
            print("Number of rows has to be integer and greater than 0!")
            return None
        
        elif model_name not in ["GaussianCopula", "CTGAN", "CopulaGAN", "TVAE"]:
            print("Wrong model name!\nAccepted model name: GaussianCopula, CTAGAN, CopulaGAN and TVAE.")
            return None
        
        else:
            syn_data = self.models[model_name].sample(num_rows=num_rows)
            
            if self.arithmetic_equality_flag:
                for key in self.arithmetic_equality_dict:
                    syn_data[key] = syn_data[self.arithmetic_equality_dict[key][0][0]] + syn_data[self.arithmetic_equality_dict[key][0][1]]
            
            if self.inclusive_flag:
                for key in self.inclusive_dict:
                    for value_list in self.inclusive_dict[key]:
                        for index, row in syn_data.iterrows():
                            list_temp = list(str(int(row[key])))
                            list_temp[value_list[1] : value_list[1] + len(str(row[value_list[1]]))] = str(int(row[value_list[0]]))
                            syn_data.at[index, key] = int(''.join(list_temp))
                            
            et = time.time()
            elapsed_time = et - st       
            print(f"Synthetic data generated successfully with {model_name} model!\nExecution Time:", round(elapsed_time, 4), "seconds")
            
            return syn_data
    
    
            
            

In [9]:
d = SD_generator(data, 0.95)

In [10]:
d.preprocess()

Date types reformatted and missing values handled successfully!
Execution Time: 0.0878 seconds


In [11]:
d.data.dtypes

Client_id              int64
Date_of_birth        float64
Opening_date         float64
Expiry_date          float64
Credit_limit         float64
Statement_balance    float64
Available_credit     float64
Opening                int64
Expiry                 int64
dtype: object

In [12]:
d.detect_inequality()
d.inequality_dict

8 relationships detected
Execution Time: 2.4058 seconds


{'Client_id': ['Expiry'],
 'Opening_date': ['Date_of_birth'],
 'Expiry_date': ['Opening_date'],
 'Opening': ['Expiry_date', 'Credit_limit'],
 'Expiry': ['Opening'],
 'Credit_limit': ['Statement_balance', 'Available_credit']}

In [13]:
d.detect_arithmetic_equality()
d.arithmetic_equality_dict

1 relationships detected
Execution Time: 0.1663 seconds


{'Credit_limit': [['Statement_balance', 'Available_credit']]}

In [14]:
d.detect_arithmetic_inequality()
d.arithmetic_inequality_dict

8 relationships detected
Execution Time: 0.6071 seconds


{'Client_id': [['Expiry', 2000010600000000.0]],
 'Opening_date': [['Date_of_birth', 6599.0]],
 'Expiry_date': [['Opening_date', 1826.0]],
 'Opening': [['Expiry_date', 19961750.0], ['Credit_limit', 19900301.0]],
 'Expiry': [['Opening', 49999.0]],
 'Credit_limit': [['Statement_balance', 3.9400000000023283],
  ['Available_credit', 7.260000000000218]]}

In [15]:
d.detect_inclusive()
d.inclusive_dict

2 relationships detected
Execution Time: 5.8242 seconds


{'Client_id': [['Opening', 0], ['Expiry', 8]]}

In [16]:
d.create_constraints(inequality=True, arithmetic_equality=True, arithmetic_inequality=True, inclusive=True)

Constrainsts created successfully!
Execution Time: 0.0002 seconds


In [17]:
d.constraints

[<sdv.constraints.tabular.Inequality at 0x7f8be17a8700>,
 <sdv.constraints.tabular.Inequality at 0x7f8be17a84c0>,
 <sdv.constraints.tabular.Inequality at 0x7f8be17a8490>,
 <sdv.constraints.tabular.Inequality at 0x7f8be17a8400>,
 <sdv.constraints.tabular.Inequality at 0x7f8be17dbd30>,
 <sdv.constraints.tabular.Inequality at 0x7f8be17db8b0>,
 <sdv.constraints.tabular.Inequality at 0x7f8be17db610>,
 <sdv.constraints.tabular.Inequality at 0x7f8be17db040>,
 <sdv.constraints.tabular.create_custom_constraint.<locals>.CustomConstraint at 0x7f8be17db1f0>,
 <sdv.constraints.tabular.create_custom_constraint.<locals>.CustomConstraint at 0x7f8be17db130>,
 <sdv.constraints.tabular.create_custom_constraint.<locals>.CustomConstraint at 0x7f8be17dbeb0>,
 <sdv.constraints.tabular.create_custom_constraint.<locals>.CustomConstraint at 0x7f8be17dbd00>,
 <sdv.constraints.tabular.create_custom_constraint.<locals>.CustomConstraint at 0x7f8be17db190>,
 <sdv.constraints.tabular.create_custom_constraint.<locals>

In [18]:
d.apply_model("GaussianCopula")
d.apply_model("CTGAN")
d.apply_model("CopulaGAN")
d.apply_model("TVAE")

Execution Time: 0.1481 seconds
Execution Time for training CTGAN: 47.365 seconds
Execution Time for training CopulaGAN: 45.0356 seconds
Execution Time for training TVAE: 17.1624 seconds


In [19]:
d.models

{'GaussianCopula': <sdv.tabular.copulas.GaussianCopula at 0x7f8be17db070>,
 'CTGAN': <sdv.tabular.ctgan.CTGAN at 0x7f8be11ed850>,
 'CopulaGAN': <sdv.tabular.copulagan.CopulaGAN at 0x7f8be11ed400>,
 'TVAE': <sdv.tabular.ctgan.TVAE at 0x7f8be199ebb0>}

In [20]:
syn_data1 = d.generate("GaussianCopula", num_rows=200)
syn_data1.head()

Sampling rows: 100%|████████████████████████| 200/200 [00:00<00:00, 1236.65it/s]


Synthetic data generated successfully with GaussianCopula model!
Execution Time: 0.2042 seconds


Unnamed: 0,Client_id,Date_of_birth,Opening_date,Expiry_date,Credit_limit,Statement_balance,Available_credit,Opening,Expiry
0,2011297720164994,17573.0,40671.396209,42797.0,61807.5477,32022.36,29785.1877,20112977,20164994
1,2011463020207965,22108.0,41587.922394,44565.0,80233.527041,33463.32,46770.207041,20114630,20207965
2,2011816920256387,4197.0,40399.523051,46295.0,105860.610737,51747.61,54113.000737,20118169,20256387
3,2011090020162775,31214.0,39890.661265,42620.0,86460.445832,54518.07,31942.375832,20110900,20162775
4,2011234620175642,31432.0,41521.701063,43481.0,83096.50423,47799.22,35297.28423,20112346,20175642


In [21]:
syn_data1.head()

Unnamed: 0,Client_id,Date_of_birth,Opening_date,Expiry_date,Credit_limit,Statement_balance,Available_credit,Opening,Expiry
0,2011297720164994,17573.0,40671.396209,42797.0,61807.5477,32022.36,29785.1877,20112977,20164994
1,2011463020207965,22108.0,41587.922394,44565.0,80233.527041,33463.32,46770.207041,20114630,20207965
2,2011816920256387,4197.0,40399.523051,46295.0,105860.610737,51747.61,54113.000737,20118169,20256387
3,2011090020162775,31214.0,39890.661265,42620.0,86460.445832,54518.07,31942.375832,20110900,20162775
4,2011234620175642,31432.0,41521.701063,43481.0,83096.50423,47799.22,35297.28423,20112346,20175642


In [22]:
syn_data2 = d.generate("CTGAN", num_rows=200)
syn_data2.head()

Sampling rows: 100%|█████████████████████████| 200/200 [00:00<00:00, 585.13it/s]


Synthetic data generated successfully with CTGAN model!
Execution Time: 0.3827 seconds


Unnamed: 0,Client_id,Date_of_birth,Opening_date,Expiry_date,Credit_limit,Statement_balance,Available_credit,Opening,Expiry
0,2010753720183133,11836.0,33772.066007,46213.0,14949.876889,5584.66,9365.216889,20107537,20183133
1,2004482520163685,12216.0,30996.526292,41751.0,41666.029943,39353.06,2312.969943,20044825,20163685
2,2005351920180330,9976.0,28692.618112,41073.0,56315.260398,48534.37,7780.890398,20053519,20180330
3,2012792220225103,23830.0,32897.177037,42865.0,54124.369947,50151.92,3972.449947,20127922,20225103
4,2012646620210435,11435.0,42802.909662,44944.0,15362.388955,9507.3,5855.088955,20126466,20210435


In [23]:
syn_data3 = d.generate("CopulaGAN", num_rows=200)
syn_data3.head()

Sampling rows: 100%|█████████████████████████| 200/200 [00:00<00:00, 460.89it/s]

Synthetic data generated successfully with CopulaGAN model!
Execution Time: 0.4755 seconds





Unnamed: 0,Client_id,Date_of_birth,Opening_date,Expiry_date,Credit_limit,Statement_balance,Available_credit,Opening,Expiry
0,2001638220235246,7710.0,39807.640191,46330.0,59056.661301,8023.65,51033.011301,20016382,20235246
1,2010612920215917,23400.0,36023.755422,41711.0,63641.188394,76.32,63564.868394,20106129,20215917
2,2012245820251518,4646.0,32416.805429,43510.0,12778.491314,4000.16,8778.331314,20122458,20251518
3,2001616820110633,1383.0,28684.676961,40093.0,63159.667814,35723.47,27436.197814,20016168,20110633
4,2003420320096318,6632.0,35398.564183,44271.0,31430.637958,4967.85,26462.787958,20034203,20096318


In [24]:
syn_data3 = d.generate("TVAE", num_rows=200)
syn_data3.head()

Sampling rows: 100%|████████████████████████| 200/200 [00:00<00:00, 1220.84it/s]


Synthetic data generated successfully with TVAE model!
Execution Time: 0.2087 seconds


Unnamed: 0,Client_id,Date_of_birth,Opening_date,Expiry_date,Credit_limit,Statement_balance,Available_credit,Opening,Expiry
0,2006753720130319,9452.0,35203.409211,41485.0,49675.764652,30599.21,19076.554652,20067537,20130319
1,2008203520159545,21077.0,38125.999172,41168.0,57501.951465,54423.86,3078.091465,20082035,20159545
2,2017191520247634,27170.0,39847.047974,46040.0,114624.664464,80600.92,34023.744464,20171915,20247634
3,2011528020192931,21714.0,39177.188173,42279.0,103914.353791,22351.87,81562.483791,20115280,20192931
4,2009964220161890,12152.0,35213.957685,42826.0,23427.065637,7572.0,15855.065637,20099642,20161890
