# Import Packages

In [1]:
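# %pip install "sdv>=0.17,<1.0"  # this notebook uses the pre-1.0 API (sdv.tabular, sdv.constraints.create_custom_constraint) - confirm the exact version used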

In [2]:
import numpy as np
import pandas as pd
from datetime import datetime as dt
import re
import time
from sdv.tabular import GaussianCopula, CTGAN, CopulaGAN, TVAE
from sdv.constraints import Inequality, create_custom_constraint
from copy import deepcopy
import warnings
from itertools import combinations

In [3]:
warnings.filterwarnings("ignore")

# Prepare Data

In [4]:
# Read the raw table and leave out the 'Name' column, which is not used by the generator.
data = pd.read_csv('data.csv').drop(columns=['Name'])
data.head()

Unnamed: 0,Date_of_birth,Opening_date,Expiry_date,Credit_limit,Statement_balance,Available_credit
0,1949-08-17,2008-09-02,2013-09-02,38000.0,29297.69,8702.31
1,1938-07-04,2019-09-08,2024-09-08,19500.0,15628.5,3871.5
2,1927-03-28,2019-04-23,2024-04-23,11500.0,7388.34,4111.66
3,1936-01-17,2021-10-27,2026-10-27,68500.0,32060.39,36439.61
4,1960-09-05,2010-10-02,2015-10-02,72000.0,56148.13,15851.87


In [5]:
# Strip the dashes from the ISO dates to get 8-digit strings (YYYYMMDD).
data['Opening'] = data['Opening_date'].str.replace('-', '', regex=False)
data['Expiry'] = data['Expiry_date'].str.replace('-', '', regex=False)

# Client_id is the opening digits followed by the expiry digits, stored as an integer.
data['Client_id'] = (data['Opening'] + data['Expiry']).astype('int64')
data['Opening'] = data['Opening'].astype('int64')
data['Expiry'] = data['Expiry'].astype('int64')

# Put the identifier first and the helper integer columns last.
column_order = ['Client_id', 'Date_of_birth', 'Opening_date', 'Expiry_date', 'Credit_limit', 'Statement_balance', 'Available_credit', 'Opening', 'Expiry']
data = data[column_order]

data.head()

Unnamed: 0,Client_id,Date_of_birth,Opening_date,Expiry_date,Credit_limit,Statement_balance,Available_credit,Opening,Expiry
0,2008090220130902,1949-08-17,2008-09-02,2013-09-02,38000.0,29297.69,8702.31,20080902,20130902
1,2019090820240908,1938-07-04,2019-09-08,2024-09-08,19500.0,15628.5,3871.5,20190908,20240908
2,2019042320240423,1927-03-28,2019-04-23,2024-04-23,11500.0,7388.34,4111.66,20190423,20240423
3,2021102720261027,1936-01-17,2021-10-27,2026-10-27,68500.0,32060.39,36439.61,20211027,20261027
4,2010100220151002,1960-09-05,2010-10-02,2015-10-02,72000.0,56148.13,15851.87,20101002,20151002


In [6]:
data.dtypes

Client_id              int64
Date_of_birth         object
Opening_date          object
Expiry_date           object
Credit_limit         float64
Statement_balance    float64
Available_credit     float64
Opening                int64
Expiry                 int64
dtype: object

# Create Custom Constraints

In [7]:
def is_valid_arithmetic_equality(column_names, data):
    """
    Row-wise validity check for the relationship "A - B = C".

    Parameters:
        column_names: list
            Three column names [A, B, C].
        data: Pandas DataFrame
            The table to validate.

    Returns:
        Boolean Pandas Series indexed like ``data``; True where A - B equals C.
        Integer columns are compared exactly; otherwise a small tolerance is
        used so that binary floating point rounding (e.g. 0.3 - 0.1 != 0.2)
        does not reject rows that are equal on paper.
    """
    difference = data[column_names[0]] - data[column_names[1]]
    expected = data[column_names[2]]

    if pd.api.types.is_integer_dtype(difference) and pd.api.types.is_integer_dtype(expected):
        is_equal = (difference == expected).to_numpy()
    else:
        # NaN never compares close, matching the behaviour of a plain "==".
        is_equal = np.isclose(difference, expected, rtol=1e-12, atol=1e-9)

    return pd.Series(np.asarray(is_equal, dtype=bool), index=data.index)
    
arithmetic_equality_constraint = create_custom_constraint(is_valid_fn = is_valid_arithmetic_equality)

def is_valid_arithmetic_inequality(column_names, data):
    """
    Row-wise validity check for the relationship "A >= B + X".

    Parameters:
        column_names: list
            [A, B, X], where A and B are column names and X is the constant
            offset added to column B.
        data: Pandas DataFrame
            The table to validate.

    Returns:
        Boolean Pandas Series; True where A is at least B + X.
    """
    high_column = column_names[0]
    low_column = column_names[1]
    offset = column_names[2]
    lower_bound = data[low_column] + offset
    return data[high_column] >= lower_bound

arithmetic_inequality_constraint = create_custom_constraint(is_valid_fn = is_valid_arithmetic_inequality)

def is_valid_inclusive(column_names, data):
    """
    Row-wise validity check that the digits of column B appear inside the
    digits of column A, starting at a fixed position.

    Parameters:
        column_names: list
            [A, B, start], where A is the containing column, B the contained
            column and start the 0-based position in A's digit string where
            B's digits must begin.
        data: Pandas DataFrame
            The table to validate.

    Returns:
        Boolean Pandas Series indexed like ``data``.
    """
    container_column, contained_column, start = column_names[0], column_names[1], column_names[2]

    # Read the two columns directly instead of iterating over rows: a row of a
    # mixed int/float frame is upcast to float, which loses digits for
    # integers above 2**53.
    flags = []
    for whole, part in zip(data[container_column], data[contained_column]):
        part_digits = str(int(part))
        flags.append(str(int(whole))[start:start + len(part_digits)] == part_digits)

    # Keep the caller's index so the mask aligns with ``data`` when it is used
    # for boolean selection.
    return pd.Series(flags, index=data.index, dtype=bool)

inclusive_constraint = create_custom_constraint(is_valid_fn = is_valid_inclusive)

# Synthetic Data Generator

In [8]:
class SD_generator():
    """
    A class to detect deterministic relationships between two/three columns from a given dataset.
    Create constraints with detected relationships.
    Apply models in SDV and generate synthetic data.
    
    ...

    Attributes
    ----------
    data : Pandas DataFrame
        an input dataset in Pandas DataFrame format
        
    inequality_threshold : float
        a cut-off percentage for detection functions to confirm the inequality deterministic relationships
        
    arithmetic_equality_threshold : float
        a cut-off percentage for detection functions to confirm the arithmetic equality deterministic relationships
    
    inclusive_threshold : float
        a cut-off percentage for detection functions to confirm the inclusive deterministic relationships
        
    inequality_dict : dictionary
        stores inequality deterministic relationships;
        in which the key is greater than its values
    
    inequality_runtime : float
        a variable to store the runtime of inequality detection function
    
    arithmetic_equality_dict : dictionary
        stores deterministic relationships like "A = B + C" among three columns
        
    arithmetic_equality_runtime : float
        a variable to store the runtime of arithmetic equality detection function
    
    arithmetic_equality_flag : bool
        a boolean variable; if True, apply arithmetic equality constraints to generate synthetic data
    
    arithmetic_inequality_dict : dictionary
        stores deterministic relationships like "A >= B + X" between two columns
    
    arithmetic_inequality_runtime : float
        a variable to store the runtime of arithmetic inequality detection function
    
    inclusive_dict : dictionary
        stores relationships that a column contains another column and the starting index 
    
    inclusive_runtime : float
        a variable to store the runtime of inclusive detection function
    
    inclusive_flag : bool
        a boolean variable; if True, apply inclusive constraints to generate synthetic data
    
    constraints : list
        a list containing constraints for synthetic data generation model training
        
    models : dictionary
        a dictionary to store SDV models that are ready to generate synthetic data
        
    temp_dict : dictionary
        a dictionary to store temporary relationships
        
    Methods
    -------
    preprocess():
        Change the dtypes of date columns to float and 
        drop the rows of the input dataframe which have missing values.
    
    detect_inequality():
        Detect the inequality deterministic relationship between two columns.
    
    detect_arithmetic_equality():
        Detect the deterministic relationships like "A = B + C" among three columns.
        
    detect_arithmetic_inequality():
        Detect the deterministic relationships like "A >= B + X" between two columns.
    
    detect_inclusive():
        Detect the inclusive relationships that a column contains another column and the starting index.
        
    create_constraints():
        Create constraints for synthetic data generation model training.
        
    apply_model():
        Train a specific model in SDV with constraints.
        Store the trained model in dictionary "models".
       
    generate():
        Generate synthetic data of a specific number of rows with a specific pre-trained model.
    """
    def __init__(self, data, inequality_threshold, arithmetic_equality_threshold, inclusive_threshold, 
                 inequality_dict=None, inequality_runtime=0,
                 arithmetic_equality_dict=None, arithmetic_equality_runtime=0, arithmetic_equality_flag=False,
                 arithmetic_inequality_dict=None, arithmetic_inequality_runtime=0,
                 inclusive_dict=None, inclusive_runtime=0, inclusive_flag=False,
                 constraints=None, models=None, temp_dict=None):
        """
        Constructs all the necessary attributes for the person object.

        Parameters
        ----------
            dataframe : Pandas DataFrame
                an input dataset in Pandas DataFrame format
                
            threshold : float
                a cut-off percentage for detection functions to confirm the deterministic relationships
                
            inequality_dict : dictionary
                an empty dictionary to store inequality deterministic relationships
            
            inequality_runtime : float
                a variable to store the runtime of inequality detection function
    
            arithmetic_equality_dict : dictionary
                an empty dictionary to store deterministic relationships among three columns like "A = B + C"
                
            arithmetic_equality_runtime : float
                a variable to store the runtime of arithmetic equality detection function
            
            arithmetic_equality_flag : bool
                a bolean variable; if True, apply arithmetic equality constraints to generate synthetic data
            
            arithmetic_inequality : dictionary
                an empty dictionary to store deterministic relationships between two columnslike "A >= B + X"
                
            arithmetic_inequality_runtime : float
                a variable to store the runtime of arithmetic inequality detection function
            
            inclusive_flag : bool
                a bolean variable; if True, apply inclusive constraints to generate synthetic data
            
            constarints : list
                an empty list to store contraints for synthetic data generation model training
            
            models : dictionary
                an empty dictionary to store SDV models that are ready to generate synthetic data
            
            temp_dict : dictionary
                an empty dictionary to store temporary relationships
        """    
        self.data = data
        self.inequality_threshold = inequality_threshold
        self.arithmetic_equality_threshold = arithmetic_equality_threshold
        self.inclusive_threshold = inclusive_threshold
        self.inequality_dict = inequality_dict if inequality_dict is not None else {}
        self.inequality_runtime = inequality_runtime
        self.arithmetic_equality_dict = arithmetic_equality_dict if arithmetic_equality_dict is not None else {}
        self.arithmetic_equality_runtime = arithmetic_equality_runtime
        self.arithmetic_equality_flag = arithmetic_equality_flag
        self.arithmetic_inequality_dict = arithmetic_inequality_dict if arithmetic_inequality_dict is not None else {}
        self.arithmetic_inequality_runtime = arithmetic_inequality_runtime
        self.inclusive_dict = inclusive_dict if inclusive_dict is not None else {}
        self.inclusive_runtime = inclusive_runtime
        self.inclusive_flag = inclusive_flag
        self.constraints = constraints if constraints is not None else []
        self.models = models if models is not None else {}
        self.temp_dict = temp_dict if temp_dict is not None else {}
        
    def preprocess(self):
        """
        Change date columns to float format;
        Handle missing values of the input dataframe;
        Drop the rows with missing values.
        
        Returns:
            None.
        
        Output:
            Running finished message with execution time.
        """
        st = time.time()
        
        ref_dt = pd.Timestamp('1900-01-01')

        str2date = lambda x: dt.strptime(x, "%Y-%m-%d") - ref_dt if x.replace(" ", "") else np.nan
        
        for col in self.data.columns:
            
            try:
                re.match('^[0-9]{4}\-[0-9]{2}\-[0-9]{2}$', self.data[col][0])
                
                self.data[col] = self.data[col].apply(str2date)
                self.data[col] = (self.data[col] / np.timedelta64(1, 'D')).astype(float)
                
            except:
                pass
            
        self.data.dropna(axis=0, inplace=True)

        et = time.time()
        elapsed_time = et - st
        print("Date types reformatted and missing values handled successfully!\nExecution Time:"
              , round(elapsed_time, 4), "seconds")

    def detect_inequality(self):
        """
        Detect the inequality deterministic relationship between colomns;
        Update the inequality_dictionary of the class object.
        
        Returns:
            None.
        
        Output:
            Number of relationships detected with execution time.
        """
        # Check if the inequality_dict is empty
        # If not, skip the detetcion to avoid adding duplicate records
        if bool(self.inequality_dict):
            return
        
        st = time.time()
        
        # Looping through all pair combinitions of columns
        # For each pair, check the inequality row by row
        # Compute the inequality ratio and compared it to the predefined threshold
        # If the ratio is greater than the threshold, the inequality is confirmed
        column_pairs = list(combinations(data.columns, 2))
        for column_pair in column_pairs:
            if self.data[column_pair[0]].dtypes in ['int', 'float'] and self.data[column_pair[1]].dtypes in ['int', 'float']:
                temp = self.data[[column_pair[0], column_pair[1]]].apply(lambda x: x[column_pair[0]] > x[column_pair[1]], axis=1)
                count_true = temp[temp == True].count()
                ratio = float(count_true) / len(temp)
                
                if ratio >= self.inequality_threshold:
                    if column_pair[0] in self.inequality_dict.keys():
                        self.inequality_dict[column_pair[0]].append(column_pair[1])
                    else:
                        self.inequality_dict[column_pair[0]] = []
                        self.inequality_dict[column_pair[0]].append(column_pair[1])
                
                elif (1 - ratio) >= self.inequality_threshold:
                    if column_pair[1] in self.inequality_dict.keys():
                        self.inequality_dict[column_pair[1]].append(column_pair[0])
                    else:
                        self.inequality_dict[column_pair[1]] = []
                        self.inequality_dict[column_pair[1]].append(column_pair[0])
        
        # Store the full relationships in a temp dictionary for later use
        self.temp_dict = deepcopy(self.inequality_dict)
        
        # Merge duplicates records and remove redundant relationships
        # Loop through the keys and their values in the inequality dictionary
        # Check if a value A is a value of value B, and value B is a key of the inequality dictionary
        # If True, change the value A to be 'N/A', and removed all 'N/A' after the looping is finished
        for key in self.inequality_dict:
            for i in range(len(self.inequality_dict[key])):
                str_1 = self.inequality_dict[key][i]
                for j in range(i, len(self.inequality_dict[key])):
                    str_2 = self.inequality_dict[key][j]
                    if ((str_1 in self.inequality_dict) and (str_2 in self.inequality_dict[str_1])):
                        self.inequality_dict[key][j] = 'N/A' 
                    elif ((str_2 in self.inequality_dict) and (str_1 in self.inequality_dict[str_2])):
                        self.inequality_dict[key][i] = 'N/A'
        for key in self.inequality_dict:
            self.inequality_dict[key] = [i for i in self.inequality_dict[key] if i != 'N/A']
        
        et = time.time()
        self.inequality_runtime = et - st
        
        num = 0
        for key in self.inequality_dict:
            num += len(self.inequality_dict[key])
        print(num, "relationships detected")
        print("Execution Time:", round(self.inequality_runtime, 4), "seconds")

    def detect_arithmetic_equality(self):
        """
        Detect the deterministic relationships "A = B + C" among three columns;
        Based on the dictionary of inequality deterministic relationships.
        
        Returns:
            None.
        
        Output:
            Number of relationships detected with execution time.    
        """
        # Check if the arithmetic_equality_dict is empty
        # If not, skip the detetcion to avoid adding duplicate records
        if bool(self.arithmetic_equality_dict):
            return
        
        st = time.time()
        
        # Loop through the keys of the full inequality relationships dictionary
        # Loop through the value pairs within the same key
        # Check if the percentage of row that key = sum(value_pairs) is >= the predefined threshold
        # If True, append the arithmetic equality relationship
        for key in self.temp_dict:
            column_pairs = list(combinations(self.temp_dict[key], 2))
            for column_pair in column_pairs:
                temp = self.data[[key, column_pair[0], column_pair[1]]].apply(lambda x: x[key] == x[column_pair[0]] + x[column_pair[1]], axis=1)
                count_true = temp[temp == True].count()
                if float(count_true) / len(temp) >= self.arithmetic_equality_threshold:
                    if key in self.arithmetic_equality_dict.keys():
                        self.arithmetic_equality_dict[key].append([column_pair[0], column_pair[1]])
                    else:
                        self.arithmetic_equality_dict[key] = []
                        self.arithmetic_equality_dict[key].append([column_pair[0], column_pair[1]])
        
        et = time.time()
        self.arithmetic_equality_runtime = et - st
        
        num = 0
        for key in self.arithmetic_equality_dict:
            num += len(self.arithmetic_equality_dict[key])
        print(num, "relationships detected")
        print("Execution Time:", round(self.arithmetic_equality_runtime, 4), "seconds")
    
    def detect_arithmetic_inequality(self):
        """
        Detect the deterministic relationships "A >= B + X" between two columns;
        Based on the dictionary of inequality deterministic relationships.
        
        Returns:
            None.
        
        Output:
            Number of relationships detected with execution time. 
        """
        # Check if the arithmetic_inequality_dict is empty
        # If not, skip the detetcion to avoid adding duplicate records
        if bool(self.arithmetic_inequality_dict):
            return
        
        st = time.time()
        
        # Loop through all keys in the inequality dictionary
        # Loop through all values within the same key
        # Find the minimum difference between the key and its values
        # Append the arithmetic inequality relationship
        for key in self.inequality_dict:
            for value in self.inequality_dict[key]:
                diff = []
                for index, row in self.data.iterrows():
                    diff.append(row[key] - row[value])
                
                positive_diff = [d for d in diff if d >= 0]
                
                if key in self.arithmetic_inequality_dict.keys():
                    self.arithmetic_inequality_dict[key].append([value, min(positive_diff)])
                else:
                    self.arithmetic_inequality_dict[key] = []
                    self.arithmetic_inequality_dict[key].append([value, min(positive_diff)])
            
        et = time.time()
        self.arithmetic_inequality_runtime = et - st
        
        num = 0
        for key in self.arithmetic_inequality_dict:
            num += len(self.arithmetic_inequality_dict[key])
        print(num, "relationships detected")
        print("Execution Time:", round(self.arithmetic_inequality_runtime, 4), "seconds")
    
    def detect_inclusive(self):
        """
        Detect the inclusive relationships between two columns
        
        Returns:
            None.
        
        Output:
            Number of relationships detected with execution time. 
        """
        # Check if the inclusive_dict is empty
        # If not, skip the detetcion to avoid adding duplicate records
        if bool(self.inclusive_dict):
            return
        
        st = time.time()
        
        data_length = len(self.data.index)
        
        column_pairs = list(combinations(self.data.columns, 2))
        
        # Loop through all column pairs in the dataset
        # Check if the percentage of the data within column A that is part of column B is >= the predefined threshold
        # If True, append the inclusive relationship
        for column_pair in column_pairs:
            ratio = 0
            
            if len(str(int(self.data.iloc[0][column_pair[1]]))) > len(str(int(self.data.iloc[0][column_pair[0]]))):
                temp = self.data[[column_pair[0], column_pair[1]]].apply(lambda x: str(x[column_pair[0]]) in str(x[column_pair[1]]), axis=1)
                count = temp[temp == True].count()
                ratio = float(count) / len(temp)
                key = column_pair[1]
                value = column_pair[0]
                index = str(self.data.loc[0][column_pair[1]]).find(str(self.data.loc[0][column_pair[0]]))
            
            elif len(str(int(self.data.iloc[0][column_pair[0]]))) > len(str(int(self.data.iloc[0][column_pair[1]]))):
                temp = self.data[[column_pair[0], column_pair[1]]].apply(lambda x: str(x[column_pair[1]]) in str(x[column_pair[0]]), axis=1)
                count = temp[temp == True].count()
                ratio = float(count) / len(temp)
                key = column_pair[0]
                value = column_pair[1]
                index = str(int(self.data.loc[0][column_pair[0]])).find(str(int(self.data.loc[0][column_pair[1]])))
                
            if ratio >= self.inclusive_threshold:
                
                if key in self.inclusive_dict.keys():
                        self.inclusive_dict[key].append([value, index])
                else:
                    self.inclusive_dict[key] = []
                    self.inclusive_dict[key].append([value, index])
            
        et = time.time()
        self.inclusive_runtime = et - st
        
        num = 0
        for key in self.inclusive_dict:
            num += len(self.inclusive_dict[key])
        print(num, "relationships detected")
        print("Execution Time:", round(self.inclusive_runtime, 4), "seconds")
    
    def create_constraints(self, inequality=False, 
                           arithmetic_equality=False, 
                           arithmetic_inequality=False, 
                           inclusive=False):
        """
        Create constraints for synthetic data generation model training.
        
        Parameters:
            inequality: bool, default=False
                If True, create constraints for inequality deterministic relationships.
            
            arithmetic_equality: bool, default=False
                If True, create constraints for deterministic relationships like "A = B + C".
            
            arithmetic_inequality: bool, default=False
                If True, create constraints for deterministic relationships like "A >= B + X".
            
            inclusive: bool, default=False
                If True, create constraints for inclusive relationships.
        
        Returns:
            None.
        
        Output:
            Running finished message with execution time.
        """
        st = time.time()
        
        if inequality:
            for key in self.inequality_dict:
                for value in self.inequality_dict[key]:
                    self.constraints.append(Inequality(low_column_name=value, high_column_name=key))
        
        if arithmetic_equality:
            self.arithmetic_equality_flag=True
                    
        if arithmetic_inequality:
            for key in self.arithmetic_inequality_dict:
                for value_list in self.arithmetic_inequality_dict[key]:
                    columns = [key, value_list[0], value_list[1]]
                    cons = arithmetic_inequality_constraint(column_names=columns)
                    self.constraints.append(cons)
        
        if inclusive:
            self.inclusive_flag=True
        
        et = time.time()
        elapsed_time = et - st
        
        print("Constrainsts created successfully!\nExecution Time:"
              , round(elapsed_time, 4), "seconds")
    
    def apply_model(self, model_name=None):
        """
        Train a specific model in SDV with constraints.
        Store the trained model in dictionary "models".
        
        Parameters:
            model_name: string
                The name of the models in SDV.
                Eg. "GaussianCopula", "CTGAN", "CopulaGAN" and "TVAE".
        
        Returns:
            None.
        """
        st = time.time()
        
        # Check if the input of model name is missing
        if model_name == None:
            print("No input for model name!")
            return
        
        elif model_name == 'GaussianCopula':
            model = GaussianCopula(constraints=self.constraints)
            model.fit(self.data)
            self.models['GaussianCopula'] = model
            
            et = time.time()
            elapsed_time = et - st
        
            print(f"Execution Time for training {model_name}:", round(elapsed_time, 4), "seconds")
            
            
        elif model_name == 'CTGAN':
            model = CTGAN(constraints=self.constraints)
            model.fit(self.data)
            self.models['CTGAN'] = model
            
            et = time.time()
            elapsed_time = et - st
        
            print(f"Execution Time for training {model_name}:", round(elapsed_time, 4), "seconds")
            
        elif model_name == 'CopulaGAN':
            model = CopulaGAN(constraints=self.constraints)
            model.fit(self.data)
            self.models['CopulaGAN'] = model
            
            et = time.time()
            elapsed_time = et - st
        
            print(f"Execution Time for training {model_name}:", round(elapsed_time, 4), "seconds")
            
        elif model_name == 'TVAE':
            model = TVAE(constraints=self.constraints)
            model.fit(self.data)
            self.models['TVAE'] = model
            
            et = time.time()
            elapsed_time = et - st
        
            print(f"Execution Time for training {model_name}:", round(elapsed_time, 4), "seconds")
            
        # Handle input of wrong model name
        else:
            print("Wrong model name!")
            
    def generate(self, model_name=None, num_rows=0):
        """
        Generate synthetic data of a specific number of rows with a specific pre-trainned model.
        
        Parameters:
            model_name: string
                The name of the models in SDV.
                Eg. "GaussianCopula", "CTGAN", "CopulaGAN" and "TVAE".
        
            num_rows: integer
                Number of rows needed to generate.
                
        Returns:
            Synthetic data in DataFrame type.
        """
        st = time.time()
        
        # Handle wrong/missing input of model name or number of rows for generation
        if model_name == None:
            print("No input for model name!")
            return None
        elif (num_rows == 0) or (type(num_rows) != int):
            print("Number of rows has to be integer and greater than 0!")
            return None
        
        elif model_name not in ["GaussianCopula", "CTGAN", "CopulaGAN", "TVAE"]:
            print("Wrong model name!\nAccepted model name: GaussianCopula, CTAGAN, CopulaGAN and TVAE.")
            return None
        
        else:
            syn_data = self.models[model_name].sample(num_rows=num_rows)
            
            if self.arithmetic_equality_flag:
                for key in self.arithmetic_equality_dict:
                    syn_data[key] = syn_data[self.arithmetic_equality_dict[key][0][0]] + syn_data[self.arithmetic_equality_dict[key][0][1]]
            
            if self.inclusive_flag:
                for key in self.inclusive_dict:
                    for value_list in self.inclusive_dict[key]:
                        for index, row in syn_data.iterrows():
                            list_temp = list(str(int(row[key])))
                            list_temp[value_list[1] : value_list[1] + len(str(row[value_list[1]]))] = str(int(row[value_list[0]]))
                            syn_data.at[index, key] = int(''.join(list_temp))
                            
            et = time.time()
            elapsed_time = et - st       
            print(f"Synthetic data generated successfully with {model_name} model!\nExecution Time:", round(elapsed_time, 4), "seconds")
            
            return syn_data
    
    
            
            

# Test with All Constraints & Models

In [9]:
d = SD_generator(data, inequality_threshold=0.95, arithmetic_equality_threshold=0.95, inclusive_threshold=1.0)

In [10]:
d.preprocess()

Date types reformatted and missing values handled successfully!
Execution Time: 0.0931 seconds


In [11]:
d.data.dtypes

Client_id              int64
Date_of_birth        float64
Opening_date         float64
Expiry_date          float64
Credit_limit         float64
Statement_balance    float64
Available_credit     float64
Opening                int64
Expiry                 int64
dtype: object

In [12]:
d.detect_inequality()
d.inequality_dict

8 relationships detected
Execution Time: 0.6195 seconds


{'Client_id': ['Expiry'],
 'Opening_date': ['Date_of_birth'],
 'Expiry_date': ['Opening_date'],
 'Opening': ['Expiry_date', 'Credit_limit'],
 'Expiry': ['Opening'],
 'Credit_limit': ['Statement_balance', 'Available_credit']}

In [13]:
d.detect_arithmetic_equality()
d.arithmetic_equality_dict

1 relationships detected
Execution Time: 1.4592 seconds


{'Credit_limit': [['Statement_balance', 'Available_credit']]}

In [14]:
d.detect_arithmetic_inequality()
d.arithmetic_inequality_dict

8 relationships detected
Execution Time: 0.6548 seconds


{'Client_id': [['Expiry', 2000010600000000.0]],
 'Opening_date': [['Date_of_birth', 6599.0]],
 'Expiry_date': [['Opening_date', 1826.0]],
 'Opening': [['Expiry_date', 19961750.0], ['Credit_limit', 19900301.0]],
 'Expiry': [['Opening', 49999.0]],
 'Credit_limit': [['Statement_balance', 3.9400000000023283],
  ['Available_credit', 7.260000000000218]]}

In [15]:
d.detect_inclusive()
d.inclusive_dict

2 relationships detected
Execution Time: 0.4999 seconds


{'Client_id': [['Opening', 0], ['Expiry', 8]]}

In [16]:
d.create_constraints(inequality=True, arithmetic_equality=True, arithmetic_inequality=True, inclusive=True)

Constrainsts created successfully!
Execution Time: 0.0002 seconds


In [17]:
d.constraints

[<sdv.constraints.tabular.Inequality at 0x7fbec432db50>,
 <sdv.constraints.tabular.Inequality at 0x7fbec432ddf0>,
 <sdv.constraints.tabular.Inequality at 0x7fbec432da30>,
 <sdv.constraints.tabular.Inequality at 0x7fbec432db20>,
 <sdv.constraints.tabular.Inequality at 0x7fbec432d9a0>,
 <sdv.constraints.tabular.Inequality at 0x7fbec432d6a0>,
 <sdv.constraints.tabular.Inequality at 0x7fbec432d790>,
 <sdv.constraints.tabular.Inequality at 0x7fbec432df40>,
 <sdv.constraints.tabular.create_custom_constraint.<locals>.CustomConstraint at 0x7fbec432d670>,
 <sdv.constraints.tabular.create_custom_constraint.<locals>.CustomConstraint at 0x7fbec432d070>,
 <sdv.constraints.tabular.create_custom_constraint.<locals>.CustomConstraint at 0x7fbec432d490>,
 <sdv.constraints.tabular.create_custom_constraint.<locals>.CustomConstraint at 0x7fbec432d970>,
 <sdv.constraints.tabular.create_custom_constraint.<locals>.CustomConstraint at 0x7fbec432d700>,
 <sdv.constraints.tabular.create_custom_constraint.<locals>

In [18]:
d.apply_model("GaussianCopula")
d.apply_model("CTGAN")
d.apply_model("CopulaGAN")
d.apply_model("TVAE")

Execution Time: 0.1748 seconds
Execution Time for training CTGAN: 51.8197 seconds
Execution Time for training CopulaGAN: 45.5671 seconds
Execution Time for training TVAE: 17.2102 seconds


In [19]:
d.models

{'GaussianCopula': <sdv.tabular.copulas.GaussianCopula at 0x7fbec429da60>,
 'CTGAN': <sdv.tabular.ctgan.CTGAN at 0x7fbebc28bf10>,
 'CopulaGAN': <sdv.tabular.copulagan.CopulaGAN at 0x7fbec42b5250>,
 'TVAE': <sdv.tabular.ctgan.TVAE at 0x7fbec3d13190>}

In [20]:
syn_data1 = d.generate("GaussianCopula", num_rows=200)
syn_data1.head()

Sampling rows: 100%|████████████████████████| 200/200 [00:00<00:00, 1182.80it/s]


Synthetic data generated successfully with GaussianCopula model!
Execution Time: 0.2097 seconds


Unnamed: 0,Client_id,Date_of_birth,Opening_date,Expiry_date,Credit_limit,Statement_balance,Available_credit,Opening,Expiry
0,2011808420233673,10051.0,42938.528203,45628.0,57836.832516,47667.33,10169.502516,20118084,20233673
1,2011406820230823,8499.0,40058.635378,44807.0,80530.66386,42231.94,38298.72386,20114068,20230823
2,2011418920177330,25129.0,40712.783667,43579.0,67221.367732,51575.38,15645.987732,20114189,20177330
3,2011413520254920,5722.0,40848.907373,46042.0,52747.684332,22652.16,30095.524332,20114135,20254920
4,2011448120224471,26595.0,41009.975964,44890.0,58995.18268,42488.79,16506.39268,20114481,20224471


In [21]:
syn_data2 = d.generate("CTGAN", num_rows=200)
syn_data2.head()

Sampling rows: 100%|█████████████████████████| 200/200 [00:00<00:00, 771.40it/s]

Synthetic data generated successfully with CTGAN model!
Execution Time: 0.2987 seconds





Unnamed: 0,Client_id,Date_of_birth,Opening_date,Expiry_date,Credit_limit,Statement_balance,Available_credit,Opening,Expiry
0,2003035420092329,25330.0,36223.692976,42988.0,98352.355864,54331.76,44020.595864,20030354,20092329
1,2010833920177485,10484.0,20137.534609,43461.0,22092.030213,18658.94,3433.090213,20108339,20177485
2,2004741320167990,27962.0,37650.401721,44960.0,65172.923955,60897.94,4274.983955,20047413,20167990
3,2010515620230825,16060.0,29352.861037,45675.0,15213.339283,1438.72,13774.619283,20105156,20230825
4,2009700820150137,28180.0,38702.848738,42818.0,82918.282952,48229.34,34688.942952,20097008,20150137


In [22]:
syn_data3 = d.generate("CopulaGAN", num_rows=200)
syn_data3.head()

Sampling rows: 100%|█████████████████████████| 200/200 [00:00<00:00, 708.34it/s]

Synthetic data generated successfully with CopulaGAN model!
Execution Time: 0.3196 seconds





Unnamed: 0,Client_id,Date_of_birth,Opening_date,Expiry_date,Credit_limit,Statement_balance,Available_credit,Opening,Expiry
0,2001909520081264,2896.0,15223.61514,39214.0,15079.793016,11046.81,4032.983016,20019095,20081264
1,2012885520270874,22931.0,32252.740106,43613.0,9675.396215,7013.05,2662.346215,20128855,20270874
2,2008732520256065,10761.0,22903.861265,38519.0,93291.950286,91146.28,2145.670286,20087325,20256065
3,2003169420082870,1573.0,14068.19453,41507.0,66298.980838,28110.84,38188.140838,20031694,20082870
4,2016419320271221,15747.0,25370.8325,46741.0,58010.653058,39557.52,18453.133058,20164193,20271221


In [23]:
# Use a separate name so the CopulaGAN sample in syn_data3 is not overwritten.
syn_data4 = d.generate("TVAE", num_rows=200)
syn_data4.head()

Sampling rows: 100%|████████████████████████| 200/200 [00:00<00:00, 1313.64it/s]

Synthetic data generated successfully with TVAE model!
Execution Time: 0.1939 seconds





Unnamed: 0,Client_id,Date_of_birth,Opening_date,Expiry_date,Credit_limit,Statement_balance,Available_credit,Opening,Expiry
0,2006949120131148,28589.0,36870.156026,40657.0,54192.506067,51365.76,2826.746067,20069491,20131148
1,2003124020110067,2990.0,32364.042298,39463.0,80291.498043,40909.71,39381.788043,20031240,20110067
2,2001034220066456,9006.0,36139.289875,38882.0,17573.098184,2651.48,14921.618184,20010342,20066456
3,2004006620092761,17595.0,33987.16015,40075.0,27798.156095,13670.3,14127.856095,20040066,20092761
4,2008273820185450,15426.0,35923.449562,43843.0,81637.945798,79672.23,1965.715798,20082738,20185450
