In [None]:
pip install sdv

In [1]:
import numpy as np
import pandas as pd
from datetime import datetime as dt
import re
import time
from sdv.tabular import GaussianCopula, CTGAN, CopulaGAN, TVAE
from sdv.constraints import Range, ScalarRange, Inequality, ScalarInequality, create_custom_constraint
from copy import deepcopy
import warnings

In [2]:
warnings.filterwarnings("ignore")

In [3]:
data = pd.read_csv('/../data/data.csv')

FileNotFoundError: [Errno 2] No such file or directory: '/../data/data.csv'

In [3]:
data.drop(columns=['Name'], axis=1, inplace=True)

In [4]:
data.head()

Unnamed: 0,Date_of_birth,Opening_date,Expiry_date,Credit_limit,Statement_balance,Available_credit
0,1949-08-17,2008-09-02,2013-09-02,38000.0,29297.69,8702.31
1,1938-07-04,2019-09-08,2024-09-08,19500.0,15628.5,3871.5
2,1927-03-28,2019-04-23,2024-04-23,11500.0,7388.34,4111.66
3,1936-01-17,2021-10-27,2026-10-27,68500.0,32060.39,36439.61
4,1960-09-05,2010-10-02,2015-10-02,72000.0,56148.13,15851.87


In [5]:
data.dtypes

Date_of_birth         object
Opening_date          object
Expiry_date           object
Credit_limit         float64
Statement_balance    float64
Available_credit     float64
dtype: object

In [6]:
def is_valid_arithmetic_equality(column_names, data):
    is_equal = [data[column_names[0]] == data[column_names[1]] + data[column_names[2]]][0]
    return is_equal
    
arithmetic_equality_constraint = create_custom_constraint(is_valid_fn = is_valid_arithmetic_equality)

def is_valid_arithmetic_inequality(column_names, data):
    is_equal = [data[column_names[0]] >= data[column_names[1]] + column_names[2]][0]
    return is_equal

arithmetic_inequality_constraint = create_custom_constraint(is_valid_fn = is_valid_arithmetic_inequality)

In [204]:
class SD_generator():
    """
    A class to detect deterministic relationships between two/three columns from a given dataset.
    Create constraints with detected relationships.
    Apply models in SDV and generate synthetic data.
    
    ...

    Attributes
    ----------
    data : Pandas DataFrame
        an input dataset in Pandas DataFrame format
        
    threshold : float
        a cut-off percentage for detection functions to confirm the deterministic relationships
        
    inequality_dict : dictionary
        stores inequality deterministic relationships;
        in which the key is greater than its values
    
    inequality_runtime : float
        a variable to store the runtime of inequality detection function
    
    arithmetic_equality_dict : dictionary
        stores deterministic relationships like "A = B + C" among three colomns
        
    arithmetic_equality_runtime : float
        a variable to store the runtime of arithmetic equality detection function
    
    arithmetic_equality_flag : bool
        a bolean variable; if True, apply arithmetic equality constraints to generate synthetic data
    
    arithmetic_inequality_dict : dictionary
        stores deterministic relationships like "A >= B + X" between two colomns
    
    arithmetic_inequality_runtime : float
        a variable to store the runtime of arithmetic inequality detection function
    
    constraints : list
        a list containing constraints for synthetic data generation model training
        
    models : dictionary
        an empty dictionary to store SDV models that are ready to generate synthetic data
        
    Methods
    -------
    preprocess():
        Change the dtpyes of date columns to float and 
        drop the rows of the input dataframe which have missing values.
    
    detect_inequality():
        Detect the inequality deterministic relationship between two colomns.
    
    detect_arithmetic_equality():
        Detect the deterministic relationships like "A = B + C" among three colomns.
        
    detect_arithmetic_inequality():
        Detect the deterministic relationships like "A >= B + X" between two colomns.
        
    create_constraints():
        Create constraints for synthetic data generation model training.
        
    apply_model():
        Train a specific model in SDV with constraints.
        Store the trained model in dictionary "models".
       
    generate():
        Generate synthetic data of a specific number of rows with a specific pre-trainned model.
    """
    def __init__(self, data, threshold, 
                 inequality_dict={}, inequality_runtime=0,
                 arithmetic_equality_dict={}, arithmetic_equality_runtime=0, arithmetic_equality_flag=False,
                 arithmetic_inequality_dict={}, arithmetic_inequality_runtime=0,
                 constraints=[], models={}):
        """
        Constructs all the necessary attributes for the person object.

        Parameters
        ----------
            dataframe : Pandas DataFrame
                an input dataset in Pandas DataFrame format
                
            threshold : float
                a cut-off percentage for detection functions to confirm the deterministic relationships
                
            inequality_dict : dictionary
                an empty dictionary to store inequality deterministic relationships
            
            inequality_runtime : float
                a variable to store the runtime of inequality detection function
    
            arithmetic_equality_dict : dictionary
                an empty dictionary to store deterministic relationships among three columns like "A = B + C"
                
            arithmetic_equality_runtime : float
                a variable to store the runtime of arithmetic equality detection function
            
            arithmetic_equality_flag : bool
                a bolean variable; if True, apply arithmetic equality constraints to generate synthetic data
            
            arithmetic_inequality : dictionary
                an empty dictionary to store deterministic relationships between two columnslike "A >= B + X"
                
            arithmetic_inequality_runtime : float
                a variable to store the runtime of arithmetic inequality detection function
            
            constarints : list
                an empty list to store contraints for synthetic data generation model training
            
            models : dictionary
                an empty dictionary to store SDV models that are ready to generate synthetic data
        """    
        self.data = data
        self.threshold = threshold
        self.inequality_dict = inequality_dict
        self.inequality_runtime = inequality_runtime
        self.arithmetic_equality_dict = arithmetic_equality_dict
        self.arithmetic_equality_runtime = arithmetic_equality_runtime
        self.arithmetic_equality_flag = arithmetic_equality_flag
        self.arithmetic_inequality_dict = arithmetic_inequality_dict 
        self.arithmetic_inequality_runtime = arithmetic_inequality_runtime
        self.constraints = constraints
        self.models = models
        
    def preprocess(self):
        """
        Change date columns to float format;
        Handle missing values of the input dataframe;
        Drop the rows with missing values.
        
        Returns:
            None.
        
        Output:
            Running finished message with execution time.
        """
        st = time.time()
        
        data = self.data
        
        ref_dt = pd.Timestamp('1900-01-01')

        str2date = lambda x: dt.strptime(x, "%Y-%m-%d") - ref_dt if x.replace(" ", "") else np.nan
        
        for col in data.columns:
            
            try:
                re.match('^[0-9]{4}\-[0-9]{2}\-[0-9]{2}$', data[col][0])
                
                self.data[col] = self.data[col].apply(str2date)
                self.data[col] = (self.data[col] / np.timedelta64(1, 'D')).astype(float)
                
            except:
                pass
            
        data.dropna(axis=0, inplace=True)
        self.data = data

        et = time.time()
        elapsed_time = et - st
        print("Date types reformatted and missing values handled successfully!\nExecution Time:"
              , round(elapsed_time, 4), "seconds")

    def detect_inequality(self):
        """
        Detect the inequality deterministic relationship between colomns;
        Update the inequality_dictionary of the class object.
        
        Returns:
            None.
        
        Output:
            Number of relationships detected with execution time.
        """
        st = time.time()
        
        data = self.data
        inequality_dict = {}
        
    # Looping through all combinitions of columns
        for i in range(len(data.columns)):
            for j in range(i+1, len(data.columns)):
                
                # Extract the column pairs in float format
                if (data[data.columns[i]].dtypes == 'float' and data[data.columns[j]].dtypes == 'float'):
                    diff = []
                    count = 0
                    for rownum, row in data.iterrows():
                        diff.append(row[i] - row[j])
                    for num in diff:
                        if num < 0:
                            count+=1 
                    if float(count)/len(diff) >= self.threshold:
                        if data.columns[j] in inequality_dict.keys():
                            inequality_dict[data.columns[j]].append(data.columns[i])
                        else:
                            inequality_dict[data.columns[j]] = []
                            inequality_dict[data.columns[j]].append(data.columns[i])
                    elif float(len(diff) - count)/len(diff) >= self.threshold:
                        if data.columns[i] in inequality_dict.keys():
                            inequality_dict[data.columns[i]].append(data.columns[j])
                        else:
                            inequality_dict[data.columns[i]] = []
                            inequality_dict[data.columns[i]].append(data.columns[j])
        
        key_list = []
        remove_list = []
        
        for key in inequality_dict:
            for i in key_list:
                if i in inequality_dict[key]:
                    for j in inequality_dict[i]:
                        inequality_dict[key].append(j)
                    remove_list.append(i)
                    key_list.remove(i)
            key_list.append(key)

        for i in remove_list:
            inequality_dict.pop(i)

        for key in inequality_dict:
            for i in range(0, len(inequality_dict[key])):
                for j in range(i+1, len(inequality_dict[key])):
                    if inequality_dict[key][i] == inequality_dict[key][j]:
                        inequality_dict[key].pop(i)
        
    
        self.inequality_dict = inequality_dict
        
        et = time.time()
        elapsed_time = et - st
        self.inequality_runtime = elapsed_time
        
        print(len(inequality_dict), "relationships detected")
        print("Execution Time:", round(self.inequality_runtime, 4), "seconds")

    def detect_arithmetic_equality(self):
        """
        Detect the deterministic relationships "A = B + C" among three columns;
        Based on the dictionary of inequality deterministic relationships.
        
        Returns:
            None.
        
        Output:
            Number of relationships detected with execution time.    
        """
        st = time.time()
        
        data = self.data
        arithmetic_equality_dict = {}
        
        for key in self.inequality_dict:
            # Check if the len(the list of values of the key) is >= 2 
            if len(self.inequality_dict[key]) >= 2:
                for i in range(len(self.inequality_dict[key])):
                    for j in range(i+1, len(self.inequality_dict[key])):
                        count = 0
                        for index, row in data.iterrows():
                            if (row[key] == row[self.inequality_dict[key][i]] + row[self.inequality_dict[key][j]]):
                                count += 1
                        if float(count) / len(data.index) >= self.threshold:
                            if key in arithmetic_equality_dict.keys():
                                arithmetic_equality_dict[key].append([self.inequality_dict[key][i], self.inequality_dict[key][j]])
                            else:
                                arithmetic_equality_dict[key] = []
                                arithmetic_equality_dict[key].append([self.inequality_dict[key][i], self.inequality_dict[key][j]])
                 
        self.arithmetic_equality_dict = arithmetic_equality_dict
        
        et = time.time()
        elapsed_time = et - st
        self.arithmetic_equality_runtime = elapsed_time
        
        print(len(arithmetic_equality_dict), "relationships detected")
        print("Execution Time:", round(self.arithmetic_equality_runtime, 4), "seconds")
    
    def detect_arithmetic_inequality(self):
        """
        Detect the deterministic relationships "A >= B + X" between two columns;
        Based on the dictionary of inequality deterministic relationships.
        
        Returns:
            None.
        
        Output:
            Number of relationships detected with execution time. 
        """
        st = time.time()
        
        data = self.data
        arithmetic_inequality_dict = {}
        
        for key in self.inequality_dict:
            for value in self.inequality_dict[key]:
                diff = []
                for index, row in data.iterrows():
                    diff.append(row[key] - row[value])
                    
                if key in arithmetic_inequality_dict.keys():
                    arithmetic_inequality_dict[key].append([value, min(diff)])
                else:
                    arithmetic_inequality_dict[key] = []
                    arithmetic_inequality_dict[key].append([value, min(diff)])
        
        self.arithmetic_inequality_dict = arithmetic_inequality_dict
            
        et = time.time()
        elapsed_time = et - st
        self.arithmetic_inequality_runtime = elapsed_time
        
        print(len(arithmetic_inequality_dict), "relationships detected")
        print("Execution Time:", round(self.arithmetic_inequality_runtime, 4), "seconds")
    
    def create_constraints(self, inequality=False, arithmetic_equality=False, arithmetic_inequality=False):
        """
        Create constraints for synthetic data generation model training.
        
        Parameters:
            inequality: bool, default=True
                If True, create constraints for inequality deterministic relationships.
            
            arithmetic_equality: bool, default=True
                If True, create constraints for deterministic relationships like "A = B + C".
            
            arithmetic_inequality: bool, default=True
                If True, create constraints for deterministic relationships like "A >= B + X".
        
        Returns:
            None.
        
        Output:
            Running finished message with execution time.
        """
        st = time.time()
        
        if inequality:
            for key in self.inequality_dict:
                for value in self.inequality_dict[key]:
                    self.constraints.append(Inequality(low_column_name=value, high_column_name=key))
        
        if arithmetic_equality:
            self.arithmetic_equality_flag=True
#             for key in self.arithmetic_equality_dict:
#                 for value_list in self.arithmetic_equality_dict[key]:
#                     columns = [key, value_list[0], value_list[1]]
#                     cons = arithmetic_equality_constraint(column_names=columns)
#                     self.constraints.append(cons)
                    
        if arithmetic_inequality:
            for key in self.arithmetic_inequality_dict:
                for value_list in self.arithmetic_inequality_dict[key]:
                    columns = [key, value_list[0], value_list[1]]
                    cons = arithmetic_inequality_constraint(column_names=columns)
                    self.constraints.append(cons)
        et = time.time()
        elapsed_time = et - st
        
        print("Constrainsts created successfully!\nExecution Time:"
              , round(elapsed_time, 4), "seconds")
        
    def resume_arith_equal_column(self, syn_data, arithmetic_equality=False):
        """
        Recalculate the key columns for arithmetic equality deterministic relationships.
        
        Parameters:
            syn_data: Pandas DataFrame
                The synthetic data generated by SDV models.
                
            arithmetic_equality: bool, default=False
                If True, recalculate the key column for arithmetic equality deterministic relationships.
        
        Returns:
            Synthetic data in DataFrame.
        
        """
        st = time.time()
        
        if arithmetic_equality:
            for key in self.arithmetic_equality_dict:
                syn_data[key] = syn_data[self.arithmetic_equality_dict[key][0][0]] + syn_data[self.arithmetic_equality_dict[key][0][1]]
                
            et = time.time()
            elapsed_time = et - st
        
            print("Execution Time:", round(elapsed_time, 4), "seconds")
            return syn_data
        else:
            print("No arithmetic quality constraints applied!\nThe original dataset is returned")
            return syn_data
    
    def apply_model(self, model_name=None):
        """
        Train a specific model in SDV with constraints.
        Store the trained model in dictionary "models".
        
        Parameters:
            model_name: string
                The name of the models in SDV.
                Eg. "GaussianCopula", "CTGAN", "CopulaGAN" and "TVAE".
        
        Returns:
            None.
        """
        st = time.time()
        
        if model_name == None:
            print("No input for model name!")
            return None
        
        elif model_name == 'GaussianCopula':
            model = GaussianCopula(constraints=self.constraints)
            model.fit(self.data)
            self.models['GaussianCopula'] = model
            
            et = time.time()
            elapsed_time = et - st
        
            print("Execution Time:", round(elapsed_time, 4), "seconds")
            
            
        elif model_name == 'CTGAN':
            model = CTGAN(constraints=self.constraints)
            model.fit(self.data)
            self.models['CTGAN'] = model
            
            et = time.time()
            elapsed_time = et - st
        
            print(f"Execution Time for training {model_name}:", round(elapsed_time, 4), "seconds")
            
        elif model_name == 'CopulaGAN':
            model = CopulaGAN(constraints=self.constraints)
            model.fit(self.data)
            self.models['CopulaGAN'] = model
            
            et = time.time()
            elapsed_time = et - st
        
            print(f"Execution Time for training {model_name}:", round(elapsed_time, 4), "seconds")
            
        elif model_name == 'TVAE':
            model = TVAE(constraints=self.constraints)
            model.fit(self.data)
            self.models['TVAE'] = model
            
            et = time.time()
            elapsed_time = et - st
        
            print(f"Execution Time for training {model_name}:", round(elapsed_time, 4), "seconds")
            
        else:
            print("Wrong model name!")
            
    def generate(self, model_name=None, num_rows=0):
        """
        Generate synthetic data of a specific number of rows with a specific pre-trainned model.
        
        Parameters:
            model_name: string
                The name of the models in SDV.
                Eg. "GaussianCopula", "CTGAN", "CopulaGAN" and "TVAE".
        
            num_rows: integer
                Number of rows needed to generate.
                
        Returns:
            Synthetic data in DataFrame type.
        """
        st = time.time()
        
        if model_name == None:
            print("No input for model name!")
            return None
        elif (num_rows == 0) | (type(num_rows) != int):
            print("Number of rows has to be integer and greater than 0!")
            return None
        
        elif model_name not in ["GaussianCopula", "CTGAN", "CopulaGAN", "TVAE"]:
            print("Wrong model name!\nAccepted model name: GaussianCopula, CTAGAN, CopulaGAN and TVAE.")
            return None
        
        else:
            syn_data = self.models[model_name].sample(num_rows=num_rows)
            
            if self.arithmetic_equality_flag:
                for key in self.arithmetic_equality_dict:
                    syn_data[key] = syn_data[self.arithmetic_equality_dict[key][0][0]] + syn_data[self.arithmetic_equality_dict[key][0][1]]
                    
            et = time.time()
            elapsed_time = et - st       
            print(f"Synthetic data generated successfully with {model_name} model!\nExecution Time:", round(elapsed_time, 4), "seconds")
            
            return syn_data
            
            
            

In [205]:
d = SD_generator(data, 0.95)

In [206]:
d.preprocess()

Date types reformatted and missing values handled successfully!
Execution Time: 0.0041 seconds


In [207]:
d.detect_inequality()
d.inequality_dict

2 relationships detected
Execution Time: 1.2985 seconds


{'Expiry_date': ['Opening_date', 'Date_of_birth'],
 'Credit_limit': ['Statement_balance', 'Available_credit']}

In [208]:
d.detect_arithmetic_equality()
d.arithmetic_equality_dict

1 relationships detected
Execution Time: 0.2172 seconds


{'Credit_limit': [['Statement_balance', 'Available_credit']]}

In [209]:
d.detect_arithmetic_inequality()
d.arithmetic_inequality_dict

2 relationships detected
Execution Time: 0.3947 seconds


{'Expiry_date': [['Opening_date', 1826.0], ['Date_of_birth', 8425.0]],
 'Credit_limit': [['Statement_balance', 3.9400000000023283],
  ['Available_credit', 7.260000000000218]]}

In [210]:
d.create_constraints(inequality=True, arithmetic_equality=True, arithmetic_inequality=True)

Constrainsts created successfully!
Execution Time: 0.0002 seconds


In [211]:
d.constraints

[<sdv.constraints.tabular.Inequality at 0x7f84746200a0>,
 <sdv.constraints.tabular.Inequality at 0x7f84742df520>,
 <sdv.constraints.tabular.Inequality at 0x7f84742df3d0>,
 <sdv.constraints.tabular.Inequality at 0x7f84742df130>,
 <sdv.constraints.tabular.create_custom_constraint.<locals>.CustomConstraint at 0x7f84742df2e0>,
 <sdv.constraints.tabular.create_custom_constraint.<locals>.CustomConstraint at 0x7f84742df430>,
 <sdv.constraints.tabular.create_custom_constraint.<locals>.CustomConstraint at 0x7f84742df3a0>,
 <sdv.constraints.tabular.create_custom_constraint.<locals>.CustomConstraint at 0x7f84742df250>]

In [212]:
d.apply_model("GaussianCopula")
d.apply_model("CTGAN")
d.apply_model("CopulaGAN")
d.apply_model("TVAE")

Execution Time: 0.7398 seconds
Execution Time for training CTGAN: 50.5347 seconds
Execution Time for training CopulaGAN: 51.5876 seconds
Execution Time for training TVAE: 17.2607 seconds


In [213]:
d.models

{'GaussianCopula': <sdv.tabular.copulas.GaussianCopula at 0x7f847483a5b0>,
 'CTGAN': <sdv.tabular.ctgan.CTGAN at 0x7f8474526dc0>,
 'CopulaGAN': <sdv.tabular.copulagan.CopulaGAN at 0x7f847483a490>,
 'TVAE': <sdv.tabular.ctgan.TVAE at 0x7f8474629ee0>}

In [214]:
syn_data1 = d.generate("GaussianCopula", num_rows=200)
syn_data1.head()

Sampling rows: 100%|██████████| 200/200 [00:00<00:00, 1095.43it/s]

Synthetic data generated successfully with GaussianCopula model!
Execution Time: 0.1866 seconds





Unnamed: 0,Date_of_birth,Opening_date,Expiry_date,Credit_limit,Statement_balance,Available_credit
0,28406.0,41531.0,43357.223104,8821.373229,8088.73,732.643229
1,8737.0,40838.0,42664.222731,78353.135634,28562.71,49790.425634
2,26025.0,40378.0,42204.223172,45959.927729,31816.22,14143.707729
3,15770.0,42815.0,44641.223264,74393.579093,33865.68,40527.899093
4,31503.0,41557.0,43383.22287,64248.181089,22026.58,42221.601089


In [215]:
syn_data2 = d.generate("CTGAN", num_rows=200)
syn_data2.head()

Sampling rows: 100%|██████████| 200/200 [00:00<00:00, 1478.79it/s]

Synthetic data generated successfully with CTGAN model!
Execution Time: 0.1397 seconds





Unnamed: 0,Date_of_birth,Opening_date,Expiry_date,Credit_limit,Statement_balance,Available_credit
0,20836.0,39953.0,41779.44895,30963.897036,30529.9,433.997036
1,26495.0,42986.0,44812.303555,48350.307088,39869.75,8480.557088
2,30434.0,42270.0,44097.728963,24008.239544,15297.99,8710.249544
3,28823.0,44806.0,46632.34329,30422.02,30418.08,3.94
4,28704.0,38017.0,39843.642501,9874.05,9870.11,3.94


In [216]:
syn_data3 = d.generate("CopulaGAN", num_rows=200)
syn_data3.head()

Sampling rows: 100%|██████████| 200/200 [00:00<00:00, 986.31it/s]

Synthetic data generated successfully with CopulaGAN model!
Execution Time: 0.2063 seconds





Unnamed: 0,Date_of_birth,Opening_date,Expiry_date,Credit_limit,Statement_balance,Available_credit
0,24118.0,42199.0,44026.672444,91281.82673,37250.3,54031.52673
1,10672.0,41829.0,43656.593877,110428.9672,73559.07,36869.8972
2,18724.0,37622.0,39450.438761,16716.483666,11510.75,5205.733666
3,13713.0,40258.0,42087.45018,156384.48067,77368.22,79016.26067
4,9370.0,42710.0,44537.051633,48666.198163,19451.2,29214.998163


In [217]:
syn_data3 = d.generate("TVAE", num_rows=200)
syn_data3.head()

Sampling rows: 100%|██████████| 200/200 [00:00<00:00, 1768.45it/s]

Synthetic data generated successfully with TVAE model!
Execution Time: 0.1172 seconds





Unnamed: 0,Date_of_birth,Opening_date,Expiry_date,Credit_limit,Statement_balance,Available_credit
0,4574.0,38764.0,40590.621279,78878.721129,69934.72,8944.001129
1,12149.0,39850.0,41677.114099,68862.614817,30856.14,38006.474817
2,7362.0,36982.0,38808.327142,33609.270866,3988.13,29621.140866
3,33303.0,44923.0,46749.245971,44936.476967,40911.03,4025.446967
4,27218.0,39590.0,41416.531448,33329.210216,26924.84,6404.370216
