In [1]:
#data loading
import pandas as pd
import zipfile

predata = pd.read_csv('5transfers_rund.csv')

In [None]:
#imbalance of the dataset
predata['is_fraud'].value_counts()

In [2]:
#ignore warnings from ctgan
import sys
import warnings

if not sys.warnoptions:
    warnings.simplefilter("ignore")

In [None]:
#visualize the whole output
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

In [None]:
#initial number of columns and rows of the dataframe
predata.shape

In [None]:
#columns of the dataframe
predata.columns

In [3]:
#generate synthetic frauds  
from sdv.tabular import CTGAN

model = CTGAN(verbose = True, epochs = 1, log_frequency = False)  #remove epochs, default it's 300

model.fit(predata) 

Epoch 1, Loss G:  2.4302,Loss D: -0.3505


In [4]:
#number of frauds to generate 
f = len(predata[(predata['is_fraud'] == 0)]) - len(predata[(predata['is_fraud'] == 1)])

In [5]:
#conditional sampling
from sdv.sampling import Condition

#generate only values that satisfy this condition
condition = Condition({'is_fraud': 1}, num_rows = f)

#generate synthetic data
frauds_data = model.sample_conditions(conditions = [condition])

Sampling conditions: 100%|█████████████████████████████████████████████████████████| 6543/6543 [02:30<00:00, 43.44it/s]


In [6]:
#create the new dataframe
predatabalanced = pd.concat([predata, frauds_data], ignore_index = True) 

#save the balanced dataset
predatabalanced.to_csv("2transfers_balanced_gan+rund.csv", index = False)

In [None]:
#make sure the dataset is now balanced
predatabalanced['is_fraud'].value_counts()

In [None]:
#select all transfers frauds after balance and save in a csv file
#transfers frauds
target_col = 'is_fraud'
frauds_transfers = predatabalanced[(predatabalanced[target_col] == 1)].copy()

#save csv file
frauds_transfers.to_csv("5transfers_balanced_gan+rund.csv", index = False)

In [7]:
#comparision of the shape of the actual data and the generated data
from table_evaluator import load_data, TableEvaluator

predata.shape, predatabalanced.shape

((7681, 19), (14224, 19))

In [12]:
#compare dataframes by statistical tests
from sdv.metrics.tabular import CSTest, KSTest, GMLogLikelihood

#chi-2 test to compare the distributions of the discrete (categorical/boolean) columns
CSTest.compute(predata, predatabalanced)

0.49317536138808077

In [11]:
#two-sample kolmogorov–smirnov test to compare the distributions of numerical columns using the empirical CDF
KSTest.compute(predata, predatabalanced)

0.9058244683238915

In [15]:
#fits multiple GaussianMixture models to the real data and evaluates how likely it is that the synthetic data belongs to the 
#same distribution as the real data
GMLogLikelihood.compute(predata, predatabalanced)

-72.44870295447264