In [1]:
import os

from tabgan.sampler import OriginalGenerator, GANGenerator
import pandas as pd
import numpy as np

In [2]:
# Random data experiment: synthetic integer features, a binary target and a
# separate test frame, all drawn from numpy's global random generator.
train = pd.DataFrame(
    np.random.randint(-10, 150, size=(150, 4)),
    columns=["A", "B", "C", "D"],
)
target = pd.DataFrame(
    np.random.randint(0, 2, size=(150, 1)),
    columns=["Y"],
)
test = pd.DataFrame(
    np.random.randint(0, 100, size=(100, 4)),
    columns=["A", "B", "C", "D"],
)

# Generate data with both generators, each left at its default settings.
original_generator = OriginalGenerator()
new_train1, new_target1 = original_generator.generate_data_pipe(train, target, test)

default_gan_generator = GANGenerator()
new_train2, new_target2 = default_gan_generator.generate_data_pipe(train, target, test)

# Same GAN pipeline again, this time with every constructor and pipe argument
# spelled out explicitly.
tuned_gan_generator = GANGenerator(
    gen_x_times=1.1,
    cat_cols=None,
    bot_filter_quantile=0.001,
    top_filter_quantile=0.999,
    is_post_process=True,
    adversarial_model_params={
        "metrics": "AUC",
        "max_depth": 2,
        "max_bin": 100,
        "learning_rate": 0.02,
        "random_state": 42,
        "n_estimators": 500,
    },
    pregeneration_frac=2,
    only_generated_data=False,
    gan_params={"batch_size": 500, "patience": 25, "epochs": 500},
)
new_train3, new_target3 = tuned_gan_generator.generate_data_pipe(
    train,
    target,
    test,
    deep_copy=True,
    only_adversarial=False,
    use_adversarial=True,
)



Fitting CTGAN transformers for each column:   0%|          | 0/5 [00:00<?, ?it/s]

Training CTGAN, epochs::   0%|          | 0/500 [00:00<?, ?it/s]



Fitting CTGAN transformers for each column:   0%|          | 0/5 [00:00<?, ?it/s]

Training CTGAN, epochs::   0%|          | 0/500 [00:00<?, ?it/s]



In [3]:
# Display the random training features built for the random-data experiment.
train

Unnamed: 0,A,B,C,D
0,29,17,80,-7
1,115,89,-8,106
2,120,-9,42,137
3,136,105,131,106
4,105,55,46,96
...,...,...,...,...
145,123,131,23,104
146,137,85,119,96
147,95,99,115,38
148,39,146,20,20


In [4]:
# Display the random 0/1 target column Y that accompanies `train`.
target

Unnamed: 0,Y
0,1
1,1
2,1
3,1
4,0
...,...
145,1
146,1
147,0
148,1


In [5]:
# Display the features returned by GANGenerator when run with default settings.
new_train2

Unnamed: 0,A,B,C,D
0,44,43,45,55
1,7,54,51,55
2,1,36,60,50
3,14,24,1,37
4,12,35,45,39
5,50,85,82,55
6,53,61,5,35
7,49,8,70,37
8,50,19,97,38
9,41,75,37,58


In [6]:
# NOTE(review): this repeats the previous cell's expression verbatim; possibly a
# different result (e.g. new_target2 or new_train3) was meant to be shown - confirm.
new_train2

Unnamed: 0,A,B,C,D
0,44,43,45,55
1,7,54,51,55
2,1,36,60,50
3,14,24,1,37
4,12,35,45,39
5,50,85,82,55
6,53,61,5,35
7,49,8,70,37
8,50,19,97,38
9,41,75,37,58


In [7]:
# Time-series experiment: random integer features plus a random calendar date
# per row; the date is expanded into date-part columns before being fed to the
# GAN pipeline.

from tabgan.utils import get_year_mnth_dt_from_date
from tabgan.sampler import GANGenerator


train_size = 100
train = pd.DataFrame(np.random.randint(-10, 150, size=(train_size, 4)), columns=list("ABCD"))
min_date = pd.to_datetime('2019-01-01')
max_date = pd.to_datetime('2021-12-31')
# Number of calendar days in the inclusive range [min_date, max_date].
d = (max_date - min_date).days + 1

# Draw a random day offset in [0, d) for every row. numpy is called directly:
# the `pd.np` alias was deprecated in pandas 1.0 and removed in pandas 2.0.
train['Date'] = min_date + pd.to_timedelta(np.random.randint(d, size=train_size), unit='d')
# The displayed frame shows year/month/day columns after this call.
train = get_year_mnth_dt_from_date(train, 'Date')

# No target is supplied (None), and the same Date-less frame serves as both the
# train and the test input; the raw 'Date' column is dropped from each.
new_train, _ = GANGenerator(
    gen_x_times=1.1,
    cat_cols=['year'],
    bot_filter_quantile=0.001,
    top_filter_quantile=0.999,
    is_post_process=True,
    pregeneration_frac=2,
    only_generated_data=False
).generate_data_pipe(train.drop('Date', axis=1), None, train.drop('Date', axis=1))

Fitting CTGAN transformers for each column:   0%|          | 0/7 [00:00<?, ?it/s]

Training CTGAN, epochs::   0%|          | 0/500 [00:00<?, ?it/s]



In [8]:
# Display the time-series training frame, including the Date column and the
# year/month/day columns present after get_year_mnth_dt_from_date.
train

Unnamed: 0,A,B,C,D,Date,year,month,day
0,69,91,-2,46,2021-05-21,2021,5,21
1,149,84,143,64,2019-08-15,2019,8,15
2,71,53,23,112,2019-02-24,2019,2,24
3,26,24,25,61,2020-07-28,2020,7,28
4,83,43,118,22,2019-08-09,2019,8,9
...,...,...,...,...,...,...,...,...
95,52,119,101,76,2020-07-17,2020,7,17
96,15,114,134,132,2020-12-04,2020,12,4
97,80,48,59,29,2020-01-27,2020,1,27
98,66,43,117,89,2019-01-29,2019,1,29


In [9]:
# Display the frame returned by the GAN pipeline for the time-series experiment.
new_train

Unnamed: 0,A,B,C,D,year,month,day
0,1,46,90,26,2019,12,15
1,-6,43,19,91,2019,11,31
2,4,110,134,60,2021,4,23
3,-1,88,109,37,2021,5,20
4,4,40,100,31,2021,4,21
...,...,...,...,...,...,...,...
161,112,145,108,105,2021,8,19
162,126,38,97,140,2021,7,31
163,94,9,124,112,2021,12,23
164,103,51,104,100,2021,12,9


In [15]:
# Real-world data experiment: load the dataset stored at ../data/adult/adult.gz.
dataset_pth = "../data/adult/adult.gz"

data = pd.read_csv(dataset_pth)
# Fill missing values with the per-column mean. `numeric_only=True` is needed
# because the frame also holds string columns: since pandas 2.0 a plain
# `DataFrame.mean()` no longer silently skips them and raises TypeError instead.
# Reassigning (rather than inplace=True) keeps the cell safe to re-run.
data = data.fillna(data.mean(numeric_only=True))
data.head()

Unnamed: 0,num_0,cat_1,num_2,cat_3,num_4,cat_5,cat_6,cat_7,cat_8,cat_9,num_10,num_11,num_12,cat_13,target
0,25,Private,226802,11th,7,Never-married,Machine-op-inspct,Own-child,Black,Male,0,0,40,United-States,False
1,38,Private,89814,HS-grad,9,Married-civ-spouse,Farming-fishing,Husband,White,Male,0,0,50,United-States,False
2,28,Local-gov,336951,Assoc-acdm,12,Married-civ-spouse,Protective-serv,Husband,White,Male,0,0,40,United-States,True
3,44,Private,160323,Some-college,10,Married-civ-spouse,Machine-op-inspct,Husband,Black,Male,7688,0,40,United-States,True
4,18,?,103497,Some-college,10,Never-married,?,Own-child,White,Female,0,0,30,United-States,False


In [24]:
from sklearn.model_selection import train_test_split

# Every column whose name begins with "cat" is collected as a categorical column.
cat_cols = [name for name in data.columns if name.startswith("cat")]

# Ordered (unshuffled) split: 85% of the rows go to the test part, the rest to
# the train part. The target stays a one-column DataFrame.
X_train, X_test, y_train, y_test = train_test_split(
    data.drop(columns=["target"]),
    data[["target"]],
    test_size=0.85,
    shuffle=False,
    random_state=42,
)

# Renumber the test rows from zero; the train part is left as returned.
X_test = X_test.reset_index(drop=True)
y_test = y_test.reset_index(drop=True)

In [25]:
# Display the training features produced by the unshuffled split.
X_train

Unnamed: 0,num_0,cat_1,num_2,cat_3,num_4,cat_5,cat_6,cat_7,cat_8,cat_9,num_10,num_11,num_12,cat_13
0,25,Private,226802,11th,7,Never-married,Machine-op-inspct,Own-child,Black,Male,0,0,40,United-States
1,38,Private,89814,HS-grad,9,Married-civ-spouse,Farming-fishing,Husband,White,Male,0,0,50,United-States
2,28,Local-gov,336951,Assoc-acdm,12,Married-civ-spouse,Protective-serv,Husband,White,Male,0,0,40,United-States
3,44,Private,160323,Some-college,10,Married-civ-spouse,Machine-op-inspct,Husband,Black,Male,7688,0,40,United-States
4,18,?,103497,Some-college,10,Never-married,?,Own-child,White,Female,0,0,30,United-States
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7321,35,Private,109133,HS-grad,9,Separated,Machine-op-inspct,Not-in-family,White,Male,3674,0,52,United-States
7322,35,Private,196123,Assoc-acdm,12,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,40,United-States
7323,55,Private,123436,Assoc-voc,11,Married-civ-spouse,Prof-specialty,Husband,White,Male,0,0,40,United-States
7324,56,Self-emp-inc,42298,9th,5,Married-civ-spouse,Exec-managerial,Husband,Black,Male,0,0,40,United-States


In [26]:
# Configure the GAN generator for the real-world dataset; the "cat"-prefixed
# columns are passed as categorical and training is limited to 50 epochs.
adult_gan_generator = GANGenerator(
    gen_x_times=1.1,
    cat_cols=cat_cols,
    bot_filter_quantile=0.001,
    top_filter_quantile=0.999,
    is_post_process=True,
    adversarial_model_params={
        "metrics": "AUC",
        "max_depth": 2,
        "max_bin": 100,
        "learning_rate": 0.02,
        "random_state": 42,
        "n_estimators": 500,
    },
    pregeneration_frac=2,
    only_generated_data=False,
    gan_params={"batch_size": 500, "patience": 25, "epochs": 50},
)

# Run the pipeline on the split produced earlier, with all pipe flags explicit.
X_new, Y_new = adult_gan_generator.generate_data_pipe(
    X_train,
    y_train,
    X_test,
    deep_copy=True,
    only_adversarial=False,
    use_adversarial=True,
)

Fitting CTGAN transformers for each column:   0%|          | 0/15 [00:00<?, ?it/s]

Training CTGAN, epochs::   0%|          | 0/50 [00:00<?, ?it/s]



In [27]:
# Display the feature frame returned by the GAN pipeline for the real-world dataset.
X_new

Unnamed: 0,num_0,cat_1,num_2,cat_3,num_4,cat_5,cat_6,cat_7,cat_8,cat_9,num_10,num_11,num_12,cat_13
0,30,Local-gov,57722,Masters,13,Never-married,?,Not-in-family,White,Male,192,8,39,United-States
1,75,Private,217196,Some-college,4,Married-civ-spouse,Adm-clerical,Wife,White,Male,184,1,40,United-States
2,37,Private,181628,9th,9,Divorced,Protective-serv,Wife,White,Male,19,2,39,United-States
3,34,Private,153016,5th-6th,13,Married-civ-spouse,Exec-managerial,Not-in-family,Black,Male,95,9,39,United-States
4,47,?,87484,Bachelors,8,Never-married,Craft-repair,Unmarried,White,Female,212,3,18,United-States
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11293,76,Self-emp-not-inc,42162,HS-grad,9,Widowed,Exec-managerial,Not-in-family,White,Female,0,0,2,United-States
11294,71,Local-gov,161342,HS-grad,9,Divorced,Exec-managerial,Unmarried,White,Female,0,0,3,United-States
11295,77,?,309955,Assoc-acdm,12,Married-civ-spouse,?,Husband,White,Male,0,1411,2,United-States
11296,46,Private,168191,HS-grad,9,Separated,Craft-repair,Not-in-family,White,Male,0,0,2,Italy
