In [15]:
import os

from tabgan.sampler import OriginalGenerator, GANGenerator
import pandas as pd
import numpy as np

In [16]:
# Random data experiment: 150 random training rows with a 0/1 label column "Y",
# plus 100 random test rows over the same four feature columns.
feature_names = list("ABCD")
train = pd.DataFrame(np.random.randint(-10, 150, size=(150, 4)), columns=feature_names)
target = pd.DataFrame(np.random.randint(0, 2, size=(150, 1)), columns=["Y"])
test = pd.DataFrame(np.random.randint(0, 100, size=(100, 4)), columns=feature_names)

# Generate data with each generator left at its default settings.
new_train1, new_target1 = OriginalGenerator().generate_data_pipe(train, target, test)
new_train2, new_target2 = GANGenerator().generate_data_pipe(train, target, test)

# Example with all params defined: the nested parameter dicts are named first,
# then the generator is built, then the pipeline is run.
adversarial_params = dict(
    metrics="AUC",
    max_depth=2,
    max_bin=100,
    learning_rate=0.02,
    random_state=42,
    n_estimators=500,
)
ctgan_params = dict(batch_size=500, patience=25, epochs=500)

explicit_generator = GANGenerator(
    gen_x_times=1.1,
    cat_cols=None,
    bot_filter_quantile=0.001,
    top_filter_quantile=0.999,
    is_post_process=True,
    adversarial_model_params=adversarial_params,
    pregeneration_frac=2,
    only_generated_data=False,
    gan_params=ctgan_params,
)
new_train3, new_target3 = explicit_generator.generate_data_pipe(
    train,
    target,
    test,
    deep_copy=True,
    only_adversarial=False,
    use_adversarial=True,
)



Fitting CTGAN transformers for each column:   0%|          | 0/5 [00:00<?, ?it/s]

Training CTGAN, epochs::   0%|          | 0/500 [00:00<?, ?it/s]



Fitting CTGAN transformers for each column:   0%|          | 0/5 [00:00<?, ?it/s]

Training CTGAN, epochs::   0%|          | 0/500 [00:00<?, ?it/s]



In [17]:
# Display the randomly generated training features (columns A-D).
train

Unnamed: 0,A,B,C,D
0,102,121,40,-9
1,10,42,21,22
2,46,97,15,131
3,47,87,50,82
4,55,16,0,-3
...,...,...,...,...
145,9,19,21,84
146,66,92,50,148
147,-5,61,107,98
148,15,70,52,126


In [18]:
# Display the randomly generated 0/1 target column Y.
target

Unnamed: 0,Y
0,1
1,0
2,1
3,1
4,0
...,...
145,1
146,1
147,1
148,0


In [19]:
# Display the training frame returned by GANGenerator with default settings.
new_train2

Unnamed: 0,A,B,C,D
0,88,76,25,74
1,82,85,44,86
2,86,84,21,57
3,71,77,35,73
4,75,84,49,85
5,65,88,19,57
6,64,78,49,78
7,58,95,31,62
8,80,68,38,56
9,85,95,35,53


In [20]:
# NOTE(review): this repeats the previous cell's display of new_train2; if a
# different result (e.g. new_train3) was meant to be shown, update the expression.
new_train2

Unnamed: 0,A,B,C,D
0,88,76,25,74
1,82,85,44,86
2,86,84,21,57
3,71,77,35,73
4,75,84,49,85
5,65,88,19,57
6,64,78,49,78
7,58,95,31,62
8,80,68,38,56
9,85,95,35,53


In [29]:
%%capture --no-display
# Time-series experiment

from tabgan.utils import get_year_mnth_dt_from_date
from tabgan.sampler import GANGenerator


train_size = 100
train = pd.DataFrame(np.random.randint(-10, 150, size=(train_size, 4)), columns=list("ABCD"))
min_date = pd.to_datetime('2019-01-01')
max_date = pd.to_datetime('2021-12-31')
d = (max_date - min_date).days + 1

train['Date'] = min_date + pd.to_timedelta(pd.np.random.randint(d, size=train_size), unit='d')
train = get_year_mnth_dt_from_date(train, 'Date')

new_train, _ = GANGenerator(
    gen_x_times=1.1,
    cat_cols=['year'],
    bot_filter_quantile=0.001,
    top_filter_quantile=0.999,
    is_post_process=True,
    pregeneration_frac=2,
    only_generated_data=False
).generate_data_pipe(train.drop('Date', axis=1), None, train.drop('Date', axis=1))

Fitting CTGAN transformers for each column:   0%|          | 0/7 [00:00<?, ?it/s]

Training CTGAN, epochs::   0%|          | 0/500 [00:00<?, ?it/s]

In [30]:
# Display the frame returned by the time-series GANGenerator run.
new_train

Unnamed: 0,A,B,C,D,year,month,day
0,14,101,63,57,2020,9,4
1,-5,63,76,18,2021,11,2
2,-5,31,73,21,2020,12,3
3,53,90,34,23,2019,10,3
4,22,33,112,59,2021,12,4
...,...,...,...,...,...,...,...
124,75,30,76,86,2021,6,20
125,121,80,116,101,2020,9,30
126,85,28,54,148,2019,10,28
127,79,14,119,94,2020,1,26


In [23]:
# Display the time-series training frame.
# NOTE(review): this cell's execution count (23) precedes the cell that builds
# the frame (29), so the saved output may be stale - re-run to confirm.
train

Unnamed: 0,A,B,C,D,year,month,day
0,144,70,85,78,2020,11,16
1,72,32,59,40,2021,12,14
2,116,35,83,134,2021,12,20
3,139,61,33,40,2021,12,19
4,128,86,112,35,2021,10,15
...,...,...,...,...,...,...,...
138,3,0,105,128,2021,3,4
139,28,73,132,92,2020,1,6
140,12,8,51,100,2019,2,10
141,5,54,140,31,2019,3,29


In [24]:
# Real-world data experiment: load the adult dataset and impute missing values
# in the numeric columns with the column mean.
dataset_pth = "../data/adult/adult.gz"

data = pd.read_csv(dataset_pth)
# numeric_only=True restricts the mean to numeric/boolean columns; without it,
# pandas >= 2.0 raises TypeError on the string (cat_*) columns. The result is
# reassigned instead of mutating the frame with inplace=True.
data = data.fillna(data.mean(numeric_only=True))
data.head()

Unnamed: 0,num_0,cat_1,num_2,cat_3,num_4,cat_5,cat_6,cat_7,cat_8,cat_9,num_10,num_11,num_12,cat_13,target
0,25,Private,226802,11th,7,Never-married,Machine-op-inspct,Own-child,Black,Male,0,0,40,United-States,False
1,38,Private,89814,HS-grad,9,Married-civ-spouse,Farming-fishing,Husband,White,Male,0,0,50,United-States,False
2,28,Local-gov,336951,Assoc-acdm,12,Married-civ-spouse,Protective-serv,Husband,White,Male,0,0,40,United-States,True
3,44,Private,160323,Some-college,10,Married-civ-spouse,Machine-op-inspct,Husband,Black,Male,7688,0,40,United-States,True
4,18,?,103497,Some-college,10,Never-married,?,Own-child,White,Female,0,0,30,United-States,False


In [25]:
from sklearn.model_selection import train_test_split

# Categorical columns are recognised by their "cat" name prefix.
cat_cols = [name for name in data.columns if name.startswith("cat")]

# Separate the features from the single-column label frame.
features = data.drop(columns="target")
labels = data[["target"]]

# Ordered (unshuffled) split: the last 85% of the rows form the test side.
X_train, X_test, y_train, y_test = train_test_split(
    features,
    labels,
    test_size=0.85,
    shuffle=False,
    random_state=42,
)

# Renumber the test split from zero, discarding the original row labels.
X_test = X_test.reset_index(drop=True)
y_test = y_test.reset_index(drop=True)

In [26]:
# Display the training split of the adult features (the first 15% of rows, unshuffled).
X_train

Unnamed: 0,num_0,cat_1,num_2,cat_3,num_4,cat_5,cat_6,cat_7,cat_8,cat_9,num_10,num_11,num_12,cat_13
0,25,Private,226802,11th,7,Never-married,Machine-op-inspct,Own-child,Black,Male,0,0,40,United-States
1,38,Private,89814,HS-grad,9,Married-civ-spouse,Farming-fishing,Husband,White,Male,0,0,50,United-States
2,28,Local-gov,336951,Assoc-acdm,12,Married-civ-spouse,Protective-serv,Husband,White,Male,0,0,40,United-States
3,44,Private,160323,Some-college,10,Married-civ-spouse,Machine-op-inspct,Husband,Black,Male,7688,0,40,United-States
4,18,?,103497,Some-college,10,Never-married,?,Own-child,White,Female,0,0,30,United-States
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7321,35,Private,109133,HS-grad,9,Separated,Machine-op-inspct,Not-in-family,White,Male,3674,0,52,United-States
7322,35,Private,196123,Assoc-acdm,12,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,40,United-States
7323,55,Private,123436,Assoc-voc,11,Married-civ-spouse,Prof-specialty,Husband,White,Male,0,0,40,United-States
7324,56,Self-emp-inc,42298,9th,5,Married-civ-spouse,Exec-managerial,Husband,Black,Male,0,0,40,United-States


In [31]:
%%capture --no-display
X_new, Y_new = GANGenerator(
    gen_x_times=1.1,
    cat_cols=cat_cols,
    bot_filter_quantile=0.001,
    top_filter_quantile=0.999,
    is_post_process=True,
    adversarial_model_params = {
        "metrics": "AUC",
        "max_depth": 2,
        "max_bin": 100,
        "learning_rate": 0.02,
        "random_state": 42,
        "n_estimators": 500,
    },
    pregeneration_frac=2,
    only_generated_data=False,
    gan_params = {
        "batch_size": 500,
        "patience": 25,
        "epochs": 50
    }
).generate_data_pipe(
    X_train,
    y_train,
    X_test,
    deep_copy=True,
    only_adversarial=False,
    use_adversarial=True
)

Fitting CTGAN transformers for each column:   0%|          | 0/15 [00:00<?, ?it/s]

Training CTGAN, epochs::   0%|          | 0/50 [00:00<?, ?it/s]

In [32]:
# Display the feature frame returned by GANGenerator for the adult data.
X_new

Unnamed: 0,num_0,cat_1,num_2,cat_3,num_4,cat_5,cat_6,cat_7,cat_8,cat_9,num_10,num_11,num_12,cat_13
0,38,Federal-gov,114759,Some-college,4,Divorced,?,Own-child,White,Male,92,4,39,United-States
1,62,Private,479795,Bachelors,15,Married-civ-spouse,Exec-managerial,Husband,White,Male,92,9,40,United-States
2,49,Private,129348,Some-college,6,Married-spouse-absent,Exec-managerial,Not-in-family,White,Male,72165,1,40,United-States
3,27,Private,500138,Assoc-acdm,9,Never-married,Adm-clerical,Not-in-family,Black,Female,68,2,39,United-States
4,60,?,273645,Preschool,9,Married-civ-spouse,Machine-op-inspct,Husband,White,Female,210,9,39,?
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10244,38,Private,258339,Assoc-voc,11,Married-civ-spouse,Other-service,Husband,White,Male,0,0,70,Iran
10245,47,Private,363418,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,45,England
10246,48,Self-emp-inc,275100,HS-grad,9,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,70,Greece
10247,37,Self-emp-not-inc,143774,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,White,Female,0,0,40,Germany
