In [1]:
import numpy as np
import pandas as pd
import copy
from sklearn.model_selection import train_test_split
import scipy
import matplotlib.pyplot as plt
import torch
import torch.optim.swa_utils as swa_utils
import torchcde
import torchsde
import tqdm
import model2
from model2 import Generator, Discriminator
import json
import sdmetrics
from sdmetrics.reports.single_table import QualityReport
import xgboost as xgb
from scipy import stats

In [2]:
from data_transformer import DataTransformer

In [3]:
adult_names = ["age", "workclass", "fnlwgt", "education", "education-num", "marital-status", "occupation", "relationship", "race", "sex", "capital-gain", "capital-loss", "hours-per-week", "native-country", "Over_50k_Salary"]

adult_data =  pd.read_csv('adult.data', names = adult_names, index_col = False)

In [4]:
df = copy.deepcopy(adult_data)
df["Over_50k_Salary"] = pd.factorize(df["Over_50k_Salary"], sort=True )[0]

In [5]:
df

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,Over_50k_Salary
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,0
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,0
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,0
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,0
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32556,27,Private,257302,Assoc-acdm,12,Married-civ-spouse,Tech-support,Wife,White,Female,0,0,38,United-States,0
32557,40,Private,154374,HS-grad,9,Married-civ-spouse,Machine-op-inspct,Husband,White,Male,0,0,40,United-States,1
32558,58,Private,151910,HS-grad,9,Widowed,Adm-clerical,Unmarried,White,Female,0,0,40,United-States,0
32559,22,Private,201490,HS-grad,9,Never-married,Adm-clerical,Own-child,White,Male,0,0,20,United-States,0


In [6]:
# df.drop(['fnlwgt','relationship','education-num'],axis=1, inplace=True)
df.drop(['relationship','education-num'],axis=1, inplace=True)

In [7]:
df = df.drop(df[df.workclass == ' ?'].index)
df = df.drop(df[df.occupation == ' ?'].index)
df = df.drop(df[df['native-country'] == ' ?'].index)

In [8]:
def age_group(x):
    x = int(x)
    x = abs(x)
    if( 18 < x < 31 ):
        return "19-30"
    if( 30 < x < 41 ):
        return "31-40"
    if( 40 < x < 51 ):
        return "41-50"
    if( 50 < x < 61 ):
        return "51-60"
    if( 60 < x < 71 ):
        return "61-70"
    else:
        return "Greater than 70"

df['age'] = df['age'].apply(age_group)

In [9]:
df_no_outliers = df[['fnlwgt','capital-gain','capital-loss','hours-per-week']]
df_no_outliers = df_no_outliers[(np.abs(stats.zscore(df_no_outliers)) < 3).all(axis=1)]
df = df.loc[list(df_no_outliers.index)]
df.reset_index(drop=True, inplace=True)
# df.reset_index()

In [10]:
df

Unnamed: 0,age,workclass,fnlwgt,education,marital-status,occupation,race,sex,capital-gain,capital-loss,hours-per-week,native-country,Over_50k_Salary
0,31-40,State-gov,77516,Bachelors,Never-married,Adm-clerical,White,Male,2174,0,40,United-States,0
1,41-50,Self-emp-not-inc,83311,Bachelors,Married-civ-spouse,Exec-managerial,White,Male,0,0,13,United-States,0
2,31-40,Private,215646,HS-grad,Divorced,Handlers-cleaners,White,Male,0,0,40,United-States,0
3,51-60,Private,234721,11th,Married-civ-spouse,Handlers-cleaners,Black,Male,0,0,40,United-States,0
4,19-30,Private,338409,Bachelors,Married-civ-spouse,Prof-specialty,Black,Female,0,0,40,Cuba,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
27899,19-30,Private,257302,Assoc-acdm,Married-civ-spouse,Tech-support,White,Female,0,0,38,United-States,0
27900,31-40,Private,154374,HS-grad,Married-civ-spouse,Machine-op-inspct,White,Male,0,0,40,United-States,1
27901,51-60,Private,151910,HS-grad,Widowed,Adm-clerical,White,Female,0,0,40,United-States,0
27902,19-30,Private,201490,HS-grad,Never-married,Adm-clerical,White,Male,0,0,20,United-States,0


In [11]:
df_over = df[df['Over_50k_Salary']==1]
df_holdout = df_over.sample(frac = 0.30)
df_holdout

Unnamed: 0,age,workclass,fnlwgt,education,marital-status,occupation,race,sex,capital-gain,capital-loss,hours-per-week,native-country,Over_50k_Salary
25566,31-40,Private,109351,Bachelors,Never-married,Sales,White,Female,8614,0,45,United-States,1
19075,51-60,State-gov,197184,Masters,Married-civ-spouse,Exec-managerial,White,Male,0,0,70,United-States,1
10745,41-50,Private,116797,HS-grad,Married-civ-spouse,Sales,White,Male,7298,0,50,United-States,1
23034,19-30,Private,299908,Some-college,Married-civ-spouse,Exec-managerial,Black,Female,0,0,40,United-States,1
12555,31-40,Private,134886,HS-grad,Married-civ-spouse,Adm-clerical,White,Female,0,0,40,United-States,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...
27347,41-50,Private,379883,Masters,Married-civ-spouse,Prof-specialty,White,Male,0,0,40,Cuba,1
6813,31-40,Private,118486,Bachelors,Separated,Prof-specialty,White,Female,4934,0,32,United-States,1
1535,51-60,Private,163921,Bachelors,Married-civ-spouse,Transport-moving,White,Male,0,0,56,United-States,1
8932,51-60,Federal-gov,237819,Bachelors,Married-civ-spouse,Prof-specialty,White,Male,0,0,50,United-States,1


In [12]:
indices_holdout = df_holdout.index

In [13]:
df_over = df_over.loc[~df_over.index.isin(indices_holdout)]
df_over

Unnamed: 0,age,workclass,fnlwgt,education,marital-status,occupation,race,sex,capital-gain,capital-loss,hours-per-week,native-country,Over_50k_Salary
7,51-60,Self-emp-not-inc,209642,HS-grad,Married-civ-spouse,Exec-managerial,White,Male,0,0,45,United-States,1
9,41-50,Private,159449,Bachelors,Married-civ-spouse,Exec-managerial,White,Male,5178,0,40,United-States,1
10,19-30,State-gov,141297,Bachelors,Married-civ-spouse,Prof-specialty,Asian-Pac-Islander,Male,0,0,40,India,1
22,51-60,Local-gov,216851,Bachelors,Married-civ-spouse,Tech-support,White,Male,0,0,40,United-States,1
36,51-60,Federal-gov,337895,Bachelors,Married-civ-spouse,Prof-specialty,Black,Male,0,0,40,United-States,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...
27878,31-40,Private,204461,Doctorate,Married-civ-spouse,Prof-specialty,White,Male,0,0,60,United-States,1
27879,51-60,Private,337992,Bachelors,Married-civ-spouse,Exec-managerial,Asian-Pac-Islander,Male,0,0,50,Japan,1
27884,31-40,Private,139180,Bachelors,Divorced,Prof-specialty,Black,Female,15020,0,45,United-States,1
27900,31-40,Private,154374,HS-grad,Married-civ-spouse,Machine-op-inspct,White,Male,0,0,40,United-States,1


In [14]:
transformer = DataTransformer()

In [15]:
# df_over = df_over.iloc[:,:11]
df_over = df_over.iloc[:,:-1]
df_holdout = df_holdout.iloc[:,:-1]
df_over

Unnamed: 0,age,workclass,fnlwgt,education,marital-status,occupation,race,sex,capital-gain,capital-loss,hours-per-week,native-country
7,51-60,Self-emp-not-inc,209642,HS-grad,Married-civ-spouse,Exec-managerial,White,Male,0,0,45,United-States
9,41-50,Private,159449,Bachelors,Married-civ-spouse,Exec-managerial,White,Male,5178,0,40,United-States
10,19-30,State-gov,141297,Bachelors,Married-civ-spouse,Prof-specialty,Asian-Pac-Islander,Male,0,0,40,India
22,51-60,Local-gov,216851,Bachelors,Married-civ-spouse,Tech-support,White,Male,0,0,40,United-States
36,51-60,Federal-gov,337895,Bachelors,Married-civ-spouse,Prof-specialty,Black,Male,0,0,40,United-States
...,...,...,...,...,...,...,...,...,...,...,...,...
27878,31-40,Private,204461,Doctorate,Married-civ-spouse,Prof-specialty,White,Male,0,0,60,United-States
27879,51-60,Private,337992,Bachelors,Married-civ-spouse,Exec-managerial,Asian-Pac-Islander,Male,0,0,50,Japan
27884,31-40,Private,139180,Bachelors,Divorced,Prof-specialty,Black,Female,15020,0,45,United-States
27900,31-40,Private,154374,HS-grad,Married-civ-spouse,Machine-op-inspct,White,Male,0,0,40,United-States


In [16]:
discrete_columns = ['age','workclass','education','marital-status',
       'occupation', 'race', 'sex','hours-per-week', 'native-country']

In [17]:
transformer.fit(df.iloc[:,:-1],discrete_columns)

In [18]:
X = transformer.transform(df_over)
# X = transformer.transform(df)


In [19]:
df_inversed = transformer.inverse_transform(X)

In [20]:
df_over.describe()
# ['age','capital-gain','capital-loss','hours-per-week']

Unnamed: 0,fnlwgt,capital-gain,capital-loss,hours-per-week
count,4481.0,4481.0,4481.0,4481.0
mean,182822.850033,2065.135238,0.0,44.929703
std,91161.202069,4437.995184,0.0,9.238468
min,14878.0,0.0,0.0,5.0
25%,119272.0,0.0,0.0,40.0
50%,175689.0,0.0,0.0,40.0
75%,225860.0,0.0,0.0,50.0
max,506329.0,20051.0,0.0,75.0


##### We filter the columns based on figure 8 of https://www.ncbi.nlm.nih.gov/pmc/articles/PMC6836338/pdf/12911_2019_Article_918.pdf

# MAIN

In [21]:
def evaluate_loss(ts, batch_size, dataloader, generator, discriminator):
    with torch.no_grad():
        total_samples = 0
        total_loss = 0
        for real_samples, in dataloader:
            generated_samples = generator(ts, batch_size)
            generated_score = discriminator(generated_samples)
            real_score = discriminator(real_samples)
            loss = generated_score - real_score
            total_samples += batch_size
            total_loss += loss.item() * batch_size
    return total_loss / total_samples

def sample_generator(generator,ts):
    samples = generator(ts, batch_size=1)
    samples = samples[0]
#     samples = torch.abs(samples[-1])
    samples = samples[-1]
    return samples.detach().numpy()

def synthtetic_dataset(generator,ts,size=3):
    samples=[]
    for i in range(size):
        samples.append(sample_generator(generator,ts))
    dades=np.array(samples)
    df = transformer.inverse_transform(dades)
    df = df[df.select_dtypes(include=[np.number]).ge(0).all(1)]
    return df
#     return pd.DataFrame(data=dades, columns=df.columns)
    
def apply_correction(df, columns= ['age','capital-gain','capital-loss','hours-per-week']):
    df_synt = copy.deepcopy(df)
    for col in columns:
        col_mean = np.mean(df_synt[col])
        df_synt[col] = df_synt[col].mask(df_synt[col] < 0, np.round(col_mean))
        # calculate IQR for column A
        Q1 = df_synt[col].quantile(0.25)
        Q3 = df_synt[col].quantile(0.75)
        IQR = Q3 - Q1

        # identify outliers
        threshold = 1.5
        outliers = df_synt[(df_synt[col] < Q1 - threshold * IQR) | (df_synt[col] > Q3 + threshold * IQR)]
        std = df_synt[col].std()
        size = len(outliers.index)
        if size>0:
            df_synt.iloc[outliers.index][col] = np.round(np.random.normal(col_mean,std,size))
    return df_synt

In [22]:
#https://arxiv.org/pdf/2104.00635.pdf
import os.path
import pandas as pd
import numpy as np
from os import listdir
from os.path import isfile, join

def bin_data(dt1, dt2, c = 10):
    dt1 = dt1.copy()
    dt2 = dt2.copy()
    # quantile binning of numerics
    num_cols = dt1.dtypes[dt1.dtypes!='object'].index
    for col in num_cols:
        # determine breaks based on `dt1`
        breaks = dt1[col].quantile(np.linspace(0, 1, c+1)).unique()
        dt1[col] = pd.cut(dt1[col], bins=breaks, include_lowest=True).astype(str)
        dt2_vals = pd.to_numeric(dt2[col], 'coerce')
        dt2_bins = pd.cut(dt2_vals, bins=breaks, include_lowest=True).astype(str)
        dt2_bins[dt2_vals < min(breaks)] = '_other_'
        dt2_bins[dt2_vals > max(breaks)] = '_other_'
        dt2[col] = dt2_bins
    # top-C binning of categoricals
    cat_cols = dt1.dtypes[dt1.dtypes=='object'].index
    for col in cat_cols:
        # determine top values based on `dt1`
        top_vals = dt1[col].value_counts().head(c).index.tolist()
        dt1[col].replace(np.setdiff1d(dt1[col].unique().tolist(), top_vals), '_other_', inplace=True)
        dt2[col].replace(np.setdiff1d(dt2[col].unique().tolist(), top_vals), '_other_', inplace=True)
    return [dt1, dt2]

def hellinger(p1, p2):
    return np.sqrt(1 - np.sum(np.sqrt(p1*p2)))

def kullback_leibler(p1, p2):
    idx = p1>0
    return np.sum(p1[idx] * np.log(p1[idx]/p2[idx]))

def jensen_shannon(p1, p2):
    m = 0.5 * (p1 + p2)
    return 0.5 * kullback_leibler(p1, m) + 0.5 * kullback_leibler(p2, m)

def fidelity(dt1, dt2, c = 100, k = 1):
    [dt1_bin, dt2_bin] = bin_data(dt1, dt2, c = c)
    # build grid of all cross-combinations
    cols = dt1.columns
    interactions = pd.DataFrame(np.array(np.meshgrid(cols, cols, cols)).reshape(3, len(cols)**3).T)
    interactions.columns = ['dim1', 'dim2', 'dim3']
    if k == 1:
        interactions = interactions.loc[(interactions['dim1']==interactions['dim2']) & (interactions['dim2']==interactions['dim3'])]
    elif k == 2:
        interactions = interactions.loc[(interactions['dim1']<interactions['dim2']) & (interactions['dim2']==interactions['dim3'])]
    elif k == 3:
        interactions = interactions.loc[(interactions['dim1']<interactions['dim2']) & (interactions['dim2']<interactions['dim3'])]
    else:
        raise('k>3 not supported')

    results = []
    for idx in range(interactions.shape[0]):
        row = interactions.iloc[idx]
        val1 = dt1_bin[row.dim1] + dt1_bin[row.dim2] + dt1_bin[row.dim3]
        val2 = dt2_bin[row.dim1] + dt2_bin[row.dim2] + dt2_bin[row.dim3]
        freq1 = val1.value_counts(normalize=True).to_frame(name='p1')
        freq2 = val2.value_counts(normalize=True).to_frame(name='p2')
        freq = freq1.join(freq2, how='outer').fillna(0.0)
        p1 = freq['p1']
        p2 = freq['p2']
        out = pd.DataFrame({
          'k': k,
          'dim1': [row.dim1], 'dim2': [row.dim2], 'dim3': [row.dim3],
          'tvd': [np.sum(np.abs(p1 - p2)) / 2], 
          'mae': [np.mean(np.abs(p1 - p2))], 
          'max': [np.max(np.abs(p1 - p2))],
          'l1d': [np.sum(np.abs(p1 - p2))],
          'l2d': [np.sqrt(np.sum((p1 - p2)**2))],
          'hellinger': [hellinger(p1, p2)],
          'jensen_shannon': [jensen_shannon(p1, p2)]})
        results.append(out)

    return pd.concat(results)

In [23]:
def train(X,
df_original,
initial_noise_size,
noise_size,
hidden_size,
mlp_size,
num_layers,
generator_lr, 
discriminator_lr,
batch_size,
steps,
init_mult1,
init_mult2,
weight_decay,
swa_step_start,
steps_per_print,
t_size,
load=False):
    
    is_cuda = torch.cuda.is_available()
    device = 'cuda' if is_cuda else 'cpu'
    if not is_cuda:
        print("Warning: CUDA not available; falling back to CPU but this is likely to be very slow.")

    ts = torch.linspace(0, t_size - 1, t_size, device=device)
    data_size = X.shape[1]
    
    ys_coeffs = torch.from_numpy(X)
    ys_coeffs = ys_coeffs.to(torch.float32)
    ys_coeffs = torch.reshape(ys_coeffs,(X.shape[0],1,X.shape[1]))
    dataset = torch.utils.data.TensorDataset(ys_coeffs)
    train_dataloader = torch.utils.data.DataLoader(dataset, batch_size=batch_size, shuffle=True)
    
    infinite_train_dataloader = (elem for it in iter(lambda: train_dataloader, None) for elem in it)
    generator = Generator(data_size, initial_noise_size, noise_size, hidden_size, mlp_size, num_layers+1).to(device)
    discriminator = Discriminator(data_size, hidden_size, mlp_size, num_layers-1).to(device)
    
    if load:
        generator.load_state_dict(torch.load('initial_weights_gen.pth'))
        discriminator.load_state_dict(torch.load('initial_weights_disc.pth'))
        
    torch.save(generator.state_dict(), 'initial_weights_gen.pth')
    torch.save(discriminator.state_dict(), 'initial_weights_disc.pth')
    
    
    averaged_generator = swa_utils.AveragedModel(generator)
    averaged_discriminator = swa_utils.AveragedModel(discriminator)
    
    print(generator)
    print(discriminator)
    
    generator_optimiser = torch.optim.Adadelta(generator.parameters(), lr=generator_lr, weight_decay=weight_decay)
    discriminator_optimiser = torch.optim.Adadelta(discriminator.parameters(), lr=discriminator_lr,
                                                   weight_decay=weight_decay)
    
    trange = tqdm.tqdm(range(steps))
    for step in trange:
        real_samples, = next(infinite_train_dataloader)
        generated_samples = generator(ts, batch_size)
        generated_score = discriminator(generated_samples)
        real_score = discriminator(real_samples)
        loss = generated_score - real_score
        loss.backward()
        for param in generator.parameters():
            param.grad *= -1
        generator_optimiser.step()
        discriminator_optimiser.step()
        generator_optimiser.zero_grad()
        discriminator_optimiser.zero_grad()

        ###################
        # We constrain the Lipschitz constant of the discriminator using carefully-chosen clipping (and the use of
        # LipSwish activation functions).
        ###################
        with torch.no_grad():
            for module in discriminator.modules():
                if isinstance(module, torch.nn.Linear):
                    lim = 1 / module.out_features
                    module.weight.clamp_(-lim, lim)

        # Stochastic weight averaging typically improves performance.
        if step > swa_step_start:
            averaged_generator.update_parameters(generator)
            averaged_discriminator.update_parameters(discriminator)

        if (step % steps_per_print) == 0 or step == steps - 1:
            total_unaveraged_loss = evaluate_loss(ts, batch_size, train_dataloader, generator, discriminator)
            if step > swa_step_start:
                total_averaged_loss = evaluate_loss(ts, batch_size, train_dataloader, averaged_generator.module,
                                                    averaged_discriminator.module)
                trange.write(f"Step: {step:3} Loss (unaveraged): {total_unaveraged_loss:.4f} "
                             f"Loss (averaged): {total_averaged_loss:.4f}")
            else:
                trange.write(f"Step: {step:3} Loss (unaveraged): {total_unaveraged_loss:.4f}")
        if step % 200 == 0 or step == steps - 1:
            df_synt = synthtetic_dataset(averaged_generator,ts,size=1000)
#             df_synt = apply_correction(df_synt)
            fid = fidelity(df_original, df_synt, k=2, c=10).agg('mean')
            print(fid)
        if fid['tvd'] <= 0.20: break
#             if step >= 3200:
#                 torch.save(averaged_generator,'./Generator/generator_adult_'+str(step)+'.pth')

    generator.load_state_dict(averaged_generator.module.state_dict())
    discriminator.load_state_dict(averaged_discriminator.module.state_dict())
    
    df_synt = synthtetic_dataset(generator, ts,size=1000)
#     df_synt = apply_correction(df_synt)
    print(fidelity(df_original, df_synt, k=2, c=10).agg('mean'))
    
    report = QualityReport()
    columns = {}
    for col in df_synt.columns:
        columns[col] = {'sdtype': 'numerical'}
    metadata = {
        'METADATA_SPEC_VERSION': 'SINGLE_TABLE_V1',
        'columns': columns
    }
    
    report.generate(df_original, df_synt, metadata)
    final_score = report.get_score()
    
    print('Final score: ', final_score)
    return final_score, generator
    

In [24]:
with open('./Params/model2.json') as f:
    params = json.load(f)
initial_noise_size = params['initial_noise_size'] # How many noise dimensions to sample at the start of the SDE.
noise_size = params['noise_size'] # How many dimensions the Brownian motion has.
hidden_size = params['hidden_size'] # How big the hidden size of the generator SDE and the discriminator CDE are.
mlp_size = params['mlp_size'] # How big the layers in the various MLPs are.
num_layers = params['num_layers'] # How many hidden layers to have in the various MLPs.
# Training hyperparameters. Be prepared to tune these very carefully, as with any GAN.
generator_lr = params['generator_lr'] # Learning rate often needs careful tuning to the problem.
# generator_lr = 0.001
discriminator_lr = params['discriminator_lr'] # Learning rate often needs careful tuning to the problem.
# discriminator_lr = 0.001
batch_size = params['batch_size'] # Batch size.
steps = params['steps'] # How many steps to train both generator and discriminator for.
# steps=2500
init_mult1 = params['init_mult1'] # Changing the initial parameter size can help.
# init_mult1 = 1.5
init_mult2 = params['init_mult2']
weight_decay = params['weight_decay'] # Weight decay.
swa_step_start = params['swa_step_start'] # When to start using stochastic weight averaging.
steps_per_print = params['steps_per_print'] # How often to print the loss.
t_size = params['t_size'] 
# t_size = 20
print(params)

print('This is '+str(steps/(X.shape[0]/batch_size))+' epochs')

{'initial_noise_size': 15, 'noise_size': 10, 'hidden_size': 54, 'mlp_size': 72, 'num_layers': 3, 'generator_lr': 0.1, 'discriminator_lr': 0.1, 'batch_size': 96, 'steps': 3000, 'init_mult1': 3, 'init_mult2': 0.5, 'weight_decay': 0.01, 'swa_step_start': 300, 'steps_per_print': 200, 't_size': 6}
This is 64.27136799821469 epochs


### df_original = df_holdout

final_score, generator = train(X,
df_original,
initial_noise_size,
noise_size,
hidden_size,
mlp_size,
num_layers,
generator_lr, 
discriminator_lr,
batch_size,
steps,
init_mult1,
init_mult2,
weight_decay,
swa_step_start,
steps_per_print,
t_size,
load=False)

In [26]:
is_cuda = torch.cuda.is_available()
device = 'cuda' if is_cuda else 'cpu'
ts = torch.linspace(0, t_size - 1, t_size, device=device)
df_synt = synthtetic_dataset(generator, ts,size=1500)

In [27]:
df_original.describe()

Unnamed: 0,fnlwgt,capital-gain,capital-loss,hours-per-week
count,1921.0,1921.0,1921.0,1921.0
mean,180832.187402,1995.538261,0.0,44.867257
std,90298.281888,4319.399339,0.0,9.354899
min,14878.0,0.0,0.0,5.0
25%,117802.0,0.0,0.0,40.0
50%,174242.0,0.0,0.0,40.0
75%,223934.0,0.0,0.0,50.0
max,478346.0,20051.0,0.0,76.0


In [28]:
df_synt.describe()

Unnamed: 0,fnlwgt,capital-gain,capital-loss,hours-per-week
count,1040.0,1040.0,1040.0,1040.0
mean,187826.232692,4174.5,0.0,43.725
std,96229.099194,5395.842772,0.0,7.712055
min,22925.0,0.0,0.0,11.0
25%,106244.0,3.0,0.0,40.0
50%,175368.5,9.0,0.0,40.0
75%,255610.0,7555.0,0.0,50.0
max,438566.0,15715.0,0.0,67.0


In [29]:
print(fidelity(df_original, df_synt, k=2, c=10).agg('mean'))

k                 2.000000
tvd               0.245578
mae               0.025041
max               0.141364
l1d               0.491155
l2d               0.189170
hellinger         0.256443
jensen_shannon    0.063503
dtype: float64


  print(fidelity(df_original, df_synt, k=2, c=10).agg('mean'))


In [30]:
# df_synt = apply_correction(df_synt)
# print(fidelity(df_original, df_synt, k=2, c=10).agg('mean'))

In [31]:
torch.save(generator,'./Generator/generator_adult.pth')

# ML efficacy

In [113]:
np.unique(df['Over_50k_Salary'],return_counts=True)

(array([0, 1], dtype=int64), array([21502,  6402], dtype=int64))

In [25]:
generator = torch.load('./Generator/generator_adult.pth')

In [26]:
is_cuda = torch.cuda.is_available()
device = 'cuda' if is_cuda else 'cpu'
with open('./Params/model2.json') as f:
    params = json.load(f)
t_size = params['t_size']
ts = torch.linspace(0, t_size - 1, t_size, device=device)
df_synt = synthtetic_dataset(generator, ts,size=10000)
df_synt

Unnamed: 0,age,workclass,fnlwgt,education,marital-status,occupation,race,sex,capital-gain,capital-loss,hours-per-week,native-country
0,31-40,Private,182130,Bachelors,Married-civ-spouse,Exec-managerial,White,Male,5,0,50,United-States
1,31-40,Self-emp-not-inc,236144,Bachelors,Married-civ-spouse,Exec-managerial,White,Male,4636,0,50,United-States
3,51-60,Private,178105,Some-college,Married-civ-spouse,Craft-repair,White,Male,7,0,40,United-States
4,31-40,Private,236712,Bachelors,Married-civ-spouse,Exec-managerial,White,Male,7545,0,40,United-States
5,51-60,Private,181510,Some-college,Married-civ-spouse,Craft-repair,White,Male,9,0,40,United-States
...,...,...,...,...,...,...,...,...,...,...,...,...
9991,31-40,Private,223750,9th,Married-civ-spouse,Exec-managerial,White,Male,7563,0,50,United-States
9993,41-50,Local-gov,171276,Bachelors,Never-married,Prof-specialty,White,Male,14774,0,50,United-States
9995,31-40,Private,107369,HS-grad,Married-civ-spouse,Exec-managerial,White,Male,0,0,60,United-States
9997,51-60,Private,112032,Bachelors,Married-civ-spouse,Sales,White,Male,3176,0,50,United-States


In [27]:
df_train, df_test, y_train, y_test = train_test_split(df.iloc[:,:-1], df.iloc[:,-1], test_size=0.3)

In [28]:
xgb_model = xgb.XGBClassifier(objective="binary:logistic")

In [29]:
X_train = transformer.transform(df_train)
X_train

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 1., 0., ..., 0., 0., 0.],
       [0., 1., 0., ..., 0., 0., 0.],
       ...,
       [1., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [1., 0., 0., ..., 0., 0., 0.]])

In [30]:
y_train = np.array(y_train)

In [31]:
xgb_model.fit(X_train, y_train)
# y_pred = xgb_model.predict(X_train)

In [32]:
from sklearn.metrics import roc_auc_score, accuracy_score, f1_score, recall_score

In [33]:
X_test = transformer.transform(df_test)
y_pred_test = xgb_model.predict(X_test)

In [34]:
y_test = np.array(y_test)

In [35]:
accuracy_score(y_test, y_pred_test), f1_score(y_test, y_pred_test), recall_score(y_test, y_pred_test)

(0.8573817486860965, 0.656896551724138, 0.5953125)

## Part amb el synthetic dataset

In [36]:
df_train_filled = pd.concat([df_train,df_synt])
df_train_filled

Unnamed: 0,age,workclass,fnlwgt,education,marital-status,occupation,race,sex,capital-gain,capital-loss,hours-per-week,native-country
4804,19-30,Private,213722,Some-college,Never-married,Sales,White,Male,0,0,50,United-States
5105,41-50,Private,121287,HS-grad,Never-married,Machine-op-inspct,White,Male,0,0,40,United-States
24332,41-50,Private,420986,Bachelors,Married-civ-spouse,Exec-managerial,White,Male,0,0,40,United-States
19981,51-60,Private,306108,HS-grad,Married-civ-spouse,Machine-op-inspct,White,Male,0,0,40,United-States
2,31-40,Private,215646,HS-grad,Divorced,Handlers-cleaners,White,Male,0,0,40,United-States
...,...,...,...,...,...,...,...,...,...,...,...,...
9991,31-40,Private,223750,9th,Married-civ-spouse,Exec-managerial,White,Male,7563,0,50,United-States
9993,41-50,Local-gov,171276,Bachelors,Never-married,Prof-specialty,White,Male,14774,0,50,United-States
9995,31-40,Private,107369,HS-grad,Married-civ-spouse,Exec-managerial,White,Male,0,0,60,United-States
9997,51-60,Private,112032,Bachelors,Married-civ-spouse,Sales,White,Male,3176,0,50,United-States


In [37]:
y_train_filled = np.concatenate([y_train,np.ones(len(df_synt))])

In [38]:
df_train_filled['target'] = y_train_filled

In [39]:
df_train_filled = df_train_filled.sample(frac = 1)

In [40]:
X_train_filled = df_train_filled.iloc[:,:-1]
y_train_filled = np.array(df_train_filled.iloc[:,-1])

In [41]:
X_train_filled = transformer.transform(X_train_filled)

In [42]:
xgb_model_filled = xgb.XGBClassifier(objective="binary:logistic")

In [43]:
xgb_model_filled.fit(X_train_filled, y_train_filled)

In [44]:
y_pred_test_filled = xgb_model_filled.predict(X_test)

In [45]:
accuracy_score(y_test, y_pred_test_filled), f1_score(y_test, y_pred_test_filled), recall_score(y_test, y_pred_test_filled)

(0.8557095078834209, 0.6620033575825406, 0.6161458333333333)

# training only with synthetic data

In [135]:
X_synt = transformer.transform(df_synt)
X_synt

array([[0., 1., 0., ..., 0., 0., 0.],
       [0., 0., 1., ..., 0., 0., 0.],
       [1., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 1., ..., 0., 0., 0.],
       [0., 1., 0., ..., 0., 0., 0.],
       [0., 1., 0., ..., 0., 0., 0.]])