In [1]:
import torch 
import pandas as pd
import torch.nn as nn
import time
import io
import joblib
import argparse

from torchsummary import summary
from tqdm import tqdm
from tabulate import tabulate
from sklearn.preprocessing import *

# from preprocessor import Preprocessor
# from AutoCleanse.utils import *
# from AutoCleanse.dataloader import PlainDataset, DataLoader
# from AutoCleanse.autoencoder import *
# from AutoCleanse.loss_model import loss_CEMSE
# # from AutoCleanse.preprocessor import Preprocessor
# from AutoCleanse.anonymize import anonymize
# from AutoCleanse.bucketfs_client import *

from utils import *
from dataloader import PlainDataset, DataLoader
from autoencoder import *
from masked_autoencoder import *
from loss_model import loss_CEMSE
from preprocessor import Preprocessor
from anonymize import anonymize
from bucketfs_client import *

  from .autonotebook import tqdm as notebook_tqdm


Device configuration

In [2]:
# Use the first CUDA GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
# Seed PyTorch's random number generator so repeated runs start from the same state.
# The trailing semicolon stops the returned Generator object from being echoed as cell output.
torch.manual_seed(42);

<torch._C.Generator at 0x19ce8df70f0>

## Setup directory path

In [3]:
import os

# Anchor every path at the directory the notebook is running in.
# No os.chdir() is needed: PROJECT_DIR already *is* the current working directory.
PROJECT_DIR = os.getcwd()
# Folder holding the input CSV files (adult.csv is read from here).
DATASET_DIR = os.path.join(PROJECT_DIR, 'dataset')
# NOTE(review): presumably the folder for evaluation artefacts; it is not used in the visible cells.
EVAL_DIR = os.path.join(PROJECT_DIR, 'evaluate')

## Preparing data

Load dataframe and group features by their type

In [4]:
# Load the Adult dataset and drop the 'fnlwgt' and 'income' columns, which are not used as features.
df = pd.read_csv(os.path.join(DATASET_DIR, 'adult.csv'))
df = df.drop(columns=['fnlwgt', 'income'])

# Keep the column order as it appears in the CSV (after the drop) before regrouping.
og_columns = df.columns.to_list()

# int64/float64 columns are treated as continuous, object/bool columns as categorical.
continous_columns = df.select_dtypes(include=['int64', 'float64']).columns.tolist()
categorical_columns = df.select_dtypes(include=['object', 'bool']).columns.tolist()

# Reorder the frame: all continuous columns first, then all categorical columns.
df = df[[*continous_columns, *categorical_columns]]

Data preprocessing

In [5]:
# Scaler for the continuous columns and one-hot encoder for the categorical columns.
# handle_unknown='ignore' makes the encoder emit an all-zero group for categories it did not see
# during fit; sparse_output=False returns a dense array.
scaler = MinMaxScaler()
onehotencoder = OneHotEncoder(handle_unknown='ignore',sparse_output=False)
preprocessor = Preprocessor(scaler,
                            onehotencoder,
                            continous_columns,
                            categorical_columns)

# 70 / 15 / 15 train / validation / test split with a fixed seed.
X_train,X_val,X_test = preprocessor.split(df=df,
                                        train_ratio=0.7,
                                        val_ratio=0.15,
                                        test_ratio=0.15,
                                        random_seed=42)
# Corrupted copy of the *raw* test split; this must run before X_test is transformed below.
# NOTE(review): presumably 0.2 is the NaN ratio and 42 the random seed — confirm in utils.replace_with_nan.
X_dirty = replace_with_nan(X_test,0.2,42)

# Fit the preprocessor on the training split only, then apply the fitted transform to val/test.
# Each name is overwritten with its transformed version, so this cell is not safe to re-run on its own.
X_train = preprocessor.fit_transform(input_df=X_train)
X_val = preprocessor.transform(input_df=X_val)        
X_test = preprocessor.transform(input_df=X_test)  

# Categories held by the preprocessor's encoder after fitting; passed to train_model later on.
categories = preprocessor.encoder.categories_


In [6]:
# Mini-batch size shared by every loader; it is also passed to train_model and MAE_pack further down.
batch_size = 64

# Wrap each preprocessed split in the project's PlainDataset.
train_dataset = PlainDataset(X_train)
val_dataset = PlainDataset(X_val)
test_dataset = PlainDataset(X_test)

def custom_collate_fn(batch):
    """Collate dataset samples into one stacked tensor plus a list of their second fields.

    Args:
        batch: sequence of samples; element 0 of each sample is a tensor and
            element 1 is kept as-is (presumably the row index — confirm in PlainDataset).

    Returns:
        A tuple ``(stacked, indices)`` where ``stacked`` is ``torch.stack`` of all
        element-0 tensors and ``indices`` is a plain Python list of the element-1 values.
    """
    rows = []
    row_ids = []
    for sample in batch:
        rows.append(sample[0])
        row_ids.append(sample[1])
    return torch.stack(rows), row_ids

# DataLoader here is the project's dataloader.DataLoader (imported at the top of the notebook).
# Only the training loader shuffles; validation and test keep their row order.
# NOTE(review): drop_last=True on the val/test loaders discards the final partial batch, so up to
# batch_size - 1 rows are never evaluated — confirm this is intended (e.g. the model code may
# require full batches).
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, drop_last=True,collate_fn=custom_collate_fn)
val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False, drop_last=True, collate_fn=custom_collate_fn)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False, drop_last=True, collate_fn=custom_collate_fn)

### Train the MAE model first, then the main AutoEncoder

In [7]:
# Layer widths for the main Autoencoder: the input width (number of preprocessed columns), then 1024 and 128.
layers = [X_train.shape[1],1024,128] 
# Passed to train_model as `wlc`.
# NOTE(review): presumably the weights of the (CE, MSE) loss components — confirm in loss_model.
wlc = (1,1) 

### MAE

In [8]:
# The masked autoencoder (MAE) is trained on the continuous part of the data only, because it is
# used solely to replace NaN values in those columns. The same columns are used for MAE inference.
# The slices below take the first len(continous_columns) columns of each preprocessed frame.
# NOTE(review): this relies on the preprocessor emitting the continuous columns first — confirm.

X_train_mae = X_train.iloc[:, 0:len(continous_columns)]
X_val_mae = X_val.iloc[:, 0:len(continous_columns)]
X_test_mae = X_test.iloc[:, 0:len(continous_columns)]

# Datasets over the continuous-only frames.
train_dataset_mae = PlainDataset(X_train_mae)
val_dataset_mae = PlainDataset(X_val_mae)
test_dataset_mae = PlainDataset(X_test_mae)

# Same loader settings as for the full-width data: shuffle only for training, drop the last partial batch.
train_loader_mae = DataLoader(train_dataset_mae, batch_size=batch_size, shuffle=True, drop_last=True,collate_fn=custom_collate_fn)
val_loader_mae = DataLoader(val_dataset_mae, batch_size=batch_size, shuffle=False, drop_last=True, collate_fn=custom_collate_fn)
test_loader_mae = DataLoader(test_dataset_mae, batch_size=batch_size, shuffle=False, drop_last=True, collate_fn=custom_collate_fn)

In [9]:
# Build the MAE model; this cell only instantiates it, training is started separately.
# Layer widths: one input unit per continuous column, then 64, 64 and 16. These can be tuned.
layers_mae = [len(continous_columns), 64, 64, 16]

# NOTE(review): each dropout entry looks like a (layer index, rate) pair — confirm in masked_autoencoder.
mae_autoencoder = MaskedAutoencoder(
    layers=layers_mae,
    dropout_enc=[(0, 0.0)],
    dropout_dec=[(0, 0.1)],
    batch_norm=True,
    learning_rate=1e-4,
    weight_decay=1e-5,
    l1_strength=1e-5,
    l2_strength=1e-5,
)


In [10]:
# Train the MAE on the continuous-only loaders for at most 100 epochs with patience=10.
# In the saved log the run ends with "Early stopping triggered" after epoch 33.
# categorical_columns is empty here, which matches the CE loss of 0 reported in the log.
# NOTE(review): `categories` still carries the full one-hot category list although no categorical
# columns are given — confirm train_model ignores it in this case.
mae_autoencoder.train_model(
      patience=10,
      num_epochs=100,
      batch_size=batch_size,
      train_loader=train_loader_mae,
      val_loader=val_loader_mae,
      continous_columns=continous_columns, 
      categorical_columns=[], 
      categories=categories,
      device=device,
      wlc=wlc)

Epoch [1/100], Training Progress:   1%|▏         | 5/356 [00:00<00:11, 30.75it/s]

train progress: 
Epoch [1/100], Training Progress:   0%|          | 0/356 [00:00<?, ?it/s]


Epoch [1/100], Training Progress: 100%|██████████| 356/356 [00:03<00:00, 99.09it/s] 
Epoch [1/100], Validation Progress: 100%|██████████| 76/76 [00:00<00:00, 143.32it/s]


Epoch [1/100], Training Loss: 0.09763432
Epoch [1/100], Validation Loss: 0.06671038
Epoch [1/100], Training CE Loss: 0.00000000
Epoch [1/100], Validation CE Loss: 0.00000000
Epoch [1/100], Training MSE Loss: 0.09763432
Epoch [1/100], Validation MSE Loss: 0.06671038
Epoch [1/100], Training Loss Comp: 0.09763432
Epoch [1/100], Validation Loss Comp: 0.06671038
Epoch [1/100]: Learning Rate = [0.0001]



Epoch [2/100], Training Progress:   3%|▎         | 11/356 [00:00<00:03, 103.97it/s]

train progress: 
Epoch [2/100], Training Progress:   0%|          | 0/356 [00:00<?, ?it/s]


Epoch [2/100], Training Progress: 100%|██████████| 356/356 [00:03<00:00, 103.98it/s]
Epoch [2/100], Validation Progress: 100%|██████████| 76/76 [00:00<00:00, 144.44it/s]


Epoch [2/100], Training Loss: 0.05075332
Epoch [2/100], Validation Loss: 0.03758893
Epoch [2/100], Training CE Loss: 0.00000000
Epoch [2/100], Validation CE Loss: 0.00000000
Epoch [2/100], Training MSE Loss: 0.05075332
Epoch [2/100], Validation MSE Loss: 0.03758893
Epoch [2/100], Training Loss Comp: 0.05075332
Epoch [2/100], Validation Loss Comp: 0.03758893
Epoch [2/100]: Learning Rate = [0.0001]



Epoch [3/100], Training Progress:   3%|▎         | 11/356 [00:00<00:03, 101.57it/s]

train progress: 
Epoch [3/100], Training Progress:   0%|          | 0/356 [00:00<?, ?it/s]


Epoch [3/100], Training Progress: 100%|██████████| 356/356 [00:03<00:00, 104.65it/s]
Epoch [3/100], Validation Progress: 100%|██████████| 76/76 [00:00<00:00, 144.54it/s]


Epoch [3/100], Training Loss: 0.02837885
Epoch [3/100], Validation Loss: 0.02092179
Epoch [3/100], Training CE Loss: 0.00000000
Epoch [3/100], Validation CE Loss: 0.00000000
Epoch [3/100], Training MSE Loss: 0.02837885
Epoch [3/100], Validation MSE Loss: 0.02092179
Epoch [3/100], Training Loss Comp: 0.02837885
Epoch [3/100], Validation Loss Comp: 0.02092179
Epoch [3/100]: Learning Rate = [0.0001]



Epoch [4/100], Training Progress:   3%|▎         | 10/356 [00:00<00:03, 96.38it/s]

train progress: 
Epoch [4/100], Training Progress:   0%|          | 0/356 [00:00<?, ?it/s]


Epoch [4/100], Training Progress: 100%|██████████| 356/356 [00:03<00:00, 102.92it/s]
Epoch [4/100], Validation Progress: 100%|██████████| 76/76 [00:00<00:00, 129.84it/s]


Epoch [4/100], Training Loss: 0.01754266
Epoch [4/100], Validation Loss: 0.01494005
Epoch [4/100], Training CE Loss: 0.00000000
Epoch [4/100], Validation CE Loss: 0.00000000
Epoch [4/100], Training MSE Loss: 0.01754266
Epoch [4/100], Validation MSE Loss: 0.01494005
Epoch [4/100], Training Loss Comp: 0.01754266
Epoch [4/100], Validation Loss Comp: 0.01494005
Epoch [4/100]: Learning Rate = [0.0001]



Epoch [5/100], Training Progress:   3%|▎         | 11/356 [00:00<00:03, 102.08it/s]

train progress: 
Epoch [5/100], Training Progress:   0%|          | 0/356 [00:00<?, ?it/s]


Epoch [5/100], Training Progress: 100%|██████████| 356/356 [00:03<00:00, 102.79it/s]
Epoch [5/100], Validation Progress: 100%|██████████| 76/76 [00:00<00:00, 145.15it/s]


Epoch [5/100], Training Loss: 0.01316105
Epoch [5/100], Validation Loss: 0.01202157
Epoch [5/100], Training CE Loss: 0.00000000
Epoch [5/100], Validation CE Loss: 0.00000000
Epoch [5/100], Training MSE Loss: 0.01316105
Epoch [5/100], Validation MSE Loss: 0.01202157
Epoch [5/100], Training Loss Comp: 0.01316105
Epoch [5/100], Validation Loss Comp: 0.01202157
Epoch [5/100]: Learning Rate = [0.0001]



Epoch [6/100], Training Progress:   3%|▎         | 10/356 [00:00<00:03, 93.65it/s]

train progress: 
Epoch [6/100], Training Progress:   0%|          | 0/356 [00:00<?, ?it/s]


Epoch [6/100], Training Progress: 100%|██████████| 356/356 [00:03<00:00, 101.82it/s]
Epoch [6/100], Validation Progress: 100%|██████████| 76/76 [00:00<00:00, 141.78it/s]


Epoch [6/100], Training Loss: 0.01120654
Epoch [6/100], Validation Loss: 0.01093972
Epoch [6/100], Training CE Loss: 0.00000000
Epoch [6/100], Validation CE Loss: 0.00000000
Epoch [6/100], Training MSE Loss: 0.01120654
Epoch [6/100], Validation MSE Loss: 0.01093972
Epoch [6/100], Training Loss Comp: 0.01120654
Epoch [6/100], Validation Loss Comp: 0.01093972
Epoch [6/100]: Learning Rate = [0.0001]



Epoch [7/100], Training Progress:   5%|▌         | 18/356 [00:00<00:03, 86.66it/s]

train progress: 
Epoch [7/100], Training Progress:   0%|          | 0/356 [00:00<?, ?it/s]


Epoch [7/100], Training Progress: 100%|██████████| 356/356 [00:03<00:00, 98.73it/s] 
Epoch [7/100], Validation Progress: 100%|██████████| 76/76 [00:00<00:00, 142.64it/s]


Epoch [7/100], Training Loss: 0.01012432
Epoch [7/100], Validation Loss: 0.01017111
Epoch [7/100], Training CE Loss: 0.00000000
Epoch [7/100], Validation CE Loss: 0.00000000
Epoch [7/100], Training MSE Loss: 0.01012432
Epoch [7/100], Validation MSE Loss: 0.01017111
Epoch [7/100], Training Loss Comp: 0.01012432
Epoch [7/100], Validation Loss Comp: 0.01017111
Epoch [7/100]: Learning Rate = [0.0001]



Epoch [8/100], Training Progress:   3%|▎         | 9/356 [00:00<00:03, 87.60it/s]

train progress: 
Epoch [8/100], Training Progress:   0%|          | 0/356 [00:00<?, ?it/s]


Epoch [8/100], Training Progress: 100%|██████████| 356/356 [00:03<00:00, 102.06it/s]
Epoch [8/100], Validation Progress: 100%|██████████| 76/76 [00:00<00:00, 137.23it/s]


Epoch [8/100], Training Loss: 0.00942752
Epoch [8/100], Validation Loss: 0.00923648
Epoch [8/100], Training CE Loss: 0.00000000
Epoch [8/100], Validation CE Loss: 0.00000000
Epoch [8/100], Training MSE Loss: 0.00942752
Epoch [8/100], Validation MSE Loss: 0.00923648
Epoch [8/100], Training Loss Comp: 0.00942752
Epoch [8/100], Validation Loss Comp: 0.00923648
Epoch [8/100]: Learning Rate = [0.0001]



Epoch [9/100], Training Progress:   3%|▎         | 9/356 [00:00<00:04, 83.15it/s]

train progress: 
Epoch [9/100], Training Progress:   0%|          | 0/356 [00:00<?, ?it/s]


Epoch [9/100], Training Progress: 100%|██████████| 356/356 [00:03<00:00, 103.74it/s]
Epoch [9/100], Validation Progress: 100%|██████████| 76/76 [00:00<00:00, 145.89it/s]


Epoch [9/100], Training Loss: 0.00894443
Epoch [9/100], Validation Loss: 0.00875726
Epoch [9/100], Training CE Loss: 0.00000000
Epoch [9/100], Validation CE Loss: 0.00000000
Epoch [9/100], Training MSE Loss: 0.00894443
Epoch [9/100], Validation MSE Loss: 0.00875726
Epoch [9/100], Training Loss Comp: 0.00894443
Epoch [9/100], Validation Loss Comp: 0.00875726
Epoch [9/100]: Learning Rate = [0.0001]



Epoch [10/100], Training Progress:   6%|▌         | 21/356 [00:00<00:03, 102.16it/s]

train progress: 
Epoch [10/100], Training Progress:   0%|          | 0/356 [00:00<?, ?it/s]


Epoch [10/100], Training Progress: 100%|██████████| 356/356 [00:03<00:00, 100.97it/s]
Epoch [10/100], Validation Progress: 100%|██████████| 76/76 [00:00<00:00, 145.97it/s]


Epoch [10/100], Training Loss: 0.00861083
Epoch [10/100], Validation Loss: 0.00839778
Epoch [10/100], Training CE Loss: 0.00000000
Epoch [10/100], Validation CE Loss: 0.00000000
Epoch [10/100], Training MSE Loss: 0.00861083
Epoch [10/100], Validation MSE Loss: 0.00839778
Epoch [10/100], Training Loss Comp: 0.00861083
Epoch [10/100], Validation Loss Comp: 0.00839778
Epoch [10/100]: Learning Rate = [0.0001]



Epoch [11/100], Training Progress:   6%|▌         | 21/356 [00:00<00:03, 101.21it/s]

train progress: 
Epoch [11/100], Training Progress:   0%|          | 0/356 [00:00<?, ?it/s]


Epoch [11/100], Training Progress: 100%|██████████| 356/356 [00:03<00:00, 104.44it/s]
Epoch [11/100], Validation Progress: 100%|██████████| 76/76 [00:00<00:00, 145.35it/s]


Epoch [11/100], Training Loss: 0.00787524
Epoch [11/100], Validation Loss: 0.00801688
Epoch [11/100], Training CE Loss: 0.00000000
Epoch [11/100], Validation CE Loss: 0.00000000
Epoch [11/100], Training MSE Loss: 0.00787524
Epoch [11/100], Validation MSE Loss: 0.00801688
Epoch [11/100], Training Loss Comp: 0.00787524
Epoch [11/100], Validation Loss Comp: 0.00801688
Epoch [11/100]: Learning Rate = [0.0001]



Epoch [12/100], Training Progress:   3%|▎         | 10/356 [00:00<00:03, 99.24it/s]

train progress: 
Epoch [12/100], Training Progress:   0%|          | 0/356 [00:00<?, ?it/s]


Epoch [12/100], Training Progress: 100%|██████████| 356/356 [00:03<00:00, 104.67it/s]
Epoch [12/100], Validation Progress: 100%|██████████| 76/76 [00:00<00:00, 146.65it/s]


Epoch [12/100], Training Loss: 0.00763948
Epoch [12/100], Validation Loss: 0.00760670
Epoch [12/100], Training CE Loss: 0.00000000
Epoch [12/100], Validation CE Loss: 0.00000000
Epoch [12/100], Training MSE Loss: 0.00763948
Epoch [12/100], Validation MSE Loss: 0.00760670
Epoch [12/100], Training Loss Comp: 0.00763948
Epoch [12/100], Validation Loss Comp: 0.00760670
Epoch [12/100]: Learning Rate = [0.0001]



Epoch [13/100], Training Progress:   3%|▎         | 10/356 [00:00<00:03, 91.48it/s]

train progress: 
Epoch [13/100], Training Progress:   0%|          | 0/356 [00:00<?, ?it/s]


Epoch [13/100], Training Progress: 100%|██████████| 356/356 [00:03<00:00, 101.54it/s]
Epoch [13/100], Validation Progress: 100%|██████████| 76/76 [00:00<00:00, 146.80it/s]


Epoch [13/100], Training Loss: 0.00740729
Epoch [13/100], Validation Loss: 0.00730327
Epoch [13/100], Training CE Loss: 0.00000000
Epoch [13/100], Validation CE Loss: 0.00000000
Epoch [13/100], Training MSE Loss: 0.00740729
Epoch [13/100], Validation MSE Loss: 0.00730327
Epoch [13/100], Training Loss Comp: 0.00740729
Epoch [13/100], Validation Loss Comp: 0.00730327
Epoch [13/100]: Learning Rate = [0.0001]



Epoch [14/100], Training Progress:   2%|▏         | 7/356 [00:00<00:05, 69.76it/s]

train progress: 
Epoch [14/100], Training Progress:   0%|          | 0/356 [00:00<?, ?it/s]


Epoch [14/100], Training Progress: 100%|██████████| 356/356 [00:03<00:00, 93.10it/s] 
Epoch [14/100], Validation Progress: 100%|██████████| 76/76 [00:00<00:00, 135.22it/s]


Epoch [14/100], Training Loss: 0.00714667
Epoch [14/100], Validation Loss: 0.00712396
Epoch [14/100], Training CE Loss: 0.00000000
Epoch [14/100], Validation CE Loss: 0.00000000
Epoch [14/100], Training MSE Loss: 0.00714667
Epoch [14/100], Validation MSE Loss: 0.00712396
Epoch [14/100], Training Loss Comp: 0.00714667
Epoch [14/100], Validation Loss Comp: 0.00712396
Epoch [14/100]: Learning Rate = [0.0001]



Epoch [15/100], Training Progress:   3%|▎         | 10/356 [00:00<00:03, 93.66it/s]

train progress: 
Epoch [15/100], Training Progress:   0%|          | 0/356 [00:00<?, ?it/s]


Epoch [15/100], Training Progress: 100%|██████████| 356/356 [00:03<00:00, 103.80it/s]
Epoch [15/100], Validation Progress: 100%|██████████| 76/76 [00:00<00:00, 147.62it/s]


Epoch [15/100], Training Loss: 0.00687499
Epoch [15/100], Validation Loss: 0.00679410
Epoch [15/100], Training CE Loss: 0.00000000
Epoch [15/100], Validation CE Loss: 0.00000000
Epoch [15/100], Training MSE Loss: 0.00687499
Epoch [15/100], Validation MSE Loss: 0.00679410
Epoch [15/100], Training Loss Comp: 0.00687499
Epoch [15/100], Validation Loss Comp: 0.00679410
Epoch [15/100]: Learning Rate = [0.0001]



Epoch [16/100], Training Progress:   3%|▎         | 11/356 [00:00<00:03, 102.56it/s]

train progress: 
Epoch [16/100], Training Progress:   0%|          | 0/356 [00:00<?, ?it/s]


Epoch [16/100], Training Progress: 100%|██████████| 356/356 [00:03<00:00, 101.39it/s]
Epoch [16/100], Validation Progress: 100%|██████████| 76/76 [00:00<00:00, 129.65it/s]


Epoch [16/100], Training Loss: 0.00656602
Epoch [16/100], Validation Loss: 0.00671933
Epoch [16/100], Training CE Loss: 0.00000000
Epoch [16/100], Validation CE Loss: 0.00000000
Epoch [16/100], Training MSE Loss: 0.00656602
Epoch [16/100], Validation MSE Loss: 0.00671933
Epoch [16/100], Training Loss Comp: 0.00656602
Epoch [16/100], Validation Loss Comp: 0.00671933
Epoch [16/100]: Learning Rate = [0.0001]



Epoch [17/100], Training Progress:   3%|▎         | 11/356 [00:00<00:03, 100.67it/s]

train progress: 
Epoch [17/100], Training Progress:   0%|          | 0/356 [00:00<?, ?it/s]


Epoch [17/100], Training Progress: 100%|██████████| 356/356 [00:03<00:00, 105.18it/s]
Epoch [17/100], Validation Progress: 100%|██████████| 76/76 [00:00<00:00, 147.64it/s]


Epoch [17/100], Training Loss: 0.00642801
Epoch [17/100], Validation Loss: 0.00658914
Epoch [17/100], Training CE Loss: 0.00000000
Epoch [17/100], Validation CE Loss: 0.00000000
Epoch [17/100], Training MSE Loss: 0.00642801
Epoch [17/100], Validation MSE Loss: 0.00658914
Epoch [17/100], Training Loss Comp: 0.00642801
Epoch [17/100], Validation Loss Comp: 0.00658914
Epoch [17/100]: Learning Rate = [0.0001]



Epoch [18/100], Training Progress:   3%|▎         | 10/356 [00:00<00:03, 98.56it/s]

train progress: 
Epoch [18/100], Training Progress:   0%|          | 0/356 [00:00<?, ?it/s]


Epoch [18/100], Training Progress: 100%|██████████| 356/356 [00:03<00:00, 103.74it/s]
Epoch [18/100], Validation Progress: 100%|██████████| 76/76 [00:00<00:00, 146.67it/s]


Epoch [18/100], Training Loss: 0.00626455
Epoch [18/100], Validation Loss: 0.00656149
Epoch [18/100], Training CE Loss: 0.00000000
Epoch [18/100], Validation CE Loss: 0.00000000
Epoch [18/100], Training MSE Loss: 0.00626455
Epoch [18/100], Validation MSE Loss: 0.00656149
Epoch [18/100], Training Loss Comp: 0.00626455
Epoch [18/100], Validation Loss Comp: 0.00656149
Epoch [18/100]: Learning Rate = [0.0001]



Epoch [19/100], Training Progress:   3%|▎         | 10/356 [00:00<00:03, 90.23it/s]

train progress: 
Epoch [19/100], Training Progress:   0%|          | 0/356 [00:00<?, ?it/s]


Epoch [19/100], Training Progress: 100%|██████████| 356/356 [00:03<00:00, 101.08it/s]
Epoch [19/100], Validation Progress: 100%|██████████| 76/76 [00:00<00:00, 148.27it/s]


Epoch [19/100], Training Loss: 0.00619264
Epoch [19/100], Validation Loss: 0.00603334
Epoch [19/100], Training CE Loss: 0.00000000
Epoch [19/100], Validation CE Loss: 0.00000000
Epoch [19/100], Training MSE Loss: 0.00619264
Epoch [19/100], Validation MSE Loss: 0.00603334
Epoch [19/100], Training Loss Comp: 0.00619264
Epoch [19/100], Validation Loss Comp: 0.00603334
Epoch [19/100]: Learning Rate = [0.0001]



Epoch [20/100], Training Progress:   5%|▌         | 18/356 [00:00<00:03, 91.45it/s]

train progress: 
Epoch [20/100], Training Progress:   0%|          | 0/356 [00:00<?, ?it/s]


Epoch [20/100], Training Progress: 100%|██████████| 356/356 [00:03<00:00, 104.05it/s]
Epoch [20/100], Validation Progress: 100%|██████████| 76/76 [00:00<00:00, 148.38it/s]


Epoch [20/100], Training Loss: 0.00601095
Epoch [20/100], Validation Loss: 0.00610895
Epoch [20/100], Training CE Loss: 0.00000000
Epoch [20/100], Validation CE Loss: 0.00000000
Epoch [20/100], Training MSE Loss: 0.00601095
Epoch [20/100], Validation MSE Loss: 0.00610895
Epoch [20/100], Training Loss Comp: 0.00601095
Epoch [20/100], Validation Loss Comp: 0.00610895
Epoch [20/100]: Learning Rate = [0.0001]



Epoch [21/100], Training Progress:   3%|▎         | 11/356 [00:00<00:03, 101.07it/s]

train progress: 
Epoch [21/100], Training Progress:   0%|          | 0/356 [00:00<?, ?it/s]


Epoch [21/100], Training Progress: 100%|██████████| 356/356 [00:03<00:00, 104.63it/s]
Epoch [21/100], Validation Progress: 100%|██████████| 76/76 [00:00<00:00, 142.75it/s]


Epoch [21/100], Training Loss: 0.00588325
Epoch [21/100], Validation Loss: 0.00623595
Epoch [21/100], Training CE Loss: 0.00000000
Epoch [21/100], Validation CE Loss: 0.00000000
Epoch [21/100], Training MSE Loss: 0.00588325
Epoch [21/100], Validation MSE Loss: 0.00623595
Epoch [21/100], Training Loss Comp: 0.00588325
Epoch [21/100], Validation Loss Comp: 0.00623595
Epoch [21/100]: Learning Rate = [0.0001]



Epoch [22/100], Training Progress:   3%|▎         | 11/356 [00:00<00:03, 101.54it/s]

train progress: 
Epoch [22/100], Training Progress:   0%|          | 0/356 [00:00<?, ?it/s]


Epoch [22/100], Training Progress: 100%|██████████| 356/356 [00:03<00:00, 101.43it/s]
Epoch [22/100], Validation Progress: 100%|██████████| 76/76 [00:00<00:00, 147.09it/s]


Epoch [22/100], Training Loss: 0.00580468
Epoch [22/100], Validation Loss: 0.00598990
Epoch [22/100], Training CE Loss: 0.00000000
Epoch [22/100], Validation CE Loss: 0.00000000
Epoch [22/100], Training MSE Loss: 0.00580468
Epoch [22/100], Validation MSE Loss: 0.00598990
Epoch [22/100], Training Loss Comp: 0.00580468
Epoch [22/100], Validation Loss Comp: 0.00598990
Epoch [22/100]: Learning Rate = [0.0001]



Epoch [23/100], Training Progress:   6%|▌         | 22/356 [00:00<00:03, 105.96it/s]

train progress: 
Epoch [23/100], Training Progress:   0%|          | 0/356 [00:00<?, ?it/s]


Epoch [23/100], Training Progress: 100%|██████████| 356/356 [00:03<00:00, 104.14it/s]
Epoch [23/100], Validation Progress: 100%|██████████| 76/76 [00:00<00:00, 145.98it/s]


Epoch [23/100], Training Loss: 0.00584594
Epoch [23/100], Validation Loss: 0.00575757
Epoch [23/100], Training CE Loss: 0.00000000
Epoch [23/100], Validation CE Loss: 0.00000000
Epoch [23/100], Training MSE Loss: 0.00584594
Epoch [23/100], Validation MSE Loss: 0.00575757
Epoch [23/100], Training Loss Comp: 0.00584594
Epoch [23/100], Validation Loss Comp: 0.00575757
Epoch [23/100]: Learning Rate = [0.0001]



Epoch [24/100], Training Progress:   3%|▎         | 10/356 [00:00<00:03, 98.25it/s]

train progress: 
Epoch [24/100], Training Progress:   0%|          | 0/356 [00:00<?, ?it/s]


Epoch [24/100], Training Progress: 100%|██████████| 356/356 [00:03<00:00, 105.46it/s]
Epoch [24/100], Validation Progress: 100%|██████████| 76/76 [00:00<00:00, 147.98it/s]


Epoch [24/100], Training Loss: 0.00565624
Epoch [24/100], Validation Loss: 0.00577244
Epoch [24/100], Training CE Loss: 0.00000000
Epoch [24/100], Validation CE Loss: 0.00000000
Epoch [24/100], Training MSE Loss: 0.00565624
Epoch [24/100], Validation MSE Loss: 0.00577244
Epoch [24/100], Training Loss Comp: 0.00565624
Epoch [24/100], Validation Loss Comp: 0.00577244
Epoch [24/100]: Learning Rate = [0.0001]



Epoch [25/100], Training Progress:   6%|▌         | 22/356 [00:00<00:03, 105.22it/s]

train progress: 
Epoch [25/100], Training Progress:   0%|          | 0/356 [00:00<?, ?it/s]


Epoch [25/100], Training Progress: 100%|██████████| 356/356 [00:03<00:00, 101.92it/s]
Epoch [25/100], Validation Progress: 100%|██████████| 76/76 [00:00<00:00, 148.33it/s]


Epoch [25/100], Training Loss: 0.00556900
Epoch [25/100], Validation Loss: 0.00553581
Epoch [25/100], Training CE Loss: 0.00000000
Epoch [25/100], Validation CE Loss: 0.00000000
Epoch [25/100], Training MSE Loss: 0.00556900
Epoch [25/100], Validation MSE Loss: 0.00553581
Epoch [25/100], Training Loss Comp: 0.00556900
Epoch [25/100], Validation Loss Comp: 0.00553581
Epoch [25/100]: Learning Rate = [1e-05]



Epoch [26/100], Training Progress:   6%|▌         | 21/356 [00:00<00:03, 103.01it/s]

train progress: 
Epoch [26/100], Training Progress:   0%|          | 0/356 [00:00<?, ?it/s]


Epoch [26/100], Training Progress: 100%|██████████| 356/356 [00:03<00:00, 104.83it/s]
Epoch [26/100], Validation Progress: 100%|██████████| 76/76 [00:00<00:00, 149.44it/s]


Epoch [26/100], Training Loss: 0.00544678
Epoch [26/100], Validation Loss: 0.00556527
Epoch [26/100], Training CE Loss: 0.00000000
Epoch [26/100], Validation CE Loss: 0.00000000
Epoch [26/100], Training MSE Loss: 0.00544678
Epoch [26/100], Validation MSE Loss: 0.00556527
Epoch [26/100], Training Loss Comp: 0.00544678
Epoch [26/100], Validation Loss Comp: 0.00556527
Epoch [26/100]: Learning Rate = [1e-05]



Epoch [27/100], Training Progress:   6%|▌         | 20/356 [00:00<00:03, 99.43it/s]

train progress: 
Epoch [27/100], Training Progress:   0%|          | 0/356 [00:00<?, ?it/s]


Epoch [27/100], Training Progress: 100%|██████████| 356/356 [00:03<00:00, 105.05it/s]
Epoch [27/100], Validation Progress: 100%|██████████| 76/76 [00:00<00:00, 146.81it/s]


Epoch [27/100], Training Loss: 0.00550940
Epoch [27/100], Validation Loss: 0.00551050
Epoch [27/100], Training CE Loss: 0.00000000
Epoch [27/100], Validation CE Loss: 0.00000000
Epoch [27/100], Training MSE Loss: 0.00550940
Epoch [27/100], Validation MSE Loss: 0.00551050
Epoch [27/100], Training Loss Comp: 0.00550940
Epoch [27/100], Validation Loss Comp: 0.00551050
Epoch [27/100]: Learning Rate = [1e-05]



Epoch [28/100], Training Progress:   1%|          | 3/356 [00:00<00:13, 26.11it/s]

train progress: 
Epoch [28/100], Training Progress:   0%|          | 0/356 [00:00<?, ?it/s]


Epoch [28/100], Training Progress: 100%|██████████| 356/356 [00:03<00:00, 99.71it/s] 
Epoch [28/100], Validation Progress: 100%|██████████| 76/76 [00:00<00:00, 143.53it/s]


Epoch [28/100], Training Loss: 0.00547249
Epoch [28/100], Validation Loss: 0.00553967
Epoch [28/100], Training CE Loss: 0.00000000
Epoch [28/100], Validation CE Loss: 0.00000000
Epoch [28/100], Training MSE Loss: 0.00547249
Epoch [28/100], Validation MSE Loss: 0.00553967
Epoch [28/100], Training Loss Comp: 0.00547249
Epoch [28/100], Validation Loss Comp: 0.00553967
Epoch [28/100]: Learning Rate = [1e-05]



Epoch [29/100], Training Progress:   3%|▎         | 10/356 [00:00<00:03, 96.37it/s]

train progress: 
Epoch [29/100], Training Progress:   0%|          | 0/356 [00:00<?, ?it/s]


Epoch [29/100], Training Progress: 100%|██████████| 356/356 [00:03<00:00, 105.00it/s]
Epoch [29/100], Validation Progress: 100%|██████████| 76/76 [00:00<00:00, 147.81it/s]


Epoch [29/100], Training Loss: 0.00544503
Epoch [29/100], Validation Loss: 0.00560146
Epoch [29/100], Training CE Loss: 0.00000000
Epoch [29/100], Validation CE Loss: 0.00000000
Epoch [29/100], Training MSE Loss: 0.00544503
Epoch [29/100], Validation MSE Loss: 0.00560146
Epoch [29/100], Training Loss Comp: 0.00544503
Epoch [29/100], Validation Loss Comp: 0.00560146
Epoch [29/100]: Learning Rate = [1e-05]



Epoch [30/100], Training Progress:   2%|▏         | 7/356 [00:00<00:05, 66.18it/s]

train progress: 
Epoch [30/100], Training Progress:   0%|          | 0/356 [00:00<?, ?it/s]


Epoch [30/100], Training Progress: 100%|██████████| 356/356 [00:03<00:00, 103.65it/s]
Epoch [30/100], Validation Progress: 100%|██████████| 76/76 [00:00<00:00, 124.43it/s]


Epoch [30/100], Training Loss: 0.00539916
Epoch [30/100], Validation Loss: 0.00560338
Epoch [30/100], Training CE Loss: 0.00000000
Epoch [30/100], Validation CE Loss: 0.00000000
Epoch [30/100], Training MSE Loss: 0.00539916
Epoch [30/100], Validation MSE Loss: 0.00560338
Epoch [30/100], Training Loss Comp: 0.00539916
Epoch [30/100], Validation Loss Comp: 0.00560338
Epoch [30/100]: Learning Rate = [1e-05]



Epoch [31/100], Training Progress:   3%|▎         | 9/356 [00:00<00:03, 89.30it/s]

train progress: 
Epoch [31/100], Training Progress:   0%|          | 0/356 [00:00<?, ?it/s]


Epoch [31/100], Training Progress: 100%|██████████| 356/356 [00:03<00:00, 104.66it/s]
Epoch [31/100], Validation Progress: 100%|██████████| 76/76 [00:00<00:00, 146.55it/s]


Epoch [31/100], Training Loss: 0.00533567
Epoch [31/100], Validation Loss: 0.00551923
Epoch [31/100], Training CE Loss: 0.00000000
Epoch [31/100], Validation CE Loss: 0.00000000
Epoch [31/100], Training MSE Loss: 0.00533567
Epoch [31/100], Validation MSE Loss: 0.00551923
Epoch [31/100], Training Loss Comp: 0.00533567
Epoch [31/100], Validation Loss Comp: 0.00551923
Epoch [31/100]: Learning Rate = [1e-05]



Epoch [32/100], Training Progress:   3%|▎         | 10/356 [00:00<00:03, 98.19it/s]

train progress: 
Epoch [32/100], Training Progress:   0%|          | 0/356 [00:00<?, ?it/s]


Epoch [32/100], Training Progress: 100%|██████████| 356/356 [00:03<00:00, 102.89it/s]
Epoch [32/100], Validation Progress: 100%|██████████| 76/76 [00:00<00:00, 146.36it/s]


Epoch [32/100], Training Loss: 0.00530529
Epoch [32/100], Validation Loss: 0.00556034
Epoch [32/100], Training CE Loss: 0.00000000
Epoch [32/100], Validation CE Loss: 0.00000000
Epoch [32/100], Training MSE Loss: 0.00530529
Epoch [32/100], Validation MSE Loss: 0.00556034
Epoch [32/100], Training Loss Comp: 0.00530529
Epoch [32/100], Validation Loss Comp: 0.00556034
Epoch [32/100]: Learning Rate = [1e-05]



Epoch [33/100], Training Progress:   6%|▌         | 21/356 [00:00<00:03, 103.28it/s]

train progress: 
Epoch [33/100], Training Progress:   0%|          | 0/356 [00:00<?, ?it/s]


Epoch [33/100], Training Progress: 100%|██████████| 356/356 [00:03<00:00, 103.83it/s]
Epoch [33/100], Validation Progress: 100%|██████████| 76/76 [00:00<00:00, 125.79it/s]

Epoch [33/100], Training Loss: 0.00535268
Epoch [33/100], Validation Loss: 0.00570360
Epoch [33/100], Training CE Loss: 0.00000000
Epoch [33/100], Validation CE Loss: 0.00000000
Epoch [33/100], Training MSE Loss: 0.00535268
Epoch [33/100], Validation MSE Loss: 0.00570360
Epoch [33/100], Training Loss Comp: 0.00535268
Epoch [33/100], Validation Loss Comp: 0.00570360
Epoch [33/100]: Learning Rate = [1e-05]

Early stopping triggered. Stopping training.





In [11]:
# Optional: persist the trained MAE. Uncomment to run.
# NOTE(review): the arguments here read as (location, name), whereas preprocessor.save("main","local")
# further down is (name, location) — confirm both match their method signatures.
# mae_autoencoder.save("local","MAE")

In [12]:
# Optional: restore a previously saved MAE instead of retraining. Uncomment to run.
# mae_autoencoder.load("local","MAE")

In [13]:
# Transform the corrupted test split and fill in its NaN values.
# `method` selects how NaNs are filled: "mean", "median" or "distribution_based"; this call uses
# method="MAE" and hands the trained masked autoencoder over through `MAE_pack`
# ([model, batch_size, device, collate_fn, loader]).
# NOTE(review): it is unclear from this notebook whether `method` may instead be left unset when
# MAE_pack is supplied — confirm against Preprocessor.transform.
# NOTE(review): test_loader_mae is built from the clean test split; presumably it serves as the
# reference for the MAE / MSE figures printed by this cell — confirm.
# X_dirty is overwritten with its transformed version, so re-running this cell alone would feed
# already-transformed data back into transform().
X_dirty = preprocessor.transform(input_df=X_dirty,  method="MAE", MAE_pack=[mae_autoencoder, batch_size, device, custom_collate_fn, test_loader_mae]) 

Clean progress:   0%|          | 0/76 [00:00<?, ?it/s]

Clean progress: 100%|██████████| 76/76 [00:00<00:00, 81.66it/s]


MAE: 0.04055763

MSE: 0.00629703





In [18]:
# Inspect the transformed, NaN-filled frame.
# NOTE(review): the saved output contains negative values in MinMax-scaled columns (e.g. age,
# capital.loss), so the filled-in values are not confined to [0, 1] — confirm this is acceptable downstream.
X_dirty

Unnamed: 0,age,education.num,capital.gain,capital.loss,hours.per.week,workclass_?,workclass_Federal-gov,workclass_Local-gov,workclass_Private,workclass_Self-emp-inc,...,native.country_Mexico,native.country_Nicaragua,native.country_Philippines,native.country_Poland,native.country_Puerto-Rico,native.country_South,native.country_Taiwan,native.country_Trinadad&Tobago,native.country_United-States,native.country_Vietnam
847,0.103485,0.534719,0.187809,0.114989,0.471664,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
739,-0.161607,0.369771,0.218976,0.170122,0.413305,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
274,0.071218,0.352251,0.128771,0.063953,0.375419,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
704,0.019593,0.064369,0.375381,0.069501,0.394139,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
78,-0.007432,0.231049,0.437667,-0.016125,0.366932,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
938,0.019460,-0.001403,0.331468,0.114764,0.416940,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
54,0.036557,0.487635,0.086724,0.169354,0.449624,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
584,0.019460,-0.001403,0.331468,0.114764,0.416940,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
680,0.067823,0.518641,0.126017,0.142384,0.448606,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0


(Optional) Check save/load function of preprocessor

Both functions take 2 parameters:

    - Suffix of the preprocessor name, in the example below would be **preprocessor_main.pkl**
    - Save/load location: can either be "local" to save/load in the home folder or "bucketfs" to save/load to/from Exasol BucketFS

In [23]:
# Save the preprocessor with suffix "main" to the "local" location.
preprocessor.save("main","local")
# Alternative: save to Exasol BucketFS instead.
# preprocessor.save("main","bucketfs")
# To check loading, create a second instance and load the saved state into it:
# preprocessor2 = Preprocessor(scaler=MinMaxScaler(),encoder=OneHotEncoder(sparse_output=False))
# preprocessor2.load("main","local")
# preprocessor2.load("main","bucketfs")

Convert dataframes into datasets, and create dataloaders

In [14]:
# Wrap the dirty frame in a dataset and build its loader (no shuffling).
dirty_dataset = PlainDataset(X_dirty)

dirty_loader = DataLoader(
    dirty_dataset,
    batch_size=batch_size,
    collate_fn=custom_collate_fn,
    shuffle=False,
    drop_last=True,
)

## Instantiate model

In [15]:
# Instantiate the autoencoder from the `layers` spec.
# The dropout entries are pairs -- presumably (layer index, rate); TODO confirm.
autoencoder = Autoencoder(
    layers=layers,
    dropout_enc=[(0, 0.0)],
    dropout_dec=[(0, 0.1)],
    batch_norm=True,
    learning_rate=1e-4,
    weight_decay=1e-5,
    l1_strength=1e-5,
    l2_strength=1e-5,
)

In [27]:
# Print a layer-by-layer summary of the model. Only the per-sample input shape
# is needed, so read it from the DataFrame instead of copying the whole
# training set into a tensor on `device` just to inspect its shape.
summary(autoencoder.to(device), X_train.shape[1:])

----------------------------------------------------------------
        Layer (type)               Output Shape         Param #
            Linear-1                 [-1, 1024]          95,232
       BatchNorm1d-2                 [-1, 1024]           2,048
              ReLU-3                 [-1, 1024]               0
           Dropout-4                 [-1, 1024]               0
            Linear-5                  [-1, 128]         131,200
       BatchNorm1d-6                  [-1, 128]             256
              ReLU-7                  [-1, 128]               0
            Linear-8                 [-1, 1024]         132,096
              ReLU-9                 [-1, 1024]               0
          Dropout-10                 [-1, 1024]               0
           Linear-11                   [-1, 92]          94,300
             ReLU-12                   [-1, 92]               0
           Linear-13                   [-1, 92]           8,556
Total params: 463,688
Trainable params:

(Optional) Model can be loaded from checkpoint after instantiation

The function takes in 2 parameters:

    - Suffix of the autoencoder checkpoint name; in the example below it would be **autoencoder_main.pkl**
    - Save/load location: can either be "local" to load in the home folder or "bucketfs" to load from Exasol BucketFS

### Train the model

In [16]:
# Train for a single epoch on the training and validation loaders.
autoencoder.train_model(
    train_loader=train_loader,
    val_loader=val_loader,
    num_epochs=1,
    patience=10,
    batch_size=batch_size,
    continous_columns=continous_columns,
    categorical_columns=categorical_columns,
    categories=categories,
    wlc=wlc,
    device=device,
)

Epoch [1/1], Training Progress: 100%|██████████| 356/356 [00:16<00:00, 21.14it/s]
Epoch [1/1], Validation Progress: 100%|██████████| 76/76 [00:02<00:00, 25.57it/s]

Epoch [1/1], Training Loss: 6.09625684
Epoch [1/1], Validation Loss: 2.12896782
Epoch [1/1], Training CE Loss: 6.05852925
Epoch [1/1], Validation CE Loss: 2.10090744
Epoch [1/1], Training MSE Loss: 0.03772758
Epoch [1/1], Validation MSE Loss: 0.02806038
Epoch [1/1], Training Loss Comp: 6.09625684
Epoch [1/1], Validation Loss Comp: 2.12896782
Epoch [1/1]: Learning Rate = [0.0001]






(Optional) Model can be saved after training

The function takes in 2 parameters:

    - Suffix of the autoencoder checkpoint name; in the example below it would be **autoencoder_main.pkl**
    - Save location: either "local" to save in the home folder or "bucketfs" to save to Exasol BucketFS

In [29]:
# Local checkpoint alternatives (location first, then suffix):
# autoencoder.save("local","main")
# autoencoder.load("local","main")

# BucketFS connection settings are read from the environment so credentials are
# not committed with the notebook. The non-secret settings fall back to the
# previously hard-coded values; the password has no fallback on purpose.
bucketfs_password = os.environ.get("BUCKETFS_PASSWORD")
if bucketfs_password is None:
    raise RuntimeError(
        "Set the BUCKETFS_PASSWORD environment variable before saving to BucketFS."
    )

autoencoder.save(
    "bucketfs",
    "main",
    url=os.environ.get("BUCKETFS_URL", "http://172.18.0.2:6583"),
    bucket=os.environ.get("BUCKETFS_BUCKET", "default"),
    user=os.environ.get("BUCKETFS_USER", "w"),
    password=bucketfs_password,
)

### Use trained model to clean data

In [17]:
# Run the trained autoencoder over the dirty data, passing the preprocessor's
# encoder and scaler along with the original column order.
cleaned_data = autoencoder.clean(
    df=X_dirty,
    dirty_loader=dirty_loader,
    test_loader=test_loader,
    batch_size=batch_size,
    continous_columns=continous_columns,
    categorical_columns=categorical_columns,
    og_columns=og_columns,
    scaler=preprocessor.scaler,
    onehotencoder=preprocessor.encoder,
    device=device,
)

Clean progress: 100%|██████████| 76/76 [00:06<00:00, 12.58it/s]


MAE: 0.02430817

MSE: 0.01999156





In [None]:
# NOTE(review): this cell repeats the preceding autoencoder.clean() call with
# identical arguments and reassigns the same variable; it can probably be removed.
cleaned_data = autoencoder.clean(
    df=X_dirty,
    dirty_loader=dirty_loader,
    test_loader=test_loader,
    batch_size=batch_size,
    continous_columns=continous_columns,
    categorical_columns=categorical_columns,
    og_columns=og_columns,
    scaler=preprocessor.scaler,
    onehotencoder=preprocessor.encoder,
    device=device,
)

Clean progress: 100%|██████████| 76/76 [00:06<00:00, 12.58it/s]


MAE: 0.02430817

MSE: 0.01999156





In [None]:
# Original data: five rows picked by index label, in the original column order.
print(
    tabulate(
        df.loc[[28296, 28217, 8054, 4223, 22723], og_columns],
        headers=og_columns,
        tablefmt="simple",
        maxcolwidths=[None, 4],
    )
)

In [None]:
# Cleaned data: five rows picked by index label.
print(
    tabulate(
        cleaned_data.loc[[28296, 28217, 8054, 4223, 22723]],
        headers=cleaned_data.columns.to_list(),
        tablefmt="simple",
        maxcolwidths=[None, 4],
    )
)

### Use trained model to anonymize data

In [18]:
# Anonymize the test split with the trained autoencoder.
anonymized_data = autoencoder.anonymize(
    df=X_test,
    data_loader=test_loader,
    batch_size=batch_size,
    device=device,
)

Anonymize progress: 100%|██████████| 76/76 [00:02<00:00, 26.65it/s]


In [None]:
# Anonymized data: first 5 rows and first 32 columns, rounded to 4 decimals.
print(
    tabulate(
        anonymized_data.round(decimals=4).iloc[:5, :32],
        headers=anonymized_data.columns.to_list(),
        tablefmt="simple",
        maxcolwidths=[None, 6],
    )
)