In [1]:
import argparse
import io
import os
import time

import joblib
import pandas as pd
import torch
import torch.nn as nn
from torchsummary import summary
from tqdm import tqdm
from tabulate import tabulate
from sklearn.preprocessing import *

from AutoCleanse.utils import *
from AutoCleanse.dataloader import PlainDataset, DataLoader
from AutoCleanse.autoencoder import *
from AutoCleanse.loss_model import loss_CEMSE
from AutoCleanse.preprocessor import Preprocessor
from AutoCleanse.anonymize import anonymize
from AutoCleanse.bucketfs_client import *

Device configuration

In [13]:
# Run on the first CUDA GPU when one is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda:0")
else:
    device = torch.device("cpu")
# Fix the PyTorch RNG seed so repeated runs start from the same random state.
torch.manual_seed(42)

<torch._C.Generator at 0x7f9edd089ed0>

## Set up directory paths

In [3]:
# Resolve the project folders relative to the directory the notebook is run from.
# `os` is imported in the notebook's import cell; the former os.chdir(PROJECT_DIR)
# call was dropped because changing into the current working directory is a no-op.
PROJECT_DIR = os.getcwd()
DATASET_DIR = os.path.join(PROJECT_DIR,'dataset')   # folder the input CSV is read from
EVAL_DIR = os.path.join(PROJECT_DIR,'evaluate')

## Preparing data

Load dataframe and group features by their type

In [11]:
# Read the Adult CSV and discard the two columns that are not used in this notebook.
df = pd.read_csv(os.path.join(DATASET_DIR, 'adult.csv'))
df = df.drop(columns=['fnlwgt', 'income'])

# Keep the column order of the file so results can be shown in that order later.
og_columns = list(df.columns)

# Split the features by dtype: numeric columns vs. string/boolean columns.
continous_columns = list(df.select_dtypes(include=['int64', 'float64']).columns)
categorical_columns = list(df.select_dtypes(include=['object', 'bool']).columns)

# Reorder the frame so all numeric columns come first, followed by the categorical ones.
df = df[continous_columns + categorical_columns]

Data preprocessing

In [14]:
# Min-max scaling for the numeric columns, dense one-hot encoding for the categorical ones.
scaler = MinMaxScaler()
onehotencoder = OneHotEncoder(sparse_output=False)
preprocessor = Preprocessor(scaler, onehotencoder)

# 70 / 15 / 15 train / validation / test split with a fixed seed.
split_options = dict(train_ratio=0.7, val_ratio=0.15, test_ratio=0.15, random_seed=42)
X_train, X_val, X_test = preprocessor.split(df=df, **split_options)

# The "dirty" frame is derived from the raw test split, before that split is transformed.
# NOTE(review): the meaning of the positional arguments (0, 42) is not visible here —
# presumably a corruption ratio and a random seed; confirm against replace_with_nan.
X_dirty = replace_with_nan(X_test, 0, 42)

# Every fit/transform call receives the same column grouping.
column_groups = dict(continous_columns=continous_columns,
                     categorical_columns=categorical_columns)

# Fit on the training split only, then apply the fitted transform to the other frames.
X_train = preprocessor.fit_transform(input_df=X_train, **column_groups)
X_val = preprocessor.transform(input_df=X_val, **column_groups)
X_test = preprocessor.transform(input_df=X_test, **column_groups)
X_dirty = preprocessor.transform(input_df=X_dirty, **column_groups)

# Category lists of the preprocessor's encoder; passed to train_model further down.
categories = preprocessor.encoder.categories_

(Optional) Check save/load function of preprocessor

Both functions take two parameters:

    - Suffix of the preprocessor file name; in the example below the file is **preprocessor_main.pkl**
    - Save/load location: either "local" to save/load in the home folder, or "bucketfs" to save to / load from Exasol BucketFS

In [None]:
# Persist the fitted preprocessor to both supported locations.
preprocessor.save("main","local")
preprocessor.save("main","bucketfs")

# Load into a *separate* instance. The original cell assigned the fresh object to
# `preprocessor` (overwriting the fitted one used by the rest of the notebook) and then
# called `.load` on an undefined `preprocessor2`, which raises NameError.
# `sparse_output=False` matches the encoder created earlier; the older `sparse=` keyword
# is no longer accepted by recent scikit-learn releases.
preprocessor2 = Preprocessor(scaler=MinMaxScaler(),encoder=OneHotEncoder(sparse_output=False))
preprocessor2.load("main","local")
preprocessor2.load("main","bucketfs")

Convert dataframes into datasets, and create dataloaders

In [15]:
# Mini-batch size; used for every DataLoader and also passed to train_model, clean and anonymize.
batch_size = 64

In [16]:
# Wrap each preprocessed frame in a PlainDataset (same order: train, val, test, dirty).
train_dataset, val_dataset, test_dataset, dirty_dataset = (
    PlainDataset(frame) for frame in (X_train, X_val, X_test, X_dirty)
)

def custom_collate_fn(batch):
    """Collate a list of samples into one stacked tensor plus a list of indices.

    Each sample is indexed positionally: element 0 is the tensor to stack,
    element 1 is kept as-is and returned in a plain Python list.

    Args:
        batch: sequence of samples, each supporting ``sample[0]`` and ``sample[1]``.

    Returns:
        A ``(tensor_data, indices)`` pair where ``tensor_data`` is
        ``torch.stack`` of all element-0 tensors (new leading batch dimension)
        and ``indices`` is the list of element-1 values in batch order.
    """
    tensors = []
    indices = []
    for sample in batch:
        tensors.append(sample[0])
        indices.append(sample[1])
    return torch.stack(tensors), indices

# Options shared by all four loaders.
# NOTE(review): drop_last=True also applies to the test/dirty loaders, so rows of a final
# partial batch are never fed to the model — confirm this is what clean/anonymize expect.
shared_loader_args = dict(batch_size=batch_size, drop_last=True, collate_fn=custom_collate_fn)

# Only the training loader is shuffled; the others keep the dataset order.
train_loader = DataLoader(train_dataset, shuffle=True, **shared_loader_args)
val_loader = DataLoader(val_dataset, shuffle=False, **shared_loader_args)
test_loader = DataLoader(test_dataset, shuffle=False, **shared_loader_args)
dirty_loader = DataLoader(dirty_dataset, shuffle=False, **shared_loader_args)

## Instantiate model

In [17]:
# Layer widths: the number of preprocessed input columns, then 1024, then 128.
layers = [X_train.shape[1], 1024, 128]
# Passed to train_model as `wlc` — presumably the weights of the loss components; TODO confirm.
wlc = (1, 1)

autoencoder = Autoencoder(
    layers=layers,
    dropout_enc=[(0, 0.0)],
    dropout_dec=[(0, 0.1)],
    batch_norm=True,
    learning_rate=1e-4,
    weight_decay=1e-5,
    l1_strength=1e-5,
    l2_strength=1e-5,
)

In [18]:
# Print a layer-by-layer summary of the model for a single input row.
# torchsummary only needs the per-sample input shape, so it is read straight from the
# DataFrame instead of converting the whole training set to a tensor and copying it to
# `device` just to inspect `.shape[1:]`.
summary(autoencoder.to(device), (X_train.shape[1],))

----------------------------------------------------------------
        Layer (type)               Output Shape         Param #
            Linear-1                 [-1, 1024]         110,592
       BatchNorm1d-2                 [-1, 1024]           2,048
              ReLU-3                 [-1, 1024]               0
           Dropout-4                 [-1, 1024]               0
            Linear-5                  [-1, 128]         131,200
       BatchNorm1d-6                  [-1, 128]             256
              ReLU-7                  [-1, 128]               0
            Linear-8                 [-1, 1024]         132,096
              ReLU-9                 [-1, 1024]               0
          Dropout-10                 [-1, 1024]               0
           Linear-11                  [-1, 107]         109,675
             ReLU-12                  [-1, 107]               0
           Linear-13                  [-1, 107]          11,556
Total params: 497,423
Trainable params:

(Optional) Model can be loaded from checkpoint after instantiation

The function takes two parameters:

    - Suffix of the autoencoder file name; in the example below the file is **autoencoder_main.pkl**
    - Load location: either "local" to load from the home folder, or "bucketfs" to load from Exasol BucketFS

In [None]:
# Restore a saved checkpoint into the already-instantiated autoencoder.
# NOTE(review): the arguments are passed as ("local", "main"), whereas the preprocessor
# save/load calls in this notebook use ("main", "local") — confirm the parameter order
# of Autoencoder.load.
autoencoder.load("local","main")

### Train the model

In [19]:
# Train on the training loader and evaluate on the validation loader.
# NOTE(review): the recorded output under this cell reads "Epoch [1/1]", so it was produced
# with a different num_epochs than the 100 configured here; re-run to refresh it.
autoencoder.train_model(
    train_loader=train_loader,
    val_loader=val_loader,
    batch_size=batch_size,
    num_epochs=100,
    patience=10,
    continous_columns=continous_columns,
    categorical_columns=categorical_columns,
    categories=categories,
    wlc=wlc,
    device=device,
)

Epoch [1/1], Training Progress: 100%|██████████| 356/356 [00:11<00:00, 29.83it/s]
Epoch [1/1], Validation Progress: 100%|██████████| 76/76 [00:02<00:00, 35.51it/s]

Epoch [1/1], Training Loss: 6.47384357
Epoch [1/1], Validation Loss: 2.32033534
Epoch [1/1], Training CE Loss: 6.41617539
Epoch [1/1], Validation CE Loss: 2.28548042
Epoch [1/1], Training MSE Loss: 0.05766817
Epoch [1/1], Validation MSE Loss: 0.03485493
Epoch [1/1], Training Loss Comp: 6.47384357
Epoch [1/1], Validation Loss Comp: 2.32033534
Epoch [1/1]: Learning Rate = [0.0001]






(Optional) Model can be saved after training

The function takes two parameters:

    - Suffix of the autoencoder file name; in the example below the file is **autoencoder_main.pkl**
    - Save location: either "local" to save in the home folder, or "bucketfs" to save to Exasol BucketFS

In [None]:
# Save the trained autoencoder.
# NOTE(review): the arguments are passed as ("local", "main"), whereas the preprocessor
# save/load calls in this notebook use ("main", "local") — confirm the parameter order
# of Autoencoder.save.
autoencoder.save("local","main")

### Use trained model to clean data

In [20]:
# Run the trained model over the dirty loader; the fitted encoder and scaler from the
# preprocessor are handed over together with the original column order.
cleaned_data = autoencoder.clean(
    df=X_dirty,
    dirty_loader=dirty_loader,
    test_loader=test_loader,
    batch_size=batch_size,
    continous_columns=continous_columns,
    categorical_columns=categorical_columns,
    og_columns=og_columns,
    scaler=preprocessor.scaler,
    onehotencoder=preprocessor.encoder,
    device=device,
)

Clean progress: 100%|██████████| 76/76 [00:04<00:00, 17.70it/s]


MAE: 0.01538386

MSE: 0.01054435





In [21]:
# Original data: five hand-picked rows, shown in the file's original column order.
print(
    tabulate(
        df.loc[[28296, 28217, 8054, 4223, 22723], og_columns],
        headers=og_columns,
        tablefmt="simple",
        maxcolwidths=[None, 4],
    )
)

         age  workclass         education      education.num  marital.status      occupation         relationship    race                sex       capital.gain    capital.loss    hours.per.week  native.country
-----  -----  ----------------  -----------  ---------------  ------------------  -----------------  --------------  ------------------  ------  --------------  --------------  ----------------  ----------------
28296     48  ?                 9th                        5  Separated           ?                  Not-in-family   Amer-Indian-Eskimo  Female               0               0                20  United-States
28217     28  ?                 HS-grad                    9  Separated           ?                  Unmarried       White               Female               0               0                40  United-States
 8054     38  Self-emp-not-inc  9th                        5  Divorced            Craft-repair       Not-in-family   White               Male                 0 

In [22]:
# Cleaned data: the same five row labels, taken from the cleaning result.
print(
    tabulate(
        cleaned_data.loc[[28296, 28217, 8054, 4223, 22723]],
        headers=cleaned_data.columns.to_list(),
        tablefmt="simple",
        maxcolwidths=[None, 4],
    )
)

         age  workclass         education       education.num  marital.status      occupation         relationship    race    sex       capital.gain    capital.loss    hours.per.week  native.country
-----  -----  ----------------  ------------  ---------------  ------------------  -----------------  --------------  ------  ------  --------------  --------------  ----------------  ----------------
28296     43  ?                 Some-college                6  Separated           ?                  Not-in-family   Black   Female           30033             306                 9  United-States
28217     39  ?                 HS-grad                     5  Widowed             ?                  Unmarried       White   Female           41752            -431                14  United-States
 8054     46  Self-emp-not-inc  Some-college                5  Divorced            Craft-repair       Not-in-family   White   Male              8891             268                47  United-States
 4223 

### Use trained model to anonymize data

In [23]:
# Run the preprocessed test split through the trained model's anonymize step.
anonymized_data = autoencoder.anonymize(
    df=X_test,
    data_loader=test_loader,
    batch_size=batch_size,
    device=device,
)

Anonymize progress: 100%|██████████| 76/76 [00:02<00:00, 36.05it/s]


In [25]:
# Anonymized data: first five rows and first 32 columns, rounded to four decimals.
print(
    tabulate(
        anonymized_data.round(decimals=4).iloc[:5, :32],
        headers=anonymized_data.columns.to_list(),
        tablefmt="simple",
        maxcolwidths=[None, 6],
    )
)

    0    1       2       3       4       5       6       7       8       9      10     11      12      13      14      15      16      17      18      19      20      21      22      23      24      25      26      27      28      29      30    31      32
-----  ---  ------  ------  ------  ------  ------  ------  ------  ------  ------  -----  ------  ------  ------  ------  ------  ------  ------  ------  ------  ------  ------  ------  ------  ------  ------  ------  ------  ------  ------  ----  ------
28296    0  0       0       0       0       0       2.1497  0       0       0       0      1.8243  0       0       0.0327  0       1.1492  1.0307  0       0       0       0       0       0       0       0       0       0       0       0          0  0
28217    0  0       0.1836  0.1328  3.3679  0.0131  0.7436  0.9163  0       0       0      0       0.4534  0       2.226   0       0.4577  1.4482  0       0       0       0.6967  0       0.1591  0       0       0       0       0       0 