In [1]:
import sys
sys.path.append("..")

import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import random
import torch
from torchsummary import summary

from AutoCleanse.preprocessor import Preprocessor
from AutoCleanse.utils import *
from AutoCleanse.dataloader import ClfDataset, DataLoader
from AutoCleanse.evaluate.classifier import *

from sklearn.preprocessing import *
from sklearn.pipeline import make_pipeline
from collections import Counter

## Global settings

In [2]:
# Single seed shared by every random number generator used in this notebook.
random_seed = 42

# Seed PyTorch, NumPy and Python's `random` module with the same value.
torch.manual_seed(random_seed)
np.random.seed(random_seed)
random.seed(random_seed)

# cuDNN flags from the PyTorch reproducibility notes: deterministic on, benchmark off.
torch.backends.cudnn.benchmark = False
torch.backends.cudnn.deterministic = True

# Run on the first CUDA device when one is available, otherwise on the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda:0")
else:
    device = torch.device("cpu")

## Data Preprocessing

### Preprocessor

In [4]:
# Build the shared feature preprocessor from a min-max scaler and a one-hot encoder.
# Presumably the scaler handles the continuous columns and the encoder the
# categorical ones — TODO confirm against AutoCleanse.preprocessor.Preprocessor.
scaler = MinMaxScaler()
# sparse_output=False makes the encoder return a dense array instead of a sparse matrix.
onehotencoder = OneHotEncoder(sparse_output=False)
preprocessor = Preprocessor(scaler,onehotencoder)

### Feature preprocessing

In [6]:
# Load dataset; 'fnlwgt' is dropped immediately and never used as a feature.
df = pd.read_csv('../dataset/adult.csv').drop(columns=['fnlwgt'])

target_columns = ['income']

# Group the non-target columns by dtype: numeric -> continuous, object/bool -> categorical.
feature_df = df.drop(columns=target_columns)
continous_columns = feature_df.select_dtypes(include=['int64', 'float64']).columns.tolist()
categorical_columns = feature_df.select_dtypes(include=['object', 'bool']).columns.tolist()

# Separate features and target (continuous columns first, then categorical ones).
X = df[continous_columns+categorical_columns]
y = df[target_columns]

# Split dataset with the notebook-wide seed. The target frame `y` is split in its
# own cell with the same ratios and seed — presumably so that feature and target
# rows stay aligned; keep the two calls in sync.
X_train,X_val,X_test = preprocessor.split(df=X,
                                          train_ratio=0.7,
                                          val_ratio=0.15,
                                          test_ratio=0.15,
                                          random_seed=random_seed)

# Derive the "dirty" copy from the raw (not yet transformed) test split.
# NOTE(review): the positional arguments (0, 42) are presumably a replacement
# ratio and a seed — confirm against AutoCleanse.utils.replace_with_nan.
X_dirty = replace_with_nan(X_test,0,42)

# Column groups passed to every fit/transform call below.
column_groups = dict(continous_columns=continous_columns,
                     categorical_columns=categorical_columns)

# Fit on the training split only, then apply the fitted preprocessor to the rest.
X_train = preprocessor.fit_transform(input_df=X_train, **column_groups)
X_val = preprocessor.transform(input_df=X_val, **column_groups)
X_test = preprocessor.transform(input_df=X_test, **column_groups)
X_dirty = preprocessor.transform(input_df=X_dirty, **column_groups)

# Load cleaned test dataset
# NOTE(review): this CSV is read without index_col, unlike the anonymized CSVs
# (index_col=0), and X_cleaned is later paired with y_test — confirm that its
# rows are the same rows, in the same order, as X_test.
df_cleaned = pd.read_csv("../dataset/adult_test_cleaned.csv")

X_cleaned = df_cleaned[continous_columns+categorical_columns]
X_cleaned = preprocessor.transform(input_df=X_cleaned, **column_groups)

### Target preprocessing

In [7]:
# Preprocess target
y_train,y_val,y_test = preprocessor.split(df=y,
                                        train_ratio=0.7,
                                        val_ratio=0.15,
                                        test_ratio=0.15,
                                        random_seed=42)
y_dirty = replace_with_nan(y_test,0,42)

y_encoder = OneHotEncoder(sparse_output=False)
y_train = pd.DataFrame(y_encoder.fit_transform(y_train),columns=y_encoder.get_feature_names_out(target_columns),index=y_train.index)
y_val = pd.DataFrame(y_encoder.transform(y_val),columns=y_encoder.get_feature_names_out(target_columns),index=y_val.index)
y_test = pd.DataFrame(y_encoder.transform(y_test),columns=y_encoder.get_feature_names_out(target_columns),index=y_test.index)
y_dirty = pd.DataFrame(y_encoder.transform(y_dirty),columns=y_encoder.get_feature_names_out(target_columns),index=y_dirty.index)

### Data loader

In [6]:
# Wrap each preprocessed feature split together with its one-hot encoded target.
train_dataset = ClfDataset(X_train, y_train)
val_dataset = ClfDataset(X_val, y_val)
test_dataset = ClfDataset(X_test, y_test)
dirty_dataset = ClfDataset(X_dirty, y_dirty)
# NOTE(review): X_cleaned is loaded from a separate CSV but paired with y_test;
# this assumes both hold the same rows in the same order — confirm.
cleaned_dataset = ClfDataset(X_cleaned, y_test)

def custom_collate_fn(batch):
    """Collate a list of dataset samples into batched tensors.

    Each sample is indexed as ``sample[0]`` (feature tensor), ``sample[1]``
    (target) and ``sample[2]`` (index).

    Args:
        batch: Sequence of samples as described above.

    Returns:
        A tuple ``(tensor_data, tensor_targets, indices)``: the stacked
        features, the targets (stacked when they are tensors, otherwise
        converted to a float32 tensor) and the plain list of sample indices.
    """
    tensor_data = torch.stack(tuple(sample[0] for sample in batch))

    raw_targets = [sample[1] for sample in batch]
    # Targets may be tensors or plain scalars; the first sample decides which.
    targets_are_tensors = torch.is_tensor(raw_targets[0])
    tensor_targets = (
        torch.stack(raw_targets)
        if targets_are_tensors
        else torch.tensor(raw_targets, dtype=torch.float32)
    )

    return tensor_data, tensor_targets, [sample[2] for sample in batch]

batch_size = 256

# Options shared by every loader in this cell.
# NOTE(review): drop_last=True is also applied to the evaluation loaders, so
# (with torch DataLoader semantics) trailing samples that do not fill a whole
# batch are left out of validation and test metrics.
_loader_options = dict(batch_size=batch_size,
                       drop_last=True,
                       collate_fn=custom_collate_fn)

# Only the training loader shuffles; the evaluation loaders keep dataset order.
train_loader = DataLoader(train_dataset, shuffle=True, **_loader_options)
val_loader = DataLoader(val_dataset, shuffle=False, **_loader_options)
test_loader = DataLoader(test_dataset, shuffle=False, **_loader_options)
dirty_loader = DataLoader(dirty_dataset, shuffle=False, **_loader_options)
cleaned_loader = DataLoader(cleaned_dataset, shuffle=False, **_loader_options)

## Evaluation: data cleaning

### Initialize model

In [7]:
# Layer widths: the first entry is the number of preprocessed feature columns.
layers = [X_train.shape[1], 150, 200, 200, 100, 50]

# Classifier for the clean / cleaned data experiment.
# The dropout pairs are presumably (layer index, drop probability) — TODO confirm
# against ClsNNBase.
model = ClsNNBase(
    layers=layers,
    dropout=[(0, 0.5), (1, 0.5), (2, 0.5)],
    batch_norm=True,
    device=device,
    learning_rate=0.025,
    weight_decay=1e-5,
    l1_strength=1e-3,
    l2_strength=1e-3,
)

In [None]:
# Print a layer-by-layer summary for an input with one value per feature column.
# Only the per-sample input shape is needed, so it is read from the DataFrame
# directly instead of converting the whole training set to a tensor and moving
# it to `device` just to inspect its shape.
summary(model.to(device), (X_train.shape[1],))

### Train model

In [None]:
# Train the classifier on the clean training split and validate on the clean
# validation split. `patience` is presumably the early-stopping patience in
# epochs — TODO confirm against ClsNNBase.train_model.
model.train_model(
    train_loader=train_loader,
    val_loader=val_loader,
    layers=layers,
    batch_size=batch_size,
    num_epochs=10,
    patience=2,
    continous_columns=continous_columns,
    categorical_columns=categorical_columns,
    device=device,
)

In [None]:
# Persist the trained weights under the name "test".
# "local" presumably selects on-disk storage — TODO confirm against ClsNNBase.save.
model.save("local","test")

In [None]:
# Reload the weights stored under the name "test"
# (presumably those written by the save cell — TODO confirm).
model.load("local", "test")

### Evaluation

In [None]:
# Evaluate the same model twice for comparison: first on the original test
# split, then on the cleaned test data loaded from adult_test_cleaned.csv.
model.test(test_loader=test_loader, batch_size=batch_size, device=device)
model.test(test_loader=cleaned_loader, batch_size=batch_size, device=device)

## Evaluation: data anonymization

### Load dataset

In [8]:
# Load the anonymized train / validation / test feature splits.
# The first CSV column is used as the row index of each frame.
anon_train, anon_val, anon_test = (
    pd.read_csv(csv_path, index_col=0)
    for csv_path in (
        "../dataset/adult_train_anonymized.csv",
        "../dataset/adult_val_anonymized.csv",
        "../dataset/adult_test_anonymized.csv",
    )
)

### Data loader

In [9]:
# Pair each anonymized feature split with the one-hot encoded targets produced
# in the "Target preprocessing" cell.
# NOTE(review): assumes the anonymized CSV rows match y_train / y_val / y_test
# row for row — confirm.
train_dataset_anon = ClfDataset(anon_train, y_train)
val_dataset_anon = ClfDataset(anon_val, y_val)
test_dataset_anon = ClfDataset(anon_test, y_test)

# `custom_collate_fn` and `batch_size` are reused from the first "Data loader"
# cell instead of being redefined here, so both experiments share one
# definition and cannot drift apart.
train_loader_anon = DataLoader(train_dataset_anon, batch_size=batch_size,
                               shuffle=True, drop_last=True, collate_fn=custom_collate_fn)
val_loader_anon = DataLoader(val_dataset_anon, batch_size=batch_size,
                             shuffle=False, drop_last=True, collate_fn=custom_collate_fn)
test_loader_anon = DataLoader(test_dataset_anon, batch_size=batch_size,
                              shuffle=False, drop_last=True, collate_fn=custom_collate_fn)

### Initialize model

In [11]:
# Layer widths: the first entry is the number of columns in the anonymized
# training frame.
layers_anon = [anon_train.shape[1], 150, 200, 200, 100, 50]

# Classifier for the anonymized data experiment; hyperparameters match the
# values used for the model trained on the clean data.
model_anon = ClsNNBase(
    layers=layers_anon,
    dropout=[(0, 0.5), (1, 0.5), (2, 0.5)],
    batch_norm=True,
    device=device,
    learning_rate=0.025,
    weight_decay=1e-5,
    l1_strength=1e-3,
    l2_strength=1e-3,
)

### Model training

In [12]:
# Train the classifier on the anonymized training split and validate on the
# anonymized validation split, with the same epoch budget and patience as the
# clean-data run.
model_anon.train_model(
    train_loader=train_loader_anon,
    val_loader=val_loader_anon,
    layers=layers_anon,
    batch_size=batch_size,
    num_epochs=10,
    patience=2,
    continous_columns=continous_columns,
    categorical_columns=categorical_columns,
    device=device,
)

Epoch [1/10], Training Progress: 100%|██████████| 89/89 [00:13<00:00,  6.51it/s]
Epoch [1/10], Validation Progress: 100%|██████████| 19/19 [00:02<00:00,  6.95it/s]
  _warn_prf(average, modifier, msg_start, len(result))


Epoch [1/10], Training Loss  : 0.55796563, Accuracy: 0.75258954, Precision: 0.46747875, Recall: 0.49926510, F1 Score: 0.43365864
Epoch [1/10], Validation Loss: 0.55694846, Accuracy: 0.75637336, Precision: 0.37818668, Recall: 0.50000000, F1 Score: 0.43064497
Epoch [1/10]: Learning Rate = [0.025]



Epoch [2/10], Training Progress: 100%|██████████| 89/89 [00:13<00:00,  6.66it/s]
Epoch [2/10], Validation Progress: 100%|██████████| 19/19 [00:02<00:00,  6.89it/s]
  _warn_prf(average, modifier, msg_start, len(result))


Epoch [2/10], Training Loss  : 0.55457782, Accuracy: 0.75869031, Precision: 0.37934515, Recall: 0.50000000, F1 Score: 0.43139506
Epoch [2/10], Validation Loss: 0.55690311, Accuracy: 0.75637336, Precision: 0.37818668, Recall: 0.50000000, F1 Score: 0.43064497
Epoch [2/10]: Learning Rate = [0.025]



Epoch [3/10], Training Progress: 100%|██████████| 89/89 [00:13<00:00,  6.71it/s]
Epoch [3/10], Validation Progress: 100%|██████████| 19/19 [00:02<00:00,  7.06it/s]

Epoch [3/10], Training Loss  : 0.55457767, Accuracy: 0.75869031, Precision: 0.37934515, Recall: 0.50000000, F1 Score: 0.43139506
Epoch [3/10], Validation Loss: 0.55689693, Accuracy: 0.75637336, Precision: 0.37818668, Recall: 0.50000000, F1 Score: 0.43064497
Epoch [3/10]: Learning Rate = [0.025]

Early stopping triggered. Stopping training.



  _warn_prf(average, modifier, msg_start, len(result))


In [13]:
# Persist the trained weights; the recorded output of this cell reports the
# file name ClsNNBase_test_anon.pth.
model_anon.save("local","test_anon")

Saved weight to ClsNNBase_test_anon.pth


In [None]:
# Reload the weights stored under the name "test_anon"
# (presumably those written by the save cell — TODO confirm).
model_anon.load("local", "test_anon")

### Evaluation

In [14]:
# Evaluate the anonymized-data model on the anonymized test split.
# NOTE(review): in the recorded output recall is exactly 0.5, precision equals
# accuracy / 2 and an sklearn `_warn_prf` warning is emitted, which suggests the
# model predicts a single class for every sample — verify before relying on
# these numbers.
model_anon.test(test_loader=test_loader_anon, batch_size=batch_size, device=device)

Test Progress: 100%|██████████| 19/19 [00:02<00:00,  6.78it/s]

Test Loss  : 0.54763939, Accuracy: 0.76562500, Precision: 0.38281250, Recall: 0.50000000, F1 Score: 0.43362832



  _warn_prf(average, modifier, msg_start, len(result))
