Author:
        
        PARK, JunHo, junho@ccnets.org

        
        KIM, JeongYoong, jeongyoong@ccnets.org
        
    COPYRIGHT (c) 2024. CCNets. All Rights reserved.

In [None]:
import sys
path_append = "../"
sys.path.append(path_append)  # Go up one directory from where you are.

import torch
import pandas as pd
from sklearn.model_selection import train_test_split 
from sklearn.preprocessing import StandardScaler, RobustScaler, MinMaxScaler

In [None]:
# Build the CSV path relative to `path_append` ("../"); the concatenation yields
# "../../data/credit_card_fraud_detection/creditcard.csv".
dataroot = path_append + "../data/credit_card_fraud_detection/creditcard.csv"
# Load the whole CSV into a DataFrame; later cells use the last column as the label.
df = pd.read_csv(dataroot)
# Bare expression so the notebook renders the frame as this cell's output.
df

In [None]:
# Percentage of rows per 'Class' value (0 is reported as "No Frauds", 1 as "Frauds"),
# rounded to two decimals.
class_counts = df['Class'].value_counts()
total_rows = len(df)
print('No Frauds', round(class_counts[0] / total_rows * 100, 2), '%of the dataset')
print('Frauds', round(class_counts[1] / total_rows * 100, 2), '%of the dataset')

In [None]:
# https://fraud-detection-handbook.github.io/fraud-detection-handbook/Chapter_7_DeepLearning/FeedForwardNeuralNetworks.html
class LabeledDataset(torch.utils.data.Dataset):
    """Map-style dataset that pairs each feature row with its label.

    Args:
        x: Indexable collection of feature rows (anything supporting
            ``len()`` and ``x[index]``).
        y: Indexable collection of labels, indexed in parallel with ``x``.
    """

    def __init__(self, x, y):
        self.x, self.y = x, y

    def __len__(self):
        # The dataset length is defined by the feature collection only.
        return len(self.x)

    def __getitem__(self, index):
        """Return ``(features, label)`` for ``index`` as float32 tensors."""
        features, target = (
            torch.tensor(source[index], dtype=torch.float32)
            for source in (self.x, self.y)
        )
        return features, target

class UnlabelledDataset(torch.utils.data.Dataset):
    """Map-style dataset that yields rows without labels.

    Args:
        x: Indexable collection of rows (anything supporting ``len()`` and
            ``x[index]``).
    """

    def __init__(self, x):
        self.x = x

    def __len__(self):
        return len(self.x)

    def __getitem__(self, index):
        """Return ``(row, None)``; the row is a float32 tensor and the label slot is ``None``."""
        return torch.tensor(self.x[index], dtype=torch.float32), None

# Standardize every column except the last one (the last column is left unscaled).
# NOTE(review): the scaler is fit on all rows here, before the train/test split done in a
# later cell, so statistics of the test half influence the scaling — consider fitting on
# the training rows only.
sc = StandardScaler()
# Overwrites df in place with the scaled values.
df.iloc[:, :-1] = sc.fit_transform(df.iloc[:, :-1])

In [None]:
# number of features
n_elements = df.shape[1]
# number of label classes
# n_classes = y.shape[1]

In [None]:
from tools.setting.ml_params import MLParameters
from tools.setting.data_config import DataConfig
from nn.utils.init import set_random_seed
set_random_seed(0)

from trainer_hub import TrainerHub


In [None]:
# Describe the dataset for the trainer: an 'augmentation' task whose observations have
# n_elements values each and no separate label size.
data_config = DataConfig(dataset_name = 'CreditCardFraudDetection', task_type='augmentation', obs_shape=[n_elements], label_size=None)

# Create the training parameters with MLParameters, then override individual fields below.
ml_params = MLParameters(core_model = 'none', encoder_model = 'deepfm')
# Encoder overrides: 4 layers with d_model = 128.
ml_params.encoder_config.num_layers = 4
ml_params.encoder_config.d_model = 128

# Train for a single epoch.
ml_params.training.num_epoch = 1
# Set the device to GPU if available, else CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu") 

# Build the trainer with console printing on and wandb logging / full evaluation off.
trainer_hub = TrainerHub(ml_params, data_config, device, use_print=True, use_wandb=False, use_full_eval=False) 

In [None]:
# Split the frame 50/50 without shuffling: the first half of the rows (in file order)
# becomes the training portion and the second half the held-out test portion.
df_train, df_test = train_test_split(df, test_size=0.5, shuffle=False)

# Features are every column but the last; the label is the last column, kept 2-D (n, 1).
X_train, y_train = df_train.iloc[:, :-1].values, df_train.iloc[:, -1:].values
X_test, y_test = df_test.iloc[:, :-1].values, df_test.iloc[:, -1:].values

# Full training rows (features AND label column) for the unlabelled augmentation trainer.
_df_train = df_train.values

unlabelled_trainset = UnlabelledDataset(_df_train)
# The supervised baseline must train on the training half. It was previously built from
# X_test / y_test, which made the baseline train and evaluate on the same rows.
trainset = LabeledDataset(X_train, y_train)
testset = LabeledDataset(X_test, y_test)

In [None]:
# Train on the unlabelled rows; each row carries all columns of the training half,
# including the label column.
trainer_hub.train(unlabelled_trainset)

In [None]:
# Mini-batch size used by the DataLoaders created in the cells below.
batch_size = 64  # Lower than the original batch size
# pad_sequence is used by collate_fn to combine per-sample tensors into one batch tensor.
from torch.nn.utils.rnn import pad_sequence
def collate_fn(batch):
    """Collate ``(features, label)`` samples into padded batch tensors.

    Args:
        batch: Sequence of ``(features, label)`` pairs. Each element may be a
            tensor (copied and detached) or any value accepted by
            ``torch.tensor``. ``label`` may be ``None``.

    Returns:
        Tuple ``(X_padded, y_padded)``. Features are padded with 0 along the
        first dimension to the longest sample. Labels are padded with -1, or
        ``None`` is returned for the labels if any sample has no label.
    """
    def as_tensor(item):
        # Copy existing tensors out of the autograd graph; convert everything else.
        if isinstance(item, torch.Tensor):
            return item.clone().detach()
        return torch.tensor(item)

    features, labels = zip(*batch)
    X_padded = pad_sequence(
        [as_tensor(sample) for sample in features],
        batch_first=True,
        padding_value=0,
    )

    # A single missing label makes the whole batch unlabelled.
    for label in labels:
        if label is None:
            return X_padded, None

    y_padded = pad_sequence(
        [as_tensor(label) for label in labels],
        batch_first=True,
        padding_value=-1,
    )
    return X_padded, y_padded

In [None]:

# Iterate the unlabelled training rows in their original order, batch_size rows at a time.
train_loader = torch.utils.data.DataLoader(dataset=unlabelled_trainset, batch_size=batch_size, collate_fn=collate_fn, shuffle=False)

# Synthesize data batch by batch. Batches are collected in a list and concatenated once at
# the end; concatenating inside the loop would re-copy the accumulated tensor every step.
recreated_batches = []
for data, _ in train_loader:
    data = data.to(device)
    batch_recreated_data = trainer_hub.encoder_ccnet.synthesize(data, output_multiplier=2)
    # Detach so no autograd graph is kept alive per batch; the next cell detaches anyway.
    recreated_batches.append(batch_recreated_data.detach())

# Join all batches along dim 0, then drop dim 1 if it has size 1.
recreated_dataset = torch.cat(recreated_batches).squeeze(dim=1)

In [None]:
# Separate each synthesized row into its feature columns (all but the last)
# and its label column (the last one, kept 2-D), both as NumPy arrays on the CPU.
recreated_training_data = recreated_dataset[:, :-1].clone().detach().cpu().numpy()
recreated_labels = recreated_dataset[:, -1:].clone().detach().cpu().numpy()

# Wrap the synthesized features/labels so they can be fed to a DataLoader.
ccnet_recreated_dataset = LabeledDataset(recreated_training_data, recreated_labels)

In [None]:
# Column counts of the synthesized feature and label arrays.
num_features, num_classes = recreated_training_data.shape[1], recreated_labels.shape[1]
(num_features, num_classes)

In [None]:
class DNN(torch.nn.Module):
    """Fully connected network with ReLU hidden activations and a sigmoid output.

    The stack is: ``Linear(input_size, hidden_size) -> ReLU``, followed by
    ``num_layers - 2`` blocks of ``Linear(hidden_size, hidden_size) -> ReLU``
    (none when ``num_layers <= 2``), followed by
    ``Linear(hidden_size, output_size) -> Sigmoid``.

    Args:
        input_size: Width of the input vectors.
        output_size: Width of the output vectors.
        num_layers: Controls the number of hidden-to-hidden blocks (see above).
        hidden_size: Width of every hidden layer.
    """

    def __init__(self, input_size, output_size, num_layers=4, hidden_size=128):
        super().__init__()
        self.input_size = input_size
        self.output_size = output_size
        self.hidden_size = hidden_size

        # Widths seen by the ReLU part of the network: input, then the hidden widths.
        extra_hidden = max(num_layers - 2, 0)
        widths = [input_size] + [hidden_size] * (1 + extra_hidden)

        stack = []
        for fan_in, fan_out in zip(widths[:-1], widths[1:]):
            stack += [torch.nn.Linear(fan_in, fan_out), torch.nn.ReLU()]

        # Output projection squashed into (0, 1).
        stack += [torch.nn.Linear(hidden_size, output_size), torch.nn.Sigmoid()]

        # ModuleList registers every layer's parameters with the module.
        self.layers = torch.nn.ModuleList(stack)

    def forward(self, x):
        """Apply every layer in order and return the final activation."""
        activation = x
        for module in self.layers:
            activation = module(activation)
        return activation

In [None]:
def train_supervised_model(model, dataset, num_epochs=2, lr=0.001):
    """Train ``model`` on ``dataset`` with Adam and binary cross-entropy.

    Uses the notebook-level ``batch_size`` and ``device``, and reseeds the
    random generators with ``set_random_seed(0)`` before building the
    shuffling loader.

    Args:
        model: Module whose outputs are probabilities in [0, 1] with the same
            shape as the labels yielded by ``dataset``.
        dataset: Dataset yielding ``(data, label)`` pairs.
        num_epochs: Number of passes over ``dataset`` (default 2).
        lr: Adam learning rate (default 0.001).
    """
    optimizer = torch.optim.Adam(model.parameters(), lr=lr)
    set_random_seed(0)
    trainloader = torch.utils.data.DataLoader(dataset, batch_size=batch_size, shuffle=True)

    # Make sure the model is in training mode even if it was evaluated before.
    model.train()
    for _ in range(num_epochs):
        for data, label in trainloader:
            data = data.to(device)
            label = label.to(device)

            # Clear gradients first so stale gradients never leak into a step.
            optimizer.zero_grad()
            output = model(data)
            loss = torch.nn.functional.binary_cross_entropy(output, label)
            loss.backward()
            optimizer.step()

In [None]:
# Baseline classifier: a fresh DNN sized by num_features / num_classes, trained on `trainset`.
model_trained_on_original = DNN(input_size= num_features, output_size=num_classes).to(device)

train_supervised_model(model_trained_on_original, trainset)

In [None]:
# Same architecture as the baseline, trained on the synthesized dataset instead.
model_trained_on_recreated = DNN(input_size= num_features, output_size=num_classes).to(device)

train_supervised_model(model_trained_on_recreated, ccnet_recreated_dataset)

In [None]:
from sklearn.metrics import f1_score

def get_f1_score(model, testset, batch_size=batch_size):
    """Compute the binary F1 score of ``model`` on ``testset``.

    Args:
        model: Module returning one probability per sample.
        testset: Dataset yielding ``(data, label)`` pairs with 0/1 labels.
        batch_size: Evaluation batch size (defaults to the notebook-level
            ``batch_size`` at the time this function is defined).

    Returns:
        The F1 score of the positive class, using a 0.5 decision threshold.
    """
    model.eval()  # Set the model to evaluation mode
    y_true = []
    y_pred = []
    data_loader = torch.utils.data.DataLoader(testset, batch_size=batch_size, shuffle=False)

    with torch.no_grad():  # No need to track gradients for inference
        for data, label in data_loader:
            output = model(data.to(device))
            # Flatten with reshape(-1) rather than squeeze(): squeeze() turns a final
            # batch of size 1 into a 0-d tensor, which cannot be passed to extend().
            predicted = (output.reshape(-1) > 0.5).long()
            y_true.extend(label.reshape(-1).long().tolist())
            y_pred.extend(predicted.cpu().tolist())

    # Compute F1 score, using 'binary' because this is a binary classification task
    return f1_score(y_true, y_pred, average='binary')

# Evaluate both classifiers on the same `testset` so the two scores are directly comparable.
f1_score_original = get_f1_score(model_trained_on_original, testset)
f1_score_recreated = get_f1_score(model_trained_on_recreated, testset)

# Report the two F1 scores side by side.
print("F1 score of the supervised learning model trained on the original data: ", f1_score_original)
print("F1 score of the supervised learning model trained on the recreated data: ", f1_score_recreated)
