Author:
        
        PARK, JunHo, junho@ccnets.org

        
        KIM, JeongYoong, jeongyoong@ccnets.org
        
    COPYRIGHT (c) 2024. CCNets. All Rights reserved.

In [None]:
import sys
path_append = "../"
sys.path.append(path_append)  # Go up one directory from where you are.

import torch
import pandas as pd
from sklearn.model_selection import train_test_split 
from sklearn.preprocessing import StandardScaler, RobustScaler, MinMaxScaler

In [None]:
# Build the CSV location relative to `path_append`, then read it into a DataFrame.
dataroot = f"{path_append}../data/credit_card_fraud_detection/creditcard.csv"
df = pd.read_csv(dataroot)
# Bare last expression so the notebook renders the loaded frame.
df

In [None]:
# Percentage of rows carrying each value of the `Class` column
# (0 is reported as "No Frauds", 1 as "Frauds").
class_counts = df['Class'].value_counts()
total_rows = len(df)
print('No Frauds', round(class_counts[0] / total_rows * 100, 2), '%of the dataset')
print('Frauds', round(class_counts[1] / total_rows * 100, 2), '%of the dataset')

In [None]:
# https://fraud-detection-handbook.github.io/fraud-detection-handbook/Chapter_7_DeepLearning/FeedForwardNeuralNetworks.html
class Dataset(torch.utils.data.Dataset):
    """Map-style dataset pairing feature rows with their labels.

    The containers are stored as given; items are converted to ``float32``
    tensors lazily, each time they are accessed.
    """

    def __init__(self, x, y):
        # Whatever indexing `x` and `y` support is what `__getitem__` accepts.
        self.x, self.y = x, y

    def __len__(self):
        """Return the number of samples, i.e. the length of the feature container."""
        return len(self.x)

    def __getitem__(self, index):
        """Return ``(features, label)`` for `index` as float32 tensors.

        The label tensor gains a trailing axis of size 1.
        """
        features = torch.tensor(self.x[index], dtype=torch.float32)
        target = torch.tensor(self.y[index], dtype=torch.float32)
        return features, target.unsqueeze(-1)

# Split the frame into the label column (kept 2-D) and the feature columns.
y = df[['Class']]
X = df.drop(columns=['Class'])

# Replace the raw Amount and Time columns with robust-scaled versions.
# The same scaler object is re-fitted for each column, so after this cell
# `sc` only holds the statistics of the last column fitted (Time).
# NOTE(review): the scaler is fitted on every row, before the train/test
# split performed later in the notebook — confirm this is acceptable.
sc = RobustScaler()
for raw_col, scaled_col in (('Amount', 'scaled_amount'), ('Time', 'scaled_time')):
    X[scaled_col] = sc.fit_transform(X[raw_col].values.reshape(-1, 1))
X = X.drop(columns=['Time', 'Amount'])

In [None]:
# Number of input features: one per column of the feature frame.
n_features = len(X.columns)
# Number of label columns (the width of `y`), not the count of distinct classes.
n_classes = len(y.columns)

In [None]:
from tools.setting.ml_params import MLParameters
from tools.setting.data_config import DataConfig
from nn.utils.init import set_random_seed
set_random_seed(0)

from trainer_hub import TrainerHub


In [None]:
# Hold out half of the rows for testing. The split is stratified on the label
# so both halves keep the same fraud ratio, and it is seeded explicitly so that
# re-running this cell reproduces the same partition regardless of how much of
# the global random state has been consumed.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.5, shuffle=True, stratify=y, random_state=0
)

# Convert to plain NumPy arrays: 2-D feature matrices and 1-D label vectors
# (the label frame has a single column, selected here by position).
X_train = X_train.to_numpy()
X_test = X_test.to_numpy()
y_train = y_train.iloc[:, -1].to_numpy()
y_test = y_test.iloc[:, -1].to_numpy()

# Wrap both splits so they can be indexed and batched as tensors.
trainset = Dataset(X_train, y_train)
testset = Dataset(X_test, y_test)

In [None]:
from tools.setting.ml_params import ModelConfig
# Dataset description handed to the trainer: name, task type, and the
# input / label sizes computed from the prepared frames.
data_config = DataConfig(
    dataset_name='CreditCardFraudDetection',
    task_type='binary_classification',
    obs_shape=[n_features],
    label_size=n_classes,
)

# Model / training parameters: 'deepfm' as the core model, no encoder model,
# a default ModelConfig for the core, and two training epochs.
ml_params = MLParameters(core_model='deepfm', encoder_model='none')
ml_params.core_config = ModelConfig()
ml_params.training.num_epoch = 2

# Prefer CUDA when PyTorch can see a GPU; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

trainer_hub = TrainerHub(
    ml_params,
    data_config,
    device,
    use_print=True,
    use_wandb=False,
    use_full_eval=False,
)

In [None]:
# Run training through the hub, passing the training split first and the
# held-out split second (presumably used for evaluation — confirm against
# TrainerHub.train).
trainer_hub.train(trainset, testset)

### Data Preparation and Synthetic Data Generation

In this section of the notebook, we prepare the training data and use a causal cooperative network (CCNet) to generate synthetic data from the explanations derived from the original training data. The code cell below applies steps 2–4 to mini-batches drawn from a `DataLoader` rather than to the whole training set at once, and concatenates the per-batch results. Each step is described as follows:

1. **Data Loading**:
   - `training_data, training_labels = trainset[:]`
   This line extracts all the features and labels from `trainset`, the `Dataset` object built from the training split earlier in the notebook. The slice `[:]` retrieves every row of features and labels, converted to `float32` tensors.

2. **Device Assignment**:
   - `training_data = training_data.to(device)`
   - `training_labels = training_labels.to(device)`
   These lines transfer the training data and labels to a designated computing device (`device`). This device could be a CPU or a GPU and is typically specified to optimize computational efficiency. Moving data to the device ensures that all subsequent operations that require computation can leverage hardware acceleration.

3. **Data Explanation**:
   - `explanation = trainer_hub.core_ccnet.explain(training_data)`
   Here, the `explain` method of the `core_ccnet` module within `trainer_hub` is called with the training data. This function is expected to analyze the data and provide an "explanation" for each instance, which could be feature importances or another form of interpretable output that explains why certain predictions might be made from the data.

4. **Synthetic Data Generation**:
   - `recreated_data, recreated_label = trainer_hub.core_ccnet.generate(explanation)`
   This line generates synthetic data and labels by feeding the explanations obtained from the original data into the `generate` method of `core_ccnet`. The generate method uses the explanations to create new data instances that mimic or expand upon the patterns found in the original dataset. This is particularly useful for enhancing dataset diversity, balancing classes, or improving model robustness by providing additional training samples.

By the end of this process, `recreated_data` and `recreated_label` contain newly generated data and labels that can be used for further training, testing, or analysis to enhance the model's performance or robustness against various types of data inputs.


In [None]:
# Example: Reduce batch size
batch_size = 64  # Lower than the original batch size
# Full training split as tensors. The loop below reads from the DataLoader
# instead; these two names are kept only for inspection.
training_data, training_labels = trainset[:]
# Use DataLoader to handle smaller batches; shuffle=False keeps the generated
# rows in the same order as `trainset`.
train_loader = torch.utils.data.DataLoader(dataset=trainset, batch_size=batch_size, shuffle=False)

# Collect per-batch outputs and concatenate once at the end: calling torch.cat
# inside the loop re-copies everything accumulated so far on every iteration.
recreated_data_batches = []
recreated_label_batches = []
for data, labels in train_loader:
    data = data.to(device)
    explanation = trainer_hub.core_ccnet.explain(data)
    batch_recreated_data, batch_recreated_label = trainer_hub.core_ccnet.generate(explanation)
    # Detach so no autograd graph is kept alive for each stored batch; the
    # results are only used as data further down, never back-propagated through.
    recreated_data_batches.append(batch_recreated_data.detach())
    recreated_label_batches.append(batch_recreated_label.detach())

# Both results stay None when the loader yields no batches.
recreated_data = torch.cat(recreated_data_batches) if recreated_data_batches else None
recreated_label = torch.cat(recreated_label_batches) if recreated_label_batches else None


In [None]:
# Remove singleton axes in place: only axis 1 of the data (left untouched if
# its size is not 1), and every size-1 axis of the labels.
recreated_data.squeeze_(dim=1)
recreated_label.squeeze_()

# Detach from autograd, move to host memory, and convert both tensors to NumPy.
recreated_data_data_np, recreated_label_data_np = (
    tensor.detach().cpu().numpy() for tensor in (recreated_data, recreated_label)
)

# Wrap the generated samples so they can be batched like the original splits.
ccnet_balanced_dataset = Dataset(x=recreated_data_data_np, y=recreated_label_data_np)


In [None]:
class DNN(torch.nn.Module):
    """Fully connected network with ReLU activations and a sigmoid output.

    Layout: ``Linear(input_size, hidden_size) -> ReLU``, followed by
    ``num_layers - 2`` repetitions of ``Linear(hidden_size, hidden_size) -> ReLU``,
    and finally ``Linear(hidden_size, output_size) -> Sigmoid``.
    """

    def __init__(self, input_size, output_size, num_layers=3, hidden_size=128):
        super().__init__()
        self.input_size = input_size
        self.output_size = output_size
        self.hidden_size = hidden_size

        nn = torch.nn

        # Input projection.
        stack = [nn.Linear(input_size, hidden_size), nn.ReLU()]
        # Hidden blocks; the range is empty when num_layers <= 2.
        for _ in range(num_layers - 2):
            stack += [nn.Linear(hidden_size, hidden_size), nn.ReLU()]
        # Output projection squashed into (0, 1).
        stack += [nn.Linear(hidden_size, output_size), nn.Sigmoid()]

        # ModuleList registers every layer's parameters with this module.
        self.layers = nn.ModuleList(stack)

    def forward(self, x):
        """Apply the registered layers to `x` in order and return the result."""
        out = x
        for module in self.layers:
            out = module(out)
        return out


In [None]:
def train_supervised_model(model, trainset, num_epochs=1, batch_size=64, lr=0.001):
    """Fit `model` on `trainset` with Adam and binary cross-entropy.

    The model is updated in place and nothing is returned. The function relies
    on the notebook-level `device` and `set_random_seed` names.

    Args:
        model: torch.nn.Module whose output already lies in [0, 1] and has the
            same shape as the batched labels, as required by
            `binary_cross_entropy`.
        trainset: map-style dataset yielding ``(features, label)`` pairs.
        num_epochs: number of passes over `trainset` (default 1).
        batch_size: mini-batch size for the shuffled loader (default 64).
        lr: Adam learning rate (default 0.001).
    """
    # Make sure train-time behaviour is active even if the model was
    # previously switched to eval mode.
    model.train()
    optimizer = torch.optim.Adam(model.parameters(), lr=lr)
    # Re-seed before building the shuffled loader (presumably so every call
    # sees the same batch order — confirm what set_random_seed seeds).
    set_random_seed(0)
    trainloader = torch.utils.data.DataLoader(trainset, batch_size=batch_size, shuffle=True)
    for _ in range(num_epochs):
        for data, label in trainloader:
            data = data.to(device)
            label = label.to(device)
            # Clear gradients first so stale values left on the parameters
            # cannot leak into this step.
            optimizer.zero_grad()
            output = model(data)
            loss = torch.nn.functional.binary_cross_entropy(output, label)
            loss.backward()
            optimizer.step()

In [None]:
# Baseline classifier: a fresh DNN fitted on the original training split.
model_trained_on_original = DNN(n_features, n_classes)
model_trained_on_original = model_trained_on_original.to(device)

train_supervised_model(model=model_trained_on_original, trainset=trainset)

In [None]:
# Comparison classifier: same architecture, fitted on the CCNet-generated dataset.
model_trained_on_recreated = DNN(n_features, n_classes)
model_trained_on_recreated = model_trained_on_recreated.to(device)

train_supervised_model(model=model_trained_on_recreated, trainset=ccnet_balanced_dataset)

In [None]:
from sklearn.metrics import f1_score
import torch

def get_f1_score(model, testset, threshold=0.5, batch_size=64):
    """Return the binary F1 score of `model` on `testset`.

    The model is switched to eval mode and left there. The function relies on
    the notebook-level `device` and on `f1_score` imported from sklearn.

    Args:
        model: torch.nn.Module producing one probability per sample.
        testset: map-style dataset yielding ``(features, label)`` pairs.
        threshold: outputs strictly above this value are predicted as class 1
            (default 0.5).
        batch_size: mini-batch size used for inference (default 64).
    """
    model.eval()  # Set the model to evaluation mode
    y_true = []
    y_pred = []
    # Use a data loader so inference runs in bounded-size batches.
    data_loader = torch.utils.data.DataLoader(testset, batch_size=batch_size, shuffle=False)
    with torch.no_grad():  # No need to track gradients for inference
        for data, label in data_loader:
            output = model(data.to(device))
            # reshape(-1) instead of squeeze(): squeeze() collapses a final
            # batch of one sample to a 0-d tensor, which cannot be iterated
            # when extending the lists below.
            predicted = (output.reshape(-1) > threshold).long()
            # Store flat Python ints so both lists are plain 1-D sequences.
            y_true.extend(label.reshape(-1).long().tolist())
            y_pred.extend(predicted.cpu().tolist())
    # Compute F1 score, using 'binary' because this is a binary classification task
    return f1_score(y_true, y_pred, average='binary')

# Score both classifiers on the same held-out split: first the one trained on
# the original data, then the one trained on the recreated data.
f1_score_original, f1_score_recreated = (
    get_f1_score(candidate, testset)
    for candidate in (model_trained_on_original, model_trained_on_recreated)
)

print("F1 score of the supervised learning model trained on the original data: ", f1_score_original)
print("F1 score of the supervised learning model trained on the recreated data: ", f1_score_recreated)