# Data Preprocessing

In [None]:
import sys, os

# Add the parent directory containing IBM_GNN to sys.path so that
# `from IBM_GNN.IBM_dataset import ...` below can be resolved.
# Alternative checkout location, kept for reference:
# project_root = "/Users/hanbeobmun/Desktop/대학원/연구실/Fraud_detection_GNN"
project_root = "/home/beobmun/Fraud_detection_GNN"

# Guarded so that re-running this cell does not append the same entry twice.
if project_root not in sys.path:
    sys.path.append(project_root)

from IBM_GNN.IBM_dataset import IBM_Dataset
import numpy as np

# Input CSV files, given relative to the notebook's working directory.
TRANSACTIONS_CSV_PATH = '../data/IBM_Credit_Card_Transaction/credit_card_transactions-ibm_v2.csv'
USERS_CSV_PATH = '../data/IBM_Credit_Card_Transaction/sd254_users.csv'
CARDS_CSV_PATH = '../data/IBM_Credit_Card_Transaction/sd254_cards.csv'

# Read the three tables, preprocess each of them and build the node mappings
# through IBM_Dataset's chained (fluent) API.
try:
    dataset = (IBM_Dataset()
                .read_transactions_csv(TRANSACTIONS_CSV_PATH)
                .read_users_csv(USERS_CSV_PATH)
                .read_cards_csv(CARDS_CSV_PATH)
                .preprocess_transactions()
                .preprocess_users()
                .preprocess_cards()
                .create_node_mappings()
                )
except Exception as e:
    # Report the failure, then re-raise: every later cell reads `dataset`, so
    # swallowing the error here would only surface as an unrelated NameError
    # (or silently reuse a stale `dataset` from an earlier run of this cell).
    print(f"Error occurred: {e}")
    raise

In [None]:
# Show the dataset's edge-transaction table as the cell output (inspection only).
dataset.edge_transactions

In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import roc_auc_score, average_precision_score, precision_recall_curve, auc, f1_score

# One-hot encode the categorical 'Relation' column and place the indicator
# columns in front of the remaining transaction columns.
edge_transactions = dataset.edge_transactions
onehot_encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
relation_onehot = onehot_encoder.fit_transform(edge_transactions[['Relation']])
relation_types = onehot_encoder.get_feature_names_out(['Relation'])
relation_df = pd.DataFrame(
    data=relation_onehot,
    index=edge_transactions.index,  # keep row alignment with the source frame
    columns=relation_types,
)
# The raw column is removed before concatenating; the resulting frame is the
# same as concatenating first and dropping 'Relation' afterwards.
edge_transactions = pd.concat(
    [relation_df, edge_transactions.drop(columns=['Relation'])],
    axis=1,
)
edge_transactions.head()

### 날짜 기반 split

In [None]:
def get_edge_transactions(edge_transactions, start_date=None, end_date=None):
    """Select the rows of ``edge_transactions`` whose 'Date' lies in a window.

    The window is half-open, ``start_date <= Date < end_date``; either bound
    may be omitted. When both bounds are given and parse to the same
    timestamp, only rows with exactly that 'Date' are selected.

    Args:
        edge_transactions: DataFrame with a 'Date' column comparable to
            ``pd.to_datetime`` values.
        start_date: Inclusive lower bound (anything ``pd.to_datetime``
            accepts), or None for no lower bound.
        end_date: Exclusive upper bound, or None for no upper bound.

    Returns:
        The filtered rows with a fresh 0..n-1 index. When neither bound is
        given, the input frame itself is returned (same object, index kept).

    Raises:
        ValueError: If ``edge_transactions`` is None.
    """
    if edge_transactions is None:
        raise ValueError("Edge transactions dataframe is not loaded. Please call read_transactions_csv() and preprocess_transactions() first.")

    # No bounds at all: hand the frame back untouched.
    if start_date is None and end_date is None:
        return edge_transactions

    dates = edge_transactions['Date']
    if start_date is None:
        selected = dates < pd.to_datetime(end_date)
    elif end_date is None:
        selected = dates >= pd.to_datetime(start_date)
    else:
        lower = pd.to_datetime(start_date)
        upper = pd.to_datetime(end_date)
        if lower == upper:
            # A half-open window would be empty here, so match the day itself.
            selected = dates == lower
        else:
            selected = (dates >= lower) & (dates < upper)

    return edge_transactions.loc[selected].reset_index(drop=True)

In [None]:
start_date = '1996-01-01'
end_date = '2020-01-01'

# Calendar of every day in the study period; split points are positions in it.
days = pd.date_range(start=start_date, end=end_date, freq='D')
train_data, test_data = [], []
for i in range(5):
    # Fold i trains on the first (40 + 10*i)% of the calendar and tests on the
    # following 20%. The cut positions are computed with integer arithmetic:
    # accumulating a float (0.4, += 0.1, ...) drifts (0.7 + 0.1 gives
    # 0.7999999999999999) and can move a boundary by one day.
    t_e = len(days) * (4 + i) // 10
    train_end_date = days[t_e]
    test_start_date = days[t_e]
    # Clamp so that the last fold's end (100%) stays a valid position.
    test_end_date = days[min(len(days) * (6 + i) // 10, len(days) - 1)]
    print(train_end_date, test_start_date, test_end_date)
    # How the end bounds are treated (inclusive/exclusive) is decided by
    # whichever get_edge_transactions definition is currently in scope.
    train_d = get_edge_transactions(edge_transactions, end_date=train_end_date)
    test_d = get_edge_transactions(edge_transactions, start_date=test_start_date, end_date=test_end_date)
    train_data.append(train_d)
    test_data.append(test_d)

### 거래 횟수 기반 split

In [None]:
# transactions count based split

def get_edge_transactions(edge_transactions, start_date=None, end_date=None):
    """Select the rows of ``edge_transactions`` whose 'Date' lies in a closed window.

    Both bounds are inclusive here (``start_date <= Date <= end_date``) and
    either one may be omitted.

    Args:
        edge_transactions: DataFrame with a 'Date' column comparable to
            ``pd.to_datetime`` values.
        start_date: Inclusive lower bound (anything ``pd.to_datetime``
            accepts), or None for no lower bound.
        end_date: Inclusive upper bound, or None for no upper bound.

    Returns:
        The filtered rows with a fresh 0..n-1 index. When neither bound is
        given, the input frame itself is returned (same object, index kept).

    Raises:
        ValueError: If ``edge_transactions`` is None.
    """
    if edge_transactions is None:
        raise ValueError("Edge transactions dataframe is not loaded. Please call read_transactions_csv() and preprocess_transactions() first.")

    has_start = start_date is not None
    has_end = end_date is not None
    if not (has_start or has_end):
        return edge_transactions

    dates = edge_transactions['Date']
    if has_start and has_end:
        first_day = pd.to_datetime(start_date)
        last_day = pd.to_datetime(end_date)
        if first_day == last_day:
            keep = dates == first_day
        else:
            keep = (dates >= first_day) & (dates <= last_day)
    elif has_start:
        keep = dates >= pd.to_datetime(start_date)
    else:
        keep = dates <= pd.to_datetime(end_date)

    return edge_transactions.loc[keep].reset_index(drop=True)

# One tuple per fold: (train_start, train_end, test_start, test_end).
# NOTE(review): the dates are presumably chosen so that folds hold comparable
# numbers of transactions (see the section heading) - confirm.
days = [
    ('1996-01-01', '2011-06-10', '2011-06-11', '2014-05-23'),
    ('1996-01-01','2012-12-10', '2012-12-11', '2015-10-24'),
    ('1996-01-01','2014-05-23', '2014-05-24', '2017-03-20'),
    ('1996-01-01','2015-10-24', '2015-10-25', '2018-08-10'),
    ('1996-01-01','2017-03-20', '2017-03-21', '2019-12-31')
]

# Slice the transaction table once per fold and report the resulting sizes.
train_data, test_data = [], []
for fold_no, (train_start_date, train_end_date, test_start_date, test_end_date) in enumerate(days, start=1):
    train_d = get_edge_transactions(
        edge_transactions, start_date=train_start_date, end_date=train_end_date
    )
    test_d = get_edge_transactions(
        edge_transactions, start_date=test_start_date, end_date=test_end_date
    )
    train_data.append(train_d)
    test_data.append(test_d)
    print(f"Fold {fold_no}: Train ({len(train_d)}), Test ({len(test_d)})")


### Dataset_DNN

In [None]:
import torch

class Dataset(torch.utils.data.Dataset):
    """Row-wise transaction dataset for the DNN model.

    Each item is ``(features, mcc_idx, zip_idx, label)``: the float feature
    vector of one row, its 'MCC_idx' and 'Zip_idx' as length-1 long tensors,
    and its 'isFraud' value as a length-1 float tensor.
    """

    def __init__(self, transactions):
        """Convert the DataFrame ``transactions`` to tensors once, up front."""
        # Everything except identifiers, embedding indices, date and label is a feature.
        excluded = ['Src', 'Dest', 'MCC_idx', 'Zip_idx', 'Date', 'isFraud']
        feature_frame = transactions.drop(columns=excluded)

        self.transaction = torch.tensor(feature_frame.values, dtype=torch.float)
        # Double brackets keep a trailing axis of size 1: shape (N, 1).
        self.mcc_idx = torch.tensor(transactions[['MCC_idx']].values, dtype=torch.long)
        self.zip_idx = torch.tensor(transactions[['Zip_idx']].values, dtype=torch.long)
        fraud_flags = torch.tensor(transactions['isFraud'].values, dtype=torch.float)
        self.label = fraud_flags.unsqueeze(1)

    def __len__(self):
        """Number of rows."""
        return len(self.label)

    def __getitem__(self, idx):
        """Return ``(features, mcc_idx, zip_idx, label)`` for row ``idx``."""
        return (
            self.transaction[idx],
            self.mcc_idx[idx],
            self.zip_idx[idx],
            self.label[idx],
        )


### Dataset_LSTM

In [None]:
import torch
import torch.nn as nn
import pandas as pd
import numpy as np
from collections import defaultdict
from tqdm import tqdm

class Dataset_LSTM(torch.utils.data.Dataset):
    """Per-source transaction sequences for the LSTM model.

    One item corresponds to one 'Src' id and holds all of that id's rows (in
    the row order of the input frame) as a ``(seq_len, feature_dim)`` float
    tensor plus the matching ``(seq_len, 1)`` 'isFraud' labels. The feature
    vector of a row is its numeric columns followed by the Zip and MCC
    embedding vectors looked up in this dataset's own ``nn.Embedding`` tables.

    The embedding tables are created (randomly initialised) in the constructor
    and the lookups are done once, without gradient tracking, so the stored
    features are plain constants. Every instance owns separate tables.

    Args:
        combined_transactions: DataFrame with at least the columns 'Src',
            'Dest', 'MCC_idx', 'Zip_idx', 'Date' and 'isFraud'; all remaining
            columns are used as numeric features.
        src_ids: Iterable of 'Src' values to build sequences for.
        zip_emb_dim: Width of the Zip embedding.
        mcc_emb_dim: Width of the MCC embedding.
        num_zip_idx: Number of rows in the Zip embedding table.
        num_mcc_idx: Number of rows in the MCC embedding table.
        max_seq_len: Stored on the instance; not applied to the sequences.
    """

    # Columns that are identifiers, embedding indices, the date or the label.
    _NON_FEATURE_COLUMNS = ['Src', 'Dest', 'MCC_idx', 'Zip_idx', 'Date', 'isFraud']

    def __init__(self, combined_transactions, src_ids, zip_emb_dim, mcc_emb_dim, num_zip_idx, num_mcc_idx, max_seq_len=10):
        self.max_seq_len = max_seq_len

        self.zip_embedding = nn.Embedding(num_zip_idx, zip_emb_dim)
        self.mcc_embedding = nn.Embedding(num_mcc_idx, mcc_emb_dim)

        self.src_sequences = self._prepare_sequence(combined_transactions, src_ids)
        self.src_ids = list(self.src_sequences.keys())

        self.total_transactions = sum(len(seq['labels']) for seq in self.src_sequences.values())

    def _prepare_sequence(self, combined_transactions, src_ids):
        """Build ``{src_id: {'features': Tensor, 'labels': Tensor}}`` for ``src_ids``."""
        src_ids = list(src_ids)
        transactions = combined_transactions[combined_transactions['Src'].isin(src_ids)].reset_index(drop=True)

        # Look the embeddings up without autograd. With tracking enabled, every
        # cached feature tensor would hang off one shared graph, and the second
        # backward() of a model trained on these features would fail because
        # that graph's buffers are freed by the first one.
        with torch.no_grad():
            embed_zip = self.zip_embedding(torch.tensor(transactions['Zip_idx'].values, dtype=torch.long))
            embed_mcc = self.mcc_embedding(torch.tensor(transactions['MCC_idx'].values, dtype=torch.long))

        numeric = torch.tensor(transactions.drop(columns=self._NON_FEATURE_COLUMNS).values, dtype=torch.float)
        all_features = torch.cat([numeric, embed_zip, embed_mcc], dim=1)
        all_labels = torch.tensor(transactions['isFraud'].values, dtype=torch.float).unsqueeze(1)

        # Row positions per source id, computed in one pass instead of one
        # boolean scan of the whole table per id.
        rows_by_src = transactions.groupby('Src', sort=False).indices
        no_rows = torch.empty(0, dtype=torch.long)

        src_sequences = defaultdict(lambda: {'features': [], 'labels': []})
        with tqdm(total=len(src_ids), desc="Preparing sequences") as pbar:
            for src_id in src_ids:
                positions = rows_by_src.get(src_id)
                # An id without any row still gets an (empty) entry.
                row_idx = no_rows if positions is None else torch.as_tensor(positions, dtype=torch.long)
                src_sequences[src_id]['features'] = all_features[row_idx]
                src_sequences[src_id]['labels'] = all_labels[row_idx]
                pbar.update(1)

        return src_sequences

    def __len__(self):
        """Number of source ids (sequences)."""
        return len(self.src_ids)

    def __getitem__(self, idx):
        """Return ``(features, labels, src_id)`` for the ``idx``-th source id."""
        src_id = self.src_ids[idx]
        seq = self.src_sequences[src_id]
        return seq['features'], seq['labels'], src_id
    
def collate_fn_lstm(batch):
    """Collate variable-length sequences into padded, length-sorted tensors.

    Args:
        batch: List of ``(features, labels, src_id)`` items, where ``features``
            and ``labels`` are tensors whose first dimension is the sequence
            length.

    Returns:
        Tuple ``(padded_features, padded_labels, lengths_sorted,
        src_ids_sorted)``; all four follow the same order, longest sequence
        first. The padded tensors are batch-first and zero-padded.
    """
    seq_lengths = torch.tensor(
        [sample[0].size(0) for sample in batch], dtype=torch.long
    )
    lengths_sorted, order = seq_lengths.sort(descending=True)

    # Reorder whole samples once, then split them into their three fields.
    ordered = [batch[pos] for pos in order.tolist()]
    padded_features = nn.utils.rnn.pad_sequence(
        [sample[0] for sample in ordered], batch_first=True
    )
    padded_labels = nn.utils.rnn.pad_sequence(
        [sample[1] for sample in ordered], batch_first=True
    )
    src_ids_sorted = [sample[2] for sample in ordered]

    return padded_features, padded_labels, lengths_sorted, src_ids_sorted

# DL Methods

## Model Architecture

In [None]:
class FocalLoss(nn.Module):
    """Binary focal loss on probabilities (not logits).

    Each element's binary cross-entropy is scaled by a class weight
    (``alpha`` where the target is 1, ``1 - alpha`` where it is 0) and by
    ``(1 - p_t) ** gamma``, where ``p_t`` is the probability assigned to the
    true class.

    Args:
        alpha: Weight of the positive class.
        gamma: Exponent of the modulating factor.
        reduction: 'mean' or 'sum'; any other value returns the unreduced
            element-wise loss.
    """

    def __init__(self, alpha=0.25, gamma=2.0, reduction='mean'):
        super().__init__()
        self.alpha = alpha
        self.gamma = gamma
        self.reduction = reduction

    def forward(self, inputs, targets):
        """Return the focal loss of probabilities ``inputs`` against ``targets``."""
        # Cross-entropy is taken on the raw inputs; only the weighting terms
        # below use the clamped probabilities.
        per_element_bce = F.binary_cross_entropy(inputs, targets, reduction='none')
        probs = torch.clamp(inputs, min=1e-7, max=1 - 1e-7)

        negatives = 1 - targets
        p_true = probs * targets + (1 - probs) * negatives
        class_weight = self.alpha * targets + (1 - self.alpha) * negatives
        loss = class_weight * (1 - p_true) ** self.gamma * per_element_bce

        if self.reduction == 'mean':
            return loss.mean()
        if self.reduction == 'sum':
            return loss.sum()
        return loss

### DNN

In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F

class DNN(nn.Module):
    """Feed-forward fraud classifier over single transactions.

    The numeric transaction features are concatenated with learned Zip and
    MCC embeddings and passed through two hidden layers (128 and 64 units,
    ReLU); the output is a sigmoid probability of shape ``(batch, 1)``.

    Args:
        transactions_dim: Number of numeric features per transaction.
        zip_emb_dim: Width of the Zip embedding.
        mcc_emb_dim: Width of the MCC embedding.
        num_zip_idx: Number of rows in the Zip embedding table.
        num_mcc_idx: Number of rows in the MCC embedding table.
    """

    def __init__(self, transactions_dim, zip_emb_dim, mcc_emb_dim, num_zip_idx, num_mcc_idx):
        super().__init__()
        self.zip_embedding = nn.Embedding(num_zip_idx, zip_emb_dim)
        self.mcc_embedding = nn.Embedding(num_mcc_idx, mcc_emb_dim)

        concat_dim = transactions_dim + zip_emb_dim + mcc_emb_dim
        self.fc1 = nn.Linear(concat_dim, 128)
        self.fc2 = nn.Linear(128, 64)
        self.fc3 = nn.Linear(64, 1)
        self.dropout = nn.Dropout(0.2)

    def forward(self, transactions, zip_idx, mcc_idx):
        """Return fraud probabilities for a batch.

        Note the argument order: the Zip indices come before the MCC indices.
        """
        # squeeze(1) removes the size-1 axis that (batch, 1) index tensors
        # leave after the embedding lookup.
        embedded = [
            self.zip_embedding(zip_idx).squeeze(1),
            self.mcc_embedding(mcc_idx).squeeze(1),
        ]
        hidden = torch.cat([transactions, *embedded], dim=1)
        hidden = F.relu(self.fc1(hidden))
        # Dropout is applied only after the second hidden layer.
        hidden = self.dropout(F.relu(self.fc2(hidden)))
        return torch.sigmoid(self.fc3(hidden))

### LSTM

In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence

class LSTMModel(nn.Module):
    """LSTM that emits one sigmoid score per time step of each sequence.

    Args:
        input_dim: Feature width of every time step.
        hidden_dim: LSTM hidden size.
        num_layers: Number of stacked LSTM layers.
        output_dim: Number of scores per time step.
        dropout_rate: Dropout in the classifier head and, when there is more
            than one layer, between the LSTM layers.
    """

    def __init__(self, input_dim, hidden_dim, num_layers, output_dim, dropout_rate=0.2):
        super().__init__()
        self.hidden_dim = hidden_dim
        self.num_layers = num_layers

        # nn.LSTM only applies dropout between layers, so pass 0 for a single layer.
        inter_layer_dropout = dropout_rate if num_layers > 1 else 0
        self.lstm = nn.LSTM(
            input_dim,
            hidden_dim,
            num_layers,
            batch_first=True,
            dropout=inter_layer_dropout,
        )

        head_dim = hidden_dim // 2
        self.classifier = nn.Sequential(
            nn.Linear(hidden_dim, head_dim),
            nn.ReLU(),
            nn.Dropout(dropout_rate),
            nn.Linear(head_dim, output_dim),
            nn.Sigmoid(),
        )

    def forward(self, sequences, lengths):
        """Score every time step.

        Args:
            sequences: Padded batch-first input, ``(batch, max_len, input_dim)``.
            lengths: True length of each sequence (any order is accepted).

        Returns:
            Tensor ``(batch, longest_length_in_batch, output_dim)``; positions
            beyond a sequence's length are scores of zero-padded LSTM output.
        """
        packed = pack_padded_sequence(
            sequences, lengths.cpu(), batch_first=True, enforce_sorted=False
        )
        packed_states, _ = self.lstm(packed)
        states, _ = pad_packed_sequence(packed_states, batch_first=True)

        n_sequences, n_steps = states.size(0), states.size(1)
        # Flatten batch and time so the head sees a 2-D input, then restore.
        flat_scores = self.classifier(states.reshape(-1, self.hidden_dim))
        return flat_scores.reshape(n_sequences, n_steps, -1)

## Train

In [None]:
import matplotlib.pyplot as plt

def plot_metrics(result):
    """Plot train/validation loss, ROC-AUC and PR-AUC curves side by side.

    Args:
        result: Per-epoch metric table (e.g. a DataFrame) that can be indexed
            by 'train_loss', 'val_loss', 'train_roc_auc', 'val_roc_auc',
            'train_pr_auc' and 'val_pr_auc'; ``len(result)`` is taken as the
            number of epochs.
    """
    fig, axes = plt.subplots(1, 3, figsize=(18, 4))
    epochs = np.arange(1, len(result) + 1)

    # One entry per panel:
    # (train column, validation column, title, y-axis label, fix y to [0, 1]?)
    panels = (
        ('train_loss', 'val_loss', 'Loss', 'Loss', False),
        ('train_roc_auc', 'val_roc_auc', 'ROC AUC', 'ROC AUC', True),
        ('train_pr_auc', 'val_pr_auc', 'PR AUC (Average Precision)', 'PR AUC', True),
    )

    for ax, (train_col, val_col, title, y_label, unit_range) in zip(axes, panels):
        ax.plot(epochs, result[train_col], label=train_col)
        ax.plot(epochs, result[val_col], label=val_col)
        ax.set_title(title)
        ax.set_xlabel('Epoch')
        ax.set_ylabel(y_label)
        ax.legend(loc='upper left')
        ax.grid(alpha=0.3)
        if unit_range:
            # The AUC panels share a fixed scale; the loss panel autoscales.
            ax.set_ylim(0.0, 1.0)

    plt.tight_layout()
    plt.show()

In [None]:
import os

class EarlyStopping:
    """Stop training when a higher-is-better validation metric stops improving.

    Call the instance once per epoch with the metric and the model. On the
    first call, and whenever the metric is not below ``best_score + delta``,
    the model's ``state_dict`` is saved and the patience counter is reset;
    otherwise the counter is incremented and ``early_stop`` becomes True once
    it reaches ``patience``.

    Args:
        patience: Number of consecutive non-improving calls before stopping.
        verbose: Print a message on every save and every counter increment.
        delta: Minimum increase over the best score that counts as improvement.
        path: Where to save the checkpoint. A path with a file extension
            (e.g. ``'runs/fold_1.pt'``) that is not an existing directory is
            used as the checkpoint file itself; anything else is treated as a
            directory and the file is ``<path>/checkpoint.pt``. Missing
            directories are created.
        metric_name: Name of the metric, used in the verbose messages only.
    """

    def __init__(self, patience=5, verbose=False, delta=0, path='training_results', metric_name='metric'):
        self.patience = patience
        self.verbose = verbose
        self.counter = 0
        self.best_score = None
        self.early_stop = False
        self.delta = delta
        self.path = path
        self.metric_name = metric_name

    def __call__(self, val_metric, model):
        score = val_metric

        # "Not below best + delta" (rather than ">=") keeps the original
        # treatment of every value, including ones that do not compare.
        improved = self.best_score is None or not (score < self.best_score + self.delta)
        if improved:
            # save_checkpoint records the new best score after printing, so the
            # verbose message can still show the previous one.
            self.save_checkpoint(val_metric, model)
            self.counter = 0
            return

        self.counter += 1
        if self.verbose:
            print(f'EarlyStopping counter: {self.counter} out of {self.patience}')
        if self.counter >= self.patience:
            self.early_stop = True

    def _checkpoint_file(self):
        """Resolve ``self.path`` to the file the state dict is written to."""
        has_extension = bool(os.path.splitext(self.path)[1])
        if has_extension and not os.path.isdir(self.path):
            return self.path
        return os.path.join(self.path, 'checkpoint.pt')

    def save_checkpoint(self, val_metric, model):
        """Save ``model.state_dict()`` and record ``val_metric`` as the best score."""
        if self.verbose:
            previous = 'None' if self.best_score is None else f'{self.best_score:.6f}'
            print(f'Validation {self.metric_name} improved ({previous} --> {val_metric:.6f}).  Saving model ...')
        target = self._checkpoint_file()
        parent = os.path.dirname(target)
        if parent:
            # Create the directory the file actually lives in (os.makedirs('')
            # would raise for a bare file name).
            os.makedirs(parent, exist_ok=True)
        torch.save(model.state_dict(), target)
        self.best_score = val_metric

In [None]:
import numpy as np
from sklearn.metrics import precision_recall_curve, auc, f1_score

def get_max_f1_score(y_true, y_pred_proba):
    """Return the best F1 score along the precision-recall curve.

    Args:
        y_true: Ground-truth binary labels.
        y_pred_proba: Predicted scores/probabilities for the positive class.

    Returns:
        Tuple ``(max_f1, optimal_threshold)``: the highest F1 over the points
        returned by ``precision_recall_curve`` and the threshold at that point.
    """
    precision, recall, thresholds = precision_recall_curve(y_true, y_pred_proba)
    # The small constant avoids 0/0 where precision and recall are both zero.
    f1_curve = 2 * (precision * recall) / (precision + recall + 1e-10)
    best_idx = np.argmax(f1_curve)
    return np.max(f1_curve), thresholds[best_idx]

### DNN

In [None]:
from tqdm import tqdm

# Width of the numeric feature vector: every column that is not an
# identifier, an embedding index, the date or the label.
TRANSACTIONS_DIM = train_data[0].drop(columns=['Src', 'Dest', 'MCC_idx', 'Zip_idx', 'Date', 'isFraud']).shape[1]
ZIP_EMB_DIM = 64
MCC_EMB_DIM = 32
NUM_ZIP_IDX = len(dataset.zip_to_idx)
NUM_MCC_IDX = len(dataset.mcc_to_idx)
# Metrics of the best checkpoint of every fold, filled at the end of each fold.
final_metrics = {
    'roc_auc': [],
    'pr_auc': [],
    'f1_score': []
}

for i in range(5):
    # A fresh model, loss and optimizer per fold.
    model = DNN(transactions_dim=TRANSACTIONS_DIM,
                zip_emb_dim=ZIP_EMB_DIM,
                mcc_emb_dim=MCC_EMB_DIM,
                num_zip_idx=NUM_ZIP_IDX,
                num_mcc_idx=NUM_MCC_IDX)

    criterion = FocalLoss()
    optimizer = torch.optim.AdamW(model.parameters(), lr=1e-3)

    epochs = 10
    batch_size = 16384

    train_dataset = Dataset(train_data[i])
    train_dataloader = torch.utils.data.DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
    test_dataset = Dataset(test_data[i])
    test_dataloader = torch.utils.data.DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

    # NOTE(review): early stopping and checkpoint selection below use the
    # fold's test split (logged under the 'val_*' keys), so the final test
    # numbers are selected on the data they are reported on.
    early_stopping = EarlyStopping(patience=5, path=f'training_results/DNN/checkpoint_fold_{i+1}.pt', metric_name='val_pr_auc')

    metrics = {
        'train_loss': [], 'train_roc_auc': [], 'train_pr_auc': [], 'train_f1_score': [],
        'val_loss': [], 'val_roc_auc': [], 'val_pr_auc': [], 'val_f1_score': []
    }

    for epoch in range(epochs):
        # ---- training pass ----
        model.train()
        all_labels = []
        all_outputs = []
        # The bar counts samples, so its total is the dataset size (the last
        # batch is usually smaller than batch_size).
        with tqdm(total=len(train_dataset), desc=f"Fold {i+1} Training Epoch {epoch+1}/{epochs}", ncols=100, leave=False) as pbar:
            epoch_loss = 0.0
            # The dataset yields (features, mcc, zip, label); the model takes
            # (features, zip, mcc).
            for transactions, mcc_idx, zip_idx, labels in train_dataloader:
                optimizer.zero_grad()
                outputs = model(transactions, zip_idx, mcc_idx)
                loss_value = criterion(outputs, labels)
                loss_value.backward()
                optimizer.step()
                # Weight the batch mean by the batch size for a per-sample average.
                epoch_loss += loss_value.item() * transactions.size(0)
                all_labels.extend(labels.cpu().numpy())
                all_outputs.extend(outputs.detach().cpu().numpy())
                pbar.update(transactions.size(0))
        epoch_loss /= len(train_dataloader.dataset)
        metrics['train_loss'].append(epoch_loss)
        # Convert to numpy arrays for stable metric computation
        all_labels = np.array(all_labels).flatten()
        all_outputs = np.array(all_outputs).flatten()
        train_roc_auc = roc_auc_score(all_labels, all_outputs)
        train_pr_auc = average_precision_score(all_labels, all_outputs)
        train_f1_score, train_f1_threshold = get_max_f1_score(all_labels, all_outputs)
        metrics['train_roc_auc'].append(train_roc_auc)
        metrics['train_pr_auc'].append(train_pr_auc)
        metrics['train_f1_score'].append(train_f1_score)

        # ---- evaluation pass on the fold's test split ----
        model.eval()
        with torch.no_grad():
            all_labels = []
            all_outputs = []
            test_loss = 0.0
            with tqdm(total=len(test_dataset), desc=f"Fold {i+1} Testing Epoch {epoch+1}/{epochs}", ncols=100, leave=False) as pbar:
                for transactions, mcc_idx, zip_idx, labels in test_dataloader:
                    outputs = model(transactions, zip_idx, mcc_idx)
                    loss_value = criterion(outputs, labels)
                    test_loss += loss_value.item() * transactions.size(0)
                    all_labels.extend(labels.cpu().numpy())
                    all_outputs.extend(outputs.detach().cpu().numpy())
                    pbar.update(transactions.size(0))
            test_loss /= len(test_dataloader.dataset)
            all_labels = np.array(all_labels).flatten()
            all_outputs = np.array(all_outputs).flatten()
            test_roc_auc = roc_auc_score(all_labels, all_outputs)
            test_pr_auc = average_precision_score(all_labels, all_outputs)
            test_f1_score, test_f1_threshold = get_max_f1_score(all_labels, all_outputs)
            metrics['val_loss'].append(test_loss)
            metrics['val_roc_auc'].append(test_roc_auc)
            metrics['val_pr_auc'].append(test_pr_auc)
            metrics['val_f1_score'].append(test_f1_score)
        print(f"Fold {i+1} Train Epoch {epoch+1}/{epochs}, Loss: {epoch_loss:.4f}, ROC_AUC: {train_roc_auc:.4f}, PR_AUC: {train_pr_auc:.4f}, F1_Score: {train_f1_score:.4f}(th:{train_f1_threshold:.4f}) | Test Loss {test_loss:.4f}, ROC_AUC: {test_roc_auc:.4f}, PR_AUC: {test_pr_auc:.4f}, F1_Score: {test_f1_score:.4f}(th:{test_f1_threshold:.4f})")
        early_stopping(test_pr_auc, model)
        if early_stopping.early_stop:
            print(f"Early stopping fold {i+1} at epoch {epoch+1}")
            break
    plot_metrics(pd.DataFrame(metrics))

    # ---- final evaluation with the best checkpoint of this fold ----
    model.load_state_dict(torch.load(f'training_results/DNN/checkpoint_fold_{i+1}.pt'))
    model.eval()

    final_test_labels = []
    final_test_outputs = []
    with torch.no_grad():
        with tqdm(total=len(test_dataset), desc=f"Fold {i+1} Final Testing", ncols=100, leave=False) as pbar:
            for transactions, mcc_idx, zip_idx, labels in test_dataloader:
                # Keep the (batch, 1) shape: a bare .squeeze() turns a final
                # batch of one sample into a 0-d tensor, which list.extend()
                # cannot iterate. Flattening happens once, below.
                outputs = model(transactions, zip_idx, mcc_idx)
                final_test_labels.extend(labels.cpu().numpy())
                final_test_outputs.extend(outputs.detach().cpu().numpy())
                pbar.update(transactions.size(0))
    final_test_labels = np.array(final_test_labels).flatten()
    final_test_outputs = np.array(final_test_outputs).flatten()
    final_test_roc_auc = roc_auc_score(final_test_labels, final_test_outputs)
    final_test_pr_auc = average_precision_score(final_test_labels, final_test_outputs)
    final_test_f1_score = get_max_f1_score(final_test_labels, final_test_outputs)[0]
    final_metrics['roc_auc'].append(final_test_roc_auc)
    final_metrics['pr_auc'].append(final_test_pr_auc)
    final_metrics['f1_score'].append(final_test_f1_score)

    

In [None]:
# Re-evaluate the saved per-fold DNN checkpoints without retraining.
# Relies on names defined by the training cell: the *_DIM / NUM_* constants,
# `batch_size`, `test_data` and the checkpoint files it wrote.
final_metrics = {
    'roc_auc': [],
    'pr_auc': [],
    'f1_score': []
}

for i in range(5):
    model = DNN(transactions_dim=TRANSACTIONS_DIM,
                zip_emb_dim=ZIP_EMB_DIM,
                mcc_emb_dim=MCC_EMB_DIM,
                num_zip_idx=NUM_ZIP_IDX,
                num_mcc_idx=NUM_MCC_IDX)

    model.load_state_dict(torch.load(f'training_results/DNN/checkpoint_fold_{i+1}.pt'))
    model.eval()

    test_dataset = Dataset(test_data[i])
    test_dataloader = torch.utils.data.DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

    final_test_labels = []
    final_test_outputs = []
    with torch.no_grad():
        # The bar counts samples, so its total is the dataset size.
        with tqdm(total=len(test_dataset), desc=f"Fold {i+1} Final Testing", ncols=100, leave=False) as pbar:
            for transactions, mcc_idx, zip_idx, labels in test_dataloader:
                # Keep the (batch, 1) shape: a bare .squeeze() turns a final
                # batch of one sample into a 0-d tensor, which list.extend()
                # cannot iterate. Flattening happens once, below.
                outputs = model(transactions, zip_idx, mcc_idx)
                final_test_labels.extend(labels.cpu().numpy())
                final_test_outputs.extend(outputs.detach().cpu().numpy())
                pbar.update(transactions.size(0))
    final_test_labels = np.array(final_test_labels).flatten()
    final_test_outputs = np.array(final_test_outputs).flatten()
    final_test_roc_auc = roc_auc_score(final_test_labels, final_test_outputs)
    final_test_pr_auc = average_precision_score(final_test_labels, final_test_outputs)
    final_test_f1_score = get_max_f1_score(final_test_labels, final_test_outputs)[0]
    final_metrics['roc_auc'].append(final_test_roc_auc)
    final_metrics['pr_auc'].append(final_test_pr_auc)
    final_metrics['f1_score'].append(final_test_f1_score)

In [None]:
# Mean and (population) standard deviation of each final metric across folds.
print("Final 5-Fold Cross Validation Results:")
roc_scores = final_metrics['roc_auc']
pr_scores = final_metrics['pr_auc']
f1_scores = final_metrics['f1_score']
print(f"Average ROC AUC: {np.mean(roc_scores):.4f} ± {np.std(roc_scores):.4f}")
print(f"Average PR AUC: {np.mean(pr_scores):.4f} ± {np.std(pr_scores):.4f}")
print(f"Average F1 Score: {np.mean(f1_scores):.4f} ± {np.std(f1_scores):.4f}")

In [None]:
# ROC AUC, PR AUC and F1 of the third fold (list position 2), comma-separated.
fold_roc_auc = final_metrics['roc_auc'][2]
fold_pr_auc = final_metrics['pr_auc'][2]
fold_f1 = final_metrics['f1_score'][2]
print(f"{fold_roc_auc:.4f}, {fold_pr_auc:.4f}, {fold_f1:.4f}")

In [None]:
# Mean of 'isFraud' over the first 60%, the next 20% and the last 20% of the
# rows, taken by row position in dataset.edge_transactions.
fraud_flags = dataset.edge_transactions[['isFraud']]
n_rows = len(fraud_flags)
cut_60 = int(n_rows * 0.6)
cut_80 = int(n_rows * 0.8)
for segment in (slice(None, cut_60), slice(cut_60, cut_80), slice(cut_80, None)):
    print(fraud_flags.iloc[segment].mean())


### LSTM

In [None]:
from tqdm import tqdm
import pandas as pd
import random

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

TRANSACTIONS_DIM = train_data[0].drop(columns=['Src', 'Dest', 'MCC_idx', 'Zip_idx', 'Date', 'isFraud']).shape[1]
ZIP_EMB_DIM = 64
MCC_EMB_DIM = 32
NUM_ZIP_IDX = len(dataset.zip_to_idx)
NUM_MCC_IDX = len(dataset.mcc_to_idx)
batch_size = 64

final_metrics = {
    'roc_auc': [],
    'pr_auc': [],
    'f1_score': []
}

for i in range(5):
    # Data preparation for LSTM
    combined_data = pd.concat([train_data[i], test_data[i]], ignore_index=True)
    src_ids = combined_data['Src'].unique()
    random.seed(42)
    train_ids = random.sample(list(src_ids), int(len(src_ids)*0.8))
    test_ids = set(src_ids) - set(train_ids)
    print(f"Fold {i+1}: Train IDs ({len(train_ids)}), Test IDs ({len(test_ids)})")
    train_dataset = Dataset_LSTM(combined_data, train_ids, ZIP_EMB_DIM, MCC_EMB_DIM, NUM_ZIP_IDX, NUM_MCC_IDX)
    test_dataset = Dataset_LSTM(combined_data, test_ids, ZIP_EMB_DIM, MCC_EMB_DIM, NUM_ZIP_IDX, NUM_MCC_IDX)

    train_dataloader = torch.utils.data.DataLoader(train_dataset, batch_size=batch_size, shuffle=True, collate_fn=collate_fn_lstm)
    test_dataloader = torch.utils.data.DataLoader(test_dataset, batch_size=batch_size, shuffle=False, collate_fn=collate_fn_lstm)
    print(f"Fold {i+1}: Train sequences ({len(train_dataset)}), Test sequences ({len(test_dataset)})")
    # Model, criterion, optimizer
    model = LSTMModel(input_dim=TRANSACTIONS_DIM + ZIP_EMB_DIM + MCC_EMB_DIM,
                      hidden_dim=128,
                      num_layers=2,
                      output_dim=1,
                      dropout_rate=0.2)
    model.to(device)

    criterion = FocalLoss()
    optimizer = torch.optim.AdamW(model.parameters(), lr=1e-3)

    epochs = 10

    early_stopping = EarlyStopping(patience=5, path=f'training_results/LSTM/fold{i+1}', metric_name='val_pr_auc')
    metrics = {
        'train_loss': [], 'train_roc_auc': [], 'train_pr_auc': [], 'train_f1_score': [],
        'val_loss': [], 'val_roc_auc': [], 'val_pr_auc': [], 'val_f1_score': []
    }

    for epoch in range(epochs):
        # Training loop
        model.train()
        total_loss = 0.0
        all_preds_flat = []
        all_labels_flat = []
        
        with tqdm(total=len(train_dataloader)*batch_size, desc=f"Fold {i+1} Training Epoch {epoch+1}/{epochs}", ncols=100, leave=False) as pbar:
            for padded_features, padded_labels, lengths, src_ids in train_dataloader:
                padded_features = padded_features.to(device)
                padded_labels = padded_labels.to(device)
                lengths = lengths.to(device)

                optimizer.zero_grad()
                predictions = model(padded_features, lengths)

                valid_predictions = []
                valid_labels = []
                for b in range(predictions.size(0)):
                    valid_predictions.append(predictions[b, :lengths[b], :])
                    valid_labels.append(padded_labels[b, :lengths[b], :])

                valid_predictions = torch.cat(valid_predictions, dim=0).squeeze(-1)
                valid_labels = torch.cat(valid_labels, dim=0).squeeze(-1)

                if valid_labels.numel() == 0:
                    batch_loss = torch.tensor(0.0)
                else:
                    batch_loss = criterion(valid_predictions, valid_labels)

                batch_loss.backward()
                optimizer.step()

                total_loss += batch_loss.item()

                all_preds_flat.extend(valid_predictions.detach().cpu().numpy())
                all_labels_flat.extend(valid_labels.cpu().numpy())

                pbar.update(padded_features.size(0))

        avg_loss = total_loss / len(train_dataloader)
        metrics['train_loss'].append(avg_loss)
        all_labels_flat = np.array(all_labels_flat).flatten()
        all_preds_flat = np.array(all_preds_flat).flatten()
        train_roc_auc = roc_auc_score(all_labels_flat, all_preds_flat)
        train_pr_auc = average_precision_score(all_labels_flat, all_preds_flat)
        train_f1_score, train_f1_threshold = get_max_f1_score(all_labels_flat, all_preds_flat)
        metrics['train_roc_auc'].append(train_roc_auc)
        metrics['train_pr_auc'].append(train_pr_auc)
        metrics['train_f1_score'].append(train_f1_score)

        # Validation loop
        model.eval()
        val_total_loss = 0.0
        val_all_preds_flat = []
        val_all_labels_flat = []

        with torch.no_grad():
            with tqdm(total=len(test_dataloader)*batch_size, desc=f"Fold {i+1} Testing Epoch {epoch+1}/{epochs}", ncols=100, leave=False) as pbar:
                for padded_features, padded_labels, lengths, src_ids in test_dataloader:
                    padded_features = padded_features.to(device)
                    padded_labels = padded_labels.to(device)
                    lengths = lengths.to(device)
                    
                    predictions = model(padded_features, lengths)

                    valid_predictions = []
                    valid_labels = []
                    for b in range(predictions.size(0)):
                        valid_predictions.append(predictions[b, :lengths[b], :])
                        valid_labels.append(padded_labels[b, :lengths[b], :])

                    valid_predictions = torch.cat(valid_predictions, dim=0).squeeze(-1)
                    valid_labels = torch.cat(valid_labels, dim=0).squeeze(-1)

                    if valid_labels.numel() == 0:
                        batch_loss = torch.tensor(0.0)
                    else:
                        batch_loss = criterion(valid_predictions, valid_labels)

                    val_total_loss += batch_loss.item()

                    val_all_preds_flat.extend(valid_predictions.detach().cpu().numpy())
                    val_all_labels_flat.extend(valid_labels.cpu().numpy())

                    pbar.update(padded_features.size(0))
        val_avg_loss = val_total_loss / len(test_dataloader)
        metrics['val_loss'].append(val_avg_loss)
        val_all_labels_flat = np.array(val_all_labels_flat).flatten()
        val_all_preds_flat = np.array(val_all_preds_flat).flatten()
        val_roc_auc = roc_auc_score(val_all_labels_flat, val_all_preds_flat)
        val_pr_auc = average_precision_score(val_all_labels_flat, val_all_preds_flat)
        val_f1_score, val_f1_threshold = get_max_f1_score(val_all_labels_flat, val_all_preds_flat)
        metrics['val_roc_auc'].append(val_roc_auc)
        metrics['val_pr_auc'].append(val_pr_auc)
        metrics['val_f1_score'].append(val_f1_score)

        print(f"Fold {i+1} Train Epoch {epoch+1}/{epochs}, Loss: {avg_loss:.4f}, ROC_AUC: {train_roc_auc:.4f}, PR_AUC: {train_pr_auc:.4f}, F1_Score: {train_f1_score:.4f}(th:{train_f1_threshold:.4f}) | Test Loss {val_avg_loss:.4f}, ROC_AUC: {val_roc_auc:.4f}, PR_AUC: {val_pr_auc:.4f}, F1_Score: {val_f1_score:.4f}(th:{val_f1_threshold:.4f})")
        early_stopping(val_pr_auc, model)
        if early_stopping.early_stop:
            print(f"Early stopping fold {i+1} at epoch {epoch+1}")
            break
    plot_metrics(pd.DataFrame(metrics))
