Perfect with Train and Test

In [7]:
import torch
import torch.nn as nn
import pandas as pd
from torch.utils.data import Dataset, DataLoader
import numpy as np
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score

# Dataset class for grouped sequences of transactions by `cc_num`
class CreditCardFraudDataset(Dataset):
    def __init__(self, file_path, seq_len):
        self.data = pd.read_csv(file_path)

        # Convert date to timestamp
        self.data['trans_date_trans_time'] = pd.to_datetime(self.data['trans_date_trans_time']).apply(lambda x: x.timestamp())

        # Encode category column
        self.label_encoder = LabelEncoder()
        self.data['category'] = self.label_encoder.fit_transform(self.data['category'])

        # Normalize features
        scaler = MinMaxScaler()
        self.data[['amt']] = scaler.fit_transform(self.data[['amt']])

        # Group transactions by `cc_num` and create sequences
        self.seq_len = seq_len
        self.sequences = []
        grouped = self.data.groupby('cc_num')
        for _, group in grouped:
            group = group[['category', 'amt', 'is_fraud', 'trans_date_trans_time']].values
            for i in range(len(group)):
                if i < self.seq_len - 1:
                    padding = [group[0]] * (self.seq_len - i - 1)
                    seq = padding + group[:i + 1].tolist()
                else:
                    seq = group[i - self.seq_len + 1:i + 1].tolist()

                label = group[i, -2]  # Fraud label of the current transaction
                time_intervals = np.diff([s[-1] for s in seq], prepend=seq[0][-1])  # Time differences
                time_intervals = time_intervals.reshape(-1, 1)  # Reshape for compatibility
                seq_features = np.array([s[:-1] for s in seq])  # Remove timestamp from features
                seq_features = np.concatenate((seq_features, time_intervals), axis=1)  # Add time intervals as feature
                self.sequences.append((seq_features, label))

    def __len__(self):
        return len(self.sequences)

    def __getitem__(self, idx):
        x_seq, y_label = self.sequences[idx]
        return torch.tensor(x_seq, dtype=torch.float32), torch.tensor(y_label, dtype=torch.float32)


# GRU-based Model with Dropout
class GRUModel(nn.Module):
    def __init__(self, input_dim, hidden_dim, memory_size):
        super(GRUModel, self).__init__()
        self.hidden_dim = hidden_dim
        self.memory_size = memory_size

        # GRU layer
        self.gru = nn.GRU(input_dim, hidden_dim, batch_first=True)
        self.dropout = nn.Dropout(p=0.5)  # Dropout layer

        # Attention mechanism
        self.attention_fc1 = nn.Linear(hidden_dim, hidden_dim)
        self.attention_fc2 = nn.Linear(hidden_dim, 1)

        # Output layer
        self.classifier = nn.Linear(hidden_dim, 1)

    def forward(self, X_seq):
        batch_size, seq_len, _ = X_seq.size()

        # Features (excluding delta_t)
        x_features = X_seq[:, :, :-1]

        # GRU forward pass
        h_0 = torch.zeros(1, batch_size, self.hidden_dim).to(X_seq.device)
        gru_out, _ = self.gru(x_features, h_0)

        # Attention mechanism
        attention_scores = torch.tanh(self.attention_fc1(gru_out))
        attention_weights = torch.softmax(self.attention_fc2(attention_scores), dim=1)
        e_t = torch.sum(attention_weights * gru_out, dim=1)

        # Apply dropout
        e_t = self.dropout(e_t)

        # Final classification
        y_pred = torch.sigmoid(self.classifier(e_t))
        return y_pred

# Training and evaluation remain the same


# Training and testing
if __name__ == "__main__":
    batch_size = 32
    input_dim = 3  # category, amt, delta_t
    hidden_dim = 4
    memory_size = 10
    seq_len = 10

    train_dataset = CreditCardFraudDataset("/home/ducanh/Credit Card Transactions Fraud Detection/fraudTrain.csv", seq_len=seq_len)
    test_dataset = CreditCardFraudDataset("/home/ducanh/Credit Card Transactions Fraud Detection/fraudTest.csv", seq_len=seq_len)

    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
    test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

    model = GRUModel(input_dim, hidden_dim, memory_size).to('cuda' if torch.cuda.is_available() else 'cpu')
    optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
    criterion = nn.BCELoss()

    epochs = 3
    for epoch in range(epochs):
        model.train()
        total_loss = 0
        y_true_epoch = []
        y_pred_epoch = []

        for X_batch, y_batch in train_loader:
            X_batch, y_batch = X_batch.to('cuda'), y_batch.view(-1, 1).to('cuda')
            optimizer.zero_grad()
            y_pred = model(X_batch)
            loss = criterion(y_pred, y_batch)
            loss.backward()
            optimizer.step()
            total_loss += loss.item()

            y_true_epoch.extend(y_batch.cpu().numpy())
            y_pred_epoch.extend((y_pred.detach().cpu().numpy() >= 0.5).astype(int))

        epoch_accuracy = accuracy_score(y_true_epoch, y_pred_epoch)
        print(f"Epoch [{epoch + 1}/{epochs}] Loss: {total_loss / len(train_loader):.4f} | Accuracy: {epoch_accuracy:.4f}")

    # Testing
    model.eval()
    y_true_test = []
    y_pred_test_prob = []

    with torch.no_grad():
        for X_batch, y_batch in test_loader:
            X_batch, y_batch = X_batch.to('cuda'), y_batch.view(-1, 1).to('cuda')
            y_pred = model(X_batch)
            y_true_test.extend(y_batch.cpu().numpy())
            y_pred_test_prob.extend(y_pred.cpu().numpy())

    thresholds = [0.1, 0.2, 0.3, 0.4, 0.5]
    for threshold in thresholds:
        y_pred_test_binary = (np.array(y_pred_test_prob) >= threshold).astype(int)
        accuracy = accuracy_score(y_true_test, y_pred_test_binary)
        precision = precision_score(y_true_test, y_pred_test_binary)
        recall = recall_score(y_true_test, y_pred_test_binary)
        f1 = f1_score(y_true_test, y_pred_test_binary)
        auc = roc_auc_score(y_true_test, y_pred_test_binary)

        print(f"Threshold: {threshold:.2f} | Accuracy: {accuracy:.4f} | Precision: {precision:.4f} | Recall: {recall:.4f} | F1: {f1:.4f} | AUC: {auc:.4f}")

Epoch [1/3] Loss: 0.0133 | Accuracy: 0.9983
Epoch [2/3] Loss: 0.0043 | Accuracy: 0.9992
Epoch [3/3] Loss: 0.0024 | Accuracy: 0.9996
Threshold: 0.10 | Accuracy: 1.0000 | Precision: 1.0000 | Recall: 1.0000 | F1: 1.0000 | AUC: 1.0000
Threshold: 0.20 | Accuracy: 1.0000 | Precision: 1.0000 | Recall: 1.0000 | F1: 1.0000 | AUC: 1.0000


KeyboardInterrupt: 

In [8]:
import torch
import torch.nn as nn
import pandas as pd
from torch.utils.data import Dataset, DataLoader
import numpy as np
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
import random

# Dataset class for grouped sequences of transactions by `cc_num`
class CreditCardFraudDataset(Dataset):
    def __init__(self, data, seq_len):
        self.data = data.copy()

        # Convert date to timestamp
        self.data['trans_date_trans_time'] = pd.to_datetime(self.data['trans_date_trans_time']).apply(lambda x: x.timestamp())

        # Encode category column
        self.label_encoder = LabelEncoder()
        self.data['category'] = self.label_encoder.fit_transform(self.data['category'])

        # Normalize features
        scaler = MinMaxScaler()
        self.data[['amt']] = scaler.fit_transform(self.data[['amt']])

        # Group transactions by `cc_num` and create sequences
        self.seq_len = seq_len
        self.sequences = []
        grouped = self.data.groupby('cc_num')
        for _, group in grouped:
            group = group[['category', 'amt', 'is_fraud', 'trans_date_trans_time']].values
            for i in range(len(group)):
                if i < self.seq_len - 1:
                    padding = [group[0]] * (self.seq_len - i - 1)
                    seq = padding + group[:i + 1].tolist()
                else:
                    seq = group[i - self.seq_len + 1:i + 1].tolist()

                label = group[i, -2]  # Fraud label of the current transaction
                time_intervals = np.diff([s[-1] for s in seq], prepend=seq[0][-1])  # Time differences
                time_intervals = time_intervals.reshape(-1, 1)  # Reshape for compatibility
                seq_features = np.array([s[:-1] for s in seq])  # Remove timestamp from features
                seq_features = np.concatenate((seq_features, time_intervals), axis=1)  # Add time intervals as feature
                self.sequences.append((seq_features, label))

    def __len__(self):
        return len(self.sequences)

    def __getitem__(self, idx):
        x_seq, y_label = self.sequences[idx]
        return torch.tensor(x_seq, dtype=torch.float32), torch.tensor(y_label, dtype=torch.float32)


# GRU-based Model with Dropout and Attention
class GRUModel(nn.Module):
    def __init__(self, input_dim, hidden_dim, memory_size):
        super(GRUModel, self).__init__()
        self.hidden_dim = hidden_dim
        self.memory_size = memory_size
        self.dropout = nn.Dropout(p=0.5)  # Dropout: randomly drops 30% of neurons

        # GRU layer
        self.gru = nn.GRU(input_dim, hidden_dim, batch_first=True)

        # Attention mechanism
        self.attention_fc1 = nn.Linear(hidden_dim, hidden_dim)
        self.attention_fc2 = nn.Linear(hidden_dim, 1)

        # Output layer
        self.classifier = nn.Linear(hidden_dim, 1)

    def forward(self, X_seq):
        batch_size, seq_len, _ = X_seq.size()

        # Features and time intervals
        x_features = X_seq[:, :, :-1]  # Exclude time intervals
        delta_t = X_seq[:, :, -1].unsqueeze(-1)  # Time intervals

        # GRU forward pass
        h_0 = torch.zeros(1, batch_size, self.hidden_dim).to(X_seq.device)  # Initial hidden state
        gru_out, _ = self.gru(x_features, h_0)  # Shape: (batch_size, seq_len, hidden_dim)

        # Attention scores
        attention_scores = torch.tanh(self.attention_fc1(gru_out))  # Shape: (batch_size, seq_len, hidden_dim)
        attention_weights = torch.softmax(self.attention_fc2(attention_scores), dim=1)  # Shape: (batch_size, seq_len, 1)

        # Apply attention weights to GRU outputs
        e_t = torch.sum(attention_weights * gru_out, dim=1)  # Shape: (batch_size, hidden_dim)

        # Apply dropout
        e_t = self.dropout(e_t)

        # Final classification
        y_pred = torch.sigmoid(self.classifier(e_t))  # Shape: (batch_size, 1)
        return y_pred


# Training loop with L2 Regularization (weight_decay)
if __name__ == "__main__":
    batch_size = 32
    input_dim = 3  # category, amt, delta_t
    hidden_dim = 64  # Reduced from 64 for efficiency
    memory_size = 10
    seq_len = 10
    epochs = 1

    # Load dataset
    train_data = pd.read_csv("/home/ducanh/Credit Card Transactions Fraud Detection/fraudTrain.csv")

    # Split the training data (80% train, 20% validation)
    all_cc_nums = train_data['cc_num'].unique()
    random.shuffle(all_cc_nums)
    train_cc_nums = all_cc_nums[:int(0.8 * len(all_cc_nums))]
    val_cc_nums = all_cc_nums[int(0.8 * len(all_cc_nums)):]

    new_train_data = train_data[train_data['cc_num'].isin(train_cc_nums)]
    new_val_data = train_data[train_data['cc_num'].isin(val_cc_nums)]

    train_dataset = CreditCardFraudDataset(new_train_data, seq_len=seq_len)
    val_dataset = CreditCardFraudDataset(new_val_data, seq_len=seq_len)

    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
    val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)

    # Define model, optimizer, and loss function with weight decay (L2 Regularization)
    model = GRUModel(input_dim, hidden_dim, memory_size).to('cuda' if torch.cuda.is_available() else 'cpu')
    optimizer = torch.optim.Adam(model.parameters(), lr=0.001, weight_decay=1e-4)
    criterion = nn.BCELoss()

    # Training loop
    for epoch in range(epochs):
        model.train()
        total_loss = 0

        for X_batch, y_batch in train_loader:
            X_batch, y_batch = X_batch.to('cuda'), y_batch.view(-1, 1).to('cuda')
            optimizer.zero_grad()
            y_pred = model(X_batch)
            loss = criterion(y_pred, y_batch)
            loss.backward()
            optimizer.step()
            total_loss += loss.item()

        print(f"Epoch [{epoch + 1}/{epochs}] Loss: {total_loss / len(train_loader):.4f}")

    print("Training complete.")

# Function to evaluate the model
def evaluate_model(model, data_loader, criterion):
    model.eval()
    y_true = []
    y_pred_prob = []
    total_loss = 0

    with torch.no_grad():
        for X_batch, y_batch in data_loader:
            X_batch, y_batch = X_batch.to('cuda'), y_batch.view(-1, 1).to('cuda')
            y_pred = model(X_batch)
            loss = criterion(y_pred, y_batch)
            total_loss += loss.item()

            y_true.extend(y_batch.cpu().numpy())  # True labels
            y_pred_prob.extend(y_pred.cpu().numpy())  # Predicted probabilities

    # Convert probabilities to binary predictions with threshold 0.5
    y_pred_binary = (np.array(y_pred_prob) >= 0.5).astype(int)

    # Calculate metrics
    accuracy = accuracy_score(y_true, y_pred_binary)
    precision = precision_score(y_true, y_pred_binary, zero_division=0)
    recall = recall_score(y_true, y_pred_binary)
    f1 = f1_score(y_true, y_pred_binary)
    auc = roc_auc_score(y_true, y_pred_prob)

    avg_loss = total_loss / len(data_loader)
    
    return avg_loss, accuracy, precision, recall, f1, auc

# After training, evaluate the model on the validation set
val_loss, val_accuracy, val_precision, val_recall, val_f1, val_auc = evaluate_model(model, val_loader, criterion)

# Print validation metrics
print("\nValidation Results:")
print(f"Validation Loss: {val_loss:.4f}")
print(f"Accuracy: {val_accuracy:.4f}")
print(f"Precision: {val_precision:.4f}")
print(f"Recall: {val_recall:.4f}")
print(f"F1-Score: {val_f1:.4f}")
print(f"AUC: {val_auc:.4f}")



Epoch [1/1] Loss: 0.0027
Training complete.

Validation Results:
Validation Loss: 0.0004
Accuracy: 0.9999
Precision: 1.0000
Recall: 0.9797
F1-Score: 0.9898
AUC: 1.0000
