In [1]:
import torch
import torch.nn as nn
import pandas as pd
import numpy as np
from torch.utils.data import Dataset, DataLoader
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
from geopy.distance import geodesic  # Compute distance between transactions

# Dataset class with only 4 features: category, amt, distance_km, delta_t
class CreditCardFraudDataset(Dataset):
    def __init__(self, file_path, seq_len):
        self.data = pd.read_csv(file_path)

        # Convert transaction date to timestamp
        self.data['trans_date_trans_time'] = pd.to_datetime(self.data['trans_date_trans_time']).apply(lambda x: x.timestamp())

        # Encode category column
        self.label_encoder = LabelEncoder()
        self.data['category'] = self.label_encoder.fit_transform(self.data['category'])

        # Normalize numerical features
        scaler = MinMaxScaler()
        self.data[['amt']] = scaler.fit_transform(self.data[['amt']])

        # Compute distance between transactions
        self.data['prev_lat'] = self.data.groupby('cc_num')['lat'].shift(1)
        self.data['prev_long'] = self.data.groupby('cc_num')['long'].shift(1)
        self.data['distance_km'] = self.data.apply(
            lambda row: geodesic((row['lat'], row['long']), (row['prev_lat'], row['prev_long'])).km
            if not pd.isnull(row['prev_lat']) else 0, axis=1
        )

        # Normalize distance
        self.data[['distance_km']] = scaler.fit_transform(self.data[['distance_km']])

        # Group transactions by `cc_num` and create sequences
        self.seq_len = seq_len
        self.sequences = []
        grouped = self.data.groupby('cc_num')

        for _, group in grouped:
            # Selecting only 4 features: category, amt, distance_km, time_intervals (delta_t)
            # Lưu ý: dữ liệu ban đầu có 5 cột: category, amt, distance_km, is_fraud, trans_date_trans_time
            group = group[['category', 'amt', 'distance_km', 'is_fraud', 'trans_date_trans_time']].values

            for i in range(len(group)):
                if i < self.seq_len - 1:
                    padding = [group[0]] * (self.seq_len - i - 1)
                    seq = padding + group[:i + 1].tolist()
                else:
                    seq = group[i - self.seq_len + 1:i + 1].tolist()

                label = group[i, -2]  # Fraud label
                time_intervals = np.diff([s[-1] for s in seq], prepend=seq[0][-1])  # Time differences
                time_intervals = time_intervals.reshape(-1, 1)

                # Final sequence with only 4 features: category, amt, distance_km, delta_t
                # Lưu ý: dùng s[:-1] loại bỏ cột timestamp (trans_date_trans_time)
                # => Các cột còn lại là: category, amt, distance_km, is_fraud
                # Chúng ta cần loại bỏ is_fraud vì nó dùng làm nhãn, do đó thay đổi thành s[:-2]
                seq_features = np.array([s[:-2] for s in seq])  # Remove is_fraud and timestamp
                seq_features = np.concatenate((seq_features, time_intervals), axis=1)  # Add time intervals

                self.sequences.append((seq_features, label))

    def __len__(self):
        return len(self.sequences)

    def __getitem__(self, idx):
        x_seq, y_label = self.sequences[idx]
        return torch.tensor(x_seq, dtype=torch.float32), torch.tensor(y_label, dtype=torch.float32)

# --- Bổ sung bước cuối để kiểm tra shape của seq_features ---
if __name__ == "__main__":
    # Thay 'your_file.csv' bằng đường dẫn file CSV thật của bạn.
    file_path = "/home/ducanh/Credit Card Transactions Fraud Detection/Datasets/fraudTrain.csv"
    seq_len = 5  # Ví dụ: chuỗi gồm 5 giao dịch
    dataset = CreditCardFraudDataset(file_path, seq_len)
    
    print("Tổng số sequence:", len(dataset))
    print("-" * 50)
    # Kiểm tra shape của 5 sequence đầu tiên
    for i in range(min(5, len(dataset))):
        x_seq, y_label = dataset[i]
        print(f"Sequence {i+1}: shape {x_seq.shape}")
        # Nếu bạn muốn xem chi tiết các giá trị, hãy bỏ comment dòng dưới đây:
        # print(x_seq)
        print("-" * 50)

Tổng số sequence: 1296675
--------------------------------------------------
Sequence 1: shape torch.Size([5, 4])
--------------------------------------------------
Sequence 2: shape torch.Size([5, 4])
--------------------------------------------------
Sequence 3: shape torch.Size([5, 4])
--------------------------------------------------
Sequence 4: shape torch.Size([5, 4])
--------------------------------------------------
Sequence 5: shape torch.Size([5, 4])
--------------------------------------------------


In [2]:
import torch
import torch.nn as nn
import pandas as pd
import numpy as np
from torch.utils.data import Dataset, DataLoader
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from geopy.distance import geodesic  # Compute distance between transactions

# Dataset class with only 4 features: category, amt, distance_km, delta_t
class CreditCardFraudDataset(Dataset):
    def __init__(self, file_path, seq_len):
        self.data = pd.read_csv(file_path)

        # Convert transaction date to timestamp
        self.data['trans_date_trans_time'] = pd.to_datetime(self.data['trans_date_trans_time']).apply(lambda x: x.timestamp())

        # Encode category column
        self.label_encoder = LabelEncoder()
        self.data['category'] = self.label_encoder.fit_transform(self.data['category'])

        # Normalize numerical features
        scaler = MinMaxScaler()
        self.data[['amt']] = scaler.fit_transform(self.data[['amt']])

        # Compute distance between transactions
        self.data['prev_lat'] = self.data.groupby('cc_num')['lat'].shift(1)
        self.data['prev_long'] = self.data.groupby('cc_num')['long'].shift(1)
        self.data['distance_km'] = self.data.apply(
            lambda row: geodesic((row['lat'], row['long']), (row['prev_lat'], row['prev_long'])).km
            if not pd.isnull(row['prev_lat']) else 0, axis=1
        )

        # Normalize distance
        self.data[['distance_km']] = scaler.fit_transform(self.data[['distance_km']])

        # Group transactions by `cc_num` and create sequences
        self.seq_len = seq_len
        self.sequences = []
        grouped = self.data.groupby('cc_num')

        for _, group in grouped:
            # Selecting only 4 features: category, amt, distance_km, time_intervals (delta_t)
            # Dữ liệu ban đầu có 5 cột: [category, amt, distance_km, is_fraud, trans_date_trans_time]
            group = group[['category', 'amt', 'distance_km', 'is_fraud', 'trans_date_trans_time']].values

            for i in range(len(group)):
                if i < self.seq_len - 1:
                    padding = [group[0]] * (self.seq_len - i - 1)
                    seq = padding + group[:i + 1].tolist()
                else:
                    seq = group[i - self.seq_len + 1:i + 1].tolist()

                label = group[i, -2]  # Fraud label (is_fraud)
                time_intervals = np.diff([s[-1] for s in seq], prepend=seq[0][-1])  # Compute delta_t
                time_intervals = time_intervals.reshape(-1, 1)

                # Final sequence with only 4 features: category, amt, distance_km, delta_t
                # Loại bỏ 2 cột cuối: is_fraud và trans_date_trans_time
                seq_features = np.array([s[:-2] for s in seq])
                # Nối thêm delta_t (time_intervals)
                seq_features = np.concatenate((seq_features, time_intervals), axis=1)

                self.sequences.append((seq_features, label))

    def __len__(self):
        return len(self.sequences)

    def __getitem__(self, idx):
        x_seq, y_label = self.sequences[idx]
        return torch.tensor(x_seq, dtype=torch.float32), torch.tensor(y_label, dtype=torch.float32)

# LSTM Model with Time & Location-aware modules
class STGN_LSTM(nn.Module):
    def __init__(self, input_dim, hidden_dim):
        super(STGN_LSTM, self).__init__()
        self.hidden_dim = hidden_dim

        # LSTM Gates with Time & Location-aware module
        self.Wfh = nn.Linear(hidden_dim, hidden_dim)
        self.Wfx = nn.Linear(input_dim, hidden_dim)
        self.bf = nn.Parameter(torch.zeros(hidden_dim))

        self.Wih = nn.Linear(hidden_dim, hidden_dim)
        self.Wix = nn.Linear(input_dim, hidden_dim)
        self.bi = nn.Parameter(torch.zeros(hidden_dim))

        self.WTh = nn.Linear(hidden_dim, hidden_dim)
        self.WTx = nn.Linear(input_dim, hidden_dim)
        self.WTt = nn.Linear(1, hidden_dim)
        self.bT = nn.Parameter(torch.zeros(hidden_dim))

        self.WLh = nn.Linear(hidden_dim, hidden_dim)
        self.WLx = nn.Linear(input_dim, hidden_dim)
        self.WLdelta = nn.Linear(1, hidden_dim)
        self.bL = nn.Parameter(torch.zeros(hidden_dim))

        self.classifier = nn.Linear(hidden_dim, 1)

    def forward(self, X_seq):
        batch_size, seq_len, _ = X_seq.shape
        h_prev = torch.zeros(batch_size, self.hidden_dim).to(X_seq.device)
        c_prev = torch.zeros(batch_size, self.hidden_dim).to(X_seq.device)

        for t in range(seq_len):
            # Lấy x_t từ các cột đầu tiên, loại bỏ 2 cột cuối (delta_t và delta_L)
            x_t = X_seq[:, t, :-2]
            # Tách riêng delta_t và delta_L từ 2 cột cuối
            delta_t = X_seq[:, t, -2].view(-1, 1)
            delta_L = X_seq[:, t, -1].view(-1, 1)
            
            # In ra shape của x_t tại mỗi bước thời gian
            print(f"At timestep {t}, x_t shape: {x_t.shape}")

            # Tính các cổng (gates)
            f_t = torch.sigmoid(self.Wfh(h_prev) + self.Wfx(x_t) + self.bf)
            i_t = torch.sigmoid(self.Wih(h_prev) + self.Wix(x_t) + self.bi)
            T_t = torch.sigmoid(self.WTh(h_prev) + self.WTx(x_t) + self.WTt(delta_t) + self.bT)
            L_t = torch.sigmoid(self.WLh(h_prev) + self.WLx(x_t) + self.WLdelta(delta_L) + self.bL)

            c_t = f_t * c_prev + i_t * torch.tanh(T_t * delta_t + L_t * delta_L)
            h_t = torch.tanh(c_t)
            h_prev, c_prev = h_t, c_t  # Cập nhật trạng thái cho bước tiếp theo

        return torch.sigmoid(self.classifier(h_t))

# --- Main ---
if __name__ == "__main__":
    # Thay 'your_file.csv' bằng đường dẫn file CSV thật của bạn
    file_path = "/home/ducanh/Credit Card Transactions Fraud Detection/Datasets/fraudTrain.csv"
    seq_len = 5  # Ví dụ: mỗi sequence gồm 5 giao dịch
    dataset = CreditCardFraudDataset(file_path, seq_len)
    print("Tổng số sequence:", len(dataset))
    print("-" * 50)
    
    # Kiểm tra shape của sequence đầu tiên
    x_seq, y_label = dataset[0]
    print("Shape của sequence đầu tiên (x_seq):", x_seq.shape)
    print("-" * 50)
    
    # Khởi tạo model với input_dim = 4 (category, amt, distance_km, delta_t) và hidden_dim = 64
    model = STGN_LSTM(input_dim=4, hidden_dim=64)
    
    # Thêm batch dimension (giả sử batch size = 1)
    x_seq = x_seq.unsqueeze(0)  # Shape: (1, seq_len, 4)
    print("Shape của x_seq có batch dimension:", x_seq.shape)
    print("-" * 50)
    
    # Forward pass qua model (sẽ in ra shape của x_t tại mỗi timestep)
    output = model(x_seq)
    print("Output của model:", output)

Tổng số sequence: 1296675
--------------------------------------------------
Shape của sequence đầu tiên (x_seq): torch.Size([5, 4])
--------------------------------------------------
Shape của x_seq có batch dimension: torch.Size([1, 5, 4])
--------------------------------------------------
At timestep 0, x_t shape: torch.Size([1, 2])


RuntimeError: mat1 and mat2 shapes cannot be multiplied (1x2 and 4x64)

In [None]:
import torch
import torch.nn as nn
import pandas as pd
import numpy as np
from torch.utils.data import Dataset, DataLoader
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
from geopy.distance import geodesic  # Compute distance between transactions

# Dataset class with only 4 features: category, amt, distance_km, delta_t
class CreditCardFraudDataset(Dataset):
    def __init__(self, file_path, seq_len):
        self.data = pd.read_csv(file_path)

        # Convert transaction date to timestamp
        self.data['trans_date_trans_time'] = pd.to_datetime(self.data['trans_date_trans_time']).apply(lambda x: x.timestamp())

        # Encode category column
        self.label_encoder = LabelEncoder()
        self.data['category'] = self.label_encoder.fit_transform(self.data['category'])

        # Normalize numerical features
        scaler = MinMaxScaler()
        self.data[['amt']] = scaler.fit_transform(self.data[['amt']])

        # Compute distance between transactions
        self.data['prev_lat'] = self.data.groupby('cc_num')['lat'].shift(1)
        self.data['prev_long'] = self.data.groupby('cc_num')['long'].shift(1)
        self.data['distance_km'] = self.data.apply(
            lambda row: geodesic((row['lat'], row['long']), (row['prev_lat'], row['prev_long'])).km
            if not pd.isnull(row['prev_lat']) else 0, axis=1
        )

        # Normalize distance
        self.data[['distance_km']] = scaler.fit_transform(self.data[['distance_km']])

        # Group transactions by `cc_num` and create sequences
        self.seq_len = seq_len
        self.sequences = []
        grouped = self.data.groupby('cc_num')

        for _, group in grouped:
            # Selecting only 4 features: category, amt, distance_km, time_intervals (delta_t)
            group = group[['category', 'amt', 'distance_km', 'is_fraud', 'trans_date_trans_time']].values

            for i in range(len(group)):
                if i < self.seq_len - 1:
                    padding = [group[0]] * (self.seq_len - i - 1)
                    seq = padding + group[:i + 1].tolist()
                else:
                    seq = group[i - self.seq_len + 1:i + 1].tolist()

                label = group[i, -2]  # Fraud label
                time_intervals = np.diff([s[-1] for s in seq], prepend=seq[0][-1])  # Time differences
                time_intervals = time_intervals.reshape(-1, 1)

                # Final sequence with only 4 features: category, amt, distance_km, delta_t
                seq_features = np.array([s[:-1] for s in seq])  # Remove timestamp
                seq_features = np.concatenate((seq_features, time_intervals), axis=1)  # Add time intervals

                self.sequences.append((seq_features, label))

    def __len__(self):
        return len(self.sequences)

    def __getitem__(self, idx):
        x_seq, y_label = self.sequences[idx]
        return torch.tensor(x_seq, dtype=torch.float32), torch.tensor(y_label, dtype=torch.float32)

In [None]:
import torch
import torch.nn as nn
import pandas as pd
import numpy as np
from torch.utils.data import Dataset, DataLoader
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
from geopy.distance import geodesic  # Compute distance between transactions

# Dataset class with only 4 features: category, amt, distance_km, delta_t
class CreditCardFraudDataset(Dataset):
    def __init__(self, file_path, seq_len):
        self.data = pd.read_csv(file_path)

        # Convert transaction date to timestamp
        self.data['trans_date_trans_time'] = pd.to_datetime(self.data['trans_date_trans_time']).apply(lambda x: x.timestamp())

        # Encode category column
        self.label_encoder = LabelEncoder()
        self.data['category'] = self.label_encoder.fit_transform(self.data['category'])

        # Normalize numerical features
        scaler = MinMaxScaler()
        self.data[['amt']] = scaler.fit_transform(self.data[['amt']])

        # Compute distance between transactions
        self.data['prev_lat'] = self.data.groupby('cc_num')['lat'].shift(1)
        self.data['prev_long'] = self.data.groupby('cc_num')['long'].shift(1)
        self.data['distance_km'] = self.data.apply(
            lambda row: geodesic((row['lat'], row['long']), (row['prev_lat'], row['prev_long'])).km
            if not pd.isnull(row['prev_lat']) else 0, axis=1
        )

        # Normalize distance
        self.data[['distance_km']] = scaler.fit_transform(self.data[['distance_km']])

        # Group transactions by `cc_num` and create sequences
        self.seq_len = seq_len
        self.sequences = []
        grouped = self.data.groupby('cc_num')

        for _, group in grouped:
            # Selecting only 4 features: category, amt, distance_km, time_intervals (delta_t)
            group = group[['category', 'amt', 'distance_km', 'is_fraud', 'trans_date_trans_time']].values

            for i in range(len(group)):
                if i < self.seq_len - 1:
                    padding = [group[0]] * (self.seq_len - i - 1)
                    seq = padding + group[:i + 1].tolist()
                else:
                    seq = group[i - self.seq_len + 1:i + 1].tolist()

                label = group[i, -2]  # Fraud label
                time_intervals = np.diff([s[-1] for s in seq], prepend=seq[0][-1])  # Time differences
                time_intervals = time_intervals.reshape(-1, 1)

                # Final sequence with only 4 features: category, amt, distance_km, delta_t
                seq_features = np.array([s[:-1] for s in seq])  # Remove timestamp
                seq_features = np.concatenate((seq_features, time_intervals), axis=1)  # Add time intervals

                self.sequences.append((seq_features, label))

    def __len__(self):
        return len(self.sequences)

    def __getitem__(self, idx):
        x_seq, y_label = self.sequences[idx]
        return torch.tensor(x_seq, dtype=torch.float32), torch.tensor(y_label, dtype=torch.float32)


# LSTM Model for Fraud Detection with 4 features
class STGN_LSTM(nn.Module):
    def __init__(self, input_dim, hidden_dim):
        super(STGN_LSTM, self).__init__()
        self.hidden_dim = hidden_dim

        # LSTM Gates with Time & Location-aware module
        self.Wfh = nn.Linear(hidden_dim, hidden_dim)
        self.Wfx = nn.Linear(input_dim, hidden_dim)
        self.bf = nn.Parameter(torch.zeros(hidden_dim))

        self.Wih = nn.Linear(hidden_dim, hidden_dim)
        self.Wix = nn.Linear(input_dim, hidden_dim)
        self.bi = nn.Parameter(torch.zeros(hidden_dim))

        self.WTh = nn.Linear(hidden_dim, hidden_dim)
        self.WTx = nn.Linear(input_dim, hidden_dim)
        self.WTt = nn.Linear(1, hidden_dim)
        self.bT = nn.Parameter(torch.zeros(hidden_dim))

        self.WLh = nn.Linear(hidden_dim, hidden_dim)
        self.WLx = nn.Linear(input_dim, hidden_dim)
        self.WLdelta = nn.Linear(1, hidden_dim)
        self.bL = nn.Parameter(torch.zeros(hidden_dim))

        self.classifier = nn.Linear(hidden_dim, 1)

    def forward(self, X_seq):
        batch_size, seq_len, _ = X_seq.shape
        h_prev = torch.zeros(batch_size, self.hidden_dim).to(X_seq.device)
        c_prev = torch.zeros(batch_size, self.hidden_dim).to(X_seq.device)

        for t in range(seq_len):
            x_t = X_seq[:, t, :-2]  # Features excluding delta_t, delta L
            delta_t = X_seq[:, t, -2].view(-1, 1)  # Time interval (delta_t)  
            delta_L = X_seq[:, t, -1].view(-1, 1)  # Distance interval (delta_L)
            # Gates
            f_t = torch.sigmoid(self.Wfh(h_prev) + self.Wfx(x_t) + self.bf)
            i_t = torch.sigmoid(self.Wih(h_prev) + self.Wix(x_t) + self.bi)
            T_t = torch.sigmoid(self.WTh(h_prev) + self.WTx(x_t) + self.WTt(delta_t) + self.bT)
            L_t = torch.sigmoid(self.WLh(h_prev) + self.WLx(x_t) + self.WLdelta(delta_L) + self.bL)

            c_t = f_t * c_prev + i_t * torch.tanh(T_t * delta_t + L_t * delta_L)
            h_t = torch.tanh(c_t)

        return torch.sigmoid(self.classifier(h_t))

batch_size = 32
input_dim = 4 # Now using only 4 features: category, amt, distance_km, delta_t
hidden_dim = 64
seq_len = 5

train_dataset = CreditCardFraudDataset("/home/ducanh/Credit Card Transactions Fraud Detection/Datasets/fraudTrain.csv", seq_len)
test_dataset = CreditCardFraudDataset("/home/ducanh/Credit Card Transactions Fraud Detection/Datasets/fraudTest.csv", seq_len)

train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)