In [67]:
import torch
import torch.nn as nn
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import faiss


In [68]:
# Load the CLS embeddings that were previously obtained from SAINT.
# NOTE(review): the file name embeds the repr of a `time.struct_time`, so this path
# points at one specific export; it has to be updated whenever embeddings are regenerated.

path = "../5_embeddings/cls_embeddings_time.struct_time(tm_year=2025, tm_mon=2, tm_mday=7, tm_hour=19, tm_min=40, tm_sec=22, tm_wday=4, tm_yday=38, tm_isdst=0).npy"
cls_embeddings = np.load(path)

# Printed as (rows, embedding width); a later cell slices this array row-wise into splits.
print(cls_embeddings.shape)

(118108, 32)


In [69]:
# Training feature table; the bare `train_df` expression renders it as the cell output.
train_df = pd.read_csv("../2_dataset/final/train_df.csv")
train_df

Unnamed: 0.1,cls,ProductCD,card4,card6,P_emaildomain,Unnamed: 0,TransactionDT,TransactionAmt,card1,card2,...,V312,V313,V314,V315,V316,V317,V318,V319,V320,V321
0,0,4,2,1,2,-0.291883,-0.329939,0.108390,-0.145421,-0.399322,...,-0.227583,-0.222385,-0.249222,-0.229148,-0.048377,-0.062211,-0.058049,-0.055287,-0.088855,-0.074142
1,0,4,3,2,16,0.892993,0.871243,-0.359702,0.680504,-0.412094,...,-0.030054,-0.222385,-0.249222,-0.229148,-0.048377,-0.062211,-0.058049,-0.055287,-0.088855,-0.074142
2,0,4,2,2,1,-1.594876,-1.467121,8.134522,-0.109308,0.711822,...,-0.227583,-0.222385,-0.249222,-0.229148,-0.048377,-0.062211,-0.058049,-0.055287,-0.088855,-0.074142
3,0,4,3,2,19,-0.123148,-0.156138,-0.422421,1.487250,-0.265218,...,0.341765,-0.222385,-0.249222,-0.229148,-0.048377,-0.062211,-0.058049,-0.055287,-0.088855,-0.074142
4,0,4,3,2,16,1.611964,1.677853,-0.317889,-0.081355,-0.265218,...,-0.227583,-0.222385,-0.249222,-0.229148,-0.048377,-0.062211,-0.058049,-0.055287,-0.088855,-0.074142
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
76765,0,4,3,2,16,-1.327921,-1.260784,-0.113217,-0.653259,-1.606254,...,1.009878,-0.222385,-0.249222,-0.229148,-0.048377,-0.062211,-0.058049,-0.055287,-0.088855,-0.074142
76766,0,4,2,2,25,0.675641,0.648266,-0.075376,-0.002802,0.756523,...,-0.227583,-0.222385,-0.249222,-0.229148,-0.048377,-0.062211,-0.058049,-0.055287,-0.088855,-0.074142
76767,0,4,1,1,16,0.418154,0.377752,0.150412,-1.485918,-0.226903,...,-0.227583,0.393449,0.090945,0.276274,-0.048377,-0.062211,-0.058049,-0.055287,-0.088855,-0.074142
76768,0,4,3,2,19,0.605578,0.576883,-0.322279,-0.182963,0.577719,...,-0.227583,-0.222385,-0.249222,-0.229148,-0.048377,-0.062211,-0.058049,-0.055287,-0.088855,-0.074142


In [70]:
# Training labels; later cells read the `isFraud` column from this frame.
train_y = pd.read_csv("../2_dataset/final/train_y_df.csv")
train_y.shape

(76770, 1)

In [71]:
# Test feature table; the shape is displayed as a quick sanity check.
test_df = pd.read_csv("../2_dataset/final/test_df.csv")
test_df.shape


(23622, 182)

In [98]:
# Test labels, with the `isFraud` column cast to float32 in place.
test_y = pd.read_csv("../2_dataset/final/test_y_df.csv")
test_y['isFraud'] = test_y['isFraud'].astype(np.float32)

In [103]:
# Validation features and labels.
val_df = pd.read_csv("../2_dataset/final/val_df.csv")
val_y = pd.read_csv("../2_dataset/final/val_y_df.csv")

In [100]:
# Sanity check: training labels and features should have the same number of rows.
train_y.shape, train_df.shape

((76770, 1), (76770, 182))

In [101]:
# Sanity check: test labels and features should have the same number of rows.
test_y.shape, test_df.shape

((23622, 1), (23622, 182))

In [104]:
# Sanity check: validation labels and features should have the same number of rows.
val_y.shape, val_df.shape

((17716, 1), (17716, 182))

#### Splitting embeddings

In [108]:
# Split the embedding matrix into train / val / test partitions.
# The rows are assumed to be stored in train -> val -> test order (the previous
# fixed 65% / 80% cut points relied on the same ordering). The boundaries are taken
# from the feature frames themselves so that row i of every partition always pairs
# with row i of the matching frame, instead of depending on rounded fractions.
total_rows = cls_embeddings.shape[0]
train_size = len(train_df)             # end of the train partition
test_size = train_size + len(val_df)   # start of the test partition (name kept for later cells)

# Fail loudly if the embeddings and the frames do not describe the same rows.
if test_size + len(test_df) != total_rows:
    raise ValueError(
        f"Embedding rows ({total_rows}) do not match the train/val/test frames "
        f"({len(train_df)} + {len(val_df)} + {len(test_df)})."
    )

# Row-wise slices of the embedding matrix, one per partition.
train_embeddings = cls_embeddings[:train_size]
val_embeddings = cls_embeddings[train_size:test_size]
test_embeddings = cls_embeddings[test_size:]

print(f"Total embeddings shape: {cls_embeddings.shape}")
print(f"Train embeddings shape: {train_embeddings.shape}")
print(f"Val embeddings shape: {val_embeddings.shape}")
print(f"Test embeddings shape: {test_embeddings.shape}")

Total embeddings shape: (118108, 32)
Train embeddings shape: (76770, 32)
Val embeddings shape: (17716, 32)
Test embeddings shape: (23622, 32)


#### faiss index and similarity search

In [78]:
def create_index(num_embeddings, dimension):
    """Build an exact (brute-force) FAISS L2 index over the training embeddings.

    The vectors are read from the notebook-level ``train_embeddings`` array, not
    from an argument.

    Args:
        num_embeddings: Unused; kept so existing calls keep working. The index
            size is determined by the number of rows actually added.
        dimension: Width of each embedding vector; must match the second
            dimension of ``train_embeddings``.

    Returns:
        A ``faiss.IndexFlatL2`` containing every row of ``train_embeddings``.
    """
    index = faiss.IndexFlatL2(dimension)  # exact search with the (squared) L2 metric

    # FAISS only accepts C-contiguous float32 input; coerce explicitly so that
    # embeddings saved in another dtype (e.g. float64) do not fail on `add`.
    vectors = np.ascontiguousarray(train_embeddings, dtype="float32")
    index.add(vectors)

    return index

In [79]:
def search_faiss(query_vector, k=120):
    """Return the ``k`` nearest training embeddings for a single query vector.

    The FAISS index is built once and cached on the function object; it is only
    rebuilt when the notebook-level ``train_embeddings`` name is rebound to a
    different array. (In-place edits of that array are not detected.) Rebuilding
    the index for every query, as before, copied the whole training set per call.

    Args:
        query_vector: One embedding, as a ``torch.Tensor`` or array-like.
        k: Number of neighbours to retrieve. Defaults to 120, the value that
            gave the best result in earlier experiments.

    Returns:
        ``(distances, indices)`` where ``distances`` has shape ``(1, k)`` and
        ``indices`` is flattened to shape ``(k,)``. With the ``IndexFlatL2``
        built by ``create_index`` the distances are squared L2 distances.
    """
    cache = search_faiss.__dict__
    if cache.get("_indexed_array") is not train_embeddings:
        cache["_index"] = create_index(len(train_embeddings), train_embeddings.shape[1])
        cache["_indexed_array"] = train_embeddings
    index = cache["_index"]

    # Convert PyTorch tensor to NumPy
    if isinstance(query_vector, torch.Tensor):
        query_vector = query_vector.detach().cpu().numpy()

    # FAISS expects a contiguous float32 matrix with one query per row.
    query_vector = np.ascontiguousarray(query_vector, dtype="float32").reshape(1, -1)

    # Asking for more neighbours than the index holds makes FAISS pad the result
    # with -1 labels, which would break the label lookup downstream.
    k = min(k, index.ntotal)

    distances, indices = index.search(query_vector, k=k)

    indices = indices.flatten()
    return distances, indices

#### L2 distance component

In [93]:
def compute_similarity(distances, dropout=0.2):
    """Turn neighbour distances into dropout-regularised softmax weights.

    Closer neighbours (smaller distance) receive larger weights. A random
    subset of the weights is zeroed and the survivors are renormalised so
    that they sum to 1 again.

    Args:
        distances: Array-like of neighbour distances; flattened to 1-D.
        dropout: Probability of zeroing each individual weight.

    Returns:
        ``(weights, distances)``: the 1-D weight vector (dropped entries are
        exactly 0) and the flattened distances.
    """
    distances = np.asarray(distances).flatten()
    if distances.size == 0:
        # Nothing to weigh; mirror the empty input instead of failing on min().
        return np.zeros(0, dtype=float), distances

    # Softmax over the negative distances. Subtracting the smallest distance
    # first leaves the result unchanged mathematically, but keeps exp() from
    # underflowing to all zeros (and hence 0/0 = NaN) for large distances.
    similarities = np.exp(-(distances - distances.min()))
    softmax_scores = similarities / np.sum(similarities)

    # Apply dropout (randomly zero out some softmax scores)
    dropout_mask = np.random.binomial(1, 1 - dropout, size=softmax_scores.shape)
    dropped_softmax_scores = softmax_scores * dropout_mask

    # Renormalise the surviving scores so that they sum to 1 again. If every
    # neighbour carrying weight was dropped, fall back to the undropped scores
    # rather than dividing by zero.
    remaining = np.sum(dropped_softmax_scores)
    if remaining > 0:
        final_softmax = dropped_softmax_scores / remaining
    else:
        final_softmax = softmax_scores

    return final_softmax, distances

#### mask to drop the dropped out values

In [81]:
def filter_by_mask(arr1, arr2, arr3):
    """Keep only the positions at which ``arr1`` is non-zero, in all three arrays.

    Args:
        arr1: Array whose zero entries mark the positions to discard.
        arr2: Array indexed with the same boolean mask as ``arr1``.
        arr3: Array indexed with the same boolean mask as ``arr1``.

    Returns:
        A 3-tuple ``(arr1, arr2, arr3)`` with the masked-out positions removed.
    """
    keep = arr1 != 0  # True wherever arr1 holds a non-zero value
    return tuple(array[keep] for array in (arr1, arr2, arr3))

#### Linear transform of the value component

In [82]:
class MLP_Wv(nn.Module):
    """Three-layer SiLU perceptron mapping an ``input_dim``-wide vector to 32 features.

    Dropout (p=0.2) is applied after the second activation only.
    """

    def __init__(self, input_dim):
        super().__init__()
        # Stage 1: project the input into the 32-wide hidden space.
        self.layer1 = nn.Linear(input_dim, 32)
        self.activation1 = nn.SiLU()

        # Stage 2: hidden transform followed by dropout.
        self.layer2 = nn.Linear(32, 32)
        self.activation2 = nn.SiLU()
        self.dropout2 = nn.Dropout(p=0.2)

        # Stage 3: output transform (still 32 features wide).
        self.layer3 = nn.Linear(32, 32)
        self.activation3 = nn.SiLU()

    def forward(self, x):
        """Run ``x`` through the three stages and return the 32-feature result."""
        hidden = self.layer1(x)
        hidden = self.activation1(hidden)

        hidden = self.layer2(hidden)
        hidden = self.activation2(hidden)
        hidden = self.dropout2(hidden)

        hidden = self.layer3(hidden)
        return self.activation3(hidden)

In [83]:
def compute_l1(distances):
    """Project the neighbour distances onto a 32-feature value vector.

    Despite the name, no L1 norm is computed: the function takes the element-wise
    square root of ``distances`` (presumably squared L2 distances from the FAISS
    search, which the root turns into plain L2 distances) and feeds the resulting
    vector through a freshly created ``MLP_Wv``.

    NOTE(review): a new, randomly initialised ``MLP_Wv`` is built on every call and
    is left in training mode (dropout active), so the projection differs between
    calls and between samples. Confirm whether a shared, trained module was intended.

    Args:
        distances: 1-D array of non-negative neighbour distances.

    Returns:
        NumPy array with 32 features.
    """
    # Clamp at zero before the root: floating-point round-off can leave a tiny
    # negative "squared" distance, and sqrt would turn that into NaN.
    l1_dist = np.sqrt(np.maximum(distances, 0))
    model = MLP_Wv(l1_dist.shape[0])

    # Convert input to tensor and pass it through the model. No gradient is ever
    # taken through this throwaway model, so skip building the autograd graph.
    l1_dist_tensor = torch.tensor(l1_dist, dtype=torch.float32)
    with torch.no_grad():
        value_Wv = model(l1_dist_tensor).numpy()

    return value_Wv


In [84]:
class MLP_Wy(nn.Module):
    """Two-layer ReLU perceptron: ``input_dim`` -> 128 -> 32 features.

    For a batched input of shape ``(N, input_dim)`` the output has shape
    ``(N, 32)``. The trailing ``squeeze(1)`` only removes dimension 1 when it has
    size 1, so it leaves such a batched output unchanged.
    """

    def __init__(self, input_dim):
        super().__init__()
        self.fc1 = nn.Linear(input_dim, 128)
        self.relu = nn.ReLU()
        self.fc2 = nn.Linear(128, 32)

    def forward(self, x):
        """Apply fc1 -> ReLU -> fc2, then squeeze dimension 1."""
        projected = self.fc2(self.relu(self.fc1(x)))  # last dimension is 32
        return projected.squeeze(1)


In [85]:
def compute_wy(indices):
    """Embed the labels of the retrieved neighbours into a 32-feature space.

    The ``isFraud`` labels of the rows selected by ``indices`` are looked up in the
    notebook-level ``train_y`` frame, embedded with a 2-class embedding table and
    passed through an ``MLP_Wy``.

    NOTE(review): both the embedding table and the MLP are created with fresh
    random weights on every call, so the mapping from label to vector is not
    consistent between samples. Confirm whether shared, trained modules were intended.

    Args:
        indices: Index labels of the neighbour rows in ``train_y``.

    Returns:
        ``torch.Tensor`` of shape ``(len(indices), 32)`` that does not require grad.
    """
    y_i = train_y['isFraud'].loc[indices].values
    feature_target_tensor = torch.tensor(y_i, dtype=torch.long)

    # Define Wy: an embedding layer mapping each class to a 32-dim vector
    embedding_dim = 32
    num_classes = 2  # Since input values are 0 or 1

    Wy = nn.Embedding(num_classes, embedding_dim)
    mlp = MLP_Wy(input_dim=embedding_dim)

    # These modules are discarded when the function returns, so no gradient can
    # ever reach them; skip building the autograd graph.
    with torch.no_grad():
        embeddings = Wy(feature_target_tensor)  # Shape: (len(indices), 32)
        value_Wy = mlp(embeddings)              # Shape: (len(indices), 32)

    return value_Wy


In [86]:
def compute_value(value_Wv, value_Wy):
    """Combine the two value components by element-wise (broadcast) addition.

    Args:
        value_Wv: NumPy array added to the label component.
        value_Wy: ``torch.Tensor``; detached and converted to NumPy before the sum.

    Returns:
        NumPy array holding ``value_Wy + value_Wv``.
    """
    return value_Wy.detach().numpy() + value_Wv


#### Reshaping S to do S * V

In [87]:
def compute_z_in(similarity, value):
    """Collapse the neighbour values into one similarity-weighted scalar.

    ``similarity`` is reshaped to a row vector and multiplied with ``value``; the
    resulting weighted vector is summed over all of its entries and divided by
    the total weight. The result is therefore a single number, not a vector.

    Args:
        similarity: Per-neighbour weights (any shape; flattened to one row).
        value: Matrix with one row per neighbour.

    Returns:
        The weighted scalar, or 0.0 when the total weight is zero (for example
        when there are no neighbours), which previously produced NaN.
    """
    similarity = np.asarray(similarity).reshape(1, -1)

    denominator = np.sum(similarity)  # Total sum of weights (scalar)
    if denominator == 0:
        # No neighbour carries any weight: return a neutral value, not 0/0 = NaN.
        return np.float64(0.0)

    numerator = np.sum(similarity @ value)  # Sum of the weighted contributions (scalar)
    return numerator / denominator


### Process Samples for Training and Validation

In [89]:
# test_i = test_df.iloc[i].values #shape (182,)
# query_vector = test_embeddings[i] #shape (32,)
# distances, indices = search_faiss(query_vector) # both shape (120,) and flatten
# similarity, distances = compute_similarity(distances)
# similarity, indices, distances = filter_by_mask(similarity, indices, distances)
# value_Wv = compute_l1(distances) #shape (32,)
# value_Wy = compute_wy(indices) #shape (32,)
# value = compute_value(value_Wv, value_Wy)
# z_in = compute_z_in(similarity, value)
# input_to_mlp = test_i * z_in


In [None]:
def process_samples(train_df, train_embeddings, labels_df=None):
    """Build the predictor inputs and labels for every training sample.

    For each row, the matching embedding is used to query the FAISS index, and
    the retrieved neighbours are reduced to a single scalar ``z_in``.

    NOTE(review): ``search_faiss`` searches the index built from the notebook-level
    ``train_embeddings``. When the same array is passed here, each query is itself
    in the index, so its own label presumably contributes to ``z_in`` — consider
    excluding index ``i`` from the neighbours.

    Args:
        train_df: Feature frame; every column must be numeric.
        train_embeddings: Embeddings aligned row-by-row with ``train_df``.
        labels_df: Frame with an ``isFraud`` column aligned with ``train_df``.
            Defaults to the notebook-level ``train_y``.

    Returns:
        ``(input_list, labels)``: a list of ``(features, z_in)`` tensor pairs and
        a list of 0-d float32 label tensors.

    Raises:
        ValueError: If there are fewer embeddings or labels than feature rows.
    """
    label_frame = train_y if labels_df is None else labels_df
    n_samples = len(train_df)
    if len(train_embeddings) < n_samples or len(label_frame) < n_samples:
        raise ValueError(
            f"Need {n_samples} embeddings and labels, got "
            f"{len(train_embeddings)} embeddings and {len(label_frame)} labels."
        )

    # Convert once up front: per-row DataFrame.iloc access inside the loop is slow.
    feature_matrix = train_df.to_numpy(dtype=np.float32)
    label_values = label_frame['isFraud'].to_numpy(dtype=np.float32)

    input_list = []
    labels = []

    for i in range(n_samples):
        train_i = torch.tensor(feature_matrix[i], dtype=torch.float32)  # one feature row

        # search_faiss accepts arrays and tensors alike, so no tensor round-trip is needed.
        distances, indices = search_faiss(train_embeddings[i])
        similarity, distances = compute_similarity(distances)
        similarity, indices, distances = filter_by_mask(similarity, indices, distances)
        value_Wv = compute_l1(distances)
        value_Wy = compute_wy(indices)
        value = compute_value(value_Wv, value_Wy)
        z_in = compute_z_in(similarity, value)  # Scalar (single value)

        # Store only the feature row and z_in; the query embedding is not kept.
        z_in = torch.tensor(z_in, dtype=torch.float32)
        input_list.append((train_i, z_in))

        labels.append(torch.tensor(label_values[i], dtype=torch.float32))  # Target label

    return input_list, labels


In [None]:
def val_process_samples(val_df, val_embeddings, labels_df=None):
    """Build the predictor inputs and labels for every validation sample.

    For each row, the matching embedding is used to query the FAISS index of
    training embeddings, and the retrieved neighbours are reduced to a single
    scalar ``z_in``.

    Args:
        val_df: Feature frame; every column must be numeric.
        val_embeddings: Embeddings aligned row-by-row with ``val_df``.
        labels_df: Frame with an ``isFraud`` column aligned with ``val_df``.
            Defaults to the notebook-level ``val_y``.

    Returns:
        ``(input_list, labels)``: a list of ``(features, z_in)`` tensor pairs and
        a list of 0-d float32 label tensors.

    Raises:
        ValueError: If there are fewer embeddings or labels than feature rows.
    """
    label_frame = val_y if labels_df is None else labels_df
    n_samples = len(val_df)
    if len(val_embeddings) < n_samples or len(label_frame) < n_samples:
        raise ValueError(
            f"Need {n_samples} embeddings and labels, got "
            f"{len(val_embeddings)} embeddings and {len(label_frame)} labels."
        )

    # Convert once up front: per-row DataFrame.iloc access inside the loop is slow.
    feature_matrix = val_df.to_numpy(dtype=np.float32)
    label_values = label_frame['isFraud'].to_numpy(dtype=np.float32)

    input_list = []
    labels = []

    for i in range(n_samples):  # Process all samples
        test_i = torch.tensor(feature_matrix[i], dtype=torch.float32)  # one feature row

        # search_faiss accepts arrays and tensors alike, so no tensor round-trip is needed.
        distances, indices = search_faiss(val_embeddings[i])
        similarity, distances = compute_similarity(distances)
        similarity, indices, distances = filter_by_mask(similarity, indices, distances)
        value_Wv = compute_l1(distances)
        value_Wy = compute_wy(indices)
        value = compute_value(value_Wv, value_Wy)
        z_in = compute_z_in(similarity, value)  # Scalar (single value)

        # Store only the feature row and z_in; the query embedding is not kept.
        z_in = torch.tensor(z_in, dtype=torch.float32)

        input_list.append((test_i, z_in))
        labels.append(torch.tensor(label_values[i], dtype=torch.float32))  # Target label

    return input_list, labels


### Model Training

In [109]:
import torch
import torch.nn as nn

class Predictor(nn.Module):
    """MLP classifier over a feature vector plus one scalar neighbour summary.

    The scalar ``weighted_avg`` is appended to ``input_emb``, the result runs
    through three LayerNorm -> Linear -> SiLU -> Dropout -> Linear blocks, and a
    final Linear + Sigmoid produces one probability per sample.

    Args:
        input_emb_dim: Width of ``input_emb`` (the extra scalar is added internally).
        hidden_dim: Width of every hidden layer.
        dropout_prob: Dropout probability used inside each block.
    """

    def __init__(self, input_emb_dim, hidden_dim=32, dropout_prob=0.2):
        super().__init__()

        # One extra input feature for the appended weighted_avg scalar.
        self.input_dim = input_emb_dim + 1

        def make_block(in_features):
            # All three blocks share this layout; only the input width differs.
            return nn.Sequential(
                nn.LayerNorm(in_features),
                nn.Linear(in_features, hidden_dim),
                nn.SiLU(),
                nn.Dropout(dropout_prob),
                nn.Linear(hidden_dim, hidden_dim),
            )

        self.block1 = make_block(self.input_dim)
        self.block2 = make_block(hidden_dim)
        self.block3 = make_block(hidden_dim)

        self.output_layer = nn.Sequential(nn.Linear(hidden_dim, 1), nn.Sigmoid())

    def forward(self, input_emb, weighted_avg):
        """Return sigmoid probabilities with a trailing dimension of size 1."""
        # Give the scalar a trailing axis so it can be concatenated as a feature.
        features = torch.cat([input_emb, weighted_avg.unsqueeze(-1)], dim=-1)

        for stage in (self.block1, self.block2, self.block3, self.output_layer):
            features = stage(features)
        return features


In [94]:
import torch.optim as optim
import torch.nn as nn
from torch.utils.data import DataLoader, TensorDataset
from sklearn.metrics import average_precision_score
import torch

def train_model(train_df, train_embeddings, model, optimizer, criterion, batch_size=256, epochs=25):
    """Train ``model`` and report loss and AUCPR on train and validation data each epoch.

    The validation inputs are built from the notebook-level ``val_df`` and
    ``val_embeddings``; they are not parameters of this function.

    Args:
        train_df: Training feature frame.
        train_embeddings: Embeddings aligned row-by-row with ``train_df``.
        model: Module called as ``model(input_emb, z_in)`` and returning probabilities.
        optimizer: Optimizer over ``model``'s parameters.
        criterion: Loss taking ``(output, target)``.
        batch_size: Mini-batch size for both data loaders.
        epochs: Number of passes over the training data.

    Returns:
        None. Metrics are printed once per epoch. The model is left in eval mode
        after the last validation pass.
    """
    model.train()

    # Process data
    input_list, labels = process_samples(train_df, train_embeddings)
    val_input_list, val_labels = val_process_samples(val_df, val_embeddings)

    train_dataset = TensorDataset(
        torch.stack([item[0] for item in input_list]),  # feature row (input_emb)
        torch.stack([item[1] for item in input_list]),  # z_in (weighted_avg)
        torch.stack(labels)
    )

    val_dataset = TensorDataset(
        torch.stack([item[0] for item in val_input_list]),  # feature row (input_emb)
        torch.stack([item[1] for item in val_input_list]),  # z_in (weighted_avg)
        torch.stack(val_labels)
    )

    # NOTE(review): the training loader does not shuffle; confirm this is intentional.
    train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=False)
    val_dataloader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)

    # Training loop
    for epoch in range(epochs):
        # The validation step below switches to eval mode, so training mode must
        # be restored at the start of every epoch; otherwise dropout stays
        # disabled for every epoch after the first.
        model.train()

        epoch_loss = 0
        all_targets = []
        all_outputs = []

        for batch in train_dataloader:
            input_emb, z_in, target = batch
            target = target.unsqueeze(-1)  # Make target shape (batch_size, 1)

            optimizer.zero_grad()
            output = model(input_emb, z_in)

            loss = criterion(output, target)
            loss.backward()
            optimizer.step()

            epoch_loss += loss.item()  # Accumulate loss for epoch

            # Collect predictions & targets for AUCPR
            all_outputs.append(output.detach().cpu())
            all_targets.append(target.detach().cpu())

        # Compute AUCPR at the end of the epoch
        all_outputs = torch.cat(all_outputs).numpy()
        all_targets = torch.cat(all_targets).numpy()
        aucpr = average_precision_score(all_targets, all_outputs)

        # Validation Step
        model.eval()
        val_loss = 0
        all_val_targets = []
        all_val_outputs = []

        with torch.no_grad():  # No gradient computation for validation
            for batch in val_dataloader:
                input_emb, z_in, target = batch
                target = target.unsqueeze(-1)

                output = model(input_emb, z_in)
                loss = criterion(output, target)

                val_loss += loss.item()

                all_val_outputs.append(output.cpu())
                all_val_targets.append(target.cpu())

        # Compute AUCPR for validation set
        all_val_outputs = torch.cat(all_val_outputs).numpy()
        all_val_targets = torch.cat(all_val_targets).numpy()

        val_aucpr = average_precision_score(all_val_targets, all_val_outputs)

        print(f"Epoch [{epoch+1}/{epochs}], "
              f"Train Loss: {epoch_loss/len(train_dataloader):.4f}, Train AUCPR: {aucpr:.4f}, "
              f"Val Loss: {val_loss/len(val_dataloader):.4f}, Val AUCPR: {val_aucpr:.4f}")


# Model, optimizer, and loss function
# The predictor consumes one full feature row per sample, so its input width is
# read from the training frame instead of being hard-coded.
input_emb_dim = train_df.shape[1]
model = Predictor(input_emb_dim=input_emb_dim)
optimizer = optim.AdamW(model.parameters(), lr=0.0001)
criterion = nn.BCELoss()  # Predictor ends in a Sigmoid, so the loss works on probabilities

train_model(train_df, train_embeddings, model, optimizer, criterion, epochs=100)


Epoch [1/100], Train Loss: 0.3308, Train AUCPR: 0.0379, Val Loss: 0.1395, Val AUCPR: 0.1846
Epoch [2/100], Train Loss: 0.1334, Train AUCPR: 0.2154, Val Loss: 0.1264, Val AUCPR: 0.2747
Epoch [3/100], Train Loss: 0.1238, Train AUCPR: 0.2799, Val Loss: 0.1185, Val AUCPR: 0.3230
Epoch [4/100], Train Loss: 0.1188, Train AUCPR: 0.3100, Val Loss: 0.1159, Val AUCPR: 0.3354
Epoch [5/100], Train Loss: 0.1165, Train AUCPR: 0.3259, Val Loss: 0.1146, Val AUCPR: 0.3421
Epoch [6/100], Train Loss: 0.1152, Train AUCPR: 0.3360, Val Loss: 0.1137, Val AUCPR: 0.3462
Epoch [7/100], Train Loss: 0.1142, Train AUCPR: 0.3435, Val Loss: 0.1131, Val AUCPR: 0.3494
Epoch [8/100], Train Loss: 0.1134, Train AUCPR: 0.3496, Val Loss: 0.1126, Val AUCPR: 0.3518
Epoch [9/100], Train Loss: 0.1127, Train AUCPR: 0.3550, Val Loss: 0.1122, Val AUCPR: 0.3544
Epoch [10/100], Train Loss: 0.1121, Train AUCPR: 0.3599, Val Loss: 0.1118, Val AUCPR: 0.3569
Epoch [11/100], Train Loss: 0.1116, Train AUCPR: 0.3646, Val Loss: 0.1115, Val 

## Inference

In [95]:
def test_process_samples(test_df, test_embeddings, labels_df=None):
    """Build the predictor inputs and labels for every test sample.

    For each row, the matching embedding is used to query the FAISS index of
    training embeddings, and the retrieved neighbours are reduced to a single
    scalar ``z_in``.

    Args:
        test_df: Feature frame; every column must be numeric.
        test_embeddings: Embeddings aligned row-by-row with ``test_df``.
        labels_df: Frame with an ``isFraud`` column aligned with ``test_df``.
            Defaults to the notebook-level ``test_y``.

    Returns:
        ``(input_list, labels)``: a list of ``(features, z_in)`` tensor pairs and
        a list of 0-d float32 label tensors.

    Raises:
        ValueError: If there are fewer embeddings or labels than feature rows.
    """
    label_frame = test_y if labels_df is None else labels_df
    n_samples = len(test_df)
    if len(test_embeddings) < n_samples or len(label_frame) < n_samples:
        raise ValueError(
            f"Need {n_samples} embeddings and labels, got "
            f"{len(test_embeddings)} embeddings and {len(label_frame)} labels."
        )

    # Convert once up front: per-row DataFrame.iloc access inside the loop is slow.
    feature_matrix = test_df.to_numpy(dtype=np.float32)
    label_values = label_frame['isFraud'].to_numpy(dtype=np.float32)

    input_list = []
    labels = []

    for i in range(n_samples):  # Process all samples
        test_i = torch.tensor(feature_matrix[i], dtype=torch.float32)  # one feature row

        # search_faiss accepts arrays and tensors alike, so no tensor round-trip is needed.
        distances, indices = search_faiss(test_embeddings[i])
        similarity, distances = compute_similarity(distances)
        similarity, indices, distances = filter_by_mask(similarity, indices, distances)
        value_Wv = compute_l1(distances)
        value_Wy = compute_wy(indices)
        value = compute_value(value_Wv, value_Wy)
        z_in = compute_z_in(similarity, value)  # Scalar (single value)

        # Store only the feature row and z_in; the query embedding is not kept.
        z_in = torch.tensor(z_in, dtype=torch.float32)

        input_list.append((test_i, z_in))
        labels.append(torch.tensor(label_values[i], dtype=torch.float32))  # Target label

    return input_list, labels


In [96]:
# Pre-compute the test inputs once; `inference` below reads these two notebook-level lists.
test_input_list, test_input_labels = test_process_samples(test_df=test_df, test_embeddings=test_embeddings)

In [97]:
import torch
from torch.utils.data import DataLoader, TensorDataset

def inference(model, test_df, test_embeddings, batch_size=256, input_list=None, labels=None):
    """
    Perform inference on pre-processed test data using a trained model.

    Args:
        model (nn.Module): Trained model, called as ``model(input_emb, z_in)``.
        test_df (pd.DataFrame): Test frame; only its length is used, to check
            that the pre-processed inputs belong to it. Pass ``None`` to skip
            the check.
        test_embeddings (np.ndarray or torch.Tensor): Unused; kept so existing
            calls keep working.
        batch_size (int): Batch size for inference.
        input_list (list, optional): ``(features, z_in)`` tensor pairs as
            returned by ``test_process_samples``. Defaults to the notebook-level
            ``test_input_list``.
        labels (list, optional): 0-d label tensors. Defaults to the
            notebook-level ``test_input_labels``.

    Returns:
        all_outputs (np.ndarray): Model outputs (probabilities).
        all_labels (np.ndarray): Ground truth labels.

    Raises:
        ValueError: If the inputs, labels and ``test_df`` disagree in length.
    """
    if input_list is None or labels is None:
        # Fall back to the lists pre-computed earlier in the notebook.
        input_list, labels = test_input_list, test_input_labels

    if len(input_list) != len(labels):
        raise ValueError(
            f"Got {len(input_list)} inputs but {len(labels)} labels."
        )
    if test_df is not None and len(input_list) != len(test_df):
        raise ValueError(
            f"Pre-processed inputs ({len(input_list)}) do not match test_df ({len(test_df)} rows)."
        )

    model.eval()

    test_dataset = TensorDataset(
        torch.stack([item[0] for item in input_list]),  # feature row (input_emb)
        torch.stack([item[1] for item in input_list]),  # z_in (weighted_avg)
        torch.stack(labels)                             # Labels
    )
    test_dataloader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

    # Lists to store outputs and labels
    all_outputs = []
    all_labels = []

    # No gradient updates during inference
    with torch.no_grad():
        for input_emb, z_in, batch_labels in test_dataloader:
            outputs = model(input_emb, z_in)

            # Move outputs and labels to CPU, then store
            all_outputs.append(outputs.cpu())
            all_labels.append(batch_labels.cpu())

    # Concatenate everything into a single array
    all_outputs = torch.cat(all_outputs).numpy()
    all_labels = torch.cat(all_labels).numpy()

    return all_outputs, all_labels

# Run the trained model over the pre-processed test inputs.
all_test_outputs, all_test_labels = inference(model, test_df, test_embeddings, batch_size=256)

# Optionally, compute metrics (e.g., AUCPR) on your test set:
from sklearn.metrics import average_precision_score
test_aucpr = average_precision_score(all_test_labels, all_test_outputs)
print("Test AUCPR:", test_aucpr)

# Or create a final binary prediction if desired (probabilities >= 0.5 become 1):
binary_predictions = (all_test_outputs >= 0.5).astype(int)


Test AUCPR: 0.4201159628856508


In [37]:
import numpy as np

# Count how many test samples were assigned to each predicted class (0 or 1).
unique_values, counts = np.unique(binary_predictions, return_counts=True)

# Print one line per predicted class.
for value, count in zip(unique_values, counts):
    print(f"Value {value}: {count} occurrences")


Value 0: 23319 occurrences
Value 1: 303 occurrences


In [38]:
# Number of pre-processed test samples.
len(test_input_list)

23622