In [None]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from sklearn.model_selection import train_test_split

In [None]:
# Read each bank's transaction export and stack them into a single frame;
# ignore_index=True renumbers the combined rows from 0.
hdfc_df, sbi_df = (pd.read_csv(csv_path) for csv_path in ("hdfc_dataset.csv", "sbi_dataset.csv"))
df = pd.concat((hdfc_df, sbi_df), ignore_index=True)

In [None]:
# Fuse the separate date and time strings into one datetime column, then
# discard the two source columns now that they are redundant.
date_and_time = df['Date'] + ' ' + df['Time']
df['Timestamp'] = pd.to_datetime(date_and_time)
df.drop(columns=['Date', 'Time'], inplace=True)

In [None]:
# Encode categorical columns
# Each listed column is replaced by integer codes. The fitted encoder of every
# column is kept in `encoders` so the same mapping can be reapplied later.
cat_cols = ['Merchant', 'Cardholder_Name', 'Transaction_Type', 'Device_Used', 'Location']
encoders = {col: LabelEncoder() for col in cat_cols}
for col, le in encoders.items():
    df[col] = le.fit_transform(df[col])

In [None]:
# Normalizing: fit a min-max scaler on Amount, then overwrite the column with
# its rescaled values. The fitted scaler is kept for reuse on new inputs.
scaler = MinMaxScaler().fit(df[['Amount']])
df[['Amount']] = scaler.transform(df[['Amount']])

In [None]:
# Autoencoder Part 
import torch
import torch.nn as nn
from sklearn.metrics import classification_report, confusion_matrix

In [None]:
# Target column, and the model inputs with the label, identifiers and raw
# timestamp removed.
labels = df['Fraud_Label']
features = df.drop(columns=['Fraud_Label', 'Transaction_ID', 'Card_Number', 'IP_Address', 'Timestamp'])
# Hold out 20% of the rows for evaluation; random_state pins the shuffle.
split_parts = train_test_split(features, labels, test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = split_parts

In [None]:
# Use only non-fraud data for training the autoencoder
legit_rows = y_train == 0  # boolean mask over the training rows
X_train_ae = X_train.loc[legit_rows]
# float32 copies of the frames for PyTorch; the test tensor keeps every row.
X_train_tensor = torch.tensor(X_train_ae.to_numpy(), dtype=torch.float32)
X_test_tensor = torch.tensor(X_test.to_numpy(), dtype=torch.float32)

In [None]:
# Define Autoencoder architecture
class Autoencoder(nn.Module):
    """Single-hidden-layer autoencoder: Linear+ReLU encoder, Linear+Sigmoid decoder.

    Args:
        input_dim: Number of features per sample (input and output width).
        hidden_dim: Width of the bottleneck layer. Defaults to 8, the value
            that was previously hard-coded.

    Note:
        The decoder ends in a Sigmoid, so every reconstructed value lies in
        (0, 1). Input features outside that range cannot be reproduced
        exactly and will always contribute reconstruction error.
    """

    def __init__(self, input_dim, hidden_dim=8):
        super().__init__()
        # Keep both halves as nn.Sequential so state_dict keys stay
        # 'encoder.0.*' / 'decoder.0.*' and saved weights remain loadable.
        self.encoder = nn.Sequential(nn.Linear(input_dim, hidden_dim), nn.ReLU())
        self.decoder = nn.Sequential(nn.Linear(hidden_dim, input_dim), nn.Sigmoid())

    def forward(self, x):
        """Return the reconstruction of ``x`` (same trailing dimension as ``x``)."""
        return self.decoder(self.encoder(x))

In [None]:
# Training Autoencoder
# Seed PyTorch's global RNG before the model is built so the random weight
# initialisation (and therefore the training run) repeats on a fresh kernel.
torch.manual_seed(42)
model = Autoencoder(X_train_tensor.shape[1])  # input width = number of feature columns
criterion = nn.MSELoss()  # reconstruction loss: mean squared error
optimizer = torch.optim.Adam(model.parameters(), lr=0.01)

In [None]:
# Full-batch training: each epoch reconstructs the whole training tensor and
# takes one optimizer step on the loss between input and reconstruction.
for epoch in range(20):
    optimizer.zero_grad()
    output = model(X_train_tensor)
    loss = criterion(output, X_train_tensor)
    loss.backward()
    optimizer.step()
    if epoch % 5:
        continue  # only report every fifth epoch
    print(f"Epoch {epoch}, Loss: {loss.item():.4f}")

In [None]:
# Compute reconstruction error for test data
# Inference only: switch to eval mode and skip autograd bookkeeping.
model.eval()
with torch.no_grad():
    reconstructions = model(X_test_tensor).numpy()
    train_reconstructions = model(X_train_tensor).numpy()
# Per-row mean squared error between each test sample and its reconstruction.
mse = np.mean(np.power(X_test.values - reconstructions, 2), axis=1)
# Calibrate the cut-off on the training errors rather than on the test set:
# a threshold taken from the test errors themselves leaks evaluation data and
# always flags exactly 5% of the test rows, whatever the model has learned.
train_mse = np.mean(np.power(X_train_tensor.numpy() - train_reconstructions, 2), axis=1)
threshold = np.percentile(train_mse, 95)  # 95th percentile of the training errors

In [None]:
# Predict using threshold: 1 where the error is strictly above it, else 0.
y_pred_ae = (mse > threshold).astype(int).tolist()

In [None]:
# Print both evaluation summaries for the thresholded autoencoder predictions.
for heading, metric_fn in (
    ("Confusion Matrix (Autoencoder):\n", confusion_matrix),
    ("\nClassification Report (Autoencoder):\n", classification_report),
):
    print(heading, metric_fn(y_test, y_pred_ae))

In [None]:
# GNN PART 
from torch_geometric.data import Data
from torch_geometric.nn import GCNConv
import torch.nn.functional as F

In [None]:
# Encode graph-related attributes
# Build a separate frame in which the three linking columns are integer-coded;
# df itself is left untouched.
df_graph = df.assign(**{
    key_col: LabelEncoder().fit_transform(df[key_col])
    for key_col in ('Card_Number', 'Merchant', 'IP_Address')
})

In [None]:
# Create edges by connecting transactions sharing same Card_Number, Merchant, or IP_Address
# Every pair of rows with an equal value is linked in both directions; the set
# removes duplicates when a pair is linked through more than one attribute.
edges = set()
for attr in ('Card_Number', 'Merchant', 'IP_Address'):
    column = df_graph[attr]
    for val in column.unique():
        members = df_graph.index[column == val].tolist()
        for pos, src in enumerate(members):
            for dst in members[pos + 1:]:
                edges.update(((src, dst), (dst, src)))

In [None]:
# Prepare data for PyTorch Geometric
# Labels: one integer class per row of df_graph.
y_gnn = torch.tensor(df_graph['Fraud_Label'].values, dtype=torch.long)
# Node features: every column except the ID, the label and the timestamp.
X_gnn = df_graph.drop(columns=['Transaction_ID', 'Fraud_Label', 'Timestamp'])
X_gnn = torch.tensor(X_gnn.values, dtype=torch.float32)
# Connectivity: transpose the list of (src, dst) pairs so sources and
# destinations each form one row.
edge_pairs = list(edges)
edge_index = torch.tensor(edge_pairs, dtype=torch.long).t().contiguous()

In [None]:
# Bundle node features, connectivity and labels into one graph object.
graph_fields = {"x": X_gnn, "edge_index": edge_index, "y": y_gnn}
data = Data(**graph_fields)

In [None]:
# Define GCN model
class GCN(torch.nn.Module):
    """Two-layer graph convolutional classifier with a 16-unit hidden layer and 2 outputs."""

    def __init__(self, num_features):
        super().__init__()
        self.conv1 = GCNConv(num_features, 16)
        self.conv2 = GCNConv(16, 2)

    def forward(self, data):
        """Return per-node log-probabilities over the two classes.

        ``data`` must expose ``x`` (node features) and ``edge_index``.
        """
        hidden = F.relu(self.conv1(data.x, data.edge_index))
        logits = self.conv2(hidden, data.edge_index)
        return F.log_softmax(logits, dim=1)

In [None]:
# Train GCN model
# Pick the GPU when one is available, otherwise stay on the CPU, and place
# both the graph and the model there.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
model_gnn = GCN(data.num_node_features).to(device)
data = data.to(device)
# Note: this rebinds `optimizer`, which previously held the autoencoder's optimizer.
optimizer = torch.optim.Adam(model_gnn.parameters(), lr=0.01)

In [None]:
# Create train-test split for GNN
# Allocate the masks on the same device as the graph so indexing the model
# output never mixes CPU and GPU tensors.
# Note: nodes follow the row order of the concatenated source files, so this
# positional 70/30 cut is not a random split.
num_train_nodes = int(0.7 * data.num_nodes)
train_mask = torch.zeros(data.num_nodes, dtype=torch.bool, device=data.x.device)
train_mask[:num_train_nodes] = True
test_mask = ~train_mask  # every node not used for training

In [None]:
# Full-graph training for 100 epochs; the loss is computed on training nodes only.
model_gnn.train()
for epoch in range(1, 101):
    optimizer.zero_grad()
    out = model_gnn(data)
    loss = F.nll_loss(out[train_mask], data.y[train_mask])
    loss.backward()
    optimizer.step()
    if epoch % 10:
        continue  # only report every tenth epoch
    print(f"Epoch {epoch}, Loss: {loss.item():.4f}")

In [None]:
# Evaluate GCN model
model_gnn.eval()
pred = model_gnn(data).argmax(dim=1)  # most likely class for every node
# Restrict to the held-out nodes and move to the CPU for scikit-learn.
gnn_true = data.y[test_mask].cpu()
gnn_pred = pred[test_mask].cpu()
print("Confusion Matrix (GNN):\n", confusion_matrix(gnn_true, gnn_pred))
print("\nClassification Report (GNN):\n", classification_report(gnn_true, gnn_pred))

In [None]:
# Prediction Functions

In [None]:
# Autoencoder prediction
def predict_autoencoder_with_reason(user_input):
    """Classify one raw transaction with the autoencoder and print an explanation.

    Uses the notebook globals ``encoders``, ``scaler``, ``features``, ``model``
    and ``threshold``.

    Args:
        user_input: Mapping of feature name to raw value. It must provide
            'Amount' and every column that has an entry in ``encoders``.

    Returns:
        "FRAUDULENT" when the reconstruction error is above ``threshold``,
        otherwise "LEGITIMATE".

    Raises:
        KeyError: If ``user_input`` lacks a column the model was trained on.
    """
    input_df = pd.DataFrame([user_input])

    # Encode categorical values using the pre-fitted label encoders. A value an
    # encoder has never seen is replaced by that encoder's first known class.
    for col, encoder in encoders.items():
        value = user_input[col]
        if value not in encoder.classes_:
            fallback = encoder.classes_[0]
            print(f" Unknown '{value}' in '{col}' — using fallback: {fallback}")
            value = fallback
        input_df[col] = encoder.transform([value])

    input_df[['Amount']] = scaler.transform(input_df[['Amount']])

    # Put the columns in the order the model was trained on. The caller's dict
    # order is arbitrary, and a mismatch would silently feed each value into
    # the wrong input unit.
    input_df = input_df[list(features.columns)]
    input_values = input_df.to_numpy(dtype=np.float64)

    model.eval()
    with torch.no_grad():  # inference only
        output = model(torch.tensor(input_values, dtype=torch.float32)).numpy()

    # Calculate the reconstruction error (absolute difference and MSE)
    residual = input_values - output
    error_vector = np.abs(residual)
    # Reduce to a plain float so the comparison below is a scalar test, not
    # the truth value of a one-element array.
    total_error = float(np.mean(np.power(residual, 2), axis=1)[0])

    prediction = "FRAUDULENT" if total_error > threshold else "LEGITIMATE"

    # Identify top 3 features contributing most to the reconstruction error
    top_indices = error_vector[0].argsort()[-3:][::-1]
    top_features = [input_df.columns[i] for i in top_indices]
    top_contributions = error_vector[0][top_indices]

    print(f"\n Autoencoder Prediction: {prediction}")
    print(f" Total Reconstruction Error: {total_error:.5f}")
    print("Top contributing features to anomaly:")
    for feat, val in zip(top_features, top_contributions):
        print(f"  - {feat}: error {val:.4f}")
    return prediction


In [None]:
# GNN prediction with graph-based reasoning
def predict_gnn_with_reason(new_txn_raw):
    """Classify one raw transaction with the trained GCN and print its fraud links.

    The transaction is appended as a new node to a copy of ``df_graph``, the
    shared-attribute edges are rebuilt over the enlarged graph, and the GCN's
    predicted class for the new (last) node is reported. Uses the notebook
    globals ``df_graph``, ``encoders``, ``scaler``, ``model_gnn`` and ``device``.

    Args:
        new_txn_raw: Mapping with raw values for 'Amount', 'Merchant',
            'Cardholder_Name', 'Transaction_Type', 'Device_Used' and 'Location'.

    Returns:
        "FRAUDULENT" if the GCN predicts class 1 for the new node, otherwise
        "LEGITIMATE".
    """
    link_attrs = ['Card_Number', 'Merchant', 'IP_Address']
    df_temp = df_graph.copy()

    def _encode(col):
        # Same policy as the autoencoder path: a value the encoder has never
        # seen falls back to its first known class instead of raising.
        encoder = encoders[col]
        value = new_txn_raw[col]
        if value not in encoder.classes_:
            fallback = encoder.classes_[0]
            print(f" Unknown '{value}' in '{col}' — using fallback: {fallback}")
            value = fallback
        return encoder.transform([value])[0]

    # Encode and inject the new transaction with synthetic IDs and timestamp.
    # The card and IP codes are one past the current maximum, so they can never
    # collide with an existing code and never create a spurious link.
    new_txn = {
        "Amount": scaler.transform(pd.DataFrame({"Amount": [new_txn_raw["Amount"]]}))[0][0],
        "Merchant": _encode("Merchant"),
        "Cardholder_Name": _encode("Cardholder_Name"),
        "Transaction_Type": _encode("Transaction_Type"),
        "Device_Used": _encode("Device_Used"),
        "Location": _encode("Location"),
        "Card_Number": df_temp['Card_Number'].max() + 1,
        "IP_Address": df_temp['IP_Address'].max() + 1,
        "Fraud_Label": 0,  # placeholder; the label column is not a model input
        "Transaction_ID": 999999,
        "Timestamp": pd.to_datetime("2025-01-01 00:00:00")
    }

    # Append the new transaction to the temp DataFrame (it becomes the last row).
    df_temp = pd.concat([df_temp, pd.DataFrame([new_txn])], ignore_index=True)

    for col in link_attrs:
        df_temp[col] = LabelEncoder().fit_transform(df_temp[col])

    # Check if the new transaction shares connections with past frauds.
    new_idx = df_temp.index[-1]
    others = df_temp.drop(index=new_idx)
    known_fraud = others['Fraud_Label'] == 1
    fraud_neighbors = [
        attr for attr in link_attrs
        if ((others[attr] == df_temp.loc[new_idx, attr]) & known_fraud).any()
    ]

    # Build edge list based on shared entities (link-based). One groupby pass
    # per attribute replaces a full-frame scan for every unique value.
    edges = set()
    for attr in link_attrs:
        for members in df_temp.groupby(attr).groups.values():
            members = members.tolist()
            for pos, src in enumerate(members):
                for dst in members[pos + 1:]:
                    edges.add((src, dst))
                    edges.add((dst, src))

    # Convert edge list and features to tensors for PyTorch Geometric.
    edge_index = torch.tensor(list(edges), dtype=torch.long).t().contiguous()
    node_features = df_temp.drop(columns=['Transaction_ID', 'Fraud_Label', 'Timestamp'])
    X_tensor = torch.tensor(node_features.values, dtype=torch.float32)
    y_tensor = torch.tensor(df_temp['Fraud_Label'].values, dtype=torch.long)

    data_input = Data(x=X_tensor, edge_index=edge_index, y=y_tensor).to(device)

    model_gnn.eval()
    with torch.no_grad():  # inference only
        pred = model_gnn(data_input).argmax(dim=1)
    prediction = pred[-1].item()  # the new transaction is the last node

    result = "FRAUDULENT" if prediction == 1 else "LEGITIMATE"
    print(f"\n GNN Prediction: {result}")
    if fraud_neighbors:
        print(" Connected to previous frauds via:")
        for attr in fraud_neighbors:
            print(f"  - Shared {attr}")
    else:
        print(" No direct links to known frauds.")
    return result

In [None]:
# Score a modest-amount sample with the autoencoder. The cardholder is spelled
# "Amit Singhr" here (the next cell uses "Amit Singh") — presumably to exercise
# the unknown-category fallback; TODO confirm this is intentional.
txn_small_amount = {
    "Amount": 1200.00,
    "Merchant": "Amazon",
    "Cardholder_Name": "Amit Singhr",
    "Transaction_Type": "Online",
    "Device_Used": "Mobile",
    "Location": "Mumbai",
}
predict_autoencoder_with_reason(txn_small_amount)

In [None]:
# Score a sample with a very large amount through the autoencoder.
txn_large_amount = {
    "Amount": 999990.00,
    "Merchant": "Amazon",
    "Cardholder_Name": "Amit Singh",
    "Transaction_Type": "Online",
    "Device_Used": "Mobile",
    "Location": "Mumbai",
}
predict_autoencoder_with_reason(txn_large_amount)

In [None]:
# Score a sample transaction with the graph model.
txn_gnn_sample = {
    "Amount": 200.60,
    "Merchant": "Uber",
    "Cardholder_Name": "Vikas Verma",
    "Transaction_Type": "Online",
    "Device_Used": "Laptop",
    "Location": "Delhi",
}
predict_gnn_with_reason(txn_gnn_sample)

In [None]:
# Plot histogram of reconstruction errors with threshold line for anomaly detection
import matplotlib.pyplot as plt

In [None]:
# Histogram of the per-transaction reconstruction errors, with the decision
# threshold drawn as a dashed vertical line.
fig, ax = plt.subplots(figsize=(10, 5))
ax.hist(mse, bins=50, color='orange', edgecolor='black')
ax.axvline(threshold, color='red', linestyle='--', label='Threshold')
ax.set_title(" Reconstruction Error Distribution")
ax.set_xlabel("Reconstruction Error")
ax.set_ylabel("Number of Transactions")
ax.legend()
plt.show()

In [None]:
# Attach each test row's reconstruction error, true label and predicted label
# so the highest-error transactions can be inspected.
X_test_copy = X_test.assign(
    Reconstruction_Error=mse,
    Actual_Label=y_test.values,
    Predicted_Label=y_pred_ae,
)

In [None]:
# The ten test transactions with the largest reconstruction error.
ranked_by_error = X_test_copy.sort_values('Reconstruction_Error', ascending=False)
top_frauds = ranked_by_error.iloc[:10]
top_frauds

In [None]:
import seaborn as sns
from sklearn.metrics import confusion_matrix

In [None]:
# Confusion matrix of the autoencoder predictions, drawn as an annotated heatmap.
cm = confusion_matrix(y_test, y_pred_ae)
heatmap_ax = sns.heatmap(cm, annot=True, fmt='d', cmap='Blues')
heatmap_ax.set_title("Confusion Matrix - Autoencoder")
heatmap_ax.set_xlabel("Predicted")
heatmap_ax.set_ylabel("Actual")
plt.show()

In [None]:
# Persist only the autoencoder's learned parameters, not the model object.
autoencoder_weights = model.state_dict()
torch.save(autoencoder_weights, 'autoencoder_model.pth')

In [None]:
# Persist only the GCN's learned parameters, not the model object.
gnn_weights = model_gnn.state_dict()
torch.save(gnn_weights, 'gnn_model.pth')

In [None]:
# Function: plot_link_graph(df_graph)
import networkx as nx
import matplotlib.pyplot as plt

In [None]:
def plot_link_graph(df_graph):
    """Draw the transaction link graph, with fraud nodes in red and the rest in green.

    Every row of ``df_graph`` becomes a node. Two nodes are joined when they
    share a 'Card_Number', 'Merchant' or 'IP_Address' value. The figure is
    shown with ``plt.show()``; nothing is returned.
    """
    G = nx.Graph()

    # One node per transaction, tagged as fraud or legit from its label.
    for idx, is_fraud in df_graph['Fraud_Label'].eq(1).items():
        G.add_node(idx, label='fraud' if is_fraud else 'legit')

    # Join every pair of transactions that share a value of a linking
    # attribute. An edge added again for a later attribute keeps that later
    # attribute as its 'via' tag.
    for attr in ('Card_Number', 'Merchant', 'IP_Address'):
        column = df_graph[attr]
        for val in column.unique():
            members = df_graph.index[column == val].tolist()
            for pos, src in enumerate(members):
                for dst in members[pos + 1:]:
                    G.add_edge(src, dst, via=attr)

    colors = ['red' if label == 'fraud' else 'green' for _, label in G.nodes(data='label')]

    plt.figure(figsize=(14, 10))
    # A fixed seed keeps the force-directed layout identical between runs.
    pos = nx.spring_layout(G, seed=42, k=0.2)

    nx.draw_networkx_nodes(G, pos, node_color=colors, node_size=50, alpha=0.8)
    nx.draw_networkx_edges(G, pos, alpha=0.2)

    plt.title("🔗 Transaction Link Graph — Fraud (Red) vs Legit (Green)")
    plt.axis("off")
    plt.show()

In [None]:
# Render the link graph for the encoded transaction frame.
plot_link_graph(df_graph=df_graph)

In [None]:
import joblib

In [None]:
# Persist the fitted per-column label encoders for reuse outside this notebook.
joblib.dump(value=encoders, filename="encoders.pkl")

In [None]:
# Persist the fitted Amount scaler for reuse outside this notebook.
joblib.dump(value=scaler, filename="scaler.pkl")

In [None]:
# Confirmation message printed after the encoder and scaler dump cells.
print("✅ Encoders and Scaler saved")