<a href="https://colab.research.google.com/github/aishwarya-walimbe/Fraud-Detetection-Using-GNN/blob/main/preprocess.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Mount Google Drive at /content/drive so later cells can read files stored there.
# Requires the google.colab package.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
import os
# Make the project folder on the mounted Drive the working directory.
cur_path = "/content/drive/MyDrive/Fraud_Detection_Project/"
os.chdir(cur_path)
# Print the working directory to confirm the chdir took effect.
!pwd

/content/drive/MyDrive/Fraud_Detection_Project


In [3]:
!pip install torch_geometric
import numpy as np
import pandas as pd
import torch
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from torch_geometric.data import Data
from torch_geometric.data import HeteroData

Collecting torch_geometric
  Downloading torch_geometric-2.7.0-py3-none-any.whl.metadata (63 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/63.7 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m63.7/63.7 kB[0m [31m2.7 MB/s[0m eta [36m0:00:00[0m
Downloading torch_geometric-2.7.0-py3-none-any.whl (1.3 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m32.9 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: torch_geometric
Successfully installed torch_geometric-2.7.0


In [4]:
# Absolute path to paysim.csv inside the Drive project folder.
file_path = "/content/drive/MyDrive/Fraud_Detection_Project/paysim.csv"

In [5]:
def load_data(file_path: str, sample_size: int = 200_000) -> pd.DataFrame:
    """
    Read the CSV at ``file_path`` and down-sample the non-fraud class.

    Every row with ``isFraud == 1`` is kept. The rest of the ``sample_size``
    budget is filled with a seeded (``random_state=42``) random sample of
    ``isFraud == 0`` rows. This is fraud-preserving undersampling, not a
    proportional stratified sample.

    Parameters
    ----------
    file_path : str
        Path of a CSV file that contains an ``isFraud`` column.
    sample_size : int, default 200_000
        Target number of rows. If the file holds at least this many fraud
        rows, all fraud rows are still returned and no non-fraud rows are
        added; if it holds too few non-fraud rows, all of them are used.

    Returns
    -------
    pd.DataFrame
        Fraud rows followed by the sampled non-fraud rows, with a fresh
        0..n-1 index.
    """
    df = pd.read_csv(file_path)

    is_fraud    = df["isFraud"] == 1
    fraud_df    = df[is_fraud]
    normal_pool = df[df["isFraud"] == 0]

    # Clamp the request: never negative (more fraud rows than the budget) and
    # never larger than the pool (sample() without replacement would raise).
    n_normal    = min(max(sample_size - len(fraud_df), 0), len(normal_pool))
    nonfraud_df = normal_pool.sample(n=n_normal, random_state=42)

    df_small = pd.concat([fraud_df, nonfraud_df]).reset_index(drop=True)

    # Guard the percentage against an empty result (header-only file).
    n_fraud   = fraud_df.shape[0]
    fraud_pct = n_fraud / len(df_small) * 100 if len(df_small) else 0.0
    print(f"[load_data] rows={len(df_small):,}  fraud={n_fraud:,}  "
          f"({fraud_pct:.2f}%)")
    return df_small




In [6]:
def _node_features(df: pd.DataFrame, src_ids: np.ndarray,
                   dst_ids: np.ndarray, num_nodes: int) -> np.ndarray:
    """Aggregate per-account transaction statistics into a (num_nodes, 12) float64 array."""
    nf     = np.zeros((num_nodes, 12), dtype=np.float64)
    amount = df["amount"].to_numpy(dtype=np.float64)

    # Sent-side: total amount, transaction count, largest amount, and the
    # summed positive drop of the origin balance.
    nf[:, 0] = np.bincount(src_ids, weights=amount, minlength=num_nodes)
    nf[:, 2] = np.bincount(src_ids, minlength=num_nodes)
    sent_max = np.zeros(num_nodes, dtype=np.float64)
    np.fmax.at(sent_max, src_ids, amount)   # running max, starts at 0
    nf[:, 6] = sent_max
    org_drop = np.fmax(
        df["oldbalanceOrg"].to_numpy(dtype=np.float64)
        - df["newbalanceOrig"].to_numpy(dtype=np.float64),
        0.0,
    )
    nf[:, 8] = np.bincount(src_ids, weights=org_drop, minlength=num_nodes)

    # Received-side: same statistics keyed by the destination account.
    nf[:, 1] = np.bincount(dst_ids, weights=amount, minlength=num_nodes)
    nf[:, 3] = np.bincount(dst_ids, minlength=num_nodes)
    recv_max = np.zeros(num_nodes, dtype=np.float64)
    np.fmax.at(recv_max, dst_ids, amount)
    nf[:, 7] = recv_max
    # Destination balances are optional; column 9 stays 0 when they are absent.
    if {"oldbalanceDest", "newbalanceDest"} <= set(df.columns):
        dest_drop = np.fmax(
            df["oldbalanceDest"].to_numpy(dtype=np.float64)
            - df["newbalanceDest"].to_numpy(dtype=np.float64),
            0.0,
        )
        nf[:, 9] = np.bincount(dst_ids, weights=dest_drop, minlength=num_nodes)

    # Averages (0 for accounts with no transactions on that side)
    nf[:, 4] = np.divide(nf[:, 0], nf[:, 2],
                         out=np.zeros(num_nodes), where=nf[:, 2] != 0)
    nf[:, 5] = np.divide(nf[:, 1], nf[:, 3],
                         out=np.zeros(num_nodes), where=nf[:, 3] != 0)

    # Unique counterparty counts: de-duplicate (src, dst) pairs once, then
    # count how often each account appears on either side.
    pairs = np.unique(np.column_stack([src_ids, dst_ids]), axis=0)
    nf[:, 10] = np.bincount(pairs[:, 0], minlength=num_nodes)
    nf[:, 11] = np.bincount(pairs[:, 1], minlength=num_nodes)

    return nf


def build_graph(df: pd.DataFrame) -> Data:
    """
    Turn a transaction table into a homogeneous account graph.

    Each distinct value of ``nameOrig`` / ``nameDest`` becomes a node and each
    row becomes a directed edge origin → destination.

    Edge features (standard-scaled):
        ``[amount, type_enc, hour, oldbalanceOrg, newbalanceOrig]`` where
        ``type_enc`` is an integer code for ``type`` and ``hour = step % 24``.

    Node features (12 columns, standard-scaled, NaN/Inf replaced by 0):
        0  total amount sent            1  total amount received
        2  number of sent transactions  3  number of received transactions
        4  mean amount sent             5  mean amount received
        6  max amount sent              7  max amount received
        8  sum of positive (oldbalanceOrg - newbalanceOrig) as origin
        9  sum of positive (oldbalanceDest - newbalanceDest) as destination
        10 distinct destinations paid   11 distinct origins received from

    Labels:
        A node is labelled 1 if it is the origin of at least one row with
        ``isFraud == 1``; destination accounts are not labelled by this rule.

    Parameters
    ----------
    df : pd.DataFrame
        Must contain ``nameOrig``, ``nameDest``, ``type``, ``step``,
        ``amount``, ``oldbalanceOrg``, ``newbalanceOrig`` and ``isFraud``.
        ``oldbalanceDest`` / ``newbalanceDest`` are used when present.
        The input frame is not modified.

    Returns
    -------
    Data
        Graph with ``x``, ``edge_index``, ``edge_attr``, ``y`` and boolean
        ``train_mask`` / ``val_mask`` / ``test_mask`` attributes.

    Notes
    -----
    Both scalers are fit on all edges / nodes before the split, so the
    validation and test nodes contribute to the scaling statistics.
    """
    # ── 2a. Account → integer ID ──────────────────────────────
    all_accounts = pd.concat([df["nameOrig"], df["nameDest"]]).unique()
    account2id   = {acc: i for i, acc in enumerate(all_accounts)}
    num_nodes    = len(account2id)

    # ── 2b. Encode transaction type ───────────────────────────
    type_map = {t: i for i, t in enumerate(df["type"].unique())}
    df = df.copy()
    df["type_enc"] = df["type"].map(type_map)

    # Extract hour from 'step' column (step ≈ hours since simulation start)
    df["hour"] = df["step"] % 24

    # ── 2c. Build edge_index + edge_attr ─────────────────────
    src_ids = df["nameOrig"].map(account2id).to_numpy(dtype=np.int64)
    dst_ids = df["nameDest"].map(account2id).to_numpy(dtype=np.int64)

    edge_index = torch.tensor(
        np.vstack([src_ids, dst_ids]), dtype=torch.long
    )

    # Edge features: [amount, type_enc, hour, oldbalanceOrg, newbalanceOrig]
    edge_attr_raw = df[
        ["amount", "type_enc", "hour", "oldbalanceOrg", "newbalanceOrig"]
    ].values.astype(np.float32)

    scaler_edge  = StandardScaler()
    edge_attr_sc = scaler_edge.fit_transform(edge_attr_raw)
    edge_attr    = torch.tensor(edge_attr_sc, dtype=torch.float)

    # ── 2d. Node features ─────────────────────────────────────
    nf = _node_features(df, src_ids, dst_ids, num_nodes)

    scaler_node = StandardScaler()
    nf_scaled   = scaler_node.fit_transform(nf)

    # Guard against NaN / Inf after scaling
    nf_scaled = np.nan_to_num(nf_scaled, nan=0.0, posinf=0.0, neginf=0.0)

    x = torch.tensor(nf_scaled, dtype=torch.float)

    # ── 2e. Labels ────────────────────────────────────────────
    # An account is fraudulent if it originated any fraudulent transaction.
    labels = np.zeros(num_nodes, dtype=np.int64)
    labels[src_ids[df["isFraud"].to_numpy() == 1]] = 1

    y = torch.tensor(labels, dtype=torch.long)

    # ── 2f. 3-way split: 70% train / 10% val / 20% test ──────
    # The second split takes 0.667 of the held-out 30%, i.e. ≈20% of all nodes.
    idx            = np.arange(num_nodes)
    train_idx, tmp = train_test_split(idx, test_size=0.30,
                                      stratify=labels, random_state=42)
    val_idx, test_idx = train_test_split(tmp, test_size=0.667,
                                          stratify=labels[tmp],
                                          random_state=42)

    train_mask = torch.zeros(num_nodes, dtype=torch.bool)
    val_mask   = torch.zeros(num_nodes, dtype=torch.bool)
    test_mask  = torch.zeros(num_nodes, dtype=torch.bool)

    train_mask[torch.as_tensor(train_idx, dtype=torch.long)] = True
    val_mask[torch.as_tensor(val_idx, dtype=torch.long)]     = True
    test_mask[torch.as_tensor(test_idx, dtype=torch.long)]   = True

    data = Data(x=x, edge_index=edge_index, edge_attr=edge_attr, y=y)
    data.train_mask = train_mask
    data.val_mask   = val_mask
    data.test_mask  = test_mask

    print(f"[build_graph] nodes={num_nodes:,}  edges={edge_index.shape[1]:,}  "
          f"fraud_nodes={int(y.sum()):,}")
    print(f"  train={int(train_mask.sum()):,}  "
          f"val={int(val_mask.sum()):,}  "
          f"test={int(test_mask.sum()):,}")

    return data
