# 🧠 Money Laundering Detection Pipeline
This notebook covers:
1. Sample dataset creation
2. Feature engineering
3. Anomaly detection (Isolation Forest)
4. Clustering (DBSCAN)
5. Transaction graph visualization (NetworkX)

In [None]:
# Step 1: Sample Data Generation
import pandas as pd
from datetime import datetime, timedelta
import random

# Fixed seed so that "Restart Kernel -> Run All" regenerates exactly the same
# synthetic dataset (and therefore the same downstream features and model output).
SEED = 42
# Dedicated generator: reproducible without touching the global `random` state.
rng = random.Random(SEED)

# Four synthetic accounts; position i in each list below belongs to accounts[i].
accounts = ['A100', 'B200', 'C300', 'D400']
emails = ['a@xyz.com', 'b@xyz.com', 'c@xyz.com', 'd@xyz.com']
ips = ['192.168.1.10', '192.168.1.20', '192.168.1.30', '192.168.1.40']
phones = ['9999999999', '8888888888', '7777777777', '6666666666']

# Per-account contact details, resolved once instead of repeating
# accounts.index(...) six times for every generated row.
profiles = {
    acct: {"ip": ip, "email": email, "phone": phone}
    for acct, ip, email, phone in zip(accounts, ips, emails, phones)
}

data = []
base_time = datetime(2025, 5, 25, 10, 0)

for i in range(20):
    sender = rng.choice(accounts)
    # The receiver is always a different account than the sender.
    receiver = rng.choice([a for a in accounts if a != sender])
    amount = rng.randint(9000, 10000)  # inclusive on both ends
    # One transaction every 10 minutes; named `txn_time` to avoid shadowing
    # the stdlib `time` module name.
    txn_time = base_time + timedelta(minutes=i * 10)
    data.append({
        "txn_id": i + 1,
        "sender_acct": sender,
        "receiver_acct": receiver,
        "amount": amount,
        "timestamp": txn_time,
        "sender_ip": profiles[sender]["ip"],
        "receiver_ip": profiles[receiver]["ip"],
        "sender_email": profiles[sender]["email"],
        "receiver_email": profiles[receiver]["email"],
        "sender_phone": profiles[sender]["phone"],
        "receiver_phone": profiles[receiver]["phone"]
    })

# Build the frame in one go from the list of records and persist it for the next step.
df = pd.DataFrame(data)
df.to_csv("transactions.csv", index=False)
df.head()

In [None]:
# Step 2: Feature Engineering
import numpy as np
# Reload the persisted transactions, order them chronologically, and derive the
# row-to-row features in a single chained expression.
df = (
    pd.read_csv("transactions.csv", parse_dates=["timestamp"])
    .sort_values("timestamp")
    .assign(
        # Minutes elapsed since the previous transaction (NaN on the first row).
        time_diff_minutes=lambda t: t["timestamp"].diff().dt.total_seconds() / 60,
        # Change in amount versus the previous transaction (0 on the first row).
        amount_diff=lambda t: t["amount"].diff().fillna(0),
        # 1 when both sides of the transaction share the same IP / phone, else 0.
        same_ip=lambda t: t["sender_ip"].eq(t["receiver_ip"]).astype(int),
        same_phone=lambda t: t["sender_phone"].eq(t["receiver_phone"]).astype(int),
    )
)

def detect_loop(df):
    """Flag transactions whose receiver has already appeared as a sender.

    Rows are scanned in the frame's current order. A row is flagged when its
    ``receiver_acct`` was the ``sender_acct`` of any *earlier* row; the row's
    own sender is only recorded after the row has been evaluated.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``sender_acct`` and ``receiver_acct`` columns.

    Returns
    -------
    list of int
        One entry per row, in row order: 1 if flagged, otherwise 0.
        An empty frame yields an empty list.
    """
    seen_senders = set()
    flags = []
    # Iterate over the two needed columns directly; iterrows() would build a
    # full Series for every row just to read two values.
    for sender, receiver in zip(df['sender_acct'], df['receiver_acct']):
        flags.append(1 if receiver in seen_senders else 0)
        seen_senders.add(sender)
    return flags
# 1 when the receiver already acted as a sender in an earlier row, else 0.
df['is_loop_txn'] = detect_loop(df)

# Per-sender running features, computed in the frame's current row order.
by_sender = df.groupby('sender_acct')
running_txn_number = by_sender.cumcount() + 1
rolling_amount_mean = by_sender['amount'].transform(
    # Mean of the sender's last (up to) three amounts, current row included.
    lambda amounts: amounts.rolling(3, min_periods=1).mean()
)
df['sender_txn_count'] = running_txn_number
df['sender_amount_avg'] = rolling_amount_mean

# Ratio of this amount to the previous row's amount; the small epsilon guards
# against division by zero, and the first row stays NaN (no previous row).
previous_amount = df['amount'].shift(1)
df['amount_ratio_prev'] = df['amount'] / (previous_amount + 1e-6)

df.head()

In [None]:
# Step 3: Anomaly Detection (Isolation Forest)
from sklearn.ensemble import IsolationForest
from sklearn.preprocessing import StandardScaler

# Columns fed to the anomaly detector (and, via X_scaled, to later steps).
features = [
    "amount", "time_diff_minutes", "amount_diff", "same_ip", "same_phone",
    "is_loop_txn", "sender_txn_count", "sender_amount_avg", "amount_ratio_prev"
]

# The first row has no predecessor, so `time_diff_minutes` and
# `amount_ratio_prev` are NaN there. Zero-filling them would make that row an
# artificial outlier (a gap of 0 and a ratio of 0 while every other row has a
# real gap and a ratio near 1), so use neutral stand-ins instead. Any NaN that
# remains afterwards still falls back to 0, as before.
neutral_fill = {
    "time_diff_minutes": df["time_diff_minutes"].median(),
    "amount_ratio_prev": 1.0,
}
X = df[features].fillna(neutral_fill).fillna(0)

# Standardize so that no single feature dominates purely because of its scale.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# contamination=0.1 -> roughly 10% of rows are labelled as outliers;
# random_state pins the forest so results are reproducible.
iso = IsolationForest(contamination=0.1, random_state=42)
# NOTE: despite its name, this column holds the fit_predict label
# (-1 = outlier, 1 = inlier), not a continuous score.
df["anomaly_score"] = iso.fit_predict(X_scaled)
df["is_anomaly"] = df["anomaly_score"] == -1

# `is_anomaly` is already boolean, so it can be used directly as the row mask.
df[df["is_anomaly"]]

In [None]:
# Step 4: Clustering (DBSCAN)
from sklearn.cluster import DBSCAN

# Density-based clustering on the scaled feature matrix: points within eps=1.5
# of each other are neighbours, and a core point needs at least 3 of them.
clustering = DBSCAN(eps=1.5, min_samples=3)
clustering.fit(X_scaled)

# One label per transaction; DBSCAN marks noise points with -1.
df["cluster"] = clustering.labels_
df.loc[:, ["txn_id", "sender_acct", "receiver_acct", "cluster"]]

In [None]:
# Step 5: Visualization with NetworkX
import networkx as nx
import matplotlib.pyplot as plt

# A DiGraph stores a single edge per (sender, receiver) pair, so building it
# straight from the raw rows would silently keep only the *last* transaction's
# amount whenever a pair repeats. Sum the amounts per pair first so that each
# edge carries the total flow between the two accounts.
edge_totals = (
    df.groupby(["sender_acct", "receiver_acct"], as_index=False)["amount"].sum()
)

G = nx.from_pandas_edgelist(
    edge_totals,
    source="sender_acct",
    target="receiver_acct",
    edge_attr="amount",
    create_using=nx.DiGraph()
)

plt.figure(figsize=(10, 7))
# Seeded layout so the figure looks the same on every run.
pos = nx.spring_layout(G, seed=42)
nx.draw(G, pos, with_labels=True, node_color="skyblue", node_size=2000, arrowsize=20)
# label_pos away from the midpoint keeps the A->B and B->A labels from being
# drawn on top of each other when both directions exist.
nx.draw_networkx_edge_labels(
    G,
    pos,
    edge_labels=nx.get_edge_attributes(G, "amount"),
    label_pos=0.3,
)
plt.title("Transaction Flow Graph")
plt.show()