In [1]:
import pandas as pd

# Read the SAML-D CSV (path is relative to the working directory) into `df`.
df = pd.read_csv("SAML-D.csv")

In [2]:
# List the column labels exactly as they were read from the CSV.
df.columns

Index(['Time', 'Date', 'Sender_account', 'Receiver_account', 'Amount',
       'Payment_currency', 'Received_currency', 'Sender_bank_location',
       'Receiver_bank_location', 'Payment_type', 'Is_laundering',
       'Laundering_type'],
      dtype='object')

In [3]:
# Show the dtype pandas inferred for each column.
df.dtypes

Time                       object
Date                       object
Sender_account              int64
Receiver_account            int64
Amount                    float64
Payment_currency           object
Received_currency          object
Sender_bank_location       object
Receiver_bank_location     object
Payment_type               object
Is_laundering               int64
Laundering_type            object
dtype: object

In [4]:
# (rows, columns) of the raw table.
df.shape

(9504852, 12)

In [5]:
# Preview the first ten rows.
df.head(10)

Unnamed: 0,Time,Date,Sender_account,Receiver_account,Amount,Payment_currency,Received_currency,Sender_bank_location,Receiver_bank_location,Payment_type,Is_laundering,Laundering_type
0,10:35:19,2022-10-07,8724731955,2769355426,1459.15,UK pounds,UK pounds,UK,UK,Cash Deposit,0,Normal_Cash_Deposits
1,10:35:20,2022-10-07,1491989064,8401255335,6019.64,UK pounds,Dirham,UK,UAE,Cross-border,0,Normal_Fan_Out
2,10:35:20,2022-10-07,287305149,4404767002,14328.44,UK pounds,UK pounds,UK,UK,Cheque,0,Normal_Small_Fan_Out
3,10:35:21,2022-10-07,5376652437,9600420220,11895.0,UK pounds,UK pounds,UK,UK,ACH,0,Normal_Fan_In
4,10:35:21,2022-10-07,9614186178,3803336972,115.25,UK pounds,UK pounds,UK,UK,Cash Deposit,0,Normal_Cash_Deposits
5,10:35:21,2022-10-07,8974559268,3143547511,5130.99,UK pounds,UK pounds,UK,UK,ACH,0,Normal_Group
6,10:35:23,2022-10-07,980191499,8577635959,12176.52,UK pounds,UK pounds,UK,UK,ACH,0,Normal_Small_Fan_Out
7,10:35:23,2022-10-07,8057793308,9350896213,56.9,UK pounds,UK pounds,UK,UK,Credit card,0,Normal_Small_Fan_Out
8,10:35:26,2022-10-07,6116657264,656192169,4738.45,UK pounds,UK pounds,UK,UK,Cheque,0,Normal_Fan_Out
9,10:35:29,2022-10-07,7421451752,2755709071,5883.87,Indian rupee,UK pounds,UK,UK,Credit card,0,Normal_Fan_Out


In [6]:
# Missing-value count per column.
df.isna().sum()

Time                      0
Date                      0
Sender_account            0
Receiver_account          0
Amount                    0
Payment_currency          0
Received_currency         0
Sender_bank_location      0
Receiver_bank_location    0
Payment_type              0
Is_laundering             0
Laundering_type           0
dtype: int64

In [7]:
# Number of distinct account ids on the sending and on the receiving side.
sender_count = df["Sender_account"].nunique()
receiver_count = df["Receiver_account"].nunique()
sender_count, receiver_count

(292715, 652266)

In [8]:
# Normalise every column label to lower case.
df.columns = [label.lower() for label in df.columns]

In [9]:
# Confirm the labels after lower-casing.
df.columns

Index(['time', 'date', 'sender_account', 'receiver_account', 'amount',
       'payment_currency', 'received_currency', 'sender_bank_location',
       'receiver_bank_location', 'payment_type', 'is_laundering',
       'laundering_type'],
      dtype='object')

In [10]:
# --- If your column names contain spaces, you can use this snippet to normalise them --- #
# df.columns = df.columns.str.lower()
# df.columns = df.columns.str.replace(" ", "_")
# df.columns

In [11]:
# Combine the separate date and time strings into one timestamp column.
# errors="coerce" turns anything unparseable into NaT instead of raising.
date_text = df["date"].astype(str)
time_text = df["time"].astype(str)
df["tx_ts"] = pd.to_datetime(date_text + " " + time_text, errors="coerce")
df["tx_ts"].head(10)

0   2022-10-07 10:35:19
1   2022-10-07 10:35:20
2   2022-10-07 10:35:20
3   2022-10-07 10:35:21
4   2022-10-07 10:35:21
5   2022-10-07 10:35:21
6   2022-10-07 10:35:23
7   2022-10-07 10:35:23
8   2022-10-07 10:35:26
9   2022-10-07 10:35:29
Name: tx_ts, dtype: datetime64[ns]

In [12]:
# Spot-check the parsed timestamps: a few values, the NaT count, and the raw inputs.
ts_preview = df["tx_ts"].head(5)
unparsed_count = df["tx_ts"].isna().sum()
raw_preview = df[["date", "time"]].head(3)
ts_preview, unparsed_count, raw_preview

(0   2022-10-07 10:35:19
 1   2022-10-07 10:35:20
 2   2022-10-07 10:35:20
 3   2022-10-07 10:35:21
 4   2022-10-07 10:35:21
 Name: tx_ts, dtype: datetime64[ns],
 np.int64(0),
          date      time
 0  2022-10-07  10:35:19
 1  2022-10-07  10:35:20
 2  2022-10-07  10:35:20)

In [13]:
""" Clean text columns (strip + normalize)

Run this:

text_cols = [
    "payment_currency", "received_currency",
    "sender_bank_location", "receiver_bank_location",
    "payment_type", "laundering_type"
]

for c in text_cols:
    df[c] = df[c].astype(str).str.strip()

# quick check: show unique examples after cleaning
{c: df[c].dropna().unique()[:10] for c in text_cols} """

' Clean text columns (strip + normalize)\n\nRun this:\n\ntext_cols = [\n    "payment_currency", "received_currency",\n    "sender_bank_location", "receiver_bank_location",\n    "payment_type", "laundering_type"\n]\n\nfor c in text_cols:\n    df[c] = df[c].astype(str).str.strip()\n\n# quick check: show unique examples after cleaning\n{c: df[c].dropna().unique()[:10] for c in text_cols} '

In [14]:
# Free-text / categorical columns that get whitespace-trimmed below.
text_cols = [
    "payment_currency",
    "received_currency",
    "sender_bank_location",
    "receiver_bank_location",
    "payment_type",
    "laundering_type",
]

# Cast to str, strip leading/trailing whitespace, then print up to ten
# distinct values per column as a quick visual check.
for col in text_cols:
    cleaned = df[col].astype(str).str.strip()
    df[col] = cleaned
    print(cleaned.unique()[:10])
    print("\n")

['UK pounds' 'Indian rupee' 'Albanian lek' 'Swiss franc' 'Pakistani rupee'
 'Naira' 'Yen' 'Euro' 'Dirham' 'Mexican Peso']


['UK pounds' 'Dirham' 'Pakistani rupee' 'Euro' 'US dollar' 'Mexican Peso'
 'Indian rupee' 'Albanian lek' 'Turkish lira' 'Naira']


['UK' 'Albania' 'Nigeria' 'Japan' 'Spain' 'Switzerland' 'UAE' 'Italy'
 'France' 'Pakistan']


['UK' 'UAE' 'Spain' 'France' 'USA' 'Mexico' 'Albania' 'Turkey' 'Nigeria'
 'Switzerland']


['Cash Deposit' 'Cross-border' 'Cheque' 'ACH' 'Credit card' 'Debit card'
 'Cash Withdrawal']


['Normal_Cash_Deposits' 'Normal_Fan_Out' 'Normal_Small_Fan_Out'
 'Normal_Fan_In' 'Normal_Group' 'Normal_Cash_Withdrawal'
 'Normal_Periodical' 'Normal_Foward' 'Normal_Mutual' 'Smurfing']




In [15]:
# Class balance of the laundering flag (how many rows carry 0 vs 1).
df["is_laundering"].value_counts()


is_laundering
0    9494979
1       9873
Name: count, dtype: int64

In [16]:
# Typology breakdown of the flagged rows only (capped at the 50 most common).
flagged_mask = df["is_laundering"] == 1
df.loc[flagged_mask, "laundering_type"].value_counts().head(50)

laundering_type
Structuring             1870
Cash_Withdrawal         1334
Deposit-Send             945
Smurfing                 932
Layered_Fan_In           656
Layered_Fan_Out          529
Stacked Bipartite        506
Behavioural_Change_1     394
Bipartite                383
Cycle                    382
Fan_In                   364
Gather-Scatter           354
Behavioural_Change_2     345
Scatter-Gather           338
Single_large             250
Fan_Out                  237
Over-Invoicing            54
Name: count, dtype: int64

In [17]:
# Same typology breakdown of the flagged rows, without a cap on the number shown.
df.loc[df["is_laundering"].eq(1), "laundering_type"].value_counts()

laundering_type
Structuring             1870
Cash_Withdrawal         1334
Deposit-Send             945
Smurfing                 932
Layered_Fan_In           656
Layered_Fan_Out          529
Stacked Bipartite        506
Behavioural_Change_1     394
Bipartite                383
Cycle                    382
Fan_In                   364
Gather-Scatter           354
Behavioural_Change_2     345
Scatter-Gather           338
Single_large             250
Fan_Out                  237
Over-Invoicing            54
Name: count, dtype: int64

In [18]:
# Amount sanity: count of non-positive amounts, plus summary statistics.
non_positive_count = df["amount"].le(0).sum()
non_positive_count, df["amount"].describe()

(np.int64(0),
 count    9.504852e+06
 mean     8.762968e+03
 std      2.561495e+04
 min      3.730000e+00
 25%      2.143688e+03
 50%      6.113720e+03
 75%      1.045846e+04
 max      1.261850e+07
 Name: amount, dtype: float64)

In [19]:
# Laundering-flag sanity: the distinct values present and how often each occurs.
flag = df["is_laundering"]
flag.unique(), flag.value_counts()


(array([0, 1]),
 is_laundering
 0    9494979
 1       9873
 Name: count, dtype: int64)

In [20]:
# Sampling: reduce the data from ~10M rows to 1M rows

In [21]:
# All transactions flagged as laundering.
laundering_tx = df.loc[df["is_laundering"] == 1]

# Every account that appears on either side of a flagged transaction.
flagged_senders = pd.Index(laundering_tx["sender_account"])
flagged_receivers = pd.Index(laundering_tx["receiver_account"])
laundering_accounts = flagged_senders.union(flagged_receivers)

len(laundering_tx), laundering_accounts.size


(9873, 18079)

In [22]:
# Every transaction (flagged or not) that touches one of the flagged accounts.
sender_is_flagged = df["sender_account"].isin(laundering_accounts)
receiver_is_flagged = df["receiver_account"].isin(laundering_accounts)
tx_laundering_accounts = df[sender_is_flagged | receiver_is_flagged]

tx_laundering_accounts.shape


(1486495, 13)

In [23]:
# Flag distribution and shape of the flagged-account subset.
tx_laundering_accounts["is_laundering"].value_counts(), tx_laundering_accounts.shape


(is_laundering
 0    1476622
 1       9873
 Name: count, dtype: int64,
 (1486495, 13))

In [24]:
# Split the flagged-account subset by the laundering flag.
is_flagged = tx_laundering_accounts["is_laundering"] == 1
is_clean = tx_laundering_accounts["is_laundering"] == 0

tx_laundering = tx_laundering_accounts[is_flagged]
tx_non_laundering = tx_laundering_accounts[is_clean]

tx_laundering.shape, tx_non_laundering.shape


((9873, 13), (1476622, 13))

In [25]:
# Number of most-recent non-laundering transactions kept per flagged account
# (applied separately to the sender side and to the receiver side).
N_RECENT_TX = 30


def _last_n_per_account(frame, account_col, accounts, n):
    """Return the last `n` rows of `frame` for each account in `accounts`.

    Parameters
    ----------
    frame : DataFrame, already sorted so that "last" means "most recent".
    account_col : name of the column holding the account id to group on.
    accounts : collection of account ids to keep.
    n : maximum number of trailing rows kept per account.
    """
    involved = frame[frame[account_col].isin(accounts)]
    return involved.groupby(account_col).tail(n)


# sort by time first so tail() picks the latest rows of each account
tx_non_laundering_sorted = tx_non_laundering.sort_values("tx_ts")

# keep last N txns per sender laundering account
sender_recent = _last_n_per_account(
    tx_non_laundering_sorted, "sender_account", laundering_accounts, N_RECENT_TX
)

# keep last N txns per receiver laundering account
receiver_recent = _last_n_per_account(
    tx_non_laundering_sorted, "receiver_account", laundering_accounts, N_RECENT_TX
)

# combine and deduplicate: a row selected on both sides appears twice here.
# NOTE(review): drop_duplicates() compares column values, not index labels, so
# two distinct transactions with identical fields would also be collapsed.
tx_non_laundering_trimmed = pd.concat(
    [sender_recent, receiver_recent]
).drop_duplicates()

tx_non_laundering_trimmed.shape


(278916, 13)

In [26]:
# Core set: every flagged transaction plus the trimmed clean history of the
# flagged accounts.
core_parts = [tx_laundering, tx_non_laundering_trimmed]
tx_core = pd.concat(core_parts).drop_duplicates()

tx_core.shape


(288789, 13)

In [27]:
# Pool of "safe" rows: not flagged, and neither party is a flagged account.
not_flagged = df["is_laundering"] == 0
sender_clean = ~df["sender_account"].isin(laundering_accounts)
receiver_clean = ~df["receiver_account"].isin(laundering_accounts)

safe_pool = df[not_flagged & sender_clean & receiver_clean]

safe_pool.shape


(8018357, 13)

In [28]:
import numpy as np

# Work on an independent copy so adding a column does not touch a slice of df.
safe_pool = safe_pool.copy()

# Quantile-based amount buckets (equal-frequency), used as a stratification key.
bucket_labels = ["very_low", "low", "medium", "high", "very_high"]
safe_pool["amount_bucket"] = pd.qcut(
    safe_pool["amount"], q=5, labels=bucket_labels
)

safe_pool["amount_bucket"].value_counts()


amount_bucket
medium       1603675
very_low     1603674
very_high    1603671
low          1603669
high         1603668
Name: count, dtype: int64

In [29]:
# Overall size of the reduced dataset.
TARGET_TOTAL_ROWS = 1_000_000

# how many safe rows we need: whatever is left after the core set is included
# (derived from tx_core instead of hard-coding the difference)
target_safe_rows = TARGET_TOTAL_ROWS - len(tx_core)

# calculate sampling fraction
sample_frac = target_safe_rows / len(safe_pool)

# Columns that define the strata.
strata_cols = ["amount_bucket", "payment_type", "payment_currency", "sender_bank_location"]

# Draw the same fraction from every stratum.
# - observed=True: amount_bucket is categorical; state explicitly that only
#   combinations present in the data form groups.
# - the explicit column selection keeps the grouping columns in what apply()
#   receives, so the sampled rows retain every column.
# Per-group rounding means the total can differ slightly from target_safe_rows.
safe_sample = (
    safe_pool
    .groupby(strata_cols, group_keys=False, observed=True)[list(safe_pool.columns)]
    .apply(lambda x: x.sample(frac=sample_frac, random_state=42))
)

safe_sample.shape


  .groupby(
  .apply(lambda x: x.sample(frac=sample_frac, random_state=42))


(711036, 14)

In [30]:
# how many more rows needed to reach the safe-row target; reuses
# target_safe_rows from the sampling cell instead of repeating the literal
remaining = target_safe_rows - len(safe_sample)
remaining


175

In [31]:
# Safe rows that the stratified draw did not pick.
safe_remaining_pool = safe_pool.drop(index=safe_sample.index)

# Top up with a plain random draw to close the gap left by per-group rounding.
safe_topup = safe_remaining_pool.sample(n=remaining, random_state=42)

safe_sample_final = pd.concat([safe_sample, safe_topup], axis=0)

safe_sample_final.shape


(711211, 14)

In [32]:
# Stack the core set and the safe sample under a fresh 0..n-1 index, then
# remove the helper column that was only needed for stratification.
combined = pd.concat([tx_core, safe_sample_final], ignore_index=True)
final_tx = combined.drop(columns="amount_bucket")

final_tx.shape, final_tx["is_laundering"].value_counts()


((1000000, 13),
 is_laundering
 0    990127
 1      9873
 Name: count, dtype: int64)

In [33]:
# Sampling is done; next is feature engineering.

In [34]:
# Copy the sampled frame, order it chronologically, and index it by timestamp
# for time-based operations.
tx = (
    final_tx
    .copy()
    .sort_values("tx_ts")
    .set_index("tx_ts")
)

tx.head(3)


Unnamed: 0_level_0,time,date,sender_account,receiver_account,amount,payment_currency,received_currency,sender_bank_location,receiver_bank_location,payment_type,is_laundering,laundering_type
tx_ts,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
2022-10-07 10:35:20,10:35:20,2022-10-07,1491989064,8401255335,6019.64,UK pounds,Dirham,UK,UAE,Cross-border,0,Normal_Fan_Out
2022-10-07 10:35:23,10:35:23,2022-10-07,8057793308,9350896213,56.9,UK pounds,UK pounds,UK,UK,Credit card,0,Normal_Small_Fan_Out
2022-10-07 10:35:37,10:35:37,2022-10-07,6715177555,4460925916,586.28,UK pounds,UK pounds,UK,UK,Cheque,0,Normal_Small_Fan_Out


In [35]:
# Move tx_ts out of the index (set in the previous cell); rows stay in time order.
tx = tx.reset_index()   # tx_ts becomes a column again

In [None]:
# NOTE(review): this lists the columns of the full `df`, not the sampled `tx`
# frame built above — confirm which one was intended.
df.columns