In [1]:
import ast
import os
import re
from datetime import datetime

import numpy as np
import pandas as pd
from dotenv import load_dotenv
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.utils.validation import check_is_fitted

# Load data

In [132]:
load_dotenv()

# DATA_PATH is the project data directory,
# e.g. ~/Real-Time-Anti-Money-Laundering-Detection/data
data_path = os.getenv("DATA_PATH")
if not data_path:
    # os.path.expanduser(None) would otherwise fail with an unhelpful TypeError.
    raise RuntimeError(
        "DATA_PATH is not set; define it in the environment or in a .env file"
    )

df = pd.read_csv(
    os.path.join(os.path.expanduser(data_path), "raw/AMLNet_August_2025.csv"),
    nrows=10000,
)

# Reduce memory usage from 144.4+ to 56.8+ MB
int8_cols = ["day_of_month", "day_of_week", "month", "hour", "step"]
bool_cols = ["isMoneyLaundering", "isFraud"]
category_cols = ["type", "category", "nameOrig", "nameDest", "laundering_typology"]

# A cast to int8 wraps out-of-range values silently, so verify the range first.
int8_limits = np.iinfo(np.int8)
for col in int8_cols:
    if not df[col].between(int8_limits.min, int8_limits.max).all():
        raise ValueError(
            f"column {col!r} has values outside the int8 range; use a wider dtype"
        )

# One astype call with an explicit column -> dtype mapping.
df = df.astype({
    **dict.fromkeys(int8_cols, "int8"),
    **dict.fromkeys(bool_cols, bool),
    **dict.fromkeys(category_cols, "category"),
})

df.head()

Unnamed: 0,step,type,amount,category,nameOrig,nameDest,oldbalanceOrg,newbalanceOrig,isFraud,isMoneyLaundering,laundering_typology,metadata,fraud_probability,hour,day_of_week,day_of_month,month
0,0,DEBIT,298.842041,Other,C8083,C7053,455489.321571,455190.479531,False,False,normal,"{'timestamp': datetime.datetime(2025, 2, 4, 12...",,12,1,4,2
1,0,DEBIT,93.087916,Recreation,C5575,C1117,229508.291214,229415.203298,False,False,normal,"{'timestamp': datetime.datetime(2025, 2, 4, 12...",,12,1,4,2
2,0,EFTPOS,155.644864,Healthcare,C1549,C1423,202568.806856,202413.161992,False,False,normal,"{'timestamp': datetime.datetime(2025, 2, 4, 12...",,12,1,4,2
3,0,BPAY,299.759073,Food,C7435,C6390,491560.600203,491260.841131,False,False,normal,"{'timestamp': datetime.datetime(2025, 2, 4, 12...",,12,1,4,2
4,0,DEBIT,173.715615,Other,C8083,C5946,455190.479531,455016.763916,False,False,normal,"{'timestamp': datetime.datetime(2025, 2, 4, 12...",,12,1,4,2


# Basic structure checks

In [53]:
# Number of missing values in each column
df.isna().sum(axis=0)

step                       0
type                       0
amount                     0
category                   0
nameOrig                   0
nameDest                   0
oldbalanceOrg              0
newbalanceOrig             0
isFraud                    0
isMoneyLaundering          0
laundering_typology        0
metadata                   0
fraud_probability      10000
hour                       0
day_of_week                0
day_of_month               0
month                      0
dtype: int64

## Parse the `metadata` column (stringified Python dicts)

In [57]:
def normalize_python_json_string(s):
    """Replace every ``datetime.datetime(...)`` call in ``s`` with a quoted ISO string.

    Each match's comma-separated arguments are parsed as integers, used to
    build a ``datetime``, and substituted by its ``isoformat()`` wrapped in
    single quotes. Text outside the matches is returned unchanged.

    Raises:
        ValueError: if an argument inside the parentheses is not an integer
            (raised by ``int``), or the integers do not form a valid datetime.
    """
    def to_iso_literal(found):
        # "2025, 2, 4, 12" -> [2025, 2, 4, 12]
        parts = [int(piece.strip()) for piece in found.group(1).split(',')]
        return "'" + datetime(*parts).isoformat() + "'"

    return re.sub(r"datetime\.datetime\((.*?)\)", to_iso_literal, s)

def parse_row(s):
    """Parse one raw ``metadata`` cell into a Python object.

    Args:
        s: the cell value. A string is normalized with
            ``normalize_python_json_string`` and evaluated with
            ``ast.literal_eval`` (literals only, no code execution).
            A ``dict`` is treated as already parsed.

    Returns:
        The parsed object for string input, or ``s`` itself when it is a dict.
    """
    if isinstance(s, dict):
        # Already parsed: lets the cell that overwrites df["metadata"] be re-run
        # without failing inside re.sub on a non-string value.
        return s
    cleaned = normalize_python_json_string(s)
    return ast.literal_eval(cleaned)

In [133]:
def _expand_dict_column(column):
    """Turn a Series of dicts into a DataFrame with one column per key.

    Non-dict cells (None / NaN) become all-NaN rows. Building the frame once
    from a list of records avoids constructing one pd.Series per row, and
    avoids the spurious ``0`` column that ``apply(pd.Series)`` creates for
    NaN cells.
    """
    records = [value if isinstance(value, dict) else {} for value in column]
    return pd.DataFrame(records, index=column.index)


# Parse the stringified dicts, then flatten the top-level metadata keys.
df["metadata"] = df["metadata"].apply(parse_row)
df_meta = _expand_dict_column(df["metadata"])

# Flatten each nested dict; rows without merchant info get "Unknown".
loc = _expand_dict_column(df_meta["location"])
merch = _expand_dict_column(df_meta["merchant_info"]).fillna("Unknown")
dev = _expand_dict_column(df_meta["device_info"])
risk = _expand_dict_column(df_meta["risk_indicators"])

# Prefix the nested fields so their names cannot clash with existing columns.
df = pd.concat(
    [
        df,
        dev.add_prefix("device_"),
        loc.add_prefix("loc_"),
        merch.add_prefix("merch_"),
        risk.add_prefix("risk_"),
        df_meta,
    ],
    axis=1,
)

## Clean the dataset and handle missing values

In [117]:
total = len(df)
na_counts = df.isna().sum()

# Report only the columns that actually contain missing values.
for col, missing in na_counts[na_counts > 0].items():
    pct = missing / total * 100
    print(f"{col}: {pct:.2f}% пропусков")

fraud_probability: 100.00% пропусков
merchant_info: 73.52% пропусков
integration_info: 99.99% пропусков
sophistication: 99.99% пропусков
structuring: 99.98% пропусков
layering: 99.97% пропусков
layering_sophistication: 99.97% пропусков


In [None]:
# Columns with the highest share of missing values (see the report above)
mostly_missing_cols = [
    'fraud_probability',
    'integration_info',
    'sophistication',
    'structuring',
    'layering',
    'layering_sophistication',
    'merchant_info',
]
# The raw metadata column and its un-flattened top-level fields
raw_metadata_cols = ['metadata', 'timestamp', 'location', 'device_info', 'risk_indicators']

df = df.drop(columns=mostly_missing_cols).drop(columns=raw_metadata_cols)

# Encoding

I use two encoding techniques: one-hot encoding for low-cardinality features (at most 10 unique values) and a custom frequency encoder for high-cardinality features.

In [209]:
# Columns with at most this many distinct values are one-hot encoded.
MAX_LOW_CARDINALITY = 10

low_card_cols = []
high_card_cols = []
for i in df.select_dtypes(include=['object', 'category']).columns:
    try:
        n_unique = df[i].nunique()
    except TypeError as exc:
        # nunique() hashes the values, so a column holding unhashable objects
        # (e.g. dicts) cannot be counted. Skip it, but say so instead of
        # silently swallowing every possible error.
        print(f'category {i} skipped: {exc}')
        continue

    if n_unique <= MAX_LOW_CARDINALITY:
        low_card_cols.append(i)
    else:
        high_card_cols.append(i)
    print(f'category {i} has {n_unique} categories')

category type has 8 categories
category category has 10 categories
category nameOrig has 3663 categories
category nameDest has 5668 categories
category laundering_typology has 4 categories
category device_type has 3 categories
category device_os has 4 categories
category device_ip_address has 10000 categories
category loc_city has 6 categories
category loc_state has 5 categories
category loc_country has 1 categories
category loc_postcode has 4022 categories
category merch_merchant_id has 926 categories
category merch_category has 5 categories
category merch_risk_level has 3 categories
category merch_avg_transaction has 5 categories
category risk_category_risk has 3 categories
category payment_method has 3 categories


In [210]:
# Show both column groups: (low-cardinality, high-cardinality)
(low_card_cols, high_card_cols)

(['type',
  'category',
  'laundering_typology',
  'device_type',
  'device_os',
  'loc_city',
  'loc_state',
  'loc_country',
  'merch_category',
  'merch_risk_level',
  'merch_avg_transaction',
  'risk_category_risk',
  'payment_method'],
 ['nameOrig',
  'nameDest',
  'device_ip_address',
  'loc_postcode',
  'merch_merchant_id'])

In [221]:
# Give every categorical feature a plain string representation.
df = df.astype(dict.fromkeys(low_card_cols + high_card_cols, str))

In [222]:
target_cols = ["isFraud", "isMoneyLaundering"]

# NOTE(review): only these exact dtypes are picked up; columns of any other
# numeric dtype (e.g. int64) are not selected here — confirm this is intended.
numeric_dtypes = ["int8", "float64", "bool"]
numeric_frame = df.select_dtypes(include=numeric_dtypes)

# Numeric feature names without the targets (Index.difference also sorts them).
num_cols = list(numeric_frame.columns.difference(target_cols))

In [223]:
class FrequencyEncoder(BaseEstimator, TransformerMixin):
    """Encode each categorical column by the relative frequency of its values.

    ``fit`` learns, per column, the share of rows taken by every value.
    ``transform`` replaces each value with that share. Values not seen during
    ``fit`` (and missing values, which ``value_counts`` does not count) are
    encoded as 0.0.

    Attributes:
        freq_maps_: dict mapping column name -> Series of value frequencies.
            It is created by ``fit`` only, following the scikit-learn
            convention for fitted attributes.
    """

    def fit(self, X, y=None):
        """Learn value frequencies; X is a DataFrame of categorical columns."""
        X = pd.DataFrame(X)
        # astype(object) keeps the lookup independent of the `category` dtype.
        self.freq_maps_ = {
            col: X[col].astype(object).value_counts(normalize=True)
            for col in X.columns
        }
        return self

    def transform(self, X):
        """Return a float array of shape (n_rows, n_columns) of frequencies."""
        check_is_fitted(self, "freq_maps_")
        X = pd.DataFrame(X)
        encoded = {
            # Mapping a categorical Series can return a categorical result, for
            # which fillna(0.0) is rejected; object dtype avoids that.
            # Unseen categories -> 0.
            col: X[col].astype(object).map(self.freq_maps_[col]).fillna(0.0)
            for col in X.columns
        }
        return pd.DataFrame(encoded, index=X.index).to_numpy(dtype=float)

    def get_feature_names_out(self, input_features=None):
        """Return output names: one per input column, unchanged."""
        check_is_fitted(self, "freq_maps_")
        if input_features is None:
            input_features = list(self.freq_maps_)
        return np.asarray(input_features, dtype=object)


In [224]:
# For nums: fill gaps with the median, then standardize
num_pipe = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler()),
])

# For low-card: One Hot Encoder
cat_low_pipe = OneHotEncoder(handle_unknown="ignore", sparse_output=False)

# For high-card: frequency encoding
cat_high_pipe = FrequencyEncoder()

# NOTE(review): laundering_typology looks label-derived (it is 'normal' on the
# non-laundering rows shown by df.head()), so encoding it as a feature would
# leak isMoneyLaundering into X. Excluded here — confirm against the dataset docs.
leaky_cols = {"laundering_typology"}
low_card_features = [col for col in low_card_cols if col not in leaky_cols]
high_card_features = [col for col in high_card_cols if col not in leaky_cols]

preprocessor = ColumnTransformer(
    transformers=[
        ("num", num_pipe, num_cols),
        ("cat_low", cat_low_pipe, low_card_features),
        ("cat_high", cat_high_pipe, high_card_features),
    ],
    # Columns not listed above are dropped.
    remainder="drop",
)


In [225]:
# Select the targets through target_cols rather than repeating the literal list.
y = df[target_cols]

In [226]:
y = df[target_cols]
X = df.drop(columns=target_cols)

# Hold out 20% of the rows, keeping the isMoneyLaundering class ratio in both parts.
split_options = {
    "test_size": 0.2,
    "stratify": y["isMoneyLaundering"],  # isFraud
    "random_state": 42,
}
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

# Fit the preprocessor on the training part only, then apply it to the test part.
X_train_enc = preprocessor.fit_transform(X_train)
X_test_enc = preprocessor.transform(X_test)