In [1]:
import numpy as np
import datatable as dt
from datetime import datetime
from datatable import f, join, sort
import sys
import os

output_path = "aml_preprocessed_train_data.csv"

raw_data = dt.fread('HI-Small_Trans.csv', columns=dt.str32)

currency_dict = {}
payment_format_dict = {}
bank_account_dict = {}
account_dict = {}

def get_dict_value(name, collection):
    if name in collection:
        value = collection[name]
    else:
        value = len(collection)
        collection[name] = value
    return value

header = "EdgeID,from_id,to_id,Timestamp,\
Amount Sent,Sent Currency,Amount Received,Received Currency,\
Payment Format,Is Laundering\n"

first_timestamp = -1

with open(output_path, 'w') as writer:
    writer.write(header)
    for i in range(raw_data.nrows):
        datetime_object = datetime.strptime(raw_data[i, "Timestamp"], '%Y/%m/%d %H:%M')
        timestamp = datetime_object.timestamp()
        day = datetime_object.day
        month = datetime_object.month
        year = datetime_object.year

        if first_timestamp == -1:
            start_time = datetime(year, month, day)
            first_timestamp = start_time.timestamp() - 10

        timestamp = timestamp - first_timestamp

        cur1 = get_dict_value(raw_data[i, "Receiving Currency"], currency_dict)
        cur2 = get_dict_value(raw_data[i, "Payment Currency"], currency_dict)

        fmt = get_dict_value(raw_data[i, "Payment Format"], payment_format_dict)

        from_acc_id_str = raw_data[i, "From Bank"] + raw_data[i, 2]
        from_id = get_dict_value(from_acc_id_str, account_dict)

        to_acc_id_str = raw_data[i, "To Bank"] + raw_data[i, 4]
        to_id = get_dict_value(to_acc_id_str, account_dict)

        amount_received_orig = float(raw_data[i, "Amount Received"])
        amount_paid_orig = float(raw_data[i, "Amount Paid"])

        isl = int(raw_data[i, "Is Laundering"])

        line = '%d,%d,%d,%d,%f,%d,%f,%d,%d,%d\n' % \
               (i, from_id, to_id, timestamp, amount_paid_orig, cur2, amount_received_orig, cur1, fmt, isl)

        writer.write(line)

formatted_data = dt.fread(output_path)
formatted_data = formatted_data[:, :, sort(3)]

formatted_data.to_csv(output_path)



In [2]:
import networkx as nx
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.preprocessing import StandardScaler

# Step 1: Create a Directed Graph Network
# Assuming you have a pandas DataFrame 'transactions_df' with columns: 
# 'from_acc', 'to_acc', 'amount', 'currency', 'timestamp', and 'label'

transactions_df = pd.read_csv(output_path)

G = nx.DiGraph()

for index, row in transactions_df.iterrows():
    G.add_edge(row['from_id'], row['to_id'], timestamp = row['Timestamp'], amount_sent=row['Amount Sent'], 
               amount_Received=['Amount Received'], received_currency=row['Received Currency'], 
               payment_format=row['Payment Format'])

In [5]:
import warnings
warnings.filterwarnings("ignore", message="The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.", category=FutureWarning)

In [None]:
from tqdm import tqdm
# Step 2: Extract Graph Features
# Feature Extraction function
def extract_features(from_id, to_id):
    features = {}
    # Degree of 'from_acc' and 'to_acc'
    features['From Degree'] = G.degree[from_id]
    features['To Degree'] = G.degree[to_id]
    # Betweenness Centrality
    #features['From betweenness_centrality'] = nx.betweenness_centrality(G)[from_id]
    #features['To betweenness_centrality'] = nx.betweenness_centrality(G)[from_id]
    # Closeness Centrality
    #features['From closeness_centrality'] = nx.closeness_centrality(G)[from_id]
    #features['To closeness_centrality'] = nx.closeness_centrality(G)[to_id]
    # Eigenvector Centrality
    #features['From eigenvector_centrality'] = nx.eigenvector_centrality(G)[from_id]
    #features['To eigenvector_centrality'] = nx.eigenvector_centrality(G)[to_id]
    # PageRank
    #features['From pagerank'] = nx.pagerank(G)[from_id]
    #features['To pagerank'] = nx.pagerank(G)[to_id]
    # Clustering Coefficient
    features['From clustering_coefficient'] = nx.clustering(G, from_id)
    features['To clustering_coefficient'] = nx.clustering(G, from_id)
    # Eccentricity
    #features['From eccentricity'] = nx.eccentricity(G, v=from_id)
    #features['To eccentricity'] = nx.eccentricity(G, v=to_id)
    # Degree Centrality
    features['From degree_centrality'] = nx.degree_centrality(G)[from_id]
    features['To degree_centrality'] = nx.degree_centrality(G)[to_id]
    return features

# Create a DataFrame to store features for each transaction
transaction_features = pd.DataFrame()

# Iterate through each transaction and extract features
for index, row in tqdm(transactions_df.iterrows()):
    features = extract_features(row['from_id'], row['to_id'])
    transaction_features = transaction_features.append(features, ignore_index=True)

# Concatenate the original DataFrame with the extracted features DataFrame
transactions_df_with_features = pd.concat([transactions_df, transaction_features], axis=1)

438it [01:54,  3.80it/s]

In [None]:
# Extract features and labels
X = transactions_df_with_features.drop(columns=['from_id', 'to_id', 'Is Laundering'])
y = transactions_df_with_features['Is Laundering']

# Split data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, , stratify=y)

In [None]:
# Initialize and train a Random Forest Classifier
clf = RandomForestClassifier()
clf.fit(X_train, y_train)

# Step 5: Model Evaluation
# Predict on the test set
y_pred = clf.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)
print("Classification Report:")
print(classification_report(y_test, y_pred))

# Step 6: Iterate and Improve
# Iterate on feature engineering, model architecture, and hyperparameters as needed
