In [None]:
import json
import numpy as np
import os
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.ensemble import IsolationForest
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score, roc_curve, roc_auc_score

In [None]:
# Read the JSON configuration that parameterises this notebook.
with open("../config.json") as config_file:
    config = json.load(config_file)

# Dataset name (used to build every path below) and the score quantile
# at which transactions are later split into normal / non-normal.
DATASET = config['dataset']
THRESHOLD = config['if_model']['threshold']

# Input locations: engineered features, known case membership, and the
# additional feature set stored under the "-enriched" directory.
DATASET_LOCATION = f"../data/01-ibm-transactions-for-aml/feature_engineering/{DATASET}-features"
CASES_LOCATION = f"../data/01-ibm-transactions-for-aml/preprocessed/{DATASET}-patterns/{DATASET}-cases.parquet"
GFP_FEATURES_LOCATION = f"../data/01-ibm-transactions-for-aml/feature_engineering/{DATASET}-enriched"

# Output path prefixes; the dataset name, threshold and ".csv" suffix are
# appended when the results are written at the end of the notebook.
NORMAL_OUTPUT = "../data/01-ibm-transactions-for-aml/filtered_output/normal"
NON_NORMAL_OUTPUT = "../data/01-ibm-transactions-for-aml/filtered_output/non_normal"

In [None]:
# Load the engineered transaction features and the table of known cases.
data = pd.read_parquet(DATASET_LOCATION)
cases = pd.read_parquet(CASES_LOCATION)

# Attach the case id to every transaction. The left join keeps transactions
# that belong to no case; those get the sentinel id -1.
# NOTE(review): if a transaction_id appears in more than one case row, this
# join duplicates that transaction - confirm `cases` is unique per transaction.
cases_columns = cases.loc[:, ['transaction_id', 'id']]
data = pd.merge(data, cases_columns, how='left', on='transaction_id')
data['id'] = data['id'].fillna(-1)

In [None]:
# Re-express each timestamp as whole seconds elapsed since the earliest
# transaction in the dataset.
parsed_timestamps = pd.to_datetime(data["timestamp"])

# Cast through datetime64[s] instead of dividing the raw integer view by 10**9:
# the raw view is only nanoseconds when the column has ns resolution, which
# pandas >= 2 no longer guarantees. The explicit np.int64 avoids the
# platform-dependent width of plain `int` (32-bit on some platforms).
data["timestamp"] = parsed_timestamps.values.astype("datetime64[s]").astype(np.int64)

min_timestamp = data["timestamp"].min()
data["timestamp"] = data["timestamp"] - min_timestamp

In [None]:
# Load the additional feature set stored at GFP_FEATURES_LOCATION; it is joined
# onto `data` by transaction_id in the next cell.
enriched_data = pd.read_parquet(GFP_FEATURES_LOCATION)

In [None]:
# Join the enriched features onto every transaction (rows without a match keep
# NaN in the new columns), then order the frame by relative timestamp.
# The original index labels are kept, so the index is no longer monotonic.
data = (
    pd.merge(data, enriched_data, how='left', on='transaction_id')
    .sort_values(by="timestamp")
)

In [None]:
# Identifier, raw-value and label columns that must not be fed to the model;
# everything else in `data` is treated as a model feature.
non_feature_columns = [
    'target', 'source', 'target_bank', 'source_bank', 'transaction_id',
    'timestamp', 'source_currency', 'target_currency', 'source_amount',
    'target_amount', 'format', 'is_laundering', 'amount', 'id',
]
X_train = data.drop(columns=non_feature_columns)

In [None]:
# Keep the identifiers and ground-truth labels in a separate frame that shares
# its index with X_train, so both can be re-joined column-wise later.
label_columns = ['transaction_id', 'id', 'is_laundering']
y_target = data.loc[:, label_columns].copy()

# Snapshots of the column sets of both frames.
X_train_columns = X_train.columns
y_target_columns = y_target.columns

In [None]:
# Store every feature as 32-bit float. Values beyond the float32 range become
# +/-inf here; infinities are zeroed by the clean-up cell further down.
X_train = X_train.astype("float32")

In [None]:
# Binary flag: 1 when the transaction carries a positive case id, else 0
# (rows without a case were given id -1 when the cases were merged).
# NOTE(review): a case id of exactly 0 would be flagged as non-case - confirm
# that case ids start at 1.
y_target['is_case'] = (y_target['id'] > 0).astype(int)

In [None]:
# Neutralise non-finite values: +/-inf first, then any remaining NaN, all to 0.
X_train = (
    X_train
    .replace([np.inf, -np.inf], 0)
    .fillna(0)
)

In [None]:
# Upper bound on how many features are kept after univariate selection.
N_SELECTED_FEATURES = 40

# Rank features by f_classif against the laundering label and keep the best k.
# SelectKBest raises a ValueError when k exceeds the number of columns, so k is
# capped for datasets that provide fewer than N_SELECTED_FEATURES features.
selector = SelectKBest(score_func=f_classif, k=min(N_SELECTED_FEATURES, X_train.shape[1]))
X_new = selector.fit_transform(X_train, y_target['is_laundering'])
selected_features = X_train.columns[selector.get_support()]

# Restrict the training frame to the selected columns. No scaling is applied;
# `X_scaled_training` is only an alias used by the cells below.
X_training = X_train[selected_features]
X_scaled_training = X_training

print("X_train.shape=", X_scaled_training.shape)
print(selected_features)

In [None]:
# Hyper-parameters of the isolation forest, gathered in one place.
isolation_forest_params = dict(
    n_estimators=200,     # number of trees in the ensemble
    max_samples=0.01,     # float: fraction of the rows drawn for each tree
    contamination=0.1,    # expected share of outliers; sets the decision offset
    max_features=0.8,     # float: fraction of the features drawn for each tree
    bootstrap=False,      # sample without replacement
    n_jobs=-1,            # use every available core
    random_state=42,      # fixed seed for reproducible trees
)
model = IsolationForest(**isolation_forest_params)

In [None]:
# Fit the isolation forest on the selected features. The model is unsupervised:
# no labels are passed here (they were only used for feature selection above).
model.fit(X_scaled_training)

In [None]:
# Score every transaction once. decision_function is negative for outliers and
# non-negative for inliers.
scores = model.decision_function(X_scaled_training)

# IsolationForest.predict recomputes decision_function internally and labels
# negative scores as -1 (outlier) and everything else as +1 (inlier). Deriving
# the labels from the scores already computed gives the same result without a
# second full pass over the data.
predictions = np.where(scores < 0, -1, 1)

In [None]:
# Re-wrap the training data as a DataFrame restricted to the selected feature
# columns (X_scaled_training is already a DataFrame with exactly these columns
# when the cells are run in order).
X_scaled_training = pd.DataFrame(X_scaled_training, columns=selected_features)

In [None]:
# Evaluation frame: selected features side by side with the identifiers and
# labels (both frames share the index of `data`), plus the model outputs.
# NOTE: the name `eval` shadows the Python builtin for the rest of the notebook.
eval = pd.concat([X_scaled_training, y_target], axis="columns")
eval["predictions"] = predictions
# Flip the sign so that a higher score means "more anomalous", shifted by 0.5.
eval["scores"] = 0.5 - scores

# print("Cases", eval[(eval['predictions'] == -1) & (eval['is_case'] == 1)].shape)
# print("Laundering", eval[(eval['predictions'] == -1) & (eval['is_laundering'] == 1)].shape)
# print("True Negatives", eval[(eval['predictions'] == 1) & (eval['is_laundering'] == 0)].shape)
# print("False Positives", eval[(eval['predictions'] == -1) & (eval['is_laundering'] == 0)].shape)

In [None]:
# TP = eval[(eval['predictions'] == -1) & (eval['is_laundering'] == 1)].shape[0]
# FN = eval[(eval['predictions'] == 1) & (eval['is_laundering'] == 1)].shape[0]
# FP = eval[(eval['predictions'] == -1) & (eval['is_laundering'] == 0)].shape[0]
# TN = eval[(eval['predictions'] == 1) & (eval['is_laundering'] == 0)].shape[0]

# print(f"True Positives: {TP}")
# print(f"False Negatives: {FN}")
# print(f"False Positives: {FP}")
# print(f"True Negatives: {TN}")

In [None]:
# Score cut-off: the THRESHOLD-quantile of the anomaly scores. Transactions at
# or above it are flagged as non-normal.
threshold = eval['scores'].quantile(THRESHOLD)
print(threshold)

# Vectorised flag (1 = flagged, 0 = normal) instead of a per-row Python loop.
new_scores = (eval['scores'] >= threshold).astype(int)
eval['new_scores'] = new_scores

# Confusion-matrix cells against the laundering label. Positives and negatives
# are both derived from `is_laundering`, so the four cells partition the rows;
# mixing in the case id for the negatives would count laundering transactions
# that belong to no case in two cells at once.
flagged = eval['new_scores'] == 1
is_positive = eval['is_laundering'] > 0

TP = int((flagged & is_positive).sum())
FN = int((~flagged & is_positive).sum())
FP = int((flagged & ~is_positive).sum())
TN = int((~flagged & ~is_positive).sum())

print(f"True Positives: {TP}")
print(f"False Negatives: {FN}")
print(f"False Positives: {FP}")
print(f"True Negatives: {TN}")

In [None]:
# Scatter of the anomaly score of every transaction, coloured by the
# ground-truth laundering flag. The x position is the DataFrame index label.
scatter_fig, scatter_ax = plt.subplots(figsize=(14, 8))

normal_transactions = eval[eval['is_laundering'] == 0]
laundering_transactions = eval[eval['is_laundering'] == 1]

# Draw the normal transactions first so the laundering points sit on top.
for subset, colour, legend_label in (
    (normal_transactions, 'CornflowerBlue', 'Normal Transactions'),
    (laundering_transactions, 'DarkOrange', 'Laundering Transactions'),
):
    scatter_ax.scatter(subset.index, subset['scores'], c=colour, s=20, label=legend_label, alpha=0.6)

scatter_ax.set_xlabel('Transactions')
scatter_ax.set_ylabel('Anomaly Score')
scatter_ax.legend()
plt.show()

In [None]:
# Score distributions of non-case vs. case transactions. The two groups differ
# greatly in size, so each histogram gets its own y-axis (twin axes).
fig, ax1 = plt.subplots(figsize=(10, 6))

# Split on case membership (is_case), not on the is_laundering flag.
normal = eval[eval['is_case'] == 0]
laundering = eval[eval['is_case'] == 1]

ax1.hist(normal['scores'], bins=300, alpha=0.7, label='Normal Transactions', color='cornflowerblue')
ax1.set_xlabel('Model Score')
ax1.set_ylabel('Frequency (Normal Transactions)', color='cornflowerblue')
ax1.tick_params(axis='y', labelcolor='cornflowerblue')

ax2 = ax1.twinx()
# The legend label names what is actually plotted (case transactions), matching
# the axis label and the title.
ax2.hist(laundering['scores'], bins=300, alpha=0.7, label='Case Transactions', color='goldenrod')
ax2.set_ylabel('Frequency (Case Transactions)', color='goldenrod')
ax2.tick_params(axis='y', labelcolor='goldenrod')

plt.title('Model scores of Normal and Case Transactions')
fig.tight_layout()
fig.legend()
plt.show()

In [None]:
# precision = precision_score(eval['is_case'], eval['new_scores'])
# recall = recall_score(eval['is_case'], eval['new_scores'])
# f1 = f1_score(eval['is_case'], eval['new_scores'])
# accuracy = accuracy_score(eval['is_case'], eval['new_scores'])

# print(f"Precision: {precision}")
# print(f"Recall: {recall}")
# print(f"F1: {f1}")
# print(f"Accuracy: {accuracy}")

In [None]:
# fpr, tpr, thresholds = roc_curve(eval['is_laundering'], eval['scores'])
# roc_auc = roc_auc_score(eval['is_laundering'], eval['scores'])

# plt.figure(figsize=(8, 6))
# plt.plot(fpr, tpr, color='blue', label=f'AUC = {roc_auc.round(2)}')
# plt.plot([0, 1], [0, 1], linestyle='--')
# plt.xlim([0.0, 1.0])
# plt.ylim([0.0, 1.05])
# plt.xlabel('FP Rate')
# plt.ylabel('TP Rate')
# plt.title('ROC Curve')
# plt.legend()
# plt.show()

In [None]:
# total_cases = data[data['id']>0]['id'].nunique()
# incomplete_cases = eval[(eval['new_scores'] == 0) & (eval['id']>0)]['id'].nunique()
# print(f"Anomalous identified cases: {total_cases-incomplete_cases}/{total_cases}")

In [None]:
# touched_cases = eval[(eval['new_scores'] == 1)&(eval['id']>0)]['id'].nunique()
# print(f"Cases touched by the model: {touched_cases}/{total_cases}")

In [None]:
# Write the transaction ids of both groups to CSV, one file per group, named
# after the dataset and the threshold quantile.
normal_path = f"{NORMAL_OUTPUT}_{DATASET}_{THRESHOLD}.csv"
non_normal_path = f"{NON_NORMAL_OUTPUT}_{DATASET}_{THRESHOLD}.csv"

# to_csv does not create missing directories; make sure the targets exist so a
# fresh checkout does not fail at the very last step of the notebook.
for output_path in (normal_path, non_normal_path):
    os.makedirs(os.path.dirname(output_path), exist_ok=True)

eval.query('new_scores == 0')['transaction_id'].to_csv(normal_path, index=False)
eval.query('new_scores == 1')['transaction_id'].to_csv(non_normal_path, index=False)