```
Copyright 2024 IBM Corporation

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

     http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
```

## Training a LightGBM model for AML using graph features

<div> <img src="img/train_pipeline.jpg" width="1000"> </div>

In [4]:
from snapml import GraphFeaturePreprocessor

import numpy as np
import pandas as pd
import time

import lightgbm as lgb
from sklearn.metrics import fbeta_score

from ipywidgets import IntProgress
from IPython.display import display

# For exporting to PMML
from lightgbm import LGBMClassifier

from sklearn2pmml import sklearn2pmml
from sklearn2pmml.decoration import CategoricalDomain, ContinuousDomain
from sklearn2pmml.pipeline import PMMLPipeline


## Input parameters

In [5]:
# Directory that holds the formatted input data and receives the exported models.
formatted_data_path = "./aml-demo-data/out_dir/"

# Path of the input transaction file.
transactions_path = f"{formatted_data_path}formatted_transactions.csv"

# Output paths for the trained PMML models (with and without graph-based features).
gf_model_out_path = f"{formatted_data_path}aml-hi-small_model.pmml"
basic_model_out_path = f"{formatted_data_path}aml-hi-small_basic_model.pmml"

# Number of transactions at the end of the data set that are held out for testing.
n_test = 863901

# Indices of the columns dropped before training:
# Transaction ID, Source Account ID, Target Account ID, Source Bank ID, Target Bank ID.
remove_cols = [0, 1, 2, 10, 11]

# Number of transactions handed to the graph feature preprocessor per call.
batch_size = 128


## Preparing data for training

In [None]:
# Load the complete transaction table; it is split into train and test sets
# in a later cell.
print("Loading transaction data")

# skiprows=1 skips the CSV header line. ndmin=2 keeps the result two-dimensional
# even when the file contains a single data row, so the column slicing below
# never fails on a 1-D array.
X_all = np.loadtxt(transactions_path, dtype=np.float64, delimiter=",", comments='#', skiprows=1, ndmin=2)

Y_all = X_all[:,-1] # Labels (last column)
X_all = X_all[:,:-1] # Drop labels

print("Data loaded successfully.")
print("Data shape is ", X_all.shape)

In [None]:
print("Creating a graph feature preprocessor ")
gp = GraphFeaturePreprocessor()

print("Setting the parameters of the graph feature preprocessor ")

# Length of the main time window, in days and in seconds.
tw_days = 1
tw_seconds = tw_days*24*3600

gf_params = {
    # Software threads used by the preprocessor
    'num_threads': 12,

    # Account (vertex) statistics and the input columns they are computed on
    'vertex_stats': True,
    'vertex_stats_cols': [3,6],

    # Graph-pattern-based features: which patterns are switched on
    'fan': False,
    'degree': False,
    'scatter-gather': True,
    'temp-cycle': True,
    'lc-cycle': True,
    'lc-cycle_len': 10,

    # Time windows in seconds; scatter-gather uses a shorter, six-hour window
    'time_window': tw_seconds,
    'vertex_stats_tw': tw_seconds,
    'scatter-gather_tw': 6*3600,
    'temp-cycle_tw': tw_seconds,
    'lc-cycle_tw': tw_seconds,
}
gp.set_params(gf_params)

### Function for determining the number of graph-based features produced by Graph Feature Preprocessor

In [8]:
def get_num_gf_feats(gf):
    """Count the graph-based feature columns implied by a preprocessor's parameters.

    Args:
        gf: Object exposing ``get_params()`` that returns the parameter dict of
            a Graph Feature Preprocessor.

    Returns:
        feat_num (int): Number of graph-based features, i.e. the columns added
            on top of the raw input columns.

    Raises:
        KeyError: If an enabled pattern has no ``<pattern>_bins`` entry, or if
            vertex statistics are enabled but ``vertex_stats_feats`` or
            ``vertex_stats_cols`` is missing.
    """
    params = gf.get_params()
    feat_num = 0

    # Graph-pattern features: one column per bin, doubled for 'fan' and 'degree'.
    for pattern in ('fan', 'degree', 'scatter-gather', 'temp-cycle', 'lc-cycle'):
        if params.get(pattern):
            bins = len(params[pattern + '_bins'])
            feat_num += 2 * bins if pattern in ('fan', 'degree') else bins

    # Vertex statistics only contribute columns when they are enabled. A missing
    # 'vertex_stats' key is treated as enabled, which matches the count this
    # function produced before the flag was checked.
    if params.get("vertex_stats", True):
        enabled_stats = set(params["vertex_stats_feats"])
        num_stat_cols = len(params["vertex_stats_cols"])

        # NOTE(review): the factor 4 presumably stands for
        # {source, destination} x {incoming, outgoing} -- confirm against the
        # snapml documentation.

        # fan, deg, and ratio features (ids 0-2): independent of the input columns
        feat_num += 4 * len(enabled_stats & {0, 1, 2})

        # avg, sum, min, max, median, var, skew, and kurtosis features (ids 3-10):
        # computed once per configured input column
        feat_num += 4 * num_stat_cols * len(enabled_stats & set(range(3, 11)))

    return feat_num

### Generate graph-based features using Graph Feature Preprocessor

In [None]:
print("Generating graph-based features", flush=True)

# batch_size is taken from the "Input parameters" cell. It is intentionally not
# reassigned here, so the configured value is the one that is actually used.
print("Batch size:", batch_size)

num_rows = np.shape(X_all)[0]
num_cols = np.shape(X_all)[1] + get_num_gf_feats(gp)

# One row per transaction, wide enough for the input columns plus the
# graph-based features returned by the preprocessor.
X_all_gf = np.zeros((num_rows, num_cols))

# Display the progress bar
pbar = IntProgress(min=0, max=num_rows, description='Processing:',)
display(pbar)

# Batches are processed in their original order.
for batch_start in range(0, num_rows, batch_size):
    # The final batch may be shorter than batch_size.
    batch_stop = min(batch_start + batch_size, num_rows)

    # Generate graph-based features for a batch of transactions
    batch_in = X_all[batch_start:batch_stop, :].astype('float64')
    batch_out = gp.transform(batch_in)
    X_all_gf[batch_start:batch_stop, :] = batch_out

    # Update after the batch is done so the bar reaches its maximum at the end.
    pbar.value = batch_stop

### Split the data into train and test sets

In [10]:
# Remove columns that relate to transaction and account IDs
X_all_gf_rm = np.delete(X_all_gf, remove_cols, 1)
X_all_rm = np.delete(X_all, remove_cols, 1)

## Split to train and test set
# The last n_test rows form the test set. An explicit split index is used
# because negative slicing breaks for n_test == 0 ([:-0] is empty, [-0:] is
# the whole array) and silently yields an empty train set when n_test is too large.
num_samples = Y_all.shape[0]
if not 0 < n_test < num_samples:
    raise ValueError(
        f"n_test must be between 1 and {num_samples - 1}, got {n_test}"
    )
split_idx = num_samples - n_test

# Graph-based features
X_train_gf = X_all_gf_rm[:split_idx]
X_test_gf = X_all_gf_rm[split_idx:]

# Basic features only
X_train_basic = X_all_rm[:split_idx]
X_test_basic = X_all_rm[split_idx:]

# Labels
y_train = Y_all[:split_idx]
y_test = Y_all[split_idx:]

## LGBM Model Training

### Function for training LightGBM model

In [14]:
def lgbm_train_evaluate(X_train, y_train, X_test, y_test, params, model_path, gen_pmml=False):
    """Train a LightGBM binary classifier and score it on the test set.

    Args:
        X_train (np.ndarray): Training feature matrix
        y_train (np.ndarray): Training labels
        X_test (np.ndarray): Test feature matrix
        y_test (np.ndarray): Test labels
        params (dict): Model configuration. Must contain 'num_round' (number of
            boosting rounds). 'alpha' and 'gamma' entries, if present, are
            removed before the parameters are passed to LightGBM. The dict
            passed in is not modified.
        model_path (string): Where to save the PMML model. Only used when
            gen_pmml is True.
        gen_pmml (bool): If True, train an LGBMClassifier inside a PMMLPipeline
            and export it to model_path. If False, train with lgb.train using
            early stopping; no model file is written.

    Returns:
        score (float): F1 score (fbeta_score with beta=1) of the test predictions

    Raises:
        KeyError: If params has no 'num_round' entry.
    """
    lgb_params = params.copy()
    num_round = lgb_params.pop('num_round')
    lgb_params['objective'] = 'binary'

    # These entries are removed before the parameters reach LightGBM. A missing
    # key is tolerated so configurations without them do not raise.
    for dropped_key in ('alpha', 'gamma'):
        lgb_params.pop(dropped_key, None)

    if gen_pmml:
        lgbm_mdl = LGBMClassifier(**lgb_params, num_iterations=num_round)

        pmml_pipeline = PMMLPipeline([("classifier", lgbm_mdl)])
        pmml_pipeline.fit(X_train, y_train)

        sklearn2pmml(pmml_pipeline, model_path, with_repr=True)
        print("PMML Model saved to path", model_path, flush=True)

        preds = pmml_pipeline.predict(X_test)
    else:
        early_stopping_rounds = 20

        dtrain = lgb.Dataset(X_train, y_train, weight=None)
        dtest = lgb.Dataset(X_test, y_test, weight=None)

        # NOTE(review): the test set is also the early-stopping validation set,
        # so the reported score may be optimistic; consider a separate
        # validation split.
        bst = lgb.train(lgb_params,
                        dtrain,
                        num_boost_round=num_round,
                        valid_sets=[dtest],
                        callbacks=[lgb.early_stopping(stopping_rounds=early_stopping_rounds),
                                   lgb.log_evaluation(50)])

        z_test = bst.predict(X_test)
        preds = np.round(z_test)  # 1: illicit, 0: licit

    score = fbeta_score(y_test, preds.astype(np.float64), beta=1, zero_division=0)

    return score


### Training the model with graph-based features

In [None]:
# Training parameters for the model that uses graph-based features. Values like
# these can be found with a hyperparameter tuning method such as Successive Halving.
params = {
    'boosting': 'gbdt',
    'metric': 'auc',
    'num_round': 185,
    'num_leaves': 21,
    'max_bin': 256,
    'learning_rate': 0.08995441299910924,
    'lambda_l1': 0.4902016501409548,
    'lambda_l2': 81.93169246795033,
    'scale_pos_weight': 4.495921090533586,
    'alpha': 0.8028096762102561,
    'gamma': 2.1902844884226473,
    'seed': 5935727,
    'max_depth': 10
}

# Visual separator printed around the training output.
separator = "----------------------------------------------------------------------------------"

print("Training using graph-based features.")
print(separator)
# gen_pmml=True: the trained model is exported to gf_model_out_path.
res_score = lgbm_train_evaluate(
    X_train_gf,
    y_train,
    X_test_gf,
    y_test,
    params=params,
    model_path=gf_model_out_path,
    gen_pmml=True,
)
print(separator)
print("Test minority-class F1 score is: ", res_score)

### Training the model with only basic features

In [None]:
# Training parameters for the baseline model that uses only the basic features.
params = {
    'boosting': 'gbdt',
    'metric': 'auc',
    'num_round': 1,
    'num_leaves': 10462,
    'max_bin': 256,
    'learning_rate': 0.022428612814196603,
    'lambda_l1': 0.26870745974673377,
    'lambda_l2': 0.030489255163862368,
    'scale_pos_weight': 3.5442726714074495,
    'alpha': 1.1564605530755112,
    'gamma': 2.13621316479638,
    'seed': 254784112,
}

# Visual separator printed around the training output.
separator = "----------------------------------------------------------------------------------"

print("Training using only basic features.")
print(separator)
# gen_pmml=False: the model is trained with lgb.train and not exported.
res_score = lgbm_train_evaluate(
    X_train_basic,
    y_train,
    X_test_basic,
    y_test,
    params=params,
    model_path=basic_model_out_path,
    gen_pmml=False,
)
print(separator)
print("Test minority-class F1 score is: ", res_score)