In [None]:
import json
import os
import time

import pandas as pd
from snapml import GraphFeaturePreprocessor

In [None]:
# Load the shared notebook configuration. JSON text is UTF-8, so decode it
# explicitly instead of relying on the platform's locale encoding.
with open("../config.json", encoding="utf-8") as f:
    config = json.load(f)

# Dataset name taken from the config; it parameterizes both paths below.
DATASET = config["dataset"]
# Input: preprocessed transactions read by this notebook.
PREPROCESSED_DATA = f"../data/01-ibm-transactions-for-aml/preprocessed/{DATASET}-transactions"
# Output: location where the graph-enriched features are written.
WRITE_LOCATION = f"../data/01-ibm-transactions-for-aml/feature_engineering/{DATASET}-enriched"

In [None]:
# Read only the columns this notebook uses before `data` is deleted again;
# skipping the remaining columns lowers peak memory on large transaction sets.
data = pd.read_parquet(
    PREPROCESSED_DATA,
    columns=["transaction_id", "source", "target", "timestamp", "amount"],
)

In [None]:
# Parse first, then sort, so the ordering is chronological even if the column
# arrives as strings. utc=True converts tz-aware values to UTC and treats naive
# values as UTC, which matches reading the raw epoch value of the column.
data["timestamp"] = pd.to_datetime(data["timestamp"], utc=True)
data = data.sort_values(by="timestamp", ascending=True)

# Whole seconds since the Unix epoch. Timedelta arithmetic is independent of the
# column's datetime resolution (ns/us/ms) and of the platform's default integer
# width, unlike casting the raw values to int and dividing by 10**9.
unix_epoch = pd.Timestamp("1970-01-01", tz="UTC")
data["timestamp"] = (data["timestamp"] - unix_epoch) // pd.Timedelta(seconds=1)

# Shift the time axis so that the earliest transaction happens at t = 0.
min_timestamp = data["timestamp"].min()
data["timestamp"] = data["timestamp"] - min_timestamp

In [None]:
# Fields carried forward into graph feature extraction, taken as an independent
# copy so the full frame can be released below.
raw_columns = ["transaction_id", "source", "target", "timestamp", "amount"]
data_to_preprocess = data.loc[:, raw_columns].copy()

# Original transaction ids in row order; they are written back onto the
# enriched frame once the features have been computed.
data_id = data["transaction_id"].values

# Release the memory held by the full frame.
del data

In [None]:
# Time-window lengths, expressed in seconds.
SECONDS_PER_HOUR = 3600
ONE_DAY = 24 * SECONDS_PER_HOUR
SIX_HOURS = 6 * SECONDS_PER_HOUR

# Configuration handed to the graph feature preprocessor.
params = {
    "num_threads": 12,                       # number of software threads (important for performance)
    "time_window": ONE_DAY,                  # time window used if no pattern was specified (seconds)

    # vertex statistics
    "vertex_stats": True,
    "vertex_stats_tw": ONE_DAY,
    # Indices of the input columns the statistics are computed on. With the
    # matrix built in this notebook (id, source, target, timestamp, amount)
    # these are the timestamp and amount columns.
    "vertex_stats_cols": [3, 4],
    # features: 0:fan,1:deg,2:ratio,3:avg,4:sum,5:min,6:max,7:median,8:var,9:skew,10:kurtosis
    "vertex_stats_feats": list(range(11)),

    # fan in/out parameters
    "fan": True,
    "fan_tw": ONE_DAY,
    "fan_bins": list(range(2, 18)),

    # in/out degree parameters
    "degree": True,
    "degree_tw": ONE_DAY,
    "degree_bins": list(range(1, 17)),

    # scatter gather parameters
    "scatter-gather": True,
    "scatter-gather_tw": SIX_HOURS,
    "scatter-gather_bins": list(range(2, 18)),

    # temporal cycle parameters
    "temp-cycle": True,
    "temp-cycle_tw": ONE_DAY,
    "temp-cycle_bins": list(range(2, 14)),

    # length-constrained simple cycle parameters
    "lc-cycle": True,
    "lc-cycle_tw": ONE_DAY,
    "lc-cycle_len": 5,
    "lc-cycle_bins": list(range(2, 18)),
}

In [None]:
# Create the preprocessor and apply the configuration defined above.
gp = GraphFeaturePreprocessor()
gp.set_params(params)

# Echo the parameters as reported back by the preprocessor itself.
print(
    "Graph feature preprocessor parameters: ",
    json.dumps(gp.get_params(), indent=4),
)

In [None]:
# Relabel account identifiers as consecutive integers starting at 1.
# Ids are handed out in order of first appearance: every distinct source first,
# then every target that never occurs as a source.
node_labels = pd.concat(
    [data_to_preprocess["source"], data_to_preprocess["target"]],
    ignore_index=True,
).unique()
src_dst_map = {label: node_id for node_id, label in enumerate(node_labels, start=1)}

# Plain column assignment replaces the column, so it takes the integer dtype of
# the mapped ids. `.loc[:, col] = ...` writes into the existing column instead
# and can leave the ids stored with the old (e.g. object) dtype.
data_to_preprocess["source"] = data_to_preprocess["source"].map(src_dst_map)
data_to_preprocess["target"] = data_to_preprocess["target"].map(src_dst_map)

In [None]:
# Give every row a positional id (0 .. n-1) and use it as the first column of
# the matrix in place of the original transaction id.
row_count = len(data_to_preprocess)
data_to_preprocess["id"] = range(row_count)

# Column order of the input matrix: id, source, target, timestamp, amount.
X = data_to_preprocess[["id", "source", "target", "timestamp", "amount"]]

In [None]:
# Report the size of the raw input, then hand the preprocessor a NumPy array.
raw_shape = X.shape
print("Enriching the transactions with new graph features ")
print("Raw dataset shape: ", raw_shape)

X = X.to_numpy()

In [None]:
# Measure elapsed time with perf_counter: it is monotonic and high-resolution,
# whereas the wall clock can jump when the system time is adjusted.
start_time = time.perf_counter()

# The input matrix is cast to float64 before it is handed to the preprocessor.
X_train_enriched = gp.fit_transform(X.astype('float64'))

end_time = time.perf_counter()
runtime = end_time - start_time

print(f"Runtime: {runtime:.6f} seconds")
print("\n")
print("Enriched dataset shape: ", X_train_enriched.shape)

In [None]:
def print_enriched_transaction(transaction, params):
    """Wrap an enriched feature matrix in a DataFrame with descriptive column names.

    Names are generated in this order: the five raw transaction fields, the
    per-bin columns of every enabled graph pattern (fan, degree, scatter-gather,
    temp-cycle, lc-cycle), then the vertex-statistics columns.
    NOTE(review): this order is assumed to mirror the column layout produced by
    the graph feature preprocessor -- confirm against the snapml documentation.

    Parameters
    ----------
    transaction : array-like, shape (n_rows, n_columns)
        Enriched transactions, one row per transaction.
    params : dict
        Preprocessor parameters. For each enabled pattern ``<pattern>`` the key
        ``<pattern>_bins`` is read; ``vertex_stats_feats`` and
        ``vertex_stats_cols`` are read unless ``vertex_stats`` is falsy.

    Returns
    -------
    pandas.DataFrame
        ``transaction`` labelled with the generated column names.

    Raises
    ------
    ValueError
        Raised by pandas when the number of generated names does not match the
        number of columns in ``transaction``.
    """
    # Raw feature names.
    colnames = ["transaction_id", "source", "target", "timestamp", "amount"]

    # Feature names for the graph patterns.
    for pattern in ("fan", "degree", "scatter-gather", "temp-cycle", "lc-cycle"):
        if not params.get(pattern):
            continue  # pattern missing from params or switched off

        bin_edges = list(params[pattern + "_bins"])
        if not bin_edges:
            continue  # no bins configured: nothing to name

        # fan and degree have separate incoming and outgoing bin columns; the
        # other patterns have a single set of bins.
        if pattern in ("fan", "degree"):
            prefixes = (pattern + "_in_bins_", pattern + "_out_bins_")
        else:
            prefixes = (pattern + "_bins_",)

        for prefix in prefixes:
            for low, high in zip(bin_edges, bin_edges[1:]):
                colnames.append(prefix + str(low) + "-" + str(high))
            # The open-ended last bin is named from the last edge directly, so a
            # single-element bin list works and no loop index is reused.
            colnames.append(prefix + str(bin_edges[-1]) + "-inf")

    vert_feat_names = ["fan", "deg", "ratio", "avg", "sum", "min", "max",
                       "median", "var", "skew", "kurtosis"]

    # Feature names for the vertex statistics. They are skipped when the
    # statistics are disabled; a missing key keeps them, as before.
    if params.get("vertex_stats", True):
        enabled_feats = params["vertex_stats_feats"]
        for orig in ("source", "dest"):
            for direction in ("out", "in"):
                # fan, deg and ratio: one column each
                for k in (0, 1, 2):
                    if k in enabled_feats:
                        colnames.append(orig + "_" + vert_feat_names[k] + "_" + direction)
                # avg, sum, min, max, median, var, skew and kurtosis: one
                # column per selected input column
                for col in params["vertex_stats_cols"]:
                    for k in range(3, 11):
                        if k in enabled_feats:
                            colnames.append(
                                orig + "_" + vert_feat_names[k] + "_col" + str(col) + "_" + direction
                            )

    return pd.DataFrame(transaction, columns=colnames)

In [None]:
# Label the enriched matrix using the parameters the preprocessor reports.
effective_params = gp.get_params()
enriched_dataset = print_enriched_transaction(X_train_enriched, effective_params)

# Put the original transaction ids back in place of the positional ids that
# were fed to the preprocessor.
enriched_dataset['transaction_id'] = data_id

In [None]:
# Keep every column whose number of distinct (non-null) values is not exactly one.
distinct_counts = enriched_dataset.nunique()
enriched_dataset = enriched_dataset.loc[:, distinct_counts != 1]

remaining_columns = enriched_dataset.columns.tolist()
print(json.dumps(remaining_columns, indent=2))

In [None]:
# Remove the raw transaction fields; transaction_id and the generated features stay.
raw_fields = ['source', 'target', 'timestamp', 'amount']
enriched_dataset = enriched_dataset.drop(raw_fields, axis=1)
print("Total columns", enriched_dataset.shape[1])

In [None]:
# Create the output directory if needed so the write does not fail on a fresh
# checkout where the directory does not exist yet.
output_dir = os.path.dirname(WRITE_LOCATION)
if output_dir:
    os.makedirs(output_dir, exist_ok=True)

enriched_dataset.to_parquet(WRITE_LOCATION)