## Data Preprocessing

In [None]:
import pandas as pd
import igraph as ig

In [None]:
# Dataset variant to preprocess; every path below is derived from this name.
DATASET = "HI-Small"  # either "HI-Small" or "LI-Small"
# Preprocessed transactions, loaded with pd.read_parquet; the filtered result
# is later written next to it as DATA_PATH + ".parquet".
DATA_PATH = f"../datasets/synthetic/02_preprocessed/{DATASET}-transactions"
# Parquet file holding the pattern cases of the dataset.
CASES_PATH = f"../datasets/synthetic/02_preprocessed/{DATASET}-patterns/{DATASET}-cases.parquet"
# Destination CSV for the filtered pattern cases.
PATTERN_PATH = f"../datasets/synthetic/02_preprocessed/{DATASET}-patterns.csv"

In [None]:
# Load the inputs: all transactions plus the pattern cases.
transactions = pd.read_parquet(DATA_PATH)
cases = pd.read_parquet(CASES_PATH)

# Snapshot of the unfiltered cases, taken before any pattern is discarded.
original_pattern_ids = list(cases['id'].unique())
starting_patterns = cases['id'].nunique()
starting_transactions = cases.shape[0]

# Retain only patterns having > 1 transaction, then cut both endpoint ids
# down to their first 8 characters.
cases_filtered = (
    cases.groupby('id')
    .filter(lambda pattern_rows: len(pattern_rows) > 1)
    .assign(
        target=lambda frame: frame['target'].str.slice(0, 8),
        source=lambda frame: frame['source'].str.slice(0, 8),
    )
)

def is_connected_component(df):
    """Return True when the edges listed in *df* form one connected graph.

    Every row of *df* contributes an undirected edge between its ``source``
    and ``target`` values. The result is True only when the graph built from
    those edges has exactly one connected component.
    """
    edges = df[['source', 'target']].itertuples(index=False)
    graph = ig.Graph.TupleList(edges, directed=False)
    components = graph.connected_components(mode="weak")
    return len(components) == 1

# Evaluate each pattern once: does its set of transactions form a single
# connected component?
connectivity_by_pattern = {
    pattern_id: is_connected_component(pattern_rows)
    for pattern_id, pattern_rows in cases_filtered.groupby('id')
}

# Split the pattern ids by that verdict (groupby order is preserved).
connected_pattern_ids = [
    pattern_id
    for pattern_id, connected in connectivity_by_pattern.items()
    if connected
]
unconnected_pattern_ids = [
    pattern_id
    for pattern_id, connected in connectivity_by_pattern.items()
    if not connected
]

# Retain only the patterns which form connected components
cases_filtered = cases_filtered[cases_filtered['id'].isin(connected_pattern_ids)]

In [None]:
# Ids of transactions flagged as laundering, and ids of transactions that
# belong to one of the retained patterns.
laundering_mask = transactions['is_laundering'] == 1
laundering_ids = list(transactions.loc[laundering_mask, 'transaction_id'].values)
pattern_ids = list(cases_filtered['transaction_id'].values)

# Discard laundering transactions that are not part of any retained pattern.
ids_to_drop = set(laundering_ids) - set(pattern_ids)
keep_mask = ~transactions['transaction_id'].isin(ids_to_drop)
transactions = transactions[keep_mask]

In [None]:
# Utility: truncate the source and target ids to their first 8 characters
# (the same truncation is applied to the pattern cases).
for endpoint in ('target', 'source'):
    transactions.loc[:, endpoint] = transactions[endpoint].str.slice(0, 8)

In [None]:
def _endpoint_stats(frame, endpoint):
    """Count rows and sum ``is_laundering`` per distinct *endpoint* value."""
    grouped = frame.groupby(endpoint)
    stats = grouped.agg(
        count=(endpoint, 'size'),
        sum_is_laundering=('is_laundering', 'sum'),
    )
    return stats.reset_index()


df_source = _endpoint_stats(transactions, 'source')
df_target = _endpoint_stats(transactions, 'target')

# Retain the sources and targets which have at most 500 transactions each
t_src = 500
t_dst = 500
source_within_limit = df_source['count'] <= t_src
target_within_limit = df_target['count'] <= t_dst
valid_sources = list(df_source.loc[source_within_limit, 'source'].values)
valid_targets = list(df_target.loc[target_within_limit, 'target'].values)

In [None]:
# Keep a transaction only when its source is in valid_sources and its target
# is in valid_targets.
source_allowed = transactions['source'].isin(valid_sources)
target_allowed = transactions['target'].isin(valid_targets)
transactions_filtered = transactions[source_allowed & target_allowed]

In [None]:
# Sanity check: the number of laundering transactions that survived the
# filtering must equal the number of rows in the retained pattern cases.
laundering_rows = transactions_filtered[transactions_filtered['is_laundering'] == 1]
shape_laundering = len(laundering_rows)
shape_patterns = len(cases_filtered)
assert shape_laundering == shape_patterns

In [None]:
# Persist the results: filtered transactions as parquet, retained pattern
# cases as CSV without the index column.
filtered_transactions_path = DATA_PATH + ".parquet"
transactions_filtered.to_parquet(filtered_transactions_path)
cases_filtered.to_csv(PATTERN_PATH, index=False)