## Imports

In [1]:
import os
import pandas as pd

## Global variables

In [None]:
# Name of the dataset variant; interpolated into the input and output paths used below.
DATASET = 'small'

## Functions

In [2]:
def convert_values_to_integers(df, column_name):
    """Replace the values of one column with integer codes.

    Each distinct value in ``df[column_name]`` receives a code starting at 0,
    numbered in the order returned by ``Series.unique()``.

    Args:
        df: DataFrame holding the column to encode. It is not modified.
        column_name: Name of the column to encode.

    Returns:
        A tuple ``(new_df, conversion_map)``: ``new_df`` is a copy of ``df``
        with the column replaced by its codes, and ``conversion_map`` is the
        dict from original value to code.

    Raises:
        ValueError: If ``column_name`` is not a column of ``df``.
    """
    if column_name not in df.columns:
        raise ValueError(f"Column '{column_name}' not found in the DataFrame")

    # Pair every distinct value with its position in the unique-value array.
    distinct = df[column_name].unique()
    conversion_map = dict(zip(distinct, range(len(distinct))))

    # Work on a copy so the caller's frame keeps its original values.
    encoded = df.copy()
    encoded[column_name] = df[column_name].map(conversion_map)
    return encoded, conversion_map

def convert_values_to_integers_using_map(df, column_name, conversion_map):
    """Encode a column with an existing value-to-integer map, extending it as needed.

    Values of ``df[column_name]`` that are not yet keys of ``conversion_map``
    are added to the map with fresh integer codes, in the order returned by
    ``Series.unique()``. Fresh codes start above every code already present,
    so a map with gaps (e.g. ``{'a': 0, 'b': 2}``) never ends up with two
    values sharing one code.

    Args:
        df: DataFrame holding the column to encode. It is not modified.
        column_name: Name of the column to encode.
        conversion_map: Dict from value to integer code. It is updated in
            place with any new values and is also returned.

    Returns:
        A tuple ``(new_df, conversion_map)``: ``new_df`` is a copy of ``df``
        with the column replaced by its codes, and ``conversion_map`` is the
        same dict object that was passed in, now including the new values.

    Raises:
        ValueError: If ``column_name`` is not a column of ``df``.
    """
    # Check if the column exists in the DataFrame
    if column_name not in df.columns:
        raise ValueError(f"Column '{column_name}' not found in the DataFrame")

    # Collect the values that the map does not know yet, keeping their order
    unique_values = df[column_name].unique()
    new_unique_values = [value for value in unique_values if value not in conversion_map]

    # For a dense 0..n-1 map this equals len(conversion_map), as before.
    # For a sparse map, starting at len() could reuse an existing code, so
    # also step past the largest code already assigned.
    next_code = max(len(conversion_map), max(conversion_map.values(), default=-1) + 1)
    conversion_map.update(
        {value: code for code, value in enumerate(new_unique_values, start=next_code)}
    )

    # Create a new DataFrame by replacing the values with their corresponding integers
    new_df = df.copy()
    new_df[column_name] = new_df[column_name].map(conversion_map)

    return new_df, conversion_map

## Load dataset

In [4]:
# Read tx_log.csv from the AMLsim outputs folder of the selected DATASET.
txs_df = pd.read_csv(
    f'../AMLsim/outputs/{DATASET}/tx_log.csv'
)
# Bare last expression so the notebook renders the frame.
txs_df

Unnamed: 0,step,type,amount,nameOrig,bankOrig,daysInBankOrig,phoneChangesOrig,oldbalanceOrig,newbalanceOrig,nameDest,bankDest,daysInBankDest,phoneChangesDest,oldbalanceDest,newbalanceDest,isSAR,alertID,modelType
0,1,TRANSFER,11.27,13,bank_b,2,0,77149.50,77138.22,7,bank_a,1,0,85182.13,85193.40,0,-1,1
1,1,TRANSFER,11.21,2,bank_a,2,0,81263.28,81252.07,0,bank_a,2,0,85512.67,85523.89,0,-1,2
2,1,TRANSFER,587.76,5,bank_a,1,0,66656.75,66068.99,8,bank_b,2,0,53149.21,53736.97,1,0,2
3,2,TRANSFER,16.33,1,bank_a,3,0,89252.39,89236.06,15,bank_b,3,0,99661.07,99677.40,0,-1,2
4,2,TRANSFER,11.63,4,bank_a,3,0,91403.15,91391.52,14,bank_b,2,0,54560.80,54572.43,0,-1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
120,94,TRANSFER,12.77,11,bank_b,95,3,89992.07,89979.28,9,bank_b,95,4,95734.45,95747.23,0,-1,1
121,95,TRANSFER,16.47,5,bank_a,96,21,65888.43,65871.96,1,bank_a,95,20,88515.83,88532.30,0,-1,1
122,96,TRANSFER,14.57,3,bank_a,97,20,79807.08,79792.50,8,bank_b,97,21,55029.12,55043.69,0,-1,1
123,99,TRANSFER,10.15,13,bank_b,100,4,76963.25,76953.09,7,bank_a,99,21,84897.07,84907.22,0,-1,1


## Save dataset

In [5]:
# Make sure datasets/<DATASET>/ exists (no error if it is already there),
# then write the transaction log into it as a Parquet file.
os.makedirs(
    f'datasets/{DATASET}',
    exist_ok=True,
)
txs_df.to_parquet(
    f'datasets/{DATASET}/tx_log.parquet'
)