In [22]:
# This file combines all the different types of transactions into a single data frame.
# It then outputs a dictionary with customer IDs as keys and features as values.

# All the categorical columns are converted to one-hot encoding, and the
# transaction date and time are converted to an absolute (time-delta) positional encoding and cyclical positional encodings for minute of the hour, hour of the day and day of the week (the day-of-month encoding is currently commented out).

# The idea is to feed all of these transactions into a recurrent model such as
# an RNN or GRU to automatically learn features for each customer instead of hand-crafting them.

In [2]:
import pandas as pd
from datetime import time
from collections import Counter
import numpy as np
from tqdm import tqdm
import matplotlib.pyplot as plt
import pickle
from pathlib import Path
import os

In [24]:
# Load data
path  = Path('processed_synth_dataset')

# Fraction of rows to keep from each file; 1 keeps every row (in shuffled order).
frac = 1
# Fixed seed so the sampled/shuffled row order is the same on every run
# (an unseeded .sample() gives a different order each time the cell executes).
seed = 42


def _load_trx_csv(file_name):
    """Read one transaction CSV from `path` and return a seeded random sample of it.

    Parameters
    ----------
    file_name : str
        Name of the CSV file inside `path`.

    Returns
    -------
    pd.DataFrame
        `frac` of the file's rows, drawn with `random_state=seed`.
    """
    return pd.read_csv(path/file_name, engine="pyarrow").sample(frac=frac, random_state=seed)


wire = _load_trx_csv('wire_s.csv')
ach = _load_trx_csv('ach_s.csv')
cheque = _load_trx_csv('cheque_s.csv')
card = _load_trx_csv('card_s.csv')



In [25]:
# Map each transaction-type name to the DataFrame loaded for it
transactions = dict(wire=wire, ach=ach, cheque=cheque, card=card)

In [26]:
# # Drop all the transaction id columns since they are all unique and not useful
# transactions['wire'].drop(columns=['wire_id'], inplace=True)
# transactions['abm'].drop(columns=['abm_id'], inplace=True)
# transactions['cheque'].drop(columns=['cheque_id'], inplace=True)
# transactions['eft'].drop(columns=['eft_id'], inplace=True)
# transactions['emt'].drop(columns=['emt_id'], inplace=True)
# transactions['card'].drop(columns=['card_trxn_id'], inplace=True)

In [27]:
# Merge the separate date and time columns into one datetime column

def combine_date_and_time(df):
    """Replace 'transaction_date' and 'transaction_time' with 'transaction_datetime'.

    Both source columns are cast to strings, joined with a single space and
    parsed by pd.to_datetime. The frame is modified in place (the new column
    is appended, the two source columns are deleted) and the same object is
    returned.
    """
    date_text = df['transaction_date'].astype(str)
    time_text = df['transaction_time'].astype(str)
    df['transaction_datetime'] = pd.to_datetime(date_text + ' ' + time_text)
    # The raw columns are redundant once the combined timestamp exists.
    for obsolete_col in ('transaction_date', 'transaction_time'):
        del df[obsolete_col]
    return df

In [28]:
# Order rows chronologically
def sort_by_date_time(df):
    """Return a copy of `df` sorted ascending by 'transaction_datetime'.

    The input frame is not modified; the original index labels are kept.
    """
    return df.sort_values('transaction_datetime')

In [29]:
# Tag every row with its transaction type so the source is still known after the frames are concatenated
def trx_type_col(df:pd.DataFrame, trx_type:str):
    """Set the 'trx_type' column of `df` to `trx_type` on every row.

    The frame is modified in place (the column is created or overwritten)
    and the same object is returned.
    """
    type_column = 'trx_type'
    df[type_column] = trx_type
    return df

In [30]:
# Stack every DataFrame held in the dictionary into one frame
def combine_df(df_dict:dict):
    """Concatenate the values of `df_dict` row-wise.

    The dictionary keys are ignored; frames are stacked in the dictionary's
    iteration order and the result gets a fresh 0..n-1 index.
    """
    frames = list(df_dict.values())
    stacked = pd.concat(frames, ignore_index=True)
    return stacked

In [31]:
# For every transaction type: build the combined datetime column and tag the rows
# with their transaction type, then store the processed frame back in the dictionary.
# NOTE: sort_by_date_time is not applied here (the call was commented out in the original cell).
for trx_type in list(transactions):
    df = trx_type_col(combine_date_and_time(transactions[trx_type]), trx_type)
    transactions[trx_type] = df

In [32]:
# Stack the per-type frames (wire, ach, cheque, card) into one DataFrame
combined_df = combine_df(df_dict=transactions)

In [21]:
# NOTE(review): this builds a second concatenation of the same inputs; combined_df1 is
# not referenced anywhere else in the visible notebook — confirm whether this cell is still needed.
combined_df1 = combine_df(df_dict=transactions)

In [33]:
# Add an absolute positional encoding and cyclic (sin/cos) encodings derived from
# 'transaction_datetime', then drop the raw timestamp column.
earliest_time = combined_df['transaction_datetime'].min()
latest_time = combined_df['transaction_datetime'].max()

# Absolute position: seconds elapsed since the earliest transaction divided by the
# total time span covered by the data. Computed with the vectorised .dt accessor
# instead of a per-row Python lambda (which also recomputed the span for every row).
elapsed_seconds = (combined_df['transaction_datetime'] - earliest_time).dt.total_seconds()
span_seconds = (latest_time - earliest_time).total_seconds()
if span_seconds > 0:
    abs_pos_encoding = elapsed_seconds / span_seconds
else:
    # Degenerate span (all timestamps identical, or no rows): avoid dividing by zero.
    abs_pos_encoding = elapsed_seconds * 0.0
combined_df['abs_pos_encoding'] = abs_pos_encoding

minutes = combined_df['transaction_datetime'].dt.minute
hours = combined_df['transaction_datetime'].dt.hour
week_day = combined_df['transaction_datetime'].dt.dayofweek

# (column label, values, period) for each cyclic feature. Each value is mapped to
# an angle on the unit circle and stored as a sin/cos pair.
cyclic_features = (
    ('minutes', minutes, 60),
    ('hours', hours, 24),
    # ('days', combined_df['transaction_datetime'].dt.day, 31),  # day-of-month encoding currently disabled
    ('weekday', week_day, 7),
)
for label, values, period in cyclic_features:
    angle = 2*np.pi*values/period
    combined_df[f'cyc_enc_{label}_sin'] = np.sin(angle)
    combined_df[f'cyc_enc_{label}_cos'] = np.cos(angle)

# The raw timestamp is fully represented by the encodings above.
combined_df = combined_df.drop(columns=['transaction_datetime'])

In [38]:
# Convert the city number to a fixed-width binary representation, one 0/1 column
# per bit. city_bin_0 is the most significant bit, matching character 0 of
# np.binary_repr(x, width=max_binary_width).
# Assumes 'city' holds non-negative integers with no missing values — TODO confirm.
max_binary_width = len(np.binary_repr(combined_df['city'].max()))
bin_col_names = [f'city_bin_{i}' for i in range(max_binary_width)]

# Extract all bits at once with vectorised shifts instead of building a string per
# row and looping over its characters. Shifts run from the highest bit down to 0
# so the columns come out in MSB -> LSB order.
city_codes = combined_df['city'].to_numpy()
bit_shifts = np.arange(max_binary_width - 1, -1, -1)
city_bits = (city_codes[:, None] >> bit_shifts) & 1

# Reuse combined_df's index so the column-wise concat lines rows up by position
# even if the index is not a default 0..n-1 range.
city_bits_df = pd.DataFrame(city_bits, columns=bin_col_names, index=combined_df.index)
combined_df = pd.concat([combined_df, city_bits_df], axis=1)

In [39]:
# The raw city id is no longer needed once its binary columns exist
combined_df.drop(labels='city', axis='columns', inplace=True)

In [40]:
# Cast the categorical feature columns that are present to pandas' 'category' dtype
cols = combined_df.columns
categorical_names = ('debit_credit', 'trx_type', 'currency')
for col in cols:
    if col not in categorical_names:
        continue
    combined_df[col] = combined_df[col].astype('category')

In [41]:
# Expand each categorical column into one indicator column per category
one_hot_columns = ['debit_credit', 'trx_type', 'currency']
combined_df_encoded = pd.get_dummies(data=combined_df, columns=one_hot_columns)

In [42]:
# Make a dictionary of all the unique customer ids and their transactions.

# sort_values already returns a new frame, so combined_df_encoded is left untouched
# without paying for an extra full copy first.
combined_df_encoded_copy = combined_df_encoded.sort_values(by=['customer_id'], ignore_index=True)

# get all the unique customer ids
customer_ids = combined_df_encoded_copy['customer_id'].unique()

# groupby(sort=False) yields groups in order of first appearance, which on the
# sorted frame is the same order as customer_ids. Each group keeps its rows in
# frame order together with their index labels, i.e. the same rows the manual
# start/end index scan used to slice out with iloc.
# Rows whose customer_id is missing are skipped by groupby (the manual scan mapped
# a missing id to an empty frame).
# NOTE(review): rows are ordered by customer_id only, so transactions within a
# customer are not in chronological order — confirm this is what the downstream
# sequence model expects.
customer_groups = combined_df_encoded_copy.groupby('customer_id', sort=False)

cust_trx_dict = {
    cust: cust_trx
    for cust, cust_trx in tqdm(customer_groups, total=customer_groups.ngroups)
}

100%|██████████| 467468/467468 [00:58<00:00, 7954.03it/s] 


In [43]:
# Show the feature columns of the first customer's transaction frame.
# next(iter(...)) takes the first value directly instead of copying every key
# of the dictionary into a list just to read element 0.
list(next(iter(cust_trx_dict.values())).columns)

['customer_id',
 'amount_cad',
 'Is Laundering',
 'abs_pos_encoding',
 'cyc_enc_minutes_sin',
 'cyc_enc_minutes_cos',
 'cyc_enc_hours_sin',
 'cyc_enc_hours_cos',
 'cyc_enc_weekday_sin',
 'cyc_enc_weekday_cos',
 'city_bin_0',
 'city_bin_1',
 'city_bin_2',
 'city_bin_3',
 'city_bin_4',
 'city_bin_5',
 'city_bin_6',
 'city_bin_7',
 'city_bin_8',
 'city_bin_9',
 'city_bin_10',
 'city_bin_11',
 'city_bin_12',
 'city_bin_13',
 'city_bin_14',
 'city_bin_15',
 'city_bin_16',
 'city_bin_17',
 'city_bin_18',
 'debit_credit_credit',
 'debit_credit_debit',
 'trx_type_ach',
 'trx_type_card',
 'trx_type_cheque',
 'trx_type_wire',
 'currency_Australian Dollar',
 'currency_Bitcoin',
 'currency_Brazil Real',
 'currency_Canadian Dollar',
 'currency_Euro',
 'currency_Mexican Peso',
 'currency_Ruble',
 'currency_Rupee',
 'currency_Saudi Riyal',
 'currency_Shekel',
 'currency_Swiss Franc',
 'currency_UK Pound',
 'currency_US Dollar',
 'currency_Yen',
 'currency_Yuan']

In [None]:
# Persist the per-customer dictionary as a pickle file.
# Keys are customer ids; each value is the DataFrame of that customer's transactions.
pickle_path = 'synth_trx_level_training_data_dict.pkl'
with open(pickle_path, mode='wb') as pickle_file:
    pickle.dump(cust_trx_dict, pickle_file)

In [45]:
# Preview the transaction frames of the first six customers in the dictionary
for i, (cust, trx) in enumerate(cust_trx_dict.items(), start=1):
    display(trx)
    if i > 5:
        break

Unnamed: 0,customer_id,amount_cad,Is Laundering,abs_pos_encoding,cyc_enc_minutes_sin,cyc_enc_minutes_cos,cyc_enc_hours_sin,cyc_enc_hours_cos,cyc_enc_weekday_sin,cyc_enc_weekday_cos,...,currency_Mexican Peso,currency_Ruble,currency_Rupee,currency_Saudi Riyal,currency_Shekel,currency_Swiss Franc,currency_UK Pound,currency_US Dollar,currency_Yen,currency_Yuan
0,10042B660,28236.661509,0,0.534713,-5.877853e-01,-8.090170e-01,-7.071068e-01,0.707107,-0.433884,-0.900969,...,False,False,False,False,False,False,False,True,False,False
1,10042B660,307.257162,0,0.272739,-3.090170e-01,9.510565e-01,1.224647e-16,-1.000000,0.000000,1.000000,...,False,False,False,False,False,False,False,True,False,False
2,10042B660,4789.885107,0,0.504506,-2.079117e-01,-9.781476e-01,7.071068e-01,-0.707107,-0.433884,-0.900969,...,False,False,False,False,False,False,False,True,False,False
3,10042B660,936.771624,0,0.321303,8.090170e-01,-5.877853e-01,8.660254e-01,-0.500000,0.781831,0.623490,...,False,False,False,False,False,False,False,True,False,False
4,10042B660,2823.624243,0,0.367365,-1.000000e+00,-1.836970e-16,5.000000e-01,0.866025,0.974928,-0.222521,...,False,False,False,False,False,False,False,True,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
130771,10042B660,276.187512,0,0.304740,-9.945219e-01,-1.045285e-01,2.588190e-01,0.965926,0.781831,0.623490,...,False,False,False,False,False,False,False,True,False,False
130772,10042B660,2322.174543,0,0.585614,-4.067366e-01,9.135455e-01,-9.659258e-01,-0.258819,-0.974928,-0.222521,...,False,False,False,False,False,False,False,True,False,False
130773,10042B660,2235.006111,0,0.068842,5.665539e-16,-1.000000e+00,7.071068e-01,0.707107,-0.433884,-0.900969,...,False,False,False,False,False,False,False,True,False,False
130774,10042B660,45.722964,0,0.548773,9.781476e-01,2.079117e-01,7.071068e-01,0.707107,-0.974928,-0.222521,...,False,False,False,False,False,False,False,True,False,False


Unnamed: 0,customer_id,amount_cad,Is Laundering,abs_pos_encoding,cyc_enc_minutes_sin,cyc_enc_minutes_cos,cyc_enc_hours_sin,cyc_enc_hours_cos,cyc_enc_weekday_sin,cyc_enc_weekday_cos,...,currency_Mexican Peso,currency_Ruble,currency_Rupee,currency_Saudi Riyal,currency_Shekel,currency_Swiss Franc,currency_UK Pound,currency_US Dollar,currency_Yen,currency_Yuan
130776,10042B6A8,1.069376e+04,0,0.304197,-0.104528,-0.994522,2.588190e-01,0.965926,0.781831,0.623490,...,False,False,False,False,False,False,False,False,False,False
130777,10042B6A8,9.426615e+03,0,0.181575,-0.207912,-0.978148,0.000000e+00,1.000000,-0.781831,0.623490,...,False,False,False,False,False,False,False,False,False,False
130778,10042B6A8,7.039288e+01,0,0.152495,-0.500000,0.866025,1.224647e-16,-1.000000,-0.974928,-0.222521,...,False,False,False,False,False,False,False,False,False,False
130779,10042B6A8,4.540739e+03,0,0.498206,0.104528,0.994522,9.659258e-01,-0.258819,-0.433884,-0.900969,...,False,False,False,False,False,False,False,False,False,False
130780,10042B6A8,1.836576e+03,0,0.098798,0.207912,-0.978148,-7.071068e-01,-0.707107,-0.433884,-0.900969,...,False,False,False,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
212554,10042B6A8,1.922747e+05,0,0.314336,-0.406737,-0.913545,9.659258e-01,0.258819,0.781831,0.623490,...,False,False,False,False,False,False,False,False,False,False
212555,10042B6A8,3.344602e+02,0,0.359521,-0.669131,-0.743145,-2.588190e-01,0.965926,0.781831,0.623490,...,False,False,False,False,False,False,False,False,False,False
212556,10042B6A8,2.184330e+03,0,0.252336,-0.951057,0.309017,8.660254e-01,0.500000,0.000000,1.000000,...,False,False,False,False,False,False,False,False,False,False
212557,10042B6A8,9.564619e+02,0,0.325809,0.809017,0.587785,5.000000e-01,-0.866025,0.781831,0.623490,...,False,False,False,False,False,False,False,False,False,False


Unnamed: 0,customer_id,amount_cad,Is Laundering,abs_pos_encoding,cyc_enc_minutes_sin,cyc_enc_minutes_cos,cyc_enc_hours_sin,cyc_enc_hours_cos,cyc_enc_weekday_sin,cyc_enc_weekday_cos,...,currency_Mexican Peso,currency_Ruble,currency_Rupee,currency_Saudi Riyal,currency_Shekel,currency_Swiss Franc,currency_UK Pound,currency_US Dollar,currency_Yen,currency_Yuan
212559,10042B6F0,44.264808,0,0.160047,-0.406737,9.135455e-01,-0.707107,-7.071068e-01,-0.974928,-0.222521,...,False,False,False,False,False,False,False,False,False,True
212560,10042B6F0,548.101540,0,0.161090,0.809017,-5.877853e-01,-0.866025,-5.000000e-01,-0.974928,-0.222521,...,False,False,False,False,False,False,False,False,False,True
212561,10042B6F0,593.266912,0,0.355766,0.669131,7.431448e-01,-0.500000,8.660254e-01,0.781831,0.623490,...,False,False,False,False,False,False,False,False,False,True
212562,10042B6F0,339.248224,0,0.459321,0.104528,-9.945219e-01,-0.707107,-7.071068e-01,0.433884,-0.900969,...,False,False,False,False,False,False,False,False,False,True
212563,10042B6F0,411.448408,0,0.502587,-0.994522,1.045285e-01,0.866025,-5.000000e-01,-0.433884,-0.900969,...,False,False,False,False,False,False,False,False,False,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
237661,10042B6F0,448.174720,0,0.443049,-0.104528,9.945219e-01,0.866025,-5.000000e-01,0.433884,-0.900969,...,False,False,False,False,False,False,False,False,False,True
237662,10042B6F0,233.192400,0,0.499917,-0.951057,-3.090170e-01,0.965926,-2.588190e-01,-0.433884,-0.900969,...,False,False,False,False,False,False,False,False,False,True
237663,10042B6F0,165.210752,0,0.445427,-0.406737,9.135455e-01,0.707107,-7.071068e-01,0.433884,-0.900969,...,False,False,False,False,False,False,False,False,False,True
237664,10042B6F0,4913.167056,0,0.372246,-0.951057,-3.090170e-01,0.866025,5.000000e-01,0.974928,-0.222521,...,False,False,False,False,False,False,False,False,False,True


Unnamed: 0,customer_id,amount_cad,Is Laundering,abs_pos_encoding,cyc_enc_minutes_sin,cyc_enc_minutes_cos,cyc_enc_hours_sin,cyc_enc_hours_cos,cyc_enc_weekday_sin,cyc_enc_weekday_cos,...,currency_Mexican Peso,currency_Ruble,currency_Rupee,currency_Saudi Riyal,currency_Shekel,currency_Swiss Franc,currency_UK Pound,currency_US Dollar,currency_Yen,currency_Yuan
237666,10042B738,1527.497086,0,0.086741,-0.809017,-0.587785,5.000000e-01,-0.866025,-0.433884,-0.900969,...,False,False,False,False,False,False,False,False,True,False
237667,10042B738,1177.251206,0,0.037049,-0.951057,0.309017,-5.000000e-01,-0.866025,0.433884,-0.900969,...,False,False,False,False,False,False,False,False,True,False
237668,10042B738,18607.150070,0,0.006467,-0.500000,-0.866025,5.000000e-01,0.866025,0.433884,-0.900969,...,False,False,False,False,False,False,False,False,True,False
237669,10042B738,3883.955450,0,0.090162,0.104528,0.994522,1.224647e-16,-1.000000,-0.433884,-0.900969,...,False,False,False,False,False,False,False,False,True,False
237670,10042B738,161.026230,0,0.174483,-0.951057,-0.309017,-7.071068e-01,0.707107,-0.974928,-0.222521,...,False,False,False,False,False,False,False,False,True,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
248239,10042B738,257.116790,0,0.381300,0.913545,-0.406737,8.660254e-01,-0.500000,0.974928,-0.222521,...,False,False,False,False,False,False,False,False,True,False
248240,10042B738,7054.025832,0,0.490863,0.500000,0.866025,8.660254e-01,0.500000,-0.433884,-0.900969,...,False,False,False,False,False,False,False,False,True,False
248241,10042B738,99.109840,0,0.520903,0.500000,0.866025,-8.660254e-01,-0.500000,-0.433884,-0.900969,...,False,False,False,False,False,False,False,False,True,False
248242,10042B738,24570.061314,0,0.098673,0.500000,-0.866025,-7.071068e-01,-0.707107,-0.433884,-0.900969,...,False,False,False,False,False,False,False,False,True,False


Unnamed: 0,customer_id,amount_cad,Is Laundering,abs_pos_encoding,cyc_enc_minutes_sin,cyc_enc_minutes_cos,cyc_enc_hours_sin,cyc_enc_hours_cos,cyc_enc_weekday_sin,cyc_enc_weekday_cos,...,currency_Mexican Peso,currency_Ruble,currency_Rupee,currency_Saudi Riyal,currency_Shekel,currency_Swiss Franc,currency_UK Pound,currency_US Dollar,currency_Yen,currency_Yuan
248244,10042B780,3.188630e+02,0,0.136641,-0.500000,-0.866025,1.000000,6.123234e-17,-0.974928,-0.222521,...,False,False,True,False,False,False,False,False,False,False
248245,10042B780,4.238710e+06,0,0.017356,-0.406737,0.913545,1.000000,6.123234e-17,0.433884,-0.900969,...,False,False,True,False,False,False,False,False,False,False
248246,10042B780,1.349826e+03,0,0.084529,-0.994522,0.104528,0.707107,-7.071068e-01,-0.433884,-0.900969,...,False,False,True,False,False,False,False,False,False,False
248247,10042B780,3.991546e+03,0,0.412258,-0.913545,-0.406737,-0.866025,5.000000e-01,0.974928,-0.222521,...,False,False,True,False,False,False,False,False,False,False
248248,10042B780,7.227211e+03,0,0.274282,-0.406737,-0.913545,-0.258819,-9.659258e-01,0.000000,1.000000,...,False,False,True,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
266146,10042B780,5.850470e+03,0,0.196262,0.587785,-0.809017,1.000000,6.123234e-17,-0.781831,0.623490,...,False,False,True,False,False,False,False,False,False,False
266147,10042B780,2.538294e+03,0,0.311999,-0.743145,-0.669131,0.866025,5.000000e-01,0.781831,0.623490,...,False,False,True,False,False,False,False,False,False,False
266148,10042B780,3.575871e+01,0,0.064252,-0.866025,-0.500000,0.258819,9.659258e-01,-0.433884,-0.900969,...,False,False,True,False,False,False,False,False,False,False
266149,10042B780,1.122211e+07,0,0.048440,0.809017,-0.587785,-0.965926,2.588190e-01,0.433884,-0.900969,...,False,False,True,False,False,False,False,False,False,False


Unnamed: 0,customer_id,amount_cad,Is Laundering,abs_pos_encoding,cyc_enc_minutes_sin,cyc_enc_minutes_cos,cyc_enc_hours_sin,cyc_enc_hours_cos,cyc_enc_weekday_sin,cyc_enc_weekday_cos,...,currency_Mexican Peso,currency_Ruble,currency_Rupee,currency_Saudi Riyal,currency_Shekel,currency_Swiss Franc,currency_UK Pound,currency_US Dollar,currency_Yen,currency_Yuan
266151,10042B7C8,40.138938,0,0.066672,-7.431448e-01,-6.691306e-01,5.000000e-01,0.866025,-0.433884,-0.900969,...,False,True,False,False,False,False,False,False,False,False
266152,10042B7C8,32545.688280,0,0.522613,-9.945219e-01,1.045285e-01,-8.660254e-01,-0.500000,-0.433884,-0.900969,...,False,True,False,False,False,False,False,False,False,False
266153,10042B7C8,2317.651245,0,0.094334,-9.135455e-01,-4.067366e-01,-2.588190e-01,-0.965926,-0.433884,-0.900969,...,False,True,False,False,False,False,False,False,False,False
266154,10042B7C8,106.381107,0,0.393650,1.000000e+00,2.832769e-16,-2.588190e-01,-0.965926,0.974928,-0.222521,...,False,True,False,False,False,False,False,False,False,False
266155,10042B7C8,235.102833,0,0.591121,7.431448e-01,6.691306e-01,-8.660254e-01,0.500000,-0.974928,-0.222521,...,False,True,False,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
270607,10042B7C8,28221.548628,0,0.108895,5.665539e-16,-1.000000e+00,-9.659258e-01,0.258819,-0.433884,-0.900969,...,False,True,False,False,False,False,False,False,False,False
270608,10042B7C8,1618.060605,0,0.400617,2.079117e-01,9.781476e-01,-8.660254e-01,-0.500000,0.974928,-0.222521,...,False,True,False,False,False,False,False,False,False,False
270609,10042B7C8,7668.347757,0,0.096837,-9.135455e-01,-4.067366e-01,-5.000000e-01,-0.866025,-0.433884,-0.900969,...,False,True,False,False,False,False,False,False,False,False
270610,10042B7C8,25529.240541,0,0.452603,-9.510565e-01,3.090170e-01,1.224647e-16,-1.000000,0.433884,-0.900969,...,False,True,False,False,False,False,False,False,False,False


In [3]:
# Reload a previously saved per-customer dictionary.
# The context manager closes the file handle once loading finishes
# (pickle.load(open(...)) left it open).
# NOTE(review): this reads 'synthetic_cust_trx_dict.pkl', whereas the save cell in this
# notebook writes 'synth_trx_level_training_data_dict.pkl' — confirm which file is intended.
# NOTE: pickle.load can execute arbitrary code; only load files from a trusted source.
with open('synthetic_cust_trx_dict.pkl', 'rb') as f:
    reloaded_data = pickle.load(f)

In [5]:
# Total number of transactions across every customer in the reloaded dictionary
num_trx = sum(map(len, reloaded_data.values()))
print(num_trx)

5308695
