In [1]:
# This file combines all different type of transactions in to a single data frame. 
# Then output a dictionary with cutomer ID's as keys and features as values. 

# All the categorical columns are converted to one hot encoding and and the 
# transaction data and time are converted to time delta absolute positional encoding and cyclical positional encoding for hour of the day, day of the week and day of the month

# The idea is to feed all of these transaction into a reccurrant model such as 
# RNN or GRU to automatically make features for each customer instead of hand crafting them. 

In [2]:
import pandas as pd
from datetime import time
from collections import Counter
import numpy as np
from tqdm import tqdm
import matplotlib.pyplot as plt
import pickle

In [3]:
# Load data
frac = 1
wire = pd.read_csv('raw_data/wire.csv', engine="pyarrow")
abm = pd.read_csv('raw_data/abm.csv', engine="pyarrow")
cheque = pd.read_csv('raw_data/cheque.csv', engine="pyarrow").sample(frac = frac)
eft = pd.read_csv('raw_data/eft.csv', engine="pyarrow").sample(frac = frac)
emt = pd.read_csv('raw_data/emt.csv', engine="pyarrow").sample(frac = frac)
card = pd.read_csv('raw_data/card.csv', engine="pyarrow").sample(frac = frac)


In [4]:
# Preprocess data
# Make all amounts positive
card['amount_cad'] = np.abs(card['amount_cad'])
# Make debit and credit columns consistent across all dataframes
emt['debit_credit'] = emt['debit_credit'].apply(lambda x: 'debit' if x == 'D' else 'credit')

In [5]:
#making dictionary of all transactions
transactions = {
    'wire': wire,
    'abm': abm,
    'cheque': cheque,
    'eft': eft,
    'emt': emt,
    'card': card}

In [6]:
# Drop all the transaction id columns since they are all unique and not useful
transactions['wire'].drop(columns=['wire_id'], inplace=True)
transactions['abm'].drop(columns=['abm_id'], inplace=True)
transactions['cheque'].drop(columns=['cheque_id'], inplace=True)
transactions['eft'].drop(columns=['eft_id'], inplace=True)
transactions['emt'].drop(columns=['emt_id'], inplace=True)
transactions['card'].drop(columns=['card_trxn_id'], inplace=True)

In [7]:
# Combine date and time columns into a single datetime column

def combine_date_and_time(df):
    if 'transaction_time' not in df.columns:
        #rename the column transaction_date to transaction_datetime
        df['transaction_time'] = time(0, 0, 0)
        
    df['datetime_str'] = (
            df['transaction_date'].astype(str) + " " + df['transaction_time'].astype(str)
        )

    # Parse the combined string into datetime
    df['transaction_datetime'] = pd.to_datetime(
        df['datetime_str'],
        format='%Y-%m-%d %H:%M:%S',  # Adjust format to match your data
        errors='coerce'  # Coerce invalid values to NaT
    )
    df = df.drop(columns='datetime_str')
    df.drop(columns=['transaction_date', 'transaction_time'], inplace=True)
    
    return df

In [8]:
# sort by date and time
def sort_by_date_time(df):
    df = df.sort_values(by='transaction_datetime')    
    return df

In [9]:
# Add a column for the transaction type since all the dataframes are going to be combined
def trx_type_col(df:pd.DataFrame, trx_type:str):
    df['trx_type'] = trx_type
    return df

In [10]:
# Combine all the dataframes into a single dataframe
def combine_df(df_dict:dict):
    df = pd.concat(df_dict.values(), axis= 0, ignore_index=True)
    return df

In [11]:
# Combine date and time columns, sort by date and time, and add a column for the transaction type
for trx_type, df in transactions.items():
    df = combine_date_and_time(df)
    # df = sort_by_date_time(df)
    df = trx_type_col(df, trx_type)   
    transactions[trx_type] = df

In [12]:
# Combine all the dataframes into a single dataframe
combined_df = combine_df(transactions)

In [13]:
# add positional encoding and cyclic encoding for date and time features
earliest_time = combined_df['transaction_datetime'].min()
latest_time = combined_df['transaction_datetime'].max()

abs_pos_encoding = (combined_df['transaction_datetime'] - earliest_time).apply(lambda x: x.total_seconds()/(latest_time-earliest_time).total_seconds()) 
combined_df['abs_pos_encoding'] = abs_pos_encoding
hours = combined_df['transaction_datetime'].dt.hour
days = combined_df['transaction_datetime'].dt.day
week_day = combined_df['transaction_datetime'].dt.dayofweek

combined_df['cyc_enc_hours_sin'] = np.sin(2*np.pi*hours/24)
combined_df['cyc_enc_hours_cos'] = np.cos(2*np.pi*hours/24)

combined_df['cyc_enc_days_sin'] = np.sin(2*np.pi*days/31)
combined_df['cyc_enc_days_cos'] = np.cos(2*np.pi*days/31)

combined_df['cyc_enc_weekday_sin'] = np.sin(2*np.pi*week_day/7)
combined_df['cyc_enc_weekday_cos'] = np.cos(2*np.pi*week_day/7)

combined_df.drop(columns=['transaction_datetime'], inplace=True)

In [14]:
# Fill missing values with 'other' for categorical columns
cols = combined_df.columns
for col in cols:
    if col in ['cash_indicator', 'ecommerce_ind', 'merchant_category', 'country', 'city', 'province']:
       combined_df[col].fillna('other', inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  combined_df[col].fillna('other', inplace=True)


In [15]:
# Convert categorical columns to categorical data type
for col in cols:
    if col in ['debit_credit','trx_type','cash_indicator', 'country', 'province', 'city','merchant_category', 'ecommerce_ind']:
        combined_df[col] = combined_df[col].astype('category')

In [16]:
# One hot encode categorical columns
combined_df_encoded = pd.get_dummies(
    combined_df,
    columns=['debit_credit','trx_type','cash_indicator', 'country', 'province', 'city','merchant_category', 'ecommerce_ind'],  # Columns to encode
)

In [17]:
# Make a dictionary of all the unique customer ids and their transactions
# get all the unique customer ids
combined_df_encoded_copy = combined_df_encoded.copy()

combined_df_encoded_copy.sort_values(by=['customer_id'], inplace=True, ignore_index=True) 

customer_ids = combined_df_encoded_copy['customer_id'].unique()

cust_trx_dict = {}

start_idx = 0
curr_idx = 0
last_idx = None
for cust in tqdm(customer_ids):
    while cust == combined_df_encoded_copy['customer_id'][curr_idx]:
        curr_idx += 1
        if curr_idx == len(combined_df_encoded_copy):
            cust_trx_dict[cust] = combined_df_encoded_copy.iloc[start_idx:]
            break
    last_idx = curr_idx
    cust_trx_dict[cust] = combined_df_encoded_copy.iloc[start_idx:last_idx]
    start_idx = curr_idx

100%|██████████| 16226/16226 [00:03<00:00, 5166.67it/s]


In [19]:
list(cust_trx_dict[list(cust_trx_dict.keys())[0]].columns)

['customer_id',
 'amount_cad',
 'abs_pos_encoding',
 'cyc_enc_hours_sin',
 'cyc_enc_hours_cos',
 'cyc_enc_days_sin',
 'cyc_enc_days_cos',
 'cyc_enc_weekday_sin',
 'cyc_enc_weekday_cos',
 'debit_credit_credit',
 'debit_credit_debit',
 'trx_type_abm',
 'trx_type_card',
 'trx_type_cheque',
 'trx_type_eft',
 'trx_type_emt',
 'trx_type_wire',
 'cash_indicator_False',
 'cash_indicator_True',
 'cash_indicator_other',
 'country_CA',
 'country_GB',
 'country_IE',
 'country_SE',
 'country_US',
 'country_other',
 'province_AB',
 'province_AZ',
 'province_BC',
 'province_CA',
 'province_DE',
 'province_FL',
 'province_GA',
 'province_MA',
 'province_MB',
 'province_NB',
 'province_NL',
 'province_NS',
 'province_NY',
 'province_ON',
 'province_PE',
 'province_PQ',
 'province_QC',
 'province_SK',
 'province_TX',
 'province_WA',
 'province_YT',
 'province_other',
 'city_ABBOTSFORD',
 'city_AIRDRIE',
 'city_AJAX',
 'city_AMHERST',
 'city_AURORA',
 'city_BARRIE',
 'city_BELLEVILLE',
 'city_BOLTON',
 '

In [20]:
# Save the dictionary to a pickle file
# The dictionalry has customer id as keys and their transactions as values
with open('cust_trx_dict.pkl', 'wb') as f:
    pickle.dump(cust_trx_dict, f)

In [21]:
i = 0
for cust, trx in cust_trx_dict.items():
    display(cust_trx_dict[cust])
    i += 1
    if i>5:
        break

Unnamed: 0,customer_id,amount_cad,abs_pos_encoding,cyc_enc_hours_sin,cyc_enc_hours_cos,cyc_enc_days_sin,cyc_enc_days_cos,cyc_enc_weekday_sin,cyc_enc_weekday_cos,debit_credit_credit,...,merchant_category_7538,merchant_category_7542,merchant_category_8099,merchant_category_8398,merchant_category_8699,merchant_category_9399,merchant_category_other,ecommerce_ind_False,ecommerce_ind_True,ecommerce_ind_other
0,SYNCID0000000000,6316.04,0.184787,0.0,1.0,-0.485302,-0.874347,-0.433884,-0.900969,False,...,False,False,False,False,False,False,True,False,False,True
1,SYNCID0000000000,0.03,0.652188,0.0,1.0,-2.449294e-16,1.0,-0.974928,-0.222521,True,...,False,False,False,False,False,False,True,False,False,True
2,SYNCID0000000000,415.24,0.076089,0.0,1.0,0.9987165,-0.050649,0.781831,0.62349,False,...,False,False,False,False,False,False,True,False,False,True


Unnamed: 0,customer_id,amount_cad,abs_pos_encoding,cyc_enc_hours_sin,cyc_enc_hours_cos,cyc_enc_days_sin,cyc_enc_days_cos,cyc_enc_weekday_sin,cyc_enc_weekday_cos,debit_credit_credit,...,merchant_category_7538,merchant_category_7542,merchant_category_8099,merchant_category_8398,merchant_category_8699,merchant_category_9399,merchant_category_other,ecommerce_ind_False,ecommerce_ind_True,ecommerce_ind_other
3,SYNCID0000000001,213.4,0.221591,0.707107,-0.707107,-0.897805,-0.440394,0.0,1.0,False,...,False,False,False,False,False,False,False,True,False,False
4,SYNCID0000000001,60.3,0.54043,-0.965926,-0.258819,-0.790776,-0.612106,0.781831,0.62349,False,...,False,False,False,False,False,False,True,True,False,False
5,SYNCID0000000001,1.72,0.016976,-0.258819,-0.965926,0.394356,0.918958,0.974928,-0.222521,False,...,False,False,False,False,False,False,False,False,True,False
6,SYNCID0000000001,15.97,0.3093,0.5,-0.866025,-0.394356,0.918958,0.781831,0.62349,False,...,False,False,False,False,False,False,True,True,False,False


Unnamed: 0,customer_id,amount_cad,abs_pos_encoding,cyc_enc_hours_sin,cyc_enc_hours_cos,cyc_enc_days_sin,cyc_enc_days_cos,cyc_enc_weekday_sin,cyc_enc_weekday_cos,debit_credit_credit,...,merchant_category_7538,merchant_category_7542,merchant_category_8099,merchant_category_8398,merchant_category_8699,merchant_category_9399,merchant_category_other,ecommerce_ind_False,ecommerce_ind_True,ecommerce_ind_other
7,SYNCID0000000002,1597.38,0.163047,0.0,1.0,-0.101168,-0.994869,0.974928,-0.222521,True,...,False,False,False,False,False,False,True,False,False,True
8,SYNCID0000000002,3771.6,0.260875,0.0,1.0,-0.937752,0.347305,-0.433884,-0.900969,False,...,False,False,False,False,False,False,True,False,False,True
9,SYNCID0000000002,2186.16,0.097828,0.0,1.0,0.897805,-0.440394,0.433884,-0.900969,True,...,False,False,False,False,False,False,True,False,False,True
10,SYNCID0000000002,2506.0,0.250005,0.0,1.0,-0.988468,0.151428,0.433884,-0.900969,False,...,False,False,False,False,False,False,True,False,False,True
11,SYNCID0000000002,2503.42,0.141307,0.0,1.0,0.299363,-0.954139,0.0,1.0,True,...,False,False,False,False,False,False,True,False,False,True
12,SYNCID0000000002,1661.76,0.913063,0.0,1.0,-0.988468,0.151428,0.781831,0.62349,True,...,False,False,False,False,False,False,True,False,False,True
13,SYNCID0000000002,8137.34,0.380443,0.0,1.0,0.937752,0.347305,0.781831,0.62349,False,...,False,False,False,False,False,False,True,False,False,True
14,SYNCID0000000002,5296.11,0.717407,0.0,1.0,0.937752,0.347305,-0.433884,-0.900969,True,...,False,False,False,False,False,False,True,False,False,True
15,SYNCID0000000002,1127.94,0.065219,0.0,1.0,0.988468,0.151428,0.0,1.0,False,...,False,False,False,False,False,False,True,False,False,True
16,SYNCID0000000002,1283.8,0.945672,0.0,1.0,-0.724793,0.688967,-0.433884,-0.900969,True,...,False,False,False,False,False,False,True,False,False,True


Unnamed: 0,customer_id,amount_cad,abs_pos_encoding,cyc_enc_hours_sin,cyc_enc_hours_cos,cyc_enc_days_sin,cyc_enc_days_cos,cyc_enc_weekday_sin,cyc_enc_weekday_cos,debit_credit_credit,...,merchant_category_7538,merchant_category_7542,merchant_category_8099,merchant_category_8398,merchant_category_8699,merchant_category_9399,merchant_category_other,ecommerce_ind_False,ecommerce_ind_True,ecommerce_ind_other
34,SYNCID0000000004,105.35,0.646265,0.5,-0.8660254,-0.201299,0.97953,-0.433884,-0.900969,False,...,False,False,False,False,False,False,False,True,False,False
35,SYNCID0000000004,1241.79,0.43206,-0.9659258,-0.258819,0.897805,-0.440394,-0.974928,-0.222521,False,...,False,False,False,False,False,False,False,True,False,False
36,SYNCID0000000004,5.6,0.138979,-1.0,-1.83697e-16,0.485302,-0.874347,-0.781831,0.62349,False,...,False,False,False,False,False,False,False,True,False,False
37,SYNCID0000000004,17.14,0.050599,-0.7071068,-0.7071068,0.848644,0.528964,-0.974928,-0.222521,False,...,False,False,False,False,False,False,False,True,False,False
38,SYNCID0000000004,29.86,0.638435,-0.9659258,-0.258819,-0.394356,0.918958,0.433884,-0.900969,False,...,False,False,False,False,False,False,False,True,False,False
39,SYNCID0000000004,236.07,0.201238,1.224647e-16,-1.0,-0.651372,-0.758758,-0.974928,-0.222521,True,...,False,False,False,False,False,False,False,True,False,False
40,SYNCID0000000004,19.85,0.083077,-0.7071068,-0.7071068,0.998717,-0.050649,0.781831,0.62349,False,...,False,False,False,False,False,False,False,False,True,False
41,SYNCID0000000004,53.44,0.962769,-0.258819,-0.9659258,-0.571268,0.820763,-0.974928,-0.222521,False,...,False,False,False,False,False,False,False,True,False,False
42,SYNCID0000000004,118.32,0.299129,1.224647e-16,-1.0,-0.571268,0.820763,0.0,1.0,False,...,False,False,False,False,False,False,False,True,False,False
43,SYNCID0000000004,43.02,0.960046,0.9659258,-0.258819,-0.571268,0.820763,-0.974928,-0.222521,False,...,False,False,False,False,False,False,False,True,False,False


Unnamed: 0,customer_id,amount_cad,abs_pos_encoding,cyc_enc_hours_sin,cyc_enc_hours_cos,cyc_enc_days_sin,cyc_enc_days_cos,cyc_enc_weekday_sin,cyc_enc_weekday_cos,debit_credit_credit,...,merchant_category_7538,merchant_category_7542,merchant_category_8099,merchant_category_8398,merchant_category_8699,merchant_category_9399,merchant_category_other,ecommerce_ind_False,ecommerce_ind_True,ecommerce_ind_other
78,SYNCID0000000005,2.14,0.715711,-0.866025,0.5,0.8486443,0.528964,0.433884,-0.900969,True,...,False,False,False,False,False,False,True,False,False,True
79,SYNCID0000000005,22.12,0.094518,-0.866025,-0.5,0.9680771,-0.250653,0.974928,-0.222521,True,...,False,False,False,False,False,False,True,False,False,True
80,SYNCID0000000005,43.22,0.49671,-0.866025,-0.5,-0.1011683,-0.994869,-0.433884,-0.900969,True,...,False,False,False,False,False,False,True,False,False,True
81,SYNCID0000000005,84.7,0.178306,0.707107,-0.7071068,-0.2993631,-0.954139,0.433884,-0.900969,True,...,False,False,False,False,False,False,True,False,False,True
82,SYNCID0000000005,82.05,0.873356,0.866025,-0.5,-0.7907757,-0.612106,-0.433884,-0.900969,True,...,False,False,False,False,False,False,True,False,False,True
83,SYNCID0000000005,852.5,0.794285,0.258819,0.9659258,0.485302,-0.874347,-0.433884,-0.900969,True,...,False,False,False,False,False,False,True,False,False,True
84,SYNCID0000000005,148.33,0.192349,-0.866025,-0.5,-0.485302,-0.874347,-0.433884,-0.900969,True,...,False,False,False,False,False,False,True,False,False,True
85,SYNCID0000000005,41.35,0.83049,0.707107,-0.7071068,-0.1011683,-0.994869,0.0,1.0,True,...,False,False,False,False,False,False,True,False,False,True
86,SYNCID0000000005,72.44,0.374837,0.258819,-0.9659258,0.8486443,0.528964,0.0,1.0,True,...,False,False,False,False,False,False,True,False,False,True
87,SYNCID0000000005,61.65,0.559351,0.258819,-0.9659258,-0.9680771,-0.250653,0.433884,-0.900969,True,...,False,False,False,False,False,False,True,False,False,True


Unnamed: 0,customer_id,amount_cad,abs_pos_encoding,cyc_enc_hours_sin,cyc_enc_hours_cos,cyc_enc_days_sin,cyc_enc_days_cos,cyc_enc_weekday_sin,cyc_enc_weekday_cos,debit_credit_credit,...,merchant_category_7538,merchant_category_7542,merchant_category_8099,merchant_category_8398,merchant_category_8699,merchant_category_9399,merchant_category_other,ecommerce_ind_False,ecommerce_ind_True,ecommerce_ind_other
128,SYNCID0000000006,3071.24,0.771756,0.0,1.0,0.790776,-0.612106,0.974928,-0.222521,False,...,False,False,False,False,False,False,True,False,False,True
129,SYNCID0000000006,3047.98,0.315224,0.0,1.0,-0.201299,0.97953,0.974928,-0.222521,False,...,False,False,False,False,False,False,True,False,False,True
130,SYNCID0000000006,141466.98,0.413052,0.0,1.0,0.968077,-0.250653,-0.433884,-0.900969,True,...,False,False,False,False,False,False,True,False,False,True
