# Processing Synthetic Data to make it look like Competition Dataset

In [32]:
import os

import numpy as np
import pandas as pd
from tqdm import tqdm


In [33]:
# # Load data only for calculating the proportion of credit and debit card transactions


# frac = 1
# wire = pd.read_csv('raw_data/wire.csv', engine="pyarrow")
# abm = pd.read_csv('raw_data/abm.csv', engine="pyarrow")
# cheque = pd.read_csv('raw_data/cheque.csv', engine="pyarrow").sample(frac = frac)
# eft = pd.read_csv('raw_data/eft.csv', engine="pyarrow").sample(frac = frac)
# emt = pd.read_csv('raw_data/emt.csv', engine="pyarrow").sample(frac = frac)
# card = pd.read_csv('raw_data/card.csv', engine="pyarrow").sample(frac = frac)

# # Preprocess data
# # Make all amounts positive
# card['amount_cad'] = np.abs(card['amount_cad'])
# # Make debit and credit columns consistent across all dataframes
# emt['debit_credit'] = emt['debit_credit'].apply(lambda x: 'debit' if x == 'D' else 'credit')

# # Dictionary of all the competition transactions, used only for calculating the debit and credit proportions
# transactions = {
#     'wire': wire,
#     'abm': abm,
#     'cheque': cheque,
#     'eft': eft,
#     'emt': emt,
#     'card': card
#     }

# # calculate proportion of debit and credit transactions in the competition dataset for each format
# # the same proportion will be used to generate unidirectional data from the bidirectional synthetic IBM data 
# for key, value in transactions.items():
#     debit_transactions = len(value[value['debit_credit'] == 'debit'])
#     credit_transactions = len(value[value['debit_credit'] == 'credit'])
#     total_transactions = len(value)
#     print(f'{key[:3]} \t Debit % = {round(100*debit_transactions/total_transactions, 2)},\tCredit % = {round(100*credit_transactions/total_transactions, 2)}')

In [34]:
# Fraction of the synthetic IBM transactions to keep (1 keeps every row, in shuffled order).
ibm_frac = 1
# Fixed seed so the shuffle / sub-sample yields the same rows in the same order on every run.
IBM_SAMPLE_SEED = 42
ibm = pd.read_csv('synth_datasets/LI-Small_Trans.csv').sample(
    frac=ibm_frac, random_state=IBM_SAMPLE_SEED
)

In [35]:
print(f'# of unique transactions = {len(ibm)}')
# Keep only the Cheque, ACH, Credit Card and Wire rows of the IBM dataset,
# renumbering the surviving rows from 0.
filtered_ibm = ibm[
    ibm['Payment Format'].isin(['Cheque', 'ACH', 'Credit Card', 'Wire'])
].reset_index(drop=True)
print(f'# of unique transactions that are Cheque, ACH, Credit Card and Wire Payment Format Transactions = {len(filtered_ibm)}')


# of unique transactions = 6924049
# of unique transactions that are Cheque, ACH, Credit Card and Wire Payment Format Transactions = 5308695


In [36]:
# Debit share of the competition dataset, in percent, for each payment format.
# The remainder (100 - value) is the credit share.
deb_cred_proportion = {
    'Cheque':      58.26,
    'ACH':         50.0,
    'Credit Card': 97.56,
    'Wire':        55.63,
}

In [37]:
#Define a random true/false generator with a given probability
def random_generator(probability):
    """Return a random true/false value that is true with the given probability.

    One uniform sample is drawn from NumPy's global random state and compared
    against ``probability``.

    Args:
        probability: Chance of a true result, as a number between 0 and 1.

    Returns:
        True when the drawn sample is strictly below ``probability``,
        otherwise False.
    """
    sample = np.random.random()
    is_hit = sample < probability
    return is_hit

In [38]:
# get exchange rates for all currencies used in the dataset and store them in a dictionary
# dictionary is used to convert all currency amounts to CAD
import requests

def get_exchange_rates(timeout=10):
    """Fetch current exchange rates expressed as CAD per one unit of each currency.

    Fiat rates come from ExchangeRate-API (quoted with CAD as the base, so they
    are inverted here); the Bitcoin price in CAD is fetched separately from
    CoinDesk.

    Args:
        timeout: Seconds to wait for each HTTP request before giving up.

    Returns:
        dict mapping the currency names used in the dataset (e.g. 'US Dollar')
        to the value of one unit of that currency in CAD. Fiat rates are
        rounded to 4 decimals, Bitcoin to 2. Currencies missing from the API
        response are left out of the result.

    Raises:
        requests.HTTPError: If either service answers with an error status.
        KeyError: If a response does not have the expected JSON layout.
    """
    # Dataset currency name -> ISO 4217 code used as key in the API response.
    currency_map = {
        'US Dollar': 'USD', 'Swiss Franc': 'CHF', 'Saudi Riyal': 'SAR', 'Euro': 'EUR',
        'Canadian Dollar': 'CAD', 'Rupee': 'INR', 'Australian Dollar': 'AUD', 'Yuan': 'CNY',
        'Ruble': 'RUB', 'Mexican Peso': 'MXN', 'Shekel': 'ILS',
        'Brazil Real': 'BRL', 'UK Pound': 'GBP', 'Yen': 'JPY'
    }

    url = "https://api.exchangerate-api.com/v4/latest/CAD"  # Using ExchangeRate-API
    response = requests.get(url, timeout=timeout)
    # Fail loudly on an HTTP error instead of trying to parse an error page.
    response.raise_for_status()
    cad_based_rates = response.json()['rates']

    exchange_rates = {}
    for currency, code in currency_map.items():
        rate = cad_based_rates.get(code)
        if not rate:
            # Missing (or zero) rate: skip rather than divide by zero.
            continue
        # The API quotes "units of currency per 1 CAD"; invert to get CAD per unit.
        exchange_rates[currency] = round(1 / rate, 4)

    # Fetch Bitcoin exchange rate separately
    btc_url = "https://api.coindesk.com/v1/bpi/currentprice/CAD.json"
    btc_response = requests.get(btc_url, timeout=timeout)
    btc_response.raise_for_status()
    btc_rate = btc_response.json()['bpi']['CAD']['rate_float']
    # Already quoted in CAD per Bitcoin, so only round (to cents).
    exchange_rates['Bitcoin'] = round(btc_rate, 2)

    return exchange_rates


In [39]:
# exchange_rates = get_exchange_rates()
# print(exchange_rates)

In [40]:
# Snapshot of the exchange rates as of 2025-01-31.
# Each value is multiplied with an amount in that currency to obtain the amount in CAD.
exchange_rates = {
    'US Dollar': 1.4451,
    'Swiss Franc': 1.5898,
    'Saudi Riyal': 0.3846,
    'Euro': 1.5038,
    'Canadian Dollar': 1.0,
    'Rupee': 0.0167,
    'Australian Dollar': 0.9009,
    'Yuan': 0.1988,
    'Ruble': 0.0147,
    'Mexican Peso': 0.0701,
    'Shekel': 0.4032,
    'Brazil Real': 0.2457,
    'UK Pound': 1.7953,
    'Yen': 0.0094,
    'Bitcoin': 148034.33,
}

In [41]:
# Turn every bidirectional IBM transaction into ONE unidirectional record:
# with the debit probability of its payment format the sender side is kept
# (a 'debit'), otherwise the receiver side is kept (a 'credit').

# Seed NumPy's global RNG (the one random_generator draws from) so the
# debit/credit assignment is the same on every run.
DEBIT_CREDIT_SEED = 42
np.random.seed(DEBIT_CREDIT_SEED)

row_list = []
for i in tqdm(range(len(filtered_ibm))):
    row = filtered_ibm.iloc[i]
    # Read the payment format from the positional row itself; a label lookup
    # (filtered_ibm['Payment Format'][i]) only matches when the index is 0..n-1.
    is_debit = random_generator(deb_cred_proportion[row['Payment Format']] / 100)

    if is_debit:
        # Sender side of the transaction.
        bank, account = row['From Bank'], row['Account']
        amount, currency = row['Amount Paid'], row['Payment Currency']
    else:
        # Receiver side of the transaction.
        bank, account = row['To Bank'], row['Account.1']
        amount, currency = row['Amount Received'], row['Receiving Currency']

    # Parse the timestamp once and split it into date and time strings.
    timestamp = pd.Timestamp(row['Timestamp'])
    row_list.append({
        'transaction_date': str(timestamp.date()),
        'transaction_time': str(timestamp.time()),
        'city': bank,
        'customer_id': account,
        # Convert the amount to CAD using the rate of its own currency.
        'amount_cad': amount * exchange_rates[currency],
        'debit_credit': 'debit' if is_debit else 'credit',
        'trx_type': row['Payment Format'],
        'Is Laundering': row['Is Laundering'],
    })

# Build the frame once from the collected records.
d_c_dataframe = pd.DataFrame(row_list)

100%|██████████| 5308695/5308695 [02:27<00:00, 35994.29it/s]


In [48]:
# Write the generated debit/credit table to a CSV file, without the index column.
d_c_dataframe.to_csv(path_or_buf='synth_training_data.csv', index=False)

In [43]:
#load the generated data from the csv file
# d_c_dataframe = pd.read_csv('synth_training_data.csv')

In [49]:
# Share (in percent) of accounts that appear in at least one transaction flagged as laundering.
fraud_accounts = d_c_dataframe.loc[
    d_c_dataframe['Is Laundering'] == 1, 'customer_id'
].nunique(dropna=False)
total_accounts = d_c_dataframe['customer_id'].nunique(dropna=False)
fraud_percentage = fraud_accounts / total_accounts * 100
print(fraud_percentage)

0.5299095219052792


In [50]:
# Share (in percent) of transactions that are flagged as laundering.
fraud_transactions = int((d_c_dataframe['Is Laundering'] == 1).sum())
total_transactions = d_c_dataframe.shape[0]
fraud_percentage = fraud_transactions / total_transactions * 100
print(fraud_percentage)


0.06274611745447799


In [51]:
# Save the data in separate files for each payment format
save_path = 'processed_synth_dataset/'
# Create the output folder up front; to_csv fails if the directory is missing.
os.makedirs(save_path, exist_ok=True)

# Output file name for every payment format that gets its own file.
file_name_by_format = {
    'ACH': 'ach_s.csv',
    'Cheque': 'cheque_s.csv',
    'Credit Card': 'card_s.csv',
    'Wire': 'wire_s.csv',
}

for payment_format in d_c_dataframe['trx_type'].unique():
    file_name = file_name_by_format.get(payment_format)
    if file_name is None:
        # Formats without an entry above are not written.
        continue
    format_rows = d_c_dataframe[d_c_dataframe['trx_type'] == payment_format]
    # The format is implied by the file, so the trx_type column is dropped.
    format_rows.drop(columns=['trx_type']).to_csv(save_path + file_name, index=False)
        

In [None]:
# Read the per-payment-format synthetic CSV files back from save_path.
ach_s = pd.read_csv(f'{save_path}ach_s.csv')
cheque_s = pd.read_csv(f'{save_path}cheque_s.csv')
card_s = pd.read_csv(f'{save_path}card_s.csv')
wire_s = pd.read_csv(f'{save_path}wire_s.csv')
