# Experimentation with Synthetic Data

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.cluster import DBSCAN
from sklearn.manifold import TSNE
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import IsolationForest
import umap as um 
import seaborn as sns


In [3]:
# Load the transactions CSV into a DataFrame; the path is relative to the
# notebook's working directory.
ibm = pd.read_csv(filepath_or_buffer='synth_datasets/LI-Small_Trans.csv')

In [5]:
# Number of rows (transactions) loaded into ibm.
print(ibm.shape[0])

6924049


In [6]:
# Print the smallest and largest Timestamp values as a (min, max) tuple.
# NOTE(review): if Timestamp is still stored as strings, min/max compare
# lexicographically; that matches chronological order only while every value
# is zero-padded 'YYYY/MM/DD HH:MM' — confirm, or convert with pd.to_datetime.
ts_col = ibm['Timestamp']
print(f"IBM = {(ts_col.min(), ts_col.max())}")
# TODO: the equivalent range check for the saml frame is currently disabled:
# print(f"sam = {saml['Date'].min(), saml['Date'].max()}")

IBM = ('2022/09/01 00:00', '2022/09/17 15:28')


In [7]:
# For every column, print its distinct values together with how many there are.
# The distinct values are computed once per column and reused for the count;
# calling .unique() twice would scan each multi-million-row column twice.
# len(unique()) is kept (rather than nunique()) so a missing value, if any,
# is still counted as one distinct entry.
for col in ibm.columns:
    uniques = ibm[col].unique()
    print(f'{col} - {uniques, len(uniques)}')

Timestamp - (array(['2022/09/01 00:08', '2022/09/01 00:21', '2022/09/01 00:00', ...,
       '2022/09/10 23:43', '2022/09/10 23:53', '2022/09/10 23:36'],
      dtype=object), 14533)
From Bank - (array([    11,   3402,   3814, ..., 376530, 376722, 376735]), 41814)
Account - (array(['8000ECA90', '80021DAD0', '8006AD080', ..., '80D27FCA0',
       '81B172D21', '81B3D58E1'], dtype=object), 681281)
To Bank - (array([    11,   3402,   1120, ..., 322256, 346093, 326118]), 21588)
Account.1 - (array(['8000ECA90', '80021DAD0', '8006AA910', ..., '816C52C00',
       '819D4F791', '805E8C600'], dtype=object), 576176)
Amount Received - (array([3.195403e+06, 1.858960e+03, 5.925710e+05, ..., 2.585800e-01,
       2.993790e-01, 6.305000e-02]), 1194921)
Receiving Currency - (array(['US Dollar', 'Euro', 'Bitcoin', 'Yuan', 'Yen', 'UK Pound',
       'Brazil Real', 'Australian Dollar', 'Rupee', 'Ruble',
       'Canadian Dollar', 'Mexican Peso', 'Swiss Franc', 'Shekel',
       'Saudi Riyal'], dtype=object), 15)


In [13]:
# Keep only the transactions whose payment currency AND receiving currency
# are both 'US Dollar', then report how many there are and preview both ends.
paid_in_usd = ibm['Payment Currency'].eq('US Dollar')
received_in_usd = ibm['Receiving Currency'].eq('US Dollar')
only_usd_transactions = ibm.loc[paid_in_usd & received_in_usd]
print(f"Num of transactions for with sending and receiving currency is US Dollar = {only_usd_transactions.shape[0]}")

# The original row index is preserved, so gaps in the printed index mark
# rows that were filtered out.
print("First 10 transactions")
print(only_usd_transactions.head(10))

print("Last 10 transactions")
print(only_usd_transactions.tail(10))

Num of transactions for with sending and receiving currency is US Dollar = 2502756
First 10 transactions
          Timestamp  From Bank    Account  To Bank  Account.1  \
0  2022/09/01 00:08         11  8000ECA90       11  8000ECA90   
1  2022/09/01 00:21       3402  80021DAD0     3402  80021DAD0   
2  2022/09/01 00:00         11  8000ECA90     1120  8006AA910   
3  2022/09/01 00:16       3814  8006AD080     3814  8006AD080   
4  2022/09/01 00:00         20  8006AD530       20  8006AD530   
5  2022/09/01 00:24         12  8006ADD30       12  8006ADD30   
6  2022/09/01 00:17         11  800059120     1217  8006AD4E0   
7  2022/09/01 00:07         11  8000ECA90       11  8000ECA90   
8  2022/09/01 00:28       1120  8006AA910   243166  81470DCF0   
9  2022/09/01 00:22       1217  8006AD4E0     1217  8006AD4E0   

   Amount Received Receiving Currency  Amount Paid Payment Currency  \
0       3195403.00          US Dollar   3195403.00        US Dollar   
1          1858.96          US Dollar

In [5]:
# Print a structural summary of ibm: index range, per-column dtypes and
# memory usage.
ibm.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6924049 entries, 0 to 6924048
Data columns (total 11 columns):
 #   Column              Dtype  
---  ------              -----  
 0   Timestamp           object 
 1   From Bank           int64  
 2   Account             object 
 3   To Bank             int64  
 4   Account.1           object 
 5   Amount Received     float64
 6   Receiving Currency  object 
 7   Amount Paid         float64
 8   Payment Currency    object 
 9   Payment Format      object 
 10  Is Laundering       int64  
dtypes: float64(2), int64(3), object(6)
memory usage: 581.1+ MB


In [6]:
# Percentage of distinct 'Account' values that appear on at least one
# transaction flagged with Is Laundering == 1.
# NOTE(review): 'Account' is presumably the sending account (it follows
# 'From Bank'); receiving accounts ('Account.1') are not considered — confirm.
laundering_accounts = ibm.loc[ibm['Is Laundering'] == 1, 'Account']
fraud_accounts = len(laundering_accounts.unique())
total_accounts = len(ibm['Account'].unique())
fraud_percentage = fraud_accounts / total_accounts * 100
print(fraud_percentage)

0.34963546612924773


In [8]:
# Preview the first 10 rows of the saml DataFrame (shown via the cell's
# last-expression display).
# NOTE(review): `saml` is not assigned in any cell visible above this one —
# confirm its loading cell exists and runs before this cell, otherwise a
# Restart & Run All raises NameError here.
saml.head(10)

Unnamed: 0,Time,Date,Sender_account,Receiver_account,Amount,Payment_currency,Received_currency,Sender_bank_location,Receiver_bank_location,Payment_type,Is_laundering,Laundering_type
0,10:35:19,2022-10-07,8724731955,2769355426,1459.15,UK pounds,UK pounds,UK,UK,Cash Deposit,0,Normal_Cash_Deposits
1,10:35:20,2022-10-07,1491989064,8401255335,6019.64,UK pounds,Dirham,UK,UAE,Cross-border,0,Normal_Fan_Out
2,10:35:20,2022-10-07,287305149,4404767002,14328.44,UK pounds,UK pounds,UK,UK,Cheque,0,Normal_Small_Fan_Out
3,10:35:21,2022-10-07,5376652437,9600420220,11895.0,UK pounds,UK pounds,UK,UK,ACH,0,Normal_Fan_In
4,10:35:21,2022-10-07,9614186178,3803336972,115.25,UK pounds,UK pounds,UK,UK,Cash Deposit,0,Normal_Cash_Deposits
5,10:35:21,2022-10-07,8974559268,3143547511,5130.99,UK pounds,UK pounds,UK,UK,ACH,0,Normal_Group
6,10:35:23,2022-10-07,980191499,8577635959,12176.52,UK pounds,UK pounds,UK,UK,ACH,0,Normal_Small_Fan_Out
7,10:35:23,2022-10-07,8057793308,9350896213,56.9,UK pounds,UK pounds,UK,UK,Credit card,0,Normal_Small_Fan_Out
8,10:35:26,2022-10-07,6116657264,656192169,4738.45,UK pounds,UK pounds,UK,UK,Cheque,0,Normal_Fan_Out
9,10:35:29,2022-10-07,7421451752,2755709071,5883.87,Indian rupee,UK pounds,UK,UK,Credit card,0,Normal_Fan_Out


In [9]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric
# columns of saml.
# NOTE(review): the account columns look like integer identifiers, so their
# mean/std/quantile rows are presumably not meaningful — read only the Amount
# and Is_laundering columns as statistics.
saml.describe()

Unnamed: 0,Sender_account,Receiver_account,Amount,Is_laundering
count,9504852.0,9504852.0,9504852.0,9504852.0
mean,5006619000.0,5006006000.0,8762.968,0.001038733
std,2885814000.0,2884763000.0,25614.95,0.03221263
min,9018.0,9018.0,3.73,0.0
25%,2513133000.0,2513219000.0,2143.688,0.0
50%,5001017000.0,5002572000.0,6113.72,0.0
75%,7505051000.0,7502397000.0,10458.46,0.0
max,9999987000.0,9999971000.0,12618500.0,1.0


In [10]:
# Print a structural summary of saml: index range, per-column dtypes and
# memory usage.
saml.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9504852 entries, 0 to 9504851
Data columns (total 12 columns):
 #   Column                  Dtype  
---  ------                  -----  
 0   Time                    object 
 1   Date                    object 
 2   Sender_account          int64  
 3   Receiver_account        int64  
 4   Amount                  float64
 5   Payment_currency        object 
 6   Received_currency       object 
 7   Sender_bank_location    object 
 8   Receiver_bank_location  object 
 9   Payment_type            object 
 10  Is_laundering           int64  
 11  Laundering_type         object 
dtypes: float64(1), int64(3), object(8)
memory usage: 870.2+ MB


In [11]:
# Percentage of distinct 'Receiver_account' values that appear on at least one
# transaction flagged with Is_laundering == 1.
# NOTE(review): this counts receiving accounts, whereas the ibm cell counts its
# 'Account' column — confirm the two percentages are meant to be comparable.
laundering_receivers = saml.loc[saml['Is_laundering'] == 1, 'Receiver_account']
fraud_accounts_saml = len(laundering_receivers.unique())
total_accounts_saml = len(saml['Receiver_account'].unique())
fraud_percentage_saml = fraud_accounts_saml / total_accounts_saml * 100
print(fraud_percentage_saml)

0.6245918076367617
