In [39]:
import pandas as pd
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt
import scipy.stats as st
from sklearn.preprocessing import FunctionTransformer
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression


In [2]:
df_feature_engineered = pd.read_csv('feature_engineered_dataset.csv')
df_feature_engineered

Unnamed: 0,Timestamp,Recieving_account,Originating_Account,Amount Received,Receiving Currency,Amount Paid,Payment Currency,Payment Format,Is Laundering,Year_Norm,Month_Norm,Day_Norm,Hour_Norm,Minute_Norm,Day_of_Week_Norm,Amount Paid and Recieved Match,Account Match,Currency Match
0,2022-09-01 00:20:00,8000EBD30,8000EBD30,3697.34,US Dollar,3697.34,US Dollar,Reinvestment,0.0,,0.727273,0.000000,0.000000,0.338983,0.500000,1.0,1.0,1.0
1,2022-09-01 00:20:00,8000F4580,8000F5340,0.01,US Dollar,0.01,US Dollar,Cheque,0.0,,0.727273,0.000000,0.000000,0.338983,0.500000,1.0,0.0,1.0
2,2022-09-01 00:00:00,8000F4670,8000F4670,14675.57,US Dollar,14675.57,US Dollar,Reinvestment,0.0,,0.727273,0.000000,0.000000,0.000000,0.500000,1.0,1.0,1.0
3,2022-09-01 00:02:00,8000F5030,8000F5030,2806.97,US Dollar,2806.97,US Dollar,Reinvestment,0.0,,0.727273,0.000000,0.000000,0.033898,0.500000,1.0,1.0,1.0
4,2022-09-01 00:06:00,8000F5200,8000F5200,36682.97,US Dollar,36682.97,US Dollar,Reinvestment,0.0,,0.727273,0.000000,0.000000,0.101695,0.500000,1.0,1.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1728955,2022-09-02 19:19:00,1004288A0,80A247AF0,694.36,Australian Dollar,694.36,Australian Dollar,Cash,0.0,,0.727273,0.033333,0.826087,0.322034,0.666667,1.0,0.0,1.0
1728956,2022-09-02 19:21:00,80A24CD50,80A24C630,2346.59,Australian Dollar,2346.59,Australian Dollar,ACH,0.0,,0.727273,0.033333,0.826087,0.355932,0.666667,1.0,0.0,1.0
1728957,2022-09-02 19:07:00,80A252350,80A24FA90,869.53,Australian Dollar,869.53,Australian Dollar,ACH,0.0,,0.727273,0.033333,0.826087,0.118644,0.666667,1.0,0.0,1.0
1728958,2022-09-02 19:17:00,80A23DAA0,80A25EB50,3749.98,Australian Dollar,3749.98,Australian Dollar,Credit Card,0.0,,0.727273,0.033333,0.826087,0.288136,0.666667,1.0,0.0,1.0


In [3]:
# remove nan from df['Is Laundering']
df_feature_engineered.dropna(subset=['Is Laundering'], inplace=True)

In [4]:
df_feature_engineered.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1728959 entries, 0 to 1728958
Data columns (total 18 columns):
 #   Column                          Dtype  
---  ------                          -----  
 0   Timestamp                       object 
 1   Recieving_account               object 
 2   Originating_Account             object 
 3   Amount Received                 float64
 4   Receiving Currency              object 
 5   Amount Paid                     float64
 6   Payment Currency                object 
 7   Payment Format                  object 
 8   Is Laundering                   float64
 9   Year_Norm                       float64
 10  Month_Norm                      float64
 11  Day_Norm                        float64
 12  Hour_Norm                       float64
 13  Minute_Norm                     float64
 14  Day_of_Week_Norm                float64
 15  Amount Paid and Recieved Match  float64
 16  Account Match                   float64
 17  Currency Match                  

In [6]:
def engineer_timestamp_features(df):
    df['Timestamp'] = pd.to_datetime(df['Timestamp'])
    df['Year_Norm'] = df['Timestamp'].dt.year
    df['Month_Norm'] = df['Timestamp'].dt.month
    df['Day_Norm'] = df['Timestamp'].dt.day
    df['Hour_Norm'] = df['Timestamp'].dt.hour
    df['Minute_Norm'] = df['Timestamp'].dt.minute
    df['Day_of_Week_Norm'] = df['Timestamp'].dt.dayofweek
    return df.drop('Timestamp', axis=1)

In [7]:
def engineer_match_features(df):
    df['Amount Paid and Recieved Match'] = (df['Amount Received'] == df['Amount Paid']).astype(float)
    df['Account Match'] = (df['Recieving_account'] == df['Originating_Account']).astype(float)
    df['Currency Match'] = (df['Receiving Currency'] == df['Payment Currency']).astype(float)
    return df

In [8]:
numeric_features = [
    'Amount Received', 'Amount Paid',
    'Year_Norm', 'Month_Norm', 'Day_Norm',
    'Hour_Norm', 'Minute_Norm', 'Day_of_Week_Norm',
    'Amount Paid and Recieved Match',
    'Account Match', 'Currency Match'
]

In [9]:
categorical_features = [
    'Recieving_account', 'Originating_Account',
    'Receiving Currency', 'Payment Currency',
    'Payment Format'
]

In [10]:
def frequency_encode(df, columns):
    for col in columns:
        freq_encoding = df[col].value_counts(normalize=True)
        df[f'{col}_freq'] = df[col].map(freq_encoding)
    return df.drop(columns, axis=1)

In [11]:
high_cardinality_features = ['Recieving_account', 'Originating_Account']

In [12]:
X = df_feature_engineered.drop('Is Laundering', axis=1)
y = df_feature_engineered['Is Laundering']

In [13]:
X

Unnamed: 0,Timestamp,Recieving_account,Originating_Account,Amount Received,Receiving Currency,Amount Paid,Payment Currency,Payment Format,Year_Norm,Month_Norm,Day_Norm,Hour_Norm,Minute_Norm,Day_of_Week_Norm,Amount Paid and Recieved Match,Account Match,Currency Match
0,2022-09-01 00:20:00,8000EBD30,8000EBD30,3697.34,US Dollar,3697.34,US Dollar,Reinvestment,,0.727273,0.000000,0.000000,0.338983,0.500000,1.0,1.0,1.0
1,2022-09-01 00:20:00,8000F4580,8000F5340,0.01,US Dollar,0.01,US Dollar,Cheque,,0.727273,0.000000,0.000000,0.338983,0.500000,1.0,0.0,1.0
2,2022-09-01 00:00:00,8000F4670,8000F4670,14675.57,US Dollar,14675.57,US Dollar,Reinvestment,,0.727273,0.000000,0.000000,0.000000,0.500000,1.0,1.0,1.0
3,2022-09-01 00:02:00,8000F5030,8000F5030,2806.97,US Dollar,2806.97,US Dollar,Reinvestment,,0.727273,0.000000,0.000000,0.033898,0.500000,1.0,1.0,1.0
4,2022-09-01 00:06:00,8000F5200,8000F5200,36682.97,US Dollar,36682.97,US Dollar,Reinvestment,,0.727273,0.000000,0.000000,0.101695,0.500000,1.0,1.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1728954,2022-09-02 19:13:00,1004288A0,80A247AF0,2236.16,Australian Dollar,2236.16,Australian Dollar,Cheque,,0.727273,0.033333,0.826087,0.220339,0.666667,1.0,0.0,1.0
1728955,2022-09-02 19:19:00,1004288A0,80A247AF0,694.36,Australian Dollar,694.36,Australian Dollar,Cash,,0.727273,0.033333,0.826087,0.322034,0.666667,1.0,0.0,1.0
1728956,2022-09-02 19:21:00,80A24CD50,80A24C630,2346.59,Australian Dollar,2346.59,Australian Dollar,ACH,,0.727273,0.033333,0.826087,0.355932,0.666667,1.0,0.0,1.0
1728957,2022-09-02 19:07:00,80A252350,80A24FA90,869.53,Australian Dollar,869.53,Australian Dollar,ACH,,0.727273,0.033333,0.826087,0.118644,0.666667,1.0,0.0,1.0


In [14]:

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [19]:
test_pipeline_2 = Pipeline([
    ('feature_engineering', Pipeline([
        ('timestamp', FunctionTransformer(engineer_timestamp_features)),
        ('match', FunctionTransformer(engineer_match_features)),
        ('freq_encode', FunctionTransformer(
            lambda df: frequency_encode(df, high_cardinality_features))
        )
    ])),
    ('preprocessor', ColumnTransformer([
        ('num', Pipeline([
            ('imputer', SimpleImputer(strategy='median')),
            ('scaler', StandardScaler())
        ]), numeric_features + [f'{col}_freq' for col in high_cardinality_features]),
        ('cat', Pipeline([
            ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
            ('onehot', OneHotEncoder(handle_unknown='ignore'))
        ]), ['Receiving Currency', 'Payment Currency', 'Payment Format'])
    ])),
    ('classifier', RandomForestClassifier(random_state=42))
])

In [30]:
param_grid = {
    'classifier__n_estimators': [5, 20, 100],
    'classifier__max_depth': [None, 10, 20],
    #'classifier__min_samples_split': [2, 5, 10],
   # 'classifier__min_samples_leaf': [1, 2, 4]
}


In [31]:
grid_search = GridSearchCV(
    estimator=test_pipeline_2,
    param_grid=param_grid,
    cv=5,
    scoring='precision',
    n_jobs= -1 ,
    verbose=9
)

In [32]:
grid_search.fit(X_train, y_train)

Fitting 5 folds for each of 9 candidates, totalling 45 fits


In [35]:
print("Best Parameters:", grid_search.best_params_)
print("Best Cross-Validation Precision:", grid_search.best_score_)

Best Parameters: {'classifier__max_depth': 10, 'classifier__n_estimators': 20}
Best Cross-Validation Precision: 0.9164600911264632


In [37]:
grid_search.cv_results_

{'mean_fit_time': array([ 42.40524564, 119.58731985, 528.7670393 ,  39.97354951,
        100.94313555, 405.47903914,  43.29504123, 109.43320632,
        440.73137312]),
 'std_fit_time': array([ 2.25425348,  8.57432925, 32.34467812,  2.31538662,  8.04569377,
         7.71128362,  2.34052393,  5.10170337, 81.45764333]),
 'mean_score_time': array([3.72406478, 4.89526658, 7.16268377, 3.18217173, 3.90212045,
        6.67832079, 4.38384376, 3.75699267, 6.81231694]),
 'std_score_time': array([0.45538629, 0.81383737, 0.7822446 , 0.65885077, 0.29024815,
        1.28813949, 1.1380503 , 0.51174184, 1.37437064]),
 'param_classifier__max_depth': masked_array(data=[None, None, None, 10, 10, 10, 20, 20, 20],
              mask=[False, False, False, False, False, False, False, False,
                    False],
        fill_value='?',
             dtype=object),
 'param_classifier__n_estimators': masked_array(data=[5, 20, 100, 5, 20, 100, 5, 20, 100],
              mask=[False, False, False, False, Fa

In [41]:
test_pipeline_logreg = Pipeline([
    ('feature_engineering', Pipeline([
        ('timestamp', FunctionTransformer(engineer_timestamp_features)),
        ('match', FunctionTransformer(engineer_match_features)),
        ('freq_encode', FunctionTransformer(
            lambda df: frequency_encode(df, high_cardinality_features))
        )
    ])),
    ('preprocessor', ColumnTransformer([
        ('num', Pipeline([
            ('imputer', SimpleImputer(strategy='median')),
            ('scaler', StandardScaler())
        ]), numeric_features + [f'{col}_freq' for col in high_cardinality_features]),
        ('cat', Pipeline([
            ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
            ('onehot', OneHotEncoder(handle_unknown='ignore'))
        ]), ['Receiving Currency', 'Payment Currency', 'Payment Format'])
    ])),
    ('classifier', LogisticRegression(random_state=42, max_iter=1000))
])

In [42]:
param_grid_logreg = {
    'classifier__C': [0.01, 0.1, 1, 10, 100],
    'classifier__penalty': ['l1', 'l2']
}

In [43]:
grid_search_logreg = GridSearchCV(
    estimator=test_pipeline_logreg ,
    param_grid=param_grid_logreg,
    cv=5,
    scoring='precision',
    n_jobs= -1 ,
    verbose=9
)

In [44]:
grid_search_logreg.fit(X_train, y_train)

Fitting 5 folds for each of 10 candidates, totalling 50 fits


25 fits failed out of a total of 50.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
25 fits failed with the following error:
Traceback (most recent call last):
  File "/usr/local/lib/python3.11/dist-packages/sklearn/model_selection/_validation.py", line 866, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/usr/local/lib/python3.11/dist-packages/sklearn/base.py", line 1389, in wrapper
    return fit_method(estimator, *args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.11/dist-packages/sklearn/pipeline.py", line 662, in fit
    self._final_estimator.fit(Xt, y, **last_step_params["fit"])
  File "/usr/local/lib/python3.11/dist-packages/sklearn/base.py", line 1389, in 

In [45]:
print("Best Parameters:", grid_search_logreg.best_params_)
print("Best Cross-Validation Precision:", grid_search_logreg.best_score_)

Best Parameters: {'classifier__C': 10, 'classifier__penalty': 'l2'}
Best Cross-Validation Precision: 0.8646456434261311


The Random Forest model seems to have a higher precision score than the logistic regression model when it comes to predicting whether a transaction is fraudulent or not because it is better at capturing complex, non-linear relationships in the data and can leverage multiple decision trees to reduce bias, whereas logistic regression relies on a linear decision boundary, which may not be as effective in distinguishing fraudulent transactions from legitimate ones. Additionally, Random Forest's ability to handle imbalanced datasets through bootstrapping and feature bagging may contribute to its higher precision in identifying fraud.

Precision, in this context, refers to the proportion of transactions that the model correctly identifies as fraudulent out of all the transactions it predicts as fraudulent. A higher precision score means that when the model flags a transaction as fraudulent, it is more likely to be correct, reducing the number of false positives. This is particularly important in fraud detection, where incorrectly labeling legitimate transactions as fraud can lead to unnecessary disruptions for customers.