In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [13]:
BASE_PATH = "/content/drive/MyDrive/fraud_assignment"

CSV_PATH = f"{BASE_PATH}/Fraud.csv"
DICT_PATH = f"{BASE_PATH}/Data Dictionary.txt"

print("Paths set successfully")

Paths set successfully


In [7]:
import numpy as np
import pandas as pd

In [8]:
df = pd.read_csv(CSV_PATH)
df.head()

Unnamed: 0,step,type,amount,nameOrig,oldbalanceOrg,newbalanceOrig,nameDest,oldbalanceDest,newbalanceDest,isFraud,isFlaggedFraud
0,1,PAYMENT,9839.64,C1231006815,170136.0,160296.36,M1979787155,0.0,0.0,0,0
1,1,PAYMENT,1864.28,C1666544295,21249.0,19384.72,M2044282225,0.0,0.0,0,0
2,1,TRANSFER,181.0,C1305486145,181.0,0.0,C553264065,0.0,0.0,1,0
3,1,CASH_OUT,181.0,C840083671,181.0,0.0,C38997010,21182.0,0.0,1,0
4,1,PAYMENT,11668.14,C2048537720,41554.0,29885.86,M1230701703,0.0,0.0,0,0


In [9]:
df.shape

(6362620, 11)

In [10]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6362620 entries, 0 to 6362619
Data columns (total 11 columns):
 #   Column          Dtype  
---  ------          -----  
 0   step            int64  
 1   type            object 
 2   amount          float64
 3   nameOrig        object 
 4   oldbalanceOrg   float64
 5   newbalanceOrig  float64
 6   nameDest        object 
 7   oldbalanceDest  float64
 8   newbalanceDest  float64
 9   isFraud         int64  
 10  isFlaggedFraud  int64  
dtypes: float64(5), int64(3), object(3)
memory usage: 534.0+ MB


In [11]:
df.isnull().sum()

Unnamed: 0,0
step,0
type,0
amount,0
nameOrig,0
oldbalanceOrg,0
newbalanceOrig,0
nameDest,0
oldbalanceDest,0
newbalanceDest,0
isFraud,0


In [14]:
with open(DICT_PATH, 'r') as f:
    data_dict = f.read()

print(data_dict)

step - maps a unit of time in the real world. In this case 1 step is 1 hour of time. Total steps 744 (30 days simulation).

type - CASH-IN, CASH-OUT, DEBIT, PAYMENT and TRANSFER.

amount - amount of the transaction in local currency.

nameOrig - customer who started the transaction

oldbalanceOrg - initial balance before the transaction

newbalanceOrig - new balance after the transaction

nameDest - customer who is the recipient of the transaction

oldbalanceDest - initial balance recipient before the transaction. Note that there is not information for customers that start with M (Merchants).

newbalanceDest - new balance recipient after the transaction. Note that there is not information for customers that start with M (Merchants).

isFraud - This is the transactions made by the fraudulent agents inside the simulation. In this specific dataset the fraudulent behavior of the agents aims to profit by taking control or customers accounts and try to empty the funds by transferring to anot

In [15]:
df['isFraud'].value_counts()


Unnamed: 0_level_0,count
isFraud,Unnamed: 1_level_1
0,6354407
1,8213


In [16]:
df['isFraud'].mean() * 100

np.float64(0.12908204481801522)

In [17]:
pd.crosstab(df['type'], df['isFraud'], normalize='index') * 100

isFraud,0,1
type,Unnamed: 1_level_1,Unnamed: 2_level_1
CASH_IN,100.0,0.0
CASH_OUT,99.816045,0.183955
DEBIT,100.0,0.0
PAYMENT,100.0,0.0
TRANSFER,99.231201,0.768799


In [18]:
df['hour'] = df['step'] % 24
df[['step', 'hour']].head()

Unnamed: 0,step,hour
0,1,1
1,1,1
2,1,1
3,1,1
4,1,1


In [19]:
df['orig_balance_diff'] = df['oldbalanceOrg'] - df['newbalanceOrig']
df['dest_balance_diff'] = df['newbalanceDest'] - df['oldbalanceDest']

df[['orig_balance_diff', 'dest_balance_diff']].head()

Unnamed: 0,orig_balance_diff,dest_balance_diff
0,9839.64,0.0
1,1864.28,0.0
2,181.0,0.0
3,181.0,-21182.0
4,11668.14,0.0


In [20]:
df['log_amount'] = np.log1p(df['amount'])
df[['amount', 'log_amount']].head()

Unnamed: 0,amount,log_amount
0,9839.64,9.194276
1,1864.28,7.531166
2,181.0,5.204007
3,181.0,5.204007
4,11668.14,9.364703


In [21]:
df_model = df.drop(columns=['nameOrig', 'nameDest'])
df_model.head()

Unnamed: 0,step,type,amount,oldbalanceOrg,newbalanceOrig,oldbalanceDest,newbalanceDest,isFraud,isFlaggedFraud,hour,orig_balance_diff,dest_balance_diff,log_amount
0,1,PAYMENT,9839.64,170136.0,160296.36,0.0,0.0,0,0,1,9839.64,0.0,9.194276
1,1,PAYMENT,1864.28,21249.0,19384.72,0.0,0.0,0,0,1,1864.28,0.0,7.531166
2,1,TRANSFER,181.0,181.0,0.0,0.0,0.0,1,0,1,181.0,0.0,5.204007
3,1,CASH_OUT,181.0,181.0,0.0,21182.0,0.0,1,0,1,181.0,-21182.0,5.204007
4,1,PAYMENT,11668.14,41554.0,29885.86,0.0,0.0,0,0,1,11668.14,0.0,9.364703


In [22]:
df_model.shape, df_model.columns

((6362620, 13),
 Index(['step', 'type', 'amount', 'oldbalanceOrg', 'newbalanceOrig',
        'oldbalanceDest', 'newbalanceDest', 'isFraud', 'isFlaggedFraud', 'hour',
        'orig_balance_diff', 'dest_balance_diff', 'log_amount'],
       dtype='object'))

In [23]:
from sklearn.model_selection import train_test_split

X = df_model.drop('isFraud', axis=1)
y = df_model['isFraud']

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y
)

X_train.shape, X_test.shape

((5090096, 12), (1272524, 12))

In [24]:
# Identify categorical and numerical columns
cat_cols = ['type']
num_cols = [col for col in X_train.columns if col not in cat_cols]

cat_cols, num_cols

(['type'],
 ['step',
  'amount',
  'oldbalanceOrg',
  'newbalanceOrig',
  'oldbalanceDest',
  'newbalanceDest',
  'isFlaggedFraud',
  'hour',
  'orig_balance_diff',
  'dest_balance_diff',
  'log_amount'])

In [25]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

preprocess = ColumnTransformer(
    transformers=[
        ('cat', OneHotEncoder(handle_unknown='ignore'), cat_cols),
        ('num', 'passthrough', num_cols)
    ]
)

preprocess

In [26]:
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report, roc_auc_score

log_clf = Pipeline(steps=[
    ('preprocess', preprocess),
    ('model', LogisticRegression(
        max_iter=1000,
        class_weight='balanced',
        n_jobs=-1
    ))
])

log_clf.fit(X_train, y_train)

# Predictions
y_pred = log_clf.predict(X_test)
y_prob = log_clf.predict_proba(X_test)[:, 1]

print(classification_report(y_test, y_pred))
print("ROC-AUC:", roc_auc_score(y_test, y_prob))

              precision    recall  f1-score   support

           0       1.00      0.96      0.98   1270881
           1       0.03      0.95      0.06      1643

    accuracy                           0.96   1272524
   macro avg       0.51      0.95      0.52   1272524
weighted avg       1.00      0.96      0.98   1272524

ROC-AUC: 0.9914589901642087


In [27]:
from sklearn.ensemble import RandomForestClassifier

rf_clf = Pipeline(steps=[
    ('preprocess', preprocess),
    ('model', RandomForestClassifier(
        n_estimators=200,
        random_state=42,
        class_weight='balanced',
        n_jobs=-1
    ))
])

rf_clf.fit(X_train, y_train)

# Predictions
y_pred_rf = rf_clf.predict(X_test)
y_prob_rf = rf_clf.predict_proba(X_test)[:, 1]

print(classification_report(y_test, y_pred_rf))
print("ROC-AUC:", roc_auc_score(y_test, y_prob_rf))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00   1270881
           1       0.98      0.84      0.90      1643

    accuracy                           1.00   1272524
   macro avg       0.99      0.92      0.95   1272524
weighted avg       1.00      1.00      1.00   1272524

ROC-AUC: 0.9981177984648423


In [28]:
# Extract feature importance from trained Random Forest
rf_model = rf_clf.named_steps['model']

# Get feature names after OneHotEncoding
ohe = rf_clf.named_steps['preprocess'].named_transformers_['cat']
ohe_features = list(ohe.get_feature_names_out(cat_cols))

feature_names = ohe_features + num_cols

importances = rf_model.feature_importances_

fi = pd.DataFrame({
    'feature': feature_names,
    'importance': importances
}).sort_values(by='importance', ascending=False)

fi.head(15)

Unnamed: 0,feature,importance
13,orig_balance_diff,0.330684
7,oldbalanceOrg,0.171623
8,newbalanceOrig,0.111956
6,amount,0.068789
15,log_amount,0.067147
14,dest_balance_diff,0.045105
4,type_TRANSFER,0.042651
3,type_PAYMENT,0.038258
5,step,0.031437
12,hour,0.029612


Q7. What kind of prevention should be adopted while the company updates its infrastructure?

Based on the insights obtained from the fraud detection model, a multi-layered prevention strategy should be adopted while updating the company’s infrastructure.

First, the machine learning model should be deployed as a real-time transaction scoring system, where each transaction is evaluated for fraud risk before completion. Transactions with high risk scores can be temporarily blocked or routed for manual review.

Second, additional security controls such as multi-factor authentication (MFA) should be enforced for high-risk transactions, particularly large-value TRANSFER and CASH_OUT transactions.

Third, velocity and behavioral checks should be introduced to detect abnormal patterns such as sudden account draining or rapid consecutive transfers.

Finally, rule-based systems (e.g., fixed amount thresholds) should be retained only as a supporting layer, while the machine learning model serves as the primary decision engine, as it adapts better to evolving fraud patterns.

Q8. Assuming these actions have been implemented, how would you determine if they work?

The effectiveness of the implemented fraud prevention measures can be evaluated using both model-level metrics and business-level KPIs.

From a modeling perspective, continuous monitoring of recall, precision, F1-score, and ROC-AUC should be performed on recent transaction data to ensure that fraud detection performance does not degrade over time. Special attention should be given to recall, as missing fraudulent transactions can result in significant financial loss.

From a business perspective, key indicators such as reduction in fraud losses, decrease in false positives, lower manual review volume, and improved customer experience should be tracked.

Additionally, A/B testing can be conducted by comparing transaction outcomes before and after model deployment, and data drift monitoring should be used to identify changes in transaction behavior that may require model retraining.