## 1. importing Necessary Libraries

In [20]:
# importing necessary libraries

import pandas as pd
import numpy as np
import plotly.express as px
import os

from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier

## 2. Loading & Exploring the Dataset

In [4]:
# i am loading the dataset from google colab, You can download from this link
# https://www.kaggle.com/datasets/ealaxi/paysim1

dataset = pd.read_csv("/content/PS_20174392719_1491204439457_log.csv")

In [9]:
dataset.head()

Unnamed: 0,step,type,amount,nameOrig,oldbalanceOrg,newbalanceOrig,nameDest,oldbalanceDest,newbalanceDest,isFraud,isFlaggedFraud
0,1,PAYMENT,9839.64,C1231006815,170136.0,160296.36,M1979787155,0.0,0.0,0,0
1,1,PAYMENT,1864.28,C1666544295,21249.0,19384.72,M2044282225,0.0,0.0,0,0
2,1,TRANSFER,181.0,C1305486145,181.0,0.0,C553264065,0.0,0.0,1,0
3,1,CASH_OUT,181.0,C840083671,181.0,0.0,C38997010,21182.0,0.0,1,0
4,1,PAYMENT,11668.14,C2048537720,41554.0,29885.86,M1230701703,0.0,0.0,0,0


In [5]:
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6362620 entries, 0 to 6362619
Data columns (total 11 columns):
 #   Column          Dtype  
---  ------          -----  
 0   step            int64  
 1   type            object 
 2   amount          float64
 3   nameOrig        object 
 4   oldbalanceOrg   float64
 5   newbalanceOrig  float64
 6   nameDest        object 
 7   oldbalanceDest  float64
 8   newbalanceDest  float64
 9   isFraud         int64  
 10  isFlaggedFraud  int64  
dtypes: float64(5), int64(3), object(3)
memory usage: 534.0+ MB


In [6]:
dataset.isnull().sum()

Unnamed: 0,0
step,0
type,0
amount,0
nameOrig,0
oldbalanceOrg,0
newbalanceOrig,0
nameDest,0
oldbalanceDest,0
newbalanceDest,0
isFraud,0


## 3. Transcation type anaylysis

In [7]:
dataset.type.value_counts()

Unnamed: 0_level_0,count
type,Unnamed: 1_level_1
CASH_OUT,2237500
PAYMENT,2151495
CASH_IN,1399284
TRANSFER,532909
DEBIT,41432


In [8]:
# Lets calculate the transaction types and visualize them
type = dataset["type"].value_counts()
transactions = type.index
values = type.values

In [12]:


# plotting the plotly bar chart
figure = px.bar(x = transactions, y = values, color = values,
  color_continuous_scale = "sunset")
figure.show()



In [13]:
# plotting the pie chart for more clarity
figure = px.pie(dataset, values = values, names = transactions, title="Types of Transaction")
figure.show()

## 4. Correlation Anaylsis

In [15]:
# Select only the numerical columns
numeric_df = dataset.select_dtypes(include=[float, int])

# Now compute the correlation
correlation = numeric_df.corr()

# Print sorted correlation values with 'isFraud'
print(correlation["isFraud"].sort_values(ascending=False))


isFraud           1.000000
amount            0.076688
isFlaggedFraud    0.044109
step              0.031578
oldbalanceOrg     0.010154
newbalanceDest    0.000535
oldbalanceDest   -0.005885
newbalanceOrig   -0.008148
Name: isFraud, dtype: float64


## 5. Preprocessing Categorical Variable

In [17]:
# Transforming categorical variables into numerical
dataset['type'] = dataset['type'].map({'CASH_OUT':1,'PAYMENT':2,'CASH_IN':3,'TRANSFER':4,'DEBIT':5})
dataset["isFraud"] = dataset ["isFraud"].map({0: "No Fraud", 1: "Fraud"})
dataset.head(10)

Unnamed: 0,step,type,amount,nameOrig,oldbalanceOrg,newbalanceOrig,nameDest,oldbalanceDest,newbalanceDest,isFraud,isFlaggedFraud
0,1,2,9839.64,C1231006815,170136.0,160296.36,M1979787155,0.0,0.0,No Fraud,0
1,1,2,1864.28,C1666544295,21249.0,19384.72,M2044282225,0.0,0.0,No Fraud,0
2,1,4,181.0,C1305486145,181.0,0.0,C553264065,0.0,0.0,Fraud,0
3,1,1,181.0,C840083671,181.0,0.0,C38997010,21182.0,0.0,Fraud,0
4,1,2,11668.14,C2048537720,41554.0,29885.86,M1230701703,0.0,0.0,No Fraud,0
5,1,2,7817.71,C90045638,53860.0,46042.29,M573487274,0.0,0.0,No Fraud,0
6,1,2,7107.77,C154988899,183195.0,176087.23,M408069119,0.0,0.0,No Fraud,0
7,1,2,7861.64,C1912850431,176087.23,168225.59,M633326333,0.0,0.0,No Fraud,0
8,1,2,4024.36,C1265012928,2671.0,0.0,M1176932104,0.0,0.0,No Fraud,0
9,1,5,5337.77,C712410124,41720.0,36382.23,C195600860,41898.0,40348.79,No Fraud,0


## 6. Building the classification model

In [19]:
# Lets build a classification model to classify whether the transaction is fraud or not by doing the feature and target selection
x = np.array(dataset[["type", "amount", "oldbalanceOrg", "newbalanceOrig"]])
y = np.array(dataset["isFraud"])

In [21]:
# Splitting the dataset for model training
xtrain, xtest, ytrain, ytest = train_test_split(x, y, test_size = 0.2, random_state = 42)

In [22]:
model = DecisionTreeClassifier()
model.fit(xtrain, ytrain)
print(model.score(xtest, ytest) * 100)

99.96982375185065


## 7. Model Evaluation

In [27]:
from sklearn.metrics import confusion_matrix, classification_report, roc_auc_score

# Predict and evaluate
y_pred = model.predict(xtest)
print(confusion_matrix(ytest, y_pred))
print(classification_report(ytest, y_pred))


[[   1421     199]
 [    185 1270719]]
              precision    recall  f1-score   support

       Fraud       0.88      0.88      0.88      1620
    No Fraud       1.00      1.00      1.00   1270904

    accuracy                           1.00   1272524
   macro avg       0.94      0.94      0.94   1272524
weighted avg       1.00      1.00      1.00   1272524



## 8. predicting the Fradulent Transication

In [23]:
# Lets predict the transactions

features = np.array([[4, 9000.60, 9000.60, 0.00]])
print(model.predict(features))

['Fraud']


In [24]:
# Lets predict the transactions
features = np.array([[2, 9839.64, 170136.00, 160296.36]])
print(model.predict(features))

['No Fraud']


### We can see the model performing with very good accuracy 99.96%. So, there is no specific hyperparamter tuning required