In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt


# Load the transaction log from its CSV file and preview the first rows.
csv_path = "/content/PS_20174392719_1491204439457_log.csv"
df = pd.read_csv(csv_path)
df.head()

Unnamed: 0,step,type,amount,nameOrig,oldbalanceOrg,newbalanceOrig,nameDest,oldbalanceDest,newbalanceDest,isFraud,isFlaggedFraud
0,1,PAYMENT,9839.64,C1231006815,170136.0,160296.36,M1979787155,0.0,0.0,0.0,0.0
1,1,PAYMENT,1864.28,C1666544295,21249.0,19384.72,M2044282225,0.0,0.0,0.0,0.0
2,1,TRANSFER,181.0,C1305486145,181.0,0.0,C553264065,0.0,0.0,1.0,0.0
3,1,CASH_OUT,181.0,C840083671,181.0,0.0,C38997010,21182.0,0.0,1.0,0.0
4,1,PAYMENT,11668.14,C2048537720,41554.0,29885.86,M1230701703,0.0,0.0,0.0,0.0


In [None]:
# Summarise each column's dtype and non-null count (plus memory usage) to spot missing data.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 69858 entries, 0 to 69857
Data columns (total 11 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   step            69858 non-null  int64  
 1   type            69858 non-null  object 
 2   amount          69858 non-null  float64
 3   nameOrig        69858 non-null  object 
 4   oldbalanceOrg   69858 non-null  float64
 5   newbalanceOrig  69858 non-null  float64
 6   nameDest        69858 non-null  object 
 7   oldbalanceDest  69858 non-null  float64
 8   newbalanceDest  69857 non-null  float64
 9   isFraud         69857 non-null  float64
 10  isFlaggedFraud  69857 non-null  float64
dtypes: float64(7), int64(1), object(3)
memory usage: 5.9+ MB


In [None]:
# Number of missing entries per column (isna is the canonical spelling of isnull).
df.isna().sum()

step              0
type              0
amount            0
nameOrig          0
oldbalanceOrg     0
newbalanceOrig    0
nameDest          0
oldbalanceDest    0
newbalanceDest    1
isFraud           1
isFlaggedFraud    1
dtype: int64

As you can see, there are some missing values in the dataset, so we will fill them in with a representative value (such as the mean) of the respective column.


In [None]:
# Continuous balance column: the column mean is a reasonable stand-in for a gap.
df["newbalanceDest"] = df["newbalanceDest"].fillna(df["newbalanceDest"].mean())

# Binary 0/1 flag columns: fill with the most frequent value instead of the mean.
# The mean of a 0/1 column is a fraction, which is not a valid flag and would be
# turned into NaN by any later lookup that only knows the keys 0 and 1.
for flag_col in ("isFraud", "isFlaggedFraud"):
    most_common = df[flag_col].mode().iloc[0]  # mode() skips NaN by default
    df[flag_col] = df[flag_col].fillna(most_common)


The null values have now been replaced using a statistic of each column (e.g. its mean); the check below confirms that none remain.

In [None]:
# Re-count missing entries per column to verify the fill step left no gaps.
df.isna().sum()

step              0
type              0
amount            0
nameOrig          0
oldbalanceOrg     0
newbalanceOrig    0
nameDest          0
oldbalanceDest    0
newbalanceDest    0
isFraud           0
isFlaggedFraud    0
dtype: int64

Exploring the transaction types

Calling `value_counts()` on the "type" column returns a Series containing the number of occurrences of each unique value in that column of the DataFrame df.

In [None]:
# Show how many transactions fall under each transaction type.
print(df["type"].value_counts())

PAYMENT     28733
CASH_OUT    20389
CASH_IN     13785
TRANSFER     6173
DEBIT         778
Name: type, dtype: int64


In [None]:
import plotly.express as px

# Tally the transactions per type. The result is deliberately not called `type`:
# that name would shadow Python's built-in type() for the rest of the session.
type_counts = df["type"].value_counts()
transaction = type_counts.index   # slice labels: the transaction type names
quantity = type_counts.values     # slice sizes: number of transactions per type

# Donut chart (hole=0.5) of the share of each transaction type.
figure = px.pie(
    data_frame=df,
    values=quantity,
    names=transaction,
    hole=0.5,
    title="Distribution of the Transaction types",
)
figure.show()

Now let’s have a look at the correlation between the features of the data with the isFraud column:

In [None]:
# Checking correlation
# Restrict the frame to numeric columns before correlating: it still contains
# text columns (type, nameOrig, nameDest), and DataFrame.corr() no longer drops
# those silently on pandas >= 2.0 -- it raises instead. Selecting the numeric
# columns explicitly gives the same result on every pandas version.
correlation = df.select_dtypes(include="number").corr()

# Rank every numeric feature by its correlation with the fraud label.
print(correlation["isFraud"].sort_values(ascending=False))

isFraud           1.000000
amount            0.047712
oldbalanceOrg    -0.004517
newbalanceDest   -0.007152
oldbalanceDest   -0.010576
newbalanceOrig   -0.012301
step             -0.051862
isFlaggedFraud         NaN
Name: isFraud, dtype: float64






Now let’s transform the categorical features into numerical. Here I will also transform the values of the isFraud column into No Fraud and Fraud labels to have a better understanding of the output:

In [None]:
# Encode each transaction type as an integer code so it can be used as a model feature.
type_codes = {
    "CASH_OUT": 1,
    "PAYMENT": 2,
    "CASH_IN": 3,
    "TRANSFER": 4,
    "DEBIT": 5,
}
df["type"] = df["type"].map(type_codes)

# Swap the numeric fraud flag for readable labels; any value other than 0 or 1
# has no entry in the lookup and therefore becomes NaN.
fraud_labels = {0: "No Fraud", 1: "Fraud"}
df["isFraud"] = df["isFraud"].map(fraud_labels)

print(df.head())

   step  type    amount     nameOrig  oldbalanceOrg  newbalanceOrig  \
0     1     2   9839.64  C1231006815       170136.0       160296.36   
1     1     2   1864.28  C1666544295        21249.0        19384.72   
2     1     4    181.00  C1305486145          181.0            0.00   
3     1     1    181.00   C840083671          181.0            0.00   
4     1     2  11668.14  C2048537720        41554.0        29885.86   

      nameDest  oldbalanceDest  newbalanceDest   isFraud  isFlaggedFraud  
0  M1979787155             0.0             0.0  No Fraud             0.0  
1  M2044282225             0.0             0.0  No Fraud             0.0  
2   C553264065             0.0             0.0     Fraud             0.0  
3    C38997010         21182.0             0.0     Fraud             0.0  
4  M1230701703             0.0             0.0  No Fraud             0.0  


Now we will train a fraud-detection model using a decision tree classifier.


In [None]:
# Count missing entries per column once more before the data is fed to the model.
df.isna().sum()

step              0
type              0
amount            0
nameOrig          0
oldbalanceOrg     0
newbalanceOrig    0
nameDest          0
oldbalanceDest    0
newbalanceDest    0
isFraud           1
isFlaggedFraud    0
dtype: int64

In [None]:
# Treat any still-missing label as non-fraudulent. The fill value has to be one
# of the labels the column already uses ("No Fraud" / "Fraud"); a raw "0" would
# add a stray third class to the classification target.
replacement = "No Fraud"
df["isFraud"] = df["isFraud"].fillna(replacement)

# Confirm that no column has missing values left.
df.isnull().sum()

step              0
type              0
amount            0
nameOrig          0
oldbalanceOrg     0
newbalanceOrig    0
nameDest          0
oldbalanceDest    0
newbalanceDest    0
isFraud           0
isFlaggedFraud    0
dtype: int64

In [None]:
# splitting the data
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier

# Features: encoded transaction type, amount, and the old/new origin balances.
x = np.array(df[["type", "amount", "oldbalanceOrg", "newbalanceOrig"]])
# Target: the fraud label column.
y = np.array(df[["isFraud"]])

# Hold out 10% of the rows for evaluation; the fixed seed keeps the split repeatable.
xtrain, xtest, ytrain, ytest = train_test_split(x, y, test_size=0.10, random_state=42)

# Seed the tree as well: scikit-learn permutes candidate features at every split,
# so an unseeded tree (and its score) can differ from run to run.
model = DecisionTreeClassifier(random_state=42)
model.fit(xtrain, ytrain)

# Mean accuracy on the held-out rows.
# NOTE(review): if fraud cases are rare in this data, accuracy alone can look
# high for a trivial model -- consider precision/recall as well.
print(model.score(xtest, ytest))


0.9987117091325508


In [None]:
# prediction
# A single hand-built transaction, in the same column order used for training:
# [type, amount, oldbalanceOrg, newbalanceOrig]. Type code 4 is TRANSFER.
sample_row = [4, 9000.60, 9000.60, 0.0]
features = np.array([sample_row])  # 2-D: one row, four feature columns
print(model.predict(features))



['No Fraud']
