<a href="https://colab.research.google.com/github/YagyanshB/Machine-Learning-Projects/blob/main/Online_Payments_Fraud_Detection.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Using Machine Learning for Online Payments Fraud Detection 

# Link to Kaggle Dataset - https://www.kaggle.com/datasets/ealaxi/paysim1?resource=download

In [2]:
# Importing the required libraries

import numpy as np 
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

In [6]:
# Uploading the dataset in the environment

df = pd.read_csv('/Data.csv')

In [7]:
# Describing the uploaded dataset

df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1048575 entries, 0 to 1048574
Data columns (total 11 columns):
 #   Column          Non-Null Count    Dtype  
---  ------          --------------    -----  
 0   step            1048575 non-null  int64  
 1   type            1048575 non-null  object 
 2   amount          1048575 non-null  float64
 3   nameOrig        1048575 non-null  object 
 4   oldbalanceOrg   1048575 non-null  float64
 5   newbalanceOrig  1048575 non-null  float64
 6   nameDest        1048575 non-null  object 
 7   oldbalanceDest  1048575 non-null  float64
 8   newbalanceDest  1048575 non-null  float64
 9   isFraud         1048575 non-null  int64  
 10  isFlaggedFraud  1048575 non-null  int64  
dtypes: float64(5), int64(3), object(3)
memory usage: 88.0+ MB


In [9]:
df.describe().transpose()

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
step,1048575.0,26.96617,15.62325,1.0,15.0,20.0,39.0,95.0
amount,1048575.0,158667.0,264940.9,0.1,12149.065,76343.33,213761.89,10000000.0
oldbalanceOrg,1048575.0,874009.5,2971751.0,0.0,0.0,16002.0,136642.02,38900000.0
newbalanceOrig,1048575.0,893808.9,3008271.0,0.0,0.0,0.0,174599.99,38900000.0
oldbalanceDest,1048575.0,978160.0,2296780.0,0.0,0.0,126377.21,915923.475,42100000.0
newbalanceDest,1048575.0,1114198.0,2416593.0,0.0,0.0,218260.36,1149807.51,42200000.0
isFraud,1048575.0,0.001089097,0.03298351,0.0,0.0,0.0,0.0,1.0
isFlaggedFraud,1048575.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [10]:
# Inspecting the first few rows

df.head(5)

Unnamed: 0,step,type,amount,nameOrig,oldbalanceOrg,newbalanceOrig,nameDest,oldbalanceDest,newbalanceDest,isFraud,isFlaggedFraud
0,1,PAYMENT,9839.64,C1231006815,170136.0,160296.36,M1979787155,0.0,0.0,0,0
1,1,PAYMENT,1864.28,C1666544295,21249.0,19384.72,M2044282225,0.0,0.0,0,0
2,1,TRANSFER,181.0,C1305486145,181.0,0.0,C553264065,0.0,0.0,1,0
3,1,CASH_OUT,181.0,C840083671,181.0,0.0,C38997010,21182.0,0.0,1,0
4,1,PAYMENT,11668.14,C2048537720,41554.0,29885.86,M1230701703,0.0,0.0,0,0


In [11]:
# Inspecting the last few rows

df.tail(5)

Unnamed: 0,step,type,amount,nameOrig,oldbalanceOrg,newbalanceOrig,nameDest,oldbalanceDest,newbalanceDest,isFraud,isFlaggedFraud
1048570,95,CASH_OUT,132557.35,C1179511630,479803.0,347245.65,C435674507,484329.37,616886.72,0,0
1048571,95,PAYMENT,9917.36,C1956161225,90545.0,80627.64,M668364942,0.0,0.0,0,0
1048572,95,PAYMENT,14140.05,C2037964975,20545.0,6404.95,M1355182933,0.0,0.0,0,0
1048573,95,PAYMENT,10020.05,C1633237354,90605.0,80584.95,M1964992463,0.0,0.0,0,0
1048574,95,PAYMENT,11450.03,C1264356443,80584.95,69134.92,M677577406,0.0,0.0,0,0


In [12]:
df.columns

Index(['step', 'type', 'amount', 'nameOrig', 'oldbalanceOrg', 'newbalanceOrig',
       'nameDest', 'oldbalanceDest', 'newbalanceDest', 'isFraud',
       'isFlaggedFraud'],
      dtype='object')

In [13]:
# Checking for null values

df.isnull().sum()

step              0
type              0
amount            0
nameOrig          0
oldbalanceOrg     0
newbalanceOrig    0
nameDest          0
oldbalanceDest    0
newbalanceDest    0
isFraud           0
isFlaggedFraud    0
dtype: int64

In [14]:
# We report that there are no null values present.

In [17]:
df.type.value_counts()

CASH_OUT    373641
PAYMENT     353873
CASH_IN     227130
TRANSFER     86753
DEBIT         7178
Name: type, dtype: int64

In [24]:
transaction_type = df["type"].value_counts()
transactions = transaction_type.index
quantity = transaction_type.values

In [25]:
import plotly.express as px
figure = px.pie(transaction_type, 
             values=quantity, 
             names=transactions,hole = 0.5, 
             title="Distribution of Transaction Type")
figure.show()

In [32]:
# Inspecting the correlation between the variables

correlation = df.corr()
print(correlation["isFraud"].sort_values(ascending = False))

isFraud           1.000000
amount            0.128862
step              0.045030
oldbalanceOrg     0.003829
newbalanceDest   -0.000495
oldbalanceDest   -0.007552
newbalanceOrig   -0.009438
isFlaggedFraud         NaN
Name: isFraud, dtype: float64


In [34]:
# Transforming Categorical Variables to Numerical Variables

df["type"] = df["type"].map({"CASH_OUT": 1, "PAYMENT": 2, "CASH_IN": 3, "TRANSFER": 4,"DEBIT": 5})
df["isFraud"] = df["isFraud"].map({0: "No Fraud", 1: "Fraud"})

print(df.head())

   step  type    amount     nameOrig  oldbalanceOrg  newbalanceOrig  \
0     1     2   9839.64  C1231006815       170136.0       160296.36   
1     1     2   1864.28  C1666544295        21249.0        19384.72   
2     1     4    181.00  C1305486145          181.0            0.00   
3     1     1    181.00   C840083671          181.0            0.00   
4     1     2  11668.14  C2048537720        41554.0        29885.86   

      nameDest  oldbalanceDest  newbalanceDest   isFraud  isFlaggedFraud  
0  M1979787155             0.0             0.0  No Fraud               0  
1  M2044282225             0.0             0.0  No Fraud               0  
2   C553264065             0.0             0.0     Fraud               0  
3    C38997010         21182.0             0.0     Fraud               0  
4  M1230701703             0.0             0.0  No Fraud               0  


In [35]:
# We now build a classification model to classify fraud and no fraud transactions 

# splitting the data

from sklearn.model_selection import train_test_split
x = np.array(df[["type", "amount", "oldbalanceOrg", "newbalanceOrig"]])
y = np.array(df[["isFraud"]])

In [36]:
# training a machine learning model

from sklearn.tree import DecisionTreeClassifier

xtrain, xtest, ytrain, ytest = train_test_split(x, y, test_size=0.10, random_state=42)
model = DecisionTreeClassifier()
model.fit(xtrain, ytrain)
print(model.score(xtest, ytest))

0.9994468710065041


In [39]:
# Now we see if our model can correctly predict if a transaction is fraudulent or not

# Prediction 

# Features = [type, amount, oldbalanceOrg, newbalanceOrig]

features = np.array([[3.0,489.0,950.0,0.0]])
print(model.predict(features))

['No Fraud']
