In [341]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [342]:
dataset = pd.read_csv("Fraud Detection Dataset.csv")
dataset.head(5)

Unnamed: 0,Transaction_ID,User_ID,Transaction_Amount,Transaction_Type,Time_of_Transaction,Device_Used,Location,Previous_Fraudulent_Transactions,Account_Age,Number_of_Transactions_Last_24H,Payment_Method,Fraudulent
0,T1,4174,1292.76,ATM Withdrawal,16.0,Tablet,San Francisco,0,119,13,Debit Card,0
1,T2,4507,1554.58,ATM Withdrawal,13.0,Mobile,New York,4,79,3,Credit Card,0
2,T3,1860,2395.02,ATM Withdrawal,,Mobile,,3,115,9,,0
3,T4,2294,100.1,Bill Payment,15.0,Desktop,Chicago,4,3,4,UPI,0
4,T5,2130,1490.5,POS Payment,19.0,Mobile,San Francisco,2,57,7,Credit Card,0


In [343]:
dataset.isnull().sum()

Transaction_ID                         0
User_ID                                0
Transaction_Amount                  2520
Transaction_Type                       0
Time_of_Transaction                 2552
Device_Used                         2473
Location                            2547
Previous_Fraudulent_Transactions       0
Account_Age                            0
Number_of_Transactions_Last_24H        0
Payment_Method                      2469
Fraudulent                             0
dtype: int64

In [344]:
dataset.dtypes

Transaction_ID                       object
User_ID                               int64
Transaction_Amount                  float64
Transaction_Type                     object
Time_of_Transaction                 float64
Device_Used                          object
Location                             object
Previous_Fraudulent_Transactions      int64
Account_Age                           int64
Number_of_Transactions_Last_24H       int64
Payment_Method                       object
Fraudulent                            int64
dtype: object

In [345]:
object_col = dataset.select_dtypes(include="object").columns
for col in object_col:
    if dataset[col].isnull().any():
        dataset[col] = dataset[col].fillna(dataset[col].mode().values[0])

In [346]:
intfl_col = dataset.select_dtypes(include= ["int64", "float64"]).columns
for col in intfl_col:
    if dataset[col].isnull().any():
        dataset[col] = dataset[col].fillna(dataset[col].mean())

In [347]:
dataset.isnull().sum()

Transaction_ID                      0
User_ID                             0
Transaction_Amount                  0
Transaction_Type                    0
Time_of_Transaction                 0
Device_Used                         0
Location                            0
Previous_Fraudulent_Transactions    0
Account_Age                         0
Number_of_Transactions_Last_24H     0
Payment_Method                      0
Fraudulent                          0
dtype: int64

In [348]:
dataset.duplicated().sum()

np.int64(881)

In [349]:
dataset.drop_duplicates(inplace = True)

In [350]:
dataset.describe()

Unnamed: 0,User_ID,Transaction_Amount,Time_of_Transaction,Previous_Fraudulent_Transactions,Account_Age,Number_of_Transactions_Last_24H,Fraudulent
count,50119.0,50119.0,50119.0,50119.0,50119.0,50119.0,50119.0
mean,3004.733195,2998.963564,11.488566,1.996448,60.029071,7.49564,0.049223
std,1152.620471,4934.60234,6.746058,1.415492,34.392569,4.019806,0.216335
min,1000.0,5.03,0.0,0.0,1.0,1.0,0.0
25%,2007.0,1333.37,6.0,1.0,30.0,4.0,0.0
50%,2997.0,2655.34,11.4884,2.0,60.0,7.0,0.0
75%,4005.0,3721.755,17.0,3.0,90.0,11.0,0.0
max,4999.0,49997.8,23.0,4.0,119.0,14.0,1.0


In [351]:
dataset.info()

<class 'pandas.core.frame.DataFrame'>
Index: 50119 entries, 0 to 50999
Data columns (total 12 columns):
 #   Column                            Non-Null Count  Dtype  
---  ------                            --------------  -----  
 0   Transaction_ID                    50119 non-null  object 
 1   User_ID                           50119 non-null  int64  
 2   Transaction_Amount                50119 non-null  float64
 3   Transaction_Type                  50119 non-null  object 
 4   Time_of_Transaction               50119 non-null  float64
 5   Device_Used                       50119 non-null  object 
 6   Location                          50119 non-null  object 
 7   Previous_Fraudulent_Transactions  50119 non-null  int64  
 8   Account_Age                       50119 non-null  int64  
 9   Number_of_Transactions_Last_24H   50119 non-null  int64  
 10  Payment_Method                    50119 non-null  object 
 11  Fraudulent                        50119 non-null  int64  
dtypes: float6

In [352]:
dataset.drop(["Transaction_ID", "User_ID"], axis = 1, inplace = True)

In [353]:
corr = dataset.corr(numeric_only = True)
print(corr["Fraudulent"].sort_values())

Number_of_Transactions_Last_24H    -0.003964
Previous_Fraudulent_Transactions    0.000766
Account_Age                         0.005517
Transaction_Amount                  0.005653
Time_of_Transaction                 0.005697
Fraudulent                          1.000000
Name: Fraudulent, dtype: float64


In [354]:
dataset.head(3)

Unnamed: 0,Transaction_Amount,Transaction_Type,Time_of_Transaction,Device_Used,Location,Previous_Fraudulent_Transactions,Account_Age,Number_of_Transactions_Last_24H,Payment_Method,Fraudulent
0,1292.76,ATM Withdrawal,16.0,Tablet,San Francisco,0,119,13,Debit Card,0
1,1554.58,ATM Withdrawal,13.0,Mobile,New York,4,79,3,Credit Card,0
2,2395.02,ATM Withdrawal,11.4884,Mobile,Boston,3,115,9,UPI,0


In [355]:
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()

In [356]:
categorical_cols = ["Transaction_Type", "Device_Used", "Location", "Payment_Method"]
for col in categorical_cols:
    dataset[col] = le.fit_transform(dataset[col])

In [357]:
dataset.head(3)

Unnamed: 0,Transaction_Amount,Transaction_Type,Time_of_Transaction,Device_Used,Location,Previous_Fraudulent_Transactions,Account_Age,Number_of_Transactions_Last_24H,Payment_Method,Fraudulent
0,1292.76,0,16.0,2,6,0,119,13,1,0
1,1554.58,0,13.0,1,5,4,79,3,0,0
2,2395.02,0,11.4884,1,0,3,115,9,4,0


In [358]:
dataset["Fraudulent"].value_counts()

Fraudulent
0    47652
1     2467
Name: count, dtype: int64

In [359]:
from sklearn.model_selection import train_test_split
x = dataset.drop('Fraudulent', axis=1)
y = dataset['Fraudulent']
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.25, random_state=42)

In [360]:
from imblearn.over_sampling import SMOTE
os_smote = SMOTE(sampling_strategy=0.1, random_state=42)
x_train, y_train = os_smote.fit_resample(x_train, y_train)

In [361]:
x.shape , y.shape, x_train.shape , y_train.shape, x_test.shape, y_test.shape

((50119, 9), (50119,), (39316, 9), (39316,), (12530, 9), (12530,))

In [362]:
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
x_sc = sc.fit_transform(x_train)

In [363]:
from sklearn.ensemble import RandomForestClassifier
rfc = RandomForestClassifier(random_state=42)
rfc.fit(x_train, y_train)

In [364]:
from sklearn.metrics import accuracy_score
y_pred = rfc.predict(x_test)
accuracy_score(y_test, y_pred)

0.9493216280925778

In [365]:
import pickle
pickle.dump(rfc, open("transaction_fraud_detection.pkl", "wb"))
print("Model saved successfully!")

Model saved successfully!
