## Importing Libraries

In [2]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import  PolynomialFeatures
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier 
from sklearn.svm import SVC
from xgboost import XGBClassifier
from sklearn.metrics import f1_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix

In [3]:
# Load the transactions table from a CSV file located in the working directory.
data=pd.read_csv("creditcard.csv")

In [4]:
# Class balance of the dataset: `Class` is 0 for normal rows and 1 for fraudulent rows.
Total_transactions = len(data)
normal = int((data['Class'] == 0).sum())
fraudulent = int((data['Class'] == 1).sum())
# The fraud share is measured against ALL transactions, not against the normal
# ones only; the guard avoids a ZeroDivisionError on an empty frame.
fraud_percentage = round(fraudulent / Total_transactions * 100, 2) if Total_transactions else 0.0
print('Total number of Trnsactions are: {}'.format(Total_transactions))
print('Number of Normal Transactions are: {}'.format(normal))
print('Number of fraudulent Transactions are: {}'.format(fraudulent))
print('Percentage of fraud Transactions is: {}'.format(fraud_percentage))

Total number of Trnsactions are: 284807
Number of Normal Transactions are: 284315
Number of fraudulent Transactions are: 492
Percentage of fraud Transactions is: 0.17


Only about 0.17% of the transactions are fraudulent, so the dataset is highly imbalanced.

In [5]:
# Summarise column dtypes and non-null counts to check for missing values.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 284807 entries, 0 to 284806
Data columns (total 31 columns):
 #   Column  Non-Null Count   Dtype  
---  ------  --------------   -----  
 0   Time    284807 non-null  float64
 1   V1      284807 non-null  float64
 2   V2      284807 non-null  float64
 3   V3      284807 non-null  float64
 4   V4      284807 non-null  float64
 5   V5      284807 non-null  float64
 6   V6      284807 non-null  float64
 7   V7      284807 non-null  float64
 8   V8      284807 non-null  float64
 9   V9      284807 non-null  float64
 10  V10     284807 non-null  float64
 11  V11     284807 non-null  float64
 12  V12     284807 non-null  float64
 13  V13     284807 non-null  float64
 14  V14     284807 non-null  float64
 15  V15     284807 non-null  float64
 16  V16     284807 non-null  float64
 17  V17     284807 non-null  float64
 18  V18     284807 non-null  float64
 19  V19     284807 non-null  float64
 20  V20     284807 non-null  float64
 21  V21     28

In [6]:
# (rows, columns) of the dataset as loaded.
data.shape

(284807, 31)

In [7]:
# Per-column descriptive statistics: count, mean, std, min, quartiles and max.
data.describe()

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
count,284807.0,284807.0,284807.0,284807.0,284807.0,284807.0,284807.0,284807.0,284807.0,284807.0,...,284807.0,284807.0,284807.0,284807.0,284807.0,284807.0,284807.0,284807.0,284807.0,284807.0
mean,94813.859575,1.168375e-15,3.416908e-16,-1.379537e-15,2.074095e-15,9.604066e-16,1.487313e-15,-5.556467e-16,1.213481e-16,-2.406331e-15,...,1.654067e-16,-3.568593e-16,2.578648e-16,4.473266e-15,5.340915e-16,1.683437e-15,-3.660091e-16,-1.22739e-16,88.349619,0.001727
std,47488.145955,1.958696,1.651309,1.516255,1.415869,1.380247,1.332271,1.237094,1.194353,1.098632,...,0.734524,0.7257016,0.6244603,0.6056471,0.5212781,0.482227,0.4036325,0.3300833,250.120109,0.041527
min,0.0,-56.40751,-72.71573,-48.32559,-5.683171,-113.7433,-26.16051,-43.55724,-73.21672,-13.43407,...,-34.83038,-10.93314,-44.80774,-2.836627,-10.2954,-2.604551,-22.56568,-15.43008,0.0,0.0
25%,54201.5,-0.9203734,-0.5985499,-0.8903648,-0.8486401,-0.6915971,-0.7682956,-0.5540759,-0.2086297,-0.6430976,...,-0.2283949,-0.5423504,-0.1618463,-0.3545861,-0.3171451,-0.3269839,-0.07083953,-0.05295979,5.6,0.0
50%,84692.0,0.0181088,0.06548556,0.1798463,-0.01984653,-0.05433583,-0.2741871,0.04010308,0.02235804,-0.05142873,...,-0.02945017,0.006781943,-0.01119293,0.04097606,0.0165935,-0.05213911,0.001342146,0.01124383,22.0,0.0
75%,139320.5,1.315642,0.8037239,1.027196,0.7433413,0.6119264,0.3985649,0.5704361,0.3273459,0.597139,...,0.1863772,0.5285536,0.1476421,0.4395266,0.3507156,0.2409522,0.09104512,0.07827995,77.165,0.0
max,172792.0,2.45493,22.05773,9.382558,16.87534,34.80167,73.30163,120.5895,20.00721,15.59499,...,27.20284,10.50309,22.52841,4.584549,7.519589,3.517346,31.6122,33.84781,25691.16,1.0


In [8]:
# Standardise the Amount column in place of its raw values; the other feature
# columns are left as loaded.
# NOTE(review): the scaler is fitted on the whole dataset before the train/test
# split, so statistics of the test rows leak into training. Consider fitting
# on the training rows only.
amount = data['Amount'].to_numpy()
sc = StandardScaler()
data['Amount'] = sc.fit_transform(amount[:, np.newaxis])

In [9]:
# Remove the Time column and exact duplicate rows. Rebinding `data` (instead of
# mutating it in place) together with errors='ignore' keeps the cell
# re-runnable: a second execution no longer raises KeyError for the column that
# has already been removed. The original row index is preserved, as before.
data = data.drop(columns=['Time'], errors='ignore').drop_duplicates()

In [10]:
# Shape after dropping the Time column and the duplicate rows.
data.shape

(275663, 30)

## Splitting data into training set and test set

In [11]:
# Separate the target column from the predictors and convert both to NumPy arrays.
y = data['Class'].to_numpy()
X = data.drop(columns='Class').to_numpy()

In [12]:
# Hold out 25% of the rows for testing; random_state fixes the shuffle so the
# split is repeatable.
# NOTE(review): the split is not stratified. With only ~0.17% positive rows,
# consider stratify=y — confirm before changing, since every recorded result
# below depends on this exact split.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.25, random_state = 1)

## Model Building

## Decision Tree


In [13]:
# Depth-limited decision tree using entropy as the split criterion.
# random_state makes tie-breaking between equally good splits deterministic,
# so re-running the cell reproduces the same tree; the value matches the seed
# used for the train/test split.
DT = DecisionTreeClassifier(max_depth = 4, criterion = 'entropy', random_state = 1)
DT.fit(X_train, y_train)
dt_pred = DT.predict(X_test)

In [22]:
# Test-set accuracy as a percentage; the confusion matrix is the cell's
# displayed value (rows: true class, columns: predicted class).
dt_accuracy_pct = accuracy_score(y_test, dt_pred) * 100
print('Accuracy score of the Decision Tree model is: {:.2f}%'.format(dt_accuracy_pct))
confusion_matrix(y_test, dt_pred)

Accuracy score of the Decision Tree model is: 99.91%


array([[68770,    18],
       [   41,    87]])

## K-Nearest Neighbors

In [24]:
# k-nearest-neighbours classifier that votes over the 7 closest training points.
KNN = KNeighborsClassifier(n_neighbors=7)
knn_pred = KNN.fit(X_train, y_train).predict(X_test)

In [25]:
# Percentage of test rows classified correctly, then the confusion matrix
# (rows: true class, columns: predicted class) as the cell output.
knn_accuracy_pct = accuracy_score(y_test, knn_pred) * 100
print('Accuracy score of the K-Nearest Neighbors model is: {:.2f}%'.format(knn_accuracy_pct))
confusion_matrix(y_test, knn_pred)

Accuracy score of the K-Nearest Neighbors model is: 99.93%


array([[68772,    16],
       [   33,    95]])

## Logistic Regression

In [27]:
# Logistic regression with the library's default settings.
lr = LogisticRegression()
lr_pred = lr.fit(X_train, y_train).predict(X_test)

In [28]:
# Accuracy on the held-out rows in percent, followed by the confusion matrix
# (rows: true class, columns: predicted class).
lr_accuracy_pct = accuracy_score(y_test, lr_pred) * 100
print('Accuracy score of the Logistic Regression model is: {:.2f}%'.format(lr_accuracy_pct))
confusion_matrix(y_test, lr_pred)

Accuracy score of the Logistic Regression model is: 99.90%


array([[68772,    16],
       [   56,    72]])

## Support Vector Machines

In [30]:
# Support vector classifier with the library's default kernel and parameters.
svm = SVC()
svm_pred = svm.fit(X_train, y_train).predict(X_test)

In [31]:
# Held-out accuracy in percent; the confusion matrix (rows: true class,
# columns: predicted class) is returned as the cell's value.
svm_accuracy_pct = accuracy_score(y_test, svm_pred) * 100
print('Accuracy score of the Support Vector Machines model is: {:.2f}%'.format(svm_accuracy_pct))
confusion_matrix(y_test, svm_pred)

Accuracy score of the Support Vector Machines model is: 99.93%


array([[68785,     3],
       [   44,    84]])

## Random Forest

In [32]:
# Random forest of depth-4 trees. random_state pins the bootstrap sampling and
# per-split feature selection so a re-run reproduces the same model; the value
# matches the seed used for the train/test split.
rf = RandomForestClassifier(max_depth = 4, random_state = 1)
rf.fit(X_train, y_train)
rf_pred = rf.predict(X_test)

In [41]:
# Share of correctly classified test rows in percent, then the confusion
# matrix (rows: true class, columns: predicted class).
rf_accuracy_pct = accuracy_score(y_test, rf_pred) * 100
print('Accuracy score of the Random Forest model is: {:.2f}%'.format(rf_accuracy_pct))
confusion_matrix(y_test, rf_pred)

Accuracy score of the Random Forest model is: 99.92%


array([[68778,    10],
       [   47,    81]])

## XGBoost

In [34]:
# Gradient-boosted trees (XGBoost) with each tree limited to depth 4.
xgb = XGBClassifier(max_depth=4)
xgb_pred = xgb.fit(X_train, y_train).predict(X_test)

In [42]:
# Test accuracy expressed as a percentage, with the confusion matrix
# (rows: true class, columns: predicted class) shown as the cell output.
xgb_accuracy_pct = accuracy_score(y_test, xgb_pred) * 100
print('Accuracy score of the XGBoost model is: {:.2f}%'.format(xgb_accuracy_pct))
confusion_matrix(y_test, xgb_pred)

Accuracy score of the XGBoost model is: 99.94%


array([[68784,     4],
       [   34,    94]])

## CatBoost

In [36]:
!pip install catboost

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting catboost
  Downloading catboost-1.1.1-cp38-none-manylinux1_x86_64.whl (76.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m76.6/76.6 MB[0m [31m7.7 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: catboost
Successfully installed catboost-1.1.1


In [37]:
# CatBoost classifier with default hyper-parameters; training prints one log
# line per boosting iteration. The import lives here because the package is
# installed by the preceding cell.
from catboost import CatBoostClassifier
cat_boost = CatBoostClassifier()
cat_pred = cat_boost.fit(X_train, y_train).predict(X_test)

Learning rate set to 0.100375
0:	learn: 0.3569222	total: 214ms	remaining: 3m 33s
1:	learn: 0.1980382	total: 379ms	remaining: 3m 9s
2:	learn: 0.1070370	total: 489ms	remaining: 2m 42s
3:	learn: 0.0590893	total: 630ms	remaining: 2m 36s
4:	learn: 0.0352181	total: 737ms	remaining: 2m 26s
5:	learn: 0.0220055	total: 953ms	remaining: 2m 37s
6:	learn: 0.0145488	total: 1.2s	remaining: 2m 50s
7:	learn: 0.0104005	total: 1.32s	remaining: 2m 44s
8:	learn: 0.0079780	total: 1.48s	remaining: 2m 43s
9:	learn: 0.0063361	total: 1.72s	remaining: 2m 50s
10:	learn: 0.0052550	total: 1.92s	remaining: 2m 53s
11:	learn: 0.0045281	total: 2.12s	remaining: 2m 54s
12:	learn: 0.0039879	total: 2.29s	remaining: 2m 53s
13:	learn: 0.0036216	total: 2.5s	remaining: 2m 55s
14:	learn: 0.0033344	total: 2.74s	remaining: 2m 59s
15:	learn: 0.0031232	total: 2.98s	remaining: 3m 3s
16:	learn: 0.0029626	total: 3.29s	remaining: 3m 10s
17:	learn: 0.0028350	total: 3.93s	remaining: 3m 34s
18:	learn: 0.0027222	total: 4.71s	remaining: 4m 

In [40]:
# Test-set accuracy of the CatBoost model in percent. The label must name
# CatBoost because this cell scores cat_pred, not xgb_pred.
print('Accuracy score of the CatBoost model is: {:.2f}%'.format(accuracy_score(y_test, cat_pred)*100))
# Confusion matrix as the cell's displayed value (rows: true class, columns: predicted class).
confusion_matrix(y_test, cat_pred)

Accuracy score of the CatBoost model is: 99.95%


array([[68784,     4],
       [   31,    97]])

## K-fold cross-validation

In [50]:
# 10-fold cross-validation of the CatBoost classifier on the training split;
# `accuracies` holds one score per fold, summarised by its mean and spread.
from sklearn.model_selection import cross_val_score
accuracies = cross_val_score(cat_boost, X_train, y_train, cv=10)
mean_accuracy, accuracy_spread = accuracies.mean(), accuracies.std()
print("Accuracy: {:.2f} %".format(mean_accuracy*100))
print("Standard Deviation: {:.2f} %".format(accuracy_spread*100))

Streaming output truncated to the last 5000 lines.
6:	learn: 0.0191204	total: 370ms	remaining: 52.5s
7:	learn: 0.0132394	total: 416ms	remaining: 51.6s
8:	learn: 0.0098740	total: 472ms	remaining: 52s
9:	learn: 0.0076332	total: 531ms	remaining: 52.5s
10:	learn: 0.0061264	total: 584ms	remaining: 52.5s
11:	learn: 0.0051968	total: 654ms	remaining: 53.8s
12:	learn: 0.0045276	total: 714ms	remaining: 54.2s
13:	learn: 0.0040931	total: 764ms	remaining: 53.8s
14:	learn: 0.0037243	total: 813ms	remaining: 53.4s
15:	learn: 0.0034632	total: 865ms	remaining: 53.2s
16:	learn: 0.0032806	total: 924ms	remaining: 53.4s
17:	learn: 0.0031203	total: 974ms	remaining: 53.1s
18:	learn: 0.0029877	total: 1.02s	remaining: 52.9s
19:	learn: 0.0029021	total: 1.08s	remaining: 53.2s
20:	learn: 0.0028321	total: 1.14s	remaining: 53.2s
21:	learn: 0.0027571	total: 1.19s	remaining: 53.1s
22:	learn: 0.0027152	total: 1.27s	remaining: 53.9s
23:	learn: 0.0026763	total: 1.32s	remaining: 53.6s
24:	learn: 0.0025988	to

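The CatBoost model gave the best accuracy, 99.96%. Because only about 0.17% of the transactions are fraudulent, accuracy on its own is a weak measure here; the CatBoost confusion matrix above shows it detecting 97 of the 128 fraudulent transactions in the test set.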