## Importing the necessary libraries

In [95]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

## Loading the dataset

In [96]:
df = pd.read_csv(r"creditcard.csv")
df.head()

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
0,0.0,-1.359807,-0.072781,2.536347,1.378155,-0.338321,0.462388,0.239599,0.098698,0.363787,...,-0.018307,0.277838,-0.110474,0.066928,0.128539,-0.189115,0.133558,-0.021053,149.62,0
1,0.0,1.191857,0.266151,0.16648,0.448154,0.060018,-0.082361,-0.078803,0.085102,-0.255425,...,-0.225775,-0.638672,0.101288,-0.339846,0.16717,0.125895,-0.008983,0.014724,2.69,0
2,1.0,-1.358354,-1.340163,1.773209,0.37978,-0.503198,1.800499,0.791461,0.247676,-1.514654,...,0.247998,0.771679,0.909412,-0.689281,-0.327642,-0.139097,-0.055353,-0.059752,378.66,0
3,1.0,-0.966272,-0.185226,1.792993,-0.863291,-0.010309,1.247203,0.237609,0.377436,-1.387024,...,-0.1083,0.005274,-0.190321,-1.175575,0.647376,-0.221929,0.062723,0.061458,123.5,0
4,2.0,-1.158233,0.877737,1.548718,0.403034,-0.407193,0.095921,0.592941,-0.270533,0.817739,...,-0.009431,0.798278,-0.137458,0.141267,-0.20601,0.502292,0.219422,0.215153,69.99,0


## Information about the data frame

In [97]:
df.shape

(284807, 31)

In [98]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 284807 entries, 0 to 284806
Data columns (total 31 columns):
 #   Column  Non-Null Count   Dtype  
---  ------  --------------   -----  
 0   Time    284807 non-null  float64
 1   V1      284807 non-null  float64
 2   V2      284807 non-null  float64
 3   V3      284807 non-null  float64
 4   V4      284807 non-null  float64
 5   V5      284807 non-null  float64
 6   V6      284807 non-null  float64
 7   V7      284807 non-null  float64
 8   V8      284807 non-null  float64
 9   V9      284807 non-null  float64
 10  V10     284807 non-null  float64
 11  V11     284807 non-null  float64
 12  V12     284807 non-null  float64
 13  V13     284807 non-null  float64
 14  V14     284807 non-null  float64
 15  V15     284807 non-null  float64
 16  V16     284807 non-null  float64
 17  V17     284807 non-null  float64
 18  V18     284807 non-null  float64
 19  V19     284807 non-null  float64
 20  V20     284807 non-null  float64
 21  V21     28

## checking for null values

In [99]:
df.isnull().sum()

Time      0
V1        0
V2        0
V3        0
V4        0
V5        0
V6        0
V7        0
V8        0
V9        0
V10       0
V11       0
V12       0
V13       0
V14       0
V15       0
V16       0
V17       0
V18       0
V19       0
V20       0
V21       0
V22       0
V23       0
V24       0
V25       0
V26       0
V27       0
V28       0
Amount    0
Class     0
dtype: int64

## Checking for duplicate values

In [100]:
df.duplicated().sum()

1081

<h2>Removing the Duplicate Values<h2>

In [101]:
df.drop_duplicates(inplace=True)

In [102]:
df.duplicated().sum()

0

## Checking the distribution of legit and fraudulent transanctions

In [103]:
df['Class'].value_counts()

Class
0    283253
1       473
Name: count, dtype: int64

In [104]:
fraud = df[df['Class']==1]
legit = df[df['Class']==0]

In [105]:
legit.head()

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
0,0.0,-1.359807,-0.072781,2.536347,1.378155,-0.338321,0.462388,0.239599,0.098698,0.363787,...,-0.018307,0.277838,-0.110474,0.066928,0.128539,-0.189115,0.133558,-0.021053,149.62,0
1,0.0,1.191857,0.266151,0.16648,0.448154,0.060018,-0.082361,-0.078803,0.085102,-0.255425,...,-0.225775,-0.638672,0.101288,-0.339846,0.16717,0.125895,-0.008983,0.014724,2.69,0
2,1.0,-1.358354,-1.340163,1.773209,0.37978,-0.503198,1.800499,0.791461,0.247676,-1.514654,...,0.247998,0.771679,0.909412,-0.689281,-0.327642,-0.139097,-0.055353,-0.059752,378.66,0
3,1.0,-0.966272,-0.185226,1.792993,-0.863291,-0.010309,1.247203,0.237609,0.377436,-1.387024,...,-0.1083,0.005274,-0.190321,-1.175575,0.647376,-0.221929,0.062723,0.061458,123.5,0
4,2.0,-1.158233,0.877737,1.548718,0.403034,-0.407193,0.095921,0.592941,-0.270533,0.817739,...,-0.009431,0.798278,-0.137458,0.141267,-0.20601,0.502292,0.219422,0.215153,69.99,0


In [106]:
fraud.head()

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
541,406.0,-2.312227,1.951992,-1.609851,3.997906,-0.522188,-1.426545,-2.537387,1.391657,-2.770089,...,0.517232,-0.035049,-0.465211,0.320198,0.044519,0.17784,0.261145,-0.143276,0.0,1
623,472.0,-3.043541,-3.157307,1.088463,2.288644,1.359805,-1.064823,0.325574,-0.067794,-0.270953,...,0.661696,0.435477,1.375966,-0.293803,0.279798,-0.145362,-0.252773,0.035764,529.0,1
4920,4462.0,-2.30335,1.759247,-0.359745,2.330243,-0.821628,-0.075788,0.56232,-0.399147,-0.238253,...,-0.294166,-0.932391,0.172726,-0.08733,-0.156114,-0.542628,0.039566,-0.153029,239.93,1
6108,6986.0,-4.397974,1.358367,-2.592844,2.679787,-1.128131,-1.706536,-3.496197,-0.248778,-0.247768,...,0.573574,0.176968,-0.436207,-0.053502,0.252405,-0.657488,-0.827136,0.849573,59.0,1
6329,7519.0,1.234235,3.01974,-4.304597,4.732795,3.624201,-1.357746,1.713445,-0.496358,-1.282858,...,-0.379068,-0.704181,-0.656805,-1.632653,1.488901,0.566797,-0.010016,0.146793,1.0,1


## statistical measure of the data

In [107]:
legit.Amount.describe()

count    283253.000000
mean         88.413575
std         250.379023
min           0.000000
25%           5.670000
50%          22.000000
75%          77.460000
max       25691.160000
Name: Amount, dtype: float64

In [108]:
fraud.Amount.describe()

count     473.000000
mean      123.871860
std       260.211041
min         0.000000
25%         1.000000
50%         9.820000
75%       105.890000
max      2125.870000
Name: Amount, dtype: float64

## Comparing the values of both transanctions

In [109]:
df.groupby('Class').mean()

Unnamed: 0_level_0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V20,V21,V22,V23,V24,V25,V26,V27,V28,Amount
Class,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,94835.058093,0.013439,-0.009829,0.012853,-0.01044,0.006769,0.001251,0.010447,-0.002448,0.002613,...,-0.000489,-0.00115,-0.00016,0.00036,0.000393,-0.000301,6.5e-05,0.001409,0.000418,88.413575
1,80450.513742,-4.49828,3.405965,-6.729599,4.472591,-2.957197,-1.432518,-5.175912,0.953255,-2.522124,...,0.405043,0.46655,0.086639,-0.096464,-0.106643,0.040615,0.050456,0.213774,0.07827,123.87186


In [110]:
x = df.drop(columns='Class',axis=1)
y = df['Class']

## Splitting the data into training and testing data

In [111]:
from sklearn.model_selection import train_test_split
x_train,x_test,y_train,y_test = train_test_split(x,y,test_size=.2,stratify=y,random_state=15)

<h2>Model training before Undersampeling

In [112]:
from sklearn.linear_model import LogisticRegression
model = LogisticRegression()

In [113]:
model.fit(x_train,y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


<h2>Analyzing the Performance of Model </h2>

In [114]:
from sklearn.metrics import accuracy_score,precision_score,f1_score,recall_score
Pred1=model.predict(x_test)
acc=accuracy_score(y_test,Pred1)
print("Accuracy",acc*100)

Accuracy 99.89250343636556


In [115]:
Precision=precision_score(y_test,Pred1)
print("Precision",Precision*100)

Precision 70.73170731707317


In [116]:
Recall=recall_score(y_test,Pred1)
print("Recall",Recall*100)

Recall 61.05263157894737


In [117]:
f1Score=f1_score(y_test,Pred1)
print("F1 Score",f1Score*100)

F1 Score 65.53672316384181


## Undersampling
- since the data is highly unbalanced
- build a sample dataset containing similar distribution of legit and fraud transanctions

In [118]:
legit_sample = legit.sample(n=473)

## Concatenating 2 dataframes

In [119]:
df2 = pd.concat([legit_sample,fraud],axis=0)

In [120]:
df2.head()

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
44577,42001.0,0.930949,-1.262508,0.866608,-0.472538,-1.739665,-0.474897,-0.728696,0.005245,-0.802699,...,-0.254204,-0.725641,0.029841,0.549683,-0.208182,0.843211,-0.064851,0.047201,198.15,0
75460,56080.0,1.308148,0.069888,-1.294638,-0.481847,2.137669,3.182143,-0.502909,0.804442,-0.07426,...,-0.341701,-1.149353,0.09386,0.946204,0.369122,0.110454,-0.022141,0.023919,3.59,0
207873,136865.0,0.079023,0.670998,0.607331,-0.330064,0.689563,-1.061922,1.168898,-0.479137,-0.13593,...,-0.194894,-0.311355,0.072805,0.053325,-1.062139,0.106944,-0.001446,-0.005948,1.98,0
79496,58046.0,1.321347,1.37372,-1.989489,1.391941,1.461493,-1.208837,0.84772,-0.398895,-0.584337,...,-0.262427,-0.582514,-0.352395,-0.935476,1.041699,-0.234874,0.047601,0.08946,1.0,0
280128,169336.0,1.978519,-0.506993,-0.433,0.256778,-0.574634,-0.058804,-0.762025,0.130668,1.002451,...,0.27264,0.953131,0.069908,-0.409304,-0.221743,0.65409,-0.027397,-0.064893,6.47,0


In [121]:
df2.shape

(946, 31)

## Splitting the data into features and target

In [122]:
x = df2.drop(columns='Class',axis=1)
y = df2['Class']

## Splitting the data into training and testing data

In [123]:
from sklearn.model_selection import train_test_split
x_train,x_test,y_train,y_test = train_test_split(x,y,test_size=.2,stratify=y,random_state=15)

## Model training after Undersampeling

In [124]:
from sklearn.linear_model import LogisticRegression

model = LogisticRegression()

### Training the LR model with train data

In [125]:
model.fit(x_train,y_train)

<h2>Analysing the performance of the Tranined Model</h2>

In [126]:
from sklearn.metrics import accuracy_score,precision_score,f1_score,recall_score
Pred1=model.predict(x_test)
acc=accuracy_score(y_test,Pred1)
print("Accuracy",acc*100)

Accuracy 94.21052631578948


In [127]:
Precision=precision_score(y_test,Pred1)
print("Precision",Precision*100)

Precision 95.65217391304348


In [128]:
Recall=recall_score(y_test,Pred1)
print("Recall",Recall*100)

Recall 92.63157894736842


In [129]:
f1Score=f1_score(y_test,Pred1)
print("F1 Score",f1Score*100)

F1 Score 94.11764705882354


## Conclusions after After Analysing Performance
- Before Undersampeling 
    - Accuracy = 99.89
    - Precision = 70.73
    - Recall = 61.05
    - F1Score = 65.53
- After Undersampeling
    - Accuracy = 94.21
    - Precision = 95.65
    - Recall = 92.63
    - F1Score = 94.11
- We can see that the  Accuracy of the model is reduced after undersampeling but the overall performance of our model have increased significantly.

<h2>Creating a Model file</h2>

In [130]:
import joblib
joblib.dump(model,"creditcard.model")

['creditcard.model']