# Credit Fraud Detection

> Summary:

    * In this project, we will focus on developing ML/DL model to identify fraudulent transactions from the given dataset.
    * The dataset was collected from Kaggle.
    * The dataset was observerd to be highly imbalanced.
    * We initially proceed with creating ML model with undersampled data, but it always ended up with the model overfitting.
    * We later proceeded with the original data to develop DL model with the help of adding weights to the classes according to the imbalance ratio, and we obtained the best performing model with ove 96% accuracy on Test and Train data.
    * Finally we shall save our model as a pickle file and reuse it to create an external app using streamlit.

In [1]:
import pandas as pd
from warnings import filterwarnings
filterwarnings('ignore')

In [2]:
# Reading csv
df = pd.read_csv('creditcard.csv')
df.head()

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
0,0.0,-1.359807,-0.072781,2.536347,1.378155,-0.338321,0.462388,0.239599,0.098698,0.363787,...,-0.018307,0.277838,-0.110474,0.066928,0.128539,-0.189115,0.133558,-0.021053,149.62,0
1,0.0,1.191857,0.266151,0.16648,0.448154,0.060018,-0.082361,-0.078803,0.085102,-0.255425,...,-0.225775,-0.638672,0.101288,-0.339846,0.16717,0.125895,-0.008983,0.014724,2.69,0
2,1.0,-1.358354,-1.340163,1.773209,0.37978,-0.503198,1.800499,0.791461,0.247676,-1.514654,...,0.247998,0.771679,0.909412,-0.689281,-0.327642,-0.139097,-0.055353,-0.059752,378.66,0
3,1.0,-0.966272,-0.185226,1.792993,-0.863291,-0.010309,1.247203,0.237609,0.377436,-1.387024,...,-0.1083,0.005274,-0.190321,-1.175575,0.647376,-0.221929,0.062723,0.061458,123.5,0
4,2.0,-1.158233,0.877737,1.548718,0.403034,-0.407193,0.095921,0.592941,-0.270533,0.817739,...,-0.009431,0.798278,-0.137458,0.141267,-0.20601,0.502292,0.219422,0.215153,69.99,0


In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 284807 entries, 0 to 284806
Data columns (total 31 columns):
 #   Column  Non-Null Count   Dtype  
---  ------  --------------   -----  
 0   Time    284807 non-null  float64
 1   V1      284807 non-null  float64
 2   V2      284807 non-null  float64
 3   V3      284807 non-null  float64
 4   V4      284807 non-null  float64
 5   V5      284807 non-null  float64
 6   V6      284807 non-null  float64
 7   V7      284807 non-null  float64
 8   V8      284807 non-null  float64
 9   V9      284807 non-null  float64
 10  V10     284807 non-null  float64
 11  V11     284807 non-null  float64
 12  V12     284807 non-null  float64
 13  V13     284807 non-null  float64
 14  V14     284807 non-null  float64
 15  V15     284807 non-null  float64
 16  V16     284807 non-null  float64
 17  V17     284807 non-null  float64
 18  V18     284807 non-null  float64
 19  V19     284807 non-null  float64
 20  V20     284807 non-null  float64
 21  V21     28

In [4]:
df.describe()

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
count,284807.0,284807.0,284807.0,284807.0,284807.0,284807.0,284807.0,284807.0,284807.0,284807.0,...,284807.0,284807.0,284807.0,284807.0,284807.0,284807.0,284807.0,284807.0,284807.0,284807.0
mean,94813.859575,1.168375e-15,3.416908e-16,-1.379537e-15,2.074095e-15,9.604066e-16,1.487313e-15,-5.556467e-16,1.213481e-16,-2.406331e-15,...,1.654067e-16,-3.568593e-16,2.578648e-16,4.473266e-15,5.340915e-16,1.683437e-15,-3.660091e-16,-1.22739e-16,88.349619,0.001727
std,47488.145955,1.958696,1.651309,1.516255,1.415869,1.380247,1.332271,1.237094,1.194353,1.098632,...,0.734524,0.7257016,0.6244603,0.6056471,0.5212781,0.482227,0.4036325,0.3300833,250.120109,0.041527
min,0.0,-56.40751,-72.71573,-48.32559,-5.683171,-113.7433,-26.16051,-43.55724,-73.21672,-13.43407,...,-34.83038,-10.93314,-44.80774,-2.836627,-10.2954,-2.604551,-22.56568,-15.43008,0.0,0.0
25%,54201.5,-0.9203734,-0.5985499,-0.8903648,-0.8486401,-0.6915971,-0.7682956,-0.5540759,-0.2086297,-0.6430976,...,-0.2283949,-0.5423504,-0.1618463,-0.3545861,-0.3171451,-0.3269839,-0.07083953,-0.05295979,5.6,0.0
50%,84692.0,0.0181088,0.06548556,0.1798463,-0.01984653,-0.05433583,-0.2741871,0.04010308,0.02235804,-0.05142873,...,-0.02945017,0.006781943,-0.01119293,0.04097606,0.0165935,-0.05213911,0.001342146,0.01124383,22.0,0.0
75%,139320.5,1.315642,0.8037239,1.027196,0.7433413,0.6119264,0.3985649,0.5704361,0.3273459,0.597139,...,0.1863772,0.5285536,0.1476421,0.4395266,0.3507156,0.2409522,0.09104512,0.07827995,77.165,0.0
max,172792.0,2.45493,22.05773,9.382558,16.87534,34.80167,73.30163,120.5895,20.00721,15.59499,...,27.20284,10.50309,22.52841,4.584549,7.519589,3.517346,31.6122,33.84781,25691.16,1.0


In [5]:
df.shape

(284807, 31)

In [6]:
df.isna().sum()

Time      0
V1        0
V2        0
V3        0
V4        0
V5        0
V6        0
V7        0
V8        0
V9        0
V10       0
V11       0
V12       0
V13       0
V14       0
V15       0
V16       0
V17       0
V18       0
V19       0
V20       0
V21       0
V22       0
V23       0
V24       0
V25       0
V26       0
V27       0
V28       0
Amount    0
Class     0
dtype: int64

In [7]:
df['Class'].value_counts(normalize=True)*100

Class
0    99.827251
1     0.172749
Name: proportion, dtype: float64

In [8]:
df['Class'].value_counts()

Class
0    284315
1       492
Name: count, dtype: int64

* High imbalance in target column.
* We skip doing EDA as we cannot perform useful EDA without the presence of proper feature labeling.
* We shall perform Under sampling by balancing the Class 0 with Class 1 and proceed to model.

# Technique 1
    # Under Sampling

In [9]:
good = df[df['Class'] == 0]

In [10]:
fraud = df[df['Class'] == 1]

* Extracting 492 random Class 0 samples

In [11]:
gsamp = good.sample(n = 492)

Concatenating Dataframes

In [12]:
ndf = pd.concat([gsamp, fraud], axis = 0)

In [13]:
ndf.shape

(984, 31)

In [14]:
ndf['Class'].value_counts()

Class
0    492
1    492
Name: count, dtype: int64

In [15]:
ndf.isna().sum()

Time      0
V1        0
V2        0
V3        0
V4        0
V5        0
V6        0
V7        0
V8        0
V9        0
V10       0
V11       0
V12       0
V13       0
V14       0
V15       0
V16       0
V17       0
V18       0
V19       0
V20       0
V21       0
V22       0
V23       0
V24       0
V25       0
V26       0
V27       0
V28       0
Amount    0
Class     0
dtype: int64

In [16]:
ndf.describe()

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
count,984.0,984.0,984.0,984.0,984.0,984.0,984.0,984.0,984.0,984.0,...,984.0,984.0,984.0,984.0,984.0,984.0,984.0,984.0,984.0,984.0
mean,87282.273374,-2.318158,1.838736,-3.524671,2.28042,-1.570961,-0.74819,-2.7697,0.299867,-1.259794,...,0.374793,0.025718,-0.026426,-0.029002,0.031424,0.018643,0.081794,0.035793,99.818628,0.5
std,48149.082335,5.53885,3.674185,6.227481,3.178558,4.212848,1.735162,5.862139,4.846831,2.342436,...,2.774194,1.171379,1.165763,0.536223,0.672902,0.482414,1.021987,0.425622,213.358144,0.500254
min,218.0,-30.55238,-14.077923,-31.103685,-3.847286,-22.105532,-6.406267,-43.557242,-41.044261,-13.434066,...,-22.797604,-8.887017,-19.254328,-2.028024,-4.781606,-1.152671,-7.263482,-2.125563,0.0,0.0
25%,45200.0,-2.800824,-0.131103,-5.113334,-0.081218,-1.775001,-1.603779,-3.078361,-0.217513,-2.303539,...,-0.152332,-0.512797,-0.23693,-0.372906,-0.328404,-0.297542,-0.061965,-0.059728,1.18,0.0
50%,81997.5,-0.749997,0.943748,-1.415353,1.334079,-0.448188,-0.684996,-0.63861,0.147696,-0.681674,...,0.160359,0.045907,-0.038422,0.015046,0.061644,-0.048071,0.044064,0.035099,18.97,0.5
75%,133787.75,1.098606,2.814266,0.310536,4.234137,0.444707,-0.000202,0.29003,0.853103,0.200703,...,0.670739,0.610616,0.192766,0.380346,0.399107,0.307777,0.437129,0.208837,99.99,1.0
max,172090.0,2.422508,22.057729,2.950218,12.114672,11.095089,6.474115,9.667389,20.007208,4.373871,...,27.202839,8.361985,5.46623,1.210315,2.208209,2.745261,3.052358,1.779364,2125.87,1.0


# Train test Split

In [17]:
# Splitting features from target variable

X = ndf.iloc[:,1:30]
y = ndf.iloc[:,-1]

In [18]:
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import classification_report, confusion_matrix, cohen_kappa_score

In [19]:
xtrain, xtest, ytrain, ytest = train_test_split(X,y, test_size=0.3, random_state=100)

In [20]:
print(xtrain.shape, xtest.shape, ytrain.shape, ytest.shape)

(688, 29) (296, 29) (688,) (296,)


# Logistic Regression

In [21]:
# Initialising the model
from sklearn.linear_model import LogisticRegression

# Defining the model
lr = LogisticRegression(random_state=100)

# Fitting the model
model_lr = lr.fit(xtrain,ytrain)

In [22]:
# Printing model performance
y_pred_train_lr = model_lr.predict(xtrain)

print('Logistic Regression for Train')
print(classification_report(ytrain, y_pred_train_lr))
print('Confusion Matrix \n',confusion_matrix(ytrain, y_pred_train_lr))
print('\n Cohen Kappa Score \n',cohen_kappa_score(ytrain, y_pred_train_lr))

Logistic Regression for Train
              precision    recall  f1-score   support

           0       0.92      0.97      0.95       329
           1       0.97      0.93      0.95       359

    accuracy                           0.95       688
   macro avg       0.95      0.95      0.95       688
weighted avg       0.95      0.95      0.95       688

Confusion Matrix 
 [[320   9]
 [ 26 333]]

 Cohen Kappa Score 
 0.8982816011354612


In [23]:
# Printing model performance
y_pred_test_lr = model_lr.predict(xtest)

print('Logistic Regression for Test data')
print(classification_report(ytest, y_pred_test_lr))
print('Confusion Matrix \n',confusion_matrix(ytest, y_pred_test_lr))
print('\n Cohen Kappa Score \n',cohen_kappa_score(ytest, y_pred_test_lr))

Logistic Regression for Test data
              precision    recall  f1-score   support

           0       0.91      0.98      0.94       163
           1       0.97      0.88      0.92       133

    accuracy                           0.94       296
   macro avg       0.94      0.93      0.93       296
weighted avg       0.94      0.94      0.94       296

Confusion Matrix 
 [[160   3]
 [ 16 117]]

 Cohen Kappa Score 
 0.8691118972258425


In [24]:
print('Cross Validation Score for Logistic Regression', cross_val_score(lr, X, y, cv = 30, scoring='accuracy' ).mean())

Cross Validation Score for Logistic Regression 0.9420454545454546


* We can observe that the model has predicted 26 & 16 False Negative cases.
* Our aim of the model is to attain as low of a False Negative identification as possible. 
* Meaning, we dont want the model to label a fraudulent transaction as a legit one.
* We can also say that the current Lr model is Overfitting considering the drop in performance in the test data compared to the train.
* We Created further ML models using Decission Tree, Random Forest, AdaBoost, Gradient Boost to reduce this errors and all of them were overfitting the train data acquiring an accuracy of 1 for train and of 0.9 for test data.
* We shall create a Simple Neural Netword using Sklearn

# Multi Layer Perceptron

In [25]:
# Initialising the model
from sklearn.neural_network import MLPClassifier

# Defining the model
mlp = MLPClassifier(hidden_layer_sizes=(10,10,10,10), random_state= 100)

# Fitting the model
model_mlp = mlp.fit(xtrain,ytrain)

In [26]:
# Printing model performance
y_pred_train_mlp = model_mlp.predict(xtrain)

print('Multi Layer Perceptron for Train data')
print(classification_report(ytrain, y_pred_train_mlp))
print('Confusion Matrix \n',confusion_matrix(ytrain, y_pred_train_mlp))
print('\n Cohen Kappa Score \n',cohen_kappa_score(ytrain, y_pred_train_mlp))

Multi Layer Perceptron for Train data
              precision    recall  f1-score   support

           0       0.98      0.99      0.98       329
           1       0.99      0.98      0.99       359

    accuracy                           0.99       688
   macro avg       0.99      0.99      0.99       688
weighted avg       0.99      0.99      0.99       688

Confusion Matrix 
 [[327   2]
 [  8 351]]

 Cohen Kappa Score 
 0.9708970313279921


In [27]:
# Printing model performance
y_pred_test_mlp = model_mlp.predict(xtest)

print('Logistic Regression for Test data')
print(classification_report(ytest, y_pred_test_mlp))
print('Confusion Matrix \n',confusion_matrix(ytest, y_pred_test_mlp))
print('\n Cohen Kappa Score \n',cohen_kappa_score(ytest, y_pred_test_mlp))

Logistic Regression for Test data
              precision    recall  f1-score   support

           0       0.91      0.98      0.94       163
           1       0.97      0.89      0.93       133

    accuracy                           0.94       296
   macro avg       0.94      0.93      0.93       296
weighted avg       0.94      0.94      0.94       296

Confusion Matrix 
 [[159   4]
 [ 15 118]]

 Cohen Kappa Score 
 0.8692944129404109


In [28]:
print('Cross Validation Score for Multi Layer Perceptron', cross_val_score(mlp, X, y, cv = 30, scoring='accuracy' ).mean())

Cross Validation Score for Multi Layer Perceptron 0.9349747474747475


* In MLP, the model has predicted 8 & 15 False Negative cases.
* But model is under performing for test data meaning model overfits the train data.
* Comparing the Cross Validation scores, MLP performs slightly better.
* We shall proceed with MLP for HyperParameter Tuning

# Grid Search

In [29]:
from sklearn.model_selection import GridSearchCV

In [30]:
# Define the hyperparameters to tune
param_grid = {
    'hidden_layer_sizes': [(50,), (100,), (50, 50)],
    'activation': ['relu', 'tanh'],
    'solver': ['adam', 'sgd'],
    'learning_rate': ['constant', 'adaptive']
}

In [31]:
# Create the GridSearchCV object
gs = GridSearchCV(estimator=mlp, param_grid=param_grid, cv=5)

In [32]:
# Fit the model to the data
gs.fit(X, y)

In [33]:
# Print the best hyperparameters and accuracy score
print("Best Hyperparameters: ", gs.best_params_)
print("Best Accuracy Score: ", gs.best_score_)

Best Hyperparameters:  {'activation': 'relu', 'hidden_layer_sizes': (50, 50), 'learning_rate': 'adaptive', 'solver': 'sgd'}
Best Accuracy Score:  0.9318968196415623


# Hyper Parameter tuned MLP

In [34]:
# Defining the model
mlpg = MLPClassifier(activation='relu', hidden_layer_sizes = (50,), learning_rate='constant', solver= 'adam',random_state= 100)

# Fitting the model
model_mlpg = mlpg.fit(xtrain,ytrain)

In [35]:
# Printing model performance
y_pred_train_mlpg = model_mlpg.predict(xtrain)

print('Multi Layer Perceptron for Train data')
print(classification_report(ytrain, y_pred_train_mlpg))
print('Confusion Matrix \n',confusion_matrix(ytrain, y_pred_train_mlpg))
print('\n Cohen Kappa Score \n',cohen_kappa_score(ytrain, y_pred_train_mlpg))

Multi Layer Perceptron for Train data
              precision    recall  f1-score   support

           0       0.96      0.99      0.98       329
           1       0.99      0.97      0.98       359

    accuracy                           0.98       688
   macro avg       0.98      0.98      0.98       688
weighted avg       0.98      0.98      0.98       688

Confusion Matrix 
 [[326   3]
 [ 12 347]]

 Cohen Kappa Score 
 0.9563621602422069


In [36]:
# Printing model performance
y_pred_test_mlpg = model_mlpg.predict(xtest)

print('Logistic Regression for Test data')
print(classification_report(ytest, y_pred_test_mlpg))
print('Confusion Matrix \n',confusion_matrix(ytest, y_pred_test_mlpg))
print('\n Cohen Kappa Score \n',cohen_kappa_score(ytest, y_pred_test_mlpg))

Logistic Regression for Test data
              precision    recall  f1-score   support

           0       0.91      0.98      0.94       163
           1       0.97      0.88      0.92       133

    accuracy                           0.93       296
   macro avg       0.94      0.93      0.93       296
weighted avg       0.93      0.93      0.93       296

Confusion Matrix 
 [[159   4]
 [ 16 117]]

 Cohen Kappa Score 
 0.8623191776361692


In [37]:
print('Cross Validation Score for Hyper Parameter Tuned Multi Layer Perceptron', 
      cross_val_score(mlpg, X, y, cv = 30, scoring='accuracy' ).mean())

Cross Validation Score for Hyper Parameter Tuned Multi Layer Perceptron 0.9421717171717172


* In this HP MLP, the model has predicted 12 & 16 False Negative cases.
* In all the above cases the model is over fitting.
* One of the major reasons could be the low sample size of 984.
* We cannot perform Feature Selection or Feature Engineering due having to less than sufficient knowledge of the features.
* We shall proceed to develop model using Tensorflow with the original dataframe.
* Further which we shall try Neural Network model with Cost Sensitive Neural Network.

# Technique 2
    #  Cost Sensitive Neural Network

# Train test split

In [38]:
# Splitting Features from target
X = df.iloc[:,1:30]
y = df.iloc[:,-1]

In [39]:
# Train test split
xtrain, xtest, ytrain, ytest = train_test_split(X, y, test_size=0.30, random_state=42)

In [40]:
print(xtrain.shape, xtest.shape, ytrain.shape, ytest.shape)

(199364, 29) (85443, 29) (199364,) (85443,)


In [41]:
ytrain.value_counts()

Class
0    199008
1       356
Name: count, dtype: int64

In [42]:
199008/356

559.0112359550562

* This means, for ever Class 1, there are 599 Class 0

# Tensorflow

In [43]:
import tensorflow

In [44]:
# Initialising the model
from tensorflow.keras.layers import Dense
from tensorflow.keras.models import Sequential

In [45]:
n_inputs = len(X.columns)
n_inputs

29

In [46]:
# Define model
sq = Sequential()

# Define first hidden layer and visible layer
# kernel_initializer - Weight Initializer
sq.add(Dense(50, input_dim = n_inputs, activation = 'relu', kernel_initializer= 'he_uniform'))

# Define output layer
sq.add(Dense(1, activation = 'sigmoid'))

# Define losss and optimizer
# binary_crossentropy - Due to binary classification (0 & 1). If multi class, use Multi or sparse
sq.compile(loss='binary_crossentropy', optimizer = 'adam')
sq.fit(xtrain,ytrain, epochs=10)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.src.callbacks.History at 0x1dfc1a43a90>

In [47]:
# Importing scoring metrics
from sklearn.metrics import roc_auc_score

In [48]:
# Printing model performance
y_pred_train_sq = sq.predict(xtrain)

print('Roc_auc_score for Train', roc_auc_score(ytrain,y_pred_train_sq))

Roc_auc_score for Train 0.9329610048424455


In [49]:
# Printing model performance
y_pred_train_sq = sq.predict(xtest)

print('Roc_auc_score for Train', roc_auc_score(ytest,y_pred_train_sq))

Roc_auc_score for Train 0.9338001708707443


* The roc auc scores are very similar for train and test data, meaning the model is performing well.
* We shall now proceed with the same model but we shall add weights to the classes as necessary.

# Weighted Neural Networks

In [50]:
ytrain.value_counts()

Class
0    199008
1       356
Name: count, dtype: int64

In [51]:
199008/356

559.0112359550562

In [52]:
# Defining the weights
assigned_weights = {0:1, 1:599}

In [53]:
# Define model
sqw = Sequential()

# Define first hidden layer and visible layer
sqw.add(Dense(150, input_dim = n_inputs, activation = 'relu', kernel_initializer= 'he_uniform'))

# Define output layer
sqw.add(Dense(1, activation = 'sigmoid'))

# Define loss and optimizer and assign weights
sqw.compile(loss='binary_crossentropy', optimizer = 'adam')
sqw.fit(xtrain,ytrain, class_weight = assigned_weights, epochs = 10)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.src.callbacks.History at 0x1dfcbca00d0>

In [54]:
# Printing model performance
y_pred_train_sqw = sqw.predict(xtrain)

print('Roc_auc_score for Train',roc_auc_score(ytrain,y_pred_train_sqw))

Roc_auc_score for Train 0.9675762427708852


In [55]:
# Printing model performance
y_pred_test_sqw = sqw.predict(xtest)

print('Roc_auc_score for Test',roc_auc_score(ytest,y_pred_test_sqw))

Roc_auc_score for Test 0.9640500417523146


* We have achieved a good model with very similar performance in train and test data with the help of Weighted Neural Network.
* This will be our final model.

# Trail tests

In [95]:
df.iloc[542,1:30].values

array([-1.64500860e+00, -2.09079913e+00,  1.59256684e+00, -1.38885259e+00,
        1.73289919e+00, -1.63105878e+00, -9.22859376e-01,  1.38144495e-02,
        1.76077706e+00, -1.27016428e+00, -1.40105324e+00,  3.56018140e-01,
        1.91579696e-01, -4.73274886e-01,  7.07895259e-01, -1.79083043e-01,
       -8.16905117e-01,  6.31246543e-01,  3.09446502e-01,  5.64245803e-01,
        2.48155008e-01,  4.15496484e-01,  2.93293937e-01, -4.63425038e-01,
       -4.05175360e-01, -9.24103280e-01,  6.58569044e-03,  1.86830113e-02,
        4.66800000e+01])

In [106]:
# Trail prediction
prediction = sqw.predict([[-2.31222654,  1.95199201, -1.60985073,  3.99790559, -0.52218786,
       -1.42654532, -2.53738731,  1.39165725, -2.77008928, -2.77227214,
        3.20203321, -2.89990739, -0.59522188, -4.28925378,  0.38972412,
       -1.14074718, -2.83005567, -0.01682247,  0.41695571,  0.12691056,
        0.51723237, -0.03504937, -0.46521108,  0.3201982 ,  0.04451917,
        0.1778398 ,  0.261145  , -0.14327587,  0.        ]])

predicted_labels = (prediction > 0.5).astype(int)
predicted_labels



array([[1]])

In [107]:
# Trail prediction
prediction = sqw.predict([[-1.64500860e+00, -2.09079913e+00,  1.59256684e+00, -1.38885259e+00,
        1.73289919e+00, -1.63105878e+00, -9.22859376e-01,  1.38144495e-02,
        1.76077706e+00, -1.27016428e+00, -1.40105324e+00,  3.56018140e-01,
        1.91579696e-01, -4.73274886e-01,  7.07895259e-01, -1.79083043e-01,
       -8.16905117e-01,  6.31246543e-01,  3.09446502e-01,  5.64245803e-01,
        2.48155008e-01,  4.15496484e-01,  2.93293937e-01, -4.63425038e-01,
       -4.05175360e-01, -9.24103280e-01,  6.58569044e-03,  1.86830113e-02,
        4.66800000e+01]])


predicted_labels = (prediction > 0.5).astype(int)
predicted_labels



array([[0]])

# Saving the model using Pickle

In [100]:
import pickle

In [103]:
# Saving the model
# wb = Write binary

with open('sqw.pkl', 'wb') as file:
    pickle.dump(sqw, file)

# Deploy the model as streamlit file

In [3]:
%%writefile fraud_app.py

import streamlit as st
st.title('Fraud Detection App')
st.subheader('Is this a Fraudulent transaction')
import pandas as pd
import pickle
from sklearn.preprocessing import StandardScaler


# Step1: Load the pickled model

model = open('sqw.pkl','rb') # read binary
clf = pickle.load(model)
model.close()

# Step2: Get input from front end user

# Create input fields for features
feature1 = st.number_input("V1", value=0.0)
feature2 = st.number_input("V2", value=0.0)
feature3 = st.number_input("V3", value=0.0)
feature4 = st.number_input("V4", value=0.0)
feature5 = st.number_input("V5", value=0.0)
feature6 = st.number_input("V6", value=0.0)
feature7 = st.number_input("V7", value=0.0)
feature8 = st.number_input("V8", value=0.0)
feature9 = st.number_input("V9", value=0.0)
feature10 = st.number_input("V10", value=0.0)
feature11 = st.number_input("V11", value=0.0)
feature12 = st.number_input("V12", value=0.0)
feature13 = st.number_input("V13", value=0.0)
feature14 = st.number_input("V14", value=0.0)
feature15 = st.number_input("V15", value=0.0)
feature16 = st.number_input("V16", value=0.0)
feature17 = st.number_input("V17", value=0.0)
feature18 = st.number_input("V18", value=0.0)
feature19 = st.number_input("V19", value=0.0)
feature20 = st.number_input("V20", value=0.0)
feature21 = st.number_input("V21", value=0.0)
feature22 = st.number_input("V22", value=0.0)
feature23 = st.number_input("V23", value=0.0)
feature24 = st.number_input("V24", value=0.0)
feature25 = st.number_input("V25", value=0.0)
feature26 = st.number_input("V26", value=0.0)
feature27 = st.number_input("V27", value=0.0)
feature28 = st.number_input("V28", value=0.0)
feature29 = st.number_input("V29", value=0.0)

# Step3: Collect the front end user input as model input data

data = {'V1':feature1, 
        'V2':feature2,
        'V3':feature3,
        'V4':feature4,
        'V5':feature5,
        'V6':feature6,
        'V7':feature7,
        'V8':feature8,
        'V9':feature9,
        'V10':feature10,
        'V11':feature11,
        'V12':feature12,
        'V13':feature13,
        'V14':feature14,
        'V15':feature15,
        'V16':feature16,
        'V17':feature17,
        'V18':feature18,
        'V19':feature19,
        'V20':feature20,
        'V21':feature21,
        'V22':feature22,
        'V23':feature23,
        'V24':feature24,
        'V25':feature25,
        'V26':feature26,
        'V27':feature27,
        'V28':feature28,
        'V29':feature29,
       }
input_data = pd.DataFrame([data])

# Step4: get the predictions and print the result

preds=clf.predict(input_data)[0]
if st.button('Predict'):
    if preds==1:
        st.error('Fraud')
    if preds<1:
        st.error('Legit')

Overwriting fraud_app.py
