In [2]:
%matplotlib inline
import matplotlib.pyplot as plt
import pandas as pd
import os
import numpy as np
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix, recall_score


In [3]:
card_activity = pd.read_csv('creditcard.csv')
card_activity.head()

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
0,0.0,-1.359807,-0.072781,2.536347,1.378155,-0.338321,0.462388,0.239599,0.098698,0.363787,...,-0.018307,0.277838,-0.110474,0.066928,0.128539,-0.189115,0.133558,-0.021053,149.62,0
1,0.0,1.191857,0.266151,0.16648,0.448154,0.060018,-0.082361,-0.078803,0.085102,-0.255425,...,-0.225775,-0.638672,0.101288,-0.339846,0.16717,0.125895,-0.008983,0.014724,2.69,0
2,1.0,-1.358354,-1.340163,1.773209,0.37978,-0.503198,1.800499,0.791461,0.247676,-1.514654,...,0.247998,0.771679,0.909412,-0.689281,-0.327642,-0.139097,-0.055353,-0.059752,378.66,0
3,1.0,-0.966272,-0.185226,1.792993,-0.863291,-0.010309,1.247203,0.237609,0.377436,-1.387024,...,-0.1083,0.005274,-0.190321,-1.175575,0.647376,-0.221929,0.062723,0.061458,123.5,0
4,2.0,-1.158233,0.877737,1.548718,0.403034,-0.407193,0.095921,0.592941,-0.270533,0.817739,...,-0.009431,0.798278,-0.137458,0.141267,-0.20601,0.502292,0.219422,0.215153,69.99,0


In [4]:
card_activity.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 284807 entries, 0 to 284806
Data columns (total 31 columns):
 #   Column  Non-Null Count   Dtype  
---  ------  --------------   -----  
 0   Time    284807 non-null  float64
 1   V1      284807 non-null  float64
 2   V2      284807 non-null  float64
 3   V3      284807 non-null  float64
 4   V4      284807 non-null  float64
 5   V5      284807 non-null  float64
 6   V6      284807 non-null  float64
 7   V7      284807 non-null  float64
 8   V8      284807 non-null  float64
 9   V9      284807 non-null  float64
 10  V10     284807 non-null  float64
 11  V11     284807 non-null  float64
 12  V12     284807 non-null  float64
 13  V13     284807 non-null  float64
 14  V14     284807 non-null  float64
 15  V15     284807 non-null  float64
 16  V16     284807 non-null  float64
 17  V17     284807 non-null  float64
 18  V18     284807 non-null  float64
 19  V19     284807 non-null  float64
 20  V20     284807 non-null  float64
 21  V21     28

In [5]:
X =card_activity.drop(columns=['Time', 'Class'])
y= card_activity['Class']
y

0         0
1         0
2         0
3         0
4         0
         ..
284802    0
284803    0
284804    0
284805    0
284806    0
Name: Class, Length: 284807, dtype: int64

In [7]:
print(X.shape, y.shape)

(284807, 29) (284807,)


In [8]:
card_activity['Class'].value_counts()

0    284315
1       492
Name: Class, dtype: int64

In [9]:
#separate 'training' and 'testing' data
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)

In [10]:
unique_ytrain, counts_ytrain = np.unique(y_train, return_counts=True)
print(unique_ytrain,counts_ytrain)

[0 1] [213224    381]


In [11]:
unique_ytest, counts_ytest = np.unique(y_test, return_counts=True)
print(unique_ytest,counts_ytest)

[0 1] [71091   111]


In [12]:
print('percentage of positive values in the training sample:', counts_ytrain[1]*100/(counts_ytrain[1]+counts_ytrain[0]))
print('percentage of positive values in the test sample:', counts_ytest[1]*100/(counts_ytest[1]+counts_ytest[0]))

percentage of positive values in the training sample: 0.17836661126846282
percentage of positive values in the test sample: 0.15589449734558017


In [13]:
#///////////////////////////////////////////////////////////////////////////////////
#Since the sample is biased towards non-fraud transactions resampling is necessary
#use the resample package from sklearn
#the resampling should be done after splitting the test and train samples to avoid 
#having the same rows in both test and train samples
#//////////////////////////////////////////////////////////////////////////////////
from sklearn.utils import resample

#Now let's create a new test data sample by combining the X_tarin and y_train samples
#and separate fraud and non-fraud transactions
Xy_train = pd.concat([X_train, y_train], axis=1)

non_fraud = Xy_train[Xy_train['Class']==0]
fraud = Xy_train[Xy_train['Class']==1]

# upsample fraud transactions

fraud_new = resample(fraud,replace=True, # if true Implements resampling with replacement
                          n_samples=len(non_fraud), # no. of samples
                          random_state=1)

# now combine and create a train sample with eqaul no. of fraud and non-fraud transactions
Xy_train_new = pd.concat([non_fraud, fraud_new])

#split back X and y
X_train_new = Xy_train_new.drop('Class',axis=1)
y_train_new = Xy_train_new['Class']
print(X_train_new.shape, y_train_new.shape)

(426448, 29) (426448,)


In [14]:
#scaling
from sklearn.preprocessing import MinMaxScaler
X_scale = MinMaxScaler().fit(X_train_new)

X_train_scaled = X_scale.transform(X_train_new)
X_test_scaled = X_scale.transform(X_test)

X_train_new.shape
X_train_scaled.shape

(426448, 29)

In [15]:
from keras.utils import to_categorical

#One-hot encoding
one_hot_y_train = to_categorical(y_train_new)
one_hot_y_test = to_categorical(y_test)
print(one_hot_y_train.shape)
print(X_train_scaled.shape)

Using TensorFlow backend.


(426448, 2)
(426448, 29)


In [16]:
from tensorflow.keras.models import Sequential
model = Sequential()

In [17]:
from tensorflow.keras.layers import Dense
#dense means every neuron conneted to every one in the previous layer
#relu is an activation fn
number_inputs = 29
number_hidden_nodes = 58
model.add(Dense(units=number_hidden_nodes,
                activation='relu', input_dim=number_inputs))
number_classes = 2
model.add(Dense(units=number_classes, activation='softmax'))

In [18]:
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense (Dense)                (None, 58)                1740      
_________________________________________________________________
dense_1 (Dense)              (None, 2)                 118       
Total params: 1,858
Trainable params: 1,858
Non-trainable params: 0
_________________________________________________________________


In [19]:
model.compile(optimizer='adam',
              loss='categorical_crossentropy',
              metrics=['accuracy'])
type(y_train_new)
y_train_new.dtype


dtype('int64')

In [20]:
# Fit
model.fit(
    X_train_scaled,
    one_hot_y_train,
    epochs=200,
    shuffle=True,
    verbose=2
)

Train on 426448 samples
Epoch 1/200
426448/426448 - 47s - loss: 0.1705 - accuracy: 0.9386
Epoch 2/200
426448/426448 - 43s - loss: 0.1479 - accuracy: 0.9457
Epoch 3/200
426448/426448 - 48s - loss: 0.1329 - accuracy: 0.9506
Epoch 4/200
426448/426448 - 51s - loss: 0.1173 - accuracy: 0.9574
Epoch 5/200
426448/426448 - 51s - loss: 0.1058 - accuracy: 0.9616
Epoch 6/200
426448/426448 - 57s - loss: 0.0965 - accuracy: 0.9646
Epoch 7/200
426448/426448 - 57s - loss: 0.0897 - accuracy: 0.9669
Epoch 8/200
426448/426448 - 57s - loss: 0.0846 - accuracy: 0.9680
Epoch 9/200
426448/426448 - 56s - loss: 0.0804 - accuracy: 0.9693
Epoch 10/200
426448/426448 - 59s - loss: 0.0767 - accuracy: 0.9705
Epoch 11/200
426448/426448 - 62s - loss: 0.0738 - accuracy: 0.9712
Epoch 12/200
426448/426448 - 61s - loss: 0.0708 - accuracy: 0.9723
Epoch 13/200
426448/426448 - 69s - loss: 0.0681 - accuracy: 0.9735
Epoch 14/200
426448/426448 - 58s - loss: 0.0664 - accuracy: 0.9739
Epoch 15/200
426448/426448 - 54s - loss: 0.0646

<tensorflow.python.keras.callbacks.History at 0x1218212e8>

In [29]:
from sklearn.metrics import classification_report
predictions = model.predict(X_test_scaled)
predictions
#print(classification_report(one_hot_y_test, predictions))
#one_hot_y_test.shape,X_test_scaled.shape
predictions

array([[1.0000000e+00, 8.7719450e-11],
       [1.0000000e+00, 9.0222534e-14],
       [1.0000000e+00, 1.2841474e-08],
       ...,
       [1.0000000e+00, 4.0890451e-12],
       [1.0000000e+00, 7.0445771e-17],
       [1.0000000e+00, 1.6100540e-17]], dtype=float32)

In [31]:
one_hot_y_test

array([[1., 0.],
       [1., 0.],
       [1., 0.],
       ...,
       [1., 0.],
       [1., 0.],
       [1., 0.]], dtype=float32)

In [35]:
model.save("neural_model.h5")

In [37]:
# Load the model
from tensorflow.keras.models import load_model
loaded_model = load_model("neural_model.h5")

In [39]:
predictions = model.predict(X_test_scaled)
predictions

array([[1.0000000e+00, 8.7719450e-11],
       [1.0000000e+00, 9.0222534e-14],
       [1.0000000e+00, 1.2841474e-08],
       ...,
       [1.0000000e+00, 4.0890451e-12],
       [1.0000000e+00, 7.0445771e-17],
       [1.0000000e+00, 1.6100540e-17]], dtype=float32)

In [41]:
model_loss, model_accuracy = model.evaluate(
    X_test_scaled, one_hot_y_test, verbose=2)
print(
    f"Normal Neural Network - Loss: {model_loss}, Accuracy: {model_accuracy}")

71202/1 - 4s - loss: 0.0209 - accuracy: 0.9895
Normal Neural Network - Loss: 0.04183714014962137, Accuracy: 0.9894806146621704


In [47]:
temp =np.round(predictions)

In [48]:
print(classification_report(one_hot_y_test, temp))

              precision    recall  f1-score   support

           0       1.00      0.99      0.99     71091
           1       0.11      0.84      0.20       111

   micro avg       0.99      0.99      0.99     71202
   macro avg       0.56      0.91      0.60     71202
weighted avg       1.00      0.99      0.99     71202
 samples avg       0.99      0.99      0.99     71202

