In [1]:
#imports

from __future__ import absolute_import, division, print_function

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
import sklearn.metrics as metrics
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.preprocessing import StandardScaler

import tensorflow as tf
import tensorflow_probability as tfp
from tensorflow.python.ops import control_flow_util
import tensorflow.keras as keras
K = keras.backend


In [2]:

#tf2 config
control_flow_util.ENABLE_CONTROL_FLOW_V2 = True


In [4]:

#read the csv
df = pd.read_csv("../Datasets/creditcard.csv")

#split data/labels
X = df.iloc[:,1:-1]
y = df.iloc[:,-1]
del df


#split train/test
X_train, X_test, y_train, y_test = train_test_split( X, y, test_size=0.33, random_state=42)
del X
del y

#scale the data
sc = StandardScaler()
X_train.iloc[:,:] = sc.fit_transform(X_train)
X_test.iloc[:,:] = sc.transform (X_test)

#data without anomalies
healthy_data = X_train[y_train==0]

#indexes for later use
X_train_indexes = tf.Variable(np.array(X_train.index), dtype=tf.float64)
X_test_indexes = tf.Variable(np.array(X_test.index), dtype=tf.float64)


In [5]:

@tf.function
def estimate_health(healthy_data):
    """
        returns the mean and variance of the healthy data
    """
    mean = K.mean( healthy_data, axis=0)
    var = K.var( healthy_data, axis=0)
    return mean, var


In [6]:

#means and variances of the healthy data
mus_healthy, sigmasqs_healthy = estimate_health(healthy_data.values)


In [7]:

@tf.function
def probs(mus_healthy, sigmasqs_healthy, sample):
    """
        returns the pdf of the sample given mus_healthy and sigmasqs_healthy
    """
    norm = K.prod(tfp.distributions.Normal(mus_healthy, sigmasqs_healthy).prob(sample),axis=1)
    multiv = tfp.distributions.MultivariateNormalDiag(mus_healthy, sigmasqs_healthy).prob(sample)

    return norm, multiv


In [8]:

#pdf of X_train of beign part of a healthy distribution
norm_train, multiv_train = probs(mus_healthy,sigmasqs_healthy, X_train.values)


In [9]:

@tf.function
def reduce_samples(sample, labels ,indexes, norm, multiv, percentile):
    """
        returns the indexes of the min values for pdf of normal and multivariate distributions
    """

    sample_probs_n = tf.stack((tf.cast(indexes, tf.float64), norm), axis=1)
    sample_probs_norm = tf.gather(sample_probs_n, tf.where(sample_probs_n[:,1] <= tfp.stats.percentile(norm, percentile)))
    sample_probs_norm = tf.reshape(sample_probs_norm, [tf.shape(sample_probs_norm)[0], 2])

    sample_probs_m = tf.stack((tf.cast(indexes, tf.float64), multiv), axis=1)
    sample_probs_multiv = tf.gather(sample_probs_m, tf.where(sample_probs_m[:,1] <= tfp.stats.percentile(multiv, percentile)))
    sample_probs_multiv = tf.reshape(sample_probs_multiv, [tf.shape(sample_probs_multiv)[0], 2])

    a = sample_probs_multiv[:,0]
    b = sample_probs_norm[:,0]

    return a,b


In [10]:

#reduction of the samples

a,b = reduce_samples(X_train.values, y_train.values, X_train_indexes, norm_train, multiv_train, 25)
idx = np.intersect1d(a.numpy(), b.numpy())
idx = np.array(idx, dtype=np.int32)
idx = tf.Variable(idx, dtype=tf.int32)


X_train_red_norm, X_train_red_multi = probs(mus_healthy, sigmasqs_healthy, tf.gather(X_train.values, idx))
c, d = reduce_samples(tf.gather(X_train.values, idx), tf.gather(y_train.values, idx), idx, X_train_red_norm, X_train_red_multi, 25)
idx2 = np.intersect1d(c.numpy(), d.numpy())
idx2 = np.array(idx2, dtype=np.int32)
idx2 = tf.Variable(idx2, dtype=tf.int32)


X_train_red = tf.gather(X_train.values, idx2)
y_train_red = tf.gather(y_train.values, idx2)

In [11]:
#Sequiential model

sgd = keras.optimizers.SGD(lr=0.2, momentum=.3, nesterov=True)

model = keras.models.Sequential()
model.add(keras.layers.Dense(100, activation='sigmoid', input_dim=X_train_red.shape[1]))
model.add(keras.layers.Dropout(0.2))
model.add(keras.layers.Dense(40, activation='tanh'))
model.add(keras.layers.Dropout(0.2))
model.add(keras.layers.Dense(1, activation='sigmoid'))

model.compile(optimizer=sgd,
              loss='mse',
              metrics=['accuracy'])

hist = model.fit(X_train_red, y_train_red, epochs=10, batch_size=16, verbose=1)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [12]:
#predictions

preds_train = model.predict_classes(X_train)
preds_test = model.predict_classes(X_test)

In [13]:
#some stadistics

def confusionm(labels,pred):
    matrix = confusion_matrix(labels,pred)
    print(matrix)

def classif_rep(labels,pred):
    report = classification_report(labels,pred)
    print(report)

In [14]:
#results

print("train data")
confusionm(y_train, preds_train)
classif_rep(y_train, preds_train)

print("test data")
confusionm(y_test, preds_test)
classif_rep(y_test, preds_test)

train data
[[190429     48]
 [    73    270]]
              precision    recall  f1-score   support

           0       1.00      1.00      1.00    190477
           1       0.85      0.79      0.82       343

   micro avg       1.00      1.00      1.00    190820
   macro avg       0.92      0.89      0.91    190820
weighted avg       1.00      1.00      1.00    190820

test data
[[93805    33]
 [   26   123]]
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     93838
           1       0.79      0.83      0.81       149

   micro avg       1.00      1.00      1.00     93987
   macro avg       0.89      0.91      0.90     93987
weighted avg       1.00      1.00      1.00     93987

