In [None]:


import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)


import os
# Walk the Kaggle input directory and print the full path of every file in it.
input_paths = (
    os.path.join(root, name)
    for root, _, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)

# Any results you write to the current directory are saved as output.

In [None]:
## Data analysis for credit card fraud

In [None]:
import pandas as pd 
import numpy as np
import matplotlib.pyplot as plt 
import seaborn as sns

In [None]:
# Load the credit-card transactions CSV from the Kaggle input directory.
df_cred=pd.read_csv("/kaggle/input/creditcardfraud/creditcard.csv")

In [None]:
# Show the (rows, columns) dimensions of the loaded frame.
df_cred.shape

In [None]:
# Aliased scipy.stats distributions (not referenced in the visible cells).
from scipy.stats import randint as sp_randint
from scipy.stats import uniform as sp_uniform
import warnings
# NOTE: `py` here is plotly.offline, not a Python-related helper.
import plotly.offline as py
# Initialise plotly's offline notebook mode before any py.iplot call.
py.init_notebook_mode(connected=True)
import plotly.graph_objs as go
import plotly.tools as tls
import plotly.figure_factory as ff

# Silence every warning for the remainder of the kernel session.
warnings.filterwarnings('ignore')

from contextlib import contextmanager

@contextmanager
def timer(title):
    """Context manager that prints how long the wrapped block took.

    Parameters
    ----------
    title : str
        Label placed at the start of the printed message.

    On normal exit of the ``with`` body, prints
    ``"<title> - done in <seconds>s"`` with the elapsed wall-clock time
    rounded to whole seconds. Nothing is printed if the body raises.
    """
    # Imported locally: the notebook never imports `time` at module level,
    # which made the original raise NameError on first use.
    import time

    t0 = time.time()
    yield
    print("{} - done in {:.0f}s".format(title, time.time() - t0))


In [None]:
# Partition the frame by label: any non-zero Class is treated as fraud.
fraud = df_cred[(df_cred['Class'] != 0)]
normal = df_cred[(df_cred['Class'] == 0)]

# Look the counts up by label (0 = normal, 1 = fraud) so each slice value is
# guaranteed to match its hard-coded label. value_counts() alone is ordered by
# frequency, which only coincidentally agreed with the label order.
class_counts = df_cred['Class'].value_counts().reindex([0, 1], fill_value=0)

trace = go.Pie(labels = ['Normal', 'Fraud'], values = class_counts,
               textfont=dict(size=15), opacity = 0.8,
               marker=dict(colors=['blue','red'],
                           line=dict(color='#00ff00', width=1.8)))


layout = dict(title =  'Distribution of target variable')

fig = dict(data = [trace], layout=layout)
py.iplot(fig)

## Feature distribution

In [None]:
# Per-class density plot helper
def plot_distribution(data_select) : 
    """Plot per-class kernel density estimates of one ``df_cred`` column.

    Parameters
    ----------
    data_select : str
        Name of the column of the module-level ``df_cred`` frame to plot.

    One density curve is drawn per ``Class`` value (0 in red, 1 in blue),
    the x-axis is limited to the column's observed min/max, and the figure
    is shown immediately. Nothing is returned.
    """
    sns.set_style("ticks")
    s = sns.FacetGrid(df_cred, hue = 'Class',aspect = 2.5, palette ={0 : 'red', 1 :'blue'})
    # NOTE(review): newer seaborn releases deprecate `shade` in favour of
    # `fill`; kept as-is to match the environment this notebook was written for.
    s.map(sns.kdeplot, data_select, shade = True, alpha = 0.6)
    s.set(xlim=(df_cred[data_select].min(), df_cred[data_select].max()))
    s.add_legend()
    s.set_axis_labels(data_select, 'proportion')
    s.fig.suptitle(data_select)
    plt.show()

In [None]:
# Draw the per-class density plot for each selected feature column.
for feature_name in ('V4', 'V9', 'V11', 'V12', 'V13'):
    plot_distribution(feature_name)

# Initial Preprocessing 

In [None]:
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix , roc_auc_score, roc_curve

In [None]:
### Dropping unnecessary columns

In [None]:
min_max_scaler = preprocessing.MinMaxScaler()
# errors="ignore" keeps this cell re-runnable once "Time" has been dropped.
df_cred = df_cred.drop(columns="Time", errors="ignore")
# Scale every column except the last one, which is re-attached as "Class"
# in the next cell (assumes the label is the final column — TODO confirm).
feature_columns = df_cred.columns[:-1]
df_cred_scaled = min_max_scaler.fit_transform(df_cred[feature_columns])
# Keep the original column names and row index so the scaled frame stays
# readable and label re-attachment aligns row for row.
df_cred_normalized = pd.DataFrame(df_cred_scaled,
                                  columns=feature_columns,
                                  index=df_cred.index)

In [None]:
# Re-attach the unscaled label column; pandas aligns the assignment on the row index.
df_cred_normalized["Class"]=df_cred["Class"]

In [None]:
# Count rows per label to confirm the labels survived the scaling step.
df_cred_normalized["Class"].value_counts()

# Splitting strategy

In [None]:
# Rows labelled 0 form the training pool; rows labelled 1 are kept aside.
is_normal = df_cred_normalized["Class"] == 0
is_fraud = df_cred_normalized["Class"] == 1
df_cred_normalized_train = df_cred_normalized[is_normal]
df_cred_normalized_test = df_cred_normalized[is_fraud]

#### Splitting the dataset as per the strategy I have discussed
#### We will train on non-fraudulent transactions only and test on both classes


In [None]:
# Carve two disjoint samples of class-0 rows out of the training pool: 5% of
# the pool for the test set, then 5% of what remains for the validation set.
# random_state pins both draws so the split is the same on every run; the
# value matches the seed already used for train_test_split in this notebook.
df_cred_normalized_test_part_1 = df_cred_normalized_train.sample(frac=0.05, random_state=2020)
df_cred_normalized_train = df_cred_normalized_train.drop(df_cred_normalized_test_part_1.index)
df_cred_normalized_test_part_2 = df_cred_normalized_train.sample(frac=0.05, random_state=2020)
df_cred_normalized_train = df_cred_normalized_train.drop(df_cred_normalized_test_part_2.index)

In [None]:
# Split the class-1 rows in half: one half joins the test set, the remainder
# joins the validation set. random_state makes the draw reproducible.
df_cred_normalized_test_class_1 = df_cred_normalized_test.sample(frac=0.5, random_state=2020)
df_cred_normalized_validation_class_1 = df_cred_normalized_test.drop(df_cred_normalized_test_class_1.index)

In [None]:
# Show how many class-1 rows went to the test half.
df_cred_normalized_test_class_1.shape

## Merging of test and validation sets 

In [None]:
# DataFrame.append was removed in pandas 2.0; pd.concat stacks the frames the
# same way (rows of the first frame followed by rows of the second, original
# index labels kept).
df_cred_normalized_test_set = pd.concat(
    [df_cred_normalized_test_part_1, df_cred_normalized_test_class_1]
)
df_cred_normalized_validation_set = pd.concat(
    [df_cred_normalized_test_part_2, df_cred_normalized_validation_class_1]
)

### Re-checking the size of the train, test and validation sets

In [None]:
# Report the shape of each of the three partitions.
for caption, frame in (
    ("train set dimensions :", df_cred_normalized_train),
    ("test set dimensions :", df_cred_normalized_test_set),
    ("validate set dimensions :", df_cred_normalized_validation_set),
):
    print(caption, frame.shape)

In [None]:
# Count rows per label in the validation set.
df_cred_normalized_validation_set["Class"].value_counts()

In [None]:
# Hold out 20% of the training pool to monitor the autoencoder during fitting.
X_train, X_test = train_test_split(
    df_cred_normalized_train, test_size=0.2, random_state=2020
)

# Training features: class-0 rows only, label column removed, as a NumPy array.
X_train = X_train.loc[X_train["Class"] == 0].drop(columns=["Class"]).values

# Hold-out labels are kept separately; hold-out features become a NumPy array.
y_test = X_test["Class"]
X_test = X_test.drop(columns=["Class"]).values

X_train.shape

# Autoencoder

In [None]:
from keras.models import Model, load_model
from keras.layers import Input, Dense
from keras.callbacks import ModelCheckpoint, TensorBoard
from keras.callbacks import EarlyStopping ,ReduceLROnPlateau
from keras.optimizers import Adam

#from keras import regularizers

In [None]:
input_dim = X_train.shape[1]
encoding_dim = 20
input_layer = Input(shape=(input_dim, ))

# Encoder: input -> 40 -> 20 -> 8. Each layer must consume the previous
# layer's output. The original wired the 20-unit layer straight to
# `input_layer`, which silently dropped the 40-unit layer from the model.
encoder = Dense(encoding_dim*2, activation="sigmoid")(input_layer)
encoder = Dense(encoding_dim, activation="sigmoid")(encoder)
encoder = Dense(8,activation="sigmoid")(encoder)

# Decoder mirrors the encoder: 8 -> 20 -> 40 -> input. The original wired
# the 40-unit layer to `encoder`, which dropped the 20-unit decoder layer.
decoder = Dense(encoding_dim, activation='sigmoid')(encoder)
decoder = Dense(encoding_dim*2, activation='sigmoid')(decoder)
decoder = Dense(input_dim, activation='sigmoid')(decoder)
autoencoder = Model(inputs=input_layer, outputs=decoder)

In [None]:
nb_epoch = 25
batch_size = 32

# Reconstruction objective: mean squared error between input and output.
autoencoder.compile(loss='mean_squared_error',
                    optimizer='adam',
                    metrics=['accuracy'])

# Stop once val_loss has not improved for 15 epochs; checkpoint the best model.
es = EarlyStopping(monitor='val_loss', mode='min', patience=15, verbose=1)
checkpointer = ModelCheckpoint(filepath="model.h5", save_best_only=True, verbose=0)

# The network is trained to reproduce its own input, so X is also the target.
fit_options = dict(
    epochs=nb_epoch,
    batch_size=batch_size,
    shuffle=True,
    validation_data=(X_test, X_test),
    callbacks=[es, checkpointer],
    verbose=1,
)
history = autoencoder.fit(X_train, X_train, **fit_options)

In [None]:
# One figure per recorded metric: training curve first, validation curve second.
for metric, heading in (('loss', 'model loss'), ('accuracy', 'model acc')):
    plt.plot(history.history[metric])
    plt.plot(history.history['val_' + metric])
    plt.title(heading)
    plt.ylabel(metric)
    plt.xlabel('epoch')
    plt.legend(['train', 'test'], loc='upper right')
    plt.show()

### reconstruction error on x_test set

In [None]:
# Per-row reconstruction error (mean squared difference) on the hold-out rows.
predictions = autoencoder.predict(X_test)
squared_residuals = np.power(X_test - predictions, 2)
mse = np.mean(squared_residuals, axis=1)

error_df = pd.DataFrame({
    'reconstruction_error': mse,
    'true_class': y_test,
})
error_df.describe()

# Evaluation of mse on both classes on test set

In [None]:
# Pull the labels out of the test set, then keep only the feature columns.
y_test = df_cred_normalized_test_set.loc[:, "Class"]
df_cred_normalized_test_set = df_cred_normalized_test_set.drop(columns=["Class"])

In [None]:
# Per-row reconstruction error on the mixed-class test set.
predictions = autoencoder.predict(df_cred_normalized_test_set)
squared_residuals = np.power(df_cred_normalized_test_set - predictions, 2)
mse = np.mean(squared_residuals, axis=1)

error_df_test = pd.DataFrame({
    'reconstruction_error': mse,
    'true_class': y_test,
})
error_df_test.describe()

In [None]:
# Histogram of reconstruction errors for class-0 rows, ignoring errors of 10 or more.
fig, ax = plt.subplots()
is_normal_row = error_df_test['true_class'] == 0
is_small_error = error_df_test['reconstruction_error'] < 10
normal_error_df = error_df_test[is_normal_row & is_small_error]
_ = ax.hist(normal_error_df['reconstruction_error'].values, bins=200)

In [None]:
# Histogram of reconstruction errors for class-1 rows.
fig, ax = plt.subplots()
is_fraud_row = error_df_test['true_class'] == 1
fraud_error_df = error_df_test[is_fraud_row]
_ = ax.hist(fraud_error_df['reconstruction_error'].values, bins=100)


In [None]:
# Summary statistics of the test-set rows whose true class is 1.
fraud_error_df.describe() ### fraud cases

In [None]:
# Summary statistics of the test-set rows whose true class is 0
# (only rows with reconstruction error below 10 are included).
normal_error_df.describe() ### non-fraud cases

In [None]:
# Flag a row as fraud (1) when its reconstruction error exceeds 0.001, else 0.
exceeds_threshold = error_df_test["reconstruction_error"] > 0.001
error_df_test["predicted_class"] = exceeds_threshold.astype("int64")

In [None]:
# Display the per-row errors, true labels and predicted labels.
error_df_test

In [None]:
from sklearn.metrics import (confusion_matrix, precision_recall_curve, auc,
                             roc_curve, recall_score, classification_report, f1_score,
                             precision_recall_fscore_support)

In [None]:
# Same 0.001 cut-off as applied in an earlier cell: 1 above the threshold, 0 otherwise.
above_cutoff = error_df_test["reconstruction_error"] > 0.001
error_df_test["predicted_class"] = above_cutoff.astype("int64")

In [None]:
# ROC curve using the raw reconstruction error as the anomaly score.
fpr, tpr, thresholds = roc_curve(error_df_test['true_class'],
                                 error_df_test['reconstruction_error'])
roc_auc = auc(fpr, tpr)

plt.plot(fpr, tpr, label='AUC = {:0.4f}'.format(roc_auc))
plt.plot([0, 1], [0, 1], 'r--')  # chance diagonal for reference
plt.title('Receiver Operating Characteristic')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.xlim([-0.001, 1])
plt.ylim([0, 1.001])
plt.legend(loc='lower right')
plt.show();

In [None]:
# Per-class precision/recall/F1 for the current predicted_class column.
print(classification_report(error_df_test["true_class"],error_df_test["predicted_class"]))

In [None]:
LABELS = ["Normal", "Fraud"]
# Use the predictions already stored in `predicted_class`, so the matrix
# agrees with the classification report. The original recomputed `y_pred`
# with a different cut-off (0.004) and then never used it.
y_pred = error_df_test["predicted_class"].tolist()
# labels=[0, 1] pins the matrix to 2x2 in LABELS order even if one class is
# absent from both the true and predicted labels.
conf_matrix = confusion_matrix(error_df_test["true_class"], y_pred, labels=[0, 1])
plt.figure(figsize=(8, 8))
sns.heatmap(conf_matrix, xticklabels=LABELS, yticklabels=LABELS, annot=True, fmt="d");
plt.title("Confusion matrix")
plt.ylabel('True class')
plt.xlabel('Predicted class')
plt.show()

In [None]:
# Re-label with a higher cut-off: 1 when the error exceeds 0.0039888, else 0.
over_limit = error_df_test["reconstruction_error"] > 0.0039888
error_df_test["predicted_class"] = over_limit.astype("int64")

In [None]:
# Classification report for the predictions made with the updated cut-off.
print(classification_report(error_df_test["true_class"],error_df_test["predicted_class"]))

In [None]:
LABELS = ["Normal", "Fraud"]
# Thresholded predictions kept as a plain list; the matrix itself is built
# from the predicted_class column.
y_pred = [int(err > 0.0039888) for err in error_df_test["reconstruction_error"].values]
conf_matrix = confusion_matrix(error_df_test["true_class"],
                               error_df_test["predicted_class"])

plt.figure(figsize=(8, 8))
sns.heatmap(conf_matrix,
            annot=True,
            fmt="d",
            xticklabels=LABELS,
            yticklabels=LABELS);
plt.title("Confusion matrix")
plt.xlabel('Predicted class')
plt.ylabel('True class')
plt.show()

In [None]:
# ROC curve on the test set, scored by reconstruction error.
scores = error_df_test['reconstruction_error']
fpr, tpr, thresholds = roc_curve(error_df_test['true_class'], scores)
roc_auc = auc(fpr, tpr)

plt.title('Receiver Operating Characteristic')
plt.plot(fpr, tpr, label='AUC = {:0.4f}'.format(roc_auc))
plt.plot([0, 1], [0, 1], 'r--')  # chance diagonal
plt.legend(loc='lower right')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.xlim([-0.001, 1])
plt.ylim([0, 1.001])
plt.show();

### Now the same for the final evaluation set, i.e. our validation set

In [None]:
# Separate labels from features on the validation set, then score every row
# by its reconstruction error. Note this overwrites y_test and error_df_test
# from the test-set evaluation above.
y_test = df_cred_normalized_validation_set.loc[:, "Class"]
df_cred_normalized_validation_set = df_cred_normalized_validation_set.drop(columns=["Class"])

predictions = autoencoder.predict(df_cred_normalized_validation_set)
squared_residuals = np.power(df_cred_normalized_validation_set - predictions, 2)
mse = np.mean(squared_residuals, axis=1)

error_df_test = pd.DataFrame({
    'reconstruction_error': mse,
    'true_class': y_test,
})
error_df_test.describe()

In [None]:
# Flag validation rows whose reconstruction error exceeds 0.003.
# NOTE(review): this cut-off differs from the 0.0039888 used on the test set — confirm it is intended.
beyond_cutoff = error_df_test["reconstruction_error"] > 0.003
error_df_test["predicted_class"] = beyond_cutoff.astype("int64")

In [None]:
# Classification report for the validation-set predictions.
print(classification_report(error_df_test["true_class"],error_df_test["predicted_class"]))

In [None]:
# ROC curve on the validation set, scored by reconstruction error.
true_labels = error_df_test['true_class']
anomaly_scores = error_df_test['reconstruction_error']
fpr, tpr, thresholds = roc_curve(true_labels, anomaly_scores)
roc_auc = auc(fpr, tpr)

plt.plot(fpr, tpr, label='AUC = {:0.4f}'.format(roc_auc))
plt.plot([0, 1], [0, 1], 'r--')  # chance diagonal
plt.legend(loc='lower right')
plt.title('Receiver Operating Characteristic')
plt.ylabel('True Positive Rate')
plt.xlabel('False Positive Rate')
plt.ylim([0, 1.001])
plt.xlim([-0.001, 1])
plt.show();

In [None]:
LABELS = ["Normal", "Fraud"]
# Use the predictions already stored in `predicted_class`, so the matrix
# agrees with the classification report. The original recomputed `y_pred`
# with a different cut-off (0.00398888) and then never used it.
y_pred = error_df_test["predicted_class"].tolist()
# labels=[0, 1] pins the matrix to 2x2 in LABELS order even if one class is
# absent from both the true and predicted labels.
conf_matrix = confusion_matrix(error_df_test["true_class"], y_pred, labels=[0, 1])
plt.figure(figsize=(8, 8))
sns.heatmap(conf_matrix, xticklabels=LABELS, yticklabels=LABELS, annot=True, fmt="d");
plt.title("Confusion matrix")
plt.ylabel('True class')
plt.xlabel('Predicted class')
plt.show()