In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import plotly.express as px
from sklearn.model_selection import  train_test_split
from sklearn.metrics import classification_report,accuracy_score,roc_auc_score
from imblearn.over_sampling import SMOTE,ADASYN
from imblearn.under_sampling import  RandomUnderSampler
from sklearn.preprocessing import MinMaxScaler,RobustScaler,StandardScaler
from scipy.stats import norm
import plotly.graph_objects as go
import warnings
import pickle
import os
warnings.filterwarnings("ignore")

In [None]:
df = pd.read_csv('creditcard.csv')

In [None]:
df.head()

In [None]:
df.info()

In [None]:
df.describe()

In [None]:
# Absolute and relative frequency of each Class value, placed side by side.
class_totals = df['Class'].value_counts().to_frame()
class_fractions = df['Class'].value_counts(normalize=True).to_frame()
Class_imbalance = pd.concat([class_totals, class_fractions], axis=1)

In [None]:
Class_imbalance.columns = ['no of Transactions','Percentage']

In [None]:
# Label the rows by class value rather than by position, so the labels stay
# correct whatever order value_counts() returned the classes in. The mapping
# is a no-op on a second run, which keeps the cell re-runnable.
Class_imbalance = Class_imbalance.rename(index={0: "no fraud", 1: "fraud"})
Class_imbalance

In [None]:
# Aggregate before plotting: handing the raw Class column to go.Pie sends one
# label per transaction to the renderer only for Plotly to count them again.
pie_counts = df['Class'].value_counts()
fg = go.Figure(data=[go.Pie(labels=pie_counts.index, values=pie_counts.values, hole=0.5)])
fg.show()

In [None]:
df_copy = df.copy()

In [None]:
def Scaler_def(_Scaler,df_copy):
    """Scale ``Amount`` and ``Time`` with ``_Scaler`` and plot both distributions.

    Parameters
    ----------
    _Scaler : object exposing ``fit_transform``
        Fitted twice, first on ``Amount`` and then on ``Time``; after the call
        it therefore holds the ``Time`` fit only.
    df_copy : pandas.DataFrame
        Frame with ``Amount`` and ``Time`` columns. It is modified in place:
        the columns ``Scaled_Amount`` and ``Scaled_Time`` are added (or
        overwritten if they already exist).

    Returns
    -------
    None
        The two-panel figure is displayed with ``plt.show()``.
    """
    # Scalers expect a 2-D input, hence the reshape to a single column.
    amount_col = df_copy['Amount'].values.reshape(-1,1)
    time_col = df_copy['Time'].values.reshape(-1,1)

    df_copy['Scaled_Amount'] = _Scaler.fit_transform(amount_col)
    df_copy['Scaled_Time'] = _Scaler.fit_transform(time_col)

    fig, ax = plt.subplots(1, 2, figsize=(18,4))

    # (column to plot, panel title, colour) for the left and right panel.
    panels = (
        ('Scaled_Amount', 'Distribution of Transaction Amount', 'r'),
        ('Scaled_Time', 'Distribution of Transaction Time', 'b'),
    )
    for axis, (column, title, colour) in zip(ax, panels):
        values = df_copy[column].values
        sns.distplot(values, ax=axis, color=colour)
        axis.set_title(title, fontsize=14)
        # ndarray.min()/max() rather than the builtins, which walk the array
        # element by element in Python.
        axis.set_xlim([values.min(), values.max()])

    plt.show()

In [None]:
# The RobustScaler run is applied to `df` itself (not `df_copy`): Scaler_def
# writes the Scaled_Amount / Scaled_Time columns into the frame it is given,
# and later cells read df['Scaled_Time'] and df['Scaled_Amount'].
robust_scaler = RobustScaler()
Scaler_def(robust_scaler,df)

In [None]:
MinMax_Scaler = MinMaxScaler()
Scaler_def(MinMax_Scaler,df_copy)

In [None]:
Standard_Scaler = StandardScaler()
Scaler_def(Standard_Scaler,df_copy)

In [None]:
df.describe()

In [None]:
#df.drop(['Time','Amount'],inplace=True,axis=1)

In [None]:
# Copy the scaled columns to positions 0 and 1 of the frame under lower-case
# names, then preview the result.
# NOTE(review): DataFrame.insert presumably refuses to insert a column name
# that already exists, so this cell cannot be run twice on the same df — confirm.
df.insert(0,'scaled_Time',df['Scaled_Time'])
df.insert(1,'scaled_Amount',df['Scaled_Amount'])
df.head()

In [None]:
df.drop(['Scaled_Amount','Scaled_Time'],inplace=True,axis=1)

In [None]:
df.head(2)

In [None]:
df.T.head()

In [None]:
fig, axs = plt.subplots(ncols=2, figsize=(16,4))

# One panel per class: (Class value, colour, panel title).
panels = [
    (1, 'red', "Distribution of Fraud Transactions"),
    (0, 'green', "Distribution of Genuine Transactions"),
]
for axis, (label, colour, title) in zip(axs, panels):
    class_times = df.loc[df['Class'] == label, 'scaled_Time']
    sns.distplot(class_times, bins=100, color=colour, ax=axis)
    axis.set_title(title)

plt.show()

In [None]:
plt.figure(figsize=(8,6))

# Colour the boxes through `palette` (green for Class 0, salmon for Class 1).
# Indexing ax.artists raises IndexError on seaborn/matplotlib versions that
# register the boxes in ax.patches; saturation=1 keeps the exact hex colours.
ax = sns.boxplot(x='Class', y='scaled_Time', data=df,
                 palette=['#90EE90', '#FA8072'], saturation=1)

plt.title('Time Distribution for Fraud and Genuine transactions')
plt.show()

In [None]:
plt.figure(figsize=(8 , 6))

# Colour the boxes through `palette` (green for Class 0, salmon for Class 1).
# Indexing ax.artists raises IndexError on seaborn/matplotlib versions that
# register the boxes in ax.patches; saturation=1 keeps the exact hex colours.
ax = sns.boxplot(x='Class', y='scaled_Amount', data=df,
                 palette=['#90EE90', '#FA8072'], saturation=1)

# The plotted variable is scaled_Amount; the title previously said "Time".
plt.title('Amount Distribution for Fraud and Genuine transactions')
plt.show()

In [None]:
# Shuffle with a fixed seed so the balanced sample is the same on every run.
df = df.sample(frac=1, random_state=42)

fraud_df = df.loc[df['Class'] == 1]
# Take as many genuine rows as there are fraud rows instead of hard-coding 492.
non_fraud_df = df.loc[df['Class'] == 0][:len(fraud_df)]

normal_distributed_df = pd.concat([fraud_df, non_fraud_df])

In [None]:
fig = px.scatter(normal_distributed_df,x='scaled_Time',y='scaled_Amount',color='Class' ,marginal_x="histogram", marginal_y="violin",template="simple_white")
fig.show()

In [None]:
fig = px.scatter(normal_distributed_df,x='scaled_Time',y='scaled_Amount',facet_col='Class' ,marginal_x="histogram", marginal_y="rug",trendline='ols',template='ggplot2')
fig.show()

In [None]:
plt.title('Pearson Correlation Matrix')
sns.heatmap(df[['scaled_Time', 'scaled_Amount','Class']].corr(),linewidths=0.25,vmax=0.7,square=True,cmap="summer",
            linecolor='w',annot=True);

In [None]:
print('{} and {}'.format(np.max(df.V1),np.min(df.V1)))

In [None]:
import  math

In [None]:
# Converting time from seconds to hours. This overwrites 'Time' in place, so
# the cell must run exactly once per load of df.
df['Time'] = df['Time'] / 3600
# Hour of the day (0-23)
df['hour'] = np.floor(df['Time'] % 24).astype('int64')   # 2 days of data
# Day number, 1-based. Floor division keeps a timestamp at exactly 24h in
# day 2 (where its hour is 0); ceil() assigned it to day 1 and needed a
# special case for 0.
df['day'] = (df['Time'] // 24).astype('int64') + 1   # 2 days of data
df[['Time','hour','day','Class','Amount']].head()

In [None]:
df.head()

In [None]:
df.day.nunique()

In [None]:
# Shuffle with a fixed seed so the balanced sample is the same on every run.
df = df.sample(frac=1, random_state=42)

fraud_df = df.loc[df['Class'] == 1]
# Take as many genuine rows as there are fraud rows instead of hard-coding 492.
non_fraud_df = df.loc[df['Class'] == 0][:len(fraud_df)]

normal_distributed_df = pd.concat([fraud_df, non_fraud_df])

In [None]:
fig = px.scatter(normal_distributed_df,x='hour',y='scaled_Amount',color='Class' ,marginal_x="histogram", marginal_y="violin",template="simple_white")
fig.show()

In [None]:
fig = px.scatter(normal_distributed_df,x='hour',y='scaled_Amount',facet_col='Class' ,marginal_x="histogram", marginal_y="rug",trendline='ols',template='ggplot2')
fig.show()

In [None]:
fig = px.scatter(df,x='hour',y='scaled_Amount',facet_col='Class' ,marginal_x="box", marginal_y="rug",trendline='ols',template='ggplot2')
fig.show()

In [None]:
import pickle
import os

In [None]:
df.hist(figsize=(30,30))
plt.show()

In [None]:
df.shape

In [None]:
fig = px.histogram(df,x='Amount',y='scaled_Time',color='Class',nbins=20)
fig.show()

In [None]:
df.columns

In [None]:
# Features: drop the target, the raw 'Amount' and 'Time' columns (their scaled
# counterparts stay; 'Time' in hours is only a linear rescaling of
# 'scaled_Time') and the 'hour'/'day' columns derived from 'Time'.
X = df.drop(['Time','hour','Amount','Class','day'],axis=1)
y = df.Class

print(X.columns)
# Split the data into training and testing set. Stratify on the target so the
# rare fraud class keeps the same share in both splits.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.30, shuffle=True, random_state=101, stratify=y)

# Quick sanity check with the shapes of Training and testing datasets
print("X_train - ",X_train.shape)
print("y_train - ",y_train.shape)
print("X_test - ",X_test.shape)
print("y_test - ",y_test.shape)

In [None]:
from sklearn.linear_model import LogisticRegression # Importing Classifier Step

In [None]:
# Same iteration budget as the SMOTE/ADASYN models in this notebook. Warnings
# are silenced globally, so a solver that stops at the default limit without
# converging would go unnoticed.
log_reg = LogisticRegression(max_iter=1000)
log_reg.fit(X_train,y_train)

In [None]:
y_pred = log_reg.predict(X_test)
y_pred

In [None]:
# classification_report takes (y_true, y_pred); with the arguments reversed the
# precision and recall columns are swapped.
print(classification_report(y_test,y_pred))

In [None]:
# Score the AUC on the predicted fraud probability: hard 0/1 labels collapse
# the ROC curve to a single operating point.
roc_auc_score(y_test,log_reg.predict_proba(X_test)[:, 1])

In [None]:
# Class shares and counts of the predictions next to those of the ground truth.
# The original concatenated the prediction counts four times, so the table
# compared y_pred with itself.
pd.concat(
    [
        pd.Series(y_pred).value_counts(normalize=True),
        pd.Series(y_pred).value_counts(normalize=False),
        y_test.value_counts(normalize=True),
        y_test.value_counts(normalize=False),
    ],
    axis=1,
    keys=['predicted share', 'predicted count', 'actual share', 'actual count'],
)

In [None]:
from sklearn.metrics import confusion_matrix

In [None]:
print(confusion_matrix(y_test,y_pred))

In [None]:
# Confusion matrix values recorded from an earlier run (presumably the printed
# output of the previous cell). The comma between the two rows is required:
# without it Python reads the second bracket as an index into the first list
# and raises TypeError.
[[85296 ,   10],
 [   55  ,  82]]

In [None]:
import plotly.figure_factory as ff

# Annotate with the matrix computed for the current split (the default when no
# annotation_text is given) instead of numbers pasted from an earlier run, and
# actually display the figure. The stray plt.Figure(...) call was a no-op.
cm_baseline = confusion_matrix(y_test,y_pred)
fig = ff.create_annotated_heatmap(cm_baseline,colorscale='Viridis')
fig.show()


In [None]:
import itertools


# Create a confusion matrix
def plot_confusion_matrix(cm, classes,
                          normalize=False,
                          title='Confusion matrix',
                          cmap=plt.cm.Blues):
    """
    Print the confusion matrix and draw it on the current matplotlib figure.

    Parameters
    ----------
    cm : numpy.ndarray
        Square matrix of counts; rows are labelled 'True label' and columns
        'Predicted label' on the plot.
    classes : sequence
        Tick labels, one per row/column of ``cm``.
    normalize : bool
        When True every row is divided by its sum before printing/plotting.
        Rows that sum to zero are left as zeros instead of producing NaN.
    title : str
        Title of the plot.
    cmap : matplotlib colormap
        Colormap handed to ``plt.imshow``.
    """
    if normalize:
        row_totals = cm.sum(axis=1, keepdims=True)
        # `where` skips empty rows so they stay 0 rather than becoming NaN.
        cm = np.divide(cm.astype('float'), row_totals,
                       out=np.zeros(cm.shape, dtype=float),
                       where=row_totals != 0)
        print("Normalized confusion matrix")
    else:
        print('Confusion matrix, without normalization')

    print(cm)

    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title, fontsize=14)
    plt.colorbar()
    tick_marks = np.arange(len(classes))
    plt.xticks(tick_marks, classes, rotation=45)
    plt.yticks(tick_marks, classes)

    fmt = '.2f' if normalize else 'd'
    # Cells darker than half the maximum get white text for contrast.
    thresh = cm.max() / 2.
    for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
        plt.text(j, i, format(cm[i, j], fmt),
                 horizontalalignment="center",
                 color="white" if cm[i, j] > thresh else "black")

    plt.ylabel('True label')
    plt.xlabel('Predicted label')
    # Run the layout pass last so the axis labels are taken into account.
    plt.tight_layout()

In [None]:
labels = ['No Fraud', 'Fraud']
conf_matrix = confusion_matrix(y_test,y_pred)

fig = plt.figure(figsize=(6,6))

# y_pred comes from the baseline model fitted on the un-resampled training set,
# so the title must not say "Random UnderSample".
plot_confusion_matrix(conf_matrix, labels, title="Logistic Regression \n Confusion Matrix", cmap=plt.cm.Reds)

In [None]:
# Heatmap for Confusion Matrix
def plot_conf_matrix(conf_matrix):
    """Draw ``conf_matrix`` as an annotated seaborn heatmap and show it.

    Parameters
    ----------
    conf_matrix : array-like
        2x2 confusion matrix; rows are labelled 'Actual' and columns
        'Predicted', with tick labels 'Genuine' and 'Fraud'.
    """
    heatmap_ax = sns.heatmap(pd.DataFrame(conf_matrix), annot=True, annot_kws={"size": 25}, cmap="winter" ,fmt='g')

    plt.title('Confusion matrix', y=1.1, fontsize = 22)
    plt.ylabel('Actual',fontsize = 18)
    plt.xlabel('Predicted',fontsize = 18)

    # Set the tick labels on the heatmap's own axes. The previous code used a
    # global `ax` left over from an earlier cell, so the labels went to the
    # wrong plot (or raised NameError on a fresh kernel).
    heatmap_ax.xaxis.set_ticklabels(['Genuine', 'Fraud'])
    heatmap_ax.yaxis.set_ticklabels(['Genuine', 'Fraud'])

    plt.show()

In [None]:
plot_conf_matrix(confusion_matrix(y_test,y_pred))

In [None]:
# plot ROC Curve
from sklearn import metrics
def plot_auc(y_test, y_pred):
    """Print the ROC AUC and plot the ROC curve.

    Parameters
    ----------
    y_test : array-like
        True binary labels.
    y_pred : array-like
        Predicted labels or scores; passed unchanged to ``metrics.roc_curve``
        and ``metrics.roc_auc_score``.
    """
    fpr, tpr, _ = metrics.roc_curve(y_test, y_pred)

    auc = metrics.roc_auc_score(y_test, y_pred)
    print("AUC - ",auc,"\n")

    # Apply the font size only while this figure is built. Assigning to
    # plt.rcParams after drawing left the first call at the old size and
    # silently changed every later figure in the notebook.
    with plt.rc_context({'font.size': 12}):
        plt.figure(figsize=(8,6))

        plt.plot(fpr,tpr,linewidth=2, label="data 1, auc="+str(auc))
        plt.legend(loc=4)

        # Diagonal reference line of a random classifier.
        plt.plot([0,1], [0,1], 'k--' )

        plt.title('ROC curve for Predicting a credit card fraud detection')
        plt.xlabel('False Positive Rate (1 - Specificity)')
        plt.ylabel('True Positive Rate (Sensitivity)')

        plt.show()

In [None]:
plot_auc(y_test,y_pred)

## Random Under Sampling Method with Logistic Regression

In [None]:
from collections import Counter # counter takes values returns value_counts dictionary
from sklearn.datasets import make_classification

In [None]:
class_counts_before = Counter(y_train)
print('Original dataset shape %s' % class_counts_before)

# Resample the training split only; the test split is left untouched.
rus = RandomUnderSampler(random_state=42)
resampled = rus.fit_resample(X_train, y_train)
X_train_rus, y_train_rus = resampled

class_counts_after = Counter(y_train_rus)
print('Resampled dataset shape %s' % class_counts_after)

In [None]:
# Same iteration budget as the SMOTE/ADASYN models in this notebook. Warnings
# are silenced globally, so a solver that stops at the default limit without
# converging would go unnoticed.
log_reg_rus = LogisticRegression(max_iter=1000)
log_reg_rus.fit(X_train_rus,y_train_rus)

y_pred_rus = log_reg_rus.predict(X_test)

In [None]:
def print_metrics(y_test,y_pred_rus):
    """Print accuracy, ROC AUC, precision, recall and F1, each to five decimals.

    Parameters
    ----------
    y_test : array-like
        True labels.
    y_pred_rus : array-like
        Predicted labels. The notebook calls this with the predictions of
        every model, not only the under-sampled one.
    """
    # (output template, scoring function) in print order. Accuracy is
    # symmetric in its two arguments, so one call order serves every metric.
    report_lines = (
        ('Accuracy :{0:0.5f}', metrics.accuracy_score),
        ('AUC : {0:0.5f}', metrics.roc_auc_score),
        ('Precision : {0:0.5f}', metrics.precision_score),
        ('Recall : {0:0.5f}', metrics.recall_score),
        ('F1 : {0:0.5f}', metrics.f1_score),
    )
    for template, scorer in report_lines:
        print(template.format(scorer(y_test, y_pred_rus)))

In [None]:
print_metrics(y_test,y_pred_rus)

In [None]:
plot_conf_matrix(confusion_matrix(y_test,y_pred_rus))

In [None]:
plot_auc(y_test,y_pred_rus)

## Random Oversampling Method with Logistic Regression

In [None]:
from imblearn.over_sampling import RandomOverSampler
print('Original dataset shape %s' % Counter(y_train))

ros = RandomOverSampler(random_state=42)
X_train_ros, y_train_ros = ros.fit_resample(X_train, y_train)

print('Resampled dataset shape %s' % Counter(y_train_ros))

In [None]:
# Same iteration budget as the SMOTE/ADASYN models in this notebook. Warnings
# are silenced globally, so a solver that stops at the default limit without
# converging would go unnoticed.
log_reg_ros = LogisticRegression(max_iter=1000)
log_reg_ros.fit(X_train_ros,y_train_ros)

y_pred_ros = log_reg_ros.predict(X_test)

In [None]:
print_metrics(y_test,y_pred_ros)

In [None]:
plot_conf_matrix(confusion_matrix(y_test,y_pred_ros))

In [None]:
plot_auc(y_test,y_pred_ros)

## Using SMOTE with Logistic Regression

In [None]:
print('Original dataset shape %s' % Counter(y_train))

smote = SMOTE(random_state=42)
X_train_smote, y_train_smote = smote.fit_resample(X_train, y_train)

print('Resampled dataset shape %s' % Counter(y_train_smote))

In [None]:
log_reg_smote = LogisticRegression(max_iter=1000)
log_reg_smote.fit(X_train_smote,y_train_smote)

y_pred_smote = log_reg_smote.predict(X_test)

In [None]:
print_metrics(y_test,y_pred_smote)

In [None]:
plot_conf_matrix(confusion_matrix(y_test,y_pred_smote))

In [None]:
plot_auc(y_test,y_pred_smote)

## Using ADASYN with Logistic Regression

In [None]:
print('Original dataset shape %s' % Counter(y_train))

adasyn = ADASYN(random_state=42)
# Resample with the ADASYN sampler created above. The previous code called
# smote.fit_resample here, so this section silently repeated the SMOTE run.
X_train_adasyn, y_train_adasyn = adasyn.fit_resample(X_train, y_train)

print('Resampled dataset shape %s' % Counter(y_train_adasyn))

In [None]:
log_reg_adasyn = LogisticRegression(max_iter=1000)
log_reg_adasyn.fit(X_train_adasyn,y_train_adasyn)

y_pred_adasyn = log_reg_adasyn.predict(X_test)

In [None]:
print_metrics(y_test,y_pred_adasyn)

In [None]:
plot_conf_matrix(confusion_matrix(y_test,y_pred_adasyn))

In [None]:
plot_auc(y_test,y_pred_adasyn)