In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import scipy.stats as stats
import matplotlib.pyplot as plt
# Apply matplotlib's 'ggplot' style sheet to the figures created after this point.
plt.style.use('ggplot')

# Seed NumPy's global random generator so NumPy-based randomness is repeatable
# when the notebook is executed top to bottom on a fresh kernel.
np.random.seed(42) 

Exploratory Data Analysis

In [None]:
# Credit-card transactions CSV; the path is relative to the notebook's working directory.
DATA_PATH = '../input/creditcardfraud/creditcard.csv'
df = pd.read_csv(DATA_PATH)

In [None]:
# Preview the first five rows (five is head()'s default).
df.head()

In [None]:
# df.shape is (rows, columns); unpack it straight into the message.
print("This data frame has {} rows and {} columns".format(*df.shape))

In [None]:
# Print the column dtypes and non-null counts to spot missing values early.
df.info()

Analyze known values

In [None]:
# Use the fully qualified option key: the bare "precision" shorthand is ambiguous
# on pandas versions that also define "styler.format.precision" and raises there.
pd.set_option("display.precision", 5)

# Summary statistics for the two features that are not anonymised components.
df.loc[:, ['Time', 'Amount']].describe()

In [None]:
# Histogram of the Time column on an explicitly created axes.
fig, ax = plt.subplots(figsize=(10, 8))
ax.set_title("Distribution of Time Feature")
sns.histplot(df["Time"], ax=ax)

In [None]:
# Density histogram with a KDE overlay for the Amount column.
# sns.distplot is deprecated; histplot(kde=True, stat="density") is its documented
# replacement and is consistent with the Time plot in this notebook.
fig, ax = plt.subplots(figsize=(10, 8))
ax.set_title('Distribution of Monetary Value Feature')
# NOTE(review): bins=50 mirrors the upper bound distplot applied to its automatic
# bin count - adjust if a finer histogram is wanted.
sns.histplot(df["Amount"], kde=True, stat="density", bins=50, ax=ax)

In [None]:
# Count the rows per Class label; label 0 is treated as genuine and label 1 as fraud.
counts = df['Class'].value_counts()
print(counts)

genuine, fraud = counts[0], counts[1]
total = fraud + genuine

# Express both classes as percentages of all transactions.
perc_fraud = fraud / total * 100
perc_genuine = 100 - perc_fraud

summary = "Fraudulent transactions: ({:.3f}%), Genuine transactions: ({:.3f}%)"
print(summary.format(perc_fraud, perc_genuine))

In [None]:
# Pairwise correlation of every column in the full (imbalanced) frame.
corr = df.corr()

fig, ax = plt.subplots(figsize=(12, 10))
heat = sns.heatmap(data=corr, ax=ax)
ax.set_title("Heatmap of correlation")

Scaling Amount and Time

In [None]:
from sklearn.preprocessing import StandardScaler, RobustScaler
from itertools import chain

# RobustScaler is chosen over StandardScaler here because, per the author's note,
# it copes better with outliers.
rob_scaler = RobustScaler() 

In [None]:
# Fit one scaler per feature. Re-fitting a single shared scaler on Amount and then
# on Time leaves it holding only the Time statistics, so Amount could never be
# transformed or inverse-transformed consistently afterwards.
amount_scaler = RobustScaler()
time_scaler = RobustScaler()

# fit_transform returns an (n, 1) array; flatten it so a plain 1-D column is inserted.
scaled_amount = amount_scaler.fit_transform(df['Amount'].values.reshape(-1, 1)).ravel()
scaled_time = time_scaler.fit_transform(df['Time'].values.reshape(-1, 1)).ravel()

# Put the scaled columns first, then drop the raw ones.
# NOTE: this cell mutates df in place, so it cannot be re-run without reloading the data.
df.insert(0, 'scaled_time', scaled_time)
df.insert(1, 'scaled_amount', scaled_amount)
df.drop(['Time', 'Amount'], axis=1, inplace=True)
df.head()

Preparing training and testing data

In [None]:
from sklearn.model_selection import StratifiedKFold

# Recap of the class imbalance computed during the exploratory analysis.
print("Fraudulent transactions: ({:.3f}%), Genuine transactions: ({:.3f}%)".format(perc_fraud, perc_genuine))

X = df.drop(columns='Class')
y = df['Class']

skf = StratifiedKFold(n_splits=5, shuffle=False, random_state=None)

# Materialise the five folds once so they can be reported and then reused.
fold_indices = list(skf.split(X, y))
for train_index, test_index in fold_indices:
    print("Train:", train_index, "Test:", test_index)

# Only the LAST fold is kept as the "original" train/test split; the earlier
# folds are printed above but otherwise unused.
train_index, test_index = fold_indices[-1]

print('-' * 100)
print('Label Distributions: ')

# Keep plain NumPy arrays for the downstream estimators.
original_Xtrain = X.iloc[train_index].values
original_Xtest = X.iloc[test_index].values
original_ytrain = y.iloc[train_index].values
original_ytest = y.iloc[test_index].values

# Sanity check: both partitions should show the same class proportions.
train_unique_label, train_counts_label = np.unique(original_ytrain, return_counts=True)
test_uniqe_label, test_counts_label = np.unique(original_ytest, return_counts=True)
print(train_counts_label)
print("Y Train", train_counts_label / len(original_ytrain))
print("Y Test", test_counts_label / len(original_ytest))

Undersampling - Prepare data to fit into models

In [None]:
# Shuffle into a NEW frame with an explicit seed: the cell stays re-runnable (df is
# not overwritten) and the chosen genuine rows do not depend on how much global
# randomness earlier cells consumed.
shuffled_df = df.sample(frac=1, random_state=42)

# Keep every fraudulent transaction and an equally sized slice of genuine ones.
# The size is derived from the data instead of a hard-coded 492.
fraud_df = shuffled_df.loc[shuffled_df['Class'] == 1]
non_fraud_df = shuffled_df.loc[shuffled_df['Class'] == 0].iloc[:len(fraud_df)]

# Interleave the two classes so row order carries no label information.
new_df = pd.concat([fraud_df, non_fraud_df]).sample(frac=1, random_state=16)
new_df.head()



In [None]:
# Share of each class in the under-sampled frame.
class_share = new_df['Class'].value_counts().div(len(new_df))

print("Distribution of the classes in undersampling")
print(class_share)

In [None]:
# Blue for genuine (0), red for fraud (1); reused by the box plots further down.
colors = ["#0101DF", "#DF0101"]
plt.title("Equally distributed classes")
# Pass the column by keyword: on seaborn releases where `x` is keyword-only, a
# positional 'Class' is bound to `data` and clashes with data=new_df.
sns.countplot(x='Class', data=new_df, palette=colors)

In [None]:
# Correlation matrix of the balanced subsample.
subsample_corr = new_df.corr()

fig, ax = plt.subplots(figsize=(12, 10))
subsample_heat = sns.heatmap(data=subsample_corr, ax=ax)
ax.set_title("Heatmap of sub sample correlation")

Negative correlation with Class: V3, V9, V10, V12, V14, V16, V17 -> the lower the feature value, the more likely the transaction is fraudulent


Positive correlation with Class: V4, V11 -> the higher the feature value, the more likely the transaction is fraudulent


In [None]:
# Keep only each feature's correlation with the label, as a one-column frame.
corr = new_df.corr()[['Class']]

# Features strongly negatively correlated with Class.
corr.loc[corr['Class'] < -0.5]


In [None]:
# Features strongly positively correlated with Class.
corr.loc[corr['Class'] > 0.5]

Negative correlation

In [None]:
# Features negatively correlated with Class (the lower the value, the more likely
# the transaction is fraudulent). V10 belongs here; V11 is positively correlated
# and is drawn in the positive-correlation figure instead.
negative_features = ["V3", "V9", "V10", "V12", "V14", "V16", "V17"]

f, axes = plt.subplots(nrows=2, ncols=4, figsize=(26, 16))
flat_axes = axes.ravel()

# One box plot per feature, split by class.
for ax, feature in zip(flat_axes, negative_features):
    sns.boxplot(x="Class", y=feature, data=new_df, palette=colors, ax=ax)
    ax.set_title('{} vs Class Negative Correlation'.format(feature))

# Remove the grid cells that have no feature assigned (the eighth panel).
for unused_ax in flat_axes[len(negative_features):]:
    f.delaxes(unused_ax)

plt.show()


Positive correlation

In [None]:
# Features positively correlated with Class, drawn left to right.
positive_features = ("V11", "V4")

f, axes = plt.subplots(ncols=2, figsize=(14, 8))

for ax, feature in zip(axes, positive_features):
    sns.boxplot(x="Class", y=feature, data=new_df, palette=colors, ax=ax)
    ax.set_title("{} vs Class Positive Correlation".format(feature))

plt.show()

Remove outliers

In [None]:
# Per-column quartiles and interquartile range of the balanced subsample.
Q1 = new_df.quantile(0.25)
Q3 = new_df.quantile(0.75)
IQR = Q3 - Q1

print("Before:", len(new_df))

# A value is an outlier when it lies more than 2.5 IQRs outside the quartiles.
lower_bound = Q1 - 2.5 * IQR
upper_bound = Q3 + 2.5 * IQR
is_outlier = new_df.lt(lower_bound) | new_df.gt(upper_bound)

# Drop every row that has an outlier in at least one column.
# NOTE: new_df is reassigned, so re-running this cell filters the data again.
new_df = new_df[~is_outlier.any(axis=1)]

print("After", len(new_df))
new_df.head()

Dimension reduction

In [None]:
from sklearn.manifold import TSNE

# Features and labels of the cleaned, balanced subsample.
X = new_df.drop(columns='Class')
y = new_df['Class']

# Project the features to two dimensions with t-SNE (seeded for repeatability).
tsne = TSNE(n_components=2, random_state=42)
X_reduced_tsne = tsne.fit_transform(X.values)

print(X_reduced_tsne)

In [None]:
# t-SNE scatter plot
import matplotlib.patches as mpatches

f, ax = plt.subplots(figsize=(20, 10))

# (label value, colour, legend text) for each class.
class_styles = [(0, '#0A0AFF', 'No Fraud'), (1, '#AF0000', 'Fraud')]

blue_patch = mpatches.Patch(color=class_styles[0][1], label=class_styles[0][2])
red_patch = mpatches.Patch(color=class_styles[1][1], label=class_styles[1][2])

# Draw each class once, using only its own points. The previous version plotted
# ALL points twice at the same coordinates, so the second call hid the first and
# the colormap colours did not match the legend patches.
labels = np.asarray(y)
for class_value, color, legend_text in class_styles:
    mask = labels == class_value
    ax.scatter(X_reduced_tsne[mask, 0], X_reduced_tsne[mask, 1],
               c=color, label=legend_text, linewidths=2)

ax.set_title('t-SNE', fontsize=14)

ax.grid(True)

ax.legend(handles=[blue_patch, red_patch])

Run Classification Algos

In [None]:
"""
Running algos:
1. Decision trees with some pruning
2. Neural networks: many layers and any activation function you see fit
3. Boosting for the decision tree
4. SVM: use least 2 kernels
5. k-nearest neighbors -> use different k
"""
from sklearn.model_selection import train_test_split
# Work on the under-sampled frame: the subsample is taken before any cross-validation.
X = new_df.drop(columns='Class')
y = new_df['Class']

# Hold out 20% of the subsample; .values turns every part into a plain NumPy array.
X_train, X_test, y_train, y_test = (
    part.values
    for part in train_test_split(X, y, test_size=0.2, random_state=42)
)

Create neural network from Keras

In [None]:
import keras
from keras.models import Sequential
from keras.layers.core import Dense
from keras.wrappers.scikit_learn import KerasClassifier

# Width of the input layer: one unit per feature column of the training matrix.
input_size = X_train.shape[1]


def create_network(optimizer="adam", activation="relu"):
    """Build and compile a small fully connected binary classifier.

    Args:
        optimizer: Keras optimizer name or instance passed to ``model.compile``.
        activation: Activation used by the two HIDDEN layers only.

    Returns:
        A compiled ``Sequential`` model with two output units (one per class).
    """
    model = Sequential([
        Dense(input_size, input_shape=(input_size, ), activation=activation),
        Dense(32, activation=activation),
        # The output layer must produce class probabilities for
        # sparse_categorical_crossentropy (which assumes probabilities unless
        # from_logits=True). Reusing the hidden activation (relu/tanh) here fed
        # the loss values that are not a probability distribution.
        Dense(2, activation="softmax"),
    ])
    model.compile(optimizer=optimizer, loss='sparse_categorical_crossentropy', metrics=['accuracy'])
    return model


# scikit-learn compatible wrapper so the network can be used with
# cross_val_score / GridSearchCV alongside the other classifiers.
nn_model = KerasClassifier(build_fn=create_network, verbose=0)

In [None]:
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import cross_val_score

# Untuned baseline of every model family, keyed by display name.
classifiers = {
    "K Nearest Neighbor": KNeighborsClassifier(),
    "Decision Tree": DecisionTreeClassifier(),
    "Support Vector Machine": SVC(),
    "Gradient Boosting Classifier": GradientBoostingClassifier(),
    "Neural Network Classifier": nn_model,
}

# 5-fold cross-validation on the training split gives a reference score per
# model before any hyperparameter tuning.
report_line = "Classifier: {}, training score: {}%"
for key, clf in classifiers.items():
    training_score = cross_val_score(clf, X_train, y_train, cv=5)
    mean_score_pct = training_score.mean() * 100
    print(report_line.format(key, mean_score_pct))

Hyperparameter tuning

In [None]:
from sklearn.model_selection import GridSearchCV

def _grid_search(estimator, param_grid, label):
    """Fit a default GridSearchCV on the training split and report the winner.

    Args:
        estimator: Unfitted scikit-learn estimator to tune.
        param_grid: Mapping of parameter name to the candidate values to try.
        label: Text printed in front of the best parameter combination.

    Returns:
        The fitted ``GridSearchCV`` object.
    """
    search = GridSearchCV(estimator, param_grid)
    search.fit(X_train, y_train)
    print(label, search.best_params_)
    return search


# K nearest neighbor
knn_params = {
    "n_neighbors": list(range(2, 5)),
    'algorithm': ['auto', 'ball_tree', 'kd_tree', 'brute'],
}
knn_grid = _grid_search(KNeighborsClassifier(), knn_params, "K Nearest Neighbor:")
knn_best_params = knn_grid.best_params_

# Decision Tree
dc_params = {
    "criterion": ["gini", "entropy"],
    "max_depth": list(range(2, 5)),
    "min_samples_leaf": list(range(5, 7)),
}
dc_grid = _grid_search(DecisionTreeClassifier(), dc_params, "Decision Tree:")
dc_best_params = dc_grid.best_params_

# Support Vector Machine
svc_params = {
    'C': [0.5, 0.7, 0.9, 1],
    'kernel': ['rbf', 'poly', 'sigmoid', 'linear'],
}
svc_grid = _grid_search(SVC(), svc_params, "Support Vector Machine:")
svc_best_params = svc_grid.best_params_

# Gradient Boosting Classifier
gb_params = {
    "n_estimators": [50, 150, 300],
    "max_depth": list(range(1, 7, 2)),
    "min_samples_leaf": [7, 9, 11, 13],
}
gb_grid = _grid_search(GradientBoostingClassifier(), gb_params, "Gradient Boosting:")
gb_best_params = gb_grid.best_params_

# Keras Neural Net Classifier
# The training-length key must be "epochs": "nb_epoch" is the legacy Keras 1 name
# and is not an argument of Keras 2's fit(), so it is not forwarded to training.
# NOTE(review): with the old key the three "nb_epoch" candidates appear to have
# trained identically - confirm against the installed Keras wrapper.
nn_params = {
    "epochs": [5, 10, 15],
    "batch_size": [5, 25, 50],
    "optimizer": ["adam", "sgd"],
    "activation": ["tanh", "relu", "softmax"],
}
nn_grid = GridSearchCV(nn_model, nn_params)
nn_grid.fit(X_train, y_train)
nn_best_params = nn_grid.best_params_
print("Neural network", nn_best_params)

Analysis

Plot learning curve

In [None]:
from sklearn.model_selection import learning_curve

# reference https://scikit-learn.org/stable/auto_examples/model_selection/plot_learning_curve.html#sphx-glr-auto-examples-model-selection-plot-learning-curve-py
def plot_learning_curve(estimator, title, X, y, axes=None, ylim=None, cv=None, n_jobs=None, train_sizes=np.linspace(.1, 1.0, 5)):
    """Draw three diagnostic panels for an estimator's learning behaviour.

    Panel 0 shows training and cross-validation score against the number of
    training examples, panel 1 shows fit time against the number of training
    examples, and panel 2 shows cross-validation score against fit time.

    Args:
        estimator: Estimator handed to ``learning_curve``.
        title: Title of the first panel.
        X: Training features.
        y: Training targets.
        axes: Indexable collection of three axes; a 1x3 figure (20x5) is
            created when None.
        ylim: Optional ``(ymin, ymax)`` applied to the first panel only.
        cv: Cross-validation strategy forwarded to ``learning_curve``.
        n_jobs: Parallelism forwarded to ``learning_curve``.
        train_sizes: Training-set sizes forwarded to ``learning_curve``.

    Returns:
        The ``matplotlib.pyplot`` module.
    """
    train_color = "#ff9124"
    cv_color = "#2492ff"

    def shade(ax, xs, center, spread, color):
        # Translucent band of one standard deviation around the mean.
        ax.fill_between(xs, center - spread, center + spread, alpha=0.1, color=color)

    if axes is None:
        _, axes = plt.subplots(1, 3, figsize=(20, 5))
    score_ax, time_ax, perf_ax = axes[0], axes[1], axes[2]

    score_ax.set_title(title)
    if ylim is not None:
        score_ax.set_ylim(*ylim)
    score_ax.set_xlabel("Training examples")
    score_ax.set_ylabel("Score")

    train_sizes, train_scores, test_scores, fit_times, _ = learning_curve(
        estimator, X, y,
        cv=cv,
        n_jobs=n_jobs,
        train_sizes=train_sizes,
        return_times=True,
    )

    # Aggregate across the cross-validation folds (axis 1).
    train_scores_mean = np.mean(train_scores, axis=1)
    train_scores_std = np.std(train_scores, axis=1)
    test_scores_mean = np.mean(test_scores, axis=1)
    test_scores_std = np.std(test_scores, axis=1)
    fit_times_mean = np.mean(fit_times, axis=1)
    fit_times_std = np.std(fit_times, axis=1)

    # Panel 0: learning curve.
    score_ax.grid(True)
    shade(score_ax, train_sizes, train_scores_mean, train_scores_std, train_color)
    shade(score_ax, train_sizes, test_scores_mean, test_scores_std, cv_color)
    score_ax.plot(train_sizes, train_scores_mean, 'o-', color=train_color,
                  label="Training score")
    score_ax.plot(train_sizes, test_scores_mean, 'o-', color=cv_color,
                  label="Cross-validation score")
    score_ax.legend(loc="best")

    # Panel 1: number of samples vs. fit time.
    time_ax.grid(True)
    time_ax.plot(train_sizes, fit_times_mean, 'o-', color=train_color)
    shade(time_ax, train_sizes, fit_times_mean, fit_times_std, train_color)
    time_ax.set_xlabel("Training examples")
    time_ax.set_ylabel("fit_times")
    time_ax.set_title("Scalability of the model")

    # Panel 2: fit time vs. cross-validation score.
    perf_ax.grid(True)
    perf_ax.plot(fit_times_mean, test_scores_mean, 'o-', color=train_color)
    shade(perf_ax, fit_times_mean, test_scores_mean, test_scores_std, train_color)
    perf_ax.set_xlabel("fit_times")
    perf_ax.set_ylabel("Score")
    perf_ax.set_title("Performance of the model")

    return plt

# One learning-curve figure per tuned scikit-learn model. A single loop replaces
# four copy-pasted calls; arguments that only restated the function defaults
# (axes=None, cv=None, n_jobs=None) are omitted.
learning_curve_targets = [
    ("K Nearest Neighbor", knn_grid),
    ("Decision Tree", dc_grid),
    ("Support Vector Machine", svc_grid),
    ("Gradient Boosting", gb_grid),
]

for model_name, search in learning_curve_targets:
    title = "Learning Curves ({})".format(model_name)
    plot_learning_curve(search.best_estimator_, title, X_train, y_train, ylim=(0.7, 1.01))

In [None]:
# The neural network uses a wider score range (0.4 to 1.01) than the other models' curves.
title = "Learning Curves (Neural Network)"
plot_learning_curve(
    nn_grid.best_estimator_, title, X_train, y_train,
    ylim=(0.4, 1.01),
)

Confusion matrix, recall, precision, fscore, support

In [None]:
from sklearn.metrics import plot_confusion_matrix
from sklearn.metrics import precision_score

# Refit each tuned estimator on original_Xtrain / original_ytrain, then report
# its precision on the original test fold.
# NOTE: fit() mutates the grid-search best_estimator_ objects in place.
tuned_searches = {
    "K Nearest Neighbor": knn_grid,
    "Decision Tree": dc_grid,
    "Gradient Boosting Classifier": gb_grid,
}
best_estimators = {
    name: search.best_estimator_ for name, search in tuned_searches.items()
}

for name, estimator in best_estimators.items():
    estimator.fit(original_Xtrain, original_ytrain)
    y_predict = estimator.predict(original_Xtest)
    precision_pct = precision_score(original_ytest, y_predict) * 100
    print("{} - Precision Score: {:.2f} %".format(name, precision_pct))
    


In [None]:
# Refit the tuned SVM on the original training fold and score it on the original
# test fold (kept in its own cell, separate from the other estimators).
svc_best = svc_grid.best_estimator_
y_predict = svc_best.fit(original_Xtrain, original_ytrain).predict(original_Xtest)

svc_precision_pct = precision_score(original_ytest, y_predict) * 100
print("Support Vector Machine - Precision Score: {:.2f} %".format(svc_precision_pct))

In [None]:
# Train a fresh network on the original training fold.
# NOTE(review): optimizer/activation/batch_size/epochs are hard-coded here rather
# than read from nn_best_params - confirm they match the grid-search result.
nn_best = create_network(optimizer="adam", activation="tanh")
nn_best.fit(original_Xtrain, original_ytrain, batch_size=5, epochs=10)

# Sequential.predict_classes() was removed from recent Keras/TensorFlow releases;
# taking the argmax over the per-class outputs is the equivalent for a
# multi-unit output layer and also works on older versions.
class_scores = nn_best.predict(original_Xtest, batch_size=5)
y_predict = np.argmax(class_scores, axis=-1)

print(y_predict)
print("Neural Network - Precision Score: {:.2f} %".format(precision_score(original_ytest, y_predict) * 100))

**REFERENCE**

The EDA (exploratory data analysis) is adapted from https://www.kaggle.com/janiobachmann/credit-fraud-dealing-with-imbalanced-datasets