<div style="color: #2590c2; text-align: center;">
<span style="font-size:18pt;"><b>ST: BIG DATA ANALYTICS</b></span><br/>
<span>CS 696-16 (Fall'18)</span><br/><br/>
<span><b>Project 3</b></span><br/><br/>
<span>Submitted By</span><br/>
<span>Ashok Kumar Shrestha</span>
</div>

In [None]:
#credit card fraud detection
#kaggle challenge
#https://www.kaggle.com/mlg-ulb/creditcardfraud/home

In [None]:
import pandas as pd
# Read the transactions CSV from the working directory and preview its
# first five rows (the default for head()).
data = pd.read_csv("creditcard.csv")
data.head()

In [None]:
import collections
import time

import matplotlib.patches as mpatches
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
from sklearn.decomposition import PCA, TruncatedSVD
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.manifold import TSNE
from sklearn.metrics import accuracy_score, auc, classification_report, confusion_matrix, roc_curve
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier

%matplotlib inline

In [None]:
# Bar chart of how many rows fall into each value of the 'Class' column.
colors = ["#0101DF", "#DF0101"]
# The column must be passed as the `x` keyword: recent seaborn releases treat
# the first positional argument as `data`, so the positional form collides
# with `data=data` and raises a TypeError.
sns.countplot(x='Class', data=data, palette=colors)
plt.title('Class Distributions \n (0: No Fraud | 1: Fraud)', fontsize=12)
plt.show()

In [None]:
# Standardise 'Amount' and 'Time' (zero mean, unit variance), each with its
# own freshly fitted StandardScaler.
# Series.reshape no longer exists in pandas, so go through the underlying
# NumPy array; ravel() turns the (n, 1) result back into a 1-D column.
for column in ('Amount', 'Time'):
    column_2d = data[column].values.reshape(-1, 1)
    data[column] = StandardScaler().fit_transform(column_2d).ravel()

In [None]:
# Pairwise column correlations of `data`, drawn as a square-celled heatmap
# and also written to 'heatmapa.jpg'.
correlation_matrix = data.corr()
fig = plt.figure(figsize=(12, 9))
sns.heatmap(
    correlation_matrix,
    square=True,
    cmap='coolwarm',
    ax=fig.add_subplot(1, 1, 1),
)
fig.suptitle(
    'Correlation Heatmap for Credit Card Fraud Detection Dataset',
    fontsize=14,
)
fig.savefig('heatmapa.jpg')
plt.show()

In [None]:
# Random under-sampling: keep every fraud row and an equally sized random
# subset of the non-fraud rows, so both classes are represented 50/50.

# Shuffle with a fixed seed so the selected non-fraud rows are reproducible.
# NOTE: `data` itself is re-bound to the shuffled frame, as before.
data = data.sample(frac=1, random_state=42)

fraud_data = data.loc[data['Class'] == 1]
# Take as many non-fraud rows as there are fraud rows instead of a
# hard-coded count, so the classes stay balanced if the input changes.
non_fraud_data = data.loc[data['Class'] == 0].iloc[:len(fraud_data)]

normal_distributed_data = pd.concat([fraud_data, non_fraud_data])
# Shuffle again so the two classes are interleaved rather than stacked.
new_data = normal_distributed_data.sample(frac=1, random_state=42)
new_data.head()

In [None]:
# Class counts in the under-sampled frame.
# The column must be passed as the `x` keyword: recent seaborn releases treat
# the first positional argument as `data`, so the positional form collides
# with `data=new_data` and raises a TypeError.
sns.countplot(x='Class', data=new_data, palette=colors)
plt.title('Equally Distributed Classes\n (0: No Fraud | 1: Fraud)', fontsize=12)
plt.show()

In [None]:
# Same correlation heatmap as for the full data, but computed on the
# under-sampled frame `new_data`; saved to 'heatmapb.jpg'.
correlation_matrix = new_data.corr()
fig = plt.figure(figsize=(12, 9))
plt.suptitle(
    'Resampled Correlation Heatmap for Credit Card Fraud Detection Dataset',
    fontsize=14,
)
sns.heatmap(data=correlation_matrix, square=True, cmap='coolwarm')
fig.savefig('heatmapb.jpg')
plt.show()

In [None]:
# Split the under-sampled frame into features (everything except 'Class')
# and target ('Class'), then hold out 20% of the rows for testing.
# The fixed random_state makes the split repeatable.
y = new_data['Class']
X = new_data.drop(labels='Class', axis='columns')

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
    test_size=0.2,
)

In [None]:
# Reduce the feature matrix X to two components with three different
# techniques. Every reducer is seeded with 42.

# t-SNE embedding
X_reduced_tsne = TSNE(
    random_state=42,
    n_components=2,
).fit_transform(X.values)

# Principal component analysis
X_reduced_pca = PCA(
    random_state=42,
    n_components=2,
).fit_transform(X.values)

# Truncated SVD (randomized solver)
X_reduced_svd = TruncatedSVD(
    random_state=42,
    algorithm='randomized',
    n_components=2,
).fit_transform(X.values)

# Plot the three 2-D embeddings side by side, coloured by class label.
f, (ax1, ax2, ax3) = plt.subplots(1, 3, figsize=(24, 6))
f.suptitle('Clusters using Dimensionality Reduction', fontsize=12)

# Legend entries are explicit patches because a single colour-mapped scatter
# does not produce one legend handle per class.
blue_patch = mpatches.Patch(color='#0A0AFF', label='No Fraud')
red_patch = mpatches.Patch(color='#AF0000', label='Fraud')

panels = (
    (ax1, X_reduced_tsne, 't-SNE'),
    (ax2, X_reduced_pca, 'PCA'),
    (ax3, X_reduced_svd, 'Truncated SVD'),
)

for ax, embedding, panel_title in panels:
    # One scatter per panel. Previously every point was drawn twice with
    # opposite colour encodings, so the second layer simply hid the first.
    # With 'coolwarm' and the colour range pinned to [0, 1], label 0 is drawn
    # blue and label 1 red, matching the legend patches.
    ax.scatter(
        embedding[:, 0],
        embedding[:, 1],
        c=y.values,
        cmap='coolwarm',
        vmin=0,
        vmax=1,
        linewidths=1,
    )
    ax.set_title(panel_title, fontsize=12)
    ax.grid(True)
    ax.legend(handles=[blue_patch, red_patch])

plt.show()

In [None]:
# Candidate models, all with scikit-learn's default hyper-parameters.
# NOTE: the keys are left exactly as they were (including the "Logisitic"
# spelling) because they are runtime strings other cells may look up.
classifiers = {
    "LogisiticRegression": LogisticRegression(),
    "KNearest": KNeighborsClassifier(),
    "Support Vector Classifier": SVC(),
    "DecisionTreeClassifier": DecisionTreeClassifier()
}

class_names = ['No Fraud', 'Fraud']  # display names for labels 0 and 1
roc_curves = []  # (model name, fpr, tpr, auc), drawn together after the loop

for key, classifier in classifiers.items():
    model_name = classifier.__class__.__name__

    # Fit on the training split and report mean 5-fold CV accuracy on it.
    # Round after scaling to a percentage to avoid float noise in the output.
    classifier.fit(X_train, y_train)
    training_score = cross_val_score(classifier, X_train, y_train, cv=5)
    print("Classifier: ", model_name, "\nTraining score: ", round(training_score.mean() * 100, 2), "%")

    # Evaluate the fitted model on the held-out split.
    y_pred = classifier.predict(X_test)
    print("Testing accuracy: ", round(accuracy_score(y_test, y_pred) * 100, 2), "%")
    print(classification_report(y_test, y_pred))

    # Confusion matrix (rows: true label, columns: predicted label), drawn
    # with seaborn since no plot_confusion_matrix helper exists in this file.
    cnf_matrix = confusion_matrix(y_test, y_pred, labels=[0, 1])
    plt.figure()
    sns.heatmap(cnf_matrix, annot=True, fmt='d', cmap='Blues', cbar=False,
                xticklabels=class_names, yticklabels=class_names)
    plt.title('Confusion matrix - ' + model_name)
    plt.ylabel('True label')
    plt.xlabel('Predicted label')
    plt.show()

    # A ROC curve needs a continuous score, not hard 0/1 predictions.
    # Prefer the positive-class probability; models without predict_proba
    # (e.g. SVC with default settings) fall back to decision_function.
    if hasattr(classifier, "predict_proba"):
        y_score = classifier.predict_proba(X_test)[:, 1]
    else:
        y_score = classifier.decision_function(X_test)
    fpr, tpr, thresholds = roc_curve(y_test, y_score)
    roc_auc = auc(fpr, tpr)
    roc_curves.append((model_name, fpr, tpr, roc_auc))

# All ROC curves share one figure, drawn after the loop so the per-model
# plt.show() calls above cannot split them across separate figures.
plt.figure()
plt.title('Receiver Operating Characteristic')
for curve_name, curve_fpr, curve_tpr, curve_auc in roc_curves:
    plt.plot(curve_fpr, curve_tpr, label='%s (AUC = %.2f)' % (curve_name, curve_auc))
plt.plot([0, 1], [0, 1], 'r--')  # chance diagonal
plt.legend(loc='lower right')
plt.xlim([-0.1, 1.0])
plt.ylim([-0.1, 1.01])
plt.ylabel('True Positive Rate')
plt.xlabel('False Positive Rate')
plt.show()

In [None]:
# test data

def test(X_train, y_train, cv=5):
    """Print the mean cross-validated score of four default classifiers.

    Each model is cross-validated on the data passed in, so nothing trained
    elsewhere is reused: cross_val_score fits its own copies of the
    estimator on the folds of ``X_train`` / ``y_train``.

    Parameters
    ----------
    X_train : array-like
        Feature matrix to cross-validate on (any split may be passed).
    y_train : array-like
        Target labels aligned with ``X_train``.
    cv : int, default 5
        Cross-validation setting forwarded to ``cross_val_score``.

    Returns
    -------
    None
        The scores are printed as percentages rounded to two decimals.
    """
    # The estimators are created here: the function used to rely on
    # notebook-level names (log_reg, knears_neighbors, svc, tree_clf) that
    # are not defined anywhere, which raised NameError on a fresh kernel.
    models = (
        ('Logistic Regression Score: ', LogisticRegression()),
        ('Knears Neighbors Score', KNeighborsClassifier()),
        ('Support Vector Classifier Score', SVC()),
        ('DecisionTree Classifier Score', DecisionTreeClassifier()),
    )

    for label, model in models:
        scores = cross_val_score(model, X_train, y_train, cv=cv)
        # float() + str() instead of .astype(str): it does not depend on
        # round() handing back a NumPy scalar.
        print(label, str(round(float(scores.mean()) * 100, 2)) + '%')

In [None]:
# Cross-validate the four classifiers on the training split.
test(X_train, y_train)

In [None]:
# Run the same cross-validation on the held-out split. test() passes these
# arrays straight to cross_val_score, so the scores come from models fitted
# on folds of this split, not from models fitted on the training split.
test(X_test, y_test)

In [None]:
# Rebuild X / y from the full frame `data` (not the under-sampled
# `new_data`) and split it 50/50 with a fixed seed.
# NOTE: this re-binds X, y and the four split variables used by earlier cells.
y = data['Class']
X = data.loc[:, data.columns != 'Class']

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
    test_size=0.5,
)

In [None]:
# Cross-validate the four classifiers on the held-out half of the full frame.
# NOTE(review): presumably much slower than the earlier calls, since this
# split comes from the full `data` rather than the sample — confirm runtime.
test(X_test, y_test)