# Credit Card Fraud


[Source](https://www.kaggle.com/mlg-ulb/creditcardfraud)

[Great Examples](https://www.kaggle.com/janiobachmann/credit-fraud-dealing-with-imbalanced-datasets)

In [1]:
import pandas as pd, numpy as np
import datetime

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, LabelBinarizer
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import roc_curve, auc, confusion_matrix
from sklearn.model_selection import GridSearchCV

import sklearn
#from sklearn_pandas import DataFrameMapper

import matplotlib.pyplot as plt
from IPython.display import HTML

  from numpy.core.umath_tests import inner1d


Unbalanced data issue

In [2]:
df = pd.read_excel("creditcard.xlsx")

In [3]:
df.head()

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V22,V23,V24,V25,V26,V27,V28,Amount,Class,fil
0,16,0.694885,-1.361819,1.029221,0.834159,-1.191209,1.309109,-0.878586,0.44529,-0.446196,...,-0.571955,-0.050881,-0.304215,0.072001,-0.422234,0.086553,0.063499,231.71,0,0.917284
1,25,1.114009,0.085546,0.493702,1.33576,-0.300189,-0.010754,-0.11876,0.188617,0.205687,...,-0.00476,-0.03147,0.198054,0.565007,-0.337718,0.029057,0.004453,4.45,0,0.98044
2,26,-0.535388,0.865268,1.351076,0.147575,0.43368,0.086983,0.693039,0.179742,-0.285642,...,0.206537,-0.187108,0.000753,0.098117,-0.553471,-0.078306,0.025427,1.77,0,0.958214
3,27,-1.452187,1.765124,0.611669,1.176825,-0.44598,0.246826,-0.257566,1.092472,-0.607524,...,0.325782,-0.069107,0.020962,-0.044668,-0.243441,0.14918,0.120557,1.8,0,0.968873
4,34,-0.29154,0.445575,1.249752,-1.735736,0.085756,-0.121924,0.407716,0.095309,0.815902,...,-0.120449,-0.156526,-0.800213,-0.00062,-0.835203,0.131001,0.062896,18.95,0,0.917209


In [4]:
pd.set_option('display.max_colwidth', -1)
display(HTML(df.head().to_html()))
pd.reset_option('display.max_colwidth')

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,V10,V11,V12,V13,V14,V15,V16,V17,V18,V19,V20,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class,fil
0,16,0.694885,-1.361819,1.029221,0.834159,-1.191209,1.309109,-0.878586,0.44529,-0.446196,0.568521,1.019151,1.298329,0.42048,-0.372651,-0.80798,-2.044557,0.515663,0.625847,-1.300408,-0.138334,-0.295583,-0.571955,-0.050881,-0.304215,0.072001,-0.422234,0.086553,0.063499,231.71,0,0.917284
1,25,1.114009,0.085546,0.493702,1.33576,-0.300189,-0.010754,-0.11876,0.188617,0.205687,0.082262,1.133556,0.626699,-1.49278,0.520788,-0.674593,-0.529108,0.158256,-0.398751,-0.145709,-0.273832,-0.053234,-0.00476,-0.03147,0.198054,0.565007,-0.337718,0.029057,0.004453,4.45,0,0.98044
2,26,-0.535388,0.865268,1.351076,0.147575,0.43368,0.086983,0.693039,0.179742,-0.285642,-0.482474,0.8718,0.853447,-0.571822,0.102252,-1.519991,-0.285912,-0.309633,-0.403902,-0.823743,-0.283264,0.049526,0.206537,-0.187108,0.000753,0.098117,-0.553471,-0.078306,0.025427,1.77,0,0.958214
3,27,-1.452187,1.765124,0.611669,1.176825,-0.44598,0.246826,-0.257566,1.092472,-0.607524,0.047156,0.783727,1.096386,-0.268094,0.768648,-0.524367,-0.808816,0.710386,-0.118369,0.979288,0.008713,0.08228,0.325782,-0.069107,0.020962,-0.044668,-0.243441,0.14918,0.120557,1.8,0,0.968873
4,34,-0.29154,0.445575,1.249752,-1.735736,0.085756,-0.121924,0.407716,0.095309,0.815902,-1.491188,-0.846191,0.056533,-0.058954,0.151923,1.982595,-0.443295,-0.318251,0.064787,0.613505,-0.033522,-0.064906,-0.120449,-0.156526,-0.800213,-0.00062,-0.835203,0.131001,0.062896,18.95,0,0.917209


In [5]:
df.columns

Index(['Time', 'V1', 'V2', 'V3', 'V4', 'V5', 'V6', 'V7', 'V8', 'V9', 'V10',
       'V11', 'V12', 'V13', 'V14', 'V15', 'V16', 'V17', 'V18', 'V19', 'V20',
       'V21', 'V22', 'V23', 'V24', 'V25', 'V26', 'V27', 'V28', 'Amount',
       'Class', 'fil'],
      dtype='object')

In [6]:
features = ['V1', 'V2', 'V3', 'V4', 'V5', 'V6', 'V7', 'V8', 'V9', 'V10',
            'V11', 'V12', 'V13', 'V14', 'V15', 'V16', 'V17', 'V18', 'V19', 'V20',
            'V21', 'V22', 'V23', 'V24', 'V25', 'V26', 'V27', 'V28']

In [7]:
df['Class'].value_counts()

0    28442
1      492
Name: Class, dtype: int64

In [8]:
X = df[features]
y = df['Class']

In [9]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42, 
                                                    stratify=y)

In [10]:
from sklearn.metrics import confusion_matrix, accuracy_score

# Model 1:

In [11]:
clf = LogisticRegression()
clf.fit(X_train, y_train)
print(confusion_matrix(clf.predict(X_train), y_train))
print(accuracy_score(clf.predict(X_train), y_train))
print(confusion_matrix(clf.predict(X_test), y_test))
print(accuracy_score(clf.predict(X_test), y_test))

[[19897    66]
 [   12   278]]
0.9961487187083395
[[8530   29]
 [   3  119]]
0.9963137887340168


# Model 2:

In [12]:
clf = GradientBoostingClassifier()
clf.fit(X_train, y_train)
print(confusion_matrix(clf.predict(X_train), y_train))
print(accuracy_score(clf.predict(X_train), y_train))
print(confusion_matrix(clf.predict(X_test), y_test))
print(accuracy_score(clf.predict(X_test), y_test))

[[19908    20]
 [    1   324]]
0.9989631165753222
[[8521   26]
 [  12  122]]
0.9956226241216449


# Model 3:

In [13]:
clf = DecisionTreeClassifier()
clf.fit(X_train, y_train)
print(confusion_matrix(clf.predict(X_train), y_train))
print(accuracy_score(clf.predict(X_train), y_train))
print(confusion_matrix(clf.predict(X_test), y_test))
print(accuracy_score(clf.predict(X_test), y_test))

[[19909     0]
 [    0   344]]
1.0
[[8508   22]
 [  25  126]]
0.9945858772030872


# Model 4:

In [14]:
clf = RandomForestClassifier()
clf.fit(X_train, y_train)
print(confusion_matrix(clf.predict(X_train), y_train))
print(accuracy_score(clf.predict(X_train), y_train))
print(confusion_matrix(clf.predict(X_test), y_test))
print(accuracy_score(clf.predict(X_test), y_test))

[[19909    10]
 [    0   334]]
0.9995062459882487
[[8530   25]
 [   3  123]]
0.9967745651422647


***