In [1]:
import pandas as pd
import numpy as np
import scipy
import matplotlib.pyplot as plt
from sklearn import tree
from IPython.display import Image
import pydotplus
import graphviz
%matplotlib inline

In [2]:
data = pd.read_csv('creditcard.csv').dropna()
df = pd.DataFrame(data)
df.describe()

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
count,284807.0,284807.0,284807.0,284807.0,284807.0,284807.0,284807.0,284807.0,284807.0,284807.0,...,284807.0,284807.0,284807.0,284807.0,284807.0,284807.0,284807.0,284807.0,284807.0,284807.0
mean,94813.859575,3.91956e-15,5.688174e-16,-8.769071e-15,2.782312e-15,-1.552563e-15,2.010663e-15,-1.694249e-15,-1.927028e-16,-3.137024e-15,...,1.537294e-16,7.959909e-16,5.36759e-16,4.458112e-15,1.453003e-15,1.699104e-15,-3.660161e-16,-1.206049e-16,88.349619,0.001727
std,47488.145955,1.958696,1.651309,1.516255,1.415869,1.380247,1.332271,1.237094,1.194353,1.098632,...,0.734524,0.7257016,0.6244603,0.6056471,0.5212781,0.482227,0.4036325,0.3300833,250.120109,0.041527
min,0.0,-56.40751,-72.71573,-48.32559,-5.683171,-113.7433,-26.16051,-43.55724,-73.21672,-13.43407,...,-34.83038,-10.93314,-44.80774,-2.836627,-10.2954,-2.604551,-22.56568,-15.43008,0.0,0.0
25%,54201.5,-0.9203734,-0.5985499,-0.8903648,-0.8486401,-0.6915971,-0.7682956,-0.5540759,-0.2086297,-0.6430976,...,-0.2283949,-0.5423504,-0.1618463,-0.3545861,-0.3171451,-0.3269839,-0.07083953,-0.05295979,5.6,0.0
50%,84692.0,0.0181088,0.06548556,0.1798463,-0.01984653,-0.05433583,-0.2741871,0.04010308,0.02235804,-0.05142873,...,-0.02945017,0.006781943,-0.01119293,0.04097606,0.0165935,-0.05213911,0.001342146,0.01124383,22.0,0.0
75%,139320.5,1.315642,0.8037239,1.027196,0.7433413,0.6119264,0.3985649,0.5704361,0.3273459,0.597139,...,0.1863772,0.5285536,0.1476421,0.4395266,0.3507156,0.2409522,0.09104512,0.07827995,77.165,0.0
max,172792.0,2.45493,22.05773,9.382558,16.87534,34.80167,73.30163,120.5895,20.00721,15.59499,...,27.20284,10.50309,22.52841,4.584549,7.519589,3.517346,31.6122,33.84781,25691.16,1.0


In [74]:
#Count the number of fradulent charges in this data set

df.Class.value_counts()

0    284315
1       492
Name: Class, dtype: int64

In [75]:
#We have 284,315 normal charges and 492 fradulent
#Fraudelent charges account for .173% of the data

#We will probably need to oversample fraud to get a good model here
#I will use RandomOverSampler as well as ADASYN to oversample our target here

In [3]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import recall_score
from imblearn.over_sampling import SMOTE

In [5]:
from imblearn.over_sampling import RandomOverSampler
from collections import Counter

X = df.drop(['Time', 'Class'], axis=1) #training features
y = df.Class #target 

In [39]:
#Before we oversample it's important to do the train test split first to 
#segragate our test samples to use later

from sklearn.model_selection import train_test_split

x_train, x_test, y_train, y_test = train_test_split(X, y,
                                                  test_size = .1)

In [40]:
#We will oversample from the training data set and teach our model with the training set also

ros = RandomOverSampler(random_state=0)
X_resampled, y_resampled = ros.fit_resample(x_train, y_train)


In [41]:
print(sorted(Counter(y_resampled).items()))


[(0, 255894), (1, 255894)]


In [42]:
#First model using our oversampled data
from sklearn.svm import LinearSVC
clf = LinearSVC()
clf.fit(X_resampled, y_resampled) # doctest : +ELLIPSIS



LinearSVC(C=1.0, class_weight=None, dual=True, fit_intercept=True,
     intercept_scaling=1, loss='squared_hinge', max_iter=1000,
     multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
     verbose=0)

In [43]:
clf.score(X_resampled, y_resampled)

0.934562748638108

In [11]:
#Our score is 93.45% seems pretty good
#Lets test with ADASYN now

In [44]:
#Let's train our model
from imblearn.over_sampling import SMOTE, ADASYN

X_resampled, y_resampled = ADASYN().fit_resample(x_train, y_train)
print(sorted(Counter(y_resampled).items()))

#Train our model again using ADASYN oversampling data
clf_adasyn = LinearSVC().fit(X_resampled, y_resampled)

[(0, 255894), (1, 255920)]




In [45]:
#Set our predictor 
y_pred = clf_adasyn.predict(X_resampled)

In [46]:
clf_adasyn.score(X_resampled, y_resampled)

0.9134255803866248

In [78]:
#The recall is intuitively the ability of the classifier to find all the positive samples.
print (recall_score(y_resampled, y_pred))

0.9003790246952172


In [80]:
#We correctly identify Fraudulent charges in 90% of all variance here*   *(I think)

In [86]:
clf_adasyn.score(x_test, y_test)

0.9271093009374671

In [87]:
y_pred1 = clf_adasyn.predict(x_test)

In [90]:
clf_adasyn.score

array([0, 0, 0, ..., 0, 1, 0])

In [92]:
clf_adasyn.score(y_test, y_pred1)

ValueError: Expected 2D array, got 1D array instead:
array=[0 0 0 ... 0 0 0].
Reshape your data either using array.reshape(-1, 1) if your data has a single feature or array.reshape(1, -1) if it contains a single sample.

In [127]:
ypred_shape = y_pred1.reshape(-1, 1)

In [128]:
ytest_shape =y_test.values.reshape(-1,1)

In [129]:
clf_adasyn.score(ytest_shape, ypred_shape)

ValueError: X has 1 features per sample; expecting 29