In [1]:
import pandas as pd
import numpy as np
import os
from sklearn.feature_selection import RFE
from sklearn import tree
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import random

In [2]:
PATH_DATA_BASE = 'data'
DATA_FILE_NAME = 'creditcard.csv'

database = pd.read_csv(os.path.join(PATH_DATA_BASE, DATA_FILE_NAME))
database.drop('Time', axis=1, inplace=True)

In [3]:
print database.shape

(284807, 30)


In [4]:
column_names = database.columns
print column_names

Index([u'V1', u'V2', u'V3', u'V4', u'V5', u'V6', u'V7', u'V8', u'V9', u'V10',
       u'V11', u'V12', u'V13', u'V14', u'V15', u'V16', u'V17', u'V18', u'V19',
       u'V20', u'V21', u'V22', u'V23', u'V24', u'V25', u'V26', u'V27', u'V28',
       u'Amount', u'Class'],
      dtype='object')


In [5]:
print database.head()

         V1        V2        V3        V4        V5        V6        V7  \
0 -1.359807 -0.072781  2.536347  1.378155 -0.338321  0.462388  0.239599   
1  1.191857  0.266151  0.166480  0.448154  0.060018 -0.082361 -0.078803   
2 -1.358354 -1.340163  1.773209  0.379780 -0.503198  1.800499  0.791461   
3 -0.966272 -0.185226  1.792993 -0.863291 -0.010309  1.247203  0.237609   
4 -1.158233  0.877737  1.548718  0.403034 -0.407193  0.095921  0.592941   

         V8        V9       V10  ...         V21       V22       V23  \
0  0.098698  0.363787  0.090794  ...   -0.018307  0.277838 -0.110474   
1  0.085102 -0.255425 -0.166974  ...   -0.225775 -0.638672  0.101288   
2  0.247676 -1.514654  0.207643  ...    0.247998  0.771679  0.909412   
3  0.377436 -1.387024 -0.054952  ...   -0.108300  0.005274 -0.190321   
4 -0.270533  0.817739  0.753074  ...   -0.009431  0.798278 -0.137458   

        V24       V25       V26       V27       V28  Amount  Class  
0  0.066928  0.128539 -0.189115  0.133558 -0.02

### Description of every column

In [6]:
print database.describe()

                 V1            V2            V3            V4            V5  \
count  2.848070e+05  2.848070e+05  2.848070e+05  2.848070e+05  2.848070e+05   
mean   3.919560e-15  5.688174e-16 -8.769071e-15  2.782312e-15 -1.552563e-15   
std    1.958696e+00  1.651309e+00  1.516255e+00  1.415869e+00  1.380247e+00   
min   -5.640751e+01 -7.271573e+01 -4.832559e+01 -5.683171e+00 -1.137433e+02   
25%   -9.203734e-01 -5.985499e-01 -8.903648e-01 -8.486401e-01 -6.915971e-01   
50%    1.810880e-02  6.548556e-02  1.798463e-01 -1.984653e-02 -5.433583e-02   
75%    1.315642e+00  8.037239e-01  1.027196e+00  7.433413e-01  6.119264e-01   
max    2.454930e+00  2.205773e+01  9.382558e+00  1.687534e+01  3.480167e+01   

                 V6            V7            V8            V9           V10  \
count  2.848070e+05  2.848070e+05  2.848070e+05  2.848070e+05  2.848070e+05   
mean   2.010663e-15 -1.694249e-15 -1.927028e-16 -3.137024e-15  1.768627e-15   
std    1.332271e+00  1.237094e+00  1.194353e+00  1.

### Normalizing all columns

In [7]:
for column_name in column_names[0:-1]:
    maxi = database[column_name].max()
    mini = database[column_name].min()
    lower = 0
    higher = 100
    database[column_name] = ((database[column_name] - mini) / (maxi - mini))*(higher-lower) + lower
    
print database.describe()

                  V1             V2             V3             V4  \
count  284807.000000  284807.000000  284807.000000  284807.000000   
mean       95.829378      76.725837      83.741363      25.193020   
std         3.327582       1.742375       2.627454       6.276426   
min         0.000000       0.000000       0.000000       0.000000   
25%        94.265777      76.094278      82.198487      21.431070   
50%        95.860142      76.794934      84.053011      25.105042   
75%        98.064490      77.573884      85.521346      28.488189   
max       100.000000     100.000000     100.000000     100.000000   

                  V5             V6             V7             V8  \
count  284807.000000  284807.000000  284807.000000  284807.000000   
mean       76.571630      26.301976      26.535552      78.538548   
std         0.929178       1.339476       0.753651       1.281166   
min         0.000000       0.000000       0.000000       0.000000   
25%        76.106049      25.5295

### Checking correlation between columns

In [8]:
correlation_matrix = database.corr()
correlation_matrix = np.array(correlation_matrix)
w, h = correlation_matrix.shape
print (w, h)

(30, 30)


In [9]:
def find_correlated_columns(threshold):
    counter = 0
    correlated_cols = []
    for i in range(h):
        for j in range(w):
            if abs(correlation_matrix[i][j]) > threshold and not abs(correlation_matrix[i][j]) == 1.:
                counter += 1
                correlated_cols.append((column_names[i], column_names[j]))
    return counter, correlated_cols

thresholds = [0.3, 0.5, 0.6, 0.7]
for threshold in thresholds:
    results = find_correlated_columns(threshold)
    print 'Number of highly correlated columns with threshold {} are: {}'.format(threshold, results[0])
    print results[-1]
    print

Number of highly correlated columns with threshold 0.3 are: 12
[('V2', 'Amount'), ('V5', 'Amount'), ('V7', 'Amount'), ('V14', 'Class'), ('V17', 'Class'), ('V20', 'Amount'), ('Amount', 'V2'), ('Amount', 'V5'), ('Amount', 'V7'), ('Amount', 'V20'), ('Class', 'V14'), ('Class', 'V17')]

Number of highly correlated columns with threshold 0.5 are: 2
[('V2', 'Amount'), ('Amount', 'V2')]

Number of highly correlated columns with threshold 0.6 are: 0
[]

Number of highly correlated columns with threshold 0.7 are: 0
[]



### This shows that there is very little correlation between the columns of the dataset

### Check out distribution of Class column

In [10]:
print database['Class'].value_counts()
# print database['Class'].unique()

0    284315
1       492
Name: Class, dtype: int64


## Prepping the data for Machine Learning

### Shuffling the data

In [11]:
database.sample(frac=1).reset_index(drop=True)

Unnamed: 0,V1,V2,V3,V4,V5,V6,V7,V8,V9,V10,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
0,93.971114,75.297261,85.755442,14.596763,75.656912,26.978034,26.523231,78.656872,39.115102,53.073335,...,56.313060,55.746366,67.458700,34.024326,53.287871,36.947491,41.923467,31.628913,0.801832,0
1,98.009532,75.820151,85.174740,21.690453,75.658299,26.092537,25.860160,78.660383,44.249554,52.465503,...,56.840862,55.954119,66.269499,38.848905,60.130683,41.739203,41.695564,31.349168,0.190727,0
2,91.683794,79.362529,83.908910,23.723756,76.209354,25.603195,26.631458,79.159726,49.659773,54.004667,...,55.498539,47.175230,66.784567,37.793207,58.403053,44.595863,42.971142,32.326745,0.034993,0
3,95.904298,77.688808,86.010268,36.787485,77.039439,27.709528,26.554791,79.109550,42.684171,52.722346,...,56.554403,54.133030,66.424568,40.492029,55.289767,41.525704,41.737841,31.335117,0.019267,0
4,95.273292,77.697453,83.022072,22.451212,77.297935,25.747840,27.053679,78.479793,45.408241,49.923071,...,55.733431,48.154703,66.967055,46.485546,55.327670,43.732163,41.602973,31.469263,0.010042,0
5,90.448636,78.925121,87.105027,46.520748,75.246816,28.082545,25.418627,80.590269,46.831540,54.829424,...,56.093498,54.218106,66.292543,38.510651,60.283296,52.936648,41.514311,30.785392,0.057024,0
6,95.994262,77.832943,83.363805,23.228981,77.065871,25.302308,27.153013,78.394049,45.823235,49.096425,...,55.665819,47.728901,66.792097,51.587364,55.081507,43.984497,42.068708,31.493982,0.034993,0
7,94.140974,77.149121,86.125828,17.849250,76.116819,26.076723,26.372825,79.236161,41.622564,50.637909,...,56.409987,53.107381,66.383025,41.419898,58.273452,36.176939,42.059057,31.480297,0.046709,0
8,97.872182,77.085776,85.107206,30.080985,76.254717,25.568161,26.535224,78.358425,46.371034,50.725898,...,55.863332,48.795344,66.685569,43.318139,59.662316,32.292935,41.730658,31.389330,0.038885,0
9,92.944099,76.819360,87.820825,39.818629,76.354643,27.212153,25.938623,79.670028,44.347404,51.757311,...,56.427470,52.722000,66.190781,38.577193,59.641309,50.111119,42.108373,31.185981,0.206141,0


### Getting X and Y

In [12]:
data = np.array(database)
X = data[:, 0:-1]
Y = data[:, -1]

print X.shape, Y.shape

(284807, 29) (284807,)


### Feature Selection using Recursive Feature Selection technique

In [13]:
from sklearn.linear_model import LogisticRegression

NUM_FEATURES_SELECTED = 9

model = LogisticRegression()
rfe = RFE(model, NUM_FEATURES_SELECTED)
fit = rfe.fit(X, Y)

In [14]:
print("Num Features: %d") % fit.n_features_
fit_support = fit.support_
print("Selected Features: %s") % fit_support
print("Feature Ranking: %s") % fit.ranking_

Num Features: 9
Selected Features: [False False False  True  True False False  True False  True False False
 False  True False False False False False False  True  True  True False
 False False  True False False]
Feature Ranking: [12  3 14  1  1  7 10  1 17  1 16  9  8  1 19 18  6 11 13  2  1  1  1 15 21
 20  1  5  4]


In [15]:
selected_feature_indeces = list(np.where(fit_support == True))[0]
print 'Number of features sleected: {}'.format(len(selected_feature_indeces))

SELECTED_FEATURES = column_names[selected_feature_indeces]

for i, SELECTED_FEATURE in enumerate(SELECTED_FEATURES):
    print 'Feature {}: {}'.format(i+1, SELECTED_FEATURE)

Number of features sleected: 9
Feature 1: V4
Feature 2: V5
Feature 3: V8
Feature 4: V10
Feature 5: V14
Feature 6: V21
Feature 7: V22
Feature 8: V23
Feature 9: V27


### This gives us the 12 selected features

In [16]:
X = np.array(database[SELECTED_FEATURES])
Y = np.array(database['Class'])

print X.shape, Y.shape

(284807, 9) (284807,)


### Splitting into training and testing

In [17]:
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.20, random_state=2)
print X_train.shape, Y_train.shape, X_test.shape, Y_test.shape

(227845, 9) (227845,) (56962, 9) (56962,)


In [18]:
print np.where(Y_train == 1)
print np.where(Y_test == 1)

(array([   186,    824,   1364,   2944,   3180,   3568,   4985,   5398,
         6557,   6688,   6743,   7616,   7994,   8028,   8210,   9191,
        10417,  11609,  11610,  11778,  12499,  13594,  13731,  14285,
        15866,  16193,  16648,  17930,  17953,  18086,  19058,  21180,
        21831,  22483,  22881,  23125,  23425,  24295,  24594,  24642,
        24692,  24987,  25553,  26569,  26574,  26715,  26919,  26946,
        27414,  27510,  28727,  29685,  30412,  30979,  31710,  32181,
        33348,  34200,  35090,  36005,  37178,  37322,  37374,  38771,
        39779,  40334,  40446,  40987,  41250,  41395,  41479,  41601,
        41949,  41965,  43032,  43123,  43653,  44098,  44525,  44949,
        45019,  45309,  45333,  45890,  45982,  46122,  46528,  46564,
        47657,  47856,  47920,  48570,  49059,  49177,  49267,  49997,
        50324,  50430,  50851,  51656,  52174,  52596,  52989,  53807,
        54137,  54307,  55794,  56157,  56437,  57336,  58034,  58280,
     

## Using SVM

In [19]:
from sklearn import svm

In [20]:
model_svm = svm.SVC(class_weight='balanced')
model_svm.fit(X_train, Y_train)

SVC(C=1.0, cache_size=200, class_weight='balanced', coef0=0.0,
  decision_function_shape=None, degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)

In [21]:
def predict_accuracy(model):
    prediction = np.transpose(model.predict(X_test))


    print prediction.shape


    correct_counter = 0
    wrong_counter = 0

#     print len(np.where(Y_test == 1)[0])

    for i in range(prediction.shape[0]):
        if prediction[i] == Y_test[i]:
            correct_counter += 1
        else:
            wrong_counter += 1
            print 'Actual: {}, Predicted: {}'.format(Y_test[i], prediction[i])

    print '\n\n\n'

    print '# Correct predictions: {}'.format(correct_counter)
    print '# Wrong predictions: {}'.format(wrong_counter)
    
    return ((correct_counter * 1.) / (correct_counter + wrong_counter)) * 100.

In [22]:
print predict_accuracy(model_svm)

(56962,)
Actual: 0, Predicted: 1
Actual: 0, Predicted: 1
Actual: 0, Predicted: 1
Actual: 0, Predicted: 1
Actual: 0, Predicted: 1
Actual: 1, Predicted: 0
Actual: 0, Predicted: 1
Actual: 0, Predicted: 1
Actual: 0, Predicted: 1
Actual: 0, Predicted: 1
Actual: 0, Predicted: 1
Actual: 0, Predicted: 1
Actual: 0, Predicted: 1
Actual: 0, Predicted: 1
Actual: 0, Predicted: 1
Actual: 0, Predicted: 1
Actual: 0, Predicted: 1
Actual: 0, Predicted: 1
Actual: 0, Predicted: 1
Actual: 0, Predicted: 1
Actual: 0, Predicted: 1
Actual: 0, Predicted: 1
Actual: 0, Predicted: 1
Actual: 0, Predicted: 1
Actual: 1, Predicted: 0
Actual: 1, Predicted: 0
Actual: 1, Predicted: 0
Actual: 1, Predicted: 0
Actual: 0, Predicted: 1
Actual: 1, Predicted: 0
Actual: 0, Predicted: 1
Actual: 0, Predicted: 1
Actual: 0, Predicted: 1
Actual: 1, Predicted: 0
Actual: 1, Predicted: 0
Actual: 0, Predicted: 1
Actual: 0, Predicted: 1
Actual: 0, Predicted: 1
Actual: 0, Predicted: 1
Actual: 1, Predicted: 0
Actual: 0, Predicted: 1
Actual:

## Using Decision Tree

In [23]:
model_decision_tree = tree.DecisionTreeClassifier(class_weight='balanced')
model_decision_tree.fit(X_train, Y_train)

DecisionTreeClassifier(class_weight='balanced', criterion='gini',
            max_depth=None, max_features=None, max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            presort=False, random_state=None, splitter='best')

In [24]:
print predict_accuracy(model_decision_tree)

(56962,)
Actual: 1, Predicted: 0
Actual: 0, Predicted: 1
Actual: 1, Predicted: 0
Actual: 1, Predicted: 0
Actual: 1, Predicted: 0
Actual: 0, Predicted: 1
Actual: 1, Predicted: 0
Actual: 1, Predicted: 0
Actual: 0, Predicted: 1
Actual: 0, Predicted: 1
Actual: 0, Predicted: 1
Actual: 0, Predicted: 1
Actual: 0, Predicted: 1
Actual: 1, Predicted: 0
Actual: 0, Predicted: 1
Actual: 1, Predicted: 0
Actual: 0, Predicted: 1
Actual: 0, Predicted: 1
Actual: 0, Predicted: 1
Actual: 0, Predicted: 1
Actual: 0, Predicted: 1
Actual: 0, Predicted: 1
Actual: 1, Predicted: 0
Actual: 0, Predicted: 1
Actual: 0, Predicted: 1
Actual: 1, Predicted: 0
Actual: 1, Predicted: 0
Actual: 1, Predicted: 0
Actual: 1, Predicted: 0
Actual: 0, Predicted: 1
Actual: 1, Predicted: 0
Actual: 1, Predicted: 0
Actual: 0, Predicted: 1
Actual: 0, Predicted: 1
Actual: 1, Predicted: 0
Actual: 1, Predicted: 0
Actual: 0, Predicted: 1
Actual: 0, Predicted: 1
Actual: 1, Predicted: 0
Actual: 1, Predicted: 0
Actual: 0, Predicted: 1
Actual:

## Multi Layer Perceptron

In [25]:
from sklearn.neural_network import MLPClassifier

model_mlp = MLPClassifier(hidden_layer_sizes=(32, 16, 8))
model_mlp.fit(X_train, Y_train)

MLPClassifier(activation='relu', alpha=0.0001, batch_size='auto', beta_1=0.9,
       beta_2=0.999, early_stopping=False, epsilon=1e-08,
       hidden_layer_sizes=(32, 16, 8), learning_rate='constant',
       learning_rate_init=0.001, max_iter=200, momentum=0.9,
       nesterovs_momentum=True, power_t=0.5, random_state=None,
       shuffle=True, solver='adam', tol=0.0001, validation_fraction=0.1,
       verbose=False, warm_start=False)

In [26]:
print predict_accuracy(model_mlp)

(56962,)
Actual: 0, Predicted: 1
Actual: 1, Predicted: 0
Actual: 1, Predicted: 0
Actual: 1, Predicted: 0
Actual: 0, Predicted: 1
Actual: 1, Predicted: 0
Actual: 1, Predicted: 0
Actual: 1, Predicted: 0
Actual: 0, Predicted: 1
Actual: 1, Predicted: 0
Actual: 1, Predicted: 0
Actual: 0, Predicted: 1
Actual: 0, Predicted: 1
Actual: 0, Predicted: 1
Actual: 0, Predicted: 1
Actual: 1, Predicted: 0
Actual: 1, Predicted: 0
Actual: 1, Predicted: 0
Actual: 0, Predicted: 1
Actual: 1, Predicted: 0
Actual: 0, Predicted: 1
Actual: 1, Predicted: 0
Actual: 0, Predicted: 1
Actual: 0, Predicted: 1
Actual: 1, Predicted: 0
Actual: 1, Predicted: 0
Actual: 0, Predicted: 1
Actual: 1, Predicted: 0
Actual: 1, Predicted: 0
Actual: 1, Predicted: 0
Actual: 0, Predicted: 1
Actual: 0, Predicted: 1




# Correct predictions: 56930
# Wrong predictions: 32
99.9438221973
