In [1]:
import pandas as pd
import math
import numpy as np
import matplotlib.pylab as plt
%matplotlib inline
import matplotlib.gridspec as gridspec
import seaborn as sns
from sklearn.preprocessing import StandardScaler
import sklearn.model_selection
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection  import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import confusion_matrix, classification_report

In [2]:
cc = pd.read_csv('creditcard.csv')

# Preprocessing

We are going to undersample because the aata is imbalanced.

In [3]:
cc.groupby('Class').mean()

Unnamed: 0_level_0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V20,V21,V22,V23,V24,V25,V26,V27,V28,Amount
Class,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,94838.202258,0.008258,-0.006271,0.012171,-0.00786,0.005453,0.002419,0.009637,-0.000987,0.004467,...,-0.000644,-0.001235,-2.4e-05,7e-05,0.000182,-7.2e-05,-8.9e-05,-0.000295,-0.000131,88.291022
1,80746.806911,-4.771948,3.623778,-7.033281,4.542029,-3.151225,-1.397737,-5.568731,0.570636,-2.581123,...,0.372319,0.713588,0.014049,-0.040308,-0.10513,0.041449,0.051648,0.170575,0.075667,122.211321


In [4]:
legit = cc[cc['Class'] == 0]
fraud = cc[cc['Class'] == 1]

In [5]:
# Sampled 492 samples from the legit df because we are using undersampling for this project
legit_sample = legit.sample(n=492)

In [6]:
# Concatenating the legit_sample with fraud to make a new df
new_df = pd.concat([legit_sample, fraud], axis=0)

In [7]:
new_df.head()

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
207005,136480.0,-0.929568,-0.856994,1.783463,-0.636927,-0.378617,2.440052,-0.255535,0.789685,-0.838583,...,0.194293,0.788731,0.373844,-1.722975,-0.530015,-0.509529,0.224176,0.176436,209.66,0
189464,128414.0,2.079322,-0.152564,-1.698454,0.024904,0.440867,-0.345947,0.00851,-0.064161,0.533906,...,0.242343,0.739748,-0.02854,0.198799,0.289891,-0.125285,-0.036444,-0.069742,1.0,0
126490,78017.0,-0.531179,0.807309,0.401682,0.895299,-0.425644,-0.097162,0.772807,0.359484,-1.051684,...,0.293194,0.495197,0.225042,0.007022,-0.047884,-0.330938,-0.095537,-0.020394,153.34,0
106520,69996.0,1.085739,-1.931283,-0.804063,-1.289851,-1.01098,-0.164789,-0.398317,-0.245859,-1.616924,...,-0.141654,-0.56245,-0.494864,-1.004861,0.731834,-0.004425,-0.044468,0.042149,294.94,0
125756,77789.0,-1.087833,0.348714,2.980074,1.604071,-1.291307,1.754895,-0.613933,0.499412,1.102819,...,0.305699,1.443442,-0.124071,0.310584,-0.369497,-0.07095,-0.182117,0.134935,65.0,0


In [8]:
new_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 984 entries, 207005 to 281674
Data columns (total 31 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   Time    984 non-null    float64
 1   V1      984 non-null    float64
 2   V2      984 non-null    float64
 3   V3      984 non-null    float64
 4   V4      984 non-null    float64
 5   V5      984 non-null    float64
 6   V6      984 non-null    float64
 7   V7      984 non-null    float64
 8   V8      984 non-null    float64
 9   V9      984 non-null    float64
 10  V10     984 non-null    float64
 11  V11     984 non-null    float64
 12  V12     984 non-null    float64
 13  V13     984 non-null    float64
 14  V14     984 non-null    float64
 15  V15     984 non-null    float64
 16  V16     984 non-null    float64
 17  V17     984 non-null    float64
 18  V18     984 non-null    float64
 19  V19     984 non-null    float64
 20  V20     984 non-null    float64
 21  V21     984 non-null    float64

In [9]:
new_df.describe()

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
count,984.0,984.0,984.0,984.0,984.0,984.0,984.0,984.0,984.0,984.0,...,984.0,984.0,984.0,984.0,984.0,984.0,984.0,984.0,984.0,984.0
mean,87651.606707,-2.32116,1.808761,-3.517934,2.288009,-1.54712,-0.712739,-2.769947,0.269369,-1.287411,...,0.34355,0.016154,-0.013272,-0.076594,0.025399,0.032833,0.090504,0.043499,101.432348,0.5
std,48679.3662,5.524432,3.690792,6.207109,3.175841,4.206137,1.716846,5.864708,4.914437,2.317288,...,2.817246,1.175976,1.153497,0.574512,0.681534,0.485305,0.995067,0.432925,223.244255,0.500254
min,194.0,-30.55238,-10.492816,-31.103685,-4.657545,-22.105532,-6.406267,-43.557242,-41.044261,-13.434066,...,-22.797604,-8.887017,-19.254328,-2.028024,-4.781606,-1.16887,-7.263482,-1.86929,0.0,0.0
25%,45491.5,-2.836279,-0.164846,-5.074851,-0.04578,-1.742686,-1.614669,-3.066415,-0.205486,-2.324444,...,-0.161084,-0.519674,-0.233295,-0.405936,-0.303961,-0.291944,-0.062458,-0.053257,1.5175,0.0
50%,79429.0,-0.702449,0.978149,-1.38062,1.332365,-0.419116,-0.62791,-0.584404,0.139618,-0.70088,...,0.143915,0.049174,-0.032938,-0.010116,0.075038,-0.023393,0.040336,0.03206,18.01,0.5
75%,135327.75,1.099823,2.816333,0.309575,4.20283,0.481995,0.045544,0.320273,0.846899,0.186487,...,0.651412,0.564606,0.18952,0.365025,0.392531,0.355651,0.45539,0.214156,99.99,1.0
max,172782.0,2.341913,22.057729,3.924953,12.114672,11.095089,6.474115,6.702845,20.007208,4.869866,...,27.202839,8.361985,5.46623,1.155998,2.208209,2.745261,3.052358,4.107412,2125.87,1.0


In [10]:
Num_of_Legit = round(new_df['Class'].value_counts()[1]/len(new_df)*100,3)
Num_of_Fraud = round(new_df['Class'].value_counts()[0]/len(new_df)*100,3)


print("Number of Fraud Values :\t\t  \t   ",new_df['Class'].value_counts()[1])
print("Number of Legitimate Values :\t\t        ",new_df['Class'].value_counts()[0])
print("\n")
print("% of Fraud transactions :  \t\t ", Num_of_Legit)
print("% of Legitimate transactions :  ", Num_of_Fraud)

Number of Fraud Values :		  	    492
Number of Legitimate Values :		         492


% of Fraud transactions :  		  50.0
% of Legitimate transactions :   50.0


In [11]:
# Checking if there was any change to the data
new_df.groupby('Class').mean()

Unnamed: 0_level_0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V20,V21,V22,V23,V24,V25,V26,V27,V28,Amount
Class,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,94556.406504,0.129629,-0.006257,-0.002587,0.033989,0.056986,-0.027741,0.028836,-0.031899,0.006301,...,0.011822,-0.026489,0.01826,0.013765,-0.048057,0.009348,0.014019,0.010434,0.011331,80.653374
1,80746.806911,-4.771948,3.623778,-7.033281,4.542029,-3.151225,-1.397737,-5.568731,0.570636,-2.581123,...,0.372319,0.713588,0.014049,-0.040308,-0.10513,0.041449,0.051648,0.170575,0.075667,122.211321


In [12]:
X = new_df.drop(['Class'], axis = 1).values
y = new_df['Class']

# Standardizing the data with StandardScaler()
X = StandardScaler().fit_transform(X)

# Performing train test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 101)

In [13]:
print(X)

[[ 1.00357148  0.25202574 -0.72263882 ...  0.13440294  0.30722326
   0.48504132]
 [ 0.83779073  0.79695426 -0.53168042 ... -0.12764319 -0.26170457
  -0.45010529]
 [-0.19802037  0.32417644 -0.27147589 ... -0.18705851 -0.14765991
   0.2326333 ]
 ...
 [ 1.67917015  0.2979227  -0.18498509 ...  0.2962141   0.34864896
  -0.10550918]
 [ 1.69181027 -0.14355771 -0.33150559 ...  0.79871532 -0.68684124
   0.64342377]
 [ 1.69966153  0.78113536 -0.44736299 ... -0.0879955  -0.13590816
  -0.26398126]]


In [14]:
print(y)

207005    0
189464    0
126490    0
106520    0
125756    0
         ..
279863    1
280143    1
280149    1
281144    1
281674    1
Name: Class, Length: 984, dtype: int64


### Modeling

In [15]:
import warnings
warnings.filterwarnings("ignore")
# Logistic regression
# Creating the hyperparameter grid
c_space = [0.001, 0.01, 0.1, 1, 10, 100, 1000]
param_grid = {'C': c_space, 'penalty': ['l1', 'l2']}

# Instantiate the logistic regression classifier: logreg
logreg = LogisticRegression()

# Modeling
logreg_cv = GridSearchCV(logreg, param_grid, cv=5)
logreg_cv.fit(X_train, y_train)
y_pred = logreg_cv.predict(X_test)
test_lr = accuracy_score(y_pred, y_test)

# Printing out scores
print("Tuned Logistic Regression Parameters: {}".format(logreg_cv.best_params_)) 
print("Best score is {}".format(logreg_cv.best_score_))
print("Accuracy is {}".format(test_lr))
print('\n')
print(classification_report(y_pred,y_test))

Tuned Logistic Regression Parameters: {'C': 1000, 'penalty': 'l2'}
Best score is 0.9377731194065951
Accuracy is 0.9441624365482234


              precision    recall  f1-score   support

           0       0.94      0.95      0.95       100
           1       0.95      0.94      0.94        97

    accuracy                           0.94       197
   macro avg       0.94      0.94      0.94       197
weighted avg       0.94      0.94      0.94       197



In [16]:
# DecisionTree Classifier
# Creating the hyperparameter grid
tree_params = {"criterion": ["gini", "entropy"], "max_depth": list(range(2,4,1)), 
              "min_samples_leaf": list(range(5,7,1))}

# Modeling
grid_tree = GridSearchCV(DecisionTreeClassifier(), tree_params)
grid_tree.fit(X_train, y_train)
y_pred_gct = grid_tree.predict(X_test)
test_dtc = accuracy_score(y_pred_gct, y_test)

# Printing out the Accuracy and the confusion matrix
print("Tuned DecisionTree Parameters: {}".format(grid_tree.best_params_)) 
print("Best score is {}".format(grid_tree.best_score_))
print("Accuracy is {}".format(test_dtc))
print('\n')
print(classification_report(y_pred_gct,y_test))

Tuned DecisionTree Parameters: {'criterion': 'entropy', 'max_depth': 3, 'min_samples_leaf': 5}
Best score is 0.9186809642828347
Accuracy is 0.9187817258883249


              precision    recall  f1-score   support

           0       0.95      0.90      0.92       107
           1       0.89      0.94      0.91        90

    accuracy                           0.92       197
   macro avg       0.92      0.92      0.92       197
weighted avg       0.92      0.92      0.92       197



In [17]:
# Support Vector Classifier
# Creating the hyperparameter grid
svc_params = {'C': [0.5, 0.7, 0.9, 1], 'kernel': ['rbf', 'poly', 'sigmoid', 'linear']}

# Modeling
grid_svc = GridSearchCV(SVC(), svc_params)
grid_svc.fit(X_train, y_train)
y_pred_gsvc = grid_svc.predict(X_test)
test_svc = accuracy_score(y_pred_gsvc, y_test)

# Printing out the Accuracy and the confusion matrix
print("Tuned SVC Parameter: {}".format(grid_svc.best_params_))
print("Best score is {}".format(grid_svc.best_score_))
print("Accuracy is {}".format(test_svc))
print('\n')
print(classification_report(y_pred_gsvc,y_test))

Tuned SVC Parameter: {'C': 1, 'kernel': 'rbf'}
Best score is 0.9352092235749415
Accuracy is 0.9187817258883249


              precision    recall  f1-score   support

           0       0.91      0.93      0.92        99
           1       0.93      0.91      0.92        98

    accuracy                           0.92       197
   macro avg       0.92      0.92      0.92       197
weighted avg       0.92      0.92      0.92       197



In [18]:
# kNN
# Creating the hyperparameter grid
knears_params = {"n_neighbors": list(range(2,5,1)), 'algorithm': ['auto', 'ball_tree', 'kd_tree', 'brute']}

grid_knears = GridSearchCV(KNeighborsClassifier(), knears_params)
grid_knears.fit(X_train, y_train)
y_pred_gknn = grid_knears.predict(X_test)
test_gknn = accuracy_score(y_pred_gknn, y_test)


print("Tuned kNN Parameter: {}".format(grid_knears.best_params_))
print("Best score is {}".format(grid_knears.best_score_))
print("Accuracy is {}".format(test_gknn))
print('\n')
print(classification_report(y_pred_gknn,y_test))

Tuned kNN Parameter: {'algorithm': 'auto', 'n_neighbors': 4}
Best score is 0.913585422881561
Accuracy is 0.8934010152284264


              precision    recall  f1-score   support

           0       0.91      0.88      0.90       104
           1       0.88      0.90      0.89        93

    accuracy                           0.89       197
   macro avg       0.89      0.89      0.89       197
weighted avg       0.89      0.89      0.89       197



## Conclusion

We preformed undersampling on the data because the data was imbalanced. Undersampling is a technique that people use to balance uneven datasets by keeping all of the data in the minority class and decreasing the size of the majority class. We then starndarized that data used split the data into training set and testing set. We modeled various classifiers and tuned it using GridSearch and found that Logistic Regression with the best parameters is the best model because because it has best scores when you compare the precision, recall, accuracy.