Data Preparation - We are using creditcard data on Kaggle

In [1]:
import pandas as pd
import numpy as np

#Read the data
df=pd.read_csv("creditcard.csv")

df.columns

Index(['Time', 'V1', 'V2', 'V3', 'V4', 'V5', 'V6', 'V7', 'V8', 'V9', 'V10',
       'V11', 'V12', 'V13', 'V14', 'V15', 'V16', 'V17', 'V18', 'V19', 'V20',
       'V21', 'V22', 'V23', 'V24', 'V25', 'V26', 'V27', 'V28', 'Amount',
       'Class'],
      dtype='object')

The dataset has 31 columns. The first column “Time” is transaction timestamp, second last column “Amount” is transaction amount and the last column “Class” designates whether transaction as fraud or non-fraud (fraud = 1, non-fraud = 0). The rest of the columns, “V1” to “V28” are unknown variables which were transformed before making the data public.

In [2]:
# number of fraud and non-fraud observations in the dataset
pd.value_counts(df['Class'])

0    284315
1       492
Name: Class, dtype: int64

In [3]:
## scaling the "Amount" and "Time" columns similar to the others variables
from sklearn.preprocessing import RobustScaler
rob_scaler = RobustScaler()

In [4]:
df.Amount

0         149.62
1           2.69
2         378.66
3         123.50
4          69.99
5           3.67
6           4.99
7          40.80
8          93.20
9           3.68
10          7.80
11          9.99
12        121.50
13         27.50
14         58.80
15         15.99
16         12.99
17          0.89
18         46.80
19          5.00
20        231.71
21         34.09
22          2.28
23         22.75
24          0.89
25         26.43
26         41.88
27         16.00
28         33.00
29         12.99
           ...  
284777      1.00
284778     80.00
284779     25.00
284780     30.00
284781     13.00
284782     12.82
284783     11.46
284784     40.00
284785      1.79
284786      8.95
284787      9.99
284788      3.99
284789     60.50
284790      9.81
284791     20.32
284792      3.99
284793      4.99
284794      0.89
284795      9.87
284796     60.00
284797      5.49
284798     24.05
284799     79.99
284800      2.68
284801      2.69
284802      0.77
284803     24.79
284804     67.

In [5]:
df['scaled_Amount']=rob_scaler.fit_transform(df['Amount'].values.reshape(-1,1))
df['scaled_Amount']

0         1.783274
1        -0.269825
2         4.983721
3         1.418291
4         0.670579
5        -0.256131
6        -0.237686
7         0.262698
8         0.994900
9        -0.255991
10       -0.198421
11       -0.167819
12        1.390344
13        0.076853
14        0.514218
15       -0.083980
16       -0.125900
17       -0.294977
18        0.346538
19       -0.237546
20        2.930343
21        0.168937
22       -0.275554
23        0.010480
24       -0.294977
25        0.061902
26        0.277789
27       -0.083840
28        0.153706
29       -0.125900
            ...   
284777   -0.293440
284778    0.810452
284779    0.041920
284780    0.111786
284781   -0.125760
284782   -0.128275
284783   -0.147279
284784    0.251520
284785   -0.282401
284786   -0.182352
284787   -0.167819
284788   -0.251659
284789    0.537972
284790   -0.170335
284791   -0.023475
284792   -0.251659
284793   -0.237686
284794   -0.294977
284795   -0.169496
284796    0.530986
284797   -0.230699
284798    0.

In [6]:
df['scaled_Time']=rob_scaler.fit_transform(df['Time'].values.reshape(-1,1))
df['scaled_Time']

0        -0.994983
1        -0.994983
2        -0.994972
3        -0.994972
4        -0.994960
5        -0.994960
6        -0.994937
7        -0.994901
8        -0.994901
9        -0.994878
10       -0.994866
11       -0.994866
12       -0.994866
13       -0.994854
14       -0.994843
15       -0.994843
16       -0.994843
17       -0.994831
18       -0.994819
19       -0.994807
20       -0.994796
21       -0.994784
22       -0.994772
23       -0.994772
24       -0.994725
25       -0.994725
26       -0.994713
27       -0.994713
28       -0.994713
29       -0.994713
            ...   
284777    1.034693
284778    1.034693
284779    1.034716
284780    1.034716
284781    1.034716
284782    1.034728
284783    1.034740
284784    1.034740
284785    1.034740
284786    1.034740
284787    1.034751
284788    1.034763
284789    1.034763
284790    1.034775
284791    1.034810
284792    1.034810
284793    1.034822
284794    1.034845
284795    1.034857
284796    1.034881
284797    1.034904
284798    1.

In [7]:
# now drop the original columns
df.drop(['Time','Amount'], axis=1, inplace=True)
df.head()

Unnamed: 0,V1,V2,V3,V4,V5,V6,V7,V8,V9,V10,...,V22,V23,V24,V25,V26,V27,V28,Class,scaled_Amount,scaled_Time
0,-1.359807,-0.072781,2.536347,1.378155,-0.338321,0.462388,0.239599,0.098698,0.363787,0.090794,...,0.277838,-0.110474,0.066928,0.128539,-0.189115,0.133558,-0.021053,0,1.783274,-0.994983
1,1.191857,0.266151,0.16648,0.448154,0.060018,-0.082361,-0.078803,0.085102,-0.255425,-0.166974,...,-0.638672,0.101288,-0.339846,0.16717,0.125895,-0.008983,0.014724,0,-0.269825,-0.994983
2,-1.358354,-1.340163,1.773209,0.37978,-0.503198,1.800499,0.791461,0.247676,-1.514654,0.207643,...,0.771679,0.909412,-0.689281,-0.327642,-0.139097,-0.055353,-0.059752,0,4.983721,-0.994972
3,-0.966272,-0.185226,1.792993,-0.863291,-0.010309,1.247203,0.237609,0.377436,-1.387024,-0.054952,...,0.005274,-0.190321,-1.175575,0.647376,-0.221929,0.062723,0.061458,0,1.418291,-0.994972
4,-1.158233,0.877737,1.548718,0.403034,-0.407193,0.095921,0.592941,-0.270533,0.817739,0.753074,...,0.798278,-0.137458,0.141267,-0.20601,0.502292,0.219422,0.215153,0,0.670579,-0.99496


In [8]:
# Lets define X and y variables
X = df.loc[:, df.columns != 'Class']
y = df.loc[:, df.columns == 'Class']

import matplotlib.pyplot as plt
# Define a function to create a scatter plot of  data and Class labels
def plot_data(X, y):
    plt.scatter(X[y == 0, 0], X[y == 0, 1], label="Class #0", alpha=0.5, linewidth=0.15)
    plt.scatter(X[y == 1, 0], X[y == 1, 1], label="Class #1", alpha=0.5, linewidth=0.15, c='r') 
    plt.legend()
    return plt.show()

#x=df.loc[df['Class']==0]
#y=df.loc[df['Class']==1]
plot_data(X,y)

Let us perform SMOTE (Synthetic Minority Oversampling Technique) sampling over the data set


In [13]:
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline

resampling = SMOTE(sampling_strategy=0.1)
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.3,random_state=0)

#Applying resampling to the traing data only
X_resampled,y_resampled=resampling.fit_sample(X_train,y_train)

# Define the pipeline, tell it to combine SMOTE with the Logistic Regression model
#pipeline = Pipeline([('SMOTE', resampling), ('Logistic Regression', model)])

In [15]:
from sklearn.linear_model import LogisticRegression
#We will fit logistic regression with sampled data
model=LogisticRegression()
model.fit(X_resampled,y_resampled)


  return f(**kwargs)


LogisticRegression()

In [20]:
from sklearn.metrics import classification_report
#Performance metrics
predicted=model.predict(X_test)
print(classification_report(y_test,predicted))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00     85296
           1       0.41      0.85      0.56       147

    accuracy                           1.00     85443
   macro avg       0.71      0.92      0.78     85443
weighted avg       1.00      1.00      1.00     85443



In [22]:
from sklearn.metrics import confusion_matrix

confusion_matrix_out=confusion_matrix(y_test,predicted)
print(confusion_matrix_out)

[[85118   178]
 [   22   125]]
