In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

In [2]:
# Read the credit-card transactions CSV (path is relative to the notebook's
# working directory) into a DataFrame.
data = pd.read_csv(filepath_or_buffer='Datasets/creditcard.csv')

In [3]:
# Rows per label: the dataset is heavily imbalanced, with one class vastly
# outnumbering the other.
data.Class.value_counts()

0    284315
1       492
Name: Class, dtype: int64

In [4]:
# Per-class mean of every column. Label 0 = legitimate, label 1 = fraud.
data.groupby('Class').mean()

Unnamed: 0_level_0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V20,V21,V22,V23,V24,V25,V26,V27,V28,Amount
Class,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,94838.202258,0.008258,-0.006271,0.012171,-0.00786,0.005453,0.002419,0.009637,-0.000987,0.004467,...,-0.000644,-0.001235,-2.4e-05,7e-05,0.000182,-7.2e-05,-8.9e-05,-0.000295,-0.000131,88.291022
1,80746.806911,-4.771948,3.623778,-7.033281,4.542029,-3.151225,-1.397737,-5.568731,0.570636,-2.581123,...,0.372319,0.713588,0.014049,-0.040308,-0.10513,0.041449,0.051648,0.170575,0.075667,122.211321


In [5]:
# To deal with the class imbalance we use undersampling: keep every fraud row and
# draw a random subset of the legit rows of the same size.

In [6]:
# Partition the transactions by label (0 = legit, 1 = fraud) using boolean masks.
legit = data.loc[data['Class'] == 0]
fraud = data.loc[data['Class'] == 1]

In [7]:
# Undersample the majority class: draw as many legit rows as there are fraud
# rows. The size comes from `fraud` itself rather than a hard-coded count, and
# random_state pins the draw so a fresh Restart & Run All selects the same rows
# (same seed value as the train/test split further down).
legit_sample = legit.sample(n=len(fraud), random_state=2)


In [8]:
# Stack the sampled legit rows and the fraud rows into one frame. axis=0
# concatenates along the row axis: rows are appended, columns stay the same.
new = pd.concat(objs=[legit_sample, fraud], axis=0)

In [9]:
# Both labels now have the same number of rows, although the balanced set is
# small overall.
new.Class.value_counts()

0    492
1    492
Name: Class, dtype: int64

In [10]:
# Dimensions of the undersampled frame as (rows, columns).
new.shape

(984, 31)

In [11]:
# Features: every column except the label. Target: the label column itself.
Y = new.Class
X = new.drop(columns=['Class'])

In [12]:
# Hold out 20% of the balanced rows for evaluation. random_state pins the shuffle
# so the split is repeatable; stratify=Y keeps the legit/fraud proportion the
# same in the train and test partitions instead of leaving it to chance.
X_train, X_test, Y_train, Y_test = train_test_split(
    X, Y, test_size=0.2, stratify=Y, random_state=2
)

In [13]:
# Logistic-regression classifier created with scikit-learn's default settings
# (no hyperparameters are overridden here).
# NOTE(review): no feature scaling is applied in the visible cells - if fitting
# emits a ConvergenceWarning, consider scaling the features or raising max_iter; confirm.
model = LogisticRegression()

In [14]:
# Fit the classifier on the training split, then score it on those same rows to
# get a training-accuracy baseline.
model.fit(X_train, Y_train)
train_model = model.predict(X_train)  # predicted labels for the training rows
# accuracy_score takes (y_true, y_pred): ground-truth labels first, predictions second.
train_accuracy = accuracy_score(Y_train, train_model)
print("Train Accuracy: ",train_accuracy)

Train Accuracy:  0.9453621346886912


In [15]:
# Score the fitted classifier on the held-out rows it did not see during fitting.
test_model = model.predict(X_test)  # predicted labels for the test rows
# accuracy_score takes (y_true, y_pred): ground-truth labels first, predictions second.
test_accuracy = accuracy_score(Y_test, test_model)
print("Test Accuracy: ",test_accuracy)

Test Accuracy:  0.9390862944162437


In [16]:
# Side-by-side table of true test labels and the model's predictions, on a fresh
# 0..n-1 index; show the first few rows.
df_pred = pd.DataFrame({'Actual': Y_test.values, 'Predicted': test_model})
df_pred.head()

Unnamed: 0,Actual,Predicted
0,0,0
1,1,1
2,1,1
3,0,0
4,1,1
