In [1]:
import pandas as pd
import numpy as np
import scipy
import random
import matplotlib.pyplot as plt
%matplotlib inline

from sklearn import linear_model
import statsmodels.formula.api as smf
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import cross_val_score

from sklearn.utils import resample

I want to make a model that can predcit cases of credit card fraud

In [2]:
fraud = pd.read_csv('C:\Code\Data\creditcard.csv')

In [3]:
fraud.head()

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
0,0.0,-1.359807,-0.072781,2.536347,1.378155,-0.338321,0.462388,0.239599,0.098698,0.363787,...,-0.018307,0.277838,-0.110474,0.066928,0.128539,-0.189115,0.133558,-0.021053,149.62,0
1,0.0,1.191857,0.266151,0.16648,0.448154,0.060018,-0.082361,-0.078803,0.085102,-0.255425,...,-0.225775,-0.638672,0.101288,-0.339846,0.16717,0.125895,-0.008983,0.014724,2.69,0
2,1.0,-1.358354,-1.340163,1.773209,0.37978,-0.503198,1.800499,0.791461,0.247676,-1.514654,...,0.247998,0.771679,0.909412,-0.689281,-0.327642,-0.139097,-0.055353,-0.059752,378.66,0
3,1.0,-0.966272,-0.185226,1.792993,-0.863291,-0.010309,1.247203,0.237609,0.377436,-1.387024,...,-0.1083,0.005274,-0.190321,-1.175575,0.647376,-0.221929,0.062723,0.061458,123.5,0
4,2.0,-1.158233,0.877737,1.548718,0.403034,-0.407193,0.095921,0.592941,-0.270533,0.817739,...,-0.009431,0.798278,-0.137458,0.141267,-0.20601,0.502292,0.219422,0.215153,69.99,0


In [4]:
fraud.Class.value_counts()

0    284315
1       492
Name: Class, dtype: int64

The classes are extremely inbalanced.  There are more than 280,000 reputable cases and only 492 fradulent ones.

I'll run the model and see what happens

In [5]:
X = fraud.drop(['Class'], 1)
Y = fraud.Class

In [73]:
lr = linear_model.LogisticRegression(C=1e10)
lr.fit(X, Y)

LogisticRegression(C=10000000000.0, class_weight=None, dual=False,
          fit_intercept=True, intercept_scaling=1, max_iter=100,
          multi_class='ovr', n_jobs=1, penalty='l2', random_state=None,
          solver='liblinear', tol=0.0001, verbose=0, warm_start=False)

In [90]:
print('R² for the Vanilla training data:')
print(lr.score(X, Y))
print("\nConfusion Matrix:")
confusion_matrix(Y, Y_pred)

R² for the Vanilla training data:
0.9654783765848451

Confusion Matrix:


array([[284240,     75],
       [   203,    289]], dtype=int64)

In [82]:
score = lr.score(X,Y)
Y_pred = lr.predict(X)
real_charges, fake_charges = confusion_matrix(Y, Y_pred)
type2 = fake_charges[0]/(fake_charges[0]+fake_charges[1])
print("Overall Logistic Model Score:")
print(score)
print("\nType II Error percentage:")
print(type2*100,"%")

Overall Lasso Model Score:
0.9990239003957065

Type II Error percentage:
41.260162601626014 %


In [91]:
cross_val_score(lr, X, Y)

array([0.99296368, 0.99915733, 0.99849371])

## Resampling Classes and Creating New Training Data

In [11]:
real = fraud[fraud.Class==0]
fake = fraud[fraud.Class==1]
 
# Downsample real purchases class
real_downsampled = resample(real, replace=True, n_samples=2000)
#Upsample fake purchases class
fake_upsampled = resample(fake, replace=True, n_samples=2000)

sampled_fraud = pd.concat([real_downsampled, fake_upsampled])
 
# Display new class counts
sampled_fraud.Class.value_counts()

1    2000
0    2000
Name: Class, dtype: int64

In [12]:
Xs = sampled_fraud.drop(['Class'], 1)
Ys = sampled_fraud.Class

In [93]:
lrs = linear_model.LogisticRegression(C=1e10)
lrs.fit(Xs, Ys)

LogisticRegression(C=10000000000.0, class_weight=None, dual=False,
          fit_intercept=True, intercept_scaling=1, max_iter=100,
          multi_class='ovr', n_jobs=1, penalty='l2', random_state=None,
          solver='liblinear', tol=0.0001, verbose=0, warm_start=False)

In [101]:
score = lrs.score(Xs,Ys)
Y_preds = lr.predict(Xs)
real_charges, fake_charges = confusion_matrix(Ys, Y_preds)
type2 = fake_charges[0]/(fake_charges[0]+fake_charges[1])
print("Overall Logistic Model Score:")
print(score)
print("\nType II Error percentage:")
print(type2*100,"%")

Overall Logistic Model Score:
0.94775

Type II Error percentage:
7.75 %


In [115]:
cross_val_score(lrs, Xs, Ys)

array([0.95427286, 0.92953523, 0.93093093])

In [102]:
confusion_matrix(Ys, Y_preds)

array([[1946,   54],
       [ 155, 1845]], dtype=int64)

## Ridge Logistic Regression

In [64]:
grid = [.01,.1, 1,10,100,200,300,500,700,1000,10000] 
out = [] 
for c in grid: 
    lrr = linear_model.LogisticRegression(C=c) 
    fitRidge = lrr.fit(Xs, Ys) 
    scores = cross_val_score(lrr, Xs, Ys, cv=3) 
    out.append(scores.mean()) 
    bestc = grid[out.index(max(out))] 

lrr = linear_model.LogisticRegression(C=bestc) 
lrr.fit(Xs,Ys)
print("\nThe model was fit using ",bestc)


The model was fit using  0.1


In [109]:
score = lrr.score(Xs,Ys)
Y_preds = lrr.predict(Xs)
real_charges, fake_charges = confusion_matrix(Ys, Y_preds)
type2 = fake_charges[0]/(fake_charges[0]+fake_charges[1])
print("Overall Ridge Model Score:")
print(score)
print("\nType II Error percentage:")
print(type2*100,"%")

Overall Ridge Model Score:
0.94825

Type II Error percentage:
7.75 %


In [116]:
cross_val_score(lrr, Xs, Ys)

array([0.95427286, 0.92953523, 0.93093093])

In [110]:
confusion_matrix(Ys, Y_preds)

array([[1948,   52],
       [ 155, 1845]], dtype=int64)

## Lasso Logistic Regression

In [63]:
grid = [.01,.1, 1, 10,100,200,300,500,700,1000,10000] 
out = [] 
for c in grid: 
    lrl = linear_model.LogisticRegression(penalty='l1',C=c) 
    lrl.fit(Xs, Ys) 
    scores = cross_val_score(lrl, Xs, Ys, cv=3) 
    out.append(scores.mean()) 
    bestc = grid[out.index(max(out))] 

lrl = linear_model.LogisticRegression(penalty='l1',C=bestc) 
lrl.fit(Xs,Ys)
print("\nThe model was fit using ",bestc)


The model was fit using  200


In [107]:
score = lrl.score(Xs,Ys)
Y_preds = lrl.predict(Xs)
real_charges, fake_charges = confusion_matrix(Ys, Y_preds)
type2 = fake_charges[0]/(fake_charges[0]+fake_charges[1])
print("Overall Lasso Model Score:")
print(score)
print("\nType II Error percentage:")
print(type2*100,"%")

Overall Lasso Model Score:
0.94875

Type II Error percentage:
8.1 %


In [117]:
cross_val_score(lrl, Xs, Ys)

array([0.95727136, 0.94152924, 0.93843844])

## Using Testing and Training Data

In [108]:
confusion_matrix(Ys, Ys_pred)

array([[1946,   54],
       [ 155, 1845]], dtype=int64)

In [111]:
trainsize = int(len(real_downsampled)/2)
X_train = pd.concat([real_downsampled.iloc[trainsize:, :30], fake_upsampled.iloc[trainsize:, :30]])
Y_train = pd.concat([real_downsampled.iloc[trainsize:, 30], fake_upsampled.iloc[trainsize:, 30]])
X_test = pd.concat([real_downsampled.iloc[:trainsize, :30], fake_upsampled.iloc[:trainsize, :30]])
Y_test = pd.concat([real_downsampled.iloc[:trainsize, 30], fake_upsampled.iloc[:trainsize, 30]])

In [112]:
lrt = linear_model.LogisticRegression(C=1e10) 
lrt.fit(X_train,Y_train)

LogisticRegression(C=10000000000.0, class_weight=None, dual=False,
          fit_intercept=True, intercept_scaling=1, max_iter=100,
          multi_class='ovr', n_jobs=1, penalty='l2', random_state=None,
          solver='liblinear', tol=0.0001, verbose=0, warm_start=False)

In [113]:
score = lrt.score(X_train,Y_train)
Y_preds = lrt.predict(X_train)
real_charges, fake_charges = confusion_matrix(Y_train, Y_preds)
type2 = fake_charges[0]/(fake_charges[0]+fake_charges[1])
print("Overall Lasso Model Score:")
print(score)
print("\nType II Error percentage:")
print(type2*100,"%")

Overall Lasso Model Score:
0.945

Type II Error percentage:
8.5 %


In [114]:
score = lrt.score(X_test,Y_test)
Y_preds = lrt.predict(X_test)
real_charges, fake_charges = confusion_matrix(Y_test, Y_preds)
type2 = fake_charges[0]/(fake_charges[0]+fake_charges[1])
print("Overall Lasso Model Score:")
print(score)
print("\nType II Error percentage:")
print(type2*100,"%")

Overall Lasso Model Score:
0.949

Type II Error percentage:
7.9 %


In [119]:
cross_val_score(lrt, X_train, Y_train)

array([0.94011976, 0.93843844, 0.94144144])

In [120]:
cross_val_score(lrt, X_test, Y_test)

array([0.92664671, 0.93543544, 0.93243243])