In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import LinearSVC
from sklearn.metrics import classification_report
from sklearn.metrics import precision_recall_fscore_support

from scipy.stats import ttest_ind

%matplotlib inline


In [2]:
# Load the dataset from its CSV file into a DataFrame
csv_path = '..//Datasets//creditcard.csv'
df_raw = pd.read_csv(csv_path)

# Report the (rows, columns) dimensions, then preview the first rows
print(df_raw.shape)
df_raw.head()

(284807, 31)


Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
0,0.0,-1.359807,-0.072781,2.536347,1.378155,-0.338321,0.462388,0.239599,0.098698,0.363787,...,-0.018307,0.277838,-0.110474,0.066928,0.128539,-0.189115,0.133558,-0.021053,149.62,0
1,0.0,1.191857,0.266151,0.16648,0.448154,0.060018,-0.082361,-0.078803,0.085102,-0.255425,...,-0.225775,-0.638672,0.101288,-0.339846,0.16717,0.125895,-0.008983,0.014724,2.69,0
2,1.0,-1.358354,-1.340163,1.773209,0.37978,-0.503198,1.800499,0.791461,0.247676,-1.514654,...,0.247998,0.771679,0.909412,-0.689281,-0.327642,-0.139097,-0.055353,-0.059752,378.66,0
3,1.0,-0.966272,-0.185226,1.792993,-0.863291,-0.010309,1.247203,0.237609,0.377436,-1.387024,...,-0.1083,0.005274,-0.190321,-1.175575,0.647376,-0.221929,0.062723,0.061458,123.5,0
4,2.0,-1.158233,0.877737,1.548718,0.403034,-0.407193,0.095921,0.592941,-0.270533,0.817739,...,-0.009431,0.798278,-0.137458,0.141267,-0.20601,0.502292,0.219422,0.215153,69.99,0


In [3]:
# From the first few rows, it appears that the data is already normalized. Let's check.
# describe() produces one column of summary statistics per feature; transposing gives one row
# per feature, and sorting the index lists the features alphabetically. This replaces
# df_raw.groupby(df_raw.columns, axis=1), whose axis=1 form is deprecated in recent pandas.
df_raw.describe().T.sort_index().head(n=7)

Unnamed: 0,Unnamed: 1,count,mean,std,min,25%,50%,75%,max
Amount,Amount,284807.0,88.34962,250.120109,0.0,5.6,22.0,77.165,25691.16
Class,Class,284807.0,0.001727486,0.041527,0.0,0.0,0.0,0.0,1.0
Time,Time,284807.0,94813.86,47488.145955,0.0,54201.5,84692.0,139320.5,172792.0
V1,V1,284807.0,3.91956e-15,1.958696,-56.40751,-0.920373,0.018109,1.315642,2.45493
V10,V10,284807.0,1.768627e-15,1.08885,-24.588262,-0.535426,-0.092917,0.453923,23.745136
V11,V11,284807.0,9.170318e-16,1.020713,-4.797473,-0.762494,-0.032757,0.739593,12.018913
V12,V12,284807.0,-1.810658e-15,0.999201,-18.683715,-0.405571,0.140033,0.618238,7.848392


It appears that all of the PCA columns average out to zero and have small standard deviations. Since the Amount and Time values are much larger than those in the other columns, we will have to scale them as well. Even when the "V" columns have a large range, their IQR is still approximately -1 <-> 1, which means that there are outliers present. I will preserve the outliers in the Amount and Time columns by using sklearn's StandardScaler, which centers each column and scales it to unit variance without bounding its range. A normalization that forces everything to lie explicitly between -1 and 1 would squash those outliers, and we don't want that.

In [None]:
# Count every missing cell in the frame. Summing the per-column counts gives the true total;
# taking the max would only report the worst single column.
n_missing = df_raw.isnull().sum().sum()
print('Check for missing values: \nThere are {0:} missing values'.format(n_missing))

# The only two columns on a much larger scale than the "V" columns
columns_to_scale = ['Amount', 'Time']

# Split our training data. drop(columns=...) is used because passing the axis positionally
# (drop('Class', 1)) is no longer accepted by pandas 2.x.
X_train, X_test, y_train, y_test = train_test_split(df_raw.drop(columns='Class'),
                                                    df_raw.loc[:, 'Class'].copy(),
                                                    random_state=42)

# Fit the scaler on the training rows only, so no test-set statistics leak into the transform
scaler = StandardScaler().fit(X_train.loc[:, columns_to_scale])

# Standardize X_train and X_test values with the training-set mean and standard deviation
standardized_X = scaler.transform(X_train.loc[:, columns_to_scale])
standardized_X_test = scaler.transform(X_test.loc[:, columns_to_scale])

# Change the X_train and X_test to our new standardized values
X_train.loc[:, columns_to_scale] = standardized_X
X_test.loc[:, columns_to_scale] = standardized_X_test
X_train.head()

Check for missing values: 
There are 0 missing values


Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V20,V21,V22,V23,V24,V25,V26,V27,V28,Amount
83225,-0.73827,-1.648591,1.22813,1.370169,-1.735542,-0.029455,-0.484129,0.918645,-0.43875,0.982144,...,0.384201,-0.218076,-0.203458,-0.213015,0.011372,-0.304481,0.632063,-0.262968,-0.099863,-0.196016
52800,-1.035079,-0.234775,-0.493269,1.236728,-2.338793,-1.176733,0.885733,-1.960981,-2.363412,-2.694774,...,0.364679,-1.495358,-0.083066,0.074612,-0.347329,0.5419,-0.433294,0.089293,0.212029,-0.107223
21293,-1.331382,1.134626,-0.77446,-0.16339,-0.533358,-0.604555,-0.244482,-0.212682,0.040782,-1.136627,...,-0.396476,-0.684454,-1.855269,0.171997,-0.387783,-0.062985,0.245118,-0.061178,0.01218,0.086696
133600,-0.302019,0.069514,1.017753,1.033117,1.384376,0.223233,-0.310845,0.597287,-0.127658,-0.701533,...,0.14876,0.097023,0.369957,-0.219266,-0.124941,-0.049749,-0.112946,0.11444,0.066101,-0.306794
38225,-1.16873,-0.199441,0.610092,-0.114437,0.256565,2.290752,4.008475,-0.12353,1.038374,-0.075846,...,0.292972,-0.019733,0.165463,-0.080978,1.020656,-0.30073,-0.269595,0.481769,0.254114,-0.26002


That's kind of nice, our data has already been cleaned of missing values. Is that a result of PCA? Not necessarily in this case, because whoever made the data may have imputed, but I was wondering if PCA would fill a missing value since the data is being transformed onto a different vector space. So many questions.

In [None]:
# First, let's try an SVC with l2 regularization (LinearSVC's default penalty).
# class_weight='balanced' re-weights the classes inversely to their frequency, and a fixed
# random_state keeps the fit reproducible from one run to the next.
clf = LinearSVC(class_weight='balanced', random_state=42)
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)

# Per-class precision / recall / F1 on the held-out test set
target_names = ['Non-Fraudulent', 'Fraudulent']
print(classification_report(y_test, y_pred, target_names=target_names))

An important thing to note in this application is that banks want to be absolutely sure that they do not let fraudsters go unnoticed. They will get fined by the government out the wazoo, so they would rather report a ton of false positives because then they can be sure that they don't get fined. It mostly serves to help narrow down the search for fraudsters, and not necessarily call each of them out individually. I learned this from my mentor.

What this means for us, is that we need to have a very high recall. A recall of 1 is ideal, so let's try to force that so our branch doesn't get fined.

For my memory/understanding, 'recall' corresponds to doctors being 'really' careful not to miss cancer patients. An example of high 'precision' is a very 'precise' advertising campaign: you don't want to send ads to people who won't look at them (kind of like a false positive), so you only send ads to people who you think are more likely to respond to them. In this way you get the most bang for your advertising buck.

In [None]:
# And let's try a second LinearSVC variant.
# NOTE: loss='hinge' swaps the loss function (standard hinge instead of the default squared hinge);
# the penalty is still the default L2, so this is NOT L1 regularization. A true L1 penalty would
# need LinearSVC(penalty='l1', dual=False).
# The expectation was that an L1-style model would not be any better, because PCA has reduced the
# data to only the important eigen features, so zeroing out feature coefficients wouldn't help much.
# A fixed random_state keeps the fit reproducible from one run to the next.
clf = LinearSVC(class_weight='balanced', loss='hinge', random_state=42)
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)

# Per-class precision / recall / F1 on the held-out test set
target_names = ['Non-Fraudulent', 'Fraudulent']
print(classification_report(y_test, y_pred, target_names=target_names))

Not the first time I've been wrong, and it won't be the last! There is a way to increase our recall in the initialization of our Linear SVC, so let's learn/do that real quick and see if it gets us to 100% recall. To converge on a class weight more quickly, I will use a subset of the data. I will show that the class imbalance is similar.

In [None]:
# Sweep settings: the non-fraudulent class weight runs from `start` up to (but excluding) `stop`
# in increments of resolution / n_steps, i.e. 0.05, 0.15, ..., 0.95.
resolution = 1
n_steps = 10
start = .05
stop = 1
n_subset_examples = 20000

# Work on the first n_subset_examples rows so the sweep converges quickly.
# The slices do not change between iterations, so they are taken once up front.
X_train_subset = X_train.iloc[:n_subset_examples]
y_train_subset = y_train.iloc[:n_subset_examples]
X_test_subset = X_test.iloc[:n_subset_examples]
y_test_subset = y_test.iloc[:n_subset_examples]

# Let's use the subset and ensure our class imbalance is roughly the same using a t-test
print(ttest_ind(y_train, y_train_subset), '\n')

# Imbalance is reported as fraudulent rows per 100 non-fraudulent rows. Counting with boolean
# masks avoids a KeyError when a class happens to be absent from the subset.
print('Original Imbalance: \n{:.3f}%\n'.format(100 * (y_train == 1).sum() / (y_train == 0).sum()))
print('Subset Imbalance: \n{:.3f}%\n'.format(100 * (y_train_subset == 1).sum() /
                                           (y_train_subset == 0).sum()))

precision_scores = []
recall_scores = []
f1_scores = []
weights = []

for non_fraud_weight in np.arange(start, stop, resolution/n_steps):
    # Initialize the LinearSVC with different class weights; the two weights always sum to 1.
    # A fixed random_state makes each fit reproducible, so the curves do not change between runs.
    clf = LinearSVC(class_weight={0: non_fraud_weight, 1: 1 - non_fraud_weight},
                    loss='hinge', random_state=42)
    clf.fit(X_train_subset, y_train_subset)
    y_pred = clf.predict(X_test_subset)

    # average='binary' scores the positive (fraudulent, label 1) class only
    precision, recall, f1_score, _ = precision_recall_fscore_support(y_test_subset, y_pred, average='binary')
    precision_scores.append(precision)
    recall_scores.append(recall)
    f1_scores.append(f1_score)
    weights.append(non_fraud_weight)

In [None]:
# Draw the three metric curves against the non-fraudulent class weight on one set of axes
fig, ax = plt.subplots(figsize=(10, 10))
for scores, label in ((precision_scores, 'Precision Score'),
                      (recall_scores, 'Recall Score'),
                      (f1_scores, 'F1 Score')):
    ax.plot(weights, scores, label=label)

# Axis labels, fixed 0-1 limits on both axes, and a legend
ax.set_xlabel('Non-Fraudulent Weight')
ax.set_ylabel('Score')
ax.set_xlim([0, 1])
ax.set_ylim([0, 1])
ax.legend()
plt.show()

We can see that when the non-fraudulent class is weighted lower (and the fraudulent class is weighted higher), all three metrics improve and seem to level out as the non-fraudulent weight approaches zero. Let's see if a different classifier can get better results.

In [None]:
from sklearn.neighbors import KNeighborsClassifier

# Evaluate k-nearest-neighbours for k = 1 .. n_neighbors_range - 1 on the first
# n_subset_examples rows, recording precision, recall and F1 at each k.
precision_knn, recall_knn, f1_knn = [], [], []
n_neighbors_range = 9
n_neighbors = np.arange(1, n_neighbors_range)
for k in n_neighbors:
    knn = KNeighborsClassifier(n_neighbors=k, n_jobs=3)
    knn.fit(X_train.iloc[:n_subset_examples, :], y_train.iloc[:n_subset_examples])
    y_pred_knn = knn.predict(X_test.iloc[:n_subset_examples, :])

    # result is (precision, recall, f1, support); the first three go into their score lists
    result = precision_recall_fscore_support(y_test.iloc[:n_subset_examples], y_pred_knn, average='binary')
    for score_list, score in zip((precision_knn, recall_knn, f1_knn), result):
        score_list.append(score)
    print('Result for k={}: \n'.format(k), result)

In [None]:
# Plot recall, precision and F1 against the number of neighbours k
plt.figure(figsize=(10,10))
plt.plot(n_neighbors, recall_knn, label='Recall')
plt.plot(n_neighbors, precision_knn, label='Precision')
plt.plot(n_neighbors, f1_knn, label='F1')

# Label the axes so the figure can be read on its own
plt.xlabel('Number of Neighbors (k)')
plt.ylabel('Score')
plt.legend()
plt.xlim([0, 10])
plt.ylim([0, 1])
plt.show()

Again, I would still like to get a higher recall score. Let's try another SVC, but this time we will use a non-linear kernel. If that fails to achieve a high recall, we will try an ensemble method like random forest.

In [None]:
from sklearn.svm import SVC

# Non-linear (RBF kernel) SVC. The non-fraudulent class is weighted 0.15 and the fraudulent
# class 0.85, because that weighting seemed to work well in our LinearSVC.
rbf_class_weights = {0: 0.15, 1: 0.85}
rbf_clf = SVC(kernel='rbf', class_weight=rbf_class_weights)
rbf_clf.fit(X_train, y_train)

# Score the predictions on the test set: (precision, recall, f1, support)
y_pred_rbf = rbf_clf.predict(X_test)
result = precision_recall_fscore_support(y_test, y_pred_rbf, average='binary')
print(result)