# Predicting Credit Fraud & Credit Default
## Dealing with Imbalanced Data

In [None]:
# @title Imports

import pandas as pd
import numpy as np

from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score
from sklearn.metrics import precision_recall_curve, auc
from sklearn.model_selection import train_test_split

Get the data at https://www.kaggle.com/mlg-ulb/creditcardfraud

In [None]:
# Load the credit-card transactions CSV from the current working directory.
data = pd.read_csv('./creditcard.csv')
# Show (rows, columns) as the cell output.
data.shape

(284807, 31)

Target: 1 = fraud;
0 = ok

Each row = 1 transaction

Amount = amount of transaction

V1-28 = masked features for privacy... not ideal (see credit default dataset for named features)

Time = seconds since first transaction

In [None]:
# Preview the first rows of the transactions table.
data.head()

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
0,0.0,-1.359807,-0.072781,2.536347,1.378155,-0.338321,0.462388,0.239599,0.098698,0.363787,...,-0.018307,0.277838,-0.110474,0.066928,0.128539,-0.189115,0.133558,-0.021053,149.62,0
1,0.0,1.191857,0.266151,0.16648,0.448154,0.060018,-0.082361,-0.078803,0.085102,-0.255425,...,-0.225775,-0.638672,0.101288,-0.339846,0.16717,0.125895,-0.008983,0.014724,2.69,0
2,1.0,-1.358354,-1.340163,1.773209,0.37978,-0.503198,1.800499,0.791461,0.247676,-1.514654,...,0.247998,0.771679,0.909412,-0.689281,-0.327642,-0.139097,-0.055353,-0.059752,378.66,0
3,1.0,-0.966272,-0.185226,1.792993,-0.863291,-0.010309,1.247203,0.237609,0.377436,-1.387024,...,-0.1083,0.005274,-0.190321,-1.175575,0.647376,-0.221929,0.062723,0.061458,123.5,0
4,2.0,-1.158233,0.877737,1.548718,0.403034,-0.407193,0.095921,0.592941,-0.270533,0.817739,...,-0.009431,0.798278,-0.137458,0.141267,-0.20601,0.502292,0.219422,0.215153,69.99,0


In [None]:
# Proportion of positive samples: Class is coded 0/1 (1 = fraud), so its mean
# is the share of fraudulent transactions.
data.Class.mean()

0.001727485630620034

## Train and validation split

In [None]:
# Features: every column except the last one.
X = data.iloc[:,:-1]
# Target: the Class column as a NumPy array.
y = data.Class.values

In [None]:
def train_test_split_scaled(x: pd.DataFrame,
                            y: np.ndarray,
                            test_size: float = 0.2,
                            random_state: int = 8675309,
                            stratify: np.ndarray = None,
                            sample: float = None) -> tuple:
  """Optionally sub-sample the data, split it into train/validation sets, then
  standardize the features.

  The scaler is fitted on the training features only and then applied to the
  validation features, so no validation statistics leak into training.

  Arguments:
    x (pd.DataFrame): The matrix of features.
    y (np.ndarray): A vector of targets.
    test_size (float): Passed to `train_test_split` as the validation share.
    random_state (int): Seed used for both the sub-sampling and the split.
    stratify (np.ndarray): Optional labels to stratify on, aligned row-for-row
      with `x`. None disables stratification.
    sample (float): Optional fraction strictly between 0 and 1. When given,
      only that share of the rows is kept before the train/validation split.
      Any other value (including None) disables sub-sampling.

  Returns:
    A tuple containing x_train, x_val, y_train and y_val. x_train and x_val
    are the arrays returned by `StandardScaler`.
  """

  if sample is not None and 0 < sample < 1:
    if stratify is None:
      _, x, _, y = train_test_split(
          x, y, test_size=sample, random_state=random_state)
    else:
      # Split the stratification labels together with x and y so they stay
      # aligned with the sampled rows (also when they are not simply `y`).
      _, x, _, y, _, stratify = train_test_split(
          x, y, stratify,
          test_size=sample, random_state=random_state, stratify=stratify)

  x_train, x_val, y_train, y_val = train_test_split(
      x, y, test_size=test_size, random_state=random_state, stratify=stratify)

  # Fit the scaler on the training split only, then reuse it for validation.
  scaler = StandardScaler()
  x_train = scaler.fit_transform(x_train)
  x_val = scaler.transform(x_val)
  return x_train, x_val, y_train, y_val

## Logistic regression baseline

In [None]:
# Keep a 10% stratified sample of the data, then split it into train and
# validation sets (default test_size of 0.2) with standardized features.
x_train, x_val, y_train, y_val = train_test_split_scaled(
    X, y, stratify=y, sample=0.1)

In [None]:
# Sanity check: number of training and validation targets.
y_train.shape, y_val.shape

((22784,), (5697,))

In [None]:
# Baseline: logistic regression fitted on the training set without resampling.
lr = LogisticRegression().fit(x_train, y_train)

# Score the validation set with the second predict_proba column (class 1).
y_hat_val = lr.predict_proba(x_val)[:, 1]
precision, recall, _ = precision_recall_curve(y_val, y_hat_val)
aucpr, aucroc = auc(recall, precision), roc_auc_score(y_val, y_hat_val)
print('No sampling: aucpr = %.3f; aucroc = %.3f' % (aucpr, aucroc))

No sampling: aucpr = 0.595; aucroc = 0.886


## Logistic regression with weights

In [None]:
# Sweep the weight of class 1 while the weight of class 0 stays at 1.0.
for w in [2, 5, 10, 100, 500, 1000, 10000]:
  weights = {0:1.0, 1:w}
  lr = LogisticRegression(
      solver='lbfgs', max_iter=1000, class_weight=weights).fit(x_train, y_train)
  y_hat_val = lr.predict_proba(x_val)[:, 1]
  precision, recall, thresholds = precision_recall_curve(y_val, y_hat_val)
  auc_precision_recall, auc_roc = (
      auc(recall, precision), roc_auc_score(y_val, y_hat_val))
  print('weight: %d auc precision recall %.3f auc roc %.3f' %
        (w, auc_precision_recall, auc_roc))

weight: 2 auc precision recall 0.619 auc roc 0.936
weight: 5 auc precision recall 0.638 auc roc 0.946
weight: 10 auc precision recall 0.599 auc roc 0.934
weight: 100 auc precision recall 0.487 auc roc 0.890
weight: 500 auc precision recall 0.528 auc roc 0.879
weight: 1000 auc precision recall 0.553 auc roc 0.876
weight: 10000 auc precision recall 0.599 auc roc 0.867


## Undersampling

`pip install imbalanced-learn  # provides the imblearn module, in case it's not preinstalled`

In [None]:
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import CondensedNearestNeighbour, TomekLinks
from collections import Counter

In [None]:
# @title split data again using a 10% stratified sample
# Features come back standardized; the default seed of the helper is used.
x_train, x_val, y_train, y_val = train_test_split_scaled(
    X, y, stratify=y, sample=0.1)

In [None]:
# @title undersampling with Condensed Nearest Neighbor
# Class counts before resampling.
print('Original dataset shape %s' % Counter(y_train))

cnn = CondensedNearestNeighbour(random_state=42, n_neighbors=1)
# %time reports the CPU and wall time of the resampling call.
%time x_train_cnn, y_train_cnn = cnn.fit_resample(x_train, y_train)

# Class counts after resampling.
print('CNN: Resampled dataset shape %s' % Counter(y_train_cnn))

Original dataset shape Counter({0: 22745, 1: 39})
CPU times: user 11min 49s, sys: 41.2 s, total: 12min 30s
Wall time: 9min 20s
CNN: Resampled dataset shape Counter({0: 101, 1: 39})


In [None]:
# Train on the CNN-undersampled data; evaluate on the untouched validation set.
lr = LogisticRegression().fit(x_train_cnn, y_train_cnn)

y_hat_val = lr.predict_proba(x_val)[:, 1]
precision, recall, _ = precision_recall_curve(y_val, y_hat_val)
aucpr, aucroc = auc(recall, precision), roc_auc_score(y_val, y_hat_val)
print('CNN: aucpr = %.3f; aucroc = %.3f' % (aucpr, aucroc))

CNN: aucpr = 0.130; aucroc = 0.707


In [None]:
# @title undersampling with TomekLinks
# Class counts before resampling.
print('Original dataset shape %s' % Counter(y_train))

tl = TomekLinks()
# %time reports the CPU and wall time of the resampling call.
%time x_train_tl, y_train_tl = tl.fit_resample(x_train, y_train)

# Class counts after resampling.
print('TomekLinks: Resampled dataset shape %s' % Counter(y_train_tl))

Original dataset shape Counter({0: 22745, 1: 39})
CPU times: user 2min 52s, sys: 1.11 s, total: 2min 53s
Wall time: 26.7 s
TomekLinks: Resampled dataset shape Counter({0: 22744, 1: 39})


In [None]:
# Train on the TomekLinks-cleaned data; evaluate on the untouched validation set.
lr = LogisticRegression().fit(x_train_tl, y_train_tl)

y_hat_val = lr.predict_proba(x_val)[:, 1]
precision, recall, _ = precision_recall_curve(y_val, y_hat_val)
aucpr, aucroc = auc(recall, precision), roc_auc_score(y_val, y_hat_val)
print('TomekLinks: aucpr = %.3f; aucroc = %.3f' % (aucpr, aucroc))

TomekLinks: aucpr = 0.697; aucroc = 0.972


## Oversampling

In [None]:
# @title oversampling with SMOTE
# Class counts before resampling.
print('Original dataset shape %s' % Counter(y_train))

# sampling_strategy=0.5 targets a minority:majority ratio of 1:2 after
# resampling. The seed makes the synthetic samples reproducible across runs.
smote = SMOTE(sampling_strategy=0.5, random_state=42)
x_train_smote, y_train_smote = smote.fit_resample(x_train, y_train)

# Class counts after resampling.
print('SMOTE: Resampled dataset shape %s' % Counter(y_train_smote))

Original dataset shape Counter({0: 22745, 1: 39})
Resampled dataset shape Counter({0: 22745, 1: 11372})


In [None]:
# Default solver l-bfgs-b failed to converge, hence newton-cg.
lr = LogisticRegression(solver='newton-cg').fit(x_train_smote, y_train_smote)

# Evaluate on the validation set, which was not resampled.
y_hat_val = lr.predict_proba(x_val)[:, 1]
precision, recall, _ = precision_recall_curve(y_val, y_hat_val)
aucpr, aucroc = auc(recall, precision), roc_auc_score(y_val, y_hat_val)
print('SMOTE: aupr = %.3f; auroc = %.3f' % (aucpr, aucroc))

SMOTE: aupr = 0.534; auroc = 0.896


# Lab:
## 1. Look up "one-sided selection" method. Implement.

In [None]:
from imblearn.under_sampling import OneSidedSelection, RandomUnderSampler

In [None]:
# Rebuild the scaled train/validation split from a 10% stratified sample.
x_train, x_val, y_train, y_val = train_test_split_scaled(X, y, stratify=y, sample=0.1)

In [None]:
# Sanity check: (rows, features) of the training matrix.
x_train.shape

(22784, 30)

In [None]:
# @title undersample with RandomUnderSampler
# Class counts before resampling.
print('Original dataset shape %s' % Counter(y_train))

# sampling_strategy=0.02 targets a minority:majority ratio of 1:50 after
# resampling. The seed makes the random selection reproducible across runs.
rus = RandomUnderSampler(sampling_strategy=0.02, random_state=42)
x_train_rus, y_train_rus = rus.fit_resample(x_train, y_train)

# Class counts after resampling.
print('RandomUnderSampler: Resampled dataset shape %s' % Counter(y_train_rus))

Original dataset shape Counter({0: 22745, 1: 39})
RandomUnderSampler: Original dataset shape Counter({0: 1950, 1: 39})


In [None]:
# Train on the randomly undersampled data; evaluate on the validation set.
lr = LogisticRegression().fit(x_train_rus, y_train_rus)

y_hat_val = lr.predict_proba(x_val)[:, 1]
precision, recall, _ = precision_recall_curve(y_val, y_hat_val)
aucpr, aucroc = auc(recall, precision), roc_auc_score(y_val, y_hat_val)
print('RandomUnderSampler: aucpr %.3f; aucroc %.3f' % (aucpr, aucroc))

RandomUnderSampler: aucpr 0.352; aucroc 0.970


In [None]:
# @title undersample with OneSidedSelection
# Class counts before resampling.
print('Original dataset shape %s' % Counter(y_train))

# Seeded so that the resampling is reproducible across runs.
oss = OneSidedSelection(
    n_neighbors=1, n_seeds_S=10, n_jobs=-1, random_state=42)
x_train_oss, y_train_oss = oss.fit_resample(x_train, y_train)

# Class counts of the OneSidedSelection output.
print('OneSidedSelection: Resampled dataset shape %s' % Counter(y_train_oss))

Original dataset shape Counter({0: 22745, 1: 39})
OneSidedSelection: Original dataset shape Counter({0: 1950, 1: 39})


In [None]:
# Train on the one-sided-selection data; evaluate on the validation set.
lr = LogisticRegression().fit(x_train_oss, y_train_oss)

y_hat_val = lr.predict_proba(x_val)[:, 1]
precision, recall, _ = precision_recall_curve(y_val, y_hat_val)
aucpr, aucroc = auc(recall, precision), roc_auc_score(y_val, y_hat_val)
print('OneSidedSelection: aucpr %.3f; aucroc %.3f' % (aucpr, aucroc))

OneSidedSelection: aucpr 0.753; aucroc 0.935


## 2. Try out other sampling methods... e.g. ENN, etc.



## 3. Write a pipeline with cross-validation to tune the hyperparameter sampling_strategy in SMOTE. Note that oversampling/undersampling needs to be done inside the cross-validation loop.

In [None]:
def stratified_sample(x: pd.DataFrame,
                      y: np.array,
                      sample: float = 0.1,
                      random_state: int = 8675309) -> tuple:
  """Draw a random subset of the rows of x and y, stratified on y.

  The subset is the "test" side of a `train_test_split` call.

  Arguments:
    x (pd.DataFrame): The matrix of features.
    y (np.array): A vector of targets, also used as stratification labels.
    sample (float): Passed as `test_size`, i.e. the share of rows to keep.
    random_state (int): Seed for the split.

  Returns:
    A tuple containing the sampled x and the sampled y.
  """
  parts = train_test_split(
      x, y, test_size=sample, random_state=random_state, stratify=y)
  # parts is [x_rest, x_sampled, y_rest, y_sampled]; keep the sampled halves.
  return parts[1], parts[3]

# Features: every column except the last one; target: the Class column.
X = data.iloc[:,:-1]
y = data.Class.values

# Work on a stratified sample (default share of 0.1) of the data.
x_sample, y_sample = stratified_sample(X, y)

# Unscaled train/validation split: scaling is done inside the CV loop instead.
x_train, x_val, y_train, y_val = train_test_split(
    x_sample, y_sample, test_size=0.2, random_state=42, stratify=y_sample)

In [None]:
# Sanity check: (rows, features) of the unscaled training frame.
x_train.shape

(22784, 30)

In [None]:
def smote_cv(x, y, folds: int = 3, sampling_strategy: float = 0.5,
             random_state: int = 8675309) -> tuple:
  """Cross-validate a logistic regression with SMOTE applied inside each fold.

  For every stratified fold the features are standardized using the training
  part only, the training part is oversampled with SMOTE, and a logistic
  regression is fitted. The validation part is never resampled. Predictions
  from all folds are pooled before the metrics are computed.

  Arguments:
    x: Feature matrix (array-like, converted with `np.asarray`).
    y: Target vector (array-like, converted with `np.asarray`).
    folds (int): Number of stratified folds.
    sampling_strategy (float): Passed to SMOTE as `sampling_strategy`.
    random_state (int): Seed for SMOTE so repeated runs give the same result.

  Returns:
    A tuple (aucpr, aucroc) computed on the pooled out-of-fold predictions.
  """
  # Imported here so the function does not depend on a later notebook cell.
  from sklearn.model_selection import StratifiedKFold

  # Positional indexing below needs arrays rather than DataFrames/Series.
  x = np.asarray(x)
  y = np.asarray(y)

  predicted = []
  actual = []

  kf = StratifiedKFold(n_splits=folds)

  for train_index, val_index in kf.split(x, y):
    x_train, x_val = x[train_index], x[val_index]
    y_train, y_val = y[train_index], y[val_index]

    # Scale using training data.
    scaler = StandardScaler()
    x_train = scaler.fit_transform(x_train)
    x_val = scaler.transform(x_val)

    # Oversample with SMOTE (training part of the fold only).
    smote = SMOTE(sampling_strategy=sampling_strategy,
                  random_state=random_state)
    x_train, y_train = smote.fit_resample(x_train, y_train)

    # Train LR.
    lr = LogisticRegression(max_iter=1000).fit(x_train, y_train)

    # Predict on validation set.
    predicted.append(lr.predict_proba(x_val)[:, 1])
    actual.append(y_val)

  # Pool the out-of-fold predictions before scoring.
  actual = np.concatenate(actual)
  predicted = np.concatenate(predicted)
  precision, recall, _ = precision_recall_curve(actual, predicted)

  aucpr = auc(recall, precision)
  aucroc = roc_auc_score(actual, predicted)

  return aucpr, aucroc

In [None]:
# Sanity check: features and targets have the same number of rows.
x_train.shape, y_train.shape

((22784, 30), (22784,))

In [None]:
from sklearn.model_selection import StratifiedKFold

# Evaluate each candidate SMOTE sampling_strategy with 3-fold cross-validation.
# smote_cv indexes its inputs positionally, hence the conversion to .values.
for s in [0.01, 0.05, 0.1, 0.2, 0.5]:
  aucpr, aucroc = smote_cv(
      x_train.values, y_train, folds=3, sampling_strategy=s)
  print("sampling_strategy=%.3f; aucpr=%.3f; aucroc=%.3f" % (s, aucpr, aucroc))

sampling_strategy=0.010; aucpr=0.738; aucroc=0.947
sampling_strategy=0.050; aucpr=0.694; aucroc=0.942
sampling_strategy=0.100; aucpr=0.685; aucroc=0.936
sampling_strategy=0.200; aucpr=0.649; aucroc=0.934
sampling_strategy=0.500; aucpr=0.626; aucroc=0.933


## 4. Try to build your own model on a new dataset.

Test out different over/undersampling methods and find the one that returns the most improvement in ROCAUC, PRAUC, F score, etc. For the feature set, use one-hot encoding on all the categorical features.


In [None]:
from sklearn.datasets import fetch_openml
from collections import Counter

# Dataset page: https://www.openml.org/search?type=data&status=active&id=31
credit = fetch_openml('credit-g', parser='auto', version=1)
x, y = credit.data, credit.target

# Class counts of the target.
print('Original dataset shape %s' % Counter(y))

Original dataset shape Counter({'good': 700, 'bad': 300})


## References
1. https://www.kaggle.com/janiobachmann/credit-fraud-dealing-with-imbalanced-datasets
2. https://machinelearningmastery.com/undersampling-algorithms-for-imbalanced-classification/