# SMOTE for credit card fraud

In [7]:
import imblearn
import pandas as pd
from sklearn.datasets import make_classification
from collections import Counter
from matplotlib import pyplot
from numpy import where
import pandas_profiling

# Download the data from 
# https://www.kaggle.com/mlg-ulb/creditcardfraud

# read the data
df = pd.read_csv("creditcard.csv")

In [8]:
# extract features and classes from dataframe
features = df.drop("Class", axis=1).to_numpy()
labels = df["Class"].to_numpy()

# show class imbalance
counter = Counter(labels)
print(counter)

# decision tree evaluated on imbalanced dataset
from numpy import mean
from sklearn.datasets import make_classification
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.tree import DecisionTreeClassifier

# define model
model = DecisionTreeClassifier()

# evaluate model using cross-validation
cv = RepeatedStratifiedKFold(n_splits=2, n_repeats=1, random_state=1)

scores = cross_val_score(model, 
                         features, 
                         labels, 
                         scoring='roc_auc', 
                         cv=cv, 
                         n_jobs=2)

print('Mean ROC AUC: %.3f' % mean(scores))

Counter({0: 284315, 1: 492})
Mean ROC AUC: 0.866


In [3]:
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
from imblearn.pipeline import Pipeline

# define pipeline
over = SMOTE(sampling_strategy=0.05)
under = RandomUnderSampler(sampling_strategy=0.5)
steps = [('o', over), ('u', under)]
pipeline = Pipeline(steps=steps)

# transform the dataset
X, y = pipeline.fit_resample(X, y)

# summarize the new class distribution
counter = Counter(y)
print(counter)

# define model
model = DecisionTreeClassifier()

# evaluate pipeline
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
scores = cross_val_score(model, X, y, scoring='roc_auc', cv=cv, n_jobs=-1)
print('Mean ROC AUC: %.3f' % mean(scores))



Counter({0: 28430, 1: 14215})
Mean ROC AUC: 0.988
