# Fraud Detection on Card Transaction Data

Machine Learning Experimental Setup

### Loading Data

In [None]:
import pandas as pd

In [None]:
# Location of the CSV file holding the card transactions
filename = "creditcard.csv"
df = pd.read_csv(filepath_or_buffer=filename)

In [None]:
# Peek at the first five rows
df.head(n=5)

In [None]:
# Number of samples per target value (class balance check)
df["Class"].value_counts()

In [None]:
# Features: every column except the target; labels: the "Class" column
X = df[[column for column in df.columns if column != "Class"]]
y = df["Class"]

In [None]:
# Sanity check: dimensions of the feature matrix and of the label vector
(X.shape, y.shape)

## Experimental Pipeline

In [None]:
# Reproducibility settings
import numpy as np
from sklearn.utils import check_random_state

SEED = 12345

# A single legacy `np.random.RandomState` (NOT a NumPy Generator) is shared
# throughout the whole experiment: it is handed as `random_state` to the
# data split, the models, the sampler and the CV splitter.
# Generator-based alternative, intentionally left disabled:
# rng = np.random.default_rng(SEED)
np.random.seed(SEED)  # also seed NumPy's global legacy RNG
rng = check_random_state(SEED)  # seeded RandomState built from SEED

In [None]:
import os
os.environ["PYTHONWARNINGS"] = "ignore"
import warnings
warnings.filterwarnings('ignore')
from pathlib import Path
from datetime import datetime

# Preprocessing
from sklearn.preprocessing import RobustScaler
from sklearn.compose import ColumnTransformer

# Imbalanced Learning
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import NearMiss

# Model Selection and Metrics
from sklearn.model_selection import RepeatedStratifiedKFold, train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import f1_score

# ML Models
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

# Pipeline 
# (imblearn Pipeline because we need to embed a sampler)
from imblearn.pipeline import Pipeline

# Model Persistence
from joblib import dump

**Data Splitting**

In [None]:
# Stratified hold-out split: 20% of the samples go to the test partition
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=rng,
)

**PreProcessing**

In [None]:
# (Selected) Feature Scaling: only "Time" and "Amount" go through the
# RobustScaler, every other column is passed through unchanged.
preprocessing = ColumnTransformer(
    transformers=[("scaler", RobustScaler(), ["Time", "Amount"])],
    remainder="passthrough",
)

### Machine Learning Models

Set up the Machine Learning models and their corresponding parameter grids (for hyperparameter tuning).

In [None]:
# Decision Tree
# Hyper-parameter grid shared by the tree-based models; the "model__"
# prefix targets the pipeline step named "model".
tree_models_params = dict(
    model__max_depth=[None, 2, 3, 6],
    model__min_samples_leaf=[2, 5, 6],
    model__criterion=["gini", "entropy"],
)
dt = DecisionTreeClassifier(random_state=rng)

# The Decision Tree is tuned on the shared grid as-is (same dict object).
dt_params = tree_models_params

In [None]:
# Random Forest
rf = RandomForestClassifier(random_state=rng, n_jobs=-1)
# Grid restricted to the Random-Forest-specific hyper-parameters
rf_params = {
    "model__n_estimators": [50],
    "model__max_features": ["log2", "sqrt"],
}
# Full grid: the shared tree hyper-parameters PLUS the RF-specific ones.
# (The previous `tree_models_params | tree_models_params` was a no-op union
# that silently left the RF-specific entries out of the search.)
# NOTE: the combined grid has more candidates, so tuning takes longer.
rf_params_full = tree_models_params | rf_params

(CV) Training and Evaluation Utility Functions

In [None]:
# Inspired from https://stackoverflow.com/questions/54868698/what-type-is-a-sklearn-model

from typing import Protocol, Any
from numpy.typing import ArrayLike

class Estimator(Protocol):
    """Structural type for the estimator-like objects used in this notebook.

    Any object exposing these four methods matches the protocol; the method
    bodies are intentionally empty.
    """

    def fit(self, X, y, sample_weight=None):
        """Fit the estimator on ``X`` and ``y``."""

    def predict(self, X):
        """Return predictions for ``X``."""

    def score(self, X, y, sample_weight=None):
        """Return a score for the predictions on ``X`` against ``y``."""

    def set_params(self, **params):
        """Set the given parameters on the estimator."""


# (display name, estimator, hyper-parameter grid)
ModelInfo = tuple[str, Estimator, dict[str, list[Any]]]
# Ordered (step name, step object) pairs used to build a pipeline
PipelineSteps = list[tuple[str, Estimator]]
# Generic mapping of parameter names to values
Params = dict[str, Any]
# A pair of array-likes (used for the train and test partitions)
Partition = tuple[ArrayLike, ArrayLike]

In [None]:
def train(model_info: ModelInfo, preprocessing_steps: PipelineSteps,
          X_train: ArrayLike, y_train: ArrayLike, *,
          preprocessing_params: Params = None,
          rng: np.random.RandomState = None,
          cv_n_reps: int = 10, cv_n_splits: int = 5,
          verbose: bool = True) -> GridSearchCV:
    """Tune and fit a model with a grid search over a Repeated Stratified K-Fold CV.

    The model is appended (as step ``"model"``) to ``preprocessing_steps``
    to form a single pipeline, which is then tuned with ``GridSearchCV``
    using the F1 score.

    Args:
        model_info: ``(name, estimator, param_grid)`` triple; grid keys must
            address the ``"model"`` step (i.e. ``model__<param>``).
        preprocessing_steps: ``(name, step)`` pairs placed before the model.
        X_train: training features.
        y_train: training labels.
        preprocessing_params: optional grid for the preprocessing steps;
            merged with the model grid (model entries win on key clashes).
        rng: random state handed to the CV splitter.
        cv_n_reps: number of CV repetitions (default 10).
        cv_n_splits: number of folds per repetition (default 5).
        verbose: also print the grid, the pipeline and the best results.

    Returns:
        The fitted ``GridSearchCV`` object (not just the best estimator).
    """
    model_name, model, model_params = model_info
    pipeline = Pipeline(preprocessing_steps + [("model", model)])

    # A single flat grid is searched: preprocessing entries (if any) + model entries.
    if preprocessing_params is None:
        pipeline_params = model_params
    else:
        pipeline_params = preprocessing_params | model_params

    print(f"Training {model_name}")
    if verbose:
        print(f"Params: {pipeline_params}")
        print(f"Pipeline: {pipeline}")

    cv = RepeatedStratifiedKFold(n_repeats=cv_n_reps, n_splits=cv_n_splits,
                                 random_state=rng)
    gs = GridSearchCV(estimator=pipeline, param_grid=pipeline_params, n_jobs=-1,
                      scoring="f1", cv=cv)
    gs.fit(X_train, y_train)
    if verbose:
        print("Best Params: ", gs.best_params_)
        print("Best CV Score (F1)", gs.best_score_)

    return gs


def evaluate(model_name: str, model: Estimator, X_test: ArrayLike , y_test: ArrayLike) -> float:
    """Announce the evaluation, then return the F1 score of ``model`` on the test data."""
    print(f"Evaluate {model_name}")
    return f1_score(y_test, model.predict(X_test))


In [None]:
# Trained models are persisted in a "models" folder under the current
# working directory; the folder is created if it does not exist yet.
MODEL_FOLDER = Path.cwd() / "models"
MODEL_FOLDER.mkdir(parents=True, exist_ok=True)

def run_experiment(name: str, model_configs: list[ModelInfo],
                   data: Partition, labels: Partition,
                   preprocessing_steps: PipelineSteps,
                   *,
                   cv_n_reps: int = 10, cv_n_splits: int = 5,
                   preproc_hyper_params: Params = None,
                   rng: np.random.RandomState = None,
                   ):
    """
    Run the whole experiment for every configured model.

    For each ``(name, estimator, grid)`` entry: tune/fit it via ``train``,
    print the elapsed time, print the test score returned by ``evaluate``
    for the best estimator, and dump the fitted grid-search object into
    ``MODEL_FOLDER``.
    """
    (X_train, X_test), (y_train, y_test) = data, labels
    exp_label = name.lower().strip().replace(" ", "_")

    for model_info in model_configs:
        started_at = datetime.now()
        gs_model = train(
            model_info=model_info,
            preprocessing_steps=preprocessing_steps,
            X_train=X_train,
            y_train=y_train,
            preprocessing_params=preproc_hyper_params,
            rng=rng,
            cv_n_reps=cv_n_reps,
            cv_n_splits=cv_n_splits,
            verbose=True,
        )
        elapsed = datetime.now() - started_at
        print(f"Elapsed Time to run {cv_n_reps}x{cv_n_splits}CV: {elapsed}")

        # Score the refitted best pipeline on the held-out partition
        model_name = model_info[0]
        test_score = evaluate(model_name=model_name,
                              model=gs_model.best_estimator_,
                              X_test=X_test, y_test=y_test)
        print(test_score)

        # Persist the whole grid-search object, one file per model/experiment
        model_label = model_name.lower().replace(" ", "_")
        dump(gs_model, MODEL_FOLDER / f"gs_{model_label}_{exp_label}.joblib")
        print("")  # Empty line, mostly for clean report

#### 1. Near-Miss (Under) Sampling Strategy

In [None]:
# Models (with their grids) tuned in the Near-Miss experiment;
# the Random Forest uses its "full" grid here.
nm_run_config = [
    ("Decision Tree", dt, dt_params),
    ("Random Forest", rf, rf_params_full),
]

In [None]:
# Under Sampling Strategy: NearMiss (version 3) on the majority class
nm = NearMiss(version=3, sampling_strategy="majority")
# NearMiss Param Grid ("sampling__" targets the pipeline step named "sampling")
nm_params = dict(sampling__n_neighbors_ver3=[4, 5])

# Steps placed before the model: scaling first, then the sampler
steps_under_sampling = [
    ("preprocess", preprocessing),
    ("sampling", nm),
]

In [None]:
# Near-Miss experiment: the sampler grid is tuned together with each model grid
run_experiment(
    name="Under Sampling Near Miss",
    model_configs=nm_run_config,
    data=(X_train, X_test),
    labels=(y_train, y_test),
    preprocessing_steps=steps_under_sampling,
    preproc_hyper_params=nm_params,
    rng=rng,
)

---

#### 2. SMOTE (Over) Sampling Strategy

In [None]:
# Models (with their grids) tuned in the SMOTE experiment
smote_run_config = [
    ("Decision Tree", dt, dt_params),
    # only RF specific params tuned
    ("Random Forest", rf, rf_params),
]

In [None]:
# Over Sampling Strategy: SMOTE on the minority class
smote = SMOTE(random_state=rng, sampling_strategy="minority")

# Steps placed before the model: scaling first, then the sampler
steps_over_sampling = [
    ("preprocess", preprocessing),
    ("sampling", smote),
]

⚠️ **Note**

Sampling data with SMOTE may result in a longer training time
(~20 minutes for each selected model) within the default 10x5 CV
training scheme.

To reduce the training time, consider lowering the number of CV
repetitions by passing a smaller value for the `cv_n_reps` parameter
of the `run_experiment` function.

In [None]:
# SMOTE experiment: no sampler grid, only the model grids are tuned
run_experiment(
    name="Over Sampling SMOTE",
    model_configs=smote_run_config,
    data=(X_train, X_test),
    labels=(y_train, y_test),
    preprocessing_steps=steps_over_sampling,
    # cv_n_reps=3,  # see note
    rng=rng,
)

---