### install and import

In [None]:
# # download dataset from kaggle
# %mkdir dataset
# !pip install kaggle
# !kaggle datasets download -d mlg-ulb/creditcardfraud -p dataset
# !unzip dataset/creditcardfraud.zip -d dataset

# # install dependencies
# !pip install --quiet numpy pandas scikit-learn imblearn xgboost lightgbm
# !pip install --quiet ydata-profiling ipywidgets
# !pip install --quiet plotly "nbformat>=4.2.0" statsmodels

In [None]:
import numpy as np
import pandas as pd

import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
# from ydata_profiling import ProfileReport

from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import RobustScaler, StandardScaler
from sklearn.model_selection import (
    StratifiedKFold,
    train_test_split,
    cross_val_score,
    GridSearchCV, RandomizedSearchCV,
)
from sklearn.metrics import (
    make_scorer, classification_report,
    accuracy_score, balanced_accuracy_score,
    precision_score, recall_score, f1_score, 
    roc_auc_score, average_precision_score,
    log_loss, brier_score_loss, matthews_corrcoef,   
)

from imblearn.pipeline import make_pipeline as make_pipeline_imb
from imblearn.under_sampling import NearMiss, RandomUnderSampler
from imblearn.over_sampling import ADASYN, SMOTE, RandomOverSampler
from imblearn.metrics import classification_report_imbalanced

from imblearn.ensemble import (
    EasyEnsembleClassifier, 
    RUSBoostClassifier, 
    BalancedBaggingClassifier, 
    BalancedRandomForestClassifier
)

from xgboost import XGBClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import (
    RandomForestClassifier, 
    GradientBoostingClassifier,
)

### load and eda

In [None]:
# load the raw credit-card transactions
df = pd.read_csv("dataset/creditcard.csv")

# normalise every header to lower case
df.columns = [name.lower() for name in df.columns]

# move the label column to the front of the frame
target = df.pop("class")
df.insert(0, "class", target)

# quick sanity output
print(f"shape: {df.shape}")
print(f"columns: {df.columns.tolist()}")
df.head()

In [None]:
# summarise the frame: per-column non-null counts and dtypes
df.info()

In [None]:
# count missing values per column
df.isna().sum()

In [None]:
# descriptive statistics for the numerical columns
df.describe()

In [None]:
# # quick eda (takes ~ 3 to 4 mins)
# keyword = "train"
# profile = ProfileReport(df, title=f"{keyword} dataset")
# profile.to_notebook_iframe()

In [None]:
# quick visualize: draw one histogram per column of df (including "class"),
# each as its own figure
for col in df.columns:
    fig = px.histogram(df, x=col)
    fig.show()

In [None]:
# relative frequency of each target class, shown to four decimals
temp = df["class"].value_counts(normalize=True).to_frame()
temp.round(4)

In [None]:
# plot the distribution of target class as a bar chart of absolute counts
# NOTE(review): x="class" relies on reset_index() producing a "class" column,
# i.e. on value_counts() naming its index after the column — confirm this holds
# for the installed pandas version.
temp = df["class"].value_counts().reset_index(name="count")
fig = px.bar(temp, x="class", y="count",  text="count")

fig.update_layout(
    title="non-fraud(class=0) vs fraud(class=1)",
    width=400, height=400,
)
# narrow the bars so the two classes do not fill the whole plot
fig.update_traces(width=0.5)
fig.show()

In [None]:
# plot for time: histogram of "time", one facet (and colour) per class,
# with a rug plot drawn in the margin
fig = px.histogram(
    df, x="time",
    marginal="rug",
    barmode="stack", facet_col="class", color="class", 
    # barmode="overlay"
)
fig.show()

In [None]:
# plot for amount: histogram of "amount", one facet (and colour) per class,
# with a rug plot drawn in the margin
fig = px.histogram(
    df, x="amount",
    marginal="rug", 
    barmode="stack", facet_col="class", color="class", 
    # barmode="overlay"
)
fig.show()

### preprocess

In [None]:
def preprocessing(data):
    """Return a copy of *data* with ``time``/``amount`` replaced by scaled columns.

    A ``RobustScaler`` is fit on the ``time`` and ``amount`` columns and its
    output is stored as ``scaled_time`` and ``scaled_amount`` (appended at the
    end of the frame); the two raw columns are removed.

    The input frame is left untouched: the previous implementation added and
    dropped columns on the caller's object in place, which silently changed
    any other reference to the same DataFrame.

    NOTE(review): the scaler is fit on every row passed in, so statistics from
    rows later used for validation influence the scaling — confirm this is
    acceptable for the evaluation being done.

    Args:
        data: DataFrame containing at least the columns ``time`` and ``amount``.

    Returns:
        A new DataFrame without ``time``/``amount`` and with ``scaled_time``
        and ``scaled_amount`` as its last two columns.
    """
    raw_cols = ["time", "amount"]
    scaled_cols = ["scaled_time", "scaled_amount"]

    # scale time and amount (fit on the untouched input)
    scaled_values = RobustScaler().fit_transform(data[raw_cols])

    # drop() returns a new frame, so the caller's object is never modified
    processed = data.drop(columns=raw_cols)
    processed[scaled_cols] = scaled_values

    return processed

In [None]:
# replace time/amount with their robust-scaled versions
df = preprocessing(data=df)

# printing stuff: confirm the new shape and column list
print(f"shape: {df.shape}")
print(f"columns: {df.columns.tolist()}")
df.head()

#### sample and viz

In [None]:
def get_X_y(data):
    """Split a preprocessed frame into features and target.

    Args:
        data: DataFrame holding ``scaled_time``, ``scaled_amount``,
            ``v1`` .. ``v28`` and the ``class`` column.

    Returns:
        Tuple ``(X, y)``: ``X`` is the 30 feature columns in the fixed order
        scaled_time, scaled_amount, v1..v28; ``y`` is the ``class`` column.
    """
    target_col = "class"
    # explicit column order: the two scaled columns first, then v1 .. v28
    feature_cols = ["scaled_time", "scaled_amount", *(f"v{i}" for i in range(1, 29))]

    return data[feature_cols], data[target_col]

In [None]:
# get original X and y from the full (imbalanced) frame
X, y = get_X_y(df)
print(f"Original:")
print(f"X.shape: {X.shape} | y.shape: {y.shape}")
print(y.value_counts().reset_index())


# sample: randomly under-sample the majority class (fixed seed for repeatability)
# NOTE(review): this balanced sample is built from the whole dataset; it is used
# for the visualisations below, while the models further down are trained on X, y.
random_us = RandomUnderSampler(random_state=1729, sampling_strategy="majority")
X_rus, y_rus = random_us.fit_resample(X, y)
print(f"\nAfter Sampling:")
print(f"X_rus.shape: {X_rus.shape} | y_rus.shape: {y_rus.shape}")
print(y_rus.value_counts().reset_index())


# create dataframe: features first, target as the last column
df_rus = pd.concat([X_rus, y_rus], axis=1)
print(f"shape: {df_rus.shape}")
print(f"columns: {df_rus.columns.tolist()}")
df_rus.head()

In [None]:
# plot the distribution of target class in the under-sampled frame
# NOTE(review): x="class" relies on reset_index() producing a "class" column,
# i.e. on value_counts() naming its index after the column — confirm this holds
# for the installed pandas version.
temp = df_rus["class"].value_counts().reset_index(name="count")
fig = px.bar(temp, x="class", y="count",  text="count")

fig.update_layout(
    title="non-fraud(class=0) vs fraud(class=1)",
    width=400, height=400,
)
# narrow the bars so the two classes do not fill the whole plot
fig.update_traces(width=0.5)
fig.show()

In [None]:
# NOTE: the data was already scaled in preprocessing - no need to repeat it here
# correlation matrix of the full (imbalanced) frame
correlation = df.corr()

# plot the matrix as an annotated heatmap (values rounded to 2 decimals)
fig = px.imshow(
    correlation.round(2),
    text_auto=True, aspect="auto", 
    color_continuous_scale="RdBu",
    width=1400, height=1400,
)
# small font so the cell annotations fit
fig.update_traces(textfont_size=8)
fig.show()

In [None]:
# NOTE: the data was already scaled in preprocessing - no need to repeat it here
# correlation matrix of the under-sampled (balanced) frame
correlation = df_rus.corr()

# plot the matrix as an annotated heatmap (values rounded to 2 decimals)
fig = px.imshow(
    correlation.round(2),
    text_auto=True, aspect="auto", 
    color_continuous_scale="RdBu",
    width=1400, height=1400,
)
# small font so the cell annotations fit
fig.update_traces(textfont_size=8)
fig.show()

In [None]:
# correlation of every column with the target, as a two-column frame
# ("column", "value")
corr_df = (
    df_rus.corr()["class"]
    .reset_index(name="value")
    .rename(columns={"index": "column"})
)
# the target's correlation with itself carries no information
corr_df = corr_df[corr_df["column"] != "class"]
corr_df

In [None]:
# keep only the features whose correlation with the target is above 0.2
# or below -0.5
class_df = corr_df[(corr_df.value > 0.2) | (corr_df.value < -0.5)]

# display them ordered from most negative to most positive
class_df.sort_values(by="value")

In [None]:
# names of the selected features, in corr_df order
features = class_df.column.to_list()

# one subplot title per selected feature (same order as `features`)
titles = [f"feature={feature}" for feature in features]

In [None]:
# Histogram of every selected feature, fraud rows only, laid out row-major on a
# grid that is four plots wide.
#
# The grid height is derived from the number of features, and each feature is
# mapped to its own cell. The previous index `features[r+c-2]` repeated
# features across rows (cell (2,1) showed the same feature as cell (1,2)), so
# panels did not match their titles and the last features were never drawn.
cols = 4
rows = max(1, -(-len(features) // cols))  # ceil division, at least one row

fig = make_subplots(
    rows=rows, cols=cols,
    subplot_titles=titles,
    # keep the original spacing, but stay within what plotly allows for tall grids
    vertical_spacing=min(0.15, 1 / rows),
)

for idx, feature in enumerate(features):
    # row-major position of the idx-th feature (plotly rows/cols are 1-based)
    r, c = divmod(idx, cols)
    fig.add_trace(
        go.Histogram(name=feature, x=df_rus.loc[df_rus["class"]==1, feature]),
        r + 1, c + 1
    )

# frame every subplot (no row/col selector -> applies to all axes)
fig.update_xaxes(mirror=True, linewidth=2, linecolor="black")
fig.update_yaxes(mirror=True, linewidth=2, linecolor="black")

fig.update_layout(title="Feature Distributions for Fraud Transactions", template="seaborn",)
fig.show()

In [None]:
# Histogram of every selected feature, non-fraud rows only, laid out row-major
# on a grid that is four plots wide.
#
# The grid height is derived from the number of features, and each feature is
# mapped to its own cell. The previous index `features[r+c-2]` repeated
# features across rows (cell (2,1) showed the same feature as cell (1,2)), so
# panels did not match their titles and the last features were never drawn.
cols = 4
rows = max(1, -(-len(features) // cols))  # ceil division, at least one row

fig = make_subplots(
    rows=rows, cols=cols,
    subplot_titles=titles,
    # keep the original spacing, but stay within what plotly allows for tall grids
    vertical_spacing=min(0.15, 1 / rows),
)

for idx, feature in enumerate(features):
    # row-major position of the idx-th feature (plotly rows/cols are 1-based)
    r, c = divmod(idx, cols)
    fig.add_trace(
        go.Histogram(name=feature, x=df_rus.loc[df_rus["class"]==0, feature]),
        r + 1, c + 1
    )

# frame every subplot (no row/col selector -> applies to all axes)
fig.update_xaxes(mirror=True, linewidth=2, linecolor="black")
fig.update_yaxes(mirror=True, linewidth=2, linecolor="black")

fig.update_layout(title="Feature Distributions for Non-Fraud Transactions", template="seaborn",)
fig.show()

In [None]:
# features positively correlated with the target (correlation > 0.2),
# plus matching subplot titles
pos_corr_cols = corr_df[corr_df.value > 0.2]["column"].tolist()
pos_titles = [f"feature={feature}" for feature in pos_corr_cols]

# features strongly negatively correlated with the target (correlation < -0.5),
# plus matching subplot titles
neg_corr_cols = corr_df[corr_df.value < -0.5]["column"].tolist()
neg_titles = [f"feature={feature}" for feature in neg_corr_cols]

print(f"columns with positive correlations: {pos_corr_cols}")
print(f"columns with negative correlations: {neg_corr_cols}")

In [None]:
# for positive correlation
rows, cols = 1,  4

features = pos_corr_cols

fig = make_subplots(
    rows=rows, cols=cols,
    subplot_titles=pos_titles,
    vertical_spacing=0.15,
)

for r in range(1, rows+1):
    for c in range(1, cols+1):
        fig.add_trace(
            go.Box(name=features[r+c-2], x=df_rus["class"], y=df_rus[features[r+c-2]]),
            r, c
        )
        fig.update_xaxes(mirror=True, linewidth=2, linecolor="black", row=r, col=c)
        fig.update_yaxes(mirror=True, linewidth=2, linecolor="black", row=r, col=c)

fig.update_layout(width=700, template="seaborn", title="Boxplots for Positive Correlations")
fig.show()

In [None]:
# for negative correlation
rows, cols = 1, 4

features = neg_corr_cols

fig = make_subplots(
    rows=rows, cols=cols, subplot_titles=neg_titles, vertical_spacing=0.15,
)

for r in range(1, rows+1):
    for c in range(1, cols+1):
        fig.add_trace(
            go.Box(name=features[r+c-2], x=df_rus["class"], y=df_rus[features[r+c-2]]),
            r, c
        )

fig.update_layout(width=700, template="seaborn", title="Boxplots for Negative Correlations")
fig.show()

#### outlier removal

In [None]:
def outlier_removal(df, feature):
    """Drop rows whose *feature* value lies outside the fraud-class IQR fences.

    The fences (Q1 - 1.5*IQR, Q3 + 1.5*IQR) are computed from the rows with
    ``class == 1`` only, but are then applied to every row of *df*, whatever
    its class. The printed ``outliers_count`` counts fraud rows only.

    Args:
        df: DataFrame with a ``class`` column and the column named *feature*.
        feature: name of the column to filter on.

    Returns:
        A filtered DataFrame (original index labels kept); *df* itself is not
        modified.
    """
    fraud_values = df.loc[df["class"] == 1, feature]

    # quartiles of the fraud rows
    q25, q75 = np.percentile(fraud_values, [25, 75])

    # fences sit 1.5 * IQR beyond the quartiles
    whisker = (q75 - q25) * 1.5
    lower_threshold = q25 - whisker
    upper_threshold = q75 + whisker

    # number of fraud rows outside the fences
    fraud_outside = (fraud_values < lower_threshold) | (fraud_values > upper_threshold)
    outliers_count = fraud_outside.sum()

    # keep rows of any class that are neither below nor above the fences
    column = df[feature]
    kept = df[~(column < lower_threshold) & ~(column > upper_threshold)]

    print(f"feature: {feature} | outliers_count: {outliers_count} | count(records_after_outlier_removal): {len(kept)}")
    return kept

In [None]:
# remove outliers sequentially: first on v10, then on v2 using what is left
# (df_rus itself stays unchanged for the before/after comparison)
df_rus_rm_out = outlier_removal(df_rus, "v10")
df_rus_rm_out = outlier_removal(df_rus_rm_out, "v2")

In [None]:
# Compare box plots of v2 and v10 before (left column) and after (right column)
# outlier removal; one feature per row.
rows, cols = 2, 2

features = ["v2", "v10"]
titles = [
    "v2 vs class <br> (before outlier removal)", 
    "v2 vs class <br> (after outlier removal)", 
    "v10 vs class <br> (before outlier removal)", 
    "v10 vs class <br> (after outlier removal)", 
]

fig = make_subplots(
    rows=rows, cols=cols, shared_yaxes=True, subplot_titles=titles
)

for r, feature in enumerate(features, start=1):
    fig.add_trace(
        go.Box(name=f"{feature}(before)", x=df_rus["class"], y=df_rus[feature]),
        r, 1
    )
    # the cleaned frame is `df_rus_rm_out`; the previous `df_rus_out` was never
    # defined and made this cell fail with a NameError
    fig.add_trace(
        go.Box(name=f"{feature}(after)", x=df_rus_rm_out["class"], y=df_rus_rm_out[feature]),
        r, 2
    )
fig.update_layout(width=700, template="seaborn", title="comparison of boxplots for v2 & v10")
# display the figure like every other plotting cell (it was built but never shown)
fig.show()

### train

In [None]:
# Candidate classifiers. Each entry holds:
#   "name"   - used by create_pipelines* as the prefix of the grid keys
#              ("<name>__<param>"); it therefore has to equal the step name the
#              pipeline assigns to the estimator (presumably the lower-cased
#              class name produced by make_pipeline — verify when adding entries)
#   "clf"    - the estimator instance
#   "params" - hyper-parameter values searched when tuning is enabled
classifiers = [
    {
        "name": "logisticregression",
        "clf": LogisticRegression(max_iter=1000),
        "params": {
            "penalty": ["l2"],
            "C": [0.001, 0.01, 0.1, 1, 10, 100, 1000],
        }
    },
    {
        "name": "xgbclassifier", # takes 15 mins
        "clf": XGBClassifier(objective="binary:logistic", random_state=1729),
        "params": {
            "n_estimators": [100, 500, 1000],
            "max_depth": [5, 10],
            "learning_rate": [0.075, 0.1],
        }
    },
    {
        "name": "randomforestclassifier",
        "clf": RandomForestClassifier(random_state=1729,),
        "params": {
            "n_estimators": [100, 500], 
            "max_depth": [5, 10],
            "max_leaf_nodes": [64, 128, 256],
        }
    },
    {
        "name": "gradientboostingclassifier",
        "clf": GradientBoostingClassifier(random_state=1729),
        "params": {
            "n_estimators": [100, 500, 1000],
            "max_depth": [2, 4, 8, 16],
            "min_samples_leaf": [64, 128, 256],
            "learning_rate": [0.15, 0.175, 0.2,],
        }
    },
]

# Candidate resamplers. Same layout as `classifiers`: "name" is the prefix used
# for the grid keys, "clf" the sampler instance and "params" the values searched
# when tuning is enabled.
#
# SMOTE and ADASYN are created without `n_jobs`: imbalanced-learn deprecated that
# argument for the over-samplers (0.10) and newer releases no longer accept it,
# so passing it makes this cell fail on a current install.
# NOTE(review): on old imbalanced-learn versions this means the neighbour search
# of SMOTE/ADASYN runs with the library's default parallelism.
samplers = [
    {
        "name": "nearmiss",
        "clf": NearMiss(n_jobs=-1, n_neighbors_ver3=3),
        "params": {
            "n_neighbors": [3, 5],
            "sampling_strategy": ["majority"],
            "version": [2, 3],
        },
    },
    {
        "name": "randomundersampler",
        "clf": RandomUnderSampler(random_state=1729, replacement=False),
        "params": {
            "sampling_strategy": ["majority"],
        },
    },
    {
        "name": "adasyn",
        "clf": ADASYN(random_state=1729),
        "params": {
            "sampling_strategy": ["auto"],
            "n_neighbors": [3, 5, 10],
        },
    },
    {
        "name": "smote",
        "clf": SMOTE(random_state=1729),
        "params": {
            "sampling_strategy": ["auto"],
            "k_neighbors": [3, 5, 10],
        },
    },
    {
        "name": "randomoversampler",
        "clf": RandomOverSampler(random_state=1729),
        "params": {
            "sampling_strategy": ["auto"],
            "shrinkage": [None],
        },
    },
]

# Metrics evaluated during hyper-parameter search (keys are the names usable as
# `refit=`). Built-in scorer names are used instead of make_scorer(...):
#   - "balanced_accuracy" and "f1" score the predicted labels, exactly like the
#     previous make_scorer(balanced_accuracy_score) / make_scorer(f1_score);
#   - "average_precision" scores the model's continuous output (probabilities or
#     decision values). make_scorer(average_precision_score) fed it hard 0/1
#     predictions, which collapses the precision-recall curve to a single point.
scoring = {
    "accuracy": "balanced_accuracy",
    "f1": "f1",
    "average_precision": "average_precision",
}

# make_pipeline_imb(LogisticRegression()).get_params()

In [None]:
def _positive_class_scores(model, X, fallback):
    """Return continuous scores for the positive class, or *fallback* if the model has none."""
    if hasattr(model, "predict_proba"):
        # second column = probability of the larger class label (1 = fraud)
        return model.predict_proba(X)[:, 1]
    if hasattr(model, "decision_function"):
        return model.decision_function(X)
    return fallback


def train_models(data, labels, pipelines):
    """Fit every pipeline on a train split and evaluate it on a held-out split.

    10% of the rows are held out for validation (fixed seed). The split is
    stratified on *labels* so the rare positive class keeps the same share in
    both parts.

    Ranking metrics (``roc_auc``, ``avg_precision``) are computed from the
    model's continuous scores. Computing them from hard 0/1 predictions, as was
    done before, reduces ROC AUC to balanced accuracy and evaluates the
    precision-recall curve at a single point. If a model exposes neither
    ``predict_proba`` nor ``decision_function`` the hard predictions are used
    as a fallback.

    Args:
        data: feature matrix.
        labels: binary target aligned with *data* (0 = non-fraud, 1 = fraud).
        pipelines: list of dicts with the keys ``"name"`` and ``"pipeline"``
            (an unfitted estimator, pipeline or search object).

    Returns:
        List with one dict per pipeline: ``"name"``, ``"model"`` (the fitted
        object) and ``"eval"`` (text reports plus scalar metrics).
    """
    target_names = ["non-fraud", "fraud"]
    results = []

    # split data
    X_tr, X_val, y_tr, y_val = train_test_split(
        data, labels, test_size=0.1, random_state=1729, stratify=labels,
    )

    for item in pipelines:
        name, pipeline = item["name"], item["pipeline"]

        # fit model
        model = pipeline.fit(X_tr, y_tr)

        # hard predictions for threshold metrics, scores for ranking metrics
        y_hat = model.predict(X_val)
        y_score = _positive_class_scores(model, X_val, fallback=y_hat)

        results.append({
            "name": name,
            "model": model,
            "eval": {
                "report": classification_report(y_val, y_hat, target_names=target_names),
                "report_imb": classification_report_imbalanced(y_val, y_hat, target_names=target_names),
                "accuracy": accuracy_score(y_val, y_hat),
                "balanced_accuracy": balanced_accuracy_score(y_val, y_hat),
                "precision": precision_score(y_val, y_hat),
                "recall": recall_score(y_val, y_hat),
                "f1": f1_score(y_val, y_hat),
                "roc_auc": roc_auc_score(y_val, y_score),
                "avg_precision": average_precision_score(y_val, y_score),
            },
        })

    return results


def create_pipelines(classifiers, param_tune=False, cv_tuner=None, cv=None, scoring=None, refit=None):
    """Build one single-step pipeline (optionally wrapped in a search) per classifier.

    Every estimator is cloned before it is put into a pipeline. Pipelines fit
    their steps in place, so reusing the instances from *classifiers* meant
    that any later fit of the same instance (e.g. in a sampler pipeline)
    silently overwrote the fitted model stored in earlier results.

    Args:
        classifiers: list of dicts with ``"name"``, ``"clf"`` and ``"params"``.
            ``"name"`` prefixes the grid keys and must match the pipeline's
            step name for the estimator.
        param_tune: wrap each pipeline in ``cv_tuner`` when True.
        cv_tuner: search class called as
            ``cv_tuner(pipeline, grid, cv=..., n_jobs=-1, verbose=2, scoring=..., refit=...)``.
        cv, scoring, refit: forwarded to ``cv_tuner``.

    Returns:
        List of dicts with ``"name"``, ``"pipeline"`` and ``"grid"``.

    Raises:
        ValueError: if ``param_tune`` is True but no ``cv_tuner`` is given.
    """
    from sklearn.base import clone  # local import keeps this cell self-contained

    if param_tune and cv_tuner is None:
        raise ValueError("cv_tuner must be provided when param_tune=True")

    pipelines = []
    for item in classifiers:
        # fresh, unfitted copy so pipelines never share a fitted estimator
        pipeline = make_pipeline(clone(item["clf"]))

        # prefix the parameter names with the pipeline step name
        grid = {f"{item['name']}__{k}": v for k, v in item["params"].items()}

        if param_tune:
            pipeline = cv_tuner(pipeline, grid, cv=cv, n_jobs=-1, verbose=2, scoring=scoring, refit=refit)

        pipelines.append({
            "name": item["name"],
            "pipeline": pipeline,
            "grid": grid,
        })
    return pipelines

def create_pipelines_with_sampler(samplers, classifiers, param_tune=False, cv_tuner=None, cv=None, scoring=None, refit=None):
    """Build a sampler + classifier pipeline for every (classifier, sampler) pair.

    Sampler and classifier are cloned for each pipeline. The same classifier
    instance used to be placed into one pipeline per sampler; because pipelines
    fit their steps in place, each fit overwrote the model kept for the
    previous sampler, so stored results all pointed at the last fit.

    Args:
        samplers: list of dicts with ``"name"``, ``"clf"`` (the sampler) and
            ``"params"``.
        classifiers: list of dicts with ``"name"``, ``"clf"`` and ``"params"``.
            The names prefix the grid keys and must match the pipeline step
            names.
        param_tune: wrap each pipeline in ``cv_tuner`` when True.
        cv_tuner: search class called as
            ``cv_tuner(pipeline, grid, cv=..., n_jobs=-1, verbose=2, scoring=..., refit=...)``.
        cv, scoring, refit: forwarded to ``cv_tuner``.

    Returns:
        List of dicts with ``"name"`` (``"<classifier>__<sampler>"``),
        ``"pipeline"`` and ``"grid"``, ordered classifier-major.

    Raises:
        ValueError: if ``param_tune`` is True but no ``cv_tuner`` is given.
    """
    from sklearn.base import clone  # local import keeps this cell self-contained

    if param_tune and cv_tuner is None:
        raise ValueError("cv_tuner must be provided when param_tune=True")

    pipelines = []
    for item_c in classifiers:
        for item_s in samplers:
            # fresh, unfitted copies so no two pipelines share an estimator
            pipeline = make_pipeline_imb(clone(item_s["clf"]), clone(item_c["clf"]))

            # sampler and classifier parameters, each prefixed with its step name
            grid = {
                **{f"{item_s['name']}__{k}": v for k, v in item_s["params"].items()},
                **{f"{item_c['name']}__{k}": v for k, v in item_c["params"].items()},
            }

            if param_tune:
                pipeline = cv_tuner(pipeline, grid, cv=cv, n_jobs=-1, verbose=2, scoring=scoring, refit=refit)

            pipelines.append({
                "name": f"{item_c['name']}__{item_s['name']}",
                "pipeline": pipeline,
                "grid": grid,
            })
    return pipelines


def print_results(results, highlight=False):
    """Tabulate the scalar metrics of a list of training results.

    Args:
        results: list of dicts with ``"name"`` and ``"eval"``; ``"eval"`` must
            contain ``"report"`` and ``"report_imb"`` (both are left out of the
            table) next to the metric values.
        highlight: when True, return a pandas Styler that highlights the
            maximum of every column after the first.

    Returns:
        DataFrame with one row per result (``name`` first, then the metrics),
        or a Styler over that frame when *highlight* is True.
    """
    records = [{"name": res["name"], **res["eval"]} for res in results]

    # the text reports do not belong in a numeric table
    results_df = pd.DataFrame(records).drop(columns=["report", "report_imb"])

    if not highlight:
        return results_df

    metric_cols = results_df.columns[1:]
    return results_df.style.highlight_max(subset=metric_cols)


def print_reports(results, idx):
    """Print both stored text reports of the result at position *idx*.

    Args:
        results: list of result dicts whose ``"eval"`` entry holds the keys
            ``"report"`` and ``"report_imb"``.
        idx: position of the result to print.
    """
    sections = (
        ("--- Classification Report ---", "report"),
        ("--- Imbalanced Classification Report ---", "report_imb"),
    )
    for heading, key in sections:
        print(heading)
        print(results[idx]["eval"][key])

In [None]:
# train models - no sampling - without hyper-parameter tuning
# (each classifier is used with its constructor settings; "params" is ignored)
pipelines = create_pipelines(
    classifiers, param_tune=False, cv_tuner=None
)

# fit on the full imbalanced data; train_models holds out 10% for validation
results_ns_nht = train_models(
    data=X, labels=y, pipelines=pipelines,
)

In [None]:
# visualize: one row of validation metrics per classifier
results_ns_nht_df = print_results(results_ns_nht, highlight=False)
results_ns_nht_df

In [None]:
# text reports for the first pipeline (index 0 = first entry of `classifiers`)
print_reports(results_ns_nht, idx=0)

In [None]:
# # train models - no sampling - with hyper-parameter tuning

# param_tune=True
# cv_tuner=RandomizedSearchCV
# # cv_tuner=GridSearchCV
# cv=3
# refit="f1"

# pipelines = create_pipelines(
#     classifiers, 
#     param_tune=param_tune, 
#     cv_tuner=cv_tuner, 
#     cv=cv, 
#     scoring=scoring, 
#     refit=refit
# )

# [item['name'] for item in pipelines]

# results_ns_ht = train_models(
#     data=X, labels=y, pipelines=pipelines,
# )

In [None]:
# # visualize
# results_ns_ht_df = print_results(results_ns_ht, highlight=False)
# results_ns_ht_df

In [None]:
# train models - with sampling - without hyper-parameter tuning
# (one pipeline per classifier/sampler combination, default settings only)
pipelines = create_pipelines_with_sampler(
    samplers, classifiers, param_tune=False, cv_tuner=None
)

print(f"pipelines: {[item['name'] for item in pipelines]}")

# the sampler is a pipeline step, so it runs when the pipeline is fit on the
# training split inside train_models
results_s_nht = train_models(
    data=X, labels=y, pipelines=pipelines,
)

In [None]:
# visualize: one row of validation metrics per classifier/sampler combination
results_s_nht_df = print_results(results_s_nht, highlight=False)
results_s_nht_df

In [None]:
# # train models - with sampling - with hyper-parameter tuning
# param_tune=True
# cv_tuner=RandomizedSearchCV
# # cv=3
# cv=StratifiedKFold(n_splits=5, random_state=None, shuffle=False)
# refit="f1"

# pipelines = create_pipelines_with_sampler(
#     samplers, 
#     classifiers, 
#     param_tune=param_tune, 
#     cv_tuner=cv_tuner, 
#     cv=cv, 
#     scoring=scoring, 
#     refit=refit
# )

# print(f"pipelines: {[item['name'] for item in pipelines]}")

# results_s_ht = train_models(
#     data=X, labels=y, pipelines=pipelines,
# )

In [None]:
# # visualize
# results_s_ht_df = print_results(results_s_ht, highlight=False)
# results_s_ht_df

In [None]:
# print_reports(results_s_ht, idx=0)