In [1]:
# Enable IPython's autoreload extension in mode 2 so that edits to imported
# modules (such as the local `utils` package used below) are picked up live.
%load_ext autoreload
%autoreload 2

## Requirements
* The dataset can be downloaded from [this Kaggle competition](https://www.kaggle.com/c/ieee-fraud-detection).
* In addition to the [Anaconda](https://www.anaconda.com) libraries, you need to install `altair`, `category_encoders`, `pyod` and `scikit-learn` version 0.24 or higher.
* You also need to set up an AWS account and install `awscli` and `sagemaker-python-sdk`.

In [2]:
import gc
import os
import warnings
import sagemaker
import boto3
import numpy as np
import pandas as pd
import altair as alt
from scipy.interpolate import interp1d
from sklearn.compose import make_column_transformer
from sklearn.impute import SimpleImputer
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import OrdinalEncoder, StandardScaler
from sklearn.manifold import TSNE
from sklearn.metrics import (
    average_precision_score,
    precision_recall_curve,
    roc_auc_score,
    roc_curve,
)
from sklearn.model_selection import train_test_split
from category_encoders import HelmertEncoder
from pyod.models.copod import COPOD
from pyod.models.iforest import IForest
from sagemaker import RandomCutForest
from utils.common import (
    check_bucket_permission,
    dump_pickle,
    get_cpu_count,
    load_pickle,
    reduce_mem_usage,
    str_to_int,
)
from utils.eda_utils import plot_histogram

# Seed NumPy's global RNG so the random feature sampling further down is repeatable.
np.random.seed(seed=42)
# Install a catch-all "ignore" filter: no warnings are shown for the rest of the session.
warnings.simplefilter("ignore")

#### Data Loading from Local Directory
The Kaggle dataset was saved in the local directory `~/data/ieee-fraud-detection` in advance.

In [3]:
# Folder the pre-downloaded Kaggle CSV files are read from (relative to this notebook).
DATA_DIR = "../../data/ieee-fraud-detection"
# Folder that receives the pickled scores and the submission file.
MODEL_DIR = "models"

# Make sure the output folder is there; an already existing folder is fine.
os.makedirs(name=MODEL_DIR, exist_ok=True)

In [4]:
# Read the four raw Kaggle tables from DATA_DIR.
train_transaction = pd.read_csv(os.path.join(DATA_DIR, "train_transaction.csv"))
train_identity = pd.read_csv(os.path.join(DATA_DIR, "train_identity.csv"))
test_transaction = pd.read_csv(os.path.join(DATA_DIR, "test_transaction.csv"))
test_identity = pd.read_csv(os.path.join(DATA_DIR, "test_identity.csv"))

# Left-join identity onto transactions so every transaction row is kept,
# whether or not it has a matching identity record.
df_train = train_transaction.merge(train_identity, how="left", on="TransactionID")
df_test = test_transaction.merge(test_identity, how="left", on="TransactionID")

# Swap hyphens for underscores in the test column names
# (presumably so they line up with the train column names -- confirm against the CSVs).
df_test.columns = [name.replace("-", "_") for name in df_test.columns]

In [5]:
# The merged frames are all that is needed from here on: drop the raw tables
# and run a garbage-collection pass to hand the memory back.
del train_identity
del train_transaction
del test_identity
del test_transaction
_ = gc.collect()

In [6]:
# Report the number of rows and columns of each merged frame.
print(f"Train dataset has {df_train.shape[0]} rows and {df_train.shape[1]} columns.")
print(f"Test dataset has {df_test.shape[0]} rows and {df_test.shape[1]} columns.")

Train dataset has 590540 rows and 434 columns.
Test dataset has 506691 rows and 433 columns.


In [7]:
# Mean of the `isFraud` target column, formatted as a percentage.
print(f"The fraud rate is {df_train['isFraud'].mean():.2%}.")

The fraud rate is 3.50%.


In [8]:
# Pass both frames through the project helper `reduce_mem_usage` (utils.common)
# and rebind the names to its return values. The recorded output below shows
# roughly a 66% drop in memory for each frame.
df_train = reduce_mem_usage(df_train)
df_test = reduce_mem_usage(df_test)

Memory usage of dataframe is 1959.88 MB.
Memory usage after optimization is 650.48 MB.
Decreased by 66.81%.
Memory usage of dataframe is 1677.73 MB.
Memory usage after optimization is 565.37 MB.
Decreased by 66.30%.


# Exploratory Data Analysis

In [9]:
n_samples = 100  # how many features are drawn at random for the bar chart

# Share of missing values per feature (identifier and target columns excluded).
feature_columns = df_train.columns.difference(["TransactionID", "isFraud"])
prop_of_missing_values = (
    df_train[feature_columns]
    .isnull()
    .sum()
    .div(df_train.shape[0])
    .reset_index()
)
prop_of_missing_values.columns = ["feature", "prop_of_missing_values"]
source = prop_of_missing_values.sample(n=n_samples, random_state=42)

# Hovering on (or near) a bar selects it by feature name.
highlight = alt.selection(
    type="single", on="mouseover", fields=["feature"], nearest=True
)

# Encodings are named individually to keep the chart definition readable.
x_encoding = alt.X("feature:N", axis=alt.Axis(title="Feature"), sort="-y")
y_encoding = alt.Y(
    "prop_of_missing_values:Q", axis=alt.Axis(title="Percentage", format=".0%")
)
color_encoding = alt.Color("prop_of_missing_values:Q", legend=None)
opacity_encoding = alt.condition(~highlight, alt.value(1.0), alt.value(0.5))
tooltip_encoding = [
    "feature:N",
    alt.Tooltip("prop_of_missing_values:Q", format=".2%"),
]

bars = (
    alt.Chart(source)
    .mark_bar()
    .encode(
        x=x_encoding,
        y=y_encoding,
        color=color_encoding,
        opacity=opacity_encoding,
        tooltip=tooltip_encoding,
    )
    .add_selection(highlight)
)
bars.properties(
    title="Proportions of Missing Values", width=1200, height=200
).configure_axisX(labelAngle=-45, labelFontSize=8)

In [10]:
# Categorical features: a fixed list of named columns plus every column whose
# name starts with "card", "M" or "id".
cat_features = pd.Index(
    [
        "ProductCD",
        "addr1",
        "addr2",
        "P_emaildomain",
        "R_emaildomain",
        "DeviceType",
        "DeviceInfo",
    ]
    + df_train.columns[df_train.columns.str.startswith("card")].tolist()
    + df_train.columns[df_train.columns.str.startswith("M")].tolist()
    + df_train.columns[df_train.columns.str.startswith("id")].tolist()
)

# Numeric features: everything that is neither an identifier/time/target column
# nor categorical.
# `Index.union` is used instead of the `|` operator: `|` as a set union is
# deprecated in pandas 1.x and no longer performs a set union in pandas 2.x.
non_numeric_columns = pd.Index(["TransactionID", "TransactionDT", "isFraud"]).union(
    cat_features
)
num_features = df_train.columns.difference(non_numeric_columns)

# Full model input: categorical and numeric features together.
all_features = cat_features.union(num_features)

In [11]:
# Sanity check: report the size of the categorical and numeric feature groups.
print(
    f"There are {len(cat_features)} categorical features and {len(num_features)} numeric features."
)

There are 60 categorical features and 371 numeric features.


In [12]:
# For each frame, find the categorical features stored with a numeric dtype and
# run every cell of those columns through the project helper `str_to_int`.
for frame in (df_train, df_test):
    int_cat_features = frame[cat_features].select_dtypes("number").columns
    frame[int_cat_features] = frame[int_cat_features].applymap(str_to_int)

In [13]:
# Number of distinct values per categorical feature, as a two-column frame.
source = (
    df_train[cat_features]
    .nunique()
    .rename_axis("feature")
    .reset_index(name="cardinality")
)

# Bars are sorted by descending cardinality.
bars = (
    alt.Chart(source)
    .mark_bar()
    .encode(
        x=alt.X("feature:N", axis=alt.Axis(title="Feature"), sort="-y"),
        y=alt.Y("cardinality:Q", axis=alt.Axis(title="Count")),
        tooltip=["feature:N", "cardinality:Q"],
    )
)
bars.properties(
    title="Cardinalities of Categorical Features", width=1000, height=200
).configure_axisX(labelAngle=-45)

In [14]:
max_null_ratio = 0.5   # keep features with less than this share of missing values
min_cardinality = 100  # keep features with at least this many distinct values
n_features = 20        # size of the random sample taken from the survivors

null_ratios = df_train[num_features].isnull().sum().div(df_train.shape[0])
value_counts = df_train[num_features].nunique()

is_dense_enough = null_ratios < max_null_ratio
is_varied_enough = value_counts >= min_cardinality
selected_features = num_features[is_dense_enough & is_varied_enough]

# Shuffle with NumPy's global RNG and keep the first `n_features` names.
selected_features = np.random.permutation(selected_features)[:n_features]

In [15]:
# One histogram per selected feature, built from its non-missing values.
charts = [
    plot_histogram(
        df_train[feature].dropna(),
        feature,
        bins=20,
        bar_size=12.5,
        height=100,
        n_digits=0,
    )
    for feature in selected_features
]

# Lay the histograms out three to a row; the last row may be shorter.
rows = [
    alt.HConcatChart(hconcat=charts[start : start + 3])
    for start in range(0, len(charts), 3)
]
alt.VConcatChart(vconcat=rows).configure_axisY(
    labelAlign="left", labelLimit=30, labelPadding=30
).configure_axisX(labelAngle=-45)

In [16]:
# Pairwise correlations of the sampled features, reshaped to long form:
# one row per (feature_x, feature_y) pair.
corr_matrix = df_train[selected_features].corr()
source = corr_matrix.stack().reset_index()
source.columns = ["feature_x", "feature_y", "correlation"]

feature_axis = dict(ticks=False, title="Feature")
base = alt.Chart(source).encode(
    x=alt.X("feature_x:N", axis=alt.Axis(**feature_axis)),
    y=alt.Y("feature_y:N", axis=alt.Axis(**feature_axis)),
)

# Coloured cells carry the correlation value.
heatmap = base.mark_rect().encode(
    color=alt.Color(
        "correlation:Q", legend=alt.Legend(title="Correlation", titleFontSize=9)
    )
)

# Text overlay: white where the correlation exceeds 0.5, black elsewhere.
label_color = alt.condition(
    alt.datum.correlation > 0.5, alt.value("white"), alt.value("black")
)
text = base.mark_text(size=10).encode(
    text=alt.Text("correlation", format=".0%"),
    color=label_color,
)

(heatmap + text).properties(
    title="Correlation Matrix", width=500, height=500
).configure_axisX(labelAngle=-45)

# Data Splitting and Preprocessing

In [17]:
test_size = 0.2  # share of the labelled data held out for validation

# Cast every categorical column to string in both frames.
for frame in (df_train, df_test):
    frame[cat_features] = frame[cat_features].astype(str)

# Stratify on the target so both splits keep the same fraud rate.
df_X_train, df_X_valid, df_y_train, df_y_valid = train_test_split(
    df_train[all_features],
    df_train["isFraud"],
    stratify=df_train["isFraud"],
    test_size=test_size,
    random_state=42,
)

In [18]:
# Categorical branch: ordinal codes (unseen categories become NaN), NaN filled
# with -1, Helmert encoding with invariant columns dropped, then standardization.
cat_steps = [
    OrdinalEncoder(handle_unknown="use_encoded_value", unknown_value=np.nan),
    SimpleImputer(strategy="constant", fill_value=-1),
    HelmertEncoder(drop_invariant=True),
    StandardScaler(),
]
cat_pipeline = make_pipeline(*cat_steps)

# Numeric branch: standardize first, then fill the remaining gaps with the median.
num_steps = [StandardScaler(), SimpleImputer(strategy="median")]
num_pipeline = make_pipeline(*num_steps)

transformer = make_column_transformer(
    (cat_pipeline, cat_features),
    (num_pipeline, num_features),
)

# Fit on the training split only; the validation split is transformed with the
# statistics learned from training.
X_train = transformer.fit_transform(df_X_train)
X_valid = transformer.transform(df_X_valid)



### Data Visualization with t-SNE

In [19]:
%%time
n_samples = 5000              # rows of the validation matrix that get embedded
n_jobs = get_cpu_count(0.75)  # worker count from the project helper

tsne_params = dict(
    n_components=2,
    perplexity=50.0,
    early_exaggeration=12.0,
    learning_rate=200.0,
    random_state=42,
    n_jobs=n_jobs,
)
tsne = TSNE(**tsne_params)

# Embed the first `n_samples` validation rows into two dimensions.
manifold = tsne.fit_transform(X_valid[:n_samples])

CPU times: user 1min 8s, sys: 1.2 s, total: 1min 9s
Wall time: 13.4 s


In [20]:
# Put the 2-D embedding next to the labels of the same validation rows.
labels = df_y_valid.iloc[:n_samples].values
source = pd.DataFrame(
    np.column_stack([manifold, labels]),
    columns=["feature_x", "feature_y", "is_fraud"],
)
# Turn the numeric label into the strings "0" / "1".
source["is_fraud"] = source["is_fraud"].map(lambda flag: f"{int(flag)}")

# A single interval brush is shared by the scatter plot and both tick strips.
brush = alt.selection(type="interval")
base = alt.Chart(source).add_selection(brush)

fraud_color = alt.Color(
    "is_fraud:N", legend=alt.Legend(title="Fraudulent", titleFontSize=9)
)
points = base.mark_point(size=5).encode(
    x=alt.X("feature_x", title=None),
    y=alt.Y("feature_y", title=None),
    color=alt.condition(brush, fraud_color, alt.value("grey")),
)

# Marginal tick strips, split by class, with the axis decorations hidden.
tick_axis = alt.Axis(labels=False, domain=False, ticks=False)
x_ticks = base.mark_tick().encode(
    x=alt.X("feature_x", title="Feature", axis=tick_axis),
    y=alt.Y("is_fraud", title=None, axis=tick_axis),
    color=alt.condition(brush, "is_fraud", alt.value("lightgrey")),
)
y_ticks = base.mark_tick().encode(
    x=alt.X("is_fraud", title=None, axis=tick_axis),
    y=alt.Y("feature_y", title="Feature", axis=tick_axis),
    color=alt.condition(brush, "is_fraud", alt.value("lightgrey")),
)

# Layout: y strip on the left, scatter plot stacked above the x strip on the right.
right_panel = points & x_ticks
(y_ticks | right_panel).properties(
    title="Scatter Plot of Manifold with t-SNE"
).configure_title(anchor="middle")

# Model Training and Prediction
## Fitting and Prediction with PyOD Isolation Forest and COPOD 

In [21]:
%%time
n_estimators = 100  # number of trees in the isolation forest

if_estimator = IForest(
    random_state=42,
    n_jobs=n_jobs,
    behaviour="new",
    n_estimators=n_estimators,
)
_ = if_estimator.fit(X_train)

# Score the validation set and keep a pickled copy of the result.
if_scores = if_estimator.predict_proba(X_valid)
if_scores_path = os.path.join(MODEL_DIR, "if_scores.pkl")
dump_pickle(if_scores_path, if_scores)

CPU times: user 9min 25s, sys: 2min 47s, total: 12min 13s
Wall time: 6min 16s


In [22]:
%%time
contamination = 0.1  # outlier share passed to COPOD

cop_estimator = COPOD(n_jobs=n_jobs, contamination=contamination)
_ = cop_estimator.fit(X_train)

# Score the validation set and keep a pickled copy of the result.
cop_scores = cop_estimator.predict_proba(X_valid)
cop_scores_path = os.path.join(MODEL_DIR, "cop_scores.pkl")
dump_pickle(cop_scores_path, cop_scores)

[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done   2 out of   4 | elapsed:   34.3s remaining:   34.3s
[Parallel(n_jobs=4)]: Done   4 out of   4 | elapsed:   39.6s finished
[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done   2 out of   4 | elapsed:   49.9s remaining:   49.9s
[Parallel(n_jobs=4)]: Done   4 out of   4 | elapsed:   56.8s finished


CPU times: user 38.6 s, sys: 1min 3s, total: 1min 42s
Wall time: 2min 58s


## Fitting with Amazon SageMaker Random Cut Forest

In [23]:
# Key prefix under which every S3 object of this notebook is stored.
prefix = "ieee-fraud-detection"

# AWS handles: the SageMaker session with its default bucket, the region of the
# default boto3 session, and the execution role used by the jobs below.
sagemaker_session = sagemaker.Session()
bucket = sagemaker_session.default_bucket()
region = boto3.Session().region_name
role = sagemaker.get_execution_role()

# Only print the location when the bucket check passes. The last dash-separated
# segment of the bucket name is left out of the message.
if check_bucket_permission(bucket):
    print(f"Input/output will be stored in: s3://{'_'.join(bucket.split('-')[:-1])}_.../{prefix}")

Input/output will be stored in: s3://sagemaker_us_east_1_.../ieee-fraud-detection


In [24]:
%%time
num_samples_per_tree = 512
num_trees = 100

rcf_params = dict(
    role=role,
    instance_count=1,
    instance_type="ml.m4.xlarge",
    data_location=f"s3://{bucket}/{prefix}/train/",
    output_path=f"s3://{bucket}/{prefix}/models",
    num_samples_per_tree=num_samples_per_tree,
    num_trees=num_trees,
)
rcf_estimator = RandomCutForest(**rcf_params)

# Wrap the training matrix as a record set, then launch the training job
# with log streaming turned off.
training_records = rcf_estimator.record_set(X_train)
_ = rcf_estimator.fit(training_records, logs=False)

Defaulting to the only supported framework/algorithm version: 1. Ignoring framework/algorithm version: 1.
Defaulting to the only supported framework/algorithm version: 1. Ignoring framework/algorithm version: 1.



2022-08-03 06:08:32 Starting - Starting the training job.....
2022-08-03 06:09:00 Starting - Preparing the instances for training................
2022-08-03 06:10:34 Downloading - Downloading input data.........
2022-08-03 06:11:25 Training - Downloading the training image..........
2022-08-03 06:12:25 Training - Training image download completed. Training in progress...........
2022-08-03 06:13:26 Uploading - Uploading generated training model...............
2022-08-03 06:14:46 Completed - Training job completed
CPU times: user 28.4 s, sys: 11.6 s, total: 40 s
Wall time: 18min 20s


#### Uploading Validation Set to S3 Bucket

In [25]:
# Write the validation matrix to CSV and upload it for SageMaker batch transform.
# The matrix holds standardized (floating-point) features, so it is saved with
# a float format. An integer format ("%i") would truncate every value toward
# zero, and the forest would then be scored on data that differs from the
# matrix it was trained on.
valid_csv_path = os.path.join(DATA_DIR, "X_valid.csv")
np.savetxt(valid_csv_path, X_valid, delimiter=",", fmt="%.6g")
valid_data_uri = sagemaker_session.upload_data(
    valid_csv_path, bucket=bucket, key_prefix=f"{prefix}/valid"
)

## Defining Transformer and Prediction

In [26]:
%%time
# Build a batch transformer from the trained forest; predictions are written
# under the "pred" prefix of the bucket.
transformer_params = dict(
    instance_count=1,
    instance_type="ml.m4.xlarge",
    output_path=f"s3://{bucket}/{prefix}/pred",
)
rcf_transformer = rcf_estimator.transformer(**transformer_params)

# Score the uploaded validation CSV one line at a time, with log streaming off.
_ = rcf_transformer.transform(
    data=valid_data_uri,
    content_type="text/csv",
    split_type="Line",
    logs=False,
)

Defaulting to the only supported framework/algorithm version: 1. Ignoring framework/algorithm version: 1.


....................................................................................................................................................!
CPU times: user 582 ms, sys: 101 ms, total: 683 ms
Wall time: 13min 11s


#### Downloading Prediction Scores to Local Directory

In [27]:
# Download the batch-transform output for the validation set.
pred_path = os.path.join(DATA_DIR, "X_valid.csv.out")
boto3.resource("s3").meta.client.download_file(
    bucket, f"{prefix}/pred/X_valid.csv.out", pred_path
)

# Every output line is a small object with a "score" key. Parse the file as
# JSON Lines instead of calling eval() on downloaded text, which would execute
# whatever expression the file contains. `precise_float=True` keeps full
# double precision when decoding the scores.
rcf_scores = pd.read_json(pred_path, lines=True, precise_float=True)["score"].values

dump_pickle(os.path.join(MODEL_DIR, "rcf_scores.pkl"), rcf_scores)

# Model Evaluation

In [28]:
model_names = ["IF", "COPOD", "RCF"]

# Outlier scores per model: column 1 of the PyOD probability arrays, and the
# raw Random Cut Forest scores.
score_sets = [if_scores[:, 1], cop_scores[:, 1], rcf_scores]

# One score histogram per model, stacked vertically.
charts = [
    plot_histogram(score, model, bins=50, bar_size=15, height=150, n_digits=2)
    for model, score in zip(model_names, score_sets)
]

alt.VConcatChart(vconcat=charts).configure_axisX(labelAngle=-45)

In [29]:
# `roc_curves` and `pr_curves` are flat lists holding three arrays per model, in
# the order returned by scikit-learn: (fpr, tpr, thresholds) and
# (precision, recall, thresholds). Model k therefore starts at index 3 * k.
roc_curves, aurocs = [], []
pr_curves, auprcs = [], []
for scores in (if_scores[:, 1], cop_scores[:, 1], rcf_scores):
    roc_curves.extend(roc_curve(df_y_valid, scores))
    aurocs.append(roc_auc_score(df_y_valid, scores))
    pr_curves.extend(precision_recall_curve(df_y_valid, scores))
    auprcs.append(average_precision_score(df_y_valid, scores))

In [30]:
# Common false-positive-rate grid on which every ROC curve is resampled.
x = np.linspace(0, 1, n_samples // 3)

# In `roc_curves`, model k owns indices 3k (fpr) and 3k + 1 (tpr).
resampled_tpr = [
    interp1d(roc_curves[start], roc_curves[start + 1])(x) for start in (0, 3, 6)
]
source = np.column_stack([x, *resampled_tpr])
columns = [
    f"{model_names[0]} (AUROC:{aurocs[0]:0.2%})",
    f"{model_names[1]} (AUROC:{aurocs[1]:0.2%})",
    f"{model_names[2]} (AUROC:{aurocs[2]:0.2%})",
]
source = pd.DataFrame(source, columns=["x"] + columns)
source = pd.melt(source, id_vars=["x"], value_vars=columns)

# Hovering near a curve selects that estimator.
highlight = alt.selection(
    type="single", on="mouseover", fields=["variable"], nearest=True
)
estimator_legend = alt.Legend(title="Estimator", orient="bottom-right")
base = alt.Chart(source).encode(
    x=alt.X("x:Q", title="False Positive Rate"),
    y=alt.Y("value:Q", title="True Positive Rate"),
    color=alt.Color("variable:N", legend=estimator_legend),
)

# Invisible points exist only to carry the hover selection.
points = (
    base.mark_circle().encode(opacity=alt.value(0)).add_selection(highlight).properties()
)
# The hovered curve is drawn thicker than the others.
line_width = alt.condition(~highlight, alt.value(1.5), alt.value(3))
line = base.mark_line().encode(size=line_width)

(points + line).properties(title="Receiver Operating Characteristic Curves")

In [31]:
# Resample every precision-recall curve on the shared grid `x` (used as recall).
# In `pr_curves`, model k owns indices 3k (precision) and 3k + 1 (recall).
resampled_precision = [
    interp1d(pr_curves[start + 1], pr_curves[start])(x) for start in (0, 3, 6)
]
source = np.column_stack([x, *resampled_precision])
columns = [
    f"{model_names[0]} (AUPRC:{auprcs[0]:0.2%})",
    f"{model_names[1]} (AUPRC:{auprcs[1]:0.2%})",
    f"{model_names[2]} (AUPRC:{auprcs[2]:0.2%})",
]
source = pd.DataFrame(source, columns=["x"] + columns)
source = pd.melt(source, id_vars=["x"], value_vars=columns)

# Hovering near a curve selects that estimator.
highlight = alt.selection(
    type="single", on="mouseover", fields=["variable"], nearest=True
)
estimator_legend = alt.Legend(title="Estimator", orient="top-right")
base = alt.Chart(source).encode(
    x=alt.X("x:Q", title="Recall"),
    y=alt.Y("value:Q", title="Precision", scale=alt.Scale(domain=[0.0, 1.0])),
    color=alt.Color("variable:N", legend=estimator_legend),
)

# Invisible points exist only to carry the hover selection.
points = (
    base.mark_circle().encode(opacity=alt.value(0)).add_selection(highlight).properties()
)
# The hovered curve is drawn thicker than the others.
line_width = alt.condition(~highlight, alt.value(1.5), alt.value(3))
lines = base.mark_line().encode(size=line_width)

(points + lines).properties(title="Precision - Recall Curves")

# Model Re-training
## Data Preprocessing, Fitting and Prediction

In [32]:
# Refit the preprocessing transformer on the full labelled frame (the earlier
# fit used the training split only) and apply it to the test frame.
# Note: this rebinds `X_train`, replacing the split-level matrix built earlier.
X_train = transformer.fit_transform(df_train[all_features])
X_test = transformer.transform(df_test[all_features])



In [33]:
%%time
n_jobs = 1  # the final fit runs with a single worker

# Retrain COPOD on the full training matrix (same `contamination` as before)
# and score the test set.
cop_estimator = COPOD(n_jobs=n_jobs, contamination=contamination)
_ = cop_estimator.fit(X_train)
cop_scores = cop_estimator.predict_proba(X_test)

CPU times: user 3min 30s, sys: 1min 22s, total: 4min 53s
Wall time: 5min 42s


In [34]:
# Submission frame: one row per test transaction, with column 1 of the COPOD
# probability array as the fraud score.
submission = pd.DataFrame(
    {"TransactionID": df_test["TransactionID"].values}
).assign(isFraud=cop_scores[:, 1])

submission_path = os.path.join(MODEL_DIR, "submission.csv")
submission.to_csv(submission_path, index=False)