In [1]:
# ! pip install pycaret

# Credit Card Fraud Detection with PyCaret

In this notebook, we will test out the AutoML library [PyCaret](https://pycaret.org/) with reference to its [Binary Classification tutorial](https://github.com/pycaret/pycaret/blob/master/tutorials/Binary%20Classification%20Tutorial%20Level%20Beginner%20-%20%20CLF101.ipynb). Using PyCaret, we will select and train a few models on the data generated by the [synthetic credit card transactions generator](./01-Synthetic-data-generation.ipynb).  

Our ultimate goal is to select a couple of models and use the trained models to perform real-time credit card fraud detection.  
Combined with [atoti](https://atoti.io/), we will compute the impact on key business metrics and also compare the performance of the models.

In [2]:
import glob
import os
from datetime import date
from zipfile import ZipFile

import numpy as np
import pandas as pd
import wget
from haversine import Unit, haversine
from IPython.display import clear_output, display
from utils import data_prep

## 1 Data preparation

### 1.1 Gather customer and credit card transactions data  

You may use the [01-Synthetic-data-generation](./01-Synthetic-data-generation.ipynb) notebook to generate your own data. Alternatively, you can download a saved version of customers and transactions from Jun - Oct 2021.

In [3]:
# data_path: where the processed CSVs are written.
# profile_path: where the raw customer/transaction CSVs are read from.
data_path = "./output"
profile_path = "./profiles_data"

# exist_ok=True keeps this cell idempotent: with exist_ok=False a second run of the
# notebook (Restart & Run All) raised FileExistsError because the directory was
# already created by the previous run.
os.makedirs(data_path, exist_ok=True)

Skip the below step if data has already been downloaded.

In [4]:
def bar_custom(current, total, width=80):
    """Progress callback for ``wget.download`` that prints a single status line.

    Args:
        current: Number of bytes downloaded so far.
        total: Expected total size in bytes. May be zero or negative when the
            size is not known, in which case no percentage is shown.
        width: Unused; kept so the function matches the three-argument
            callback signature (presumably what wget passes - verify).
    """
    # Replace the previous status line instead of appending a new one per chunk.
    clear_output(wait=True)
    if total and total > 0:
        print("Downloading: %d%% [%d / %d] bytes" % (current / total * 100, current, total))
    else:
        # Unknown size: avoid ZeroDivisionError and meaningless negative percentages.
        print("Downloading: [%d / ?] bytes" % current)


# Download the saved customers/transactions archive.
url = "https://data.atoti.io/notebooks/credit-card-fraud/profiles_data.zip"
filename = wget.download(url, bar=bar_custom)

# Unzip the file that was actually downloaded. The path returned by wget is used
# instead of a hard-coded "profiles_data.zip": when a file with that name already
# exists, wget saves under a different name, and the hard-coded path would then
# silently extract the stale archive.
with ZipFile(filename, "r") as zipObj:
    # Extract all the contents of zip file in current directory
    zipObj.extractall()

Downloading: 100% [153865413 / 153865413] bytes


#### 1.1.1 Process transactions files

We concatenate the transactions for all the profiles into a single DataFrame.

In [5]:
# Collect the per-profile transaction CSVs.
# glob returns paths in arbitrary, filesystem-dependent order; sorting makes the
# row order (and hence the row index) of the concatenated frame reproducible.
all_files = sorted(glob.glob(profile_path + "/*adults*.csv"))

# Fail early with a clear message rather than letting pd.concat raise later on an
# empty list.
if not all_files:
    raise FileNotFoundError(
        f"No transaction files matching '*adults*.csv' found in {profile_path}; "
        "download or generate the data first."
    )

In [6]:
# Load every per-profile CSV and stack them into one frame with a fresh 0..n-1 index.
files = [pd.read_csv(csv_path, index_col=None, header=0) for csv_path in all_files]
txns_df = pd.concat(files, axis=0, ignore_index=True)

n_rows, n_cols = txns_df.shape
print("Total number of transactions: ", n_rows, " - number of columns: ", n_cols)

txns_df.head()

Total number of transactions:  1276543  - number of columns:  14


Unnamed: 0,ssn,cc_num,acct_num,profile,trans_num,trans_date,trans_time,unix_time,category,amt,is_fraud,merchant,merch_lat,merch_long
0,028-33-9309,4391470058239421,76479233103,adults_2550_female_rural.json,a32d15100140b93f67430a0d51f7f470,2021-08-24,03:09:54,1629774594,gas_transport,8.78,1,fraud_Prohaska-Murray,43.469516,-85.860994
1,028-33-9309,4391470058239421,76479233103,adults_2550_female_rural.json,4302c8e427aebde7200b8cb312b092e6,2021-08-23,02:35:13,1629686113,grocery_pos,368.16,1,fraud_Cartwright-Harris,44.040299,-84.026526
2,028-33-9309,4391470058239421,76479233103,adults_2550_female_rural.json,3e2edc0fa18cb831ab946b3636b2beb4,2021-08-24,23:16:43,1629847003,home,276.14,1,fraud_Beier and Sons,43.735269,-85.12236
3,028-33-9309,4391470058239421,76479233103,adults_2550_female_rural.json,a33a6497109e098aa322c904f105d81b,2021-08-24,22:30:43,1629844243,food_dining,106.2,1,"fraud_Kutch, Steuber and Gerhold",43.43289,-85.804933
4,028-33-9309,4391470058239421,76479233103,adults_2550_female_rural.json,c902dfaa64a6756f7d0f6e5fbea02284,2021-08-23,01:03:58,1629680638,shopping_net,1094.5,1,fraud_Jast Ltd,43.245551,-85.220724


#### 1.1.2 Reducing number of merchants 

In the synthetic data generator, merchant locations are computed based on the distance from the customer. Hence, there may be many merchants with the same name close to one another. We are going to clean this up by combining same-named merchants that are close by.

In [7]:
# Combine same-named merchants that are close to each other (helper lives in utils.data_prep).
# NOTE(review): txns_df is rebound to the cleaned frame, so re-running this cell feeds
# already-cleaned data back in - confirm merchant_cleanup is idempotent.
txns_df = data_prep.merchant_cleanup(txns_df)

In [8]:
# Order the transactions chronologically (date first, then time of day).
txns_df = txns_df.sort_values(by=["trans_date", "trans_time"])
txns_df.head()

Unnamed: 0,ssn,cc_num,acct_num,profile,trans_num,trans_date,trans_time,unix_time,category,amt,is_fraud,merchant,merchant_id,merch_long,merch_lat
350538,651-85-6277,6011254895433784,650277300926,adults_2550_male_rural.json,78a301f6c1d629851f2134f3d428bd64,2021-06-01,00:03:16,1622505796,grocery_pos,94.07,0,fraud_Barton Inc,191095,-80.583026,34.505328
156578,829-65-8471,3561104445052090,359048776675,adults_2550_female_urban.json,b1f41c9941a8fef0c58b7da9c53b87d1,2021-06-01,00:05:54,1622505954,misc_net,2.52,0,fraud_Ruecker Group,156108,-74.303208,40.728483
1030583,765-51-3253,30177657960332,457496356031,adults_50up_male_urban.json,4c277b5b47525ffd52919af6c80f292a,2021-06-01,00:06:50,1622506010,food_dining,40.44,0,fraud_Haag-Blanda,289878,-87.214427,42.418391
473195,843-90-9424,4893518176925451756,261496133542,adults_2550_male_urban.json,a24a9cfd2eb04f1e477c8b2b28f57bd7,2021-06-01,00:09:07,1622506147,grocery_pos,82.0,0,fraud_Bauch-Raynor,418020,-97.492732,36.430343
337806,201-15-2262,3510018921883087,840477491610,adults_2550_male_rural.json,2ecedf1776c3de3f8e87cd4842f855f4,2021-06-01,00:10:19,1622506219,grocery_pos,82.95,0,fraud_Kiehn-Emmerich,337806,-76.54049,43.92555


In [9]:
# (rows, columns) of the transactions frame after merchant cleanup.
txns_df.shape

(1276543, 15)

In [10]:
# Class balance of the target column.
fraud_flag = txns_df["is_fraud"]
n_legit = int((fraud_flag == 0).sum())
n_fraud = int((fraud_flag == 1).sum())
print(f"Non-fraudulent: {n_legit}, Fraudulent: {n_fraud}")

Non-fraudulent: 1185276, Fraudulent: 91267


#### 1.1.3 Load customer file  

In [11]:
# Load the customer master list that ships alongside the transaction files.
cust_df = pd.read_csv(f"{profile_path}/customers_list.csv")
cust_df.head()

Unnamed: 0,ssn,cc_num,first,last,gender,street,city,state,zip,lat,long,city_pop,job,dob,acct_num,profile
0,535-89-6775,346545733014127,Carla,Sheppard,F,013 Daniel Trafficway Suite 429,Pittsburgh,PA,15213,40.444,-79.9552,687276,"Psychologist, occupational",1938-02-10,438882350784,adults_50up_female_urban.json
1,354-81-2556,3518152160915706,Jaime,Boyle,F,88361 Schaefer Cove Suite 452,Lester,AL,35647,34.9596,-87.1006,1030,Theme park manager,1944-10-08,347776380945,adults_50up_female_rural.json
2,422-27-9393,5429874770003176,Seth,Mitchell,M,79740 Maria View,Santee,CA,92071,32.8486,-116.9862,53422,"Engineer, agricultural",1966-07-13,97717754818,adults_50up_male_urban.json
3,028-33-9309,4391470058239421,Danielle,Rodriguez,F,8929 Kenneth Courts Apt. 591,Muir,MI,48860,43.0439,-84.9391,1334,Publishing rights manager,1992-04-12,76479233103,adults_2550_female_rural.json
4,246-23-6185,5395351866032237,Nicole,Villarreal,F,2655 Rebecca Parkway,Welches,OR,97067,45.3399,-121.9598,2138,"Education officer, museum",1999-08-22,389654813660,young_adults_female_rural.json


In [12]:
# Number of rows in the customer list.
len(cust_df)

9353

#### 1.1.4 Compute additional features

The `compute_features` function merges the customers and transactions to compute additional features:
- customer age
- distance between customer and merchant
- time of transaction
    - Weekday or weekend
    - Day or night
- cumulative number of transactions for customer in the past 1, 7 and 30 days
- cumulative average transaction amount for customer in the past 1, 7 and 30 days

In [13]:
# Join customers onto transactions and derive the additional features listed above
# (helper lives in utils.data_prep).
cc_txn_df = data_prep.compute_features(cust_df, txns_df)

In [14]:
# Row count of the enriched frame, for comparison with the transaction count shown earlier.
len(cc_txn_df)

1276543

In [15]:
# Peek at the last rows of the enriched frame, including the derived feature columns.
cc_txn_df.tail()

Unnamed: 0,ssn,cc_num,acct_num,profile,trans_num,trans_date,trans_time,unix_time,category,amt,...,age,distance_from_cust,txn_during_night,trans_weekend,nb_txns_1_days,avg_txns_amt_1_days,nb_txns_7_days,avg_txns_amt_7_days,nb_txns_30_days,avg_txns_amt_30_days
1276538,402-78-5680,30306133731849,683914675882,adults_50up_male_urban.json,e0570d36e082a5913159e90459f95763,2021-10-31,23:58:08,1635724688,misc_net,780.88,...,74,87.147193,1,1,7.0,562.718571,12.0,553.205833,12.0,553.205833
1276539,590-91-7122,3525276120447046,301548051587,young_adults_male_urban.json,5087d30049ca44c3a8b06f5f16d4400a,2021-10-31,23:58:16,1635724696,shopping_net,900.05,...,25,69.105396,1,1,8.0,466.5425,17.0,546.149412,17.0,546.149412
1276540,403-93-5583,639082412268,847752784845,adults_2550_female_rural.json,a6f360bf3f1eee039d16a3dffc7caab0,2021-10-31,23:58:41,1635724721,travel,10.29,...,45,126.952035,1,1,6.0,412.798333,15.0,312.095333,15.0,312.095333
1276541,329-13-2201,30179130072305,205446777871,adults_50up_female_urban.json,e2597d6acf0b135cd9e261c257051bb1,2021-10-31,23:59:55,1635724795,misc_net,891.91,...,75,98.274275,1,1,7.0,400.428571,12.0,446.11,12.0,446.11
1276542,790-64-5257,3552783966051994,573043127801,adults_50up_female_rural.json,91ae1f2cd55e7d21d4059c2d3bc56bd1,2021-10-31,23:59:58,1635724798,entertainment,610.12,...,81,234.908522,1,1,4.0,510.6,7.0,297.687143,7.0,297.687143


In [16]:
# Full list of columns available after feature computation.
cc_txn_df.columns

Index(['ssn', 'cc_num', 'acct_num', 'profile', 'trans_num', 'trans_date',
       'trans_time', 'unix_time', 'category', 'amt', 'is_fraud', 'merchant',
       'merchant_id', 'merch_long', 'merch_lat', 'first', 'last', 'gender',
       'street', 'city', 'state', 'zip', 'lat', 'long', 'city_pop', 'job',
       'dob', 'age', 'distance_from_cust', 'txn_during_night', 'trans_weekend',
       'nb_txns_1_days', 'avg_txns_amt_1_days', 'nb_txns_7_days',
       'avg_txns_amt_7_days', 'nb_txns_30_days', 'avg_txns_amt_30_days'],
      dtype='object')

### 1.2 Splitting data for atoti and AutoML

Below are the new columns for transactions and customers after computing the new features.  
We will output these features to CSV for data exploration in atoti.

In [17]:
# Derived, transaction-level features appended to the transaction columns.
derived_txn_features = [
    "distance_from_cust",
    "txn_during_night",
    "trans_weekend",
    "nb_txns_1_days",
    "avg_txns_amt_1_days",
    "nb_txns_7_days",
    "avg_txns_amt_7_days",
    "nb_txns_30_days",
    "avg_txns_amt_30_days",
]

# Columns exported per table: the original columns plus their derived features.
txn_new_columns = [*txns_df.columns, *derived_txn_features]
cust_new_columns = [*cust_df.columns, "age"]

We will take data from the first 3 months for exploratory data analysis and data from September for real-time simulation on atoti.

In [18]:
# trans_date holds ISO "YYYY-MM-DD" strings, so plain string comparison orders by date.
trans_dates = cc_txn_df["trans_date"]
before_september = trans_dates < "2021-09-01"
in_september = (trans_dates >= "2021-09-01") & (trans_dates < "2021-10-01")

# Everything before September feeds the EDA; September is replayed as "real-time" data.
eda_txn_df = cc_txn_df.loc[before_september]
realtime_txn_df = cc_txn_df.loc[in_september]

#### 1.2.1 Export the processed data for EDA in atoti.

In [19]:
customer_csv = f"{data_path}/post_process_customer_list.csv"
eda_txn_csv = f"{data_path}/post_process_transaction_list.csv"
realtime_txn_csv = f"{data_path}/realtime_txn.csv"

# customer columns, de-duplicated because they repeat on every transaction row
customers_out = cc_txn_df[cust_new_columns].drop_duplicates()
customers_out.to_csv(customer_csv, index=False)

# transaction data for ml modeling and to be loaded into atoti for eda
eda_out = eda_txn_df[txn_new_columns]
eda_out.to_csv(eda_txn_csv, index=False)

# transaction data to be exported for real-time simulation in atoti
realtime_out = realtime_txn_df[txn_new_columns]
realtime_out.to_csv(realtime_txn_csv, index=False)

#### 1.2.2 Split data for machine learning and actual test on unseen data

In the example from [PyCaret tutorial](https://github.com/pycaret/pycaret/blob/master/tutorials/Binary%20Classification%20Tutorial%20Level%20Beginner%20-%20%20CLF101.ipynb), the data set is split into _data_ and _data_unseen_.  

In our case, we will use the June data for AutoML, which means that it will be further split into training and testing datasets.  
July and August data will be used to test the accuracy of the final model on unseen data.  

In [20]:
# June is used for training/validation; July and August are held back as unseen data.
txn_dates = cc_txn_df["trans_date"]
before_july = txn_dates < "2021-07-01"
july_or_august = (txn_dates >= "2021-07-01") & (txn_dates < "2021-09-01")

june_data = cc_txn_df.loc[before_july]
july_aug_data = cc_txn_df.loc[july_or_august]

print(f"June: {june_data.shape[0]} - July & August: {july_aug_data.shape[0]}")

June: 55648 - July & August: 328520


In [21]:
# Last rows of the hold-back set; they should fall at the end of August.
july_aug_data.tail()

Unnamed: 0,ssn,cc_num,acct_num,profile,trans_num,trans_date,trans_time,unix_time,category,amt,...,age,distance_from_cust,txn_during_night,trans_weekend,nb_txns_1_days,avg_txns_amt_1_days,nb_txns_7_days,avg_txns_amt_7_days,nb_txns_30_days,avg_txns_amt_30_days
384163,717-56-3781,343089588484922,382840886816,adults_50up_male_rural.json,726c8636d9639403c9e343b35214dc7d,2021-08-31,23:59:12,1630454352,home,28.55,...,63,46.21627,1,0,3.0,32.563333,31.0,57.515161,134.0,61.500299
384164,282-33-3795,4486993221031082,988409340048,adults_2550_female_urban.json,ffaee26ba5112a5b022ee5551d47063f,2021-08-31,23:59:13,1630454353,entertainment,152.87,...,28,90.797907,1,0,8.0,64.37875,32.0,75.802188,32.0,75.802188
384165,282-33-3795,4486993221031082,988409340048,adults_2550_female_urban.json,b23ed48bdbf12f4556c6b76a1e00937e,2021-08-31,23:59:28,1630454368,personal_care,13.05,...,28,93.939469,1,0,9.0,58.675556,33.0,73.900606,33.0,73.900606
384166,890-25-8008,341511357666803,156690221471,young_adults_female_urban.json,1adeceacd294b7c75a33e5ba925711e6,2021-08-31,23:59:35,1630454375,personal_care,3.59,...,22,78.924718,1,0,4.0,19.9975,24.0,57.830417,96.0,46.29375
384167,488-44-6293,372091910730620,202299763857,adults_50up_male_rural.json,7c2f6c73a993dcab9d6230a8bc66c972,2021-08-31,23:59:42,1630454382,home,13.5,...,61,116.68854,1,0,3.0,212.28,19.0,82.588947,56.0,61.99875


In [22]:
# Number of July/August transactions held back as unseen data.
len(july_aug_data)

328520

#### 1.2.3 Select features for machine learning  

We could have used the [`ignore_features`](https://pycaret.org/classification1/) from PyCaret to ignore some data columns. However, to make it easier to see what are the features that were finally used, we perform filtered selection instead.  

In [23]:
# Columns handed to PyCaret. "is_fraud" is the prediction target (passed as `target`
# to setup() below); every other entry is a model input. Identifier columns such as
# trans_num, ssn and cc_num are deliberately left out.
features_list = [
    "category",
    "amt",
    "is_fraud",
    "merchant_id",
    "gender",
    "street",
    "city",
    "state",
    "zip",
    "city_pop",
    "job",
    "age",
    "distance_from_cust",
    "txn_during_night",
    "trans_weekend",
    "nb_txns_1_days",
    "avg_txns_amt_1_days",
    "nb_txns_7_days",
    "avg_txns_amt_7_days",
    "nb_txns_30_days",
    "avg_txns_amt_30_days",
]

In [24]:
# Keep only the modelling columns: `data` is the training month, `data_unseen` the hold-back months.
data, data_unseen = (frame[features_list] for frame in (june_data, july_aug_data))

In [25]:
# Row counts of the two modelling sets.
print(f"data: {data.shape[0]} - data_unseen: {data_unseen.shape[0]}")

data: 55648 - data_unseen: 328520


Reset index for the data in preparation for machine learning.

In [26]:
# Replace the inherited row labels with a clean 0..n-1 index on both sets.
data = data.reset_index(drop=True)
data_unseen = data_unseen.reset_index(drop=True)

## 2 Testing autoML with PyCaret

### 2.1 Setting up environment in PyCaret

PyCaret automatically infers the data type. We can [override the inferred data type](https://pycaret.org/data-types/) adding the column names under `numeric_features` or `categorical_features`.  

Despite using a smaller dataset, we still encountered a memory issue. As a workaround, we add the high-cardinality features to the `high_cardinality_features` parameter.  
Note: Uncomment the `ignore_features` below if you would like to exclude the cumulative features from the machine learning model.

In [27]:
# NOTE(review): the star import is kept because the remaining cells call the PyCaret
# functions (setup, compare_models, create_model, ...) unqualified.
from pycaret.classification import *

# Initialise the PyCaret experiment on the June data.
# - numeric_features / categorical_features override PyCaret's inferred data types.
# - high_cardinality_features is the memory workaround described above.
# - session_id pins the random seed. Without it PyCaret draws a new random seed on
#   every run, so the split, the model scores and the tuned hyper-parameters change
#   from run to run. 7679 is the seed of the run whose outputs are recorded in this
#   notebook (see the `session_id` row and `random_state` values in the outputs).
clf1 = setup(
    data=data,
    target="is_fraud",
    numeric_features=["txn_during_night", "trans_weekend"],
    categorical_features=["zip", "merchant_id"],
    high_cardinality_features=["street", "city", "state", "zip", "merchant_id"],
    session_id=7679,
    # ignore_features=[
    #     "nb_txns_1_days",
    #     "avg_txns_amt_1_days",
    #     "nb_txns_7_days",
    #     "avg_txns_amt_7_days",
    #     "nb_txns_30_days",
    #     "avg_txns_amt_30_days",
    # ],
)

Unnamed: 0,Description,Value
0,session_id,7679
1,Target,is_fraud
2,Target Type,Binary
3,Label Encoded,
4,Original Data,"(55648, 21)"
5,Missing Values,False
6,Numeric Features,12
7,Categorical Features,8
8,Ordinal Features,False
9,High Cardinality Features,True


From the table, we can see that the number of features increased from 21 to 491 due to categorical encoding:
- The Original Data	(55648, 21)
- Transformed Train Set	(38953, 491)
- Transformed Test Set	(16695, 491)

### 2.2 Comparing all models

The goal is to be able to detect potential fraud and enable the bank to investigate or alert the consumer of suspicious transactions. Therefore, we will choose a few models with the highest F1 score, AUC and Recall for further comparison. 

In [28]:
# Train and cross-validate the available classifiers and keep the top-ranked one.
# NOTE(review): no `sort` argument is given, so the ranking presumably uses PyCaret's
# default metric (Accuracy per its docs) rather than F1/AUC/Recall - confirm intended.
best_model = compare_models()

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
lightgbm,Light Gradient Boosting Machine,0.9969,0.9976,0.888,0.9605,0.9223,0.9207,0.9217,0.368
gbc,Gradient Boosting Classifier,0.9947,0.9921,0.821,0.9145,0.8649,0.8622,0.8637,7.299
rf,Random Forest Classifier,0.9938,0.9961,0.7138,0.9798,0.8254,0.8223,0.8333,2.181
dt,Decision Tree Classifier,0.9934,0.9193,0.8421,0.8421,0.841,0.8377,0.8382,0.518
et,Extra Trees Classifier,0.9931,0.9941,0.6778,0.9857,0.802,0.7987,0.8138,4.41
ada,Ada Boost Classifier,0.9926,0.9909,0.7712,0.855,0.8103,0.8065,0.808,1.904
knn,K Neighbors Classifier,0.9882,0.9252,0.5758,0.7987,0.6686,0.6628,0.6723,1.983
lda,Linear Discriminant Analysis,0.9862,0.9748,0.7114,0.6501,0.6789,0.6719,0.6728,1.891
lr,Logistic Regression,0.9828,0.9095,0.3308,0.6684,0.4413,0.4336,0.4621,7.011
ridge,Ridge Classifier,0.9819,0.0,0.1928,0.7382,0.3051,0.2991,0.3709,0.155


In [29]:
# Show the estimator (and its hyper-parameters) returned by compare_models().
best_model

LGBMClassifier(boosting_type='gbdt', class_weight=None, colsample_bytree=1.0,
               importance_type='split', learning_rate=0.1, max_depth=-1,
               min_child_samples=20, min_child_weight=0.001, min_split_gain=0.0,
               n_estimators=100, n_jobs=-1, num_leaves=31, objective=None,
               random_state=7679, reg_alpha=0.0, reg_lambda=0.0, silent='warn',
               subsample=1.0, subsample_for_bin=200000, subsample_freq=0)

### 2.3 Creating models

Let's create the model for a few of those with highest F1 score and AUC. In fraud detection, we aim to raise alert to the credit cardholder of suspicious transactions. Hence, we will take the model that scored well for both AUC and recall.  

#### 2.3.1 Decision Tree Classifier

In [30]:
# Cross-validate a decision tree; the per-fold metrics are displayed below.
dt = create_model("dt")

Unnamed: 0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,0.9931,0.9421,0.8889,0.8,0.8421,0.8386,0.8398
1,0.9938,0.9244,0.8519,0.8519,0.8519,0.8487,0.8487
2,0.9931,0.8817,0.7654,0.8857,0.8212,0.8177,0.8199
3,0.9936,0.9478,0.9,0.809,0.8521,0.8488,0.8501
4,0.9926,0.9044,0.8125,0.8228,0.8176,0.8138,0.8138
5,0.9944,0.9237,0.85,0.8718,0.8608,0.8579,0.8579
6,0.9972,0.9618,0.925,0.9367,0.9308,0.9294,0.9294
7,0.9897,0.8785,0.7625,0.7439,0.7531,0.7478,0.7479
8,0.9941,0.9113,0.825,0.88,0.8516,0.8486,0.8491
9,0.9928,0.9178,0.8395,0.8193,0.8293,0.8256,0.8257


#### 2.3.2 Extra Trees Classifier

In [31]:
# Cross-validate an extra-trees classifier; the per-fold metrics are displayed below.
et = create_model("et")

Unnamed: 0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,0.9938,0.9905,0.716,0.9831,0.8286,0.8255,0.8363
1,0.992,0.9986,0.6296,0.9808,0.7669,0.7631,0.7825
2,0.9936,0.9981,0.7037,0.9828,0.8201,0.817,0.8288
3,0.9933,0.9874,0.675,1.0,0.806,0.8027,0.8188
4,0.9944,0.9972,0.725,1.0,0.8406,0.8378,0.849
5,0.9938,0.9914,0.7125,0.9828,0.8261,0.823,0.8341
6,0.9944,0.9991,0.725,1.0,0.8406,0.8378,0.849
7,0.9915,0.9897,0.6375,0.9273,0.7556,0.7514,0.7651
8,0.9908,0.9904,0.55,1.0,0.7097,0.7054,0.7381
9,0.9938,0.9986,0.7037,1.0,0.8261,0.823,0.8362


#### 2.3.3 Light Gradient Boosting Machine

In [32]:
# Cross-validate a LightGBM classifier; the per-fold metrics are displayed below.
lgbm = create_model("lightgbm")

Unnamed: 0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,0.9974,0.9986,0.9136,0.961,0.9367,0.9354,0.9357
1,0.9967,0.9995,0.9012,0.9359,0.9182,0.9165,0.9167
2,0.9967,0.9975,0.8889,0.9474,0.9172,0.9155,0.916
3,0.9969,0.9987,0.8875,0.9595,0.9221,0.9205,0.9212
4,0.9956,0.9975,0.8,0.9846,0.8828,0.8806,0.8855
5,0.9959,0.9927,0.8875,0.9103,0.8987,0.8966,0.8967
6,0.9979,0.9997,0.925,0.9737,0.9487,0.9477,0.948
7,0.9977,0.9932,0.9125,0.9733,0.9419,0.9408,0.9413
8,0.9979,0.9991,0.9,1.0,0.9474,0.9463,0.9477
9,0.9964,0.9994,0.8642,0.9589,0.9091,0.9073,0.9085


### 2.4 Tune models

`create_model()` trains the model using default hyperparameters. We will use `tune_model()` function to tune the hyperparameters.  

In [33]:
# Hyper-parameter search for the decision tree.
# NOTE(review): no `optimize` metric is passed, so the search presumably targets
# PyCaret's default (Accuracy per its docs). On this imbalanced target the tuned tree
# scores lower on Recall/F1 than the untuned one - confirm that is acceptable.
tuned_dt = tune_model(dt)

Unnamed: 0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,0.9915,0.9488,0.7531,0.8243,0.7871,0.7828,0.7836
1,0.9882,0.9007,0.5556,0.8182,0.6618,0.656,0.6687
2,0.9884,0.9506,0.679,0.7432,0.7097,0.7038,0.7045
3,0.9866,0.9124,0.6375,0.6892,0.6623,0.6555,0.6561
4,0.9874,0.9281,0.7,0.6914,0.6957,0.6892,0.6892
5,0.9892,0.9314,0.6875,0.7639,0.7237,0.7182,0.7192
6,0.9892,0.9219,0.675,0.7714,0.72,0.7145,0.7162
7,0.9874,0.9367,0.6625,0.7067,0.6839,0.6775,0.6778
8,0.99,0.9473,0.6375,0.8361,0.7234,0.7184,0.7253
9,0.9884,0.933,0.6296,0.7727,0.6939,0.6881,0.6918


In [34]:
# Hyper-parameter search for the extra-trees classifier.
# NOTE(review): the recorded folds show Recall/Precision of 0 after tuning, i.e. the
# tuned model never predicts fraud; the narrative below keeps it for completeness.
tuned_et = tune_model(et)

Unnamed: 0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,0.9792,0.9479,0.0,0.0,0.0,0.0,0.0
1,0.9792,0.9486,0.0,0.0,0.0,0.0,0.0
2,0.9792,0.9501,0.0,0.0,0.0,0.0,0.0
3,0.9795,0.9227,0.0,0.0,0.0,0.0,0.0
4,0.9795,0.9144,0.0,0.0,0.0,0.0,0.0
5,0.9795,0.9476,0.0,0.0,0.0,0.0,0.0
6,0.9795,0.953,0.0,0.0,0.0,0.0,0.0
7,0.9795,0.9216,0.0,0.0,0.0,0.0,0.0
8,0.9795,0.8738,0.0,0.0,0.0,0.0,0.0
9,0.9792,0.8866,0.0,0.0,0.0,0.0,0.0


In [35]:
# Hyper-parameter search for the LightGBM classifier.
tuned_lgbm = tune_model(lgbm)

Unnamed: 0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,0.9982,0.9963,0.963,0.9512,0.9571,0.9561,0.9562
1,0.9964,0.9987,0.8765,0.9467,0.9103,0.9084,0.9091
2,0.9949,0.9949,0.8148,0.9296,0.8684,0.8658,0.8678
3,0.9959,0.997,0.85,0.9444,0.8947,0.8926,0.8939
4,0.9949,0.9865,0.7875,0.9545,0.863,0.8604,0.8646
5,0.9951,0.9936,0.85,0.9067,0.8774,0.8749,0.8754
6,0.9979,0.9995,0.925,0.9737,0.9487,0.9477,0.948
7,0.9959,0.9913,0.8875,0.9103,0.8987,0.8966,0.8967
8,0.9972,0.9992,0.875,0.9859,0.9272,0.9257,0.9274
9,0.9969,0.9997,0.9012,0.9481,0.9241,0.9225,0.9228


### 2.5 Evaluating models

The `evaluate_model()` function displays a user interface for all the available plots for a given model. Let's see the performance for each of the models.

In [36]:
# Interactive plot selector for the tuned decision tree (renders a widget).
evaluate_model(tuned_dt)

interactive(children=(ToggleButtons(description='Plot Type:', icons=('',), options=(('Hyperparameters', 'param…

In [37]:
# Interactive plot selector for the tuned extra-trees classifier (renders a widget).
evaluate_model(tuned_et)

interactive(children=(ToggleButtons(description='Plot Type:', icons=('',), options=(('Hyperparameters', 'param…

In [38]:
# Interactive plot selector for the tuned LightGBM classifier (renders a widget).
evaluate_model(tuned_lgbm)

interactive(children=(ToggleButtons(description='Plot Type:', icons=('',), options=(('Hyperparameters', 'param…

### 2.6 Predict on test/hold-out sample  

We perform one final check by predicting on the test/hold-out set before we finalize the model. As we saw earlier in section 2.1, we have 16,695 samples in the test data:  
__Transformed Test Set: (16695, 491)__

In [39]:
# Score the tuned decision tree on the hold-out split; the trailing ";" hides the
# returned frame so only the metrics table is shown.
predict_model(tuned_dt);

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,Decision Tree Classifier,0.9887,0.9341,0.6532,0.7687,0.7062,0.7005,0.703


In [40]:
# Score the tuned extra-trees classifier on the hold-out split (returned frame suppressed).
predict_model(tuned_et);

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,Extra Trees Classifier,0.9793,0.9037,0.0,0.0,0.0,0.0,0.0


In [41]:
# Score the tuned LightGBM classifier on the hold-out split (returned frame suppressed).
predict_model(tuned_lgbm);

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,Light Gradient Boosting Machine,0.9956,0.9958,0.922,0.8716,0.8961,0.8938,0.8942


From the evaluation, we see that for the Extra Trees Classifier, the recall, precision and other metrics become 0. It also scores poorly on AUC.  
We can probably drop the model from our evaluation.

### 2.7 Finalize model for deployment  

Lastly, we proceed to train the model on the complete dataset with the `finalize_model()`. This should lead to the best model for use in making prediction on new and unseen data.

In [42]:
# Refit the tuned decision tree on the complete dataset (train + hold-out).
final_dt = finalize_model(tuned_dt)

In [43]:
# Refit the tuned extra-trees classifier on the complete dataset (train + hold-out).
final_et = finalize_model(tuned_et)

In [44]:
# Refit the tuned LightGBM classifier on the complete dataset (train + hold-out).
final_lgbm = finalize_model(tuned_lgbm)

In [45]:
# Show the hyper-parameters of the finalized decision tree.
print(final_dt)

DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='entropy',
                       max_depth=2, max_features=1.0, max_leaf_nodes=None,
                       min_impurity_decrease=0.0002, min_impurity_split=None,
                       min_samples_leaf=3, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort='deprecated',
                       random_state=7679, splitter='best')


In [46]:
# Show the hyper-parameters of the finalized LightGBM classifier.
print(final_lgbm)

LGBMClassifier(bagging_fraction=0.4, bagging_freq=3, boosting_type='gbdt',
               class_weight=None, colsample_bytree=1.0, feature_fraction=0.8,
               importance_type='split', learning_rate=0.15, max_depth=-1,
               min_child_samples=51, min_child_weight=0.001, min_split_gain=0.5,
               n_estimators=250, n_jobs=-1, num_leaves=6, objective=None,
               random_state=7679, reg_alpha=0.1, reg_lambda=0.005,
               silent='warn', subsample=1.0, subsample_for_bin=200000,
               subsample_freq=0)


In [47]:
# Show the hyper-parameters of the finalized extra-trees classifier.
print(final_et)

ExtraTreesClassifier(bootstrap=True, ccp_alpha=0.0, class_weight={},
                     criterion='gini', max_depth=8, max_features='sqrt',
                     max_leaf_nodes=None, max_samples=None,
                     min_impurity_decrease=0.001, min_impurity_split=None,
                     min_samples_leaf=6, min_samples_split=2,
                     min_weight_fraction_leaf=0.0, n_estimators=250, n_jobs=-1,
                     oob_score=False, random_state=7679, verbose=0,
                     warm_start=False)


### 2.8 Predict on unseen data  

We have previously used June data for ML modelling and put aside the July and August data in section 1.2.2.  
Now, we will use the unseen July and Aug data to test out the finalized model.  

#### 2.8.1 Decision Tree Classifier

In [48]:
# Columns written to each prediction CSV: the transaction id plus the two columns
# that predict_model appends.
prediction_output_col = ["trans_num", "Label", "Score"]

`Label` and `Score` columns are added onto the data_unseen set:  
- Label is the prediction  
- `Score` is the probability of the prediction.  

In [49]:
# Score the unseen July/August rows with the finalized decision tree.
unseen_predictions = predict_model(final_dt, data=data_unseen)
unseen_predictions.head()

Unnamed: 0,category,amt,is_fraud,merchant_id,gender,street,city,state,zip,city_pop,...,txn_during_night,trans_weekend,nb_txns_1_days,avg_txns_amt_1_days,nb_txns_7_days,avg_txns_amt_7_days,nb_txns_30_days,avg_txns_amt_30_days,Label,Score
0,shopping_pos,14.11,0,81458,F,7820 Wilson Tunnel Apt. 795,Andrews,SC,29510,11060,...,1,0,1.0,14.11,22.0,62.525455,33.0,55.510909,0,0.9976
1,grocery_pos,75.52,0,490762,M,2137 Serrano Station,North Charleston,SC,29420,71702,...,1,0,1.0,75.52,27.0,50.487407,69.0,73.28087,0,0.9976
2,grocery_pos,136.23,0,635517,F,4656 Karen Parkways,Elmo,MT,59915,403,...,1,0,1.0,136.23,34.0,88.363529,127.0,69.842756,0,0.9976
3,gas_transport,66.94,0,19638,F,74769 Anderson Plain,Oaktown,IN,47561,1588,...,1,0,1.0,66.94,10.0,28.29,60.0,30.62,0,0.9976
4,entertainment,116.85,0,23622,M,318 Brenda Terrace Suite 001,Beverly,WV,26253,3490,...,1,0,1.0,116.85,19.0,66.212105,92.0,65.748152,0,0.9976


It's nice that the predicted results are concatenated to the original dataset while all the transformations are automatically performed in the background.  

However, in section 1.2.3, we selected only the features needed for the ML model and left out the columns that we decided were not necessary.  
Hence, we will join the predictions back onto the original data frame, as we need the `trans_num` for our use in atoti later on.  

In [50]:
# Attach the predictions to the full July/August rows so that `trans_num` is available.
# The previous `merge()` had no explicit key, so it joined on every shared column
# (about 20, including floats); two transactions with identical feature values would be
# cross-joined into extra rows. The prediction frame has one row per input row in
# the same order (compare the heads above), so align by position instead and guard
# the assumption with a length check.
assert len(unseen_predictions) == len(july_aug_data), "prediction/input row count mismatch"
july_aug_prediction_dt = july_aug_data.reset_index(drop=True).assign(
    Label=unseen_predictions["Label"].to_numpy(),
    Score=unseen_predictions["Score"].to_numpy(),
)
july_aug_prediction_dt.head(3)

Unnamed: 0,ssn,cc_num,acct_num,profile,trans_num,trans_date,trans_time,unix_time,category,amt,...,txn_during_night,trans_weekend,nb_txns_1_days,avg_txns_amt_1_days,nb_txns_7_days,avg_txns_amt_7_days,nb_txns_30_days,avg_txns_amt_30_days,Label,Score
0,547-69-7793,6534665167391525,693931531856,adults_50up_female_urban.json,e14e1a05c8da5e3f3da457e6eb84d825,2021-07-01,00:00:18,1625097618,shopping_pos,14.11,...,1,0,1.0,14.11,22.0,62.525455,33.0,55.510909,0,0.9976
1,191-61-6759,4166219838226478,181679512065,adults_2550_male_urban.json,a3c0100cb4cfb7a5c4ffc90faaf7a6d8,2021-07-01,00:01:15,1625097675,grocery_pos,75.52,...,1,0,1.0,75.52,27.0,50.487407,69.0,73.28087,0,0.9976
2,027-46-7326,3513900304593051,920573624386,adults_50up_female_rural.json,9952d8bafe837216db5134d988d5144a,2021-07-01,00:02:12,1625097732,grocery_pos,136.23,...,1,0,1.0,136.23,34.0,88.363529,127.0,69.842756,0,0.9976


This step is purely to output the prediction for loading into atoti. We will use atoti to gather some insights on the impact of the prediction on the various business metrics.

In [51]:
# Export transaction id, predicted label and score of the decision tree for atoti.
july_aug_prediction_dt[prediction_output_col].to_csv(
    f"{data_path}/july_aug_prediction_full_dt.csv", index=False
)

#### 2.8.2 Extra Trees Classifier

For completeness, we will see the performance of this model despite its poor performance during the training.

In [52]:
# Score the unseen July/August rows with the finalized extra-trees classifier.
unseen_predictions_et = predict_model(final_et, data=data_unseen)
unseen_predictions_et.head()

Unnamed: 0,category,amt,is_fraud,merchant_id,gender,street,city,state,zip,city_pop,...,txn_during_night,trans_weekend,nb_txns_1_days,avg_txns_amt_1_days,nb_txns_7_days,avg_txns_amt_7_days,nb_txns_30_days,avg_txns_amt_30_days,Label,Score
0,shopping_pos,14.11,0,81458,F,7820 Wilson Tunnel Apt. 795,Andrews,SC,29510,11060,...,1,0,1.0,14.11,22.0,62.525455,33.0,55.510909,0,0.9781
1,grocery_pos,75.52,0,490762,M,2137 Serrano Station,North Charleston,SC,29420,71702,...,1,0,1.0,75.52,27.0,50.487407,69.0,73.28087,0,0.9781
2,grocery_pos,136.23,0,635517,F,4656 Karen Parkways,Elmo,MT,59915,403,...,1,0,1.0,136.23,34.0,88.363529,127.0,69.842756,0,0.9781
3,gas_transport,66.94,0,19638,F,74769 Anderson Plain,Oaktown,IN,47561,1588,...,1,0,1.0,66.94,10.0,28.29,60.0,30.62,0,0.9781
4,entertainment,116.85,0,23622,M,318 Brenda Terrace Suite 001,Beverly,WV,26253,3490,...,1,0,1.0,116.85,19.0,66.212105,92.0,65.748152,0,0.9781


In [53]:
# Attach the extra-trees predictions to the full July/August rows by position.
# A key-less `merge()` joins on every shared column (about 20, including floats) and
# would cross-join transactions with identical feature values; the prediction frame
# has one row per input row in the same order, which the length check guards.
assert len(unseen_predictions_et) == len(july_aug_data), "prediction/input row count mismatch"
july_aug_prediction_et = july_aug_data.reset_index(drop=True).assign(
    Label=unseen_predictions_et["Label"].to_numpy(),
    Score=unseen_predictions_et["Score"].to_numpy(),
)
july_aug_prediction_et.head(3)

Unnamed: 0,ssn,cc_num,acct_num,profile,trans_num,trans_date,trans_time,unix_time,category,amt,...,txn_during_night,trans_weekend,nb_txns_1_days,avg_txns_amt_1_days,nb_txns_7_days,avg_txns_amt_7_days,nb_txns_30_days,avg_txns_amt_30_days,Label,Score
0,547-69-7793,6534665167391525,693931531856,adults_50up_female_urban.json,e14e1a05c8da5e3f3da457e6eb84d825,2021-07-01,00:00:18,1625097618,shopping_pos,14.11,...,1,0,1.0,14.11,22.0,62.525455,33.0,55.510909,0,0.9781
1,191-61-6759,4166219838226478,181679512065,adults_2550_male_urban.json,a3c0100cb4cfb7a5c4ffc90faaf7a6d8,2021-07-01,00:01:15,1625097675,grocery_pos,75.52,...,1,0,1.0,75.52,27.0,50.487407,69.0,73.28087,0,0.9781
2,027-46-7326,3513900304593051,920573624386,adults_50up_female_rural.json,9952d8bafe837216db5134d988d5144a,2021-07-01,00:02:12,1625097732,grocery_pos,136.23,...,1,0,1.0,136.23,34.0,88.363529,127.0,69.842756,0,0.9781


In [54]:
# Export transaction id, predicted label and score of the extra-trees classifier for atoti.
july_aug_prediction_et[prediction_output_col].to_csv(
    f"{data_path}/july_aug_prediction_full_et.csv", index=False
)

#### 2.8.3 Light Gradient Boosting Machine

During the model comparison, LGBM had the highest AUC. Let's see if it does as well on the unseen data.

In [55]:
# Score the unseen July/August rows with the finalized LightGBM classifier.
unseen_predictions_lgbm = predict_model(final_lgbm, data=data_unseen)
unseen_predictions_lgbm.head()

Unnamed: 0,category,amt,is_fraud,merchant_id,gender,street,city,state,zip,city_pop,...,txn_during_night,trans_weekend,nb_txns_1_days,avg_txns_amt_1_days,nb_txns_7_days,avg_txns_amt_7_days,nb_txns_30_days,avg_txns_amt_30_days,Label,Score
0,shopping_pos,14.11,0,81458,F,7820 Wilson Tunnel Apt. 795,Andrews,SC,29510,11060,...,1,0,1.0,14.11,22.0,62.525455,33.0,55.510909,0,0.9999
1,grocery_pos,75.52,0,490762,M,2137 Serrano Station,North Charleston,SC,29420,71702,...,1,0,1.0,75.52,27.0,50.487407,69.0,73.28087,0,0.9998
2,grocery_pos,136.23,0,635517,F,4656 Karen Parkways,Elmo,MT,59915,403,...,1,0,1.0,136.23,34.0,88.363529,127.0,69.842756,0,1.0
3,gas_transport,66.94,0,19638,F,74769 Anderson Plain,Oaktown,IN,47561,1588,...,1,0,1.0,66.94,10.0,28.29,60.0,30.62,0,0.9999
4,entertainment,116.85,0,23622,M,318 Brenda Terrace Suite 001,Beverly,WV,26253,3490,...,1,0,1.0,116.85,19.0,66.212105,92.0,65.748152,0,0.9998


In [56]:
# Attach the LightGBM predictions to the full July/August rows by position.
# A key-less `merge()` joins on every shared column (about 20, including floats) and
# would cross-join transactions with identical feature values; the prediction frame
# has one row per input row in the same order, which the length check guards.
assert len(unseen_predictions_lgbm) == len(july_aug_data), "prediction/input row count mismatch"
july_aug_prediction_lgbm = july_aug_data.reset_index(drop=True).assign(
    Label=unseen_predictions_lgbm["Label"].to_numpy(),
    Score=unseen_predictions_lgbm["Score"].to_numpy(),
)
july_aug_prediction_lgbm.head(3)

Unnamed: 0,ssn,cc_num,acct_num,profile,trans_num,trans_date,trans_time,unix_time,category,amt,...,txn_during_night,trans_weekend,nb_txns_1_days,avg_txns_amt_1_days,nb_txns_7_days,avg_txns_amt_7_days,nb_txns_30_days,avg_txns_amt_30_days,Label,Score
0,547-69-7793,6534665167391525,693931531856,adults_50up_female_urban.json,e14e1a05c8da5e3f3da457e6eb84d825,2021-07-01,00:00:18,1625097618,shopping_pos,14.11,...,1,0,1.0,14.11,22.0,62.525455,33.0,55.510909,0,0.9999
1,191-61-6759,4166219838226478,181679512065,adults_2550_male_urban.json,a3c0100cb4cfb7a5c4ffc90faaf7a6d8,2021-07-01,00:01:15,1625097675,grocery_pos,75.52,...,1,0,1.0,75.52,27.0,50.487407,69.0,73.28087,0,0.9998
2,027-46-7326,3513900304593051,920573624386,adults_50up_female_rural.json,9952d8bafe837216db5134d988d5144a,2021-07-01,00:02:12,1625097732,grocery_pos,136.23,...,1,0,1.0,136.23,34.0,88.363529,127.0,69.842756,0,1.0


In [57]:
# Export transaction id, predicted label and score of the LightGBM classifier for atoti.
july_aug_prediction_lgbm[prediction_output_col].to_csv(
    f"{data_path}/july_aug_prediction_full_lgbm.csv", index=False
)

We will use the `check_metric` function from `pycaret.utils` to get the AUC of the run. We can change the metric to recall and compare on that instead if preferred.  
It seems that LGBM still has the best AUC. But generally, the AUC scores are lower than what we had at the `predict_model` stage in section 2.6.  

We shall see if the same holds true when we perform real-time streaming and ML evaluation in atoti.

In [58]:
from pycaret.utils import check_metric

# AUC of the first set of unseen predictions against the true fraud flag.
# The metric is computed from the hard `Label` predictions, not from `Score`.
y_true = unseen_predictions["is_fraud"]
y_pred = unseen_predictions["Label"]
check_metric(y_true, y_pred, metric="AUC")

0.8413

In [59]:
# Same AUC check for the `unseen_predictions_et` frame (true flag vs. hard `Label`).
y_true_et = unseen_predictions_et["is_fraud"]
y_pred_et = unseen_predictions_et["Label"]
check_metric(y_true_et, y_pred_et, metric="AUC")

0.5

In [60]:
# Same AUC check for the `unseen_predictions_lgbm` frame (true flag vs. hard `Label`).
y_true_lgbm = unseen_predictions_lgbm["is_fraud"]
y_pred_lgbm = unseen_predictions_lgbm["Label"]
check_metric(y_true_lgbm, y_pred_lgbm, metric="AUC")

0.9306

### 2.9 Save trained model for real-time fraud detection with atoti

Training the models takes a long time, so it is useful to save the trained models and reuse them later without having to retrain every time.  

In [61]:
# Persist the finalized decision-tree pipeline under a dated name for later reuse.
save_model(final_dt, model_name="Final DT Model 20211130")

Transformation Pipeline and Model Successfully Saved


(Pipeline(memory=None,
          steps=[('dtypes',
                  DataTypes_Auto_infer(categorical_features=['zip',
                                                             'merchant_id'],
                                       display_types=True, features_todrop=[],
                                       id_columns=[],
                                       ml_usecase='classification',
                                       numerical_features=['txn_during_night',
                                                           'trans_weekend'],
                                       target='is_fraud', time_features=[])),
                 ('imputer',
                  Simple_Imputer(categorical_strategy='not_available',
                                 fill_value_c...
                 ('dfs', 'passthrough'), ('pca', 'passthrough'),
                 ['trained_model',
                  DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None,
                                         crit

In [62]:
# Persist the finalized extra-trees pipeline under a dated name for later reuse.
save_model(final_et, model_name="Final ET Model 20211130")

Transformation Pipeline and Model Successfully Saved


(Pipeline(memory=None,
          steps=[('dtypes',
                  DataTypes_Auto_infer(categorical_features=['zip',
                                                             'merchant_id'],
                                       display_types=True, features_todrop=[],
                                       id_columns=[],
                                       ml_usecase='classification',
                                       numerical_features=['txn_during_night',
                                                           'trans_weekend'],
                                       target='is_fraud', time_features=[])),
                 ('imputer',
                  Simple_Imputer(categorical_strategy='not_available',
                                 fill_value_c...
                  ExtraTreesClassifier(bootstrap=True, ccp_alpha=0.0,
                                       class_weight={}, criterion='gini',
                                       max_depth=8, max_features='sqrt',
   

In [63]:
# Persist the finalized LightGBM pipeline under a dated name for later reuse.
save_model(final_lgbm, model_name="Final LGBM Model 20211130")

Transformation Pipeline and Model Successfully Saved


(Pipeline(memory=None,
          steps=[('dtypes',
                  DataTypes_Auto_infer(categorical_features=['zip',
                                                             'merchant_id'],
                                       display_types=True, features_todrop=[],
                                       id_columns=[],
                                       ml_usecase='classification',
                                       numerical_features=['txn_during_night',
                                                           'trans_weekend'],
                                       target='is_fraud', time_features=[])),
                 ('imputer',
                  Simple_Imputer(categorical_strategy='not_available',
                                 fill_value_c...
                                 colsample_bytree=1.0, feature_fraction=0.8,
                                 importance_type='split', learning_rate=0.15,
                                 max_depth=-1, min_child_samples=