# Import Libraries

In [77]:
import pandas as pd
 

from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from xgboost import XGBClassifier
from sklearn.naive_bayes import CategoricalNB, GaussianNB

from sklearn.metrics import classification_report, ConfusionMatrixDisplay, roc_curve, roc_auc_score, RocCurveDisplay
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV

from datetime import datetime
from tqdm import tqdm
import yaml
import joblib
import json
import copy
import hashlib

# Load the Datasets

In [78]:
pkl_folder = "/root/ml_process_feb23/data/processed/"

df_train = joblib.load(pkl_folder + "df_train.pkl")
x_train = df_train.drop(['card'], axis = 1)
y_train = df_train['card']

df_train_rus = joblib.load(pkl_folder + "df_train_rus.pkl")
x_train_rus = df_train_rus.drop(['card'], axis = 1)
y_train_rus = df_train_rus['card']

df_train_ros = joblib.load(pkl_folder + "df_train_ros.pkl")
x_train_ros = df_train_ros.drop(['card'], axis = 1)
y_train_ros = df_train_ros['card']

df_train_smote = joblib.load(pkl_folder + "df_train_smote.pkl")
x_train_smote = df_train_smote.drop(['card'], axis = 1)
y_train_smote = df_train_smote['card']

df_valid = joblib.load(pkl_folder + "df_valid.pkl")
x_valid = df_valid.drop(['card'], axis = 1)
y_valid = df_valid['card']

df_test = joblib.load(pkl_folder + "df_test.pkl")
x_test = df_test.drop(['card'], axis = 1)
y_test = df_test['card']

## Selecting features to be trained

In [79]:
x_test.columns.values

array(['index', 'reports', 'age', 'share', 'owner', 'selfemp',
       'dependents', 'majorcards', 'active', 'income_log',
       'expenditure_log', 'months_log', 'age_bin', 'reports_bin',
       'dependents_bin', 'active_bin'], dtype=object)

In [80]:
# Create instances for some binned features and its original value
bins = ['age_bin', 'reports_bin', 'dependents_bin', 'active_bin']
ori_value = ['age', 'reports', 'dependents', 'active']

For the **baseline model**, I will not include the binned features.

In [81]:
# Create the independent variables for baseline model
x_train_base = x_train.drop(bins, axis = 1)
x_train_rus_base = x_train_rus.drop(bins, axis = 1)
x_train_ros_base = x_train_ros.drop(bins, axis = 1)
x_train_smote_base = x_train_smote.drop(bins, axis = 1)
x_valid_base = x_valid.drop(bins, axis = 1)
x_test_base = x_test.drop(bins, axis = 1)


For the **alternative model**, I use the binned features.

In [82]:
# Create the independent variables for alternative model
x_train_bin = x_train.drop(ori_value, axis = 1)
x_train_rus_bin = x_train_rus.drop(ori_value, axis = 1)
x_train_ros_bin = x_train_ros.drop(ori_value, axis = 1)
x_train_smote_bin = x_train_smote.drop(ori_value, axis = 1)
x_valid_bin = x_valid.drop(ori_value, axis = 1)
x_test_bin = x_test.drop(ori_value, axis = 1)

# Create Log Template

In [83]:
def time_stamp():
    return datetime.now()

In [84]:
def create_log_template():
    logger = {
        "model_name" : [],
        "model_uid" : [],
        "training_time" : [],
        "training_date" : [],
        "performance" : [],
        "f1_score_avg" : [],
        "data_configurations" : [],
    }

    return logger

In [85]:
def training_log_updater(current_log, log_path):
    current_log = current_log.copy()

    try:
        with open(log_path, "r") as file:
            last_log = json.load(file)
        file.close()
    except FileNotFoundError as ffe:
        with open(log_path, "w") as file:
            file.write("[]")
        file.close()
        with open(log_path, "r") as file:
            last_log = json.load(file)
        file.close()
    
    last_log.append(current_log)

    with open(log_path, "w") as file:
        json.dump(last_log, file)
        file.close()

    return last_log

# Training and Evaluation

## Create Model Object

Create instance for each algorithm function

In [86]:
lgr_baseline        = LogisticRegression()      # Logistic regression
svm_baseline        = SVC()                     # Support Vector Machine 
dct_baseline        = DecisionTreeClassifier()  # Decision tree Classifier
rfc_baseline        = RandomForestClassifier()  # Random FOrest Classifier
knn_baseline        = KNeighborsClassifier()    # k-Nearest Neighbors CLassifier
xgb_baseline        = XGBClassifier()           # XG Boost Classifier
nb_cat_baseline     = CategoricalNB()           # Categorical Naive Bayes Classifier
nb_gauss_baseline   = GaussianNB()              # Gaussian Naive Bayes Classifier

In [87]:
list_of_model = {
    "imbalanced" : [
        { "model_name": lgr_baseline.__class__.__name__, "model_object": lgr_baseline, "model_uid": ""},
        { "model_name": svm_baseline.__class__.__name__, "model_object": dct_baseline, "model_uid": ""},
        { "model_name": dct_baseline.__class__.__name__, "model_object": dct_baseline, "model_uid": ""},
        { "model_name": rfc_baseline.__class__.__name__, "model_object": rfc_baseline, "model_uid": ""},
        { "model_name": knn_baseline.__class__.__name__, "model_object": knn_baseline, "model_uid": ""},
        { "model_name": xgb_baseline.__class__.__name__, "model_object": xgb_baseline, "model_uid": ""},
        { "model_name": nb_cat_baseline.__class__.__name__, "model_object": knn_baseline, "model_uid": ""},
        { "model_name": nb_gauss_baseline.__class__.__name__, "model_object": xgb_baseline, "model_uid": ""}
        ],
    "undersampling" : [
        { "model_name": lgr_baseline.__class__.__name__, "model_object": lgr_baseline, "model_uid": ""},
        { "model_name": svm_baseline.__class__.__name__, "model_object": dct_baseline, "model_uid": ""},
        { "model_name": dct_baseline.__class__.__name__, "model_object": dct_baseline, "model_uid": ""},
        { "model_name": rfc_baseline.__class__.__name__, "model_object": rfc_baseline, "model_uid": ""},
        { "model_name": knn_baseline.__class__.__name__, "model_object": knn_baseline, "model_uid": ""},
        { "model_name": xgb_baseline.__class__.__name__, "model_object": xgb_baseline, "model_uid": ""},
        { "model_name": nb_cat_baseline.__class__.__name__, "model_object": knn_baseline, "model_uid": ""},
        { "model_name": nb_gauss_baseline.__class__.__name__, "model_object": xgb_baseline, "model_uid": ""}
        ],
    "oversampling" : [
        { "model_name": lgr_baseline.__class__.__name__, "model_object": lgr_baseline, "model_uid": ""},
        { "model_name": svm_baseline.__class__.__name__, "model_object": dct_baseline, "model_uid": ""},
        { "model_name": dct_baseline.__class__.__name__, "model_object": dct_baseline, "model_uid": ""},
        { "model_name": rfc_baseline.__class__.__name__, "model_object": rfc_baseline, "model_uid": ""},
        { "model_name": knn_baseline.__class__.__name__, "model_object": knn_baseline, "model_uid": ""},
        { "model_name": xgb_baseline.__class__.__name__, "model_object": xgb_baseline, "model_uid": ""},
        { "model_name": nb_cat_baseline.__class__.__name__, "model_object": knn_baseline, "model_uid": ""},
        { "model_name": nb_gauss_baseline.__class__.__name__, "model_object": xgb_baseline, "model_uid": ""}
        ],
    "smote" : [
        { "model_name": lgr_baseline.__class__.__name__, "model_object": lgr_baseline, "model_uid": ""},
        { "model_name": svm_baseline.__class__.__name__, "model_object": dct_baseline, "model_uid": ""},
        { "model_name": dct_baseline.__class__.__name__, "model_object": dct_baseline, "model_uid": ""},
        { "model_name": rfc_baseline.__class__.__name__, "model_object": rfc_baseline, "model_uid": ""},
        { "model_name": knn_baseline.__class__.__name__, "model_object": knn_baseline, "model_uid": ""},
        { "model_name": xgb_baseline.__class__.__name__, "model_object": xgb_baseline, "model_uid": ""},
        { "model_name": nb_cat_baseline.__class__.__name__, "model_object": knn_baseline, "model_uid": ""},
        { "model_name": nb_gauss_baseline.__class__.__name__, "model_object": xgb_baseline, "model_uid": ""}
        ],
    }

In [88]:
def train_eval_model(list_of_model, prefix_model_name, x_train, y_train, data_configuration_name, x_valid, y_valid, log_path):

    list_of_model = copy.deepcopy(list_of_model)
    logger = create_log_template()

    for model in tqdm(list_of_model):    
        model_name = prefix_model_name + "-" + model["model_name"]

        start_time = time_stamp()
        model["model_object"].fit(x_train, y_train)
        finished_time = time_stamp()

        elapsed_time = finished_time - start_time
        elapsed_time = elapsed_time.total_seconds()

        y_pred = model["model_object"].predict(x_valid)
        performance = classification_report(y_valid, y_pred, output_dict = True)

        plain_id = str(start_time) + str(finished_time)
        chiper_id = hashlib.md5(plain_id.encode()).hexdigest()

        model["model_uid"] = chiper_id

        logger["model_name"].append(model_name)
        logger["model_uid"].append(chiper_id)
        logger["training_time"].append(elapsed_time)
        logger["training_date"].append(str(start_time))
        logger["performance"].append(performance)
        logger["f1_score_avg"].append(performance["macro avg"]["f1-score"])
        logger["data_configurations"].append(data_configuration_name)

    training_log = training_log_updater(logger, log_path)

    return training_log, list_of_model

In [89]:
def get_best_model(training_log_df, list_of_model):
    model_object = None

    best_model_info = training_log_df.sort_values(["f1_score_avg", "training_time"], ascending = [False, True]).iloc[0]
    
    for configuration_data in list_of_model:
        for model_data in list_of_model[configuration_data]:
            if model_data["model_uid"] == best_model_info["model_uid"]:
                model_object = model_data["model_object"]
                break
    
    if model_object == None:
        raise RuntimeError("The best model not found in your list of model.")
    
    return model_object
    

## Train the Baseline Models

### Original (imbalanced) data

In [90]:
training_log, list_of_model_imbal = train_eval_model(
    list_of_model["imbalanced"],
    "baseline_model",
    x_train_base,
    y_train,
    "imbalanced",
    x_valid_base,
    y_valid,
    "/root/ml_process_feb23/logs/training_log_baseline.json"
)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
100%|██████████| 8/8 [00:06<00:00,  1.16it/s]


In [91]:
list_of_model["imbalanced"] = copy.deepcopy(list_of_model_imbal)

### Balanced with undersampling

In [92]:
training_log, list_of_model_rus = train_eval_model(
    list_of_model["undersampling"],
    "baseline_model",
    x_train_rus_base,
    y_train_rus,
    "undersampling",
    x_valid_base,
    y_valid,
    "/root/ml_process_feb23/logs/training_log_baseline.json"
)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
100%|██████████| 8/8 [00:00<00:00,  8.08it/s]


In [93]:
list_of_model["undersampling"] = copy.deepcopy(list_of_model_rus)

### Balanced with oversampling

In [94]:
training_log, list_of_model_ros = train_eval_model(
    list_of_model["oversampling"],
    "baseline_model",
    x_train_ros_base,
    y_train_ros,
    "oversampling",
    x_valid_base,
    y_valid,
    "/root/ml_process_feb23/logs/training_log_baseline.json"
)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
100%|██████████| 8/8 [00:01<00:00,  7.32it/s]


In [95]:
list_of_model["oversampling"] = copy.deepcopy(list_of_model_ros)

### Balanced with SMOTE

In [96]:
training_log, list_of_model_smote = train_eval_model(
    list_of_model["smote"],
    "baseline_model",
    x_train_smote_base,
    y_train_smote,
    "smote",
    x_valid_base,
    y_valid,
    "/root/ml_process_feb23/logs/training_log_baseline.json"
)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
100%|██████████| 8/8 [00:04<00:00,  1.65it/s]


In [97]:
list_of_model["smote"] = copy.deepcopy(list_of_model_smote)

### Table of models performance

Now let's compare the performance from each algorithm and each dataset.

In [98]:
def training_log_to_df(training_log):
    training_res = pd.DataFrame()

    for log in tqdm(training_log):
        training_res = pd.concat([training_res, pd.DataFrame(log)])
    
    training_res.sort_values(["f1_score_avg", "training_time"], ascending = [False, True], inplace = True)
    training_res.reset_index(inplace = True, drop = True)
    
    return training_res

In [99]:
training_res_baseline = training_log_to_df(training_log)

100%|██████████| 4/4 [00:00<00:00, 90.56it/s]


In [100]:
training_res_baseline

Unnamed: 0,model_name,model_uid,training_time,training_date,performance,f1_score_avg,data_configurations
0,baseline_model-RandomForestClassifier,913f0952f08dffcbc9b598550abc6414,0.453041,2023-03-24 00:26:54.679508,"{'0': {'precision': 0.9130434782608695, 'recal...",0.97023,oversampling
1,baseline_model-RandomForestClassifier,e9b63742ce5bbf1eb579101f368060b4,0.61806,2023-03-24 00:26:50.619651,"{'0': {'precision': 0.9130434782608695, 'recal...",0.97023,imbalanced
2,baseline_model-GaussianNB,ba7545b378b78086f45e19006a7e93b4,0.620266,2023-03-24 00:27:00.352289,"{'0': {'precision': 0.8936170212765957, 'recal...",0.963076,smote
3,baseline_model-XGBClassifier,9ef9bcdb93ba00e35f639a6634e0f27d,3.322321,2023-03-24 00:26:56.966611,"{'0': {'precision': 0.8936170212765957, 'recal...",0.963076,smote
4,baseline_model-SVC,0a5beb2e12ef996954fa410d990c2519,0.020832,2023-03-24 00:26:56.328165,"{'0': {'precision': 0.9111111111111111, 'recal...",0.962492,smote
5,baseline_model-GaussianNB,73511a4f851f446955b0be26f6f9281a,0.150952,2023-03-24 00:26:51.975952,"{'0': {'precision': 0.9111111111111111, 'recal...",0.962492,imbalanced
6,baseline_model-XGBClassifier,69438bdba8e4bc3f4f97d29321c8340e,0.535993,2023-03-24 00:26:51.379244,"{'0': {'precision': 0.9111111111111111, 'recal...",0.962492,imbalanced
7,baseline_model-SVC,cbf63e66143deedc578d688474191b6c,0.009698,2023-03-24 00:26:52.887050,"{'0': {'precision': 0.875, 'recall': 1.0, 'f1-...",0.956028,undersampling
8,baseline_model-DecisionTreeClassifier,5a33281124e428d58c99d131495335d6,0.011169,2023-03-24 00:26:52.932434,"{'0': {'precision': 0.875, 'recall': 1.0, 'f1-...",0.956028,undersampling
9,baseline_model-GaussianNB,1e92af064d1c58224d67ae75d3f7c8ee,0.06494,2023-03-24 00:26:53.678280,"{'0': {'precision': 0.875, 'recall': 1.0, 'f1-...",0.956028,undersampling


In [102]:
model_baseline = get_best_model(training_res_baseline, list_of_model)
model_baseline

## Train the Alternative Models

The binned value features are used in the datasets for this model training process. 

### Original (imbalanced) data


In [103]:
training_log, list_of_model_imbal = train_eval_model(
    list_of_model["imbalanced"],
    "alternative_model",
    x_train_bin,
    y_train,
    "imbalanced",
    x_valid_bin,
    y_valid,
    "/root/ml_process_feb23/logs/training_log_alternative.json"
)
list_of_model["imbalanced"] = copy.deepcopy(list_of_model_imbal)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
100%|██████████| 8/8 [00:01<00:00,  7.77it/s]


In [104]:
list_of_model["imbalanced"] = copy.deepcopy(list_of_model_imbal)


### Balanced with undersampling


In [105]:
training_log, list_of_model_rus = train_eval_model(
    list_of_model["undersampling"],
    "alternative_model",
    x_train_rus_bin,
    y_train_rus,
    "undersampling",
    x_valid_bin,
    y_valid,
    "/root/ml_process_feb23/logs/training_log_alternative.json"
)
list_of_model["undersampling"] = copy.deepcopy(list_of_model_rus)


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
100%|██████████| 8/8 [00:00<00:00,  8.12it/s]


### Balanced with oversampling


In [106]:
training_log, list_of_model_ros = train_eval_model(
    list_of_model["oversampling"],
    "alternative_model",
    x_train_ros_bin,
    y_train_ros,
    "oversampling",
    x_valid_bin,
    y_valid,
    "/root/ml_process_feb23/logs/training_log_alternative.json"
)
list_of_model["oversampling"] = copy.deepcopy(list_of_model_ros)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
100%|██████████| 8/8 [00:07<00:00,  1.10it/s]


In [107]:
list_of_model["oversampling"] = copy.deepcopy(list_of_model_ros)


### Balanced with SMOTE


In [108]:
training_log, list_of_model_smote = train_eval_model(
    list_of_model["smote"],
    "alternative_model",
    x_train_smote_bin,
    y_train_smote,
    "smote",
    x_valid_bin,
    y_valid,
    "/root/ml_process_feb23/logs/training_log_alternative.json"
)
list_of_model["smote"] = copy.deepcopy(list_of_model_smote)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
100%|██████████| 8/8 [00:02<00:00,  3.57it/s]


In [109]:
list_of_model["smote"] = copy.deepcopy(list_of_model_smote)

### Table of models performance

Now let's compare the performance from each algorithm and each dataset.


In [110]:
def training_log_to_df(training_log):
    training_res = pd.DataFrame()

    for log in tqdm(training_log):
        training_res = pd.concat([training_res, pd.DataFrame(log)])
    
    training_res.sort_values(["f1_score_avg", "training_time"], ascending = [False, True], inplace = True)
    training_res.reset_index(inplace = True, drop = True)
    
    return training_res


In [111]:
training_res_alternative = training_log_to_df(training_log)


100%|██████████| 4/4 [00:00<00:00, 265.01it/s]


In [112]:
training_res_alternative

Unnamed: 0,model_name,model_uid,training_time,training_date,performance,f1_score_avg,data_configurations
0,alternative_model-RandomForestClassifier,4e4aa5af81ba18d8f83b9e4f25d0f9ca,0.455028,2023-03-24 00:27:03.655299,"{'0': {'precision': 0.8936170212765957, 'recal...",0.963076,imbalanced
1,alternative_model-DecisionTreeClassifier,efa556c2ae0e5710d469a57e46029aef,0.004896,2023-03-24 00:27:05.482673,"{'0': {'precision': 0.875, 'recall': 1.0, 'f1-...",0.956028,undersampling
2,alternative_model-SVC,3129a3e879f91c5bccf3608452cc053a,0.024354,2023-03-24 00:27:05.433911,"{'0': {'precision': 0.875, 'recall': 1.0, 'f1-...",0.956028,undersampling
3,alternative_model-GaussianNB,c762e800447e75df2b343529db23cf0a,0.073288,2023-03-24 00:27:06.198597,"{'0': {'precision': 0.875, 'recall': 1.0, 'f1-...",0.956028,undersampling
4,alternative_model-XGBClassifier,1b100ed4470cd6f6fa84eaf89463d81d,0.079543,2023-03-24 00:27:06.070306,"{'0': {'precision': 0.875, 'recall': 1.0, 'f1-...",0.956028,undersampling
5,alternative_model-LogisticRegression,b07f4e383908aad814ce5334cc3c6989,0.083703,2023-03-24 00:27:14.347752,"{'0': {'precision': 0.875, 'recall': 1.0, 'f1-...",0.956028,smote
6,alternative_model-LogisticRegression,e761e6c21e30f80e9d9f3112e2ea9d1a,0.09195,2023-03-24 00:27:06.584063,"{'0': {'precision': 0.875, 'recall': 1.0, 'f1-...",0.956028,oversampling
7,alternative_model-LogisticRegression,1025d8e6facf7183e7b5f8b050ade582,0.111484,2023-03-24 00:27:03.458959,"{'0': {'precision': 0.875, 'recall': 1.0, 'f1-...",0.956028,imbalanced
8,alternative_model-RandomForestClassifier,9c222805dfb1cb11f1a47c1942e2501f,0.478671,2023-03-24 00:27:05.525484,"{'0': {'precision': 0.875, 'recall': 1.0, 'f1-...",0.956028,undersampling
9,alternative_model-RandomForestClassifier,b254740758278d6889336702d16249b8,0.923518,2023-03-24 00:27:14.614175,"{'0': {'precision': 0.875, 'recall': 1.0, 'f1-...",0.956028,smote


In [113]:
model_alternative = get_best_model(training_res_alternative, list_of_model)
model_alternative

In [114]:
training_res_baseline.head(1).T

Unnamed: 0,0
model_name,baseline_model-RandomForestClassifier
model_uid,913f0952f08dffcbc9b598550abc6414
training_time,0.453041
training_date,2023-03-24 00:26:54.679508
performance,"{'0': {'precision': 0.9130434782608695, 'recal..."
f1_score_avg,0.97023
data_configurations,oversampling


In [115]:
training_res_alternative.head(1).T


Unnamed: 0,0
model_name,alternative_model-RandomForestClassifier
model_uid,4e4aa5af81ba18d8f83b9e4f25d0f9ca
training_time,0.455028
training_date,2023-03-24 00:27:03.655299
performance,"{'0': {'precision': 0.8936170212765957, 'recal..."
f1_score_avg,0.963076
data_configurations,imbalanced
