# Import Libraries

In [31]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from xgboost import XGBClassifier
from sklearn.naive_bayes import CategoricalNB, GaussianNB

from sklearn.metrics import classification_report, ConfusionMatrixDisplay, roc_curve, roc_auc_score, RocCurveDisplay
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV

from datetime import datetime
from tqdm import tqdm
import yaml
import joblib
import json
import copy
import hashlib

# Load the Datasets

In [32]:
pkl_folder = "/root/ml_process_feb23/data/processed/"

df_train = joblib.load(pkl_folder + "df_train.pkl")
x_train = df_train.drop(['card'], axis = 1)
y_train = df_train['card']

df_train_rus = joblib.load(pkl_folder + "df_train_rus.pkl")
x_train_rus = df_train_rus.drop(['card'], axis = 1)
y_train_rus = df_train_rus['card']

df_train_ros = joblib.load(pkl_folder + "df_train_ros.pkl")
x_train_ros = df_train_ros.drop(['card'], axis = 1)
y_train_ros = df_train_ros['card']

df_train_smote = joblib.load(pkl_folder + "df_train_smote.pkl")
x_train_smote = df_train_smote.drop(['card'], axis = 1)
y_train_smote = df_train_smote['card']

df_valid = joblib.load(pkl_folder + "df_valid.pkl")
x_valid = df_valid.drop(['card'], axis = 1)
y_valid = df_valid['card']

df_test = joblib.load(pkl_folder + "df_test.pkl")
x_test = df_test.drop(['card'], axis = 1)
y_test = df_test['card']

## Selecting features to be trained

In [33]:
x_test.columns.values

array(['index', 'reports', 'age', 'share', 'owner', 'selfemp',
       'dependents', 'majorcards', 'active', 'income_log',
       'expenditure_log', 'months_log', 'age_bin', 'reports_bin',
       'dependents_bin', 'active_bin'], dtype=object)

In [34]:
# Create instances for some binned features and its original value
bins = ['age_bin', 'reports_bin', 'dependents_bin', 'active_bin']
ori_value = ['age', 'reports', 'dependents', 'active']

For the **baseline model**, I will not include the binned feature.

In [35]:
# Create the independent variables for baseline model
x_train_base = x_train.drop(bins, axis = 1)
x_train_rus_base = x_train_rus.drop(bins, axis = 1)
x_train_ros_base = x_train_ros.drop(bins, axis = 1)
x_train_smote_base = x_train_smote.drop(bins, axis = 1)
x_valid_base = x_valid.drop(bins, axis = 1)
x_test_base = x_test.drop(bins, axis = 1)


In [63]:
# Create the independent variables for alternative model
x_train_bin = x_train.drop(ori_value, axis = 1)
x_train_rus_bin = x_train_rus.drop(ori_value, axis = 1)
x_train_ros_bin = x_train_ros.drop(ori_value, axis = 1)
x_train_smote_bin = x_train_smote.drop(ori_value, axis = 1)
x_valid_bin = x_valid.drop(ori_value, axis = 1)
x_test_bin = x_test.drop(ori_value, axis = 1)

# Create Log Template

In [37]:
def time_stamp():
    return datetime.now()

In [38]:
def create_log_template():
    logger = {
        "model_name" : [],
        "model_uid" : [],
        "training_time" : [],
        "training_date" : [],
        "performance" : [],
        "f1_score_avg" : [],
        "data_configurations" : [],
    }

    return logger

In [39]:
def training_log_updater(current_log, log_path):
    current_log = current_log.copy()

    try:
        with open(log_path, "r") as file:
            last_log = json.load(file)
        file.close()
    except FileNotFoundError as ffe:
        with open(log_path, "w") as file:
            file.write("[]")
        file.close()
        with open(log_path, "r") as file:
            last_log = json.load(file)
        file.close()
    
    last_log.append(current_log)

    with open(log_path, "w") as file:
        json.dump(last_log, file)
        file.close()

    return last_log

# Training and Evaluation

## Create Model Object

Create instance for each algorithm function

In [40]:
lgr_baseline        = LogisticRegression()      # Logistic regression
svm_baseline        = SVC()                     # Support Vector Machine 
dct_baseline        = DecisionTreeClassifier()  # Decision tree Classifier
rfc_baseline        = RandomForestClassifier()  # Random FOrest Classifier
knn_baseline        = KNeighborsClassifier()    # k-Nearest Neighbors CLassifier
xgb_baseline        = XGBClassifier()           # XG Boost Classifier
nb_cat_baseline     = CategoricalNB()           # Categorical Naive Bayes Classifier
nb_gauss_baseline   = GaussianNB()              # Gaussian Naive Bayes Classifier

In [41]:
list_of_model = {
    "imbalanced" : [
        { "model_name": lgr_baseline.__class__.__name__, "model_object": lgr_baseline, "model_uid": ""},
        { "model_name": svm_baseline.__class__.__name__, "model_object": dct_baseline, "model_uid": ""},
        { "model_name": dct_baseline.__class__.__name__, "model_object": dct_baseline, "model_uid": ""},
        { "model_name": rfc_baseline.__class__.__name__, "model_object": rfc_baseline, "model_uid": ""},
        { "model_name": knn_baseline.__class__.__name__, "model_object": knn_baseline, "model_uid": ""},
        { "model_name": xgb_baseline.__class__.__name__, "model_object": xgb_baseline, "model_uid": ""},
        { "model_name": nb_cat_baseline.__class__.__name__, "model_object": knn_baseline, "model_uid": ""},
        { "model_name": nb_gauss_baseline.__class__.__name__, "model_object": xgb_baseline, "model_uid": ""}
        ],
    "undersampling" : [
        { "model_name": lgr_baseline.__class__.__name__, "model_object": lgr_baseline, "model_uid": ""},
        { "model_name": svm_baseline.__class__.__name__, "model_object": dct_baseline, "model_uid": ""},
        { "model_name": dct_baseline.__class__.__name__, "model_object": dct_baseline, "model_uid": ""},
        { "model_name": rfc_baseline.__class__.__name__, "model_object": rfc_baseline, "model_uid": ""},
        { "model_name": knn_baseline.__class__.__name__, "model_object": knn_baseline, "model_uid": ""},
        { "model_name": xgb_baseline.__class__.__name__, "model_object": xgb_baseline, "model_uid": ""},
        { "model_name": nb_cat_baseline.__class__.__name__, "model_object": knn_baseline, "model_uid": ""},
        { "model_name": nb_gauss_baseline.__class__.__name__, "model_object": xgb_baseline, "model_uid": ""}
        ],
    "oversampling" : [
        { "model_name": lgr_baseline.__class__.__name__, "model_object": lgr_baseline, "model_uid": ""},
        { "model_name": svm_baseline.__class__.__name__, "model_object": dct_baseline, "model_uid": ""},
        { "model_name": dct_baseline.__class__.__name__, "model_object": dct_baseline, "model_uid": ""},
        { "model_name": rfc_baseline.__class__.__name__, "model_object": rfc_baseline, "model_uid": ""},
        { "model_name": knn_baseline.__class__.__name__, "model_object": knn_baseline, "model_uid": ""},
        { "model_name": xgb_baseline.__class__.__name__, "model_object": xgb_baseline, "model_uid": ""},
        { "model_name": nb_cat_baseline.__class__.__name__, "model_object": knn_baseline, "model_uid": ""},
        { "model_name": nb_gauss_baseline.__class__.__name__, "model_object": xgb_baseline, "model_uid": ""}
        ],
    "smote" : [
        { "model_name": lgr_baseline.__class__.__name__, "model_object": lgr_baseline, "model_uid": ""},
        { "model_name": svm_baseline.__class__.__name__, "model_object": dct_baseline, "model_uid": ""},
        { "model_name": dct_baseline.__class__.__name__, "model_object": dct_baseline, "model_uid": ""},
        { "model_name": rfc_baseline.__class__.__name__, "model_object": rfc_baseline, "model_uid": ""},
        { "model_name": knn_baseline.__class__.__name__, "model_object": knn_baseline, "model_uid": ""},
        { "model_name": xgb_baseline.__class__.__name__, "model_object": xgb_baseline, "model_uid": ""},
        { "model_name": nb_cat_baseline.__class__.__name__, "model_object": knn_baseline, "model_uid": ""},
        { "model_name": nb_gauss_baseline.__class__.__name__, "model_object": xgb_baseline, "model_uid": ""}
        ],
    }

In [42]:
def train_eval_model(list_of_model, prefix_model_name, x_train, y_train, data_configuration_name, x_valid, y_valid, log_path):

    list_of_model = copy.deepcopy(list_of_model)
    logger = create_log_template()

    for model in tqdm(list_of_model):    
        model_name = prefix_model_name + "-" + model["model_name"]

        start_time = time_stamp()
        model["model_object"].fit(x_train, y_train)
        finished_time = time_stamp()

        elapsed_time = finished_time - start_time
        elapsed_time = elapsed_time.total_seconds()

        y_pred = model["model_object"].predict(x_valid)
        performance = classification_report(y_valid, y_pred, output_dict = True)

        plain_id = str(start_time) + str(finished_time)
        chiper_id = hashlib.md5(plain_id.encode()).hexdigest()

        model["model_uid"] = chiper_id

        logger["model_name"].append(model_name)
        logger["model_uid"].append(chiper_id)
        logger["training_time"].append(elapsed_time)
        logger["training_date"].append(str(start_time))
        logger["performance"].append(performance)
        logger["f1_score_avg"].append(performance["macro avg"]["f1-score"])
        logger["data_configurations"].append(data_configuration_name)

    training_log = training_log_updater(logger, log_path)

    return training_log, list_of_model

## Train the Baseline Models

### Original (imbalanced) data

In [43]:
training_log, list_of_model_imbal = train_eval_model(
    list_of_model["imbalanced"],
    "baseline_model",
    x_train_base,
    y_train,
    "imbalanced",
    x_valid_base,
    y_valid,
    "/root/ml_process_feb23/docs/training_log_baseline.json"
)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
100%|██████████| 8/8 [00:03<00:00,  2.47it/s]


In [44]:
list_of_model["imbalanced"] = copy.deepcopy(list_of_model_imbal)

### Balanced with undersampling

In [45]:
training_log, list_of_model_rus = train_eval_model(
    list_of_model["undersampling"],
    "baseline_model",
    x_train_rus_base,
    y_train_rus,
    "undersampling",
    x_valid_base,
    y_valid,
    "/root/ml_process_feb23/docs/training_log_baseline.json"
)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
100%|██████████| 8/8 [00:08<00:00,  1.02s/it]


In [46]:
list_of_model["undersampling"] = copy.deepcopy(list_of_model_rus)

### Balanced with oversampling

In [47]:
training_log, list_of_model_ros = train_eval_model(
    list_of_model["oversampling"],
    "baseline_model",
    x_train_ros_base,
    y_train_ros,
    "oversampling",
    x_valid_base,
    y_valid,
    "/root/ml_process_feb23/docs/training_log_baseline.json"
)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
100%|██████████| 8/8 [00:03<00:00,  2.44it/s]


In [48]:
list_of_model["oversampling"] = copy.deepcopy(list_of_model_ros)

### Balanced with SMOTE

In [49]:
training_log, list_of_model_smote = train_eval_model(
    list_of_model["smote"],
    "baseline_model",
    x_train_smote_base,
    y_train_smote,
    "smote",
    x_valid_base,
    y_valid,
    "/root/ml_process_feb23/docs/training_log_baseline.json"
)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
100%|██████████| 8/8 [00:07<00:00,  1.06it/s]


In [50]:
list_of_model["smote"] = copy.deepcopy(list_of_model_smote)

### Table of models performance

Now let's compare the performance from each algorithm and each dataset.

In [51]:
def training_log_to_df(training_log):
    training_res = pd.DataFrame()

    for log in tqdm(training_log):
        training_res = pd.concat([training_res, pd.DataFrame(log)])
    
    training_res.sort_values(["f1_score_avg", "training_time"], ascending = [False, True], inplace = True)
    training_res.reset_index(inplace = True, drop = True)
    
    return training_res

In [55]:
training_res_baseline = training_log_to_df(training_log)

100%|██████████| 8/8 [00:04<00:00,  1.78it/s]


In [56]:
training_res_baseline

Unnamed: 0,model_name,model_uid,training_time,training_date,performance,f1_score_avg,data_configurations
0,baseline_model-RandomForestClassifier,d2024bbbeffaa419c81f9cce722e6aa7,0.658245,2023-03-17 15:21:15.874700,"{'0': {'precision': 0.9130434782608695, 'recal...",0.970230,imbalanced
1,baseline_model-RandomForestClassifier,5372f2a6b2001d82bf17afe009456103,1.436958,2023-03-17 15:24:04.385977,"{'0': {'precision': 0.9130434782608695, 'recal...",0.970230,oversampling
2,baseline_model-RandomForestClassifier,cdd88d4634a95305eec9cc865bc174bd,0.829285,2023-03-17 15:25:28.425261,"{'0': {'precision': 0.8936170212765957, 'recal...",0.963076,oversampling
3,baseline_model-GaussianNB,7de58a7cbec49af09fadb50758818fa3,0.972729,2023-03-17 15:25:40.694870,"{'0': {'precision': 0.8936170212765957, 'recal...",0.963076,smote
4,baseline_model-XGBClassifier,bbfe306e6e4a721e0fc616c1c14cff96,1.932120,2023-03-17 15:25:38.627883,"{'0': {'precision': 0.8936170212765957, 'recal...",0.963076,smote
...,...,...,...,...,...,...,...
59,baseline_model-CategoricalNB,f3773044516f5f642b942dd6a8cbd8f4,0.005277,2023-03-17 15:25:17.701648,"{'0': {'precision': 0.16666666666666666, 'reca...",0.458421,imbalanced
60,baseline_model-KNeighborsClassifier,4876a41354a278c3f7b704fe9aa085a4,0.006233,2023-03-17 15:25:16.933651,"{'0': {'precision': 0.16666666666666666, 'reca...",0.458421,imbalanced
61,baseline_model-CategoricalNB,48a85f1961b7399ce532498b5f9ea90d,0.009596,2023-03-17 15:21:16.866303,"{'0': {'precision': 0.16666666666666666, 'reca...",0.458421,imbalanced
62,baseline_model-KNeighborsClassifier,99265de149bb849c206b1b5a97a7f9b7,0.011212,2023-03-17 15:21:16.604639,"{'0': {'precision': 0.16666666666666666, 'reca...",0.458421,imbalanced


## Train the Alternative Models

The binned value features are used in the datasets for this model training process. 

### Original (imbalanced) data


In [69]:
training_log, list_of_model_imbal = train_eval_model(
    list_of_model["imbalanced"],
    "alternative_model",
    x_train_bin,
    y_train,
    "imbalanced",
    x_valid_bin,
    y_valid,
    "/root/ml_process_feb23/docs/training_log_alternative.json"
)
list_of_model["imbalanced"] = copy.deepcopy(list_of_model_imbal)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
100%|██████████| 8/8 [00:08<00:00,  1.01s/it]


In [70]:
list_of_model["imbalanced"] = copy.deepcopy(list_of_model_imbal)


### Balanced with undersampling


In [71]:
training_log, list_of_model_rus = train_eval_model(
    list_of_model["undersampling"],
    "alternative_model",
    x_train_rus_bin,
    y_train_rus,
    "undersampling",
    x_valid_bin,
    y_valid,
    "/root/ml_process_feb23/docs/training_log_alternative.json"
)
list_of_model["undersampling"] = copy.deepcopy(list_of_model_rus)


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
100%|██████████| 8/8 [00:02<00:00,  3.84it/s]


### Balanced with oversampling


In [72]:
training_log, list_of_model_ros = train_eval_model(
    list_of_model["oversampling"],
    "alternative_model",
    x_train_ros_bin,
    y_train_ros,
    "oversampling",
    x_valid_bin,
    y_valid,
    "/root/ml_process_feb23/docs/training_log_alternative.json"
)
list_of_model["oversampling"] = copy.deepcopy(list_of_model_ros)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
100%|██████████| 8/8 [00:01<00:00,  6.74it/s]


In [73]:
list_of_model["oversampling"] = copy.deepcopy(list_of_model_ros)


### Balanced with SMOTE


In [74]:
training_log, list_of_model_smote = train_eval_model(
    list_of_model["smote"],
    "alternative_model",
    x_train_smote_bin,
    y_train_smote,
    "smote",
    x_valid_bin,
    y_valid,
    "/root/ml_process_feb23/docs/training_log_alternative.json"
)
list_of_model["smote"] = copy.deepcopy(list_of_model_smote)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
100%|██████████| 8/8 [00:01<00:00,  5.41it/s]


In [75]:
list_of_model["smote"] = copy.deepcopy(list_of_model_smote)

### Table of models performance

Now let's compare the performance from each algorithm and each dataset.


In [76]:
def training_log_to_df(training_log):
    training_res = pd.DataFrame()

    for log in tqdm(training_log):
        training_res = pd.concat([training_res, pd.DataFrame(log)])
    
    training_res.sort_values(["f1_score_avg", "training_time"], ascending = [False, True], inplace = True)
    training_res.reset_index(inplace = True, drop = True)
    
    return training_res


In [77]:
training_res_alternative = training_log_to_df(training_log)


100%|██████████| 8/8 [00:00<00:00, 282.21it/s]


In [78]:
training_res_alternative

Unnamed: 0,model_name,model_uid,training_time,training_date,performance,f1_score_avg,data_configurations
0,alternative_model-RandomForestClassifier,f630c3a65d06800df553e0ab4c66be13,0.422317,2023-03-17 17:15:33.920571,"{'0': {'precision': 0.8936170212765957, 'recal...",0.963076,oversampling
1,alternative_model-RandomForestClassifier,3f74843f23a8143bc3de556ceddf06d7,0.780525,2023-03-17 17:15:20.401430,"{'0': {'precision': 0.8936170212765957, 'recal...",0.963076,imbalanced
2,baseline_model-SVC,4f375bd57d0d63469f9e6cb60a01341e,0.005673,2023-03-17 17:11:44.744757,"{'0': {'precision': 0.875, 'recall': 1.0, 'f1-...",0.956028,undersampling
3,baseline_model-DecisionTreeClassifier,8c8ff8cad3f681e6fb97a436f68956c1,0.007013,2023-03-17 17:11:44.781818,"{'0': {'precision': 0.875, 'recall': 1.0, 'f1-...",0.956028,undersampling
4,alternative_model-SVC,81cca94def5b0f6631f5eecc0d05a36d,0.008862,2023-03-17 17:15:31.216044,"{'0': {'precision': 0.875, 'recall': 1.0, 'f1-...",0.956028,undersampling
...,...,...,...,...,...,...,...
59,baseline_model-KNeighborsClassifier,3cd139d9c945cbdbb0975807e7583feb,0.444625,2023-03-17 17:11:33.112479,"{'0': {'precision': 1.0, 'recall': 0.142857142...",0.569444,imbalanced
60,baseline_model-CategoricalNB,01a704b8ce0f3fc85c3fb4a0eef51bd4,0.004445,2023-03-17 17:11:46.500107,"{'0': {'precision': 0.25287356321839083, 'reca...",0.495646,undersampling
61,alternative_model-CategoricalNB,d24d9dceec958d22af95fab2c4e1e6cd,0.004584,2023-03-17 17:15:32.351095,"{'0': {'precision': 0.25287356321839083, 'reca...",0.495646,undersampling
62,alternative_model-KNeighborsClassifier,2f6e2ea33f32923a76682b09c8e4f260,0.005389,2023-03-17 17:15:31.960264,"{'0': {'precision': 0.25287356321839083, 'reca...",0.495646,undersampling
