# Import Libraries

In [51]:
import pandas as pd
 

from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from xgboost import XGBClassifier
from sklearn.naive_bayes import CategoricalNB, GaussianNB

from sklearn.metrics import classification_report, ConfusionMatrixDisplay, roc_curve, roc_auc_score, RocCurveDisplay
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV

from datetime import datetime
from tqdm import tqdm
import yaml
import joblib
import json
import copy
import hashlib

# Load the Datasets

In [52]:
pkl_folder = "/root/ml_process_feb23/data/processed/"

df_train = joblib.load(pkl_folder + "df_train.pkl")
x_train = df_train.drop(['card'], axis = 1)
y_train = df_train['card']

df_train_rus = joblib.load(pkl_folder + "df_train_rus.pkl")
x_train_rus = df_train_rus.drop(['card'], axis = 1)
y_train_rus = df_train_rus['card']

df_train_ros = joblib.load(pkl_folder + "df_train_ros.pkl")
x_train_ros = df_train_ros.drop(['card'], axis = 1)
y_train_ros = df_train_ros['card']

df_train_smote = joblib.load(pkl_folder + "df_train_smote.pkl")
x_train_smote = df_train_smote.drop(['card'], axis = 1)
y_train_smote = df_train_smote['card']

df_valid = joblib.load(pkl_folder + "df_valid.pkl")
x_valid = df_valid.drop(['card'], axis = 1)
y_valid = df_valid['card']

df_test = joblib.load(pkl_folder + "df_test.pkl")
x_test = df_test.drop(['card'], axis = 1)
y_test = df_test['card']

## Selecting features to be trained

In [53]:
x_test.columns.values

array(['index', 'reports', 'age', 'share', 'owner', 'selfemp',
       'dependents', 'majorcards', 'active', 'income_log',
       'expenditure_log', 'months_log', 'age_bin', 'reports_bin',
       'dependents_bin', 'active_bin'], dtype=object)

In [54]:
# Create instances for some binned features and its original value
bins = ['age_bin', 'reports_bin', 'dependents_bin', 'active_bin']
ori_value = ['age', 'reports', 'dependents', 'active']

For the **baseline model**, I will not include the binned features.

In [55]:
# Create the independent variables for baseline model
x_train_base = x_train.drop(bins, axis = 1)
x_train_rus_base = x_train_rus.drop(bins, axis = 1)
x_train_ros_base = x_train_ros.drop(bins, axis = 1)
x_train_smote_base = x_train_smote.drop(bins, axis = 1)
x_valid_base = x_valid.drop(bins, axis = 1)
x_test_base = x_test.drop(bins, axis = 1)


For the **alternative model**, I use the binned features.

In [56]:
# Create the independent variables for alternative model
x_train_bin = x_train.drop(ori_value, axis = 1)
x_train_rus_bin = x_train_rus.drop(ori_value, axis = 1)
x_train_ros_bin = x_train_ros.drop(ori_value, axis = 1)
x_train_smote_bin = x_train_smote.drop(ori_value, axis = 1)
x_valid_bin = x_valid.drop(ori_value, axis = 1)
x_test_bin = x_test.drop(ori_value, axis = 1)

# Create Log Template

In [57]:
def time_stamp():
    return datetime.now()

In [58]:
def create_log_template():
    logger = {
        "model_name" : [],
        "model_uid" : [],
        "training_time" : [],
        "training_date" : [],
        "performance" : [],
        "f1_score_avg" : [],
        "data_configurations" : [],
    }

    return logger

In [59]:
def training_log_updater(current_log, log_path):
    current_log = current_log.copy()

    try:
        with open(log_path, "r") as file:
            last_log = json.load(file)
        file.close()
    except FileNotFoundError as ffe:
        with open(log_path, "w") as file:
            file.write("[]")
        file.close()
        with open(log_path, "r") as file:
            last_log = json.load(file)
        file.close()
    
    last_log.append(current_log)

    with open(log_path, "w") as file:
        json.dump(last_log, file)
        file.close()

    return last_log

# Training and Evaluation

## Create Model Object

Create instance for each algorithm function

In [60]:
lgr_baseline        = LogisticRegression()      # Logistic regression
svm_baseline        = SVC()                     # Support Vector Machine 
dct_baseline        = DecisionTreeClassifier()  # Decision tree Classifier
rfc_baseline        = RandomForestClassifier()  # Random Forest Classifier
knn_baseline        = KNeighborsClassifier()    # k-Nearest Neighbors CLassifier
xgb_baseline        = XGBClassifier()           # XG Boost Classifier
nb_cat_baseline     = CategoricalNB()           # Categorical Naive Bayes Classifier
nb_gauss_baseline   = GaussianNB()              # Gaussian Naive Bayes Classifier

In [61]:
list_of_model = {
    "imbalanced" : [
        { "model_name": lgr_baseline.__class__.__name__, "model_object": lgr_baseline, "model_uid": ""},
        { "model_name": svm_baseline.__class__.__name__, "model_object": dct_baseline, "model_uid": ""},
        { "model_name": dct_baseline.__class__.__name__, "model_object": dct_baseline, "model_uid": ""},
        { "model_name": rfc_baseline.__class__.__name__, "model_object": rfc_baseline, "model_uid": ""},
        { "model_name": knn_baseline.__class__.__name__, "model_object": knn_baseline, "model_uid": ""},
        { "model_name": xgb_baseline.__class__.__name__, "model_object": xgb_baseline, "model_uid": ""},
        { "model_name": nb_cat_baseline.__class__.__name__, "model_object": knn_baseline, "model_uid": ""},
        { "model_name": nb_gauss_baseline.__class__.__name__, "model_object": xgb_baseline, "model_uid": ""}
        ],
    "undersampling" : [
        { "model_name": lgr_baseline.__class__.__name__, "model_object": lgr_baseline, "model_uid": ""},
        { "model_name": svm_baseline.__class__.__name__, "model_object": dct_baseline, "model_uid": ""},
        { "model_name": dct_baseline.__class__.__name__, "model_object": dct_baseline, "model_uid": ""},
        { "model_name": rfc_baseline.__class__.__name__, "model_object": rfc_baseline, "model_uid": ""},
        { "model_name": knn_baseline.__class__.__name__, "model_object": knn_baseline, "model_uid": ""},
        { "model_name": xgb_baseline.__class__.__name__, "model_object": xgb_baseline, "model_uid": ""},
        { "model_name": nb_cat_baseline.__class__.__name__, "model_object": knn_baseline, "model_uid": ""},
        { "model_name": nb_gauss_baseline.__class__.__name__, "model_object": xgb_baseline, "model_uid": ""}
        ],
    "oversampling" : [
        { "model_name": lgr_baseline.__class__.__name__, "model_object": lgr_baseline, "model_uid": ""},
        { "model_name": svm_baseline.__class__.__name__, "model_object": dct_baseline, "model_uid": ""},
        { "model_name": dct_baseline.__class__.__name__, "model_object": dct_baseline, "model_uid": ""},
        { "model_name": rfc_baseline.__class__.__name__, "model_object": rfc_baseline, "model_uid": ""},
        { "model_name": knn_baseline.__class__.__name__, "model_object": knn_baseline, "model_uid": ""},
        { "model_name": xgb_baseline.__class__.__name__, "model_object": xgb_baseline, "model_uid": ""},
        { "model_name": nb_cat_baseline.__class__.__name__, "model_object": knn_baseline, "model_uid": ""},
        { "model_name": nb_gauss_baseline.__class__.__name__, "model_object": xgb_baseline, "model_uid": ""}
        ],
    "smote" : [
        { "model_name": lgr_baseline.__class__.__name__, "model_object": lgr_baseline, "model_uid": ""},
        { "model_name": svm_baseline.__class__.__name__, "model_object": dct_baseline, "model_uid": ""},
        { "model_name": dct_baseline.__class__.__name__, "model_object": dct_baseline, "model_uid": ""},
        { "model_name": rfc_baseline.__class__.__name__, "model_object": rfc_baseline, "model_uid": ""},
        { "model_name": knn_baseline.__class__.__name__, "model_object": knn_baseline, "model_uid": ""},
        { "model_name": xgb_baseline.__class__.__name__, "model_object": xgb_baseline, "model_uid": ""},
        { "model_name": nb_cat_baseline.__class__.__name__, "model_object": knn_baseline, "model_uid": ""},
        { "model_name": nb_gauss_baseline.__class__.__name__, "model_object": xgb_baseline, "model_uid": ""}
        ],
    }

In [62]:
def train_eval_model(list_of_model, prefix_model_name, x_train, y_train, data_configuration_name, x_valid, y_valid, log_path):

    list_of_model = copy.deepcopy(list_of_model)
    logger = create_log_template()

    for model in tqdm(list_of_model):    
        model_name = prefix_model_name + "-" + model["model_name"]

        start_time = time_stamp()
        model["model_object"].fit(x_train, y_train)
        finished_time = time_stamp()

        elapsed_time = finished_time - start_time
        elapsed_time = elapsed_time.total_seconds()

        y_pred = model["model_object"].predict(x_valid)
        performance = classification_report(y_valid, y_pred, output_dict = True)

        plain_id = str(start_time) + str(finished_time)
        chiper_id = hashlib.md5(plain_id.encode()).hexdigest()

        model["model_uid"] = chiper_id

        logger["model_name"].append(model_name)
        logger["model_uid"].append(chiper_id)
        logger["training_time"].append(elapsed_time)
        logger["training_date"].append(str(start_time))
        logger["performance"].append(performance)
        logger["f1_score_avg"].append(performance["macro avg"]["f1-score"])
        logger["data_configurations"].append(data_configuration_name)

    training_log = training_log_updater(logger, log_path)

    return training_log, list_of_model

In [63]:
def get_best_model(training_log_df, list_of_model):
    model_object = None

    best_model_info = training_log_df.sort_values(["f1_score_avg", "training_time"], ascending = [False, True]).iloc[0]
    
    for configuration_data in list_of_model:
        for model_data in list_of_model[configuration_data]:
            if model_data["model_uid"] == best_model_info["model_uid"]:
                model_object = model_data["model_object"]
                break
    
    if model_object == None:
        raise RuntimeError("The best model not found in your list of model.")
    
    return model_object
    

## Train and Evaluate the Baseline Models

### Original (imbalanced) data

In [64]:
training_log, list_of_model_imbal = train_eval_model(
    list_of_model["imbalanced"],
    "baseline_model",
    x_train_base,
    y_train,
    "imbalanced",
    x_valid_base,
    y_valid,
    "/root/ml_process_feb23/logs/training_log_baseline.json"
)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
100%|██████████| 8/8 [00:09<00:00,  1.19s/it]


In [65]:
list_of_model["imbalanced"] = copy.deepcopy(list_of_model_imbal)

### Balanced with undersampling

In [66]:
training_log, list_of_model_rus = train_eval_model(
    list_of_model["undersampling"],
    "baseline_model",
    x_train_rus_base,
    y_train_rus,
    "undersampling",
    x_valid_base,
    y_valid,
    "/root/ml_process_feb23/logs/training_log_baseline.json"
)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
100%|██████████| 8/8 [00:00<00:00,  8.06it/s]


In [67]:
list_of_model["undersampling"] = copy.deepcopy(list_of_model_rus)

### Balanced with oversampling

In [68]:
training_log, list_of_model_ros = train_eval_model(
    list_of_model["oversampling"],
    "baseline_model",
    x_train_ros_base,
    y_train_ros,
    "oversampling",
    x_valid_base,
    y_valid,
    "/root/ml_process_feb23/logs/training_log_baseline.json"
)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
100%|██████████| 8/8 [00:03<00:00,  2.05it/s]


In [69]:
list_of_model["oversampling"] = copy.deepcopy(list_of_model_ros)

### Balanced with SMOTE

In [70]:
training_log, list_of_model_smote = train_eval_model(
    list_of_model["smote"],
    "baseline_model",
    x_train_smote_base,
    y_train_smote,
    "smote",
    x_valid_base,
    y_valid,
    "/root/ml_process_feb23/logs/training_log_baseline.json"
)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
100%|██████████| 8/8 [00:01<00:00,  6.38it/s]


In [71]:
list_of_model["smote"] = copy.deepcopy(list_of_model_smote)

### Table of models performance

Now let's compare the performance from each algorithm and each dataset.

In [72]:
def training_log_to_df(training_log):
    training_res = pd.DataFrame()

    for log in tqdm(training_log):
        training_res = pd.concat([training_res, pd.DataFrame(log)])
    
    training_res.sort_values(["f1_score_avg", "training_time"], ascending = [False, True], inplace = True)
    training_res.reset_index(inplace = True, drop = True)
    
    return training_res

In [73]:
training_res_baseline = training_log_to_df(training_log)

100%|██████████| 4/4 [00:00<00:00, 139.40it/s]


In [74]:
training_res_baseline

Unnamed: 0,model_name,model_uid,training_time,training_date,performance,f1_score_avg,data_configurations
0,baseline_model-GaussianNB,61ea9e18f6e009109d4f106418b9ff70,0.155203,2023-03-24 16:11:19.422386,"{'0': {'precision': 0.8936170212765957, 'recal...",0.963076,smote
1,baseline_model-XGBClassifier,57edd349709f6e23a2d1464db1bf7a58,0.205197,2023-03-24 16:11:19.164365,"{'0': {'precision': 0.8936170212765957, 'recal...",0.963076,smote
2,baseline_model-RandomForestClassifier,624e638b2bd5a4e25116c49037eb3f4f,0.90754,2023-03-24 16:11:14.703445,"{'0': {'precision': 0.8936170212765957, 'recal...",0.963076,oversampling
3,baseline_model-GaussianNB,5fc91d3f9d09f1ea8dd296e936f20cc9,0.190117,2023-03-24 16:11:08.999202,"{'0': {'precision': 0.9111111111111111, 'recal...",0.962492,imbalanced
4,baseline_model-XGBClassifier,94d4995a36478a6762edd3e042be021c,5.228302,2023-03-24 16:11:03.613657,"{'0': {'precision': 0.9111111111111111, 'recal...",0.962492,imbalanced
5,baseline_model-SVC,66e97eecae898cb40cdbfd1b9cf2ed83,0.004111,2023-03-24 16:11:09.778243,"{'0': {'precision': 0.875, 'recall': 1.0, 'f1-...",0.956028,undersampling
6,baseline_model-DecisionTreeClassifier,97bbd67e4b05ec12253957e17b619521,0.008382,2023-03-24 16:11:09.799235,"{'0': {'precision': 0.875, 'recall': 1.0, 'f1-...",0.956028,undersampling
7,baseline_model-GaussianNB,6ea3649bb0c8632cf5b8b4a873ab29b8,0.071247,2023-03-24 16:11:10.560312,"{'0': {'precision': 0.875, 'recall': 1.0, 'f1-...",0.956028,undersampling
8,baseline_model-XGBClassifier,6ae358d065e8aa7b3d90af99987e3aef,0.095539,2023-03-24 16:11:10.415975,"{'0': {'precision': 0.875, 'recall': 1.0, 'f1-...",0.956028,undersampling
9,baseline_model-LogisticRegression,ad35a3a174bd3021475ce1b1eb0420f3,0.118446,2023-03-24 16:11:18.349442,"{'0': {'precision': 0.875, 'recall': 1.0, 'f1-...",0.956028,smote


### Interpretation



From the baseline models table, I found two best models with similar F1-score, with the top one having the faster training time.

I keep these models to be treated by hyperparameter tuning to improve their performance.

The hyperparameter tuning will be done later after the alternative models are trained.

## Train and Evaluate the Alternative Models

The binned value features are used in the datasets for this model training process. 

### Original (imbalanced) data


In [75]:
training_log, list_of_model_imbal = train_eval_model(
    list_of_model["imbalanced"],
    "alternative_model",
    x_train_bin,
    y_train,
    "imbalanced",
    x_valid_bin,
    y_valid,
    "/root/ml_process_feb23/logs/training_log_alternative.json"
)
list_of_model["imbalanced"] = copy.deepcopy(list_of_model_imbal)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
100%|██████████| 8/8 [00:02<00:00,  3.53it/s]


In [76]:
list_of_model["imbalanced"] = copy.deepcopy(list_of_model_imbal)


### Balanced with undersampling


In [77]:
training_log, list_of_model_rus = train_eval_model(
    list_of_model["undersampling"],
    "alternative_model",
    x_train_rus_bin,
    y_train_rus,
    "undersampling",
    x_valid_bin,
    y_valid,
    "/root/ml_process_feb23/logs/training_log_alternative.json"
)
list_of_model["undersampling"] = copy.deepcopy(list_of_model_rus)


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
100%|██████████| 8/8 [00:01<00:00,  5.23it/s]


### Balanced with oversampling


In [78]:
training_log, list_of_model_ros = train_eval_model(
    list_of_model["oversampling"],
    "alternative_model",
    x_train_ros_bin,
    y_train_ros,
    "oversampling",
    x_valid_bin,
    y_valid,
    "/root/ml_process_feb23/logs/training_log_alternative.json"
)
list_of_model["oversampling"] = copy.deepcopy(list_of_model_ros)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
100%|██████████| 8/8 [00:01<00:00,  7.68it/s]


In [79]:
list_of_model["oversampling"] = copy.deepcopy(list_of_model_ros)


### Balanced with SMOTE


In [80]:
training_log, list_of_model_smote = train_eval_model(
    list_of_model["smote"],
    "alternative_model",
    x_train_smote_bin,
    y_train_smote,
    "smote",
    x_valid_bin,
    y_valid,
    "/root/ml_process_feb23/logs/training_log_alternative.json"
)
list_of_model["smote"] = copy.deepcopy(list_of_model_smote)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
100%|██████████| 8/8 [00:01<00:00,  5.75it/s]


In [81]:
list_of_model["smote"] = copy.deepcopy(list_of_model_smote)

### Table of models performance

Now let's compare the performance from each algorithm and each dataset.


In [82]:
def training_log_to_df(training_log):
    training_res = pd.DataFrame()

    for log in tqdm(training_log):
        training_res = pd.concat([training_res, pd.DataFrame(log)])
    
    training_res.sort_values(["f1_score_avg", "training_time"], ascending = [False, True], inplace = True)
    training_res.reset_index(inplace = True, drop = True)
    
    return training_res


In [83]:
training_res_alternative = training_log_to_df(training_log)


100%|██████████| 4/4 [00:00<00:00, 385.51it/s]


In [84]:
training_res_alternative

Unnamed: 0,model_name,model_uid,training_time,training_date,performance,f1_score_avg,data_configurations
0,alternative_model-RandomForestClassifier,2fe6445eb555fa043d91372dae0d615f,0.41338,2023-03-24 16:11:23.120754,"{'0': {'precision': 0.8936170212765957, 'recal...",0.963076,imbalanced
1,alternative_model-RandomForestClassifier,7c8c2f6df483e52a9f3c333e15cc7a6d,0.470431,2023-03-24 16:11:28.103588,"{'0': {'precision': 0.8936170212765957, 'recal...",0.963076,oversampling
2,alternative_model-DecisionTreeClassifier,e9420f0fab48c4c293b3c9df4da58204,0.006104,2023-03-24 16:11:25.564478,"{'0': {'precision': 0.875, 'recall': 1.0, 'f1-...",0.956028,undersampling
3,alternative_model-SVC,471442ecfd809b9ec5e7d25160b6fce9,0.006329,2023-03-24 16:11:25.542265,"{'0': {'precision': 0.875, 'recall': 1.0, 'f1-...",0.956028,undersampling
4,alternative_model-LogisticRegression,a707f714388573f5adeaeba6006d5331,0.06189,2023-03-24 16:11:27.989205,"{'0': {'precision': 0.875, 'recall': 1.0, 'f1-...",0.956028,oversampling
5,alternative_model-LogisticRegression,b3d4de265cd4b53be79d970fc8ce56d1,0.127674,2023-03-24 16:11:29.549156,"{'0': {'precision': 0.875, 'recall': 1.0, 'f1-...",0.956028,smote
6,alternative_model-XGBClassifier,1f04bec0d8d6386d022179ba4953e074,0.156847,2023-03-24 16:11:26.282997,"{'0': {'precision': 0.875, 'recall': 1.0, 'f1-...",0.956028,undersampling
7,alternative_model-LogisticRegression,7009ae5dfe2bd59944b3a1465524d641,0.362039,2023-03-24 16:11:22.687991,"{'0': {'precision': 0.875, 'recall': 1.0, 'f1-...",0.956028,imbalanced
8,alternative_model-GaussianNB,a3e7a00e386bce4eca9d4de741e1cdcb,0.450746,2023-03-24 16:11:26.503052,"{'0': {'precision': 0.875, 'recall': 1.0, 'f1-...",0.956028,undersampling
9,alternative_model-RandomForestClassifier,7d4f0257cc73716d256aa9d97ac61f80,0.532396,2023-03-24 16:11:25.602427,"{'0': {'precision': 0.875, 'recall': 1.0, 'f1-...",0.956028,undersampling


### Interpretation

From the alternative models table, the best model is resulted from the same algorithm with the best models in the baseline models table.

So the tuning will be using the same parameters, and will compare them together to get the best of the best model.

## Hyperparameter Tuning

### Create the Paramaters Instances
As mentioned before, I am going to fine-tune the Random Forest Classifier algorithm, parameters need to be tuned as follows:

In [85]:
params = {
    'n_estimators': [10, 50, 100, 200],
    'criterion' : ['gini', 'entropy', 'log_loss'],
    'min_samples_leaf' : [1, 2, 5, 10],
    'max_features': ['sqrt', 'log2', None],
    'n_jobs': [1, -1]
}

In [86]:
# Search the best parameters with two Cross-Validations methods
grid_search = GridSearchCV(RandomForestClassifier(), params)
random_search = RandomizedSearchCV(RandomForestClassifier(), params, random_state = 85)

### Train the Tuned Models

Now I try to train another models, with just one algorithm, Random Forest Classifier.

The list in these models will contain both of the default parameters of the algorithm and the tuned ones.

In [87]:
# Create new list of models

list_of_model = {
    "imbalanced" : [
        { "model_name": rfc_baseline.__class__.__name__, "model_object": rfc_baseline, "model_uid": ""},
        { "model_name": grid_search.__class__.__name__ + "-" + grid_search.estimator.__class__.__name__, "model_object": copy.deepcopy(grid_search), "model_uid": ""},
        { "model_name": random_search.__class__.__name__ + "-" + random_search.estimator.__class__.__name__, "model_object": copy.deepcopy(random_search), "model_uid": ""}
        ],
    "undersampling" : [
        { "model_name": lgr_baseline.__class__.__name__, "model_object": lgr_baseline, "model_uid": ""},
        { "model_name": grid_search.__class__.__name__ + "-" + grid_search.estimator.__class__.__name__, "model_object": copy.deepcopy(grid_search), "model_uid": ""},
        { "model_name": random_search.__class__.__name__ + "-" + random_search.estimator.__class__.__name__, "model_object": copy.deepcopy(random_search), "model_uid": ""}
        ],
    "oversampling" : [
        { "model_name": rfc_baseline.__class__.__name__, "model_object": lgr_baseline, "model_uid": ""},
        { "model_name": grid_search.__class__.__name__ + "-" + grid_search.estimator.__class__.__name__, "model_object": copy.deepcopy(grid_search), "model_uid": ""},
        { "model_name": random_search.__class__.__name__ + "-" + random_search.estimator.__class__.__name__, "model_object": copy.deepcopy(random_search), "model_uid": ""}
        ],
    "smote" : [
        { "model_name": rfc_baseline.__class__.__name__, "model_object": rfc_baseline, "model_uid": ""},
        { "model_name": grid_search.__class__.__name__ + "-" + grid_search.estimator.__class__.__name__, "model_object": copy.deepcopy(grid_search), "model_uid": ""},
        { "model_name": random_search.__class__.__name__ + "-" + random_search.estimator.__class__.__name__, "model_object": copy.deepcopy(random_search), "model_uid": ""}
        ],
    }

In [88]:
training_log, list_of_model_imbal = train_eval_model(
    list_of_model["imbalanced"],
    "baseline_model",
    x_train_base,
    y_train,
    "imbalanced",
    x_valid_base,
    y_valid,
    "/root/ml_process_feb23/logs/training_log_tuned.json"
)
list_of_model["imbalanced"] = copy.deepcopy(list_of_model_imbal)

100%|██████████| 3/3 [18:25<00:00, 368.39s/it]


In [89]:
training_log, list_of_model_ros = train_eval_model(
    list_of_model["oversampling"],
    "baseline_model",
    x_train_base,
    y_train,
    "imbalanced",
    x_valid_base,
    y_valid,
    "/root/ml_process_feb23/logs/training_log_tuned.json"
)
list_of_model["oversampling"] = copy.deepcopy(list_of_model_ros)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
100%|██████████| 3/3 [20:57<00:00, 419.05s/it]


In [90]:
training_log, list_of_model_imbal = train_eval_model(
    list_of_model["imbalanced"],
    "alternative_model",
    x_train_bin,
    y_train,
    "imbalanced",
    x_valid_bin,
    y_valid,
    "/root/ml_process_feb23/logs/training_log_tuned.json"
)
list_of_model["imbalanced"] = copy.deepcopy(list_of_model_imbal)

 33%|███▎      | 1/3 [00:00<00:00,  2.41it/s]

### Table of models performance

In [None]:
def training_log_to_df(training_log):
    training_res = pd.DataFrame()

    for log in tqdm(training_log):
        training_res = pd.concat([training_res, pd.DataFrame(log)])
    
    training_res.sort_values(["f1_score_avg", "training_time"], ascending = [False, True], inplace = True)
    training_res.reset_index(inplace = True, drop = True)
    
    return training_res

In [None]:
training_res_tuned = training_log_to_df(training_log)


100%|██████████| 3/3 [00:01<00:00,  2.43it/s]


In [None]:
training_res_tuned

Unnamed: 0,model_name,model_uid,training_time,training_date,performance,f1_score_avg,data_configurations
0,baseline_model-RandomizedSearchCV-RandomForest...,453459573a5810e2bae2294b8659af9f,23.23977,2023-03-24 15:47:41.183112,"{'0': {'precision': 0.9130434782608695, 'recal...",0.97023,imbalanced
1,alternative_model-GridSearchCV-RandomForestCla...,0a2cd0ce16279eadf7f5a06355b96c9a,930.408775,2023-03-24 15:48:06.315145,"{'0': {'precision': 0.9130434782608695, 'recal...",0.97023,imbalanced
2,baseline_model-GridSearchCV-RandomForestClassi...,5b634e7a7c55dffca9234a124cbb5b0d,1065.80323,2023-03-24 15:29:55.327296,"{'0': {'precision': 0.8936170212765957, 'recal...",0.963076,imbalanced
3,baseline_model-GridSearchCV-RandomForestClassi...,366662e86603cf7c06aa399c0118b2ee,1219.561539,2023-03-24 15:09:07.429515,"{'0': {'precision': 0.8936170212765957, 'recal...",0.963076,imbalanced
4,alternative_model-RandomForestClassifier,a2e14fe3dac5becec919c07fdfdd12e1,0.430113,2023-03-24 15:48:05.854328,"{'0': {'precision': 0.875, 'recall': 1.0, 'f1-...",0.956028,imbalanced
5,baseline_model-LogisticRegression,5b89cc74d54f772b1b17d7b80f7e17e4,0.857661,2023-03-24 15:29:54.432137,"{'0': {'precision': 0.875, 'recall': 1.0, 'f1-...",0.956028,imbalanced
6,baseline_model-RandomForestClassifier,8d762e6942749a5f20b10ce8a464519a,7.250303,2023-03-24 15:08:59.545757,"{'0': {'precision': 0.875, 'recall': 1.0, 'f1-...",0.956028,imbalanced
7,baseline_model-RandomizedSearchCV-RandomForest...,e492a20e594cbf89f5ff7f05b5966c80,26.818856,2023-03-24 15:29:27.031753,"{'0': {'precision': 0.875, 'recall': 1.0, 'f1-...",0.956028,imbalanced
8,alternative_model-RandomizedSearchCV-RandomFor...,0b557da9c0c1abaca73ac1c1c1f5eef7,20.364996,2023-03-24 16:03:36.736792,"{'0': {'precision': 0.9090909090909091, 'recal...",0.954627,imbalanced
