In [1]:
import pickle
# load an insights log file
load_file_name = "out/logs/insights-24-01-2023-14-40-53" + ".pck"
with open(load_file_name, 'rb') as f:
    insights = pickle.load(f)

In [2]:
# visualize plots by different model parameters
plt_models_by_clf = {}

# reformat insights data to easier fit our plot requirements
for model_name, model_class, model_params, model_accuracy, model_f1, model_fbeta, model_log_loss, model_precision, model_time in insights:
    # check if key exists and if not add new dict key with model type (name)
    if not model_name in plt_models_by_clf:
        plt_models_by_clf.update({str(model_name): []})
    
    plt_models_by_clf[model_name].append({
        "model_class": model_class,
        "model_params": model_params,
        "model_accuracy": model_accuracy,
        "model_f1": model_f1,
        "model_fbeta": model_fbeta,
        "model_log_loss": model_log_loss,
        "model_precision": model_precision,
        "model_time": model_time
    })

In [3]:
# pick best accuracy models and compare them to the rest
plt_model_best = []
_count = 0
for model_name, model_class, model_params, model_accuracy, model_f1, model_fbeta, model_log_loss, model_precision, model_time in insights:
    mdl_found = False
    for mdl in plt_model_best:
        if mdl["name"] == model_name:
            mdl_found = True
    
    if not mdl_found:
        plt_model_best.append({"name": model_name, "parameters": model_params, "f1_score": model_f1, "model_time": model_time})
    else:
        for best_name, best_parameters, best_f1, best_time in plt_model_best:
            _count = _count + 1
            if best_name == model_name:
                print(best_name + " == " + model_name)
                if best_f1 < model_f1:
                    plt_model_best[_count]["parameters"] = model_params
                    plt_model_best[_count]["f1_score"] = model_f1
                    plt_model_best[_count]["model_time"] = model_time
                if best_f1 == model_f1 and best_time == model_time:
                    plt_model_best[_count]["parameters"] = best_parameters + model_params
                if best_f1 == model_f1 and best_time > model_time:
                    plt_model_best[_count]["parameters"] = model_params
                    plt_model_best[_count]["model_time"] = model_time
                


In [4]:
# print all models
for model in plt_model_best:
    print("Model " + model["name"] + " with best f1 score " + str(model["f1_score"]) + " and it took " + str(round(model["model_time"], 2)) + "s to train")


Model AdaBoostClassifier with best f1 score 0.837037037037037 and it took 0.24s to train
Model BaggingClassifier with best f1 score 0.7111111111111111 and it took 0.7s to train
Model BernoulliNB with best f1 score 0.8185185185185185 and it took 0.02s to train
Model CalibratedClassifierCV with best f1 score 0.8185185185185185 and it took 0.27s to train
Model CategoricalNB with best f1 score 0 and it took 0.64s to train
Model ComplementNB with best f1 score 0.8222222222222222 and it took 0.01s to train
Model DecisionTreeClassifier with best f1 score 0 and it took 0.01s to train
Model DummyClassifier with best f1 score 0.8185185185185185 and it took 0.0s to train
Model ExtraTreesClassifier with best f1 score 0.8407407407407409 and it took 0.25s to train
Model GaussianNB with best f1 score 0.8407407407407409 and it took 0.0s to train
Model GaussianProcessClassifier with best f1 score 0.7518518518518519 and it took 0.76s to train
Model GradientBoostingClassifier with best f1 score 0.7296296

In [7]:
# print some models with best f1 score
best_f1 = 0
best_models = []

for model in plt_model_best:
    if model["f1_score"] >= best_f1:
        best_models.append(model)
        best_f1 = model["f1_score"]

for model in best_models:
    print("Model " + model["name"] + " with best f1 score " + str(model["f1_score"]) + " and it took " + str(round(model["model_time"], 2)) + "s to train")



Model AdaBoostClassifier with best f1 score 0.837037037037037 and it took 0.24s to train
Model ExtraTreesClassifier with best f1 score 0.8407407407407409 and it took 0.25s to train
Model GaussianNB with best f1 score 0.8407407407407409 and it took 0.0s to train
Model HistGradientBoostingClassifier with best f1 score 0.8481481481481481 and it took 0.94s to train
Model LinearDiscriminantAnalysis with best f1 score 0.8740740740740742 and it took 0.03s to train


In [9]:
# print model with best f1 and train time
best_time = 1000
best_model = None

for model in best_models:
    if model["model_time"] < best_time:
        best_model = model
        best_time = model["model_time"]

print(str(best_model))

{'name': 'GaussianNB', 'parameters': {'var_smoothing': 1e-09, 'priors': None}, 'f1_score': 0.8407407407407409, 'model_time': 0.0}
