In [2]:
from functools import lru_cache

import numpy as np
import pandas as pd
import yaml
from tabulate import tabulate

In [11]:
# The CSV is read with header=None, so the column labels are attached here.
column_names = ["timestamp", "config", "precision", "recall", "f1_score"]

results = pd.read_csv("../results/baseline.csv", header=None)
results.columns = column_names

# Preview the first rows.
results.head()

Unnamed: 0,timestamp,config,precision,recall,f1_score
0,2023-11-05 09:52:11.105919,config/baseline/balance_catboost_instance-hard...,0.87981,0.896,0.88484
1,2023-11-05 09:55:11.231233,config/baseline/balance_catboost_baseline.yml,0.87981,0.896,0.88484
2,2023-11-05 09:55:13.963468,config/baseline/balance_catboost_instance-hard...,0.9176,0.552,0.64547
3,2023-11-05 09:55:18.413736,config/baseline/balance_catboost_smotenn.yml,0.91159,0.784,0.82674
4,2023-11-05 09:55:20.813847,config/baseline/balance_catboost_svm-smote.yml,0.87568,0.888,0.87925


In [12]:
@lru_cache(maxsize=None)
def _load_baseline_section(config):
    """Load the YAML file at ``../<config>`` and return its ``baseline`` mapping.

    The result is memoised per ``config`` value, so a file that is referenced
    by several result rows, or queried for several fields, is opened and
    parsed only once. Callers must treat the returned mapping as read-only
    because the same object is handed out on every cache hit.

    Raises:
        OSError: if the file cannot be opened.
        KeyError: if the document has no top-level ``baseline`` key.
    """
    # Explicit UTF-8 so decoding does not depend on the machine's locale.
    with open(f"../{config}", "r", encoding="utf-8") as f:
        return yaml.safe_load(f)["baseline"]


def parse_config_clf(config):
    """Return the ``classifier`` entry of the ``baseline`` section of ``config``."""
    return _load_baseline_section(config)["classifier"]


def parse_config_method(config):
    """Return the ``sampler_name`` entry of the ``baseline`` section of ``config``."""
    return _load_baseline_section(config)["sampler_name"]


def parse_config_dataset(config):
    """Return the ``dataset`` entry of the ``baseline`` section of ``config``."""
    return _load_baseline_section(config)["dataset"]


# Derive one column per config field; the memoised loader keeps this to a
# single read per distinct config path across all three passes.
results["dataset"] = results.config.apply(parse_config_dataset)
results["classifier"] = results.config.apply(parse_config_clf)
results["method"] = results.config.apply(parse_config_method)

In [29]:
# Print one GitHub-markdown table of f1_score per dataset:
# rows are sampling methods, columns are classifiers.
for ds in results.dataset.unique():
    # Select rows with a boolean mask rather than an f-string query: a dataset
    # name containing a quote would break the query expression, and a
    # non-string value would never match its quoted form.
    rs = pd.pivot_table(
        results.loc[results["dataset"] == ds],
        index=["method"],
        columns="classifier",
        values="f1_score",
        # Spelled out on purpose: rows sharing a (method, classifier) pair are
        # averaged, which is pivot_table's default aggregation.
        aggfunc="mean",
    )
    print(f"#### {ds}")
    print("\n")
    print(tabulate(rs, headers=rs.columns, tablefmt="github"))
    print("\n\n\n")


#### balance


|                             |   catboost |   decision_tree |     gbc |     knn |      lr |     mlp |     svm |
|-----------------------------|------------|-----------------|---------|---------|---------|---------|---------|
| baseline                    |   0.88484  |         0.7924  | 0.84133 | 0.8448  | 0.83759 | 0.9158  | 0.87434 |
| instance-hardness-threshold |   0.732467 |         0.64487 | 0.65359 | 0.67266 | 0.70658 | 0.8015  | 0.63642 |
| smotenn                     |   0.82956  |         0.77699 | 0.82366 | 0.75771 | 0.73804 | 0.85375 | 0.83594 |
| svm-smote                   |   0.873075 |         0.78417 | 0.84997 | 0.7716  | 0.93397 | 0.94528 | 0.8798  |




#### breast-cancer


|                             |   catboost |   decision_tree |     gbc |     knn |      lr |     mlp |     svm |
|-----------------------------|------------|-----------------|---------|---------|---------|---------|---------|
| adasyn                      |    0.96491 |         0.9

In [30]:
# Print one GitHub-markdown table of precision per dataset:
# rows are sampling methods, columns are classifiers.
for ds in results.dataset.unique():
    # Select rows with a boolean mask rather than an f-string query: a dataset
    # name containing a quote would break the query expression, and a
    # non-string value would never match its quoted form.
    rs = pd.pivot_table(
        results.loc[results["dataset"] == ds],
        index=["method"],
        columns="classifier",
        values="precision",
        # Spelled out on purpose: rows sharing a (method, classifier) pair are
        # averaged, which is pivot_table's default aggregation.
        aggfunc="mean",
    )
    print(f"#### {ds}")
    print("\n")
    print(tabulate(rs, headers=rs.columns, tablefmt="github"))
    print("\n\n\n")


#### balance


|                             |   catboost |   decision_tree |     gbc |     knn |      lr |     mlp |     svm |
|-----------------------------|------------|-----------------|---------|---------|---------|---------|---------|
| baseline                    |    0.87981 |         0.82353 | 0.82761 | 0.84172 | 0.81589 | 0.94037 | 0.84057 |
| instance-hardness-threshold |    0.90532 |         0.89072 | 0.91737 | 0.91946 | 0.93404 | 0.91748 | 0.93194 |
| smotenn                     |    0.9122  |         0.80691 | 0.85163 | 0.91101 | 0.93538 | 0.94424 | 0.93171 |
| svm-smote                   |    0.86778 |         0.81864 | 0.83    | 0.82185 | 0.94694 | 0.94771 | 0.90353 |




#### breast-cancer


|                             |   catboost |   decision_tree |     gbc |     knn |      lr |     mlp |     svm |
|-----------------------------|------------|-----------------|---------|---------|---------|---------|---------|
| adasyn                      |    0.96491 |         0.9

In [31]:
# Print one GitHub-markdown table of recall per dataset:
# rows are sampling methods, columns are classifiers.
for ds in results.dataset.unique():
    # Select rows with a boolean mask rather than an f-string query: a dataset
    # name containing a quote would break the query expression, and a
    # non-string value would never match its quoted form.
    rs = pd.pivot_table(
        results.loc[results["dataset"] == ds],
        index=["method"],
        columns="classifier",
        values="recall",
        # Spelled out on purpose: rows sharing a (method, classifier) pair are
        # averaged, which is pivot_table's default aggregation.
        aggfunc="mean",
    )
    print(f"#### {ds}")
    print("\n")
    print(tabulate(rs, headers=rs.columns, tablefmt="github"))
    print("\n\n\n")


#### balance


|                             |   catboost |   decision_tree |   gbc |   knn |    lr |   mlp |   svm |
|-----------------------------|------------|-----------------|-------|-------|-------|-------|-------|
| baseline                    |   0.896    |           0.768 | 0.856 | 0.848 | 0.872 | 0.936 | 0.912 |
| instance-hardness-threshold |   0.674667 |           0.552 | 0.56  | 0.584 | 0.624 | 0.752 | 0.544 |
| smotenn                     |   0.788    |           0.752 | 0.8   | 0.688 | 0.664 | 0.816 | 0.792 |
| svm-smote                   |   0.884    |           0.76  | 0.872 | 0.728 | 0.928 | 0.944 | 0.864 |




#### breast-cancer


|                             |   catboost |   decision_tree |     gbc |     knn |      lr |     mlp |     svm |
|-----------------------------|------------|-----------------|---------|---------|---------|---------|---------|
| adasyn                      |    0.96491 |         0.94737 | 0.97368 | 0.92982 | 0.96491 | 0.97368 | 0.96491 |
| b