In [1]:
import yaml
import numpy as np
import pandas as pd
from tabulate import tabulate

In [2]:
# Load the metrics from the baseline and mcmc result files into one frame.
# Both CSVs are read with header=None, so column names are assigned below.
results = pd.concat(
    [
        pd.read_csv("../results/baseline.csv", header=None),
        pd.read_csv("../results/mcmc.csv", header=None),
    ],
    # Renumber rows 0..n-1. Without this each file keeps its own 0-based
    # labels and the combined index contains duplicates, which makes any
    # later label-based lookup (e.g. results.loc[0]) ambiguous.
    ignore_index=True,
)

results.columns = ["timestamp", "config", "precision", "recall", "f1_score"]
results.head()

Unnamed: 0,timestamp,config,precision,recall,f1_score
0,2023-11-11 18:03:44.827951,config/baseline/balance_catboost_baseline.yml,0.72953,0.70409,0.71001
1,2023-11-11 18:03:58.811437,config/baseline/balance_catboost_smotenn.yml,0.72914,0.78395,0.72273
2,2023-11-11 19:58:24.241044,config/baseline/balance_catboost_baseline.yml,0.72953,0.70409,0.71001
3,2023-11-11 19:58:31.947361,config/baseline/balance_catboost_baseline.yml,0.72953,0.70409,0.71001
4,2023-11-11 19:58:39.512772,config/baseline/balance_catboost_baseline.yml,0.72953,0.70409,0.71001


In [3]:
def parse_config_clf(config):
    """Return the classifier entry stored in an experiment config file.

    Parameters
    ----------
    config : str
        Path of the YAML config as recorded in the results table. It is
        opened relative to the parent of the working directory
        (``../{config}``).

    Returns
    -------
    The ``classifier`` value of the ``baseline`` section when the file has
    one, otherwise the ``classifier`` value of the ``mcmc`` section.

    Raises
    ------
    OSError
        If the config file cannot be opened.
    KeyError
        If the selected section or its ``classifier`` key is missing.
    """
    # Explicit UTF-8: without it open() decodes with the platform locale
    # encoding, so the same YAML file could parse differently per machine.
    with open(f"../{config}", "r", encoding="utf-8") as f:
        settings = yaml.safe_load(f)

    section = "baseline" if "baseline" in settings else "mcmc"
    return settings[section]["classifier"]

def parse_config_method(config):
    """Return the method label for an experiment config file.

    Parameters
    ----------
    config : str
        Path of the YAML config as recorded in the results table. It is
        opened relative to the parent of the working directory
        (``../{config}``).

    Returns
    -------
    The ``sampler_name`` value of the ``baseline`` section when the file
    has one; the literal string ``"mcmc"`` for every other config.

    Raises
    ------
    OSError
        If the config file cannot be opened.
    KeyError
        If the ``baseline`` section exists but has no ``sampler_name``.
    """
    # Explicit UTF-8: without it open() decodes with the platform locale
    # encoding, so the same YAML file could parse differently per machine.
    with open(f"../{config}", "r", encoding="utf-8") as f:
        settings = yaml.safe_load(f)

    if "baseline" in settings:
        return settings["baseline"]["sampler_name"]
    # Configs without a baseline section are all labelled "mcmc".
    return "mcmc"

def parse_config_dataset(config):
    """Return the dataset entry stored in an experiment config file.

    Parameters
    ----------
    config : str
        Path of the YAML config as recorded in the results table. It is
        opened relative to the parent of the working directory
        (``../{config}``).

    Returns
    -------
    The ``dataset`` value of the ``baseline`` section when the file has
    one, otherwise the ``dataset`` value of the ``mcmc`` section.

    Raises
    ------
    OSError
        If the config file cannot be opened.
    KeyError
        If the selected section or its ``dataset`` key is missing.
    """
    # Explicit UTF-8: without it open() decodes with the platform locale
    # encoding, so the same YAML file could parse differently per machine.
    with open(f"../{config}", "r", encoding="utf-8") as f:
        settings = yaml.safe_load(f)

    section = "baseline" if "baseline" in settings else "mcmc"
    return settings[section]["dataset"]

# Derive one descriptive column per parser from each row's config path.
# The tuple order fixes the order in which the columns are appended.
for column, parser in (
    ("dataset", parse_config_dataset),
    ("classifier", parse_config_clf),
    ("method", parse_config_method),
):
    results[column] = results["config"].apply(parser)

In [4]:
# Mean F1 score per method (rows) and classifier (columns), printed as one
# GitHub-flavoured markdown table per dataset.
for ds in results.dataset.unique():
    rs = pd.pivot_table(
        # Boolean mask rather than an interpolated query string, so dataset
        # names containing quotes (or non-string names) still match.
        results[results.dataset == ds],
        index=["method"],
        columns="classifier",
        values="f1_score",
        # Keep the cells numeric. Pre-formatting them as strings does not
        # survive tabulate, which re-parses numeric-looking strings and
        # reprints them with its default format ("0.7100" became "0.71").
        aggfunc="mean",
        # For a mean±std table, aggregate to strings instead:
        # aggfunc=lambda x: f"{np.mean(x):.4f}±{np.std(x):.4f}"
    )
    print(f"#### {ds}")
    print("\n")
    # floatfmt pins every score to four decimals at render time.
    print(tabulate(rs, headers=rs.columns, tablefmt="github", floatfmt=".4f"))
    print("\n\n\n")


#### balance


|                             |   catboost |   decision_tree |    gbc |    knn |     lr |    mlp |    svm |
|-----------------------------|------------|-----------------|--------|--------|--------|--------|--------|
| baseline                    |     0.71   |          0.5938 | 0.6097 | 0.6411 | 0.6071 | 0.7555 | 0.6337 |
| instance-hardness-threshold |     0.5424 |          0.5138 | 0.5281 | 0.5653 | 0.5851 | 0.667  | 0.5402 |
| mcmc                        |     0.6466 |          0.574  | 0.6354 | 0.5827 | 0.7019 | 0.7138 | 0.662  |
| smotenn                     |     0.7101 |          0.6063 | 0.6555 | 0.6183 | 0.614  | 0.7452 | 0.7241 |
| svm-smote                   |     0.663  |          0.5896 | 0.609  | 0.564  | 0.8633 | 0.8478 | 0.7438 |




#### breast-cancer


|                             |   catboost |   decision_tree |    gbc |    knn |     lr |    mlp |    svm |
|-----------------------------|------------|-----------------|--------|--------|--------|-------

In [5]:
# One markdown table per dataset: precision averaged (pivot_table's default
# aggregation) for every method (rows) and classifier (columns).
for ds in results.dataset.unique():
    per_dataset = results.query(f"dataset=='{ds}'")
    rs = pd.pivot_table(
        per_dataset,
        values="precision",
        index=["method"],
        columns="classifier",
    )
    markdown_table = tabulate(rs, headers=rs.columns, tablefmt="github")

    print(f"#### {ds}")
    print("\n")
    print(markdown_table)
    print("\n\n\n")


#### balance


|                             |   catboost |   decision_tree |      gbc |      knn |       lr |      mlp |      svm |
|-----------------------------|------------|-----------------|----------|----------|----------|----------|----------|
| baseline                    |   0.72953  |        0.61221  | 0.59979  | 0.64049  | 0.59176  | 0.95688  | 0.60931  |
| instance-hardness-threshold |   0.712785 |        0.663007 | 0.690083 | 0.717733 | 0.723949 | 0.729977 | 0.717533 |
| mcmc                        |   0.646267 |        0.593786 | 0.634303 | 0.632607 | 0.727687 | 0.725164 | 0.673707 |
| smotenn                     |   0.733337 |        0.632902 | 0.668743 | 0.705451 | 0.728955 | 0.764173 | 0.753707 |
| svm-smote                   |   0.664648 |        0.608201 | 0.597971 | 0.598577 | 0.840949 | 0.844627 | 0.741913 |




#### breast-cancer


|                             |   catboost |   decision_tree |      gbc |      knn |       lr |      mlp |      svm |
|---------------

In [6]:
# Recall summary: for each dataset, average recall by method (rows) and
# classifier (columns) and print it as a GitHub-flavoured markdown table.
for ds in results.dataset.unique():
    dataset_rows = results.query(f"dataset=='{ds}'")
    rs = pd.pivot_table(
        dataset_rows,
        values="recall",
        columns="classifier",
        index=["method"],
    )
    rendered = tabulate(rs, headers=rs.columns, tablefmt="github")

    print(f"#### {ds}")
    print("\n")
    print(rendered)
    print("\n\n\n")


#### balance


|                             |   catboost |   decision_tree |      gbc |      knn |       lr |      mlp |      svm |
|-----------------------------|------------|-----------------|----------|----------|----------|----------|----------|
| baseline                    |   0.70409  |        0.58375  | 0.62019  | 0.64202  | 0.63158  | 0.73333  | 0.66082  |
| instance-hardness-threshold |   0.670605 |        0.583411 | 0.641389 | 0.694699 | 0.719136 | 0.786578 | 0.677315 |
| mcmc                        |   0.651349 |        0.559448 | 0.648572 | 0.581665 | 0.776872 | 0.762122 | 0.670991 |
| smotenn                     |   0.796437 |        0.616775 | 0.669323 | 0.715433 | 0.747217 | 0.85568  | 0.834816 |
| svm-smote                   |   0.664152 |        0.579901 | 0.621396 | 0.536659 | 0.910275 | 0.853981 | 0.780561 |




#### breast-cancer


|                             |   catboost |   decision_tree |      gbc |      knn |       lr |      mlp |      svm |
|---------------

In [8]:
# Preview the first rows again, now including the derived dataset,
# classifier and method columns.
results.head()

Unnamed: 0,timestamp,config,precision,recall,f1_score,dataset,classifier,method
0,2023-11-11 18:03:44.827951,config/baseline/balance_catboost_baseline.yml,0.72953,0.70409,0.71001,balance,catboost,baseline
1,2023-11-11 18:03:58.811437,config/baseline/balance_catboost_smotenn.yml,0.72914,0.78395,0.72273,balance,catboost,smotenn
2,2023-11-11 19:58:24.241044,config/baseline/balance_catboost_baseline.yml,0.72953,0.70409,0.71001,balance,catboost,baseline
3,2023-11-11 19:58:31.947361,config/baseline/balance_catboost_baseline.yml,0.72953,0.70409,0.71001,balance,catboost,baseline
4,2023-11-11 19:58:39.512772,config/baseline/balance_catboost_baseline.yml,0.72953,0.70409,0.71001,balance,catboost,baseline


In [41]:
# plt.rcParams["figure.figsize"] = [3, 9]
# Wide summary: one row per (dataset, classifier), one column per method,
# each cell holding the mean F1 score. Combinations with no recorded runs
# come out of the pivot as NaN and are displayed as 0 after fillna.
f1_wide = results.pivot_table(
    values="f1_score",
    index=["dataset", "classifier"],
    columns="method",
)
rs = f1_wide.reset_index().fillna(0)
print(tabulate(rs, headers=rs.columns, tablefmt="github"))

|    | dataset       | classifier    |   adasyn |   baseline |   instance-hardness-threshold |   kmean-smote |     mcmc |   smotenn |   svm-smote |
|----|---------------|---------------|----------|------------|-------------------------------|---------------|----------|-----------|-------------|
|  0 | balance       | catboost      | 0        |    0.71001 |                      0.542369 |      0        | 0.646565 |  0.710061 |    0.663022 |
|  1 | balance       | decision_tree | 0        |    0.59377 |                      0.513811 |      0        | 0.573965 |  0.606283 |    0.589603 |
|  2 | balance       | gbc           | 0        |    0.60965 |                      0.528087 |      0        | 0.635381 |  0.655518 |    0.609042 |
|  3 | balance       | knn           | 0        |    0.64113 |                      0.565335 |      0        | 0.582711 |  0.618299 |    0.564004 |
|  4 | balance       | lr            | 0        |    0.60709 |                      0.585141 |      0        | 0