In [1]:
import yaml
import numpy as np
import pandas as pd
from tabulate import tabulate

In [5]:
# Load the baseline and MCMC result logs into one table. Both CSV files are
# read with header=None (no header row), so column names are assigned below.
results = pd.concat(
    [
        pd.read_csv("../results/baseline.csv", header=None),
        pd.read_csv("../results/mcmc.csv", header=None),
    ],
    # Each file is read with its own 0..n-1 row labels; renumber the combined
    # frame so that index labels stay unique after concatenation.
    ignore_index=True,
)

# One row per logged run: when it ran, which config file produced it, and
# the three scores that were recorded.
results.columns = ["timestamp", "config", "precision", "recall", "f1_score"]
results.head()

Unnamed: 0,timestamp,config,precision,recall,f1_score
0,2023-11-05 12:40:57.258265,config/baseline/balance_catboost_baseline.yml,0.87981,0.896,0.88484
1,2023-11-05 12:40:59.676977,config/baseline/balance_catboost_baseline.yml,0.87981,0.896,0.88484
2,2023-11-05 12:41:02.054493,config/baseline/balance_catboost_baseline.yml,0.87981,0.896,0.88484
3,2023-11-05 12:41:04.406294,config/baseline/balance_catboost_baseline.yml,0.87981,0.896,0.88484
4,2023-11-05 12:41:06.718576,config/baseline/balance_catboost_baseline.yml,0.87981,0.896,0.88484


In [8]:
def parse_config_clf(config):
    """Look up the classifier entry stored in an experiment config file.

    Parameters
    ----------
    config
        Config path; the file is opened as ``../<config>``, i.e. relative to
        the parent of the current working directory.

    Returns
    -------
    The ``classifier`` value of the top-level ``baseline`` section when the
    file has one, otherwise the ``classifier`` value of the ``mcmc`` section.
    """
    with open(f"../{config}", "r") as stream:
        settings = yaml.safe_load(stream)

    # Settings live under either a top-level "baseline" key or an "mcmc" key.
    section = "baseline" if "baseline" in settings.keys() else "mcmc"
    return settings[section]["classifier"]

def parse_config_method(config):
    """Return the method label for an experiment config file.

    Parameters
    ----------
    config
        Config path; the file is opened as ``../<config>``, i.e. relative to
        the parent of the current working directory.

    Returns
    -------
    The ``sampler_name`` value of the top-level ``baseline`` section when the
    file has one; otherwise the fixed label ``"mcmc"``.
    """
    with open(f"../{config}", "r") as stream:
        settings = yaml.safe_load(stream)

    # Files without a "baseline" section are all labelled with one fixed name.
    if "baseline" not in settings.keys():
        return "mcmc"
    return settings["baseline"]["sampler_name"]

def parse_config_dataset(config):
    """Look up the dataset entry stored in an experiment config file.

    Parameters
    ----------
    config
        Config path; the file is opened as ``../<config>``, i.e. relative to
        the parent of the current working directory.

    Returns
    -------
    The ``dataset`` value of the top-level ``baseline`` section when the file
    has one, otherwise the ``dataset`` value of the ``mcmc`` section.
    """
    with open(f"../{config}", "r") as stream:
        settings = yaml.safe_load(stream)

    # Settings live under either a top-level "baseline" key or an "mcmc" key.
    section = "baseline" if "baseline" in settings.keys() else "mcmc"
    return settings[section]["dataset"]

# Annotate every result row with the dataset, classifier and method read from
# the config file that the row's `config` column points to.
for column, parser in (
    ("dataset", parse_config_dataset),
    ("classifier", parse_config_clf),
    ("method", parse_config_method),
):
    results[column] = results["config"].apply(parser)

In [13]:
# Print one GitHub-markdown table per dataset: rows are methods, columns are
# classifiers, cells are F1 scores.
for ds in results.dataset.unique():
    rs = pd.pivot_table(
        # Select rows with a boolean mask instead of formatting `ds` into a
        # query string: a dataset name containing a quote would break the
        # query expression, and a non-string name would never match.
        results[results.dataset == ds],
        index=["method"],
        columns="classifier",
        values="f1_score",
        # No aggfunc given: pivot_table averages rows that share the same
        # (method, classifier) pair.
    )
    print(f"#### {ds}")
    print("\n")
    print(tabulate(rs, headers=rs.columns, tablefmt="github"))
    print("\n\n\n")


#### balance


|                             |   catboost |   decision_tree |      gbc |      knn |       lr |      mlp |      svm |
|-----------------------------|------------|-----------------|----------|----------|----------|----------|----------|
| baseline                    |   0.88484  |        0.7924   | 0.84133  | 0.8448   | 0.83759  | 0.9158   | 0.87434  |
| instance-hardness-threshold |   0.649083 |        0.63392  | 0.625469 | 0.694536 | 0.690433 | 0.781246 | 0.643765 |
| mcmc                        |   0.860509 |        0.783372 | 0.853808 | 0.751991 | 0.813427 | 0.851543 | 0.822859 |
| smotenn                     |   0.830028 |        0.76745  | 0.823676 | 0.72628  | 0.727295 | 0.856183 | 0.842741 |
| svm-smote                   |   0.866276 |        0.782571 | 0.838979 | 0.766065 | 0.937529 | 0.936136 | 0.879181 |




#### breast-cancer


|                             |   catboost |   decision_tree |      gbc |      knn |       lr |      mlp |      svm |
|---------------

In [5]:
# Print one GitHub-markdown table per dataset: rows are methods, columns are
# classifiers, cells are precision scores.
for ds in results.dataset.unique():
    rs = pd.pivot_table(
        # Select rows with a boolean mask instead of formatting `ds` into a
        # query string: a dataset name containing a quote would break the
        # query expression, and a non-string name would never match.
        results[results.dataset == ds],
        index=["method"],
        columns="classifier",
        values="precision",
        # No aggfunc given: pivot_table averages rows that share the same
        # (method, classifier) pair.
    )
    print(f"#### {ds}")
    print("\n")
    print(tabulate(rs, headers=rs.columns, tablefmt="github"))
    print("\n\n\n")


#### balance


|                             |   catboost |   decision_tree |      gbc |      knn |       lr |      mlp |      svm |
|-----------------------------|------------|-----------------|----------|----------|----------|----------|----------|
| baseline                    |   0.87981  |        0.82353  | 0.82761  | 0.84172  | 0.81589  | 0.94037  | 0.84057  |
| instance-hardness-threshold |   0.92739  |        0.884911 | 0.897475 | 0.927969 | 0.933549 | 0.919281 | 0.932175 |
| smotenn                     |   0.905317 |        0.826243 | 0.857091 | 0.905015 | 0.934906 | 0.942046 | 0.939482 |
| svm-smote                   |   0.86515  |        0.818193 | 0.82404  | 0.813427 | 0.951801 | 0.938981 | 0.912315 |




#### breast-cancer


|                             |   catboost |   decision_tree |      gbc |      knn |       lr |      mlp |      svm |
|-----------------------------|------------|-----------------|----------|----------|----------|----------|----------|
| adasyn        

In [6]:
# Print one GitHub-markdown table per dataset: rows are methods, columns are
# classifiers, cells are recall scores.
for ds in results.dataset.unique():
    rs = pd.pivot_table(
        # Select rows with a boolean mask instead of formatting `ds` into a
        # query string: a dataset name containing a quote would break the
        # query expression, and a non-string name would never match.
        results[results.dataset == ds],
        index=["method"],
        columns="classifier",
        values="recall",
        # No aggfunc given: pivot_table averages rows that share the same
        # (method, classifier) pair.
    )
    print(f"#### {ds}")
    print("\n")
    print(tabulate(rs, headers=rs.columns, tablefmt="github"))
    print("\n\n\n")


#### balance


|                             |   catboost |   decision_tree |      gbc |    knn |       lr |      mlp |      svm |
|-----------------------------|------------|-----------------|----------|--------|----------|----------|----------|
| baseline                    |   0.896    |        0.768    | 0.856    | 0.848  | 0.872    | 0.936    | 0.912    |
| instance-hardness-threshold |   0.557867 |        0.541867 | 0.533333 | 0.6112 | 0.605867 | 0.723733 | 0.554133 |
| smotenn                     |   0.791467 |        0.728    | 0.799467 | 0.6496 | 0.650667 | 0.820267 | 0.800533 |
| svm-smote                   |   0.8704   |        0.755733 | 0.856    | 0.7248 | 0.931733 | 0.934933 | 0.859733 |




#### breast-cancer


|                             |   catboost |   decision_tree |      gbc |      knn |       lr |      mlp |      svm |
|-----------------------------|------------|-----------------|----------|----------|----------|----------|----------|
| adasyn                    