In [7]:
import json
import pathlib

In [8]:
# Directory that is globbed for `classification_report_comp_*` report files.
# NOTE(review): absolute, machine-specific path — update it when running this
# notebook on another machine.
report_dir = "/home/mdafifal.mamun/notebooks/triagerX/training/reports"

In [9]:
def compute_metrics(reports: list):
    """Average the test metrics stored in a set of JSON classification reports.

    Every report file is read as JSON and must provide a top-level
    ``"test_accuracy"`` value plus a ``"macro avg"`` object containing
    ``"precision"``, ``"recall"`` and ``"f1-score"``.

    Args:
        reports: Locations of the report files. Items may be ``pathlib.Path``
            objects or plain path strings, and any iterable (for example the
            generator returned by ``Path.glob``) is accepted.

    Returns:
        A dict with the arithmetic mean of each metric over all reports, under
        the keys ``"average_test_accuracy"``, ``"average_test_precision"``,
        ``"average_test_recall"`` and ``"average_test_f1_score"``.

    Raises:
        ValueError: If ``reports`` is empty, e.g. because a glob pattern
            matched no files.
        KeyError: If a report is missing one of the expected fields.
    """
    # Normalise to Path objects so strings and generators work, and sort so
    # the files are always processed in a deterministic order.
    report_paths = sorted(pathlib.Path(report_path) for report_path in reports)

    print(f"Number of reports: {len(report_paths)}")

    if not report_paths:
        # Fail with an actionable message instead of a ZeroDivisionError from
        # the averaging step below.
        raise ValueError(
            "No classification reports were provided; "
            "check the report directory and glob pattern."
        )

    avg_type = "macro avg"
    accuracies = []
    precisions = []
    recalls = []
    f1s = []

    for report_path in report_paths:
        # JSON text is UTF-8; do not depend on the platform default encoding.
        with open(report_path, "r", encoding="utf-8") as json_file:
            json_data = json.load(json_file)

        accuracies.append(json_data["test_accuracy"])
        precisions.append(json_data[avg_type]["precision"])
        recalls.append(json_data[avg_type]["recall"])
        f1s.append(json_data[avg_type]["f1-score"])

    report_count = len(report_paths)

    return {
        "average_test_accuracy": sum(accuracies) / report_count,
        "average_test_precision": sum(precisions) / report_count,
        "average_test_recall": sum(recalls) / report_count,
        "average_test_f1_score": sum(f1s) / report_count,
    }

## Evaluation when no preprocessing is used

In [10]:
# Average the metrics over every report whose file name starts with
# "classification_report_comp_raw".
compute_metrics(
    [*pathlib.Path(report_dir).glob("classification_report_comp_raw*")]
)

Number of reports: 5


{'average_test_accuracy': 0.768284789644013,
 'average_test_precision': 0.6449829831826082,
 'average_test_recall': 0.5849716128827772,
 'average_test_f1_score': 0.6016229300984196}

## Evaluation when using custom special tokens like \[TIMESTAMP\], \[HEX\]

In [11]:
# Average the metrics over every report whose file name starts with
# "classification_report_comp_sp".
compute_metrics(
    [*pathlib.Path(report_dir).glob("classification_report_comp_sp*")]
)

Number of reports: 5


{'average_test_accuracy': 0.7883495145631068,
 'average_test_precision': 0.6771929504409231,
 'average_test_recall': 0.5405658635002272,
 'average_test_f1_score': 0.5747433645623304}

## Evaluation when using LLaMA-generated summaries + special tokens

In [12]:
# Average the metrics over every report whose file name starts with
# "classification_report_comp_summary".
compute_metrics(
    [*pathlib.Path(report_dir).glob("classification_report_comp_summary*")]
)

Number of reports: 5


{'average_test_accuracy': 0.7993527508090615,
 'average_test_precision': 0.7342569601563167,
 'average_test_recall': 0.5760193795129018,
 'average_test_f1_score': 0.6116298829475797}