## Model testing

In [4]:
import asyncio
import json
import os
import random

import data_inference_tools as tools

# URL and token for the Hugging Face inference endpoint.
# Credentials must not live in the notebook: the token is read from the
# HF_TOKEN environment variable, and the endpoint URL can be overridden through
# HF_ENDPOINT_URL (it defaults to the endpoint this notebook was written for).
# NOTE(review): the access token that used to be hardcoded on this line has
# been exposed through the notebook and should be revoked.
endpoint_url = os.environ.get(
    "HF_ENDPOINT_URL",
    "https://avynledq36ikyog8.eu-west-1.aws.endpoints.huggingface.cloud",
)
hf_token = os.environ.get("HF_TOKEN")
if not hf_token:
    # Fail here rather than after the sweep has started issuing requests.
    raise RuntimeError("Set the HF_TOKEN environment variable before running this notebook.")

In [5]:
# Version tag embedded in the names of the result files
version = "2"

# Every setting of the experiment; the whole dict is also merged into the
# metadata stored with each run.
params = dict(
    temperature=0.3,
    top_p=0.95,
    repetition_penalty=0.95,
    n_examples=12,
    max_tries=10,
    max_new_tokens=488,
    stop_sequences=["\nUser:", "\nUSER:", "<|endoftext|>", "</s>"],
)
# Generation arguments handed to tools.get_handler: everything in `params`
# except the two entries that are passed to analyzer.inference instead.
gen_kwargs = {
    name: value
    for name, value in params.items()
    if name not in ("n_examples", "max_tries")
}

# What to evaluate, as dataset -> domain -> task -> list of task metrics.
# Both datasets currently use the same setup: the "restaurants" domain with the
# "TASD" task, scored on six task metrics.
# The "laptops" domain (task "ASD", scored on "ASD" and "AD") is left out for
# both datasets; add it next to "restaurants" to enable it.
configs = {
    dataset: {"restaurants": {"TASD": ["TASD", "ASD", "TSD", "AD", "TD", "TAD"]}}
    for dataset in ("SemEval2015Task12", "SemEval2016Task5")
}

# How many times each dataset / domain / task setting is repeated
n_runs = 10

In [6]:

# Initialize a list to store results
all_metric_results = []
all_inference_results = []
completed_experiments = []

# Run model testing
for dataset_name, domains in configs.items():
    print("Working on dataset {}".format(dataset_name))
    for domain, tasks in domains.items():
        print("\tWorking on domain {}".format(domain))
        for task, task_metrics in tasks.items():
            print("\t\tWorking on task {}".format(task))
            # Handler for the experiment
            handler = tools.get_handler(dataset_name, endpoint_url, hf_token, gen_kwargs)
            # Analyzer for the experiment
            analyzer = tools.DatasetAnalyzer(handler, split="test", max_concurrent_tasks=100)
            # Initialize runs list
            runs = []
            for i in range(n_runs):
                # Run metadata
                metadata = {
                    "dataset": dataset_name,
                    "domain": domain,
                    "task": task,
                    "run": i
                } | params
                # Run the experiment
                inference_results = await analyzer.inference(domain, task, max_tries=params["max_tries"], n_examples=params["n_examples"], use_tqdm=True)
                print("\t\t\tRun {} completed".format(i))
                # Compute task metrics
                for task_metric in task_metrics:
                    metric_result = analyzer.compute_metrics(inference_results, domain, task_metric)
                    print("\t\t\t\tMetrics for task {} computed:".format(task_metric), "F1-Score: {f1:.2%}".format(f1=metric_result["f1"]))
                    metric_result = metadata | {"task_metric": task_metric} | metric_result
                    all_metric_results.append(metric_result)
                all_inference_results.append(metadata | {"results": inference_results})


# Make sure the output directory exists: without it the open() calls below
# raise FileNotFoundError and the results of the whole sweep are lost.
os.makedirs("model_testing/results", exist_ok=True)

# Write metric results to json file
with open("model_testing/results/v{}_metric_results.json".format(version), "w") as outfile:
    json.dump(all_metric_results, outfile)

# Write all inference results to json file
with open("model_testing/results/v{}_inference_results.json".format(version), "w") as outfile:
    json.dump(all_inference_results, outfile)

Working on dataset SemEval2015Task12
	Working on domain restaurants
		Working on task TASD


Processing reviews: 100%|██████████| 685/685 [04:52<00:00,  2.34review/s]


			Run 0 completed
				Metrics for task TASD computed: F1-Score: 36.50%
				Metrics for task ASD computed: F1-Score: 61.79%
				Metrics for task TSD computed: F1-Score: 44.78%
				Metrics for task AD computed: F1-Score: 66.12%
				Metrics for task TD computed: F1-Score: 48.12%
				Metrics for task TAD computed: F1-Score: 37.96%


Processing reviews: 100%|██████████| 685/685 [02:45<00:00,  4.14review/s]


			Run 1 completed
				Metrics for task TASD computed: F1-Score: 26.67%
				Metrics for task ASD computed: F1-Score: 53.33%
				Metrics for task TSD computed: F1-Score: 40.00%
				Metrics for task AD computed: F1-Score: 53.33%
				Metrics for task TD computed: F1-Score: 40.00%
				Metrics for task TAD computed: F1-Score: 26.67%


Processing reviews: 100%|██████████| 685/685 [04:50<00:00,  2.36review/s]


			Run 2 completed
				Metrics for task TASD computed: F1-Score: 47.37%
				Metrics for task ASD computed: F1-Score: 74.45%
				Metrics for task TSD computed: F1-Score: 57.53%
				Metrics for task AD computed: F1-Score: 76.47%
				Metrics for task TD computed: F1-Score: 60.27%
				Metrics for task TAD computed: F1-Score: 47.37%


Processing reviews: 100%|██████████| 685/685 [05:15<00:00,  2.17review/s]  


			Run 3 completed
				Metrics for task TASD computed: F1-Score: 38.10%
				Metrics for task ASD computed: F1-Score: 65.00%
				Metrics for task TSD computed: F1-Score: 47.62%
				Metrics for task AD computed: F1-Score: 71.79%
				Metrics for task TD computed: F1-Score: 50.00%
				Metrics for task TAD computed: F1-Score: 38.10%


Processing reviews: 100%|██████████| 685/685 [05:58<00:00,  1.91review/s]


			Run 4 completed
				Metrics for task TASD computed: F1-Score: 0.00%
				Metrics for task ASD computed: F1-Score: 0.00%
				Metrics for task TSD computed: F1-Score: 0.00%
				Metrics for task AD computed: F1-Score: 0.00%
				Metrics for task TD computed: F1-Score: 0.00%
				Metrics for task TAD computed: F1-Score: 0.00%


Processing reviews: 100%|██████████| 685/685 [05:11<00:00,  2.20review/s]


			Run 5 completed
				Metrics for task TASD computed: F1-Score: 46.67%
				Metrics for task ASD computed: F1-Score: 68.29%
				Metrics for task TSD computed: F1-Score: 53.49%
				Metrics for task AD computed: F1-Score: 71.60%
				Metrics for task TD computed: F1-Score: 53.49%
				Metrics for task TAD computed: F1-Score: 46.67%


Processing reviews: 100%|██████████| 685/685 [04:11<00:00,  2.72review/s]  


			Run 6 completed
				Metrics for task TASD computed: F1-Score: 45.00%
				Metrics for task ASD computed: F1-Score: 60.00%
				Metrics for task TSD computed: F1-Score: 51.43%
				Metrics for task AD computed: F1-Score: 65.00%
				Metrics for task TD computed: F1-Score: 52.94%
				Metrics for task TAD computed: F1-Score: 45.00%


Processing reviews: 100%|██████████| 685/685 [02:41<00:00,  4.24review/s]


			Run 7 completed
				Metrics for task TASD computed: F1-Score: 57.14%
				Metrics for task ASD computed: F1-Score: 57.14%
				Metrics for task TSD computed: F1-Score: 57.14%
				Metrics for task AD computed: F1-Score: 57.14%
				Metrics for task TD computed: F1-Score: 57.14%
				Metrics for task TAD computed: F1-Score: 57.14%


Processing reviews: 100%|██████████| 685/685 [04:04<00:00,  2.80review/s]


			Run 8 completed
				Metrics for task TASD computed: F1-Score: 66.67%
				Metrics for task ASD computed: F1-Score: 76.92%
				Metrics for task TSD computed: F1-Score: 66.67%
				Metrics for task AD computed: F1-Score: 76.92%
				Metrics for task TD computed: F1-Score: 66.67%
				Metrics for task TAD computed: F1-Score: 66.67%


Processing reviews: 100%|██████████| 685/685 [02:38<00:00,  4.32review/s]


			Run 9 completed
				Metrics for task TASD computed: F1-Score: 0.00%
				Metrics for task ASD computed: F1-Score: 0.00%
				Metrics for task TSD computed: F1-Score: 0.00%
				Metrics for task AD computed: F1-Score: 0.00%
				Metrics for task TD computed: F1-Score: 0.00%
				Metrics for task TAD computed: F1-Score: 0.00%
Working on dataset SemEval2016Task5
	Working on domain restaurants
		Working on task TASD


Processing reviews: 100%|██████████| 676/676 [04:21<00:00,  2.59review/s]  


			Run 0 completed
				Metrics for task TASD computed: F1-Score: 26.42%
				Metrics for task ASD computed: F1-Score: 54.17%
				Metrics for task TSD computed: F1-Score: 33.96%
				Metrics for task AD computed: F1-Score: 66.67%
				Metrics for task TD computed: F1-Score: 46.15%
				Metrics for task TAD computed: F1-Score: 37.74%


Processing reviews: 100%|██████████| 676/676 [04:20<00:00,  2.60review/s]


			Run 1 completed
				Metrics for task TASD computed: F1-Score: 47.06%
				Metrics for task ASD computed: F1-Score: 73.91%
				Metrics for task TSD computed: F1-Score: 50.98%
				Metrics for task AD computed: F1-Score: 77.27%
				Metrics for task TD computed: F1-Score: 56.00%
				Metrics for task TAD computed: F1-Score: 50.98%


Processing reviews: 100%|██████████| 676/676 [04:22<00:00,  2.58review/s]  


			Run 2 completed
				Metrics for task TASD computed: F1-Score: 38.30%
				Metrics for task ASD computed: F1-Score: 59.57%
				Metrics for task TSD computed: F1-Score: 51.06%
				Metrics for task AD computed: F1-Score: 60.87%
				Metrics for task TD computed: F1-Score: 55.32%
				Metrics for task TAD computed: F1-Score: 38.30%


Processing reviews: 100%|██████████| 676/676 [04:18<00:00,  2.61review/s]  


			Run 3 completed
				Metrics for task TASD computed: F1-Score: 25.53%
				Metrics for task ASD computed: F1-Score: 53.33%
				Metrics for task TSD computed: F1-Score: 40.00%
				Metrics for task AD computed: F1-Score: 53.33%
				Metrics for task TD computed: F1-Score: 40.00%
				Metrics for task TAD computed: F1-Score: 25.53%


Processing reviews: 100%|██████████| 676/676 [04:17<00:00,  2.62review/s]  


			Run 4 completed
				Metrics for task TASD computed: F1-Score: 22.22%
				Metrics for task ASD computed: F1-Score: 43.90%
				Metrics for task TSD computed: F1-Score: 35.56%
				Metrics for task AD computed: F1-Score: 45.00%
				Metrics for task TD computed: F1-Score: 41.86%
				Metrics for task TAD computed: F1-Score: 22.22%


Processing reviews: 100%|██████████| 676/676 [04:09<00:00,  2.71review/s]  


			Run 5 completed
				Metrics for task TASD computed: F1-Score: 48.00%
				Metrics for task ASD computed: F1-Score: 66.67%
				Metrics for task TSD computed: F1-Score: 50.00%
				Metrics for task AD computed: F1-Score: 66.67%
				Metrics for task TD computed: F1-Score: 50.00%
				Metrics for task TAD computed: F1-Score: 48.00%


Processing reviews: 100%|██████████| 676/676 [04:07<00:00,  2.73review/s]


			Run 6 completed
				Metrics for task TASD computed: F1-Score: 29.63%
				Metrics for task ASD computed: F1-Score: 59.26%
				Metrics for task TSD computed: F1-Score: 29.63%
				Metrics for task AD computed: F1-Score: 66.67%
				Metrics for task TD computed: F1-Score: 38.46%
				Metrics for task TAD computed: F1-Score: 29.63%


Processing reviews: 100%|██████████| 676/676 [04:17<00:00,  2.63review/s]  


			Run 7 completed
				Metrics for task TASD computed: F1-Score: 52.17%
				Metrics for task ASD computed: F1-Score: 76.19%
				Metrics for task TSD computed: F1-Score: 60.87%
				Metrics for task AD computed: F1-Score: 85.71%
				Metrics for task TD computed: F1-Score: 69.57%
				Metrics for task TAD computed: F1-Score: 60.87%


Processing reviews: 100%|██████████| 676/676 [02:37<00:00,  4.29review/s]


			Run 8 completed
				Metrics for task TASD computed: F1-Score: 0.00%
				Metrics for task ASD computed: F1-Score: 0.00%
				Metrics for task TSD computed: F1-Score: 0.00%
				Metrics for task AD computed: F1-Score: 0.00%
				Metrics for task TD computed: F1-Score: 0.00%
				Metrics for task TAD computed: F1-Score: 0.00%


Processing reviews: 100%|██████████| 676/676 [04:13<00:00,  2.67review/s]

			Run 9 completed
				Metrics for task TASD computed: F1-Score: 44.00%
				Metrics for task ASD computed: F1-Score: 69.39%
				Metrics for task TSD computed: F1-Score: 57.14%
				Metrics for task AD computed: F1-Score: 68.09%
				Metrics for task TD computed: F1-Score: 57.14%
				Metrics for task TAD computed: F1-Score: 44.00%





In [25]:
import pandas as pd

# Load every per-run, per-metric record collected by the sweep
df = pd.DataFrame(all_metric_results)

# Filter out failed runs. A run counts as failed when none of its task metrics
# scored above zero; the whole run is dropped. Filtering row by row on f1 != 0
# would also discard a genuine zero on a single metric of an otherwise
# successful run and bias the averages upwards.
run_keys = ["dataset", "domain", "task", "run"]
run_failed = df.groupby(run_keys)["f1"].transform("max") == 0
df_valid = df[~run_failed]

# Mean and standard deviation of precision, recall and f1 for each
# dataset / domain / task / task metric combination
df_grouped = df_valid.groupby(by=["dataset", "domain", "task", "task_metric"]).agg(
    {"precision": ["mean", "std"], "recall": ["mean", "std"], "f1": ["mean", "std"]}
)

In [30]:
# Mean F1 per task metric for SemEval2015Task12, rows ordered by the group keys
f1_mean_by_group = df_grouped.sort_values(by=["dataset", "domain", "task", "task_metric"])["f1"]["mean"]
f1_mean_by_group.loc["SemEval2015Task12"]

domain       task  task_metric
restaurants  TASD  AD             0.672982
                   ASD            0.646166
                   TAD            0.456953
                   TASD           0.455129
                   TD             0.535792
                   TSD            0.523320
Name: mean, dtype: float64

In [29]:
# Mean F1 per task metric for SemEval2016Task5, rows ordered by the group keys
f1_mean_by_group = df_grouped.sort_values(by=["dataset", "domain", "task", "task_metric"])["f1"]["mean"]
f1_mean_by_group.loc["SemEval2016Task5"]

domain       task  task_metric
restaurants  TASD  AD             0.655861
                   ASD            0.618216
                   TAD            0.396964
                   TASD           0.370366
                   TD             0.505003
                   TSD            0.454671
Name: mean, dtype: float64