In [1]:
import os
import pandas as pd

from datetime import datetime

## Utilities

In [2]:
# Metrics reported for each task, keyed by task name. Each value lists the
# results-CSV column names whose best score is reported for that task.
#
# This cell previously assigned `metrics` twice in a row: a first table that
# listed ["f1", "accuracy"] for "mrpc" and "qqp", immediately overwritten by
# the accuracy-only table. Only the second assignment ever took effect, so it
# is the single definition kept here. To report F1 as well, change the "mrpc"
# and "qqp" entries to ["f1", "accuracy"].
metrics = {
    "cola": ["mcc"],
    "sst2": ["accuracy"],
    "mrpc": ["accuracy"],
    "stsb": ["pearson"],
    "qqp": ["accuracy"],
    "mnli_matched": ["accuracy"],
    "mnli_mismatched": ["accuracy"],
    "qnli": ["accuracy"],
    "rte": ["accuracy"],
    "wnli": ["accuracy"],
}

In [3]:
def parse_results_name(filename):
    """Split a results file name into its date, time and model name.

    The name is expected to look like
    ``<prefix>_<YYYYMMDD>_<HHMMSS>_<model>.<ext>``. The prefix is ignored.

    Args:
        filename: File name to parse (the callers in this notebook pass the
            bare name yielded by ``os.walk``, without a directory part).

    Returns:
        A ``(date, time, model)`` tuple of strings, with the date formatted
        as ``YYYY-MM-DD`` and the time as ``HH:MM:SS``.

    Raises:
        ValueError: If the name has fewer than four ``_``-separated fields,
            or the date/time fields do not parse as ``%Y%m%d`` + ``%H%M%S``.
    """
    # Strip only the final extension so a "." inside the model name survives.
    stem = filename.rsplit(".", 1)[0]

    # Split at most three times: everything after the third "_" is the model
    # name, so model names that themselves contain "_" unpack correctly
    # instead of raising "too many values to unpack".
    _, date, time, model = stem.split("_", 3)

    date_obj = datetime.strptime(date + time, '%Y%m%d%H%M%S')

    formatted_date = date_obj.strftime('%Y-%m-%d')
    formatted_time = date_obj.strftime('%H:%M:%S')

    return formatted_date, formatted_time, model

## Printing all best results

In [9]:
results_path = "results"

# Walk every results CSV under `results_path` and, for each metric configured
# for the file's task, print the best value together with the `device` column
# of the rows that reached it.
for root, dirs, files in os.walk(results_path):
    for file in files:
        if not file.endswith("csv"):
            continue

        df = pd.read_csv(os.path.join(root, file))

        # skipping empty results files
        if df.shape[0] == 0:
            continue

        # Expected layout: <results_path>/<embedding_type>/<task>/<file>.
        # Splitting the path *relative to results_path* on the platform
        # separator works on Windows too (where os.walk yields "\\"-separated
        # paths) and when results_path has more than one component. Any other
        # directory depth still raises ValueError on unpacking.
        embedding_type, task = os.path.relpath(root, results_path).split(os.sep)
        date, time, model = parse_results_name(file)

        # `date` is an ISO "YYYY-MM-DD" string, so string comparison is
        # chronological.
        if date < "2024-01-10":
            continue

        print(f"--- Results from {date} {time}: {model}, {embedding_type}, {task} ---")

        for m in metrics[task]:
            best = df[m].max()
            # Keeps every row that ties for the best score, so the printed
            # Series can be long when many rows share the same value.
            best_row = df[df[m] == best]
            print(f"Best {m}: {best:.5f}")
            print(best_row['device'])


--- Results from 2024-01-10 15:43:42: DistilRoBERTaBase, mean_pooling, mrpc ---
Best accuracy: 0.74510
116    cuda:0
Name: device, dtype: object
--- Results from 2024-01-10 17:09:26: DistilRoBERTaBase, mean_pooling, mrpc ---
Best accuracy: 0.77696
21    cuda:2
Name: device, dtype: object
--- Results from 2024-01-10 17:33:40: DistilRoBERTaBase, mean_pooling, rte ---
Best accuracy: 0.60650
59    cuda:2
Name: device, dtype: object
--- Results from 2024-01-10 17:48:52: DistilRoBERTaBase, mean_pooling, wnli ---
Best accuracy: 0.56338
0      cuda:2
1      cuda:2
2      cuda:2
3      cuda:2
4      cuda:2
        ...  
155    cuda:2
156    cuda:2
157    cuda:2
158    cuda:2
159    cuda:2
Name: device, Length: 153, dtype: object
--- Results from 2024-01-10 17:51:13: DistilRoBERTaBase, mean_pooling, qnli ---
Best accuracy: 0.72781
74    cuda:0
Name: device, dtype: object
--- Results from 2024-01-10 16:28:11: DistilRoBERTaBase, mean_pooling, qqp ---
Best accuracy: 0.73660
75    cuda:0
Name: devic

## Writing the results CSV

In [50]:
results_path = "results"
filter_model = "DistilRoBERTaBase"
filter_embedding = "mean_pooling"

def get_filtered_col(param, val):
    """Build one summary row per matching results file for a single size ratio.

    Walks ``results_path`` for CSV files and keeps those dated 2023-12-26 or
    later whose model and embedding type equal ``filter_model`` and
    ``filter_embedding``. For each kept file it reports the best value of
    every metric configured for the file's task, taken over the rows where
    ``layer_size / input_size == val``.

    Args:
        param: Used only to label the score column (``f"{param} {val}"``).
            It does NOT select the column being filtered; the filter is
            always the ``layer_size / input_size`` ratio.
        val: Required value of ``layer_size / input_size``.

    Returns:
        A list of dicts with keys ``"Model"``, ``"Embedding"``, ``"Task"``
        and ``f"{param} {val}"``. The last key holds the best scores
        formatted to five decimals and joined with ``","`` (``"nan"`` when
        no row has the requested ratio).
    """
    rows = []
    for root, _, files in os.walk(results_path):
        for file in files:
            if not file.endswith("csv"):
                continue

            df = pd.read_csv(os.path.join(root, file))

            # Skipping empty results files
            if df.shape[0] == 0:
                continue

            # Expected layout: <results_path>/<embedding_type>/<task>/<file>.
            # Split the path relative to results_path on the platform
            # separator rather than a hard-coded "/", so this also works on
            # Windows and with a multi-component results_path.
            embedding_type, task = os.path.relpath(root, results_path).split(os.sep)
            date, time, model = parse_results_name(file)

            # ISO "YYYY-MM-DD" strings compare chronologically.
            if date < "2023-12-26":
                continue

            if model != filter_model:
                continue
            if embedding_type != filter_embedding:
                continue

            # Exact float comparison on the ratio; generic per-column
            # filtering (df[param] == val) is intentionally not used here.
            df = df[df["layer_size"] / df["input_size"] == val]
            metric_best = [f"{df[m].max():.5f}" for m in metrics[task]]

            rows.append({
                "Model": model,
                "Embedding": embedding_type,
                "Task": task,
                f"{param} {val}": ",".join(metric_best)
            })

    return rows

# One frame per layer_size/input_size ratio; the last column of each frame
# holds that ratio's best scores.
ratios = [0.25, 0.5, 1, 2, 4]
stats = [pd.DataFrame(get_filtered_col("layer_size", ratio)) for ratio in ratios]

# The three identifying columns come from the first frame; the score column
# of every frame is then placed next to them in a single concat.
id_columns = stats[0].iloc[:, :3]
score_columns = [frame.iloc[:, -1] for frame in stats]
df = pd.concat([id_columns, *score_columns], axis=1)

df.to_csv("test.csv", index=False)
df

Unnamed: 0,Model,Embedding,Task,layer_size 0.25,layer_size 0.5,layer_size 1,layer_size 2,layer_size 4
0,DistilRoBERTaBase,mean_pooling,cola,0.49617,0.49809,0.50269,0.50837,0.5156
1,DistilRoBERTaBase,mean_pooling,mrpc,0.80147,0.78922,0.79412,0.79167,0.78676
2,DistilRoBERTaBase,mean_pooling,mrpc,0.71569,0.72549,0.71569,0.72304,0.71324
3,DistilRoBERTaBase,mean_pooling,mrpc,0.70343,0.71324,0.72059,0.72549,0.71569
4,DistilRoBERTaBase,mean_pooling,rte,0.57401,0.55957,0.59567,0.56679,0.58845
5,DistilRoBERTaBase,mean_pooling,rte,0.55596,0.56679,0.54874,0.55957,0.55957
6,DistilRoBERTaBase,mean_pooling,rte,0.55957,0.5704,0.55957,0.55957,0.56318
7,DistilRoBERTaBase,mean_pooling,qnli,0.71902,0.72744,0.72799,0.72909,0.73165
8,DistilRoBERTaBase,mean_pooling,qnli,0.67143,0.66813,0.67875,0.68003,0.67198
9,DistilRoBERTaBase,mean_pooling,qnli,0.70346,0.69943,0.70639,0.70236,0.70895
