In [23]:
import pandas as pd
import os
import sys
import re
import plotly.express as px
import random

In [24]:
# Make the directory two levels above the current working directory importable,
# so that packages living there can be imported by later cells.
parent_dir = os.path.abspath(os.path.join(os.getcwd(), "..", ".."))
# Guard the append so re-running this cell does not pile up duplicate entries.
if parent_dir not in sys.path:
    sys.path.append(parent_dir)

In [25]:
from evaluation.utils.metrics import hard_accuracy, soft_accuracy, hard_accuracy_for_label

### Auxiliary methods

In [26]:
def get_random_color_scheme() -> px.colors.qualitative:
    color_schemes = [
        # px.colors.qualitative.Plotly,
        # px.colors.qualitative.D3,
        px.colors.qualitative.T10,
        # px.colors.qualitative.Dark24,
        px.colors.qualitative.Set1,
        px.colors.qualitative.Set2,
        px.colors.qualitative.Set3,
        px.colors.qualitative.Pastel,
        # px.colors.qualitative.Pastel1,
        # px.colors.qualitative.Pastel2,
        # px.colors.qualitative.Bold,
        px.colors.qualitative.Safe,
        # px.colors.qualitative.Prism,
        # px.colors.qualitative.Antique,
    ]

    # Randomly select a color scheme
    return random.choice(color_schemes)

In [27]:
def extract_model_name(filename) -> tuple[str, bool]:
    match = re.match(r"(en_)?([a-zA-Z0-9.-]+)", filename)
    if match:
        prefix, model_name = match.groups()
        return model_name, prefix is not None
    return None

In [28]:
def plot_model_scores(
    df: pd.DataFrame,
    score_col: str,
    model_col: str = "model_name",
    color_col: str = "Language",
    title: str | None = None,
) -> None:
    """Plot a Plotly bar chart grouped by model names with different colors for English/Ukrainian models."""
    
    # Randomly choose a color scheme
    colors = get_random_color_scheme()
    unique_categories = df[color_col].dropna().unique()
    color_map = {category: colors[i % len(colors)] for i, category in enumerate(unique_categories)}

    fig = px.bar(
        df,
        x=model_col,
        y=score_col,
        color=color_col,
        barmode="group",
        title=title if title else score_col.replace("_", " ").title(),
        labels={score_col: "Score", model_col: "Model"},
        color_discrete_map=color_map,
        hover_data={score_col: True, color_col: True},
    )

    fig.update_layout(
        xaxis=dict(title="Model Name", tickfont=dict(size=12)),
        yaxis=dict(title="Score", gridcolor="rgba(0, 0, 0, 0.1)", range=[0, 1.1]),
        legend=dict(title=color_col.title()),
        plot_bgcolor="white",
        paper_bgcolor="white",
        font=dict(color="black"),
        title=dict(font=dict(size=18), x=0.5),
        margin=dict(l=40, r=40, t=50, b=80),
        width=800,
        height=500,
    )

    fig.show()

In [42]:
def process_csv_files(folder_paths: list[str], output_csv: str) -> pd.DataFrame:
    """Compute accuracy metrics for every CSV in the given folders and save a summary.

    Each ``*.csv`` file is read into a DataFrame and scored with
    ``hard_accuracy``, ``hard_accuracy_for_label`` and ``soft_accuracy``.
    The model name and language come from the file name via
    ``extract_model_name``. The translation label is derived from the folder
    name: folders ending in ``_deepl`` are "DeepL", all others "Dragoman";
    English files get no translation label.

    Args:
        folder_paths: Folders to scan (non-recursively) for ``.csv`` files.
        output_csv: Path the summary table is written to (without the index).

    Returns:
        One row per processed file, sorted by model name, language and
        translation. Empty (but with all columns present) when no CSV files
        are found.

    Raises:
        ValueError: If a CSV file name cannot be parsed into a model name.
    """
    columns = [
        "model_name",
        "hard_accuracy",
        "bad_hard_accuracy",
        "soft_accuracy",
        "Translation",
        "Language",
    ]
    results = []

    for folder_path in folder_paths:
        # normpath drops a trailing separator so ".../name_deepl/" is still recognised.
        is_deepl = os.path.basename(os.path.normpath(folder_path)).endswith("_deepl")

        # Sorted listing keeps the row order deterministic across filesystems.
        for file in sorted(os.listdir(folder_path)):
            if not file.endswith(".csv"):
                continue

            file_path = os.path.join(folder_path, file)

            # Parse the name before reading so a bad file name fails fast and clearly.
            parsed = extract_model_name(file)
            if parsed is None:
                raise ValueError(f"Cannot extract a model name from file: {file_path}")
            model_name, is_english = parsed

            df = pd.read_csv(file_path)

            results.append(
                {
                    "model_name": model_name,
                    "hard_accuracy": hard_accuracy(df),
                    "bad_hard_accuracy": hard_accuracy_for_label(df),
                    "soft_accuracy": soft_accuracy(df),
                    "Translation": None if is_english else ("DeepL" if is_deepl else "Dragoman"),
                    "Language": "English" if is_english else "Ukrainian",
                }
            )

    # Explicit columns keep the sort (and the CSV header) valid when nothing was found.
    results_df = pd.DataFrame(results, columns=columns).sort_values(
        by=["model_name", "Language", "Translation"]
    )
    results_df.to_csv(output_csv, index=False)

    return results_df

### Results extraction and plotting

In [43]:
# Build the result locations from `parent_dir` (the directory added to sys.path
# earlier in this notebook) instead of a machine-specific absolute path, so the
# notebook runs from any checkout.
# NOTE(review): assumes `parent_dir` is the repository root that contains
# `evaluation/` — confirm against the local layout.
results_root = os.path.join(parent_dir, "evaluation", "results")
folder_path = [
    os.path.join(results_root, "sc_101_care_harm"),
    os.path.join(results_root, "sc_101_care_harm_deepl"),
]
output_csv = os.path.join(results_root, "sc_101_care_harm_results.csv")
results_df = process_csv_files(folder_path, output_csv)

### By language

In [37]:
# Drop the Dragoman rows; rows without a translation label are kept.
df_subset = results_df.loc[results_df["Translation"].ne("Dragoman")]

In [38]:
# Hard accuracy per model, coloured by language (the default colour column).
plot_model_scores(df_subset, score_col="hard_accuracy")

In [39]:
# Soft accuracy per model, coloured by language (the default colour column).
plot_model_scores(df_subset, score_col="soft_accuracy")

In [41]:
# Label-specific hard accuracy, shown under an explicit chart title.
plot_model_scores(
    df_subset,
    score_col="bad_hard_accuracy",
    title="Accuracy for Wrong Social Norms detection",
)

### By translation

In [35]:
# Only rows that carry a translation label take part in this comparison.
df_subset = results_df.loc[results_df["Translation"].notna()]
plot_model_scores(df_subset, score_col="hard_accuracy", color_col="Translation")

In [36]:
# Recompute the translated-only subset so this cell does not depend on the previous one.
df_subset = results_df.loc[results_df["Translation"].notna()]
plot_model_scores(df_subset, score_col="soft_accuracy", color_col="Translation")