# Social Chemistry 101 results analysis

In [1]:
import pandas as pd
import os
import sys
import re
import plotly.express as px

In [2]:
# Put the directory two levels above the current working directory on sys.path.
# NOTE(review): presumably this is the project root that contains the `utils`
# package imported in the next cell -- it depends on where the notebook is run.
parent_dir = os.path.abspath(os.path.join(os.getcwd(), "..", ".."))
sys.path.append(parent_dir)

In [None]:
from utils.metrics import (
    accuracy,
    soft_accuracy,
    accuracy_for_label,
    f1,
    bad_precision_score,
    bad_recall_score,
    get_int_predictions,
    bad_f1
)

### Auxiliary methods

In [5]:
def extract_model_name(filename) -> tuple[str, bool]:
    match = re.match(r"(en_)?([a-zA-Z0-9.-]+)", filename)
    if match:
        prefix, model_name = match.groups()
        return model_name, prefix is not None
    return None

In [19]:
# Human-readable display names keyed by model identifier; identifiers that are
# not listed here are shown unchanged by `plot_model_scores`.
models_name_map = {
    "Llama-3.2-3B-Instruct": "Llama 3.2 3b",
    "Qwen2.5-7B-Instruct": "Qwen 2.5 7b",
    "aya-101": "Aya 101 13b",
    "aya-expanse-8b": "Aya Expanse 8b",
    "gemma-2-9b-it": "Gemma 2 9b",
    "gemma-3-4b-it": "Gemma 3 4b",
    "gpt-4o": "GPT 4o",
}

In [None]:
def plot_model_scores(
    df: pd.DataFrame,
    score_col: str,
    model_col: str = "model_name",
    color_col: str = "Language",
    title: str | None = None,
) -> None:
    """Show a grouped Plotly bar chart of one score column per model.

    Bars are grouped by model (using the display names from `models_name_map`
    when available) and colored by the values of `color_col`.

    Args:
        df: Results table containing `score_col`, `model_col` and `color_col`.
            It is not modified.
        score_col: Name of the column plotted on the y axis.
        model_col: Name of the column holding the raw model identifiers.
        color_col: Name of the column whose values define the bar colors.
        title: Chart title; defaults to `score_col` with underscores replaced
            by spaces and title-cased.
    """
    # Fixed qualitative palette; colors are reused cyclically if there are
    # more categories than palette entries.
    colors = px.colors.qualitative.T10
    unique_categories = df[color_col].dropna().unique()
    color_map = {category: colors[i % len(colors)] for i, category in enumerate(unique_categories)}

    # Add the display column to a copy: callers usually pass a filtered slice,
    # and writing into it would mutate their data and trigger pandas'
    # SettingWithCopyWarning.
    plot_df = df.assign(
        display_model_name=df[model_col].apply(lambda x: models_name_map.get(x, x))
    )

    fig = px.bar(
        plot_df,
        x="display_model_name",
        y=score_col,
        color=color_col,
        barmode="group",
        title=title if title else score_col.replace("_", " ").title(),
        # Label the column that is actually plotted on x as well, so hover text
        # reads "Model" instead of the internal column name.
        labels={score_col: "Score", model_col: "Model", "display_model_name": "Model"},
        color_discrete_map=color_map,
        hover_data={score_col: True, color_col: True},
    )

    fig.update_layout(
        xaxis=dict(title="Model Name", tickfont=dict(size=12)),
        # Upper bound slightly above 1 leaves headroom over full-height bars.
        yaxis=dict(title="Score", gridcolor="rgba(0, 0, 0, 0.1)", range=[0, 1.1]),
        legend=dict(title=color_col.title()),
        plot_bgcolor="white",
        paper_bgcolor="white",
        font=dict(color="black"),
        title=dict(font=dict(size=18), x=0.5),
        margin=dict(l=40, r=40, t=50, b=80),
        width=800,
        height=500,
    )

    fig.show()

In [None]:
def process_csv_files(folder_paths: list[str], output_csv: str) -> pd.DataFrame:
    """Compute metrics for every CSV file in the given folders and save a summary.

    Each ``.csv`` file is loaded, a ``parsed_prediction`` column is added with
    `get_int_predictions`, and the metric helpers are evaluated on it. One
    summary row per file is collected, sorted, written to `output_csv`, and
    returned.

    Args:
        folder_paths: Folders to scan (non-recursively) for ``.csv`` files.
            A folder name ending in ``_deepl`` or ``_claude`` selects the
            corresponding ``Translation`` label; any other folder is labelled
            ``"Dragoman"``. Trailing path separators are ignored.
        output_csv: Path of the summary CSV to write.

    Returns:
        The summary DataFrame, sorted by model name, language and translation.
        It is empty (but has all summary columns) when no CSV file is found.
    """
    # Folder-name suffix -> value recorded in the "Translation" column.
    translation_by_suffix = {
        "_deepl": "DeepL",
        "_claude": "Claude 3.7",
    }
    default_translation = "Dragoman"

    # Explicit column order so that an empty result still has the columns
    # that the sort below (and downstream cells) rely on.
    result_columns = [
        "model_name",
        "hard_accuracy",
        "bad_hard_accuracy",
        "bad_precision",
        "bad_recall",
        "bad_f1_score",
        "f1_score",
        "soft_accuracy",
        "Translation",
        "Language",
    ]

    def get_translation(folder_path, is_english):
        # English runs are not translated, so they carry no translation label.
        if is_english:
            return None
        # Normalize first so "dir_deepl/" is matched like "dir_deepl".
        normalized = os.path.normpath(folder_path)
        for suffix, name in translation_by_suffix.items():
            if normalized.endswith(suffix):
                return name
        return default_translation

    results = []

    for folder_path in folder_paths:
        for file in os.listdir(folder_path):
            if not file.endswith(".csv"):
                continue

            file_path = os.path.join(folder_path, file)
            df = pd.read_csv(file_path)

            # Files whose name contains "llama" use 0 as the fill value for
            # unparsed predictions; every other file uses -1.
            df["parsed_prediction"] = get_int_predictions(df, fill_na=0 if "llama" in file.lower() else -1)

            hard_acc = accuracy(df)
            bad_acc = accuracy_for_label(df)
            soft_acc = soft_accuracy(df)
            bad_prec = bad_precision_score(df)
            bad_rec = bad_recall_score(df)
            bad_f1_score = bad_f1(df)
            f1_score = f1(df)

            model_name, is_english = extract_model_name(file)
            translation = get_translation(folder_path, is_english)

            results.append(
                {
                    "model_name": model_name,
                    "hard_accuracy": hard_acc,
                    "bad_hard_accuracy": bad_acc,
                    "bad_precision": bad_prec,
                    "bad_recall": bad_rec,
                    "bad_f1_score": bad_f1_score,
                    "f1_score": f1_score,
                    "soft_accuracy": soft_acc,
                    "Translation": translation,
                    "Language": "English" if is_english else "Ukrainian",
                }
            )

    results_df = pd.DataFrame(results, columns=result_columns)
    results_df.sort_values(by=["model_name", "Language", "Translation"], inplace=True)
    results_df.to_csv(output_csv, index=False)

    return results_df

### Results extraction

In [None]:
# Result folders to aggregate (a list, despite the singular variable name).
# In process_csv_files the "_deepl" / "_claude" folder suffixes select the
# Translation label; the unsuffixed folder falls back to "Dragoman".
folder_path = [
    "../results/sc_101_care_harm",
    "../results/sc_101_care_harm_deepl",
    "../results/sc_101_care_harm_claude",
]
# Summary CSV written by process_csv_files; results_df is reused by the cells below.
output_csv = "../results/sc_101_care_harm_results.csv"
results_df = process_csv_files(folder_path, output_csv)

## Comparison by Language

In [None]:
# Drop the Dragoman and DeepL rows. What remains are the rows with no
# Translation value (English runs) and the rows with any other label
# ("Claude 3.7" among the labels assigned by process_csv_files).
df_subset = results_df[~results_df['Translation'].isin(["Dragoman", "DeepL"])]

In [None]:
# hard_accuracy per model, colored by the default color column ("Language").
plot_model_scores(df_subset, "hard_accuracy")

In [11]:
# soft_accuracy per model, colored by the default color column ("Language").
plot_model_scores(df_subset, "soft_accuracy")

In [12]:
# bad_hard_accuracy is the value of accuracy_for_label(df) stored by
# process_csv_files; an explicit title replaces the one derived from the column name.
plot_model_scores(df_subset, "bad_hard_accuracy", title="Accuracy for Wrong Social Norms detection")

In [13]:
# f1_score per model, colored by the default color column ("Language").
plot_model_scores(df_subset, "f1_score")

## Comparison by Translation

In [21]:
# Model identifiers kept for the per-translation F1 comparison below.
models_list = [
    "Llama-3.2-3B-Instruct",
    "Qwen2.5-7B-Instruct",
    "aya-expanse-8b",
    "gemma-2-9b-it",
]

In [24]:
# Keep only rows that have a Translation value (process_csv_files sets it to
# None for English runs), then restrict to the models in models_list.
df_subset = results_df[results_df.Translation.notna()]
df_subset = df_subset[df_subset.model_name.isin(models_list)]
# Color bars by translation label instead of by language.
plot_model_scores(df_subset, "f1_score", color_col="Translation", title="F1 Score By Model and Translation")

In [15]:
# Same Translation filter as the previous cell, but without restricting the models.
df_subset = results_df[results_df.Translation.notna()]
plot_model_scores(df_subset, "soft_accuracy",color_col="Translation")

## Label prediction analysis

In [16]:
# Show up to 255 characters per cell before pandas truncates the displayed text.
pd.set_option('display.max_colwidth', 255)

In [None]:
# Print how often each parsed prediction value occurs in every results CSV
# of the base (unsuffixed) results folder; unparsed predictions are filled with -1.
results_dir = '../results/sc_101_care_harm'
for file in os.listdir(results_dir):
    if not file.endswith(".csv"):
        continue

    file_path = os.path.join(results_dir, file)
    df = pd.read_csv(file_path)
    df["parsed_prediction"] = get_int_predictions(df, fill_na=-1)

    print(f"\n\nFile: {file}")
    print(df["parsed_prediction"].value_counts())



File: aya-expanse-8b_sc_101_care_harm_2025-03-06T22:51:35.csv
parsed_prediction
2    2259
0    1396
1      27
Name: count, dtype: int64


File: en_gemma-2-9b-it_2025-03-20T18:49:22.csv
parsed_prediction
2    2034
0    1267
1     381
Name: count, dtype: int64


File: en_Qwen2.5-7B-Instruct_2025-03-20T19:15:41.csv
parsed_prediction
 2    1913
 0    1493
 1     275
-1       1
Name: count, dtype: int64


File: Llama-3.2-3B-Instruct_sc_101_care_harm_2025-03-06T23:03:15.csv
parsed_prediction
 2    2386
 1    1212
 0      59
-1      25
Name: count, dtype: int64


File: en_aya-expanse-8b_2025-03-20T18:26:01.csv
parsed_prediction
2    2259
0    1388
1      35
Name: count, dtype: int64


File: gemma-2-9b-it_sc_101_care_harm_2025-03-06T22:23:29.csv
parsed_prediction
2    1813
0    1267
1     602
Name: count, dtype: int64


File: Qwen2.5-7B-Instruct_sc_101_care_harm_2025-03-06T23:39:19.csv
parsed_prediction
 2    1476
 0    1418
 1     542
-1     240
 3       1
Name: count, dtype: int64


File: 