In [9]:
import warnings
warnings.filterwarnings("ignore")

import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns; sns.set()

from sklearn.metrics import f1_score, precision_recall_fscore_support

In [6]:
test_df = pd.read_csv("../data/mtob_domain_en2fr_nllb_test.csv")
test_df = test_df.rename({"label": "true_label"}, axis=1)
test_df.head(1)

Unnamed: 0,id,text_en,text_fr,true_label,label_text,text_fr2en
0,3139393736393839,Show me dates for music festivals in 2018,Montre-moi les dates des festivals de musique ...,2,event,Show me the dates of the music festivals in 2018


In [32]:
preds_paths = {
    "base1_mostfreq": "../outputs/baselines/base1_mostfreq/test_preds.csv",
    "base2_stratified": "../outputs/baselines/base2_stratified/test_preds.csv",
    "base3_keywords": "../outputs/baselines/base3_keywords/test_preds.csv",
    "mlp_en_tt": "../outputs/en_only/mlp/translate_test_preds.csv",
    "roberta_en_tt": "../outputs/en_only/roberta-1/translate_test_preds.csv", 
    "xlm-roberta_en_tt": "../outputs/en_only/xlm-en-1/translate_test_preds.csv",
    "xlm-roberta_en": "../outputs/en_only/xlm-en-1/test_preds.csv",
    "camembert_fr": "../outputs/en_and_fr/camembert-1/test_preds.csv",
    "xlm-roberta_fr": "../outputs/en_and_fr/xlmr-fr-1/test_preds.csv",
    "gpt3.5_prompt": "../outputs/zero_shot_prompting/gpt-3.5/test_preds.csv"
}

In [33]:
preds_dfs = {}
for model, preds_path in preds_paths.items():
    preds_df = pd.read_csv(preds_path).rename({"label": "pred_label"}, axis=1)
    preds_df = test_df.merge(preds_df, left_on="id", right_on="id")
    preds_dfs[model] = preds_df.copy()

In [34]:
scores = []
for model, preds_df in preds_dfs.items():
    true_labels = preds_df["true_label"].values
    pred_labels = preds_df["pred_label"].values
    precision, recall, micro_f1, _ = precision_recall_fscore_support(
        true_labels, pred_labels, average="micro")
    macro_f1 = f1_score(true_labels, pred_labels, average="macro")
    weighted_f1 = f1_score(true_labels, pred_labels, average="weighted")
    scores.append([model, micro_f1, precision, recall, macro_f1, weighted_f1])
    
scores_df = pd.DataFrame(scores, columns=["Model", "F1", "Precision", "Recall", "Macro-F1", "Weighted-F1"])

In [35]:
scores_df

Unnamed: 0,Model,F1,Precision,Recall,Macro-F1,Weighted-F1
0,base1_mostfreq,0.172565,0.172565,0.172565,0.026758,0.050792
1,base2_stratified,0.111181,0.111181,0.111181,0.096764,0.111979
2,base3_keywords,0.56342,0.56342,0.56342,0.560178,0.668834
3,mlp_en_tt,0.928281,0.928281,0.928281,0.93005,0.928249
4,roberta_en_tt,0.947698,0.947698,0.947698,0.95295,0.947005
5,xlm-roberta_en_tt,0.94989,0.94989,0.94989,0.954502,0.949396
6,xlm-roberta_en,0.918572,0.918572,0.918572,0.914246,0.917564
7,camembert_fr,0.981835,0.981835,0.981835,0.980011,0.981803
8,xlm-roberta_fr,0.982148,0.982148,0.982148,0.980287,0.982144
9,gpt3.5_prompt,0.780144,0.780144,0.780144,0.751534,0.827613


In [40]:
pdf = preds_dfs["gpt3.5_prompt"]
pdf[pdf.pred_label < 0].label_text.value_counts()

label_text
people       124
weather       45
calling       37
recipes       34
reminder      31
music         27
messaging     22
news          15
alarm         11
timer          8
event          7
Name: count, dtype: int64