Evaluating models trained on v2.1 on v2.1 and v2.4

In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
# imports
import re
from copy import deepcopy
from pathlib import Path
from typing import *

import pandas as pd
import srsly
import swifter
import wandb
from datasets import load_from_disk
from hydra.utils import instantiate
from omegaconf import OmegaConf
from tqdm.auto import tqdm

pd.set_option("display.max_colwidth", None)
pd.set_option("display.max_columns", None)
import matplotlib.pyplot as plt
import seaborn as sns
from matplotlib import ticker
from polyfuzz import PolyFuzz
from polyfuzz.models import TFIDF, EditDistance
from rapidfuzz import fuzz
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer

from src.data.utilities import (
    check_dict_equal,
    clean_slot_values,
    clean_time,
    complement_labels,
    diff_train,
    extract_domains,
    extract_slots,
    extract_values,
    remove_empty_slots,
)
from src.evaluation import compute_prf, diff, jga, prepare_states_eval, slot_metrics
from src.model import DSTTask

In [3]:
list_dfs = []
path = Path("../preds")

for exp_path in list(path.rglob("*experiment_2")):

    for p in tqdm(list(exp_path.rglob("*preds.parquet")), desc=exp_path.name):

        # load
        df = pd.read_parquet(p).assign(
            split=p.parents[0].name,
            epoch=int(p.parents[1].name.split("=")[1]),
            version=p.parents[2].name,
            model=p.parents[4].name,
            size=p.parents[5].name,
        )

        # extract slots from experiments with normal states
        df = df.assign(
            states=lambda df_: df_["states"].map(remove_empty_slots),
            previous_states=lambda df_: df_["previous_states"].map(remove_empty_slots),
        )

        list_dfs.append(df)

experiment_2:   0%|          | 0/1385 [00:00<?, ?it/s]

In [4]:
df = pd.concat(list_dfs).reset_index(drop=True)

In [5]:
somdst = df.loc[df["model"].str.contains("(_mwoz21_ops)|(somdst)")]
df = df.loc[~df["model"].str.contains("(_mwoz21_ops)|(somdst)")]

  somdst = df.loc[df["model"].str.contains("(_mwoz21_ops)|(somdst)")]
  df = df.loc[~df["model"].str.contains("(_mwoz21_ops)|(somdst)")]


In [6]:
budget = df.loc[
    df["model"].isin(["mwoz21_ops_nohist+prev_2022-12-23T15-49-06", "mwoz21_ops_nohist+prev_2022-12-23T15-50-58"])
]
df = df.loc[
    ~df["model"].isin(["mwoz21_ops_nohist+prev_2022-12-23T15-49-06", "mwoz21_ops_nohist+prev_2022-12-23T15-50-58"])
]

In [7]:
df.groupby(["model", "split", "version"])[["epoch", "dialogue_id"]].nunique()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,epoch,dialogue_id
model,split,version,Unnamed: 3_level_1,Unnamed: 4_level_1
mwoz21_cum_fullhist+nostate_2022-12-15T18-44-50,test,v0,20,1000
mwoz21_cum_fullhist+nostate_2022-12-15T18-44-50,validation,v0,20,1000
mwoz21_cum_fullhist+nostate_2022-12-19T15-26-55,test,v0,20,1000
mwoz21_cum_fullhist+nostate_2022-12-19T15-26-55,validation,v0,20,1000
mwoz21_cum_fullhist+nostate_2022-12-19T22-28-29,test,v0,20,1000
mwoz21_cum_fullhist+nostate_2022-12-19T22-28-29,validation,v0,20,1000
mwoz21_cum_fullhist+nostate_2022-12-20T05-34-00,test,v0,20,1000
mwoz21_cum_fullhist+nostate_2022-12-20T05-34-00,validation,v0,19,1000
mwoz21_cum_fullhist+nostate_2022-12-20T12-43-11,test,v0,19,1000
mwoz21_cum_fullhist+nostate_2022-12-20T12-43-11,validation,v0,20,1000


In [8]:
df["states"] = df["states"].swifter.apply(prepare_states_eval)

Pandas Apply:   0%|          | 0/10030854 [00:00<?, ?it/s]

In [None]:
# load gold annotations
list_dfs = []
for v in ("21", "22", "23", "24"):
    dataset_dict = load_from_disk(f"../data/processed/multiwoz_{v}")

    true_df = (
        pd.concat([dataset_dict[split].to_pandas() for split in ("test", "validation")])
        .sort_values(["dialogue_id", "turn_id"])
        .reset_index(drop=True)
        .assign(
            states=lambda df_: df_[f"states"].map(lambda ex: prepare_states_eval(remove_empty_slots(ex))), version=v
        )
    )

    list_dfs.append(true_df)

true_df = pd.concat(list_dfs).reset_index(drop=True)
print(len(true_df))
true_df = true_df.loc[true_df["usr_utt"] != "none", ["dialogue_id", "turn_id", "states", "version"]]
print(len(true_df))
true_df = true_df.set_index(["dialogue_id", "turn_id", "version"]).unstack(-1)
true_df.columns = [f"{i}_{j}" for i, j in true_df.columns]
true_df = true_df.reset_index()

In [None]:
regexs = []
for v in ["guest house", "swimming pool", "night club", "concert hall"]:
    regexs += [
        (re.compile(v, flags=re.IGNORECASE), v.replace(" ", "")),
        (re.compile(v.replace(" ", ""), flags=re.IGNORECASE), v),
    ]

for v in [("theater", "theatre"), ("center", "centre")]:
    regexs += [
        (re.compile(v[0], flags=re.IGNORECASE), v[1]),
        (re.compile(v[1], flags=re.IGNORECASE), v[0]),
    ]

regexs.append((re.compile("star", flags=re.IGNORECASE), ""))


def add_variations(state: Union[Dict, None], regexs) -> Union[Dict, None]:
    if state is None:
        return None

    new_state = deepcopy(state)
    for k, v_list in new_state.items():
        new_v_list = []
        for v in v_list:
            for pat, sub in regexs:
                new_v_list.append(pat.sub(sub, v).strip())

                if v.startswith("the"):
                    new_v_list.append(v.lstrip("the").strip())

        new_state[k] = list(set(new_v_list))

    return new_state

In [None]:
for v in ("21", "22", "23", "24"):
    true_df[f"new_states_{v}"] = true_df[f"states_{v}"].map(lambda ex: add_variations(ex, regexs))

In [None]:
# add gold annotations
pred_df = pd.merge(
    df,
    true_df,
    on=["dialogue_id", "turn_id"],
    how="inner",
)
# assert len(df) == len(pred_df)

In [None]:
# compute correct predictions
for v in ("21", "22", "23", "24"):
    pred_df[f"jga_turn_{v}"] = pred_df.swifter.apply(
        lambda row: jga(row["states"], row[f"states_{v}"]),
        axis=1,
    )

    pred_df[f"new_jga_turn_{v}"] = pred_df.swifter.apply(
        lambda row: jga(row["states"], row[f"new_states_{v}"]),
        axis=1,
    )

In [None]:
eval_cols = []
for v in ("21", "22", "23", "24"):
    eval_cols += [f"jga_turn_{v}", f"new_jga_turn_{v}"]
index_cols = ["model", "version", "split", "size", "epoch"]

In [None]:
results = pred_df.groupby(index_cols)[eval_cols].mean()

In [None]:
validation_results = []
for col in eval_cols:
    val_best_ids = results.groupby(index_cols[:-1])[col].idxmax().reset_index()
    val_best_ids = val_best_ids.loc[val_best_ids["split"] == "validation"].assign(
        best_ckpt=lambda df_: df_[col].map(lambda ex: (ex[0], ex[1], ex[3], ex[4]))
    )
    dd = results.loc[results.index.droplevel("split").isin(val_best_ids["best_ckpt"]), [col]].assign(
        metric_name=col, metric=lambda df_: df_[col] * 100
    )[["metric_name", "metric"]]

    validation_results.append(dd)


validation_results = pd.concat(validation_results).reset_index()
validation_results["exp"] = validation_results["model"].str.split("_").map(lambda ex: f"{ex[1]}_{ex[2]}")
validation_results = (
    validation_results.groupby(index_cols[1:-1] + ["exp", "metric_name"])["metric"].describe().reset_index()
)

In [None]:
validation_results_gold = validation_results.loc[(validation_results["version"] == "v4")].copy()

print(
    validation_results_gold.loc[(validation_results_gold["split"] == "test")]
    .drop(columns=["version", "size", "split", "exp", "count"])
    .round(2)
    .to_latex(index=False)
)

In [None]:
all_results = (
    validation_results.loc[(validation_results["split"] == "test") & (validation_results["size"] == "base")]
    .drop(columns=["version", "split", "count"])
    .copy()
    .assign(
        state_repr=lambda df_: df_["exp"]
        .str.split("_", expand=True)[0]
        .map({"cum": "Cumulative", "ops": "Operations"}),
        context=lambda df_: df_["exp"]
        .str.split("_", expand=True)[1]
        .map(
            {
                "fullhist+nostate": "Full-history",
                "fullhist+prev": "Full-history + State",
                "nohist+prev": "State",
                "partialhist+prev": "4 Turns + State",
            }
        ),
        metric_name=lambda df_: df_["metric_name"].map(
            {
                "jga_turn_21": "2.1",
                "new_jga_turn_21": "2.1 (fix labels)",
                "jga_turn_22": "2.2",
                "new_jga_turn_22": "2.2 (fix labels)",
                "jga_turn_23": "2.3",
                "new_jga_turn_23": "2.3 (fix labels)",
                "jga_turn_24": "2.4",
                "new_jga_turn_24": "2.4 (fix labels)",
            }
        ),
    )[["state_repr", "context", "metric_name", "mean", "std", "min", "25%", "50%", "75%", "max"]]
    .rename(
        columns={
            "state_repr": "State representation",
            "context": "Context",
            "size": "Model size",
            "metric_name": "Dataset version",
        }
    )
    .set_index(["State representation", "Context", "Dataset version"])
    .round(2)
)

In [None]:
# full table
print(all_results.reset_index().iloc[:, 1:].to_latex(index=False))

In [None]:
validation_results.loc[
    (validation_results["split"] == "test") & (validation_results["metric_name"] == "new_jga_turn_24")
]

In [None]:
validation_results.loc[validation_results.version == "v3"]

In [None]:
exp1_table = all_results.reset_index()
exp1_table = exp1_table.loc[exp1_table["Dataset version"].str.contains("fix")].assign(
    **{
        "Dataset version": lambda df_: df_["Dataset version"].str.rstrip("(fix labels)").str.strip(),
    }
)
exp1_table

---
### Run-time

In [None]:
tmp = pred_df["model"].str.split("_", expand=True)
pred_df["state_repr"] = tmp[1].str.strip()
pred_df["context"] = tmp[2].str.strip()
del tmp

In [None]:
pred_df["size"] = "base"
large = [p.name for p in Path("../preds/experiment_1/large/").iterdir()]
pred_df.loc[pred_df["model"].isin(large), "size"] = "large"

In [None]:
pred_df["runtime_instance"] = pred_df["runtime"] / pred_df["batch_size"]

In [None]:
t = pred_df.loc[pred_df["size"] == "base"]
rt = t.groupby(["state_repr", "context"])["runtime_instance"].agg(["median", "std"])

In [None]:
rt["relative"] = rt["median"] / rt["median"].min()

In [None]:
print(
    rt[["median", "relative"]]
    .reset_index()
    .assign(
        state_repr=lambda df_: df_["state_repr"].map({"cum": "Cumulative", "ops": "State operations"}),
        median=lambda df_: df_["median"] * 100,
    )
    .round(2)
    .to_latex(index=False)
)

---

In [None]:
def f(ex):
    if ex is None:
        return None

    new_state = {k: v for k, v in ex.items() if v != "none"}

    if len(new_state) < 1:
        return None

    return new_state

In [None]:
somdst["states"] = somdst["states"].map(f)
somdst["previous_states"] = somdst["previous_states"].map(f)

In [None]:
# add gold annotations
pred_df_somdst = pd.merge(
    somdst,
    true_df,
    on=["dialogue_id", "turn_id"],
    how="inner",
)
# assert len(df) == len(pred_df)

In [None]:
# compute correct predictions
for v in ("21", "22", "23", "24"):
    pred_df_somdst[f"jga_turn_{v}"] = pred_df_somdst.swifter.apply(
        lambda row: jga(row["states"], row[f"states_{v}"]),
        axis=1,
    )

    pred_df_somdst[f"new_jga_turn_{v}"] = pred_df_somdst.swifter.apply(
        lambda row: jga(row["states"], row[f"new_states_{v}"]),
        axis=1,
    )

In [None]:
s = pred_df_somdst.loc[pred_df_somdst["model"].str.contains("(somdst)")]

In [None]:
results_somdst = s.groupby(["model", "epoch", "split"])[eval_cols].mean()

In [None]:
val_best_ids

In [None]:
validation_results_somdst = []
for col in eval_cols:
    val_best_ids = results_somdst.groupby(["model", "split"])[col].idxmax().reset_index()
    val_best_ids = val_best_ids.loc[val_best_ids["split"] == "validation"].assign(
        best_ckpt=lambda df_: df_[col].map(lambda ex: (ex[0], ex[1]))
    )
    dd = results_somdst.loc[results_somdst.index.droplevel("split").isin(val_best_ids["best_ckpt"]), [col]].assign(
        metric_name=col, metric=lambda df_: df_[col] * 100
    )[["metric_name", "metric"]]

    validation_results_somdst.append(dd)


validation_results_somdst = pd.concat(validation_results_somdst).reset_index()
# validation_results_somdst["exp"] = validation_results_somdst["model"].str.split("_").map(lambda ex: f"{ex[1]}_{ex[2]}")
# validation_results_somdst = validation_results_somdst.groupby(["split"] + ["exp", "metric_name"])["metric"].describe().reset_index()

In [None]:
validation_results_somdst.loc[validation_results_somdst["split"] == "test"].sort_values("model")

In [None]:
results_somdst.loc[
    (results_somdst.index.get_level_values("model") == "mwoz21_ops_somdst_2022-12-21T23-48-52")
    & (results_somdst.index.get_level_values("split") == "test")
]

In [None]:
a = s.groupby(["model", "split"])[eval_cols].describe().stack(0)

In [None]:
a = a.loc[a.index.get_level_values("split") == "test", ["mean", "std", "min", "25%", "50%", "75%", "max"]] * 100

In [None]:
a.round(2)

---

In [None]:
results

In [None]:
val = []
for col in eval_cols:
    val_best_ids = results.groupby(index_cols[:-1])[col].idxmax().reset_index()
    val_best_ids = val_best_ids.loc[val_best_ids["split"] == "validation"].assign(
        best_ckpt=lambda df_: df_[col].map(lambda ex: (ex[0], ex[1], ex[3], ex[4]))
    )
    dd = results.loc[results.index.droplevel("split").isin(val_best_ids["best_ckpt"]), [col]].assign(
        metric_name=col, metric=lambda df_: df_[col] * 100
    )[["metric_name", "metric"]]

    val.append(dd)


val = pd.concat(val).reset_index()

In [None]:
val = val.loc[val["metric_name"] == "new_jga_turn_24"]

In [None]:
ids = (
    val.loc[(val["model"].str.contains("nohist")) & (val["split"] == "test"), ["model", "epoch", "version", "split"]]
    .drop_duplicates()
    .apply(lambda row: f"{row['model']}_{row['epoch']}_{row['version']}_{row['split']}", axis=1)
    .tolist()
)

In [None]:
check = pred_df.loc[pred_df["model"].str.contains("nohist")]

In [None]:
check = check.loc[
    check.apply(lambda row: f"{row['model']}_{row['epoch']}_{row['version']}_{row['split']}", axis=1).isin(ids)
]

In [None]:
dd = check.loc[
    check["split"] == "test",
    ["version", "model", "dialogue_id", "turn_id", "states", "predictions", "epoch", "new_jga_turn_24", "input_text"],
]
dd = dd.set_index(["model", "dialogue_id", "turn_id", "version"]).unstack(-1)
dd.columns = [f"{i}_{j}" for i, j in dd.columns]

In [None]:
n, m = "new_jga_turn_24_v1", "new_jga_turn_24_v3"

In [None]:
# dd.loc[(dd[n] != dd[m])]

In [None]:
val = val.loc[(val["model"].str.contains("nohist")) & (val["version"] == "v1") & (val["split"] == "test")]

In [None]:
ids = (
    val.loc[:, ["model", "epoch", "version", "split"]]
    .drop_duplicates()
    .apply(lambda row: f"{row['model']}_{row['epoch']}_{row['split']}", axis=1)
    .tolist()
)

In [None]:
check_2 = pred_df.loc[(pred_df["model"].str.contains("nohist")) & (pred_df["split"] == "test")]

In [None]:
check_2 = check_2.loc[check_2.apply(lambda row: f"{row['model']}_{row['epoch']}_{row['split']}", axis=1).isin(ids)]

In [None]:
dd = check_2.loc[
    check_2["split"] == "test",
    [
        "version",
        "model",
        "dialogue_id",
        "turn_id",
        "states",
        "states_24",
        "predictions",
        "epoch",
        "new_jga_turn_24",
        "previous_states",
    ],
]
dd = dd.set_index(["model", "dialogue_id", "turn_id", "version"]).unstack(-1)
dd.columns = [f"{i}_{j}" for i, j in dd.columns]
del dd["states_24_v1"]
dd = dd.rename(columns={"states_24_v3": "states_true"})

In [None]:
dd.loc[(dd[n] == True) & (dd[m] == False)].head(1)

In [None]:
dd.loc[(dd[n] == True) & (dd[m] == False)].iloc[0, -1]

In [None]:
true_df.loc[(true_df["dialogue_id"] == "MUL0014.json") & (true_df["turn_id"] == 5)].iloc[-1, -8]

In [None]:
dd[n].where(dd[n] == False, dd[m]).mean()