In [None]:
import json
from pathlib import Path

import numpy as np
import pandas as pd
import seaborn as sns
import pingouin
import matplotlib.pyplot as plt
from scipy.stats import rankdata, levene, shapiro
from scipy.cluster.hierarchy import linkage, dendrogram

%matplotlib inline

In [None]:
# Gather every LOGO result CSV found under ./results/logo/<features>/ into one
# frame indexed by (exp, features, tgt, rep).
logo_frames = []
for csv_path in Path("./results/logo").glob("*/*.csv"):
    # The per-file "fold" column is exposed under the name "tgt".
    frame = pd.read_csv(csv_path).rename(columns={"fold": "tgt"})
    stem = csv_path.stem
    if "__" in stem:
        # File names of the form "<exp>__<tgt>" override/define the target label.
        exp_name, tgt_name = stem.split("__")
        frame = frame.assign(exp=exp_name, tgt=tgt_name)
    else:
        frame = frame.assign(exp=stem)
    # The parent directory name identifies the feature set.
    logo_frames.append(frame.assign(features=csv_path.parts[-2]))
df_logo = (
    pd.concat(logo_frames)
    .set_index(["exp", "features", "tgt", "rep"])
    .sort_index()
)
df_logo

In [None]:
# Gather the pairwise (train on src, test on tgt) result CSVs. Path layout read
# here: ./results/pairwise/<features>/<exp>/<src>__<tgt>.csv
pairwise_frames = []
for csv_path in Path("./results/pairwise").glob("*/*/*.csv"):
    frame = pd.read_csv(csv_path)
    src_name, tgt_name = csv_path.stem.split("__")
    pairwise_frames.append(
        frame.assign(
            src=src_name,
            tgt=tgt_name,
            exp=csv_path.parts[-2],
            features=csv_path.parts[-3],
        )
    )
df_pairwise = (
    pd.concat(pairwise_frames)
    .set_index(["exp", "features", "src", "tgt", "rep", "fold"])
    .sort_index()
)
df_pairwise

In [None]:
# Gather the within-corpus result CSVs. Path layout read here:
# ./results/within/<features>/<exp>/<corpus>.csv
# Source and target are both set to the file stem so the index matches df_pairwise.
within_frames = []
for csv_path in Path("./results/within").glob("*/*/*.csv"):
    frame = pd.read_csv(csv_path)
    corpus = csv_path.stem
    within_frames.append(
        frame.assign(
            src=corpus,
            tgt=corpus,
            exp=csv_path.parts[-2],
            features=csv_path.parts[-3],
        )
    )
df_within = (
    pd.concat(within_frames)
    .set_index(["exp", "features", "src", "tgt", "rep", "fold"])
    .sort_index()
)
df_within

In [None]:
# Mean within-corpus UAR in percent: one row per (exp, tgt), one column per feature set.
within_uar_pct = (
    df_within.groupby(["exp", "features", "tgt"])["uar"]
    .mean()
    .unstack("features")
    * 100
)
print(within_uar_pct.to_string(float_format="{:.1f}".format))

In [None]:
# Mean LOGO UAR in percent: one row per (exp, tgt), one column per feature set.
logo_uar_pct = df_logo.groupby(["exp", "features", "tgt"])["uar"].mean() * 100
print(logo_uar_pct.unstack("features").to_string(float_format="{:.1f}".format))

In [None]:
# Within-corpus minus LOGO performance, averaged per (exp, features) and shown in percent.
metric_cols = ["uar", "war", "macrof1"]
target_keys = ["exp", "features", "tgt"]

# Some within-corpus results are stored under "<corpus>_train"; map those index
# labels to the plain names so they line up with the labels in df_logo.
train_split_aliases = {
    "ESD_en_train": "ESD_en",
    "ESD_zh_train": "ESD_zh",
    "MSP-PODCAST_train": "MSP-PODCAST",
    "CMU-MOSEI_train": "CMU-MOSEI",
}
# Experiments left out of this comparison.
excluded_exps = [
    "esd_cross_lang_train",
    "esd_cross_lang_test",
    "emofilm_cross_lang_5class",
    "venec_cross_country_18class",
]

within_by_tgt = (
    df_within.rename(index=train_split_aliases)
    .groupby(target_keys)[metric_cols]
    .mean(numeric_only=True)
)
logo_by_tgt = df_logo.groupby(target_keys)[metric_cols].mean()

# Subtraction aligns on (exp, features, tgt); labels present on one side only become NaN.
diff = within_by_tgt - logo_by_tgt
diff = (
    diff.drop(index=excluded_exps, level="exp")
    .groupby(["exp", "features"])
    .mean()
    .unstack("features")
)
diff = diff * 100

print(diff.to_string(float_format="{:.1f}".format))


In [None]:
# One heatmap per remaining experiment: per-target mean metrics (in percent) for
# the within, pairwise (when available) and LOGO settings, stacked vertically.
skipped_exps = [
    "cross_group_all_3class",
    "cross_lang_explicit_3class",
    "esd_cross_lang_train",
    "esd_cross_lang_test",
    "emofilm_cross_lang_5class",
    "venec_cross_country_18class",
]
plotted_exps = df_logo.drop(index=skipped_exps, level="exp").index.unique("exp")
pairwise_exps = df_pairwise.index.unique("exp")

for exp in plotted_exps:
    # Columns containing any NaN for this experiment are dropped, so only the
    # per-class F1 columns that are fully populated remain.
    sub_df = df_logo.loc[exp, "wav2vec2_audeering_ft_c_mean"].dropna(axis=1)
    per_class_f1 = [x for x in sub_df.columns if x.endswith("_f1")]
    classes = ["uar", "war", "macrof1"] + per_class_f1

    mean_within = (
        df_within.loc[exp, "wav2vec2_audeering_ft_c_mean"]
        .groupby("tgt")[classes]
        .mean()
    )
    mean_logo = sub_df.groupby("tgt")[classes].mean()

    tables = [mean_within]
    labels = ["Within"]
    if exp in pairwise_exps:
        mean_pair = (
            df_pairwise.loc[exp, "wav2vec2_audeering_ft_c_mean"]
            .groupby("tgt")[classes]
            .mean()
        )
        tables.append(mean_pair)
        labels.append("Pairwise")
    tables.append(mean_logo)
    labels.append("LOGO")
    df = pd.concat(tables, keys=labels)

    # Figure size scales with the number of metric columns and table rows.
    fig, ax = plt.subplots(figsize=(len(classes) / 1.8, len(df) / 4))
    sns.heatmap(df * 100, cmap="Blues", vmin=0, vmax=100, annot=True, fmt=".1f", ax=ax)
    ax.set(ylabel=None)
    plt.show()


## All groups, pairwise

In [None]:
# Source x target heatmap of the mean pairwise UAR (in percent), drawn once per
# feature set. The colour scale starts at 33.
for feature_set in ("wav2vec_c_mean", "wav2vec2_audeering_ft_c_mean"):
    mean_df = (
        df_pairwise.loc["cross_group_all_3class", feature_set]
        .groupby(["src", "tgt"])["uar"]
        .mean()
        .unstack("tgt")
    )
    n_sources = len(mean_df)
    fig, ax = plt.subplots(figsize=(n_sources / 3, n_sources / 4))
    sns.heatmap(
        mean_df * 100,
        cmap="Blues",
        vmin=33,
        vmax=100,
        annot=True,
        fmt=".0f",
        cbar_kws={"aspect": 50, "pad": 0.02},
        ax=ax,
    )
    ax.set(xlabel="Test/Target", ylabel="Train/Source")
    ax.set_xticklabels(mean_df.columns, rotation=45, rotation_mode="anchor", ha="right")
    plt.show()

In [None]:
# Mean and standard deviation (in percent) of the per-class F1 scores for the
# "cross_group_all_3class" pairwise experiment, one row per feature set.
# Select the three F1 columns first and scale only those, rather than
# multiplying every column of df_pairwise by 100 before slicing.
emotion_f1_cols = ["anger_f1", "happiness_f1", "sadness_f1"]
(
    df_pairwise.loc["cross_group_all_3class"][emotion_f1_cols]
    .mul(100)
    .groupby("features")
    .agg(["mean", "std"])
)

In [None]:
# Per-corpus mean UAR for the "cross_group_all_3class" experiment with the
# "wav2vec2_audeering_ft_c_mean" features, under four views:
#   within - within-corpus results, grouped by target
#   logo   - LOGO results, grouped by target
#   src    - pairwise results, grouped by source corpus
#   tgt    - pairwise results, grouped by target corpus
selection = ("cross_group_all_3class", "wav2vec2_audeering_ft_c_mean")
pairwise_selected = df_pairwise.loc[selection]

logo_mean = df_logo.loc[selection].groupby("tgt")["uar"].mean()
# "<corpus>_train" index labels are mapped to the plain corpus names before grouping.
within_mean = (
    df_within.loc[selection]
    .rename(
        index={
            "ESD_en_train": "ESD_en",
            "ESD_zh_train": "ESD_zh",
            "MSP-PODCAST_train": "MSP-PODCAST",
            "CMU-MOSEI_train": "CMU-MOSEI",
        }
    )
    .groupby("tgt")["uar"]
    .mean()
)
src_mean = pairwise_selected.groupby("src")["uar"].mean()
tgt_mean = pairwise_selected.groupby("tgt")["uar"].mean()

summary_pct = (
    pd.concat(
        [within_mean, logo_mean, src_mean, tgt_mean],
        axis=1,
        keys=["within", "logo", "src", "tgt"],
    )
    * 100
)
print(summary_pct.style.format(precision=1).to_latex())

In [None]:
# Corpus names grouped under the labels "acted", "induced" and "natural".
acted = [
    "AESDD", "ASED", "CaFE", "CREMA-D", "EMO-DB", "EmoFilm_en", "EmoFilm_es",
    "EmoFilm_it", "EMOVO", "eNTERFACE", "ESD_en", "ESD_zh", "JL", "MESD", "MESS",
    "Portuguese", "RAVDESS", "SAVEE", "SUBESCO", "TESS", "URDU", "VENEC_au",
    "VENEC_in", "VENEC_ke", "VENEC_sg", "VENEC_us",
]
induced = ["DEMoS", "EESC", "IEMOCAP", "MELD", "MSP-IMPROV", "ShEMO"]
natural = ["CMU-MOSEI", "MSP-PODCAST", "SmartKom"]

# Mean and standard error of the per-source mean UAR within each group.
for corpus_group in (acted, induced, natural):
    print(src_mean.loc[corpus_group].agg(["mean", "sem"]))

# One-sided t-tests: is the first group's per-source mean UAR greater than the second's?
for first_group, second_group in ((acted, induced), (induced, natural), (acted, natural)):
    ttest_result = pingouin.ttest(
        src_mean.loc[first_group],
        src_mean.loc[second_group],
        alternative="greater",
    )
    print(ttest_result.to_string())

### Pairwise - within differences

In [None]:
# all_within = df_within.loc["cross_group_all_3class", "wav2vec2_audeering_ft_c_mean"].rename(
#         index={"ESD_en_train": "ESD_en", "ESD_zh_train": "ESD_zh", "MSP-PODCAST_train": "MSP-PODCAST", "CMU-MOSEI_train": "CMU-MOSEI"}
#     ).groupby("tgt")[["uar", "war", "macrof1", "anger_f1", "happiness_f1", "sadness_f1"]].mean()
# all_pair = df_pairwise.loc["cross_group_all_3class", "wav2vec2_audeering_ft_c_mean"].groupby("tgt")[["uar", "war", "macrof1", "anger_f1", "happiness_f1", "sadness_f1"]].mean()
# all_logo = df_logo.loc["cross_group_all_3class", "wav2vec2_audeering_ft_c_mean"].groupby("tgt")[["uar", "war", "macrof1", "anger_f1", "happiness_f1", "sadness_f1"]].mean()

# Per-corpus gap between the mean pairwise UAR (corpus used as target) and the
# within-corpus UAR. The subtraction aligns on the corpus label.
diff = tgt_mean - within_mean
# print((diff * 100).to_string(float_format="{:.1f}".format))
# print((diff * 100).mean().to_string(float_format="{:.1f}".format))

# The two Series come from different frames. Align them on their shared corpus
# labels before correlating, so the values are paired by label in the same way
# as the subtraction above rather than by position.
tgt_aligned, within_aligned = tgt_mean.align(within_mean, join="inner")
print(pingouin.corr(tgt_aligned, within_aligned))

# Mean and standard error of the gap within each corpus group.
for corpus_group in (acted, induced, natural):
    print(diff.loc[corpus_group].agg(["mean", "sem"]))

# Two-sided t-tests comparing the gap between each pair of corpus groups.
for first_group, second_group in ((acted, induced), (induced, natural), (acted, natural)):
    ttest_result = pingouin.ttest(
        diff.loc[first_group],
        diff.loc[second_group],
        alternative="two-sided",
    )
    print(ttest_result.to_string())

### UAR correlation and similarity

In [None]:
# Corpus similarity derived from the pairwise UAR table, followed by average-linkage
# clustering and a distance heatmap ordered by the dendrogram leaves.

# mean_df = df_pairwise.loc["cross_group_all_3class", "wav2vec_c_mean"].groupby(["src", "tgt"])["uar"].mean().unstack("tgt")
pair_table = (
    df_pairwise.loc["cross_group_all_3class", "wav2vec2_audeering_ft_c_mean"]
    .groupby(["src", "tgt"])["uar"]
    .mean()
    .unstack("tgt")
)
# Symmetrised table: average of the src->tgt and tgt->src UAR.
mean_train_test = (pair_table + pair_table.T) / 2
avg_corr = (pair_table.corr(method="pearson") + pair_table.T.corr(method="pearson")) / 2

# fig, ax = plt.subplots(figsize=(5, 4))
# sns.scatterplot(x=pair_table["EMO-DB"], y=pair_table["MESS"], ax=ax)
# plt.show()

# Similarity: the symmetrised UAR mapped linearly from [0.33, 1] onto [0, 1]
# (np.interp clamps values outside that input range to the end points).
sim = pair_table.copy()
# sim = avg_corr
# sim = mean_train_test.corr(method="pearson")
sim.iloc[:, :] = np.interp(mean_train_test, [0.33, 1], [0, 1])

# Strict upper triangle (k=1), i.e. every unordered corpus pair exactly once.
upper = np.triu_indices_from(sim, 1)
print(pingouin.corr(sim.to_numpy()[upper], pair_table.to_numpy()[upper]))

# print(sim.apply(lambda x: list(x.sort_values().items())[-2]).T.to_string())

dist = 1 - sim
# dist = np.arccos(sim) / np.pi

# The dendrogram is drawn on the current figure created here.
plt.figure(figsize=(10, 5))
# linkage receives the upper-triangle entries as a 1-D (condensed) distance vector.
Z = linkage(dist.to_numpy()[upper], "average")
res = dendrogram(Z, labels=list(sim.columns), leaf_rotation=90, color_threshold=0, count_sort=True)

# Reorder rows and columns to follow the dendrogram leaf order.
leaf_order = res["ivl"]
fig, ax = plt.subplots(figsize=(len(pair_table) / 2.5, len(pair_table) / 4))
# NOTE(review): dist = 1 - sim lies in [0, 1] because sim is clamped to [0, 1];
# vmax=2 looks left over from a correlation-based distance - confirm it is intended.
sns.heatmap(
    dist.loc[leaf_order, leaf_order],
    cmap="Greens",
    vmin=0,
    vmax=2,
    annot=True,
    fmt=".1f",
    cbar_kws={"aspect": 50, "pad": 0.02},
    ax=ax,
)
ax.set(xlabel=None, ylabel=None)

# fig, ax = plt.subplots(figsize=(len(mean_df) / 2.5, len(mean_df) / 4))
# sns.heatmap(sim, cmap="coolwarm", vmin=-1, annot=True, fmt=".1f", vmax=1, cbar_kws={"aspect": 50, "pad": 0.02}, ax=ax)
# ax.set(xlabel=None, ylabel=None)

# fig, ax = plt.subplots(figsize=(len(mean_df) / 2.5, len(mean_df) / 4))
# sns.heatmap(mean_df.T.corr(method="spearman"), cmap="coolwarm", vmin=-1, annot=True, fmt=".1f", vmax=1, ax=ax)
# ax.set(xlabel=None, ylabel=None, title="Correlation between UAR when used as source dataset")

# fig, ax = plt.subplots(figsize=(len(mean_df) / 2.5, len(mean_df) / 4))
# sns.heatmap(mean_df.corr(method="spearman"), cmap="coolwarm", vmin=-1, annot=True, fmt=".1f", vmax=1, ax=ax)
# ax.set(xlabel=None, ylabel=None, title="Correlation between UAR when used as target dataset")

plt.show()

### Difference between pairwise source and target

In [None]:
# Heatmap of 1 - |UAR(src->tgt) - UAR(tgt->src)|: values near 1 mean the two
# directions of a corpus pair give similar UAR.
mean_df = (
    df_pairwise.loc["cross_group_all_3class", "wav2vec2_audeering_ft_c_mean"]
    .groupby(["src", "tgt"])["uar"]
    .mean()
    .unstack("tgt")
)
direction_gap = (mean_df - mean_df.T).abs()

n_corpora = len(mean_df)
fig, ax = plt.subplots(figsize=(n_corpora / 2.5, n_corpora / 4))
sns.heatmap(1 - direction_gap, cmap="Blues", vmin=0.6, vmax=1, annot=True, fmt=".1f", ax=ax)
ax.set(xlabel=None, ylabel=None)

plt.show()