# EVC for data augmentation - results

In [1]:
from pathlib import Path
from itertools import product

import pandas as pd
from scipy.stats import binomtest, wilcoxon, ttest_1samp, shapiro
import pingouin as pg
import seaborn as sns
import matplotlib.pyplot as plt

from ertk.stats import holm_bonferroni

sns.set_color_codes()
%matplotlib inline

In [2]:
def stderr_str(x):
    """Format ``x`` as ``"mean (standard error)"``, one decimal place each.

    ``x`` must expose ``mean()`` and ``sem()`` (e.g. a pandas Series).
    """
    centre, spread = x.mean(), x.sem()
    return "{:.1f} ({:.1f})".format(centre, spread)

def mean_str(x):
    """Return the mean of ``x`` rendered with one decimal place."""
    return format(x.mean(), ".1f")

def latex(df: pd.DataFrame):
    """Render a DataFrame of pre-formatted strings as a LaTeX table.

    In every column, "+-" is replaced with a LaTeX plus-minus macro and
    underscores are escaped, then ``DataFrame.to_latex`` is called with
    ``escape=False`` so those replacements reach the output verbatim.

    Parameters
    ----------
    df:
        Frame whose columns all hold strings (the ``.str`` accessor is used on
        each column).

    Returns
    -------
    str
        The LaTeX source of the table.
    """
    # Work on a copy: assigning the escaped columns back onto the caller's frame
    # would make a second call escape the already-escaped underscores again.
    escaped = df.copy()
    for col in escaped.columns:
        escaped[col] = (
            escaped[col]
            .str.replace("+-", "$\\pm$", regex=False)
            .str.replace("_", "\\_", regex=False)
        )
    return escaped.to_latex(index=True, escape=False, sparsify=True)

# Display names for raw identifiers. Applied throughout the notebook via
# `.rename(index=RENAMES)` before tables are printed.
RENAMES = {
    # Features (feature-set identifiers)
    "wav2vec_c_mean": "Wav2vec",
    "wav2vec2_audeering_ft_c_mean": "Wav2vec2 FT",
    # Experiment
    "esd_test": "ESD",
    "cross_corpus_explicit_4class_test": "CC explicit",
    "cross_corpus_induced_4class_test": "CC induced",
    # Direction ("<source>_<target>" pairs, as built for the "dir" level)
    "en_zh": "en to zh",
    "zh_en": "zh to en",
    "CREMA-D_EmoV-DB": "CREMA-D to EmoV-DB",
    "EmoV-DB_CREMA-D": "EmoV-DB to CREMA-D",
    "IEMOCAP_MSP-IMPROV": "IEMOCAP to MSP-IMPROV",
    "MSP-IMPROV_IEMOCAP": "MSP-IMPROV to IEMOCAP",
    # Augmented data
    "ESD_en_evc": "ESD en (10lang)",
    "ESD_zh_evc": "ESD zh (10lang)",
    "CREMA-D_evc": "CREMA-D (10lang)",
    "EmoV-DB_evc": "EmoV-DB (10lang)",
    "IEMOCAP_evc": "IEMOCAP (10lang)",
    "MSP-IMPROV_evc": "MSP-IMPROV (10lang)",
    # Index level names (presumably for table headers -- confirm where these are used)
    "aug_method": "Aug. method",
    "aug_data": "Aug. data",
}

### Load data

In [None]:
# Load the added train size experiments: every result directory except the
# fixed-size (*_const) and neutral-only (*_neutral) variants.
added_frames = []
for path in Path("./results/lr_rep20_max_train/").glob("*/*/*/*.csv"):
    run_dir = path.parent.name
    if run_dir.endswith(("_const", "_neutral")):
        continue
    # File stems look like "<src>_<tgt>_<ignored>_<aug>[_...]".
    src, tgt, _, aug, *_ = path.stem.split("_")
    if src == tgt:
        continue
    added_frames.append(
        pd.read_csv(path).assign(
            dir=f"{src}_{tgt}",
            aug_method=aug,
            aug_data=run_dir,
            common_phones="ipa" in run_dir,
            exp=path.parts[-3],
            features=path.parts[-4],
        )
    )
df_added = (
    pd.concat(added_frames)
    .set_index(["exp", "aug_data", "common_phones", "features", "dir", "aug_method", "p_real", "p_fake", "max_train", "rep"])
    .sort_index()
)
df_added

In [None]:
# Load the fixed train size experiments (result directories named *_const),
# skipping the un-augmented "noaug" directories and runs where source == target.
fixed_frames = []
for path in Path("./results/lr_rep20_max_train/").glob("*/*/*_const/*.csv"):
    run_dir = path.parts[-2]
    if run_dir.startswith("noaug"):
        continue
    src, tgt, _, aug, *_ = path.stem.split("_")
    if src == tgt:
        continue
    fixed_frames.append(
        pd.read_csv(path).assign(
            dir=f"{src}_{tgt}",
            aug_method=aug,
            # Strip the trailing "_const" from the directory name.
            aug_data=run_dir.rsplit("_", maxsplit=1)[0],
            common_phones="ipa" in run_dir,
            exp=path.parts[-3],
            features=path.parts[-4],
        )
    )
df_fixed = (
    pd.concat(fixed_frames)
    .set_index(["exp", "aug_data", "common_phones", "features", "dir", "aug_method", "p_real", "p_fake", "max_train", "rep"])
    .sort_index()
)
df_fixed

In [None]:
# Load the neutral-only fixed train size experiments (*_const_neutral directories),
# skipping the un-augmented "noaug" directories and runs where source == target.
neutral_frames = []
for path in Path("./results/lr_rep20_max_train/").glob("*/*/*_const_neutral/*.csv"):
    if path.parent.name.startswith("noaug"):
        continue
    src, tgt, which, aug, *_ = path.stem.split("_")
    if src == tgt:
        continue
    # Strip the trailing "_const_neutral" from the directory name.
    aug_data = path.parts[-2].rsplit("_", maxsplit=2)[0]
    neutral_frames.append(
        pd.read_csv(path).assign(
            dir=f"{src}_{tgt}",
            # The first three characters of `which` are discarded
            # (presumably a fixed prefix -- confirm against the file names).
            aug_method=f"{which[3:]}_{aug}",
            aug_data=aug_data,
            common_phones="ipa" in aug_data,
            exp=path.parts[-3],
            features=path.parts[-4],
        )
    )
df_neutral = (
    pd.concat(neutral_frames)
    .set_index(["exp", "aug_data", "common_phones", "features", "dir", "aug_method", "p_real", "p_fake", "max_train", "rep"])
    .sort_index()
)
df_neutral

In [None]:
# Load the real data fixed train size experiments (the noaug_const directories),
# skipping runs where source == target.
real_frames = []
for path in Path("./results/lr_rep20_max_train/").glob("*/*/noaug_const/*.csv"):
    src, tgt, _, aug, *_ = path.stem.split("_")
    if src == tgt:
        continue
    real_frames.append(
        pd.read_csv(path).assign(
            dir=f"{src}_{tgt}",
            aug_method=aug,
            # "noaug_const" -> "noaug"
            aug_data=path.parts[-2].rsplit("_", maxsplit=2)[0],
            exp=path.parts[-3],
            features=path.parts[-4],
        )
    )
df_real = (
    pd.concat(real_frames)
    .set_index(["exp", "aug_data", "features", "dir", "aug_method", "p_real", "p_fake", "max_train", "rep"])
    .sort_index()
)
df_real

In [None]:
# Load the real neutral-only fixed train size experiments
# (the noaug_const_neutral directories), skipping runs where source == target.
real_neutral_frames = []
for path in Path("./results/lr_rep20_max_train/").glob("*/*/noaug_const_neutral/*.csv"):
    src, tgt, which, aug, *_ = path.stem.split("_")
    if src == tgt:
        continue
    real_neutral_frames.append(
        pd.read_csv(path).assign(
            dir=f"{src}_{tgt}",
            # The first four characters of `which` are discarded
            # (presumably a fixed prefix -- confirm against the file names).
            aug_method=f"{which[4:]}_{aug}",
            # "noaug_const_neutral" -> "noaug"
            aug_data=path.parts[-2].rsplit("_", maxsplit=2)[0],
            exp=path.parts[-3],
            features=path.parts[-4],
        )
    )
df_real_neutral = (
    pd.concat(real_neutral_frames)
    .set_index(["exp", "aug_data", "features", "dir", "aug_method", "p_real", "p_fake", "max_train", "rep"])
    .sort_index()
)
df_real_neutral

In [None]:
# Load the within-group fixed train size experiments: the same *_const files as
# the fixed-size loader, but keeping only runs where source == target.
within_frames = []
for path in Path("./results/lr_rep20_max_train/").glob("*/*/*_const/*.csv"):
    if path.parent.name.startswith("noaug"):
        continue
    src, tgt, _, aug, *_ = path.stem.split("_")
    if src != tgt:
        continue
    # Strip the trailing "_const" from the directory name.
    aug_data = path.parts[-2].rsplit("_", maxsplit=1)[0]
    within_frames.append(
        pd.read_csv(path).assign(
            dir=src,
            aug_method="within",
            aug_data=aug_data,
            common_phones="ipa" in aug_data,
            exp=path.parts[-3],
            features=path.parts[-4],
        )
    )
df_within = (
    pd.concat(within_frames)
    .set_index(["exp", "aug_data", "common_phones", "features", "dir", "aug_method", "p_real", "p_fake", "max_train", "rep"])
    .sort_index()
)
df_within

In [None]:
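# Get cross-validated emotional expressiveness for real and augmented data
# NOTE: this loader is commented out, so `df_cv_aug` is never defined on a fresh
# kernel, although the "Cross-validation with real and augmented data" cell reads it.
# Before re-enabling, note that the set_index list below names "data" twice.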
# df_cv_aug = []
# for csv in Path("./results/within/").glob("*/*/*/*.csv"):
#     df = pd.read_csv(csv).drop(columns=["rep"])
#     df["data"] = csv.stem
#     aug_data = csv.parts[-2]
#     df["aug_data"] = aug_data
#     aug_data_source = aug_data.split("_")[0]
#     if aug_data_source == "ESD":
#         aug_data_source = aug_data.split("_")[1]
#     if aug_data_source == "noaug":
#         aug_data_source = csv.stem
#     df["same_aug"] = aug_data_source == csv.stem
#     df["common_phones"] = "ipa" in csv.parts[-2]
#     df["exp"] = csv.parts[-4]
#     df["features"] = csv.parts[-3]
#     df_cv_aug.append(df)
# df_cv_aug = pd.concat(df_cv_aug).set_index(["data", "aug_data", "common_phones", "same_aug", "features", "data", "exp", "fold"]).sort_index()
# df_cv_aug

### Plots with neutral and real target data

In [None]:
features = "wav2vec2_audeering_ft_c_mean"

def plot_effects(exp, features, aug, dir, ax):
    """Draw UAR against ``p_real`` for real and augmented runs on ``ax``.

    Four seaborn point plots are overlaid, all for the given ``exp``,
    ``features`` and ``dir``:

    * red, dashed      -- ``df_real_neutral``, aug_data "noaug", "all_target"
    * orange, dash-dot -- ``df_real_neutral``, aug_data "noaug", "neutral_target"
    * green, dotted    -- ``df_neutral``, aug_data ``aug``, "neutral_target"
    * blue, solid      -- ``df_neutral``, aug_data ``aug``, "all_target"

    Rows are selected by index-level name rather than by position.
    ``df_neutral`` carries a ``common_phones`` level between ``aug_data`` and
    ``features``, so a positional ``.loc[exp, aug, features, dir, method]``
    lookup would match ``features`` against ``common_phones``.

    Parameters
    ----------
    exp, features, dir:
        Labels of the "exp", "features" and "dir" index levels to plot.
    aug:
        "aug_data" label of the augmented runs in ``df_neutral``.
    ax:
        Matplotlib axes to draw on.
    """

    def _rows(frame, aug_data, aug_method):
        # Cross-section on named levels; works whatever other levels exist.
        wanted = {
            "exp": exp,
            "aug_data": aug_data,
            "features": features,
            "dir": dir,
            "aug_method": aug_method,
        }
        rows = frame.xs(tuple(wanted.values()), level=list(wanted))
        if "common_phones" in rows.index.names:
            # Fully determined by aug_data, so it carries no extra information.
            rows = rows.droplevel("common_phones")
        # Scale every numeric column (including p_real and uar) by 100.
        return rows.reset_index() * 100

    curves = [
        (df_real_neutral, "noaug", "all_target", {"color": "r", "linestyles": "--"}),
        (df_real_neutral, "noaug", "neutral_target", {"color": "orange", "linestyles": "-."}),
        (df_neutral, aug, "neutral_target", {"color": "g", "linestyles": ":"}),
        (df_neutral, aug, "all_target", {"color": "b", "linestyles": "-"}),
    ]
    # The "neutral_source" (gray) and "all_source" (pink) curves are intentionally not drawn.
    for frame, aug_data, aug_method, style in curves:
        sns.pointplot(
            ax=ax,
            data=_rows(frame, aug_data, aug_method),
            x="p_real",
            y="uar",
            markers=".",
            scale=0.5,
            errwidth=2,
            capsize=0.1,
            **style,
        )

# Two side-by-side panels with shared axes: CREMA-D -> EmoV-DB on the left,
# ESD en -> zh on the right, both using the feature set named in `features`.
fig, ax = plt.subplots(1, 2, figsize=(8, 3), sharex=True, sharey=True, gridspec_kw={"hspace": 0.25, "wspace": 0.05})
plot_effects("cross_corpus_explicit_4class_test", features, "CREMA-D_ipa_spk_emo", "CREMA-D_EmoV-DB", ax=ax[0])
# Title corrected to "Wav2vec2 FT": both panels plot the wav2vec2 fine-tuned
# features, matching the right-hand panel and the RENAMES display name.
ax[0].set_title("CR to EV, Wav2vec2 FT")
ax[0].set_ylim(35, 80)
ax[0].set_ylabel("UAR")
plot_effects("esd_test", features, "ESD_en_ipa_spk_emo", "en_zh", ax=ax[1])
ax[1].set_title("en to zh, Wav2vec2 FT")
ax[1].set_ylabel(None)
plt.show()

### Cross-validation with real and augmented data

In [None]:
# `df_cv_aug` comes from the cross-validation loader cell, which is currently
# commented out. Skip this summary instead of raising NameError when the frame
# has not been loaded (e.g. after Restart & Run All).
if "df_cv_aug" in globals():
    # Mean UAR (scaled by 100) per data set, augmentation data and feature set,
    # formatted to one decimal place with one column per feature set.
    df_cv_aug_mean = (
        (df_cv_aug * 100)
        .rename(index=RENAMES)
        .reindex([True, False], level="same_aug")
        .groupby(["data", "aug_data", "features"])["uar"]
        .mean()
        .apply(lambda x: f"{x:.1f}")
        .unstack("features")
    )
    print(df_cv_aug_mean.to_string())
else:
    print("df_cv_aug is not loaded (its loader cell is commented out); skipping the cross-validation summary.")

### Upper bound on transfer (using real target data)

In [None]:
# UAR change relative to the p_real == 1.0 runs of df_real, for three p_real
# settings. The three copy-pasted stanzas are folded into one loop and p_real
# is selected by level name, so the code does not depend on level order.
sub_df = df_real.droplevel(["max_train", "p_fake", "aug_data", "aug_method"])
baseline_uar = sub_df.xs(1.0, level="p_real")["uar"]
for p_real_value, percent in [(0.9, 10), (0.75, 25), (0.5, 50)]:
    # Series subtraction aligns on the remaining levels (exp, features, dir, rep).
    diff = sub_df.xs(p_real_value, level="p_real")["uar"].sub(baseline_uar) * 100
    print(f"UAR by experiment and feature using {percent}% augmented data")
    print(diff.groupby(["dir", "features"]).agg(stderr_str).rename(index=RENAMES).unstack().to_string())

### Mean diff between 10% aug and none

In [None]:
# Keep the max_train == 1.0 runs, excluding the listed experiments and the
# listed (exp, aug_data) pairs.
sub_df = (
    df_fixed.drop(
        index=[
            "esd_train",
            "eng_cross_country_explicit_4class",
            "cross_corpus_induced_4class_test",
            "cross_corpus_explicit_4class_nojosh",
        ],
        level="exp",
    )
    .drop(
        index=[
            ("cross_corpus_induced_4class", "ESD_en_spk_emo"),
            ("cross_corpus_explicit_4class", "ESD_en_spk_emo"),
        ]
    )
    .droplevel("p_fake")
    .xs(1.0, level="max_train")
)

# Select p_real by level name. After the steps above the levels are
# (exp, aug_data, common_phones, features, dir, aug_method, p_real, rep), so a
# positional `.loc[:, :, :, :, :, 0.9]` would address aug_method, not p_real.
diff = sub_df.xs(0.9, level="p_real")["uar"].sub(sub_df.xs(1.0, level="p_real")["uar"]) * 100
diff = diff.rename(index=RENAMES)
print("UAR by experiment and augment method")
print(diff.groupby(["exp", "dir", "aug_method"]).agg(stderr_str).unstack().to_string())
print("UAR by experiment and feature")
print(diff.groupby(["exp", "dir", "features"]).agg(stderr_str).unstack().to_string())

# Per augmentation data set, restricted to the "source" and "target" methods.
df_aug = (
    diff[diff.index.get_level_values("aug_method").isin(["source", "target"])]
    .groupby(["exp", "dir", "aug_data", "features", "aug_method"])
    .agg(stderr_str)
    .unstack(["features", "aug_method"])
)
print("UAR by experiment and feature")
print(df_aug.to_string())

### Mean diff between best replacement aug and none

In [None]:
def arg_best_p_real(x):
    """Return the ``p_real`` label with the highest mean over ``rep``.

    ``x`` is a Series whose MultiIndex contains "p_real" and "rep" levels. All
    other levels must identify a single row once "rep" and "p_real" are
    unstacked, because ``.item()`` requires exactly one value.
    """
    mean_over_reps = x.unstack("rep").mean(axis=1)
    by_p_real = mean_over_reps.unstack("p_real")
    return by_p_real.idxmax(axis=1).item()


def best_p_real(x):
    """Return the rows of ``x`` at its best ``p_real``.

    All index levels except the last two are dropped from the result, after
    the "p_real" level has been removed by the cross-section.
    """
    winner = arg_best_p_real(x)
    leading_levels = x.index.names[:-2]
    return x.xs(winner, level="p_real").droplevel(leading_levels)


# Restrict df_fixed to the runs analysed here: drop the listed experiments, keep
# only rows with common_phones == True, and keep max_train == 1.0.
# The two (exp, aug_data) tuples name experiments already removed by the first
# drop, which is why that second drop uses errors="ignore".
sub_df = (
    df_fixed.drop(
        index=[
            "esd_train",
            "eng_cross_country_explicit_4class",
            "cross_corpus_induced_4class",
            "cross_corpus_explicit_4class",
            "cross_corpus_explicit_4class_nojosh",
        ],
        level="exp",
    )
    .drop(
        index=[
            ("cross_corpus_induced_4class", "ESD_en_spk_emo"),
            ("cross_corpus_explicit_4class", "ESD_en_spk_emo"),
        ],
        errors="ignore",
    )
    .drop(index=False, level="common_phones")
    .droplevel("p_fake")
    .xs(1.0, level="max_train")
)

# Best p_real (excluding 1.0) per group, by mean UAR over reps.
fixed_argbest = (
    sub_df.drop(1.0, level="p_real")
    .groupby(level=["dir", "aug_data", "features", "aug_method"])["uar"]
    .apply(arg_best_p_real)
)

# Per-rep UAR at that best p_real, and the p_real == 1.0 baseline.
best_df = (
    sub_df.drop(1.0, level="p_real")
    .groupby(level=["aug_data", "features", "dir", "aug_method"])["uar"]
    .apply(best_p_real)
)
base_df = sub_df.xs(1.0, level="p_real")["uar"]

# Difference to the baseline, scaled by 100; the subtraction aligns on the
# index levels the two Series share.
diff = (best_df - base_df) * 100

# Per group: print the Shapiro-Wilk result, then test the differences against 0.
# NOTE(review): 0.05 / 48 is a hard-coded threshold -- presumably a Bonferroni
# correction for 48 groups; confirm it matches the number of groups tested.
sig_idx = []
for idx, group in diff.drop(index="both", level="aug_method").groupby(
    ["dir", "aug_data", "features", "aug_method"]
):
    print(shapiro(group))
    res = pg.ttest(group, 0, alternative="two-sided")
    if res["p-val"].item() < 0.05 / 48:
        print(idx)
        sig_idx.append(idx)

# Within-group results at p_real == 1.0 and max_train == 1.0 (common_phones == True only).
print(
    (df_within * 100).drop(index=False, level="common_phones")
    .droplevel("p_fake")
    .xs(1.0, level="max_train")
    .xs(1.0, level="p_real")["uar"]
    .groupby(["dir", "aug_data", "features"])
    .agg(stderr_str)
    .rename(index=RENAMES)
    .sort_index()
    .unstack("features")
    .to_string()
)

print("Baseline")
print(
    (base_df * 100)
    .drop("both", level="aug_method")
    .groupby(["dir", "aug_data", "features"])
    .agg(stderr_str)
    .rename(index=RENAMES)
    .sort_index()
    .unstack("features")
    .to_string()
)
print()

# Mean (standard error) of the differences, excluding the "both" method.
df_diff_best_fixed_aug = (
    diff.rename(index=RENAMES)
    .drop(index="both", level="aug_method")
    .groupby(["dir", "aug_data", "features", "aug_method"])
    .agg(stderr_str)
    .unstack(["features", "aug_method"])
)
print("UAR by experiment and feature")
print(df_diff_best_fixed_aug.to_string())
print()

# The selected p_real per group, then its distribution over the significant groups.
print("Max p_real")
print(
    fixed_argbest.drop("both", level="aug_method")
    .rename(index=RENAMES)
    .sort_index()
    .unstack(["features", "aug_method"])
    .to_string()
)
print(fixed_argbest.loc[sig_idx].value_counts())
print()

print("UAR by experiment and augment method")
# Disabled pairwise comparisons between augmentation methods:
# target_source = diff.xs("target", level="aug_method") - diff.xs(
#     "source", level="aug_method"
# )
# target_source = target_source.groupby(["dir"]).mean()
# print(shapiro(target_source))
# print(pg.ttest(target_source, 0, alternative="greater").to_string())
# print(pg.compute_bootci(target_source, func="mean", confidence=0.975))
# both_target = diff.xs("both", level="aug_method") - diff.xs(
#     "target", level="aug_method"
# )
# both_target = both_target.groupby("dir").mean()
# print(shapiro(both_target))
# print(pg.ttest(both_target, 0, alternative="greater").to_string())
# print(pg.compute_bootci(both_target, func="mean", confidence=0.975))

# Mean difference per direction and method; the second print summarises each
# method column across directions as "mean (standard error)".
diff_by_aug_method = (
    diff.rename(index=RENAMES)
    .reindex(index=["source", "target", "both"], level="aug_method")
    .groupby(["dir", "aug_method"])
    .mean()
    .unstack()
)
print(diff_by_aug_method.applymap("{:.1f}".format).to_string())
print(diff_by_aug_method.agg(stderr_str))
print()


print("UAR by experiment and feature")
# Disabled comparison between the two feature sets:
# feature_diff = diff.xs("wav2vec_c_mean", level="features") - diff.xs(
#     "wav2vec2_audeering_ft_c_mean", level="features"
# )
# feature_diff = feature_diff.groupby(["dir"]).mean()
# print(shapiro(feature_diff))
# print(pg.ttest(feature_diff, 0, alternative="greater").to_string())
# print(pg.compute_bootci(feature_diff, func="mean"))

# Mean difference per direction and feature set, summarised like the table above.
diff_by_features = (
    diff.rename(index=RENAMES).groupby(["dir", "features"]).mean().unstack()
)
print(diff_by_features.applymap("{:.1f}".format).to_string())
print(diff_by_features.agg(stderr_str))
print()


### Mean diff between additional best aug and none

In [None]:
def arg_best_p_fake(x):
    """Return the ``p_fake`` label with the highest mean over ``rep``.

    ``x`` is a Series whose MultiIndex contains "p_fake" and "rep" levels. The
    remaining levels must identify a single row once both are unstacked,
    because ``.item()`` requires exactly one value.
    """
    mean_over_reps = x.unstack("rep").mean(axis=1)
    by_p_fake = mean_over_reps.unstack("p_fake")
    return by_p_fake.idxmax(axis=1).item()


def best_p_fake(x):
    """Return the rows of ``x`` at its best ``p_fake``.

    The "exp", "aug_data", "features", "dir" and "aug_method" levels are
    dropped from the result; any other level of ``x`` is kept.
    """
    winner = arg_best_p_fake(x)
    group_levels = ["exp", "aug_data", "features", "dir", "aug_method"]
    return x.xs(winner, level="p_fake").droplevel(group_levels)


# Restrict df_added to the runs analysed here: drop the listed experiments and
# keep only rows with common_phones == True. The two (exp, aug_data) tuples name
# experiments already removed by the first drop, hence errors="ignore".
sub_df = (
    df_added.drop(
        index=[
            "esd_train",
            "eng_cross_country_explicit_4class",
            "cross_corpus_induced_4class",
            "cross_corpus_explicit_4class",
            "cross_corpus_explicit_4class_nojosh",
        ],
        level="exp",
    )
    .droplevel(["p_real", "max_train"])
    .drop(
        index=[
            ("cross_corpus_induced_4class", "ESD_en_spk_emo"),
            ("cross_corpus_explicit_4class", "ESD_en_spk_emo"),
        ],
        errors="ignore",
    )
    .drop(index=False, level="common_phones")
)

# Best p_fake (excluding 0.0) per group, by mean UAR over reps.
arg_best = (
    sub_df.drop(0.0, level="p_fake")
    .groupby(level=["dir", "aug_data", "features", "aug_method"])["uar"]
    .apply(arg_best_p_fake)
)
# Per-rep UAR at that best p_fake, and the p_fake == 0.0 baseline.
best_df = (
    sub_df.drop(0.0, level="p_fake")
    .groupby(level=["aug_data", "features", "dir", "aug_method"])[
        "uar"
    ]
    .apply(best_p_fake)
)
base_df = sub_df.xs(0.0, level="p_fake")["uar"]

# Difference to the baseline, scaled by 100.
diff = (best_df - base_df) * 100

# Test each group's differences against 0 and print the significant groups.
# NOTE(review): 0.05 / 24 is a hard-coded threshold -- presumably a Bonferroni
# correction for 24 groups; confirm it matches the number of groups tested.
for name, group in diff.rename(index=RENAMES).drop(index="both", level="aug_method").groupby(["dir", "aug_data", "features", "aug_method"]):
    res = pg.ttest(group.to_numpy(), 0, alternative="two-sided")
    if res["p-val"].item() < 0.05 / 24:
        print(name)

# Mean (standard error) of the differences, excluding the "both" method.
df_diff_best_add_aug = (
    diff.rename(index=RENAMES).drop(index="both", level="aug_method")
    .groupby(["dir", "aug_data", "features", "aug_method"])
    .agg(stderr_str)
    .unstack(["features", "aug_method"])
)
print("UAR by experiment and feature")
print(df_diff_best_add_aug.to_string())


# Disabled pairwise comparisons between augmentation methods:
# target_source = diff.xs("target", level="aug_method") - diff.xs("source", level="aug_method")
# target_source = target_source.groupby(["dir"]).mean()
# print(shapiro(target_source))
# print(pg.ttest(target_source, 0, alternative="greater").to_string())
# print(pg.compute_bootci(target_source, func="mean"))
# both_target = diff.xs("both", level="aug_method") - diff.xs("target", level="aug_method")
# both_target = both_target.groupby("dir").mean()
# print(shapiro(both_target))
# print(pg.ttest(both_target, 0, alternative="greater").to_string())
# print(pg.compute_bootci(both_target, func="mean"))

print("UAR by experiment and augment method")
print(diff.rename(index=RENAMES).groupby(["dir", "aug_method"]).agg(stderr_str).unstack().to_string())

# Disabled comparison between the two feature sets:
# feature_diff = diff.xs("wav2vec_c_mean", level="features") - diff.xs("wav2vec2_audeering_ft_c_mean", level="features")
# feature_diff = feature_diff.groupby(["dir"]).mean()
# print(shapiro(feature_diff))
# print(pg.ttest(feature_diff, 0, alternative="greater").to_string())
# print(pg.compute_bootci(feature_diff, func="mean"))

print("UAR by experiment and feature")
print(diff.rename(index=RENAMES).groupby(["dir", "features"]).agg(stderr_str).unstack().to_string())

### Mean diff between within-group best aug and none

In [None]:
def best_p_real(x):
    """Return the rows of ``x`` at the ``p_real`` with the highest mean over ``rep``.

    ``x`` is a Series whose MultiIndex contains "p_real" and "rep" levels. All
    index levels except the last two are dropped from the result, after the
    "p_real" level has been removed by the cross-section.
    """
    per_setting = x.unstack("rep").mean(axis=1).unstack("p_real")
    top = per_setting.idxmax(axis=1).item()
    selected = x.xs(top, level="p_real")
    return selected.droplevel(list(x.index.names[:-2]))


def arg_best_p_real(x):
    """Return the ``p_real`` label with the highest mean over ``rep``."""
    rep_table = x.unstack("rep")
    per_setting = rep_table.mean(axis=1).unstack("p_real")
    # .item() requires exactly one row to remain after unstacking.
    return per_setting.idxmax(axis=1).item()
    

# Restrict df_within to the runs analysed here: remove the two listed
# (exp, aug_data) pairs, keep only rows with common_phones == True and
# max_train == 1.0. Unlike the cross-group cells, this drop has no
# errors="ignore", so it raises if the listed pairs are absent.
sub_df = (
    df_within.drop(
        index=[
            ("IEMOCAP_test", "ESD_en_spk_emo"),
            ("MSP-IMPROV_test", "ESD_en_spk_emo"),
        ]
    )
    .droplevel(["p_fake", "aug_method"])
    .drop(index=False, level="common_phones")
    .xs(1.0, level="max_train")
)

# Per-rep UAR at the best p_real (excluding 1.0), and the p_real == 1.0 baseline.
best_df = (
    sub_df.drop(1.0, level="p_real")
    .groupby(level=["aug_data", "features", "dir"])["uar"]
    .apply(best_p_real)
)
base_df = sub_df.xs(1.0, level="p_real")["uar"]

# Difference to the baseline, scaled by 100, with display names applied.
diff = (best_df - base_df) * 100
diff = diff.rename(index=RENAMES)

# Test each group's differences against 0 and print the significant groups.
# `diff` is already renamed above; none of RENAMES' values are also keys, so the
# second rename here leaves the labels unchanged.
# NOTE(review): 0.05 / 12 is a hard-coded threshold -- presumably a Bonferroni
# correction for 12 groups; confirm it matches the number of groups tested.
for idx, group in diff.rename(index=RENAMES).groupby(["aug_data", "features"]):
    res = pg.ttest(group.to_numpy(), 0, alternative="two-sided")
    if res["p-val"].item() < 0.05 / 12:
        print(idx)

print("UAR using augmented target data, by experiment, augmented data and features")
within_best = diff.groupby(["aug_data", "features"]).agg(stderr_str).unstack()
print(within_best.to_string())


### Effect of neutral vs. emotional augmentation

In [None]:
def best_p_real(x):
    """Return the rows of ``x`` at its best ``p_real``.

    The best ``p_real`` comes from ``arg_best_p_real``. All index levels except
    the last two are dropped from the result, after the "p_real" level has
    been removed by the cross-section.
    """
    chosen = arg_best_p_real(x)
    at_best = x.xs(chosen, level="p_real")
    return at_best.droplevel(x.index.names[:-2])


def arg_best_p_real(x):
    """Return the ``p_real`` label with the highest mean over ``rep``.

    ``x`` is a Series whose MultiIndex contains "p_real" and "rep" levels.
    """
    averaged = x.unstack("rep").mean(axis=1)
    # One row (the remaining levels) by one column per p_real; .item() needs exactly one row.
    winners = averaged.unstack("p_real").idxmax(axis=1)
    return winners.item()


# UAR of the "all_target" and "neutral_target" runs in df_neutral: drop the
# listed experiments, keep only rows with common_phones == True, drop the
# source/both methods and the p_real == 1.0 rows.
sub_df = (
    df_neutral.drop(
        index=[
            "esd_train",
            "cross_corpus_explicit_4class_nojosh",
            "cross_corpus_induced_4class",
            "cross_corpus_explicit_4class",
        ],
        level="exp",
    )
    .drop(index=False, level="common_phones")
    .drop(
        index=["all_both", "neutral_both", "all_source", "neutral_source"],
        level="aug_method",
    )
    .droplevel("p_fake")
    .drop(1.0, level="p_real")["uar"]
)

# Best p_real per (dir, aug_data, features), chosen on the "all_target" runs.
arg_best = (
    sub_df.xs("all_target", level="aug_method")
    .groupby(level=["dir", "aug_data", "features"])
    .apply(arg_best_p_real)
)
diffs = []
# The loop variables carry a `_key` suffix so they neither shadow the builtin
# `dir` nor overwrite the notebook-level `features` variable.
for (dir_key, aug_data_key, features_key), best_p_real_key in arg_best.items():
    _df = (
        sub_df.xs(dir_key, level="dir", drop_level=False)
        .xs(aug_data_key, level="aug_data", drop_level=False)
        .xs(features_key, level="features", drop_level=False)
        .xs(best_p_real_key, level="p_real")
    )
    # "all_target" minus "neutral_target" at the same p_real.
    diffs.append(_df.xs("all_target", level="aug_method") - _df.xs(
        "neutral_target", level="aug_method"
    ))
diff = pd.concat(diffs)
diff = diff * 100

# Test each group's differences against 0 and print the significant groups.
# NOTE(review): 0.05 / 12 is a hard-coded threshold -- presumably a Bonferroni
# correction for 12 groups; confirm it matches the number of groups tested.
for idx, group in diff.rename(index=RENAMES).groupby(["dir", "aug_data", "features"]):
    res = pg.ttest(group.to_numpy(), 0, alternative="two-sided")
    if res["p-val"].item() < 0.05 / 12:
        print(idx)

df_emotional_neutral = diff.rename(index=RENAMES).groupby(level=["dir", "aug_data", "features"]).agg(stderr_str).unstack("features")
print(df_emotional_neutral.to_string())
