# Analysis of the results

Here we analyse the EER for the three ASV systems and both attack scenarios

In [2]:
import os
from string import Template
import numpy as np

In [3]:
# define the dirs that will be included in this analysis (x-vectors)
privacy_dirs = ["../../logs/stargan/asv_xvect_lda200", "../../logs/stargan/asv_xvect_lda200_2", "../../logs/stargan/asv_xvect_lda200_3"]
utility_dirs = ["../../logs/stargan/utility"]

privacy_components = {
    "ignorant": Template("eval/asv-plda/ignorant/results/eer_$trait.txt"),
    "lazy-informed": Template("eval/asv-plda/lazy-informed/results/eer_$trait.txt"),
}

# map the utility components to the indices of the files that hold the relevant value, and the number of metrics
# the first item of each line is assumed to be the dataset; the second, the characteristic; the third, the number of samples
utility_components = {
    "whisper-small": [5, 3],
    "whisper-large": [5, 3],
    "ser-audeering-w2v": [3, 7],
    "nisqa": [3, 1],
}

In [4]:

def get_results(trait, privacy_dirs, privacy_components, utility_dirs, utility_components):
    """
    Gather the result across the experiment folders for each instance (called here "char_key")
    of the given trait. For example, the trait "accent" can take the values "US", "UK", "IN", etc.
    Their respective values are named "char_value" in the code below.
    """

    results, n_samples = dict(), dict()

    for log_dir in privacy_dirs:

        for component, eval_file_template in privacy_components.items():

            eval_file = eval_file_template.substitute(trait=trait)
            path = os.path.join(log_dir, eval_file)

            with open(path, "r") as f:
                next(f)  # skip header

                for line in f:
                    elements = line.split(" ")
                    dataset = elements[0]
                    char_value = elements[-1]
                    char_key = " ".join(elements[1: len(elements) - 3])

                    if dataset.endswith(".txt"):
                        dataset = dataset[:-4]

                    if dataset not in results:
                        results[dataset] = dict()
                    if char_key not in results[dataset]:
                        results[dataset][char_key] = dict()
                    if component not in results[dataset][char_key]:
                        results[dataset][char_key][component] = list()
                    
                    results[dataset][char_key][component].append(
                        float(char_value)
                    )

    for log_dir in utility_dirs:
        for component, numbers in utility_components.items():
            char_value_idx, n_metrics = numbers
            path = os.path.join(log_dir, "eval", component, f"{trait}.txt")
            with open(path, "r") as f:
                next(f)  # skip header

                for line in f:
                    elements = line.split(" ")
                    dataset = elements[0]
                    char_key = " ".join(elements[1: len(elements) - n_metrics - 1])
                    char_value = elements[char_value_idx + len(char_key.split()) - 1]

                    if dataset.endswith(".txt"):
                        dataset = dataset[:-4]

                    if dataset not in n_samples:
                        n_samples[dataset] = dict()
                    if char_key not in n_samples[dataset]:
                        n_samples[dataset][char_key] = int(elements[len(char_key.split(" ")) + 1])
                
                    if dataset not in results:
                        results[dataset] = dict()
                    if char_key not in results[dataset]:
                        results[dataset][char_key] = dict()
                    if component not in results[dataset][char_key]:
                        results[dataset][char_key][component] = list()

                    results[dataset][char_key][component].append(
                        float(char_value)
                    )
    
    return results, n_samples

In [5]:
def print_results_per_dataset(results, n_samples, components, min_samples):
    # print the results in a markdown table for each dataset
    
    filtered_char_values = dict()
    for dataset in n_samples:
        filtered_char_values[dataset] = list()
        for char in n_samples[dataset]:
            if n_samples[dataset][char] >= min_samples[dataset]:
                filtered_char_values[dataset].append(char)

    for dataset in results:
        print(f"#### {dataset}\n")

        print("| |", end=" ")  # empty cell in the top left corner
        # print the dataset names as column headers
        for key in filtered_char_values[dataset]:
            print(key, end=" | ")
        print()  # new line

        # add hyphens to separate the header from the table
        print("|", end=" ")
        for _ in range(len(filtered_char_values[dataset]) + 1):
            print("---", end=" | ")
        print()  # new line

        # get the averages for each key
        key_results = {component: list() for component in components}
        for key in filtered_char_values[dataset]:
            for component in components:
                if component not in results[dataset][key]:
                    key_results[component].append(" - ")
                else:
                    values = results[dataset][key][component]
                    key_results[component].append(np.round(np.mean(values), 2))

        for component in key_results:
            row = [component] + [str(x) for x in key_results[component]]
            print("| " + " | ".join(row) + " |")
        print()  # new line

In [6]:
trait = "accent"
results, n_samples = get_results(trait, privacy_dirs, privacy_components, utility_dirs, utility_components)
components = list(privacy_components) + list(utility_components)
print_results_per_dataset(results, n_samples, components, min_samples={"cv-test_3utts": 50, "edacc-test": 200})

#### cv-test_3utts

| | nan | United States English | 
| --- | --- | --- | 
| ignorant | 0.28 | 0.29 |
| lazy-informed | 0.22 | 0.25 |
| whisper-small | 0.71 | 0.58 |
| whisper-large | 0.53 | 0.43 |
| ser-audeering-w2v | 0.99 | 0.99 |
| nisqa | 3.41 | 3.59 |

#### edacc-test

| | Afrian | Nigerian | African accent | Kenyan | Indian  | 
| --- | --- | --- | --- | --- | --- | 
| ignorant |  -  | 0.17 |  -  |  -  |  -  |
| lazy-informed |  -  | 0.5 |  -  |  -  |  -  |
| whisper-small | 0.83 | 0.71 | 1.26 | 0.94 | 17.13 |
| whisper-large | 0.64 | 0.6 | 0.96 | 0.71 | 17.13 |
| ser-audeering-w2v | 0.99 | 0.99 | 0.99 | 0.99 | 264.0 |
| nisqa | 2.56 | 2.82 | 3.23 | 3.06 | 264.0 |



#### cv-test_3utts

| | nan | 
| --- | --- | 
| ignorant | 0.28 |
| lazy-informed | 0.22 |
| whisper-small | 0.71 |
| whisper-large | 0.53 |
| ser-audeering-w2v | 0.99 |
| nisqa | 3.41 |

#### edacc-test

| | American | Standard Indian English | Spanish | Afrian | Nigerian | African accent | Kenyan | Indian  | Spanish accent | Lithuanian | Lithuanian (eastern European) | Irish | Fluent | Israeli | Vietnamese | 
| --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | 
| ignorant | 0.44 |  -  | 0.17 |  -  | 0.17 |  -  |  -  |  -  |  -  |  -  |  -  | 0.5 | 0.17 |  -  | 0.28 |
| lazy-informed | 0.44 |  -  | 0.5 |  -  | 0.5 |  -  |  -  |  -  |  -  |  -  |  -  | 0.56 | 0.17 |  -  | 0.17 |
| whisper-small | 0.7 | 0.86 | 0.59 | 0.83 | 0.71 | 1.26 | 0.94 | 17.13 | 0.65 | 0.58 | 0.72 | 0.51 | 1.09 | 0.71 | 1.18 |
| whisper-large | 0.47 | 0.71 | 0.44 | 0.64 | 0.6 | 0.96 | 0.71 | 17.13 | 0.51 | 0.48 | 0.58 | 0.39 | 0.87 | 0.46 | 1.01 |
| ser-audeering-w2v | 0.99 | 0.99 | 0.99 | 0.99 | 0.99 | 0.99 | 0.99 | 264.0 | 0.99 | 0.99 | 0.99 | 0.99 | 0.99 | 0.99 | 0.99 |
| nisqa | 2.58 | 2.82 | 3.17 | 2.56 | 2.82 | 3.23 | 3.06 | 264.0 | 3.31 | 2.8 | 2.94 | 3.16 | 3.02 | 2.87 | 3.03 |

In [7]:
trait = "gender"
results, n_samples = get_results(trait, privacy_dirs, privacy_components, utility_dirs, utility_components)
components = list(privacy_components) + list(utility_components)
print_results_per_dataset(results, n_samples, components, min_samples={"cv-test_3utts": 20, "edacc-test": 20, "ls-test-clean": 20, "ravdess": 20})

#### cv-test_3utts

| | male | nan | female | 
| --- | --- | --- | --- | 
| ignorant | 0.27 | 0.27 | 0.29 |
| lazy-informed | 0.2 | 0.22 | 0.21 |
| whisper-small | 0.75 | 0.69 | 0.52 |
| whisper-large | 0.59 | 0.51 | 0.35 |
| ser-audeering-w2v | 0.99 | 0.99 | 0.99 |
| nisqa | 3.4 | 3.41 | 3.41 |

#### ls-test-clean

| | M | F | 
| --- | --- | --- | 
| ignorant | 0.32 | 0.34 |
| lazy-informed | 0.32 | 0.24 |
| whisper-small | 0.13 | 0.16 |
| whisper-large | 0.09 | 0.1 |
| ser-audeering-w2v | 1.0 | 1.0 |
| nisqa | 3.02 | 3.11 |

#### edacc-test

| | Male | Female | 
| --- | --- | --- | 
| ignorant | 0.36 | 0.36 |
| lazy-informed | 0.27 | 0.3 |
| whisper-small | 0.74 | 0.74 |
| whisper-large | 0.6 | 0.6 |
| ser-audeering-w2v | 0.99 | 0.99 |
| nisqa | 2.95 | 2.91 |

#### ravdess

| | F | M | 
| --- | --- | --- | 
| ignorant | 0.4 | 0.39 |
| lazy-informed | 0.42 | 0.31 |
| whisper-small | 0.76 | 0.42 |
| whisper-large | 0.46 | 0.28 |
| ser-audeering-w2v | 0.98 | 0.98 |
| nisqa | 3.26 | 3.34

#### cv-test_3utts

| | male | nan | female | 
| --- | --- | --- | --- | 
| ignorant | 0.27 | 0.27 | 0.29 |
| lazy-informed | 0.2 | 0.22 | 0.21 |
| whisper-small | 0.75 | 0.69 | 0.52 |
| whisper-large | 0.59 | 0.51 | 0.35 |
| ser-audeering-w2v | 0.99 | 0.99 | 0.99 |
| nisqa | 3.4 | 3.41 | 3.41 |

#### ls-test-clean

| | M | F | 
| --- | --- | --- | 
| ignorant | 0.32 | 0.34 |
| lazy-informed | 0.32 | 0.24 |
| whisper-small | 0.13 | 0.16 |
| whisper-large | 0.09 | 0.1 |
| ser-audeering-w2v | 1.0 | 1.0 |
| nisqa | 3.02 | 3.11 |

#### edacc-test

| | Male | Female | 
| --- | --- | --- | 
| ignorant | 0.36 | 0.36 |
| lazy-informed | 0.27 | 0.3 |
| whisper-small | 0.74 | 0.74 |
| whisper-large | 0.6 | 0.6 |
| ser-audeering-w2v | 0.99 | 0.99 |
| nisqa | 2.95 | 2.91 |

#### ravdess

| | F | M | 
| --- | --- | --- | 
| ignorant | 0.4 | 0.39 |
| lazy-informed | 0.42 | 0.31 |
| whisper-small | 0.76 | 0.42 |
| whisper-large | 0.46 | 0.28 |
| ser-audeering-w2v | 0.98 | 0.98 |
| nisqa | 3.26 | 3.34 |

In [22]:
trait = "age"
results, n_samples = get_results(trait, privacy_dirs, privacy_components, utility_dirs, utility_components)

results["both"] = dict()
n_samples["both"] = dict()
for age_group, components in results["cv-test_3utts"].items():
    results["both"][age_group] = dict()
    for component, values in components.items():
        results["both"][age_group][component] = [sum(values) / len(values)] * n_samples["cv-test_3utts"][age_group]
        n_samples["both"][age_group] = n_samples["cv-test_3utts"][age_group]

for age, components in results["edacc-test"].items():
    if age.startswith("1"):
        age_group = "teens"
    elif age.startswith("2"):
        age_group = "twenties"
    elif age.startswith("3"):
        age_group = "thirties"
    else:
        age_group = "fourties"
    for component, values in components.items():
        results["both"][age_group][component] += [sum(values) / len(values)] * n_samples["edacc-test"][age]
        n_samples["both"][age_group] = n_samples["edacc-test"][age]

for age_group, components in results["both"].items():
    for component, values in components.items():
        results["both"][age_group][component] = sum(values) / len(values)


components = list(privacy_components) + list(utility_components)
print_results_per_dataset(results, n_samples, components, min_samples={"cv-test_3utts": 20, "edacc-test": 20, "both": 20})

#### cv-test_3utts

| | teens | nan | thirties | twenties | fourties | 
| --- | --- | --- | --- | --- | --- | 
| ignorant | 0.2 | 0.28 | 0.23 | 0.32 | 0.29 |
| lazy-informed | 0.14 | 0.22 | 0.25 | 0.22 | 0.07 |
| whisper-small | 0.95 | 0.69 | 0.65 | 0.69 | 0.6 |
| whisper-large | 0.78 | 0.51 | 0.41 | 0.56 | 0.47 |
| ser-audeering-w2v | 0.99 | 0.99 | 0.99 | 0.99 | 0.99 |
| nisqa | 3.31 | 3.41 | 3.45 | 3.33 | 3.65 |

#### edacc-test

| | 25 | 24 | 34 | 21 | 40 | 67 | 38 | 33 | 58 | 31 | 30 | 28 | 32 | 36 | 22 | 26 | 39 | 45 | 41 | 27 | 35 | 48 | 49 | 23 | 19 | 53 | 46 | 
| --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | 
| ignorant | 0.44 | 0.25 | 0.33 | 0.31 | 0.5 |  -  |  -  | 0.42 | 0.17 | 0.36 |  -  | 0.29 |  -  |  -  | 0.33 | 0.17 | 0.44 |  -  |  -  | 0.22 | 0.5 |  -  |  -  |  -  |  -  |  -  |  -  |
| lazy-informed | 0.27 | 0.14 | 0.33 | 0.44 | 0.17 |  -  |  -  | 0

#### cv-test_3utts

| | teens | nan | thirties | twenties | fourties | 
| --- | --- | --- | --- | --- | --- | 
| ignorant | 0.2 | 0.28 | 0.23 | 0.32 | 0.29 |
| lazy-informed | 0.14 | 0.22 | 0.25 | 0.22 | 0.07 |
| whisper-small | 0.95 | 0.69 | 0.65 | 0.69 | 0.6 |
| whisper-large | 0.78 | 0.51 | 0.41 | 0.56 | 0.47 |
| ser-audeering-w2v | 0.99 | 0.99 | 0.99 | 0.99 | 0.99 |
| nisqa | 3.31 | 3.41 | 3.45 | 3.33 | 3.65 |

#### edacc-test

| | 25 | 24 | 34 | 21 | 40 | 67 | 38 | 33 | 58 | 31 | 30 | 28 | 32 | 36 | 22 | 26 | 39 | 45 | 41 | 27 | 35 | 48 | 49 | 23 | 19 | 53 | 46 | 
| --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | 
| ignorant | 0.44 | 0.25 | 0.33 | 0.31 | 0.5 |  -  |  -  | 0.42 | 0.17 | 0.36 |  -  | 0.29 |  -  |  -  | 0.33 | 0.17 | 0.44 |  -  |  -  | 0.22 | 0.5 |  -  |  -  |  -  |  -  |  -  |  -  |
| lazy-informed | 0.27 | 0.14 | 0.33 | 0.44 | 0.17 |  -  |  -  | 0.28 | 0.33 | 0.28 |  -  | 0.36 |  -  |  -  | 0.17 | 0.17 | 0.11 |  -  |  -  | 0.39 | 0.33 |  -  |  -  |  -  |  -  |  -  |  -  |
| whisper-small | 1.03 | 0.65 | 0.82 | 0.62 | 1.13 | 0.73 | 0.58 | 0.59 | 0.45 | 0.84 | 0.51 | 0.73 | 0.59 | 0.61 | 0.76 | 0.94 | 0.6 | 0.58 | 0.65 | 1.0 | 0.74 | 0.51 | 0.71 | 0.61 | 0.7 | 0.64 | 0.77 |
| whisper-large | 0.78 | 0.63 | 0.71 | 0.49 | 0.71 | 1.11 | 0.71 | 0.52 | 0.34 | 0.7 | 0.36 | 0.58 | 0.47 | 0.47 | 0.59 | 0.74 | 0.5 | 0.43 | 0.51 | 0.89 | 0.53 | 0.51 | 0.46 | 0.38 | 0.53 | 0.41 | 0.54 |
| ser-audeering-w2v | 0.99 | 0.99 | 0.99 | 0.99 | 1.0 | 0.99 | 0.99 | 0.99 | 0.99 | 0.99 | 0.99 | 0.99 | 0.99 | 0.99 | 0.99 | 0.99 | 0.99 | 0.99 | 0.99 | 0.99 | 0.99 | 0.99 | 0.99 | 1.0 | 0.99 | 0.99 | 0.99 |
| nisqa | 3.13 | 2.81 | 2.77 | 2.93 | 3.11 | 3.06 | 3.19 | 3.14 | 3.12 | 2.97 | 2.57 | 2.75 | 3.06 | 2.51 | 2.76 | 3.07 | 2.95 | 3.16 | 3.31 | 2.7 | 3.13 | 2.87 | 2.87 | 2.66 | 3.11 | 2.86 | 2.92 |

#### both

| | teens | nan | thirties | twenties | fourties | 
| --- | --- | --- | --- | --- | --- | 
| ignorant | 0.2 | 0.28 | 0.4 | 0.29 | 0.31 |
| lazy-informed | 0.14 | 0.22 | 0.23 | 0.29 | 0.23 |
| whisper-small | 0.8 | 0.69 | 0.69 | 0.8 | 0.67 |
| whisper-large | 0.63 | 0.51 | 0.57 | 0.64 | 0.5 |
| ser-audeering-w2v | 0.99 | 0.99 | 0.99 | 0.99 | 0.99 |
| nisqa | 3.19 | 3.41 | 2.96 | 2.92 | 3.05 |

In [9]:
trait = "ethnicity"
results, n_samples = get_results(trait, privacy_dirs, privacy_components, utility_dirs, utility_components)
components = list(privacy_components) + list(utility_components)
print_results_per_dataset(results, n_samples, components, min_samples={"cv-test_3utts": 20, "edacc-test": 20, "ls-test-clean": 20, "ravdess": 20})

#### edacc-test

| | White | South Asian | Asian | Black | Mixed | Latin American | 
| --- | --- | --- | --- | --- | --- | --- | 
| ignorant | 0.39 | 0.38 | 0.38 | 0.39 |  -  |  -  |
| lazy-informed | 0.3 | 0.31 | 0.2 | 0.29 |  -  |  -  |
| whisper-small | 0.61 | 0.78 | 0.73 | 0.88 | 0.7 | 0.51 |
| whisper-large | 0.51 | 0.63 | 0.58 | 0.7 | 0.44 | 0.36 |
| ser-audeering-w2v | 0.99 | 0.99 | 0.99 | 0.99 | 0.99 | 0.99 |
| nisqa | 3.02 | 2.84 | 2.92 | 2.89 | 2.73 | 2.57 |



#### edacc-test

| | White | South Asian | Asian | Black | Mixed | Latin American | 
| --- | --- | --- | --- | --- | --- | --- | 
| ignorant | 0.39 | 0.38 | 0.38 | 0.39 |  -  |  -  |
| lazy-informed | 0.3 | 0.31 | 0.2 | 0.29 |  -  |  -  |
| whisper-small | 0.61 | 0.78 | 0.73 | 0.88 | 0.7 | 0.51 |
| whisper-large | 0.51 | 0.63 | 0.58 | 0.7 | 0.44 | 0.36 |
| ser-audeering-w2v | 0.99 | 0.99 | 0.99 | 0.99 | 0.99 | 0.99 |
| nisqa | 3.02 | 2.84 | 2.92 | 2.89 | 2.73 | 2.57 |

In [10]:
trait = "utt_emotion"
results, n_samples = get_results(trait, privacy_dirs, privacy_components, utility_dirs, utility_components)
components = list(privacy_components) + list(utility_components)
print_results_per_dataset(results, n_samples, components, min_samples={"cv-test_3utts": 20, "edacc-test": 20, "ls-test-clean": 20, "ravdess": 20})

#### ravdess

| | angry | fearful | disgust | sad | surprised | happy | calm | neutral | 
| --- | --- | --- | --- | --- | --- | --- | --- | --- | 
| ignorant | 0.36 | 0.53 | 0.33 | 0.67 | 0.33 | 0.22 | 0.31 | 0.5 |
| lazy-informed | 0.36 | 0.25 | 0.0 | 0.33 | 0.17 | 0.42 | 0.42 | 0.33 |
| whisper-small | 0.58 | 1.14 | 0.48 | 0.58 | 0.59 | 0.65 | 0.26 | 0.26 |
| whisper-large | 0.43 | 0.55 | 0.28 | 0.34 | 0.46 | 0.32 | 0.16 | 0.51 |
| ser-audeering-w2v | 0.98 | 0.98 | 0.98 | 0.98 | 0.98 | 0.98 | 0.99 | 0.99 |
| nisqa | 3.08 | 3.19 | 3.4 | 3.35 | 3.2 | 3.2 | 3.65 | 3.42 |



#### ravdess

| | angry | fearful | disgust | sad | surprised | happy | calm | neutral | 
| --- | --- | --- | --- | --- | --- | --- | --- | --- | 
| ignorant | 0.36 | 0.53 | 0.33 | 0.67 | 0.33 | 0.22 | 0.31 | 0.5 |
| lazy-informed | 0.36 | 0.25 | 0.0 | 0.33 | 0.17 | 0.42 | 0.42 | 0.33 |
| whisper-small | 0.58 | 1.14 | 0.48 | 0.58 | 0.59 | 0.65 | 0.26 | 0.26 |
| whisper-large | 0.43 | 0.55 | 0.28 | 0.34 | 0.46 | 0.32 | 0.16 | 0.51 |
| ser-audeering-w2v | 0.98 | 0.98 | 0.98 | 0.98 | 0.98 | 0.98 | 0.99 | 0.99 |
| nisqa | 3.08 | 3.19 | 3.4 | 3.35 | 3.2 | 3.2 | 3.65 | 3.42 |