# Identification evaluation

In [1]:
import os

import numpy as np
import torch
import pandas as pd
from tqdm import tqdm
from sklearn.metrics import confusion_matrix, accuracy_score

In [2]:
from utils.identification import nearest_tensor

### Load embeddings

In [3]:
# Number of leading recordings per user that are enrolled as reference samples.
# The loading cells skip users with fewer than N_RECORDINGS + 1 recordings, so
# every kept user has at least one recording left over for testing.
N_RECORDINGS = 10

In [4]:
# Build the enrolment gallery and the held-out test set from the noisy embeddings.
# Directory layout traversed below: <root>/<split>/<user>/tensor.pt
embeddings_root = 'embedings/embeddings_180_noise'

enrolled: list[torch.Tensor] = []   # first N_RECORDINGS embeddings of every kept user
samples_to_test: list[dict] = []    # remaining embeddings plus user/split metadata
users_list: list[str] = []          # users_list[i] is the user behind gallery row i

for subdir in os.listdir(embeddings_root):
    subdir_root = os.path.join(embeddings_root, subdir)
    if not os.path.isdir(subdir_root):
        # a stray file next to the split directories would make os.listdir raise
        continue
    for user_dir in os.listdir(subdir_root):
        user_root = os.path.join(subdir_root, user_dir)
        if not os.path.isdir(user_root):
            continue
        user_samples = torch.load(os.path.join(user_root, 'tensor.pt')).squeeze()
        # squeeze() also drops a recordings axis of length 1; such a tensor is 1-D
        # and its shape[0] would be the embedding size, not a recording count.
        # A single recording can never cover enrolment plus testing, so skip it.
        if user_samples.ndim < 2 or user_samples.shape[0] < N_RECORDINGS + 1:
            continue
        users_list.append(user_dir)
        enrolled.append(user_samples[:N_RECORDINGS])
        samples_to_test.append({
            "user_dir": user_dir,
            'samples': user_samples[N_RECORDINGS:],
            'subdir': subdir,
        })

# one tensor holding every enrolled user's N_RECORDINGS reference embeddings
samples_in_system = torch.stack(enrolled)

In [5]:
samples_in_system.shape

torch.Size([126, 10, 256])

### Evaluate for users

In [6]:
# Value passed as `thresh` to nearest_tensor for every identification query.
# NOTE(review): presumably the largest distance still accepted as a match —
# confirm against utils.identification.nearest_tensor.
THRESHOLD = 0.34

In [7]:
def mean_ala_recall(conf_matrix: np.ndarray) -> float:
    """
    Return the mean per-user recall computed from a confusion matrix.

    For every user the recall is the share of that user's samples that were
    identified correctly (diagonal entry divided by the row sum). Classes
    without any true sample (row sum equal to zero) have no defined recall and
    are left out of the mean.

    Args:
        conf_matrix (2D array): Confusion matrix whose i-th row and j-th column entry indicates
            the number of samples with true label being i-th class and predicted label being j-th class.
            (i.e. as in scikit-learn)

    Returns:
        Mean recall over the classes that have at least one true sample,
        or NaN when no class has any.

    Raises:
        ValueError: If the input is not a square 2D matrix.
    """
    conf_matrix = np.asarray(conf_matrix)
    if conf_matrix.ndim != 2 or conf_matrix.shape[0] != conf_matrix.shape[1]:
        raise ValueError("Input matrix must be square")
    correct = np.diagonal(conf_matrix)
    true_counts = conf_matrix.sum(axis=1)
    # divide only where the denominator is non-zero, so no 0/0 warning is emitted
    defined = true_counts != 0
    if not defined.any():
        return float("nan")
    return float(np.mean(correct[defined] / true_counts[defined]))


def mean_ala_precision(conf_matrix: np.ndarray) -> float:
    """
    Return the mean per-user precision computed from a confusion matrix.

    For every user the precision is the number of that user's correctly
    classified samples divided by the number of all samples classified as
    that user (diagonal entry divided by the column sum). Classes that were
    never predicted (column sum equal to zero) have no defined precision and
    are left out of the mean.

    Args:
        conf_matrix (2D array): Confusion matrix whose i-th row and j-th column entry indicates
            the number of samples with true label being i-th class and predicted label being j-th class.
            (i.e. as in scikit-learn)

    Returns:
        Mean precision over the classes that were predicted at least once,
        or NaN when no class was.

    Raises:
        ValueError: If the input is not a square 2D matrix.
    """
    conf_matrix = np.asarray(conf_matrix)
    if conf_matrix.ndim != 2 or conf_matrix.shape[0] != conf_matrix.shape[1]:
        raise ValueError("Input matrix must be square")
    correct = np.diagonal(conf_matrix)
    predicted_counts = conf_matrix.sum(axis=0)
    # divide only where the denominator is non-zero, so no 0/0 warning is emitted
    defined = predicted_counts != 0
    if not defined.any():
        return float("nan")
    return float(np.mean(correct[defined] / predicted_counts[defined]))

In [8]:
# prepare results table: one row per evaluated split, one column per metric
# (-1.0 marks a cell that has not been filled in yet)

result_rows = ["all", "clean", "echo", "language", "noise"]
result_cols = ["accuracy", "mean recall", "mean precision"]
results = pd.DataFrame(np.full((len(result_rows), len(result_cols)), fill_value=-1.0), index=result_rows, columns=result_cols)

# Label recorded when a test sample is rejected. It is a string so that the
# prediction list stays homogeneous with the string user IDs instead of mixing
# str and int labels before they are handed to scikit-learn.
UNKNOWN_USER = "<unknown>"

# obtain results for each split name: "all", "clean", "echo", "language", "noise"

for split_name in result_rows:

    # initialize lists of true and predicted users' IDs
    true_users = []
    predicted_users = []

    # iterate over users and each user's samples to get true and predicted user's IDs
    for user_dict in tqdm(samples_to_test):
        # the split is a property of the user entry, so filter once per user
        # instead of once per sample ("all" keeps every user)
        if split_name != 'all' and user_dict["subdir"] != split_name:
            continue
        user_id = user_dict["user_dir"]
        for test_emb in user_dict['samples']:
            true_users.append(user_id)
            index, distance = nearest_tensor(target=test_emb, embeddings=samples_in_system, thresh=THRESHOLD)
            # NOTE(review): (0, 0) is treated as "no match" — presumably what
            # nearest_tensor returns when nothing is within thresh; confirm
            # against utils.identification.
            if (index, distance) == (0, 0):
                predicted_users.append(UNKNOWN_USER)
            else:
                predicted_users.append(users_list[index])

    # calculate users' confusion matrix
    # NOTE(review): the matrix is built over every label seen in either list, so
    # the rejection label and users from other splits can appear as columns and
    # enter the mean precision — confirm this is the intended definition.
    conf_matrix = confusion_matrix(y_pred=predicted_users, y_true=true_users)

    # get accuracy, recall, precision
    results.loc[split_name, "accuracy"] = accuracy_score(y_pred=predicted_users, y_true=true_users)
    results.loc[split_name, "mean recall"] = mean_ala_recall(conf_matrix)
    results.loc[split_name, "mean precision"] = mean_ala_precision(conf_matrix)

# show results

results

100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 126/126 [00:00<00:00, 135.09it/s]
100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 126/126 [00:00<00:00, 1181.11it/s]
  return np.nanmean(diag / conf_matrix.sum(axis=1))
  return np.nanmean(diag / conf_matrix.sum(axis=0))
100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 126/126 [00:00<00:00, 1029.47it/s]
  return np.nanmean(diag / conf_matrix.sum(axis=1))
100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████

Unnamed: 0,accuracy,mean recall,mean precision
all,0.550446,0.408284,0.384733
clean,0.477654,0.460484,0.280416
echo,0.459906,0.425141,0.250159
language,0.678455,0.675624,0.190169
noise,0.275457,0.234738,0.217449


In [9]:
results.to_latex()

'\\begin{tabular}{lrrr}\n\\toprule\n & accuracy & mean recall & mean precision \\\\\n\\midrule\nall & 0.550446 & 0.408284 & 0.384733 \\\\\nclean & 0.477654 & 0.460484 & 0.280416 \\\\\necho & 0.459906 & 0.425141 & 0.250159 \\\\\nlanguage & 0.678455 & 0.675624 & 0.190169 \\\\\nnoise & 0.275457 & 0.234738 & 0.217449 \\\\\n\\bottomrule\n\\end{tabular}\n'

### Evaluate for denoised

In [10]:
# Build the enrolment gallery and the held-out test set from the denoised embeddings.
# Directory layout traversed below: <root>/<split>/<user>/tensor.pt
embeddings_root = 'embedings/embeddings_180_denoise/'

enrolled: list[torch.Tensor] = []   # first N_RECORDINGS embeddings of every kept user
samples_to_test: list[dict] = []    # remaining embeddings plus user/split metadata
users_list: list[str] = []          # users_list[i] is the user behind gallery row i

for subdir in os.listdir(embeddings_root):
    subdir_root = os.path.join(embeddings_root, subdir)
    if not os.path.isdir(subdir_root):
        # a stray file next to the split directories would make os.listdir raise
        continue
    for user_dir in os.listdir(subdir_root):
        user_root = os.path.join(subdir_root, user_dir)
        if not os.path.isdir(user_root):
            continue
        user_samples = torch.load(os.path.join(user_root, 'tensor.pt')).squeeze()
        # squeeze() also drops a recordings axis of length 1; such a tensor is 1-D
        # and its shape[0] would be the embedding size, not a recording count.
        # A single recording can never cover enrolment plus testing, so skip it.
        if user_samples.ndim < 2 or user_samples.shape[0] < N_RECORDINGS + 1:
            continue
        users_list.append(user_dir)
        enrolled.append(user_samples[:N_RECORDINGS])
        samples_to_test.append({
            "user_dir": user_dir,
            'samples': user_samples[N_RECORDINGS:],
            'subdir': subdir,
        })

# one tensor holding every enrolled user's N_RECORDINGS reference embeddings
samples_in_system = torch.stack(enrolled)

In [11]:
# prepare results table: one row per evaluated split, one column per metric
# (-1.0 marks a cell that has not been filled in yet)

result_rows = ["all", "clean", "echo", "language", "noise"]
result_cols = ["accuracy", "mean recall", "mean precision"]
results_denoised = pd.DataFrame(np.full((len(result_rows), len(result_cols)), fill_value=-1.0), index=result_rows, columns=result_cols)

# Label recorded when a test sample is rejected. It is a string so that the
# prediction list stays homogeneous with the string user IDs instead of mixing
# str and int labels before they are handed to scikit-learn.
UNKNOWN_USER = "<unknown>"

# obtain results for each split name: "all", "clean", "echo", "language", "noise"

for split_name in result_rows:

    # initialize lists of true and predicted users' IDs
    true_users = []
    predicted_users = []

    # iterate over users and each user's samples to get true and predicted user's IDs
    for user_dict in tqdm(samples_to_test):
        # the split is a property of the user entry, so filter once per user
        # instead of once per sample ("all" keeps every user)
        if split_name != 'all' and user_dict["subdir"] != split_name:
            continue
        user_id = user_dict["user_dir"]
        for test_emb in user_dict['samples']:
            true_users.append(user_id)
            index, distance = nearest_tensor(target=test_emb, embeddings=samples_in_system, thresh=THRESHOLD)
            # NOTE(review): (0, 0) is treated as "no match" — presumably what
            # nearest_tensor returns when nothing is within thresh; confirm
            # against utils.identification.
            if (index, distance) == (0, 0):
                predicted_users.append(UNKNOWN_USER)
            else:
                predicted_users.append(users_list[index])

    # calculate users' confusion matrix
    # NOTE(review): the matrix is built over every label seen in either list, so
    # the rejection label and users from other splits can appear as columns and
    # enter the mean precision — confirm this is the intended definition.
    conf_matrix = confusion_matrix(y_pred=predicted_users, y_true=true_users)

    # get accuracy, recall, precision
    results_denoised.loc[split_name, "accuracy"] = accuracy_score(y_pred=predicted_users, y_true=true_users)
    results_denoised.loc[split_name, "mean recall"] = mean_ala_recall(conf_matrix)
    results_denoised.loc[split_name, "mean precision"] = mean_ala_precision(conf_matrix)

# show results

results_denoised

100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 126/126 [00:00<00:00, 132.68it/s]
100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 126/126 [00:00<00:00, 1219.92it/s]
  return np.nanmean(diag / conf_matrix.sum(axis=1))
100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 126/126 [00:00<00:00, 1048.69it/s]
  return np.nanmean(diag / conf_matrix.sum(axis=1))
100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 126/126 [00:00<00:00, 236.24it/s]
  return np.nanm

Unnamed: 0,accuracy,mean recall,mean precision
all,0.41412,0.323063,0.303418
clean,0.310056,0.338732,0.169522
echo,0.375,0.351481,0.167136
language,0.509539,0.506174,0.165632
noise,0.21671,0.205802,0.198417


In [12]:
results_denoised.to_latex()

'\\begin{tabular}{lrrr}\n\\toprule\n & accuracy & mean recall & mean precision \\\\\n\\midrule\nall & 0.414120 & 0.323063 & 0.303418 \\\\\nclean & 0.310056 & 0.338732 & 0.169522 \\\\\necho & 0.375000 & 0.351481 & 0.167136 \\\\\nlanguage & 0.509539 & 0.506174 & 0.165632 \\\\\nnoise & 0.216710 & 0.205802 & 0.198417 \\\\\n\\bottomrule\n\\end{tabular}\n'

### Evaluate our embeddings

In [13]:
# Build one gallery / test set out of the noisy dataset embeddings followed by
# our own recordings. Both roots share the layout <root>/<split>/<user>/tensor.pt,
# so a single traversal handles them in the same order as before.
embedding_roots = ('embedings/embeddings_180_noise', 'nasze_emb')

enrolled: list[torch.Tensor] = []   # first N_RECORDINGS embeddings of every kept user
samples_to_test: list[dict] = []    # remaining embeddings plus user/split metadata
users_list: list[str] = []          # users_list[i] is the user behind gallery row i

for embeddings_root in embedding_roots:
    for subdir in os.listdir(embeddings_root):
        subdir_root = os.path.join(embeddings_root, subdir)
        if not os.path.isdir(subdir_root):
            # a stray file next to the split directories would make os.listdir raise
            continue
        for user_dir in os.listdir(subdir_root):
            user_root = os.path.join(subdir_root, user_dir)
            if not os.path.isdir(user_root):
                continue
            user_samples = torch.load(os.path.join(user_root, 'tensor.pt')).squeeze()
            # squeeze() also drops a recordings axis of length 1; such a tensor is
            # 1-D and its shape[0] would be the embedding size, not a recording
            # count. A single recording can never cover enrolment plus testing.
            if user_samples.ndim < 2 or user_samples.shape[0] < N_RECORDINGS + 1:
                continue
            # the same user name may be appended more than once when the user
            # appears under several split directories
            users_list.append(user_dir)
            enrolled.append(user_samples[:N_RECORDINGS])
            samples_to_test.append({
                "user_dir": user_dir,
                'samples': user_samples[N_RECORDINGS:],
                'subdir': subdir,
            })

# one tensor holding every enrolled entry's N_RECORDINGS reference embeddings
samples_in_system = torch.stack(enrolled)

In [14]:
users_list[-6:]

['Hubert_Baran',
 'Bianka_Kowalska',
 'Daniil_Hardzetski',
 'Hubert_Baran',
 'Bianka_Kowalska',
 'Daniil_Hardzetski']

In [15]:
# prepare results table: one row per user of interest, one column per metric
# (-1.0 marks a cell that has not been filled in yet)

result_rows = ["Bianka_Kowalska", "Hubert_Baran", "Daniil_Hardzetski"]
result_cols = ["recall", "precision"]
our_results = pd.DataFrame(np.full((len(result_rows), len(result_cols)), fill_value=-1.0), index=result_rows, columns=result_cols)

# Label recorded when a test sample is rejected; a string keeps the prediction
# list homogeneous with the string user IDs.
UNKNOWN_USER = "<unknown>"

# obtain results for all users

# initialize lists of true and predicted users' IDs
true_users = []
predicted_users = []

# iterate over users and each user's samples to get true and predicted user's IDs
for user_dict in tqdm(samples_to_test):
    user_id = user_dict["user_dir"]
    for test_emb in user_dict['samples']:
        true_users.append(user_id)
        index, distance = nearest_tensor(target=test_emb, embeddings=samples_in_system, thresh=THRESHOLD)
        # NOTE(review): (0, 0) is treated as "no match" — presumably what
        # nearest_tensor returns when nothing is within thresh; confirm
        # against utils.identification.
        if (index, distance) == (0, 0):
            predicted_users.append(UNKNOWN_USER)
        else:
            predicted_users.append(users_list[index])

# one-vs-rest counts for each user of interest
for user_name in result_rows:
    TP = 0
    FP = 0
    FN = 0
    for true_user, pred_user in zip(true_users, predicted_users):
        is_true = true_user == user_name
        is_pred = pred_user == user_name
        if is_true and is_pred:
            TP += 1
        elif is_true:
            FN += 1
        elif is_pred:
            FP += 1
    # a user without test samples (or never predicted) has no defined
    # recall (or precision): store NaN instead of raising ZeroDivisionError
    our_results.loc[user_name, "recall"] = TP / (TP + FN) if (TP + FN) else np.nan
    our_results.loc[user_name, "precision"] = TP / (TP + FP) if (TP + FP) else np.nan

# show results

our_results

100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 135/135 [00:00<00:00, 147.15it/s]


Unnamed: 0,recall,precision
Bianka_Kowalska,0.533333,0.133333
Hubert_Baran,0.833333,0.555556
Daniil_Hardzetski,0.611111,0.423077


In [16]:
our_results.to_latex()

'\\begin{tabular}{lrr}\n\\toprule\n & recall & precision \\\\\n\\midrule\nBianka_Kowalska & 0.533333 & 0.133333 \\\\\nHubert_Baran & 0.833333 & 0.555556 \\\\\nDaniil_Hardzetski & 0.611111 & 0.423077 \\\\\n\\bottomrule\n\\end{tabular}\n'