## Evaluate Biometric Verification

In [None]:
import os
import numpy as np
import random
from tqdm import tqdm
import sys
import joblib
import seaborn as sns
import pandas as pd
from scipy import interpolate
from sklearn import metrics
from scipy.stats import ttest_ind, ttest_1samp
import time

%pylab inline
%load_ext autoreload
%autoreload 2

In [2]:
# params
# Directory holding the saved per-fold result archives (*.npz); the model file
# lists further down are built by prefixing their file names with this path.
dst_dir = 'results/'

### Evaluate GazeBase

In [3]:
def get_metric_dict_for_setting(inspect_key = 'normalized Ethnicity',
                        inspect_list = ['Caucasian','Black','Asian','Hispanic'],                        
                        demo_dict = dict(),
                        result_list = [],
                        only_equal = False,
                        verbose = 1,
                        ):
    """Compute per-fold ROC curves, overall and optionally per demographic pairing.

    Parameters
    ----------
    inspect_key : hashable or None
        If not None, the attribute of a person is looked up as
        ``demo_dict[inspect_key][person]``; if None, as ``demo_dict[person]``.
    inspect_list : sequence
        Attribute values to evaluate. One ROC curve is computed for every ordered
        pair ``(key_1, key_2)`` drawn from this sequence.
    demo_dict : dict or None
        Lookup of person attributes. Pass None to compute only the overall
        ('all') curves and skip every per-attribute evaluation.
    result_list : sequence of dict
        One entry per fold, each providing 'scores', 'labels', 'person_one' and
        'person_two' (parallel sequences, one element per compared pair).
        Label 1 is treated as the positive class.
    only_equal : bool
        If True, only pairings with ``key_1 == key_2`` are evaluated.
    verbose : int
        The tqdm progress bar over folds is shown only when this equals 1.

    Returns
    -------
    dict
        Maps 'all' and, when ``demo_dict`` is given, '<key_1> - <key_2>' to a dict
        with the lists 'fprs', 'tprs' and 'thresholds'; each list receives one
        ``sklearn.metrics.roc_curve`` result per fold.

    Notes
    -----
    The mutable default arguments are only read, never modified, so they are not
    shared state between calls.
    """
    metric_dict = dict()

    def _append_curve(curve_key, cur_labels, cur_scores):
        # Compute one ROC curve and file it under `curve_key`.
        fpr, tpr, thresholds = metrics.roc_curve(cur_labels, cur_scores, pos_label=1)
        entry = metric_dict.setdefault(curve_key, {'fprs':[],
                                                   'tprs':[],
                                                   'thresholds':[]})
        entry['fprs'].append(fpr)
        entry['tprs'].append(tpr)
        entry['thresholds'].append(thresholds)

    def _demo_label(person):
        # Attribute of a single person, looked up lazily exactly as configured.
        if inspect_key is not None:
            return demo_dict[inspect_key][person]
        return demo_dict[person]

    for fold_nr in tqdm(np.arange(len(result_list)), disable = (verbose != 1)):
        cur_data = result_list[fold_nr]
        scores = cur_data['scores']
        labels = cur_data['labels']

        if demo_dict is not None:
            person_one_label = [_demo_label(person) for person in cur_data['person_one']]
            person_two_label = [_demo_label(person) for person in cur_data['person_two']]

        # all persons
        _append_curve('all', labels, scores)

        if demo_dict is None:
            continue

        labels_arr = np.array(labels)
        scores_arr = np.array(scores)
        n_pairs = len(person_one_label)
        is_match = labels_arr == 1

        # Build one boolean mask per attribute value once per fold instead of
        # re-scanning every pair for each of the len(inspect_list)**2 pairings.
        one_is = {key: np.fromiter((lab == key for lab in person_one_label),
                                   dtype=bool, count=n_pairs)
                  for key in inspect_list}
        two_is = {key: np.fromiter((lab == key for lab in person_two_label),
                                   dtype=bool, count=n_pairs)
                  for key in inspect_list}

        # all combinations
        for key_1 in inspect_list:
            for key_2 in inspect_list:
                if only_equal and key_1 != key_2:
                    continue
                # Keep a pair when person one has key_1 and either person two has
                # key_2 or the pair is labelled as a match.
                use_ids = np.flatnonzero(one_is[key_1] & (two_is[key_2] | is_match))
                # str() on both keys so non-string attribute values do not raise.
                cur_key = str(key_1) + ' - ' + str(key_2)
                _append_curve(cur_key, labels_arr[use_ids], scores_arr[use_ids])
    return metric_dict

## GazeBase

In [4]:
### list of models
# Fold-0 result archives of every model evaluated on GazeBase. The evaluation
# cell derives the other folds by replacing 'fold0' in these paths.
# os.path.join avoids the doubled separator that plain concatenation produces
# when dst_dir already ends in '/'.
to_eval_files = [os.path.join(dst_dir, file_name) for file_name in (
                    'EKYT_fold0_biometric__dataset_gazebase_max_rounds4_num_folds5_window_size1.npz',
                    'ekyt_random_window_size_5000_sd_0.1_sd_factor_1.25_embedding_size_128_stimulus_video_model_random_-1baseline_1000_fold0_biometric__dataset_gazebase_max_rounds4_num_folds5_window_size1.npz',
                    'CLRGAZE_fold0_biometric__dataset_gazebase_max_rounds4_num_folds5_window_size1.npz',
                    'clrgaze_random_window_size_5000_sd_0.1_sd_factor_1.25_embedding_size_512_stimulus_video_model_random_-1baseline_1000_fold0_biometric__dataset_gazebase_max_rounds4_num_folds5_window_size1.npz',
                )]

# Row labels, parallel to to_eval_files. Entries containing 'fine-tuning' are
# additionally evaluated in their zero-shot variant by the evaluation cell.
to_eval_names = ['EKYT (w/o pre-training)',
                 'EKYT (CP SP-EyeGAN) fine-tuning',
                 'CLRGaze (w/o pre-training)',
                 'CLRGaze (CP SP-EyeGAN) fine-tuning',
                 ]

# params
window_size = 1
plot_points = 1000   # number of grid points on [0, 1] used to locate the EER
xscale = 'log'
plot_random = True
plot_statistics = True
fontsize=14
inspect_thresholds = [0.1,0.01,0.001]
folds = 5            # number of fold archives loaded per model

In [5]:
# Print one LaTeX table row per model for GazeBase: mean equal error rate (EER)
# over folds +/- its standard error, with a trailing '*' when a one-sided
# one-sample t-test finds 1 - EER significantly above 0.5 (p < 0.05).
# Models whose name contains 'fine-tuning' get a second row computed from the
# '*_zero_shot' arrays stored in the same archives.
decimals = 3
for m_i in range(len(to_eval_files)):
    # (suffix of the archive keys, row label, column separator) per printed row;
    # the separators reproduce the established output spacing exactly.
    row_settings = [('', to_eval_names[m_i], ' & ')]
    if 'fine-tuning' in to_eval_names[m_i]:
        print_name = to_eval_names[m_i].replace('fine-tuning','zero-shot')
        row_settings.append(('_zero_shot', print_name, ' &  '))

    for key_suffix, row_name, separator in row_settings:
        result_list = []
        for f_i in range(folds):
            fold_path = to_eval_files[m_i].replace('fold0','fold' + str(f_i))
            try:
                # The context manager closes the archive once the arrays are read.
                with np.load(fold_path) as cur_result:
                    result_list.append({'scores':cur_result['scores' + key_suffix],
                                        'labels':cur_result['labels' + key_suffix],
                                        'person_one':cur_result['person_one' + key_suffix],
                                        'person_two':cur_result['person_two' + key_suffix],
                                        })
            except (OSError, KeyError, ValueError) as load_error:
                # Still best-effort (a missing fold is skipped), but no longer
                # silent; stderr keeps the printed table itself clean.
                print('skipping ' + fold_path + ': ' + repr(load_error), file=sys.stderr)
                continue

        if len(result_list) == 0:
            # Without any fold there is no 'all' entry to evaluate.
            print('no fold results available for ' + row_name + '; row skipped',
                  file=sys.stderr)
            continue

        metric_dict = get_metric_dict_for_setting(  inspect_key = None,
                                                    inspect_list = [],
                                                    demo_dict = None,
                                                    result_list = result_list,
                                                    verbose = 0,
                                                  )

        fprs = metric_dict['all']['fprs']
        tprs = metric_dict['all']['tprs']
        eers = []
        # Common false-match-rate grid on which every fold's curve is sampled.
        fprs_inter = np.linspace(0, 1, plot_points)
        for fold_fpr, fold_tpr in zip(fprs, tprs):
            # False non-match rate as a function of false match rate.
            cur_inter = interpolate.interp1d(fold_fpr, 1 - fold_tpr)
            fnrs_inter = cur_inter(fprs_inter)
            # EER: grid point where both error rates are closest to each other.
            eers.append(fprs_inter[np.nanargmin(np.absolute(fnrs_inter - fprs_inter))])

        better_random_pvalue = ttest_1samp(a=1. - np.array(eers),
                                           popmean=0.5,
                                           alternative = 'greater').pvalue
        cur_p_value_add_str = '*' if better_random_pvalue < 0.05 else ''
        # NOTE(review): np.std defaults to ddof=0, so this standard error is based
        # on the population standard deviation of the fold EERs.
        print(row_name + separator + str(np.round(np.mean(eers),decimals=decimals)) +
              ' $ \\pm$ ' + str(np.round(np.std(eers) / np.sqrt(len(eers)),decimals=decimals)) +
              cur_p_value_add_str + '\\\\')

EKYT (w/o pre-training) & 0.165 $ \pm$ 0.004*\\
EKYT (CP SP-EyeGAN) fine-tuning & 0.169 $ \pm$ 0.003*\\
EKYT (CP SP-EyeGAN) zero-shot &  0.495 $ \pm$ 0.004\\
CLRGaze (w/o pre-training) & 0.181 $ \pm$ 0.003*\\
CLRGaze (CP SP-EyeGAN) fine-tuning & 0.188 $ \pm$ 0.003*\\
CLRGaze (CP SP-EyeGAN) zero-shot &  0.493 $ \pm$ 0.003\\


## JuDo

In [6]:
### list of models
to_eval_files = [   dst_dir + '/EKYT_fold0_biometric__dataset_judo_max_rounds4_num_folds5_window_size1.npz',
                    dst_dir + '/ekyt_random_window_size_5000_sd_0.1_sd_factor_1.25_embedding_size_128_stimulus_video_model_random_-1baseline_1000_fold0_biometric__dataset_judo_max_rounds4_num_folds5_window_size1.npz',
                    dst_dir + '/CLRGAZE_fold0_biometric__dataset_judo_max_rounds4_num_folds5_window_size1.npz',
                    dst_dir + '/clrgaze_random_window_size_5000_sd_0.1_sd_factor_1.25_embedding_size_512_stimulus_video_model_random_-1baseline_1000_fold0_biometric__dataset_judo_max_rounds4_num_folds5_window_size1.npz',
                    ]

to_eval_names = ['EKYT (w/o pre-training)',
                 'EKYT (CP SP-EyeGAN) fine-tuning',
                 'CLRGaze (w/o pre-training)',
                 'CLRGaze (CP SP-EyeGAN) fine-tuning',                 
                 ]



# params
window_size = 1
plot_points = 1000
xscale = 'log'
plot_random = True
plot_statistics = True
fontsize=14
inspect_thresholds = [0.1,0.01,0.001]
folds = 5

In [7]:
# Print one LaTeX table row per model for JuDo: mean equal error rate (EER)
# over folds +/- its standard error, with a trailing '*' when a one-sided
# one-sample t-test finds 1 - EER significantly above 0.5 (p < 0.05).
# Models whose name contains 'fine-tuning' get a second row computed from the
# '*_zero_shot' arrays stored in the same archives.
decimals = 3
for m_i in range(len(to_eval_files)):
    # (suffix of the archive keys, row label, column separator) per printed row;
    # the separators reproduce the established output spacing exactly.
    row_settings = [('', to_eval_names[m_i], ' & ')]
    if 'fine-tuning' in to_eval_names[m_i]:
        print_name = to_eval_names[m_i].replace('fine-tuning','zero-shot')
        row_settings.append(('_zero_shot', print_name, ' &  '))

    for key_suffix, row_name, separator in row_settings:
        result_list = []
        for f_i in range(folds):
            fold_path = to_eval_files[m_i].replace('fold0','fold' + str(f_i))
            try:
                # The context manager closes the archive once the arrays are read.
                with np.load(fold_path) as cur_result:
                    result_list.append({'scores':cur_result['scores' + key_suffix],
                                        'labels':cur_result['labels' + key_suffix],
                                        'person_one':cur_result['person_one' + key_suffix],
                                        'person_two':cur_result['person_two' + key_suffix],
                                        })
            except (OSError, KeyError, ValueError) as load_error:
                # Still best-effort (a missing fold is skipped), but no longer
                # silent; stderr keeps the printed table itself clean.
                print('skipping ' + fold_path + ': ' + repr(load_error), file=sys.stderr)
                continue

        if len(result_list) == 0:
            # Without any fold there is no 'all' entry to evaluate.
            print('no fold results available for ' + row_name + '; row skipped',
                  file=sys.stderr)
            continue

        metric_dict = get_metric_dict_for_setting(  inspect_key = None,
                                                    inspect_list = [],
                                                    demo_dict = None,
                                                    result_list = result_list,
                                                    verbose = 0,
                                                  )

        fprs = metric_dict['all']['fprs']
        tprs = metric_dict['all']['tprs']
        eers = []
        # Common false-match-rate grid on which every fold's curve is sampled.
        fprs_inter = np.linspace(0, 1, plot_points)
        for fold_fpr, fold_tpr in zip(fprs, tprs):
            # False non-match rate as a function of false match rate.
            cur_inter = interpolate.interp1d(fold_fpr, 1 - fold_tpr)
            fnrs_inter = cur_inter(fprs_inter)
            # EER: grid point where both error rates are closest to each other.
            eers.append(fprs_inter[np.nanargmin(np.absolute(fnrs_inter - fprs_inter))])

        better_random_pvalue = ttest_1samp(a=1. - np.array(eers),
                                           popmean=0.5,
                                           alternative = 'greater').pvalue
        cur_p_value_add_str = '*' if better_random_pvalue < 0.05 else ''
        # NOTE(review): np.std defaults to ddof=0, so this standard error is based
        # on the population standard deviation of the fold EERs.
        print(row_name + separator + str(np.round(np.mean(eers),decimals=decimals)) +
              ' $ \\pm$ ' + str(np.round(np.std(eers) / np.sqrt(len(eers)),decimals=decimals)) +
              cur_p_value_add_str + '\\\\')

EKYT (w/o pre-training) & 0.112 $ \pm$ 0.003*\\
EKYT (CP SP-EyeGAN) fine-tuning & 0.114 $ \pm$ 0.003*\\
EKYT (CP SP-EyeGAN) zero-shot &  0.49 $ \pm$ 0.002*\\
CLRGaze (w/o pre-training) & 0.109 $ \pm$ 0.002*\\
CLRGaze (CP SP-EyeGAN) fine-tuning & 0.124 $ \pm$ 0.004*\\
CLRGaze (CP SP-EyeGAN) zero-shot &  0.462 $ \pm$ 0.002*\\
