In [1]:
%load_ext rpy2.ipython


# Check Wilson interval


In [2]:
from utils import *

# Wilson-interval sanity checks on 100 instances with unit variance.
# Each pair is (number of errors, alpha); the R cell below runs the same
# five scenarios through get_wilson.
for n_err, level in [(1, 0.05), (0, 0.05), (100, 0.05), (50, 0.05), (50, 0.2)]:
    print(get_ci_wilson(n_errors=n_err, n_instances=100,
                        mean_variance=1, alpha=level))


[0.0005129329438755058, 0.9974806313947331]
[0.0, 0.03699349820698568]
[0.9630065017930143, 1.0]
[0.015516486400631463, 0.9844835135993687]
[0.03419670540131324, 0.9658032945986867]


In [3]:
%%R
library(here)
source(here('R-code/utils.R'))
# Five 0/1 outcome vectors of length 100 paired with the alpha to use;
# they mirror the n_errors / alpha combinations of the Python check above.
wilson_cases <- list(
  list(G=c(1, rep(0, 99)), alpha=0.05),
  list(G=rep(0, 100), alpha=0.05),
  list(G=rep(1, 100), alpha=0.05),
  list(G=c(rep(1, 50), rep(0, 50)), alpha=0.05),
  list(G=c(rep(1, 50), rep(0, 50)), alpha=0.2)
)
for (wilson_case in wilson_cases) {
  print(get_wilson(G=wilson_case$G, alpha=wilson_case$alpha,
                   sample_size=100, var_sa=1))
}


R[write to console]: here() starts at /local/home/fogliato/cis-matching-tasks



[1] 0.0005129329 0.9974806314
[1] 0.0000000 0.0369935
[1] 0.9630065 1.0000000
[1] 0.01551649 0.98448351
[1] 0.03419671 0.96580329


# Check whether the R and Python code compute the same intervals on generated data


In [4]:
%%R
library(dplyr)
library(purrr)
library(readr)

# Seed the RNG so the simulated embeddings -- and every interval compared
# against Python further down -- can be regenerated exactly.
# NOTE(review): assumes gen_data_emb draws from R's global RNG; confirm in utils.R.
set.seed(1)

G <- gen_data_emb(n_ids=50, m=5, d=128, distribution_id="exponential",
                  sigma2within=5, sigma2between=NA, m_fixed=TRUE)

# Export the embeddings together with the identity label and a
# within-identity image index so the Python cells can rebuild the same data.
# NOTE(review): absolute path; the Python cell reads this exact location, so
# change both together.
G$emb %>%
  data.frame() %>%
  mutate(id=G$gt) %>%
  group_by(id) %>%
  mutate(image=seq_len(n())) %>%
  ungroup() %>%
  write_csv("/home/fogliato/cis-matching-tasks/results/temp/r-py-exp/emb.csv")

# One entry per threshold (a single threshold here).
thresholds <- 1.2
mat_acc <- thresholds %>% map(~ get_genuine_and_impostor_mat(G, .x))
out <- mat_acc %>%
  map(~ compute_matrix_at_id_level(.x$genuine_mat_acc, .x$impostor_mat_acc, m=G$m))

# Per-threshold means of the genuine / impostor matrices, ignoring NAs.
current_fnmr <- mat_acc %>% map_dbl(~ mean(.x$genuine_mat_acc, na.rm=TRUE))
current_fmr <- mat_acc %>% map_dbl(~ mean(.x$impostor_mat_acc, na.rm=TRUE))


R[write to console]: 
Attaching package: ‘dplyr’


R[write to console]: The following objects are masked from ‘package:stats’:

    filter, lag


R[write to console]: The following objects are masked from ‘package:base’:

    intersect, setdiff, setequal, union




## Check Wilson


In [5]:
%%R
# Pick the matrices for the first threshold (only one threshold is defined).
i <- 1
genuine_mat_acc <- mat_acc[[i]]$genuine_mat_acc
impostor_mat_acc <- mat_acc[[i]]$impostor_mat_acc
fnmr_mat <- out[[i]]$fnmr_mat
fmr_mat <- out[[i]]$fmr_mat
m_mat <- out[[i]]$m_mat

# Variance estimates that are plugged into the Wilson intervals below.
sample_analogue_var <- get_sample_analogue_var(
  genuine_mat_acc=genuine_mat_acc,
  impostor_mat_acc=impostor_mat_acc,
  fnmr_mat=fnmr_mat,
  fmr_mat=fmr_mat,
  m=G$m,
  m_mat=m_mat
)

# 95% Wilson intervals: FMR first, then FNMR.
print(get_wilson(G=impostor_mat_acc, alpha=0.05, sample_size=length(G$m),
                 var_sa=sample_analogue_var$var_fmr))

print(get_wilson(G=genuine_mat_acc, alpha=0.05, sample_size=length(G$m),
                 var_sa=sample_analogue_var$var_fnmr))


[1] 0.04782364 0.07531378
[1] 0.4039680 0.5569698


In [6]:
import pandas as pd
# Read back the embeddings exported by the R cell.
# NOTE(review): absolute path; it must stay identical to the path the R cell
# writes to.
emb = pd.read_csv(
    '/home/fogliato/cis-matching-tasks/results/temp/r-py-exp/emb.csv')

# Embedding columns X1..X128, built once instead of on every row.
feature_cols = ['X' + str(x) for x in range(1, 129)]

# Nested mapping {identity: {image index: embedding vector}}.
# The loop variable is deliberately not called `id`, so the builtin `id`
# is not shadowed for the rest of the kernel session.
df = {}
for _row_label, row in emb.iterrows():
    identity = int(row['id'])
    df.setdefault(identity, {})[int(row['image'])] = row[feature_cols].values

df_sim = generate_similarity_scores(df, method="l2")
def multiply_dict_by_1_over(nested_dict):
    """Return a copy of ``nested_dict`` with every list entry mapped to ``-1 / x``.

    The structure is walked recursively: nested dictionaries are rebuilt,
    lists are replaced by a new list holding ``-1 / x`` for each element,
    and any other value is carried over unchanged. The input is not modified.

    Raises:
        ZeroDivisionError: if a list contains a plain zero.
    """
    def _convert(entry):
        if isinstance(entry, dict):
            return multiply_dict_by_1_over(entry)
        if isinstance(entry, list):
            return [- 1 / score for score in entry]
        return entry

    return {key: _convert(entry) for key, entry in nested_dict.items()}


# Map every score s to -1 / s.
df_dist = multiply_dict_by_1_over(df_sim)

# The R cell thresholds at 1.2; the sign is flipped here, presumably to match
# the negated scores produced above.
threshold = - 1.2
df_error = generate_errors(df=df_dist, threshold=threshold)
fnmr, fmr = compute_error_metrics(df_error)
fnmr_var, fmr_var = estimate_var_error_metrics(df_error)

# Number of entries stored for pairs of distinct identities.
tn = sum(
    len(df_error[id1][id2])
    for id1 in df_error
    for id2 in df_error
    if id1 != id2
)
print(get_ci_wilson(n_errors=fmr * tn, n_instances=tn,
                    mean_variance=fmr_var, alpha=0.05))

# Number of entries stored for each identity paired with itself.
tp = sum(len(df_error[id][id]) for id in df_error.keys())
print(get_ci_wilson(n_errors=fnmr * tp, n_instances=tp,
                    mean_variance=fnmr_var, alpha=0.05))


[0.047823639564186345, 0.07531377933354634]
[0.40396800754006296, 0.5569698402667973]


## Check double-or-nothing bootstrap


In [7]:
%%R
n_boot = 1e3

# Pull the per-threshold matrices out of `out` once; they do not change
# between bootstrap replicates, so there is no need to rebuild the lists
# on every iteration.
db_fmr_mats <- out %>% map(~ .x$fmr_mat)
db_fnmr_mats <- out %>% map(~ .x$fnmr_mat)
db_m_mats <- out %>% map(~ .x$m_mat)

# One double-or-nothing bootstrap replicate per iteration, row-bound into
# a single data frame and tagged with the threshold.
out_db <- seq_len(n_boot) %>%
  map_dfr(~ get_bootstrap_db(fmr_mat=db_fmr_mats, fnmr_mat=db_fnmr_mats,
                             m_mat=db_m_mats) %>%
            mutate(threshold=thresholds))

# 90% percentile intervals (5% and 95% quantiles) for FNMR and FMR.
print(quantile(out_db$fnmr, probs=c(0.05, 0.95), na.rm=TRUE))
print(quantile(out_db$fmr, probs=c(0.05, 0.95), na.rm=TRUE))


       5%       95% 
0.4184259 0.5500000 
        5%        95% 
0.04757427 0.07326798 


In [8]:
# Double-or-nothing bootstrap with alpha=0.1, comparable to the 5%/95%
# quantiles printed by the R cell above.
print(
    get_ci_boot(
        df=df_error,
        alpha=0.1,
        B=1000,
        method='dbn',
        parallel=True,
    )
)


([0.419004329004329, 0.55], [0.04723192170148692, 0.07352564935064934])


## Check vertex bootstrap


In [9]:
%%R
# Pull the per-threshold matrices out of `out` once; they do not change
# between bootstrap replicates, so there is no need to rebuild the lists
# on every iteration.
vertex_fmr_mats <- out %>% map(~ .x$fmr_mat)
vertex_fnmr_mats <- out %>% map(~ .x$fnmr_mat)
vertex_m_mats <- out %>% map(~ .x$m_mat)

# One vertex bootstrap replicate per iteration (n_boot is set in the
# double-or-nothing cell), row-bound and tagged with the threshold.
out_vertex <- seq_len(n_boot) %>%
  map_dfr(~ get_bootstrap_vertex(fmr_mat=vertex_fmr_mats, fnmr_mat=vertex_fnmr_mats,
                                 m_mat=vertex_m_mats) %>%
            mutate(threshold=thresholds))

# 90% percentile intervals (5% and 95% quantiles) for FNMR and FMR.
print(quantile(out_vertex$fnmr, probs=c(0.05, 0.95), na.rm=TRUE))
print(quantile(out_vertex$fmr, probs=c(0.05, 0.95), na.rm=TRUE))


   5%   95% 
0.410 0.544 
        5%        95% 
0.04861668 0.07265320 


In [10]:
# Vertex bootstrap with alpha=0.1, comparable to the 5%/95% quantiles
# printed by the R cell above.
print(
    get_ci_boot(
        df=df_error,
        alpha=0.1,
        B=1000,
        method='vertex',
        parallel=True,
    )
)


([0.416, 0.544], [0.049008601386089624, 0.07237689296060597])


## Check pointwise ROC intervals


In [11]:
%%R
source(here('R-code/utils.R'))
source(here('R-code/utils_roc.R'))

# Pointwise intervals at five target FMR values.
# NOTE(review): this rebinds `out`, which until now held the per-threshold
# matrices consumed by the bootstrap cells; those cells cannot be re-run
# after this one without first re-running the cell that builds the matrices.
out <- get_ci_for_fixed_fmr(
  G, length(G$m),
  m=G$m,
  alpha=0.05,
  target_fmr=c(0, 0.01, 0.1, 0.5, 1),
  alpha_fmr=0.05
)


In [12]:
%%R
# Wilson rows only: FNMR interval bounds at each target FMR.
out %>%
  filter(type == 'wilson') %>%
  select(target_fmr, lb_fnmr, ub_fnmr)


# A tibble: 5 × 3
  target_fmr  lb_fnmr ub_fnmr
       <dbl>    <dbl>   <dbl>
1       0    9.45e- 1 0.987  
2       0.01 6.12e- 1 0.810  
3       0.1  2.90e- 1 0.489  
4       0.5  5.20e- 2 0.144  
5       1    4.30e-19 0.00762


In [13]:
target_fpr = [0, 0.01, 0.1, 0.5, 1]
ci_roc_wilson = get_ci_roc_wilson(
    df=df_sim, alpha=0.05, target_fpr=target_fpr, alpha_fpr=0.05)

# Report 1 - TPR and the matching interval (bounds complemented, then
# reversed so the smaller value comes first), to line up with the
# lb_fnmr / ub_fnmr columns of the R output.
for fpr in target_fpr:
    pos = ci_roc_wilson['fpr'].index(fpr)
    flipped_bounds = [1 - bound for bound in ci_roc_wilson['tpr_cis'][pos]]
    flipped_bounds = flipped_bounds[::-1]
    print(fpr, 1 - ci_roc_wilson['tpr'][pos], flipped_bounds)


0 0.976 [0.9447478435026352, 0.9871966319061259]
0.01 0.724 [0.6115980271921961, 0.8100070701947348]
0.1 0.392 [0.2899347981488063, 0.4888522817300892]
0.5 0.08999999999999997 [0.05202600839833793, 0.1440606039743456]
1 0.0 [0.0, 0.007624340461552248]


In [14]:
%%R
# Double-or-nothing bootstrap rows only: FNMR interval bounds at each target FMR.
out %>%
  filter(type == 'db') %>%
  select(target_fmr, lb_fnmr, ub_fnmr)


# A tibble: 5 × 3
  target_fmr lb_fnmr ub_fnmr
       <dbl>   <dbl>   <dbl>
1       0     0.924    0.988
2       0.01  0.652    0.783
3       0.1   0.323    0.452
4       0.5   0.0577   0.123
5       1     0        0    


In [15]:
target_fpr = [0, 0.01, 0.1, 0.5, 1]
ci_roc_dbn = get_ci_roc_boot(
    df=df_sim, alpha=0.05, B=1000, method="dbn",
    target_fpr=target_fpr, parallel=True)

# Report 1 - TPR and the matching interval (bounds complemented, then
# reversed so the smaller value comes first), to line up with the
# lb_fnmr / ub_fnmr columns of the R output.
# NOTE(review): in the recorded output the target-0 estimate is 1.0, which is
# outside its own interval and differs from the Wilson cell (0.976); the
# target-1 upper bound is also nonzero while R reports 0. Worth confirming
# how get_ci_roc_boot handles the endpoints.
for fpr in target_fpr:
    pos = ci_roc_dbn['fpr'].index(fpr)
    flipped_bounds = [1 - bound for bound in ci_roc_dbn['tpr_cis'][pos]]
    flipped_bounds = flipped_bounds[::-1]
    print(fpr, 1 - ci_roc_dbn['tpr'][pos], flipped_bounds)


0 1.0 [0.9190062111801243, 0.9894736842105263]
0.01 0.724 [0.6479956521739131, 0.7851984126984127]
0.1 0.392 [0.32105263157894737, 0.44232142857142853]
0.5 0.08999999999999997 [0.05515673981191227, 0.12083333333333335]
1 0.0 [0.0, 0.007624340461552137]
