### Bootstrap Confidence Intervals

- Bootstrap: Resample (with replacement) the data a number of times and calculate the statistic of interest

In [58]:
import numpy as np
import pandas as pd

from scipy.stats import norm
from sklearn.utils import resample
from sklearn.metrics import accuracy_score
from sklearn.tree import DecisionTreeClassifier

# Using statsmodel
from statsmodels.stats.proportion import proportion_confint
from statsmodels.stats.proportion import multinomial_proportions_confint

In [3]:
df = pd.read_csv('./pima-indians-diabetes.txt', header=None)
values = df.values

In [31]:
# Bootstrap
n_iterations = 100
n_size = int(len(df) * 0.5)

stats = []
for i in range(n_iterations):
    train = resample(values, n_samples=n_size)
    test = np.array([x for x in values if x.tolist() not in train.tolist()])
    model = DecisionTreeClassifier()
    model.fit(train[:,:-1], train[:,-1])
    pred = model.predict(test[:,:-1])
    acc = accuracy_score(test[:,-1], pred)
    stats.append(acc)
    
# Calculate accuracy C.I.
alpha = 0.99
p = ((1.0 - alpha) / 2) * 100
lower = max(0.0, np.percentile(stats, p))
p = (alpha + (1 - alpha) / 2) * 100
upper = min(1.0, np.percentile(stats, p))
print('{} Empirical C.I. for accuracy: {} - {}'.format(alpha*100, lower*100, upper*100))

99.0 Empirical C.I. for accuracy: 63.73296032787641 - 74.13435444271605


### Annotation Error

- Consider a user with an error of 0.2 on HoneyPot dataset with 100 examples
- Z score values when using 95% C.I. are -1.96 and 1.96. The associated p-value with 95% C.I. is 0.05

$$\widehat{p} \pm z \sqrt{\frac{\widehat{p}(1-\widehat{p})}{n}}$$

In [86]:
alpha = 0.95
n_sample = 100
z_score = norm.ppf(1-(1-alpha)/2)  # Incorporating ignored lower tail
accuracy = 0.8

def confidence_interval(accuracy, n_sample, z_score):
    std_error = z_score * np.sqrt((accuracy * (1.0 - accuracy)) / n_sample)
    lower = max(0.0, accuracy - std_error)
    upper = min(1.0, accuracy + std_error)
    return lower, upper

print('95% C.I.: ', confidence_interval(accuracy, n_sample, z_score))
print()

for n in [100, 75, 50, 30, 25, 20, 15, 10]:
    l, u = confidence_interval(accuracy, n, z_score)
    print('n_sample: {} -> lower: {} and upper: {}'.format(n, l, u))

95% C.I.:  (0.7216014406183979, 0.8783985593816022)

n_sample: 100 -> lower: 0.7216014406183979 and upper: 0.8783985593816022
n_sample: 75 -> lower: 0.7094731412739064 and upper: 0.8905268587260937
n_sample: 50 -> lower: 0.6891276940520258 and upper: 0.9108723059479743
n_sample: 30 -> lower: 0.6568644685026275 and upper: 0.9431355314973726
n_sample: 25 -> lower: 0.6432028812367957 and upper: 0.9567971187632044
n_sample: 20 -> lower: 0.6246954918846838 and upper: 0.9753045081153163
n_sample: 15 -> lower: 0.5975757900989345 and upper: 1.0
n_sample: 10 -> lower: 0.5520819870781755 and upper: 1.0


In [61]:
# Statsmodel: It is one-liner
alpha = 0.05  # Level of significance: 1.0 - 0.95 
correct = 18
n_observations = 20

lower, upper = proportion_confint(correct, n_observations, alpha=alpha)
print(lower, upper)  # NOTE: default method = 'normal'

0.7685216189135128 1.0


#### Wilson Score Interval

$$\frac{\widehat{p} + \frac{z^2}{2n}}{1+\frac{z^2}{n}} \pm \frac{z}{1+\frac{z^2}{n}}\sqrt{\frac{\widehat{p}(1-\widehat{p})}{n}+\frac{z^2}{4n^2}}$$


In [62]:
def wilson_score_interval(accuracy, n_sample, z_score):
    pass

alpha = 0.05  # Level of significance: 1.0 - 0.95 
correct = 18
n_observations = 20

lower, upper = proportion_confint(correct, n_observations, alpha=alpha, method='wilson')
print(lower, upper)  

0.6989663547715127 0.9721335187862318


In [63]:
# Agresti-Coull interval

alpha = 0.05  # Level of significance: 1.0 - 0.95 
correct = 18
n_observations = 20

lower, upper = proportion_confint(correct, n_observations, alpha=alpha, method='agresti_coull')
print(lower, upper) 

0.6867561125596074 0.984343760998137


### A user annotates 75 images with 3 classes (25 images in each class) out of total 150 HoneyPot images. Find `Confidence Interval` of accuracy for each class

In [155]:
# User annotations
class_1 = np.random.choice([0, 1], size=25, p=[0.2, 0.8])  # 80 % correct
class_2 = np.random.choice([0, 1], size=25, p=[0.17, 0.83])  # 83 % correct
class_3 = np.random.choice([0, 1], size=25, p=[0.3, 0.7])  # 70 % correct

# Bootstrap
def bootstrap(annotation, n_iterations=1000, n_samples=15):
    stats = []
    correct = []
    for i in range(n_iterations):
        temp = resample(annotation, n_samples=n_samples)
        accuracy = np.sum(temp)/float(n_samples)
        stats.append(accuracy)
        correct.append(np.sum(temp))
    return np.array(stats), np.array(correct)

class_1_bs, correct_1 = bootstrap(class_1)
class_2_bs, correct_2 = bootstrap(class_2)
class_3_bs, correct_3 = bootstrap(class_3)

# Calculate mean accuracy
class_1_acc = np.mean(class_1_bs)
class_2_acc = np.mean(class_2_bs)
class_3_acc = np.mean(class_3_bs)

alpha = 0.95
n_sample = 1000  # It should be equal to n_iterations
z_score = norm.ppf(1-(1-alpha)/2)
for i, acc in enumerate([class_1_acc, class_2_acc, class_3_acc]):
    lower, upper = confidence_interval(acc, n_sample, z_score)
    print('Class-{} accuracy C.I.: {} and {}'.format(i, lower, upper))

Class-0 accuracy C.I.: 0.9023088670895956 and 0.9360911329104045
Class-1 accuracy C.I.: 0.6947133702534037 and 0.7502199630799294
Class-2 accuracy C.I.: 0.6558679838820715 and 0.7134653494512618


In [156]:
n_observations = 1000 * 15
alpha = 0.05  # Level of significance: 1.0 - 0.95

for i, corr in enumerate([correct_1, correct_2, correct_3]):
    for m in ['normal', 'wilson', 'agresti_coull', 'beta', 'jeffreys', 'binom_test']:
        lower, upper = proportion_confint(np.sum(corr), n_observations, alpha=alpha, method=m)
        print('Method: "{}" Class-{} accuracy C.I.: {} and {}'.format(m, i, lower, upper))
    print()

Method: "normal" Class-0 accuracy C.I.: 0.9148387282359618 and 0.9235612717640382
Method: "wilson" Class-0 accuracy C.I.: 0.9147306374845545 and 0.9234547055501536
Method: "agresti_coull" Class-0 accuracy C.I.: 0.9147293176058364 and 0.9234560254288717
Method: "beta" Class-0 accuracy C.I.: 0.9147236114531655 and 0.9235129977659319
Method: "jeffreys" Class-0 accuracy C.I.: 0.914757758550864 and 0.9234804912757516
Method: "binom_test" Class-0 accuracy C.I.: 0.9147082353114473 and 0.9234748002179807

Method: "normal" Class-1 accuracy C.I.: 0.7153007963459384 and 0.729632536987395
Method: "wilson" Class-1 accuracy C.I.: 0.715244528838758 and 0.7295748874632174
Method: "agresti_coull" Class-1 accuracy C.I.: 0.7152443025087176 and 0.7295751137932578
Method: "beta" Class-1 accuracy C.I.: 0.7152247851262975 and 0.7296221762305688
Method: "jeffreys" Class-1 accuracy C.I.: 0.7152583769773079 and 0.7295891143265029
Method: "binom_test" Class-1 accuracy C.I.: 0.715237414571519 and 0.72960443346125

### Multinomial Proportions Confint

In [65]:
# Problem: A vector of positive integers representing the number of occurrences of 
# each class. The total number of samples equals the sum of such elements.

correct = np.array([8, 10, 15])  
alpha = 0.05
conf_int = multinomial_proportions_confint(correct, alpha=alpha)
conf_int

array([[0.11133872, 0.44973797],
       [0.15300691, 0.51134585],
       [0.26961394, 0.652929  ]])