In [16]:
import numpy as np
import pandas as pd

In [21]:
n = 10 # Original sample size
x = np.random.normal(size=n) # Normal(0,1) distribution, n samples
print(x)

[ 0.99598547 -0.59008227 -0.21184559 -0.51103425  1.2337651   1.01887457
 -0.25309936 -0.175181   -0.29014381 -0.70293388]


Let's investigate the sampling error (standard deviation) of $\frac{1}{n}\sum^n_{i=1}x_i$
- Its theoretical value is: $true\_error \equiv \frac{\sigma}{\sqrt{n}}$，here $\sigma=1$，i.e. $\frac{1}{\sqrt{n}}$
- The estimation with the current sample is: $est\_error \equiv \frac{\hat{\sigma}}{\sqrt{n}}$, where $\hat{\sigma} = \sqrt{\frac{1}{n}\sum^n_{i=1}(x_i - \frac{1}{n}\sum^n_{i=1}x_i)^2}$

In [22]:
x_mean = np.mean(x)
x_std = np.std(x) # sigma hat
x_mean_std = x_std / np.sqrt(n)
print("true_error: ", 1/np.sqrt(n))
print("est_error: ", x_mean_std)

true_error:  0.31622776601683794
est_error:  0.22029744837850462


Boostrap error:

In [23]:
B = 10000
boot = list()
for i in range(B):
    boot.append(np.random.choice(x, n))

In [24]:
boot[0:3]

[array([-0.25309936,  1.2337651 , -0.59008227, -0.29014381, -0.59008227,
        -0.25309936, -0.175181  , -0.70293388,  1.01887457, -0.175181  ]),
 array([ 0.99598547, -0.29014381, -0.21184559, -0.70293388, -0.70293388,
        -0.175181  , -0.70293388,  0.99598547, -0.175181  , -0.59008227]),
 array([-0.59008227,  1.01887457, -0.51103425, -0.175181  ,  1.2337651 ,
        -0.51103425,  1.2337651 ,  0.99598547,  0.99598547, -0.175181  ])]

In [25]:
boot_mean = np.full(shape=B,fill_value=np.nan)
for i in range(len(boot)):
    boot_mean[i] = np.mean(boot[i])

In [26]:
boot_mean[0:3]

array([-0.07771633, -0.15592644,  0.35158629])

In [27]:
x_mean_std_boot = np.sqrt(np.sum((boot_mean - np.mean(boot_mean))**2)/B) # bootstrapped error estimation

👆 It should be similar to x_mean_std

In [28]:
print("est_error: ", x_mean_std)
print("bootstrapped_error: ", x_mean_std_boot)

est_error:  0.22029744837850462
bootstrapped_error:  0.2217568637184773


When n is small, x is not a good sampling of N(0,1):

In [29]:
print("true_error: ", 1/np.sqrt(n))
print("est_error: ", x_mean_std)
print("bootstrapped_error: ", x_mean_std_boot)

true_error:  0.31622776601683794
est_error:  0.22029744837850462
bootstrapped_error:  0.2217568637184773


The difference of the first and the second is determined by n; the difference of the second and the third is determined by B

When n is large, and when our sampling procedure is good (iid in our case), all the above three will be close.

In [30]:
n = 10000 # Original sample size
x = np.random.normal(size=n) # Normal(0,1) distribution, n samples
x_mean = np.mean(x)
x_std = np.std(x) # sigma hat
x_mean_std = x_std / np.sqrt(n)

B = 10000
boot = list()
for i in range(B):
    boot.append(np.random.choice(x, n))

boot_mean = np.full(shape=B,fill_value=np.nan)
for i in range(len(boot)):
    boot_mean[i] = np.mean(boot[i])

x_mean_std_boot = np.sqrt(np.sum((boot_mean - np.mean(boot_mean))**2)/B) # bootstrapped error estimation

print("true_error: ", 1/np.sqrt(n))
print("est_error: ", x_mean_std)
print("bootstrapped_error: ", x_mean_std_boot)

true_error:  0.01
est_error:  0.009973278879004118
bootstrapped_error:  0.009912513676025556
