In [19]:
import numpy as np
import pandas as pd

In [24]:
n = 10 # Original sample size
x = np.random.normal(size=n) # Normal(0,1) distribution, n samples
print(x)

[-1.44224634 -0.06085965  2.29799449  1.69761223 -1.13221641  0.8557239
 -1.43625455  1.5197363   1.27169636  1.14259395]


Let's investigate the sampling error (standard deviation) of $\frac{1}{n}\sum^n_{i=1}x_i$
- Its theoretical value is: $true\_error \equiv \frac{\sigma}{\sqrt{n}}$，here $\sigma=1$，i.e. $\frac{1}{\sqrt{n}}$
- The estimation with the current sample is: $est\_error \equiv \frac{\hat{\sigma}}{\sqrt{n}}$, where $\hat{\sigma} = \sqrt{\frac{1}{n}\sum^n_{i=1}(x_i - \frac{1}{n}\sum^n_{i=1}x_i)^2}$

In [25]:
x_mean = np.mean(x)
x_std = np.std(x) # sigma hat
x_mean_std = x_std / np.sqrt(n)
print("true_error: ", 1/np.sqrt(n))
print("est_error: ", x_mean_std)

true_error:  0.31622776601683794
est_error:  0.41633889344452013


Boostrap error:

In [26]:
B = 10000
boot = list()
for i in range(B):
    boot.append(np.random.choice(x, n))

In [28]:
len(boot)

10000

In [27]:
boot[0:3]

[array([-1.44224634,  0.8557239 ,  1.69761223,  2.29799449, -0.06085965,
         1.5197363 , -1.43625455,  0.8557239 ,  2.29799449, -0.06085965]),
 array([-1.43625455, -1.44224634, -0.06085965, -1.44224634, -0.06085965,
        -1.13221641, -1.44224634, -1.13221641, -1.44224634, -1.44224634]),
 array([-1.13221641,  1.14259395, -1.13221641, -0.06085965,  2.29799449,
        -0.06085965, -1.44224634,  0.8557239 , -1.44224634, -1.13221641])]

In [29]:
boot_mean = np.full(shape=B,fill_value=np.nan)
for i in range(len(boot)):
    boot_mean[i] = np.mean(boot[i])

In [30]:
boot_mean[0:3]

array([ 0.65245651, -1.10336384, -0.21065489])

In [31]:
len(boot_mean)

10000

In [32]:
x_mean_std_boot = np.sqrt(np.sum((boot_mean - np.mean(boot_mean))**2)/B) # bootstrapped error estimation

👆 It should be similar to x_mean_std

In [33]:
print("est_error: ", x_mean_std)
print("bootstrapped_error: ", x_mean_std_boot)

est_error:  0.41633889344452013
bootstrapped_error:  0.4156634679680803


When n is small, x is not a good sampling of N(0,1):

In [34]:
print("true_error: ", 1/np.sqrt(n))
print("est_error: ", x_mean_std)
print("bootstrapped_error: ", x_mean_std_boot)

true_error:  0.31622776601683794
est_error:  0.41633889344452013
bootstrapped_error:  0.4156634679680803


The difference of the first and the second is determined by n; the difference of the second and the third is determined by B

When n is large, and when our sampling procedure is good (iid in our case), all the above three will be close.

In [35]:
n = 10000 # Original sample size
x = np.random.normal(size=n) # Normal(0,1) distribution, n samples
x_mean = np.mean(x)
x_std = np.std(x) # sigma hat
x_mean_std = x_std / np.sqrt(n)

B = 10000
boot = list()
for i in range(B):
    boot.append(np.random.choice(x, n))

boot_mean = np.full(shape=B,fill_value=np.nan)
for i in range(len(boot)):
    boot_mean[i] = np.mean(boot[i])

x_mean_std_boot = np.sqrt(np.sum((boot_mean - np.mean(boot_mean))**2)/B) # bootstrapped error estimation

print("true_error: ", 1/np.sqrt(n))
print("est_error: ", x_mean_std)
print("bootstrapped_error: ", x_mean_std_boot)

true_error:  0.01
est_error:  0.010051593847980402
bootstrapped_error:  0.010110623679913083
