# Central Limit Theorem Demonstration

In this notebook, we test three example applications of the central limit theorem (CLT).

In [None]:
import numpy as np
import matplotlib.pyplot as plt
from matplotlib import patches
%matplotlib inline

In [None]:
# the theoretical normal curve is drawn in every example, so it lives in one helper
def plot_normal(mean,sigma,xmin=-1,xmax=1):
    """Draw the normal density with the given mean and stdev as a red line.

    The curve is evaluated on 1000 evenly spaced points between xmin and xmax
    and drawn with plt.plot.
    """
    grid = np.linspace(xmin, xmax, 1000)
    variance = sigma**2
    # unnormalised bell curve, then divide by sqrt(2*pi*variance) to get unit area
    bell = np.exp(-(grid - mean)**2 / (2*variance))
    density = bell / np.sqrt(2.0*np.pi*variance)
    plt.plot(grid, density, linestyle='-', color='red')

### Example 1: coin toss

Let's toss a coin $p=100$ times, write down the fraction of heads, and repeat this experiment $n=1000$ times. What is the probability distribution of the fraction of heads over the $n$ experiments?

NB: for a single toss of a fair coin, the stdev is $\sqrt{0.5 \times 0.5} = 0.5$ (assuming 0 is head, and 1 is tail), so by the CLT the average of $p$ tosses has stdev $0.5/\sqrt{p}$.

In [None]:
number_tosses = 100
number_experiments = 1000
experiment_probability = 0.5
# stdev of a single 0/1 toss with probability q is sqrt(q*(1-q)), i.e. 0.5 for a fair coin.
# (A hard-coded 0.35 corresponds to sqrt(q*(1-q)**2), which only counts one of the two
# outcomes in the variance and makes the theoretical curve too narrow.)
experiment_stdev = np.sqrt(experiment_probability*(1 - experiment_probability))

# define a function doing p tosses and returning the fraction of heads
def mean_heads(p):
    """Toss the coin p times and return the fraction of tosses that came up heads.

    The probability of heads is read from the module-level `experiment_probability`.
    """
    # 0 codes heads (probability experiment_probability) and 1 codes tails;
    # tails gets the complement so the two probabilities sum to 1 for any bias
    tosses = np.random.choice((0,1),size=p,p=(experiment_probability,1-experiment_probability))
    # the mean of the 0/1 codes is the fraction of tails; its complement is the fraction of heads
    fraction_tails = np.mean(tosses)
    return 1 - fraction_tails

# run the p-toss experiment n times and keep the mean of each run
all_means = np.array([mean_heads(number_tosses) for _ in range(number_experiments)])

# histogram of the experimental means mu_1 ... mu_n, normalised to unit area
fig, ax = plt.subplots()
ax.hist(all_means, bins="auto", density=True)

# CLT prediction: normal curve centred on the single-toss mean,
# with the single-toss stdev divided by sqrt(p)
theoretical_stdev = experiment_stdev/np.sqrt(number_tosses)
plot_normal(experiment_probability, theoretical_stdev)

ax.set_xlabel("Average of tosses")
ax.set_ylabel("Probability")
ax.set_xlim(-0.1, 1.1)
plt.show()

### Example 2: left-handed individuals

In a sample of $p=100$ individuals, how many are left-handed?

In the entire population, $10\%$ of people are left-handed (corresponding stdev = $\sqrt{0.1 \times 0.9} = 30\%$).

In [None]:
number_individuals = 100
number_experiments = 1000
total_mean = 0.1
# stdev of a single 0/1 (left-/right-handed) draw with probability q is sqrt(q*(1-q)),
# i.e. sqrt(0.1*0.9) = 0.3 here. (A hard-coded 0.2846 corresponds to sqrt(q*(1-q)**2),
# which only counts one of the two outcomes in the variance.)
total_stdev = np.sqrt(total_mean*(1 - total_mean))

# define a function that returns the fraction of left-handed people in one sample
def mean_left(p):
    """Draw a random sample of p individuals and return its left-handed fraction.

    The probability of being left-handed is read from the module-level `total_mean`.
    """
    # 0 codes a left-handed person (probability total_mean), 1 a right-handed one
    codes = (0, 1)
    weights = (total_mean, 1-total_mean)
    sample = np.random.choice(codes, size=p, p=weights)
    # the mean of the 0/1 codes is the right-handed fraction; return its complement
    return 1 - np.mean(sample)

# draw n independent samples and keep the left-handed fraction of each one
all_means = np.array([mean_left(number_individuals) for _ in range(number_experiments)])

# histogram of the experimental means mu_1 ... mu_n, normalised to unit area
fig, ax = plt.subplots()
ax.hist(all_means, bins="auto", density=True)

# CLT prediction, drawn over exactly the x-range the histogram occupies
x_limits = ax.get_xlim()
theoretical_stdev = total_stdev/np.sqrt(number_individuals)
plot_normal(total_mean, theoretical_stdev, xmin=x_limits[0], xmax=x_limits[1])

ax.set_xlabel("Average fraction of left-handed people in one sample")
ax.set_ylabel("Probability")
plt.show()

Using the same data, what can we tell about the 3-sigma interval?

In [None]:
# theoretical stdev of the sample mean according to CLT, and the 3-sigma boundaries around the mean
theoretical_stdev = total_stdev/np.sqrt(number_individuals)
low_bound = total_mean - 3*theoretical_stdev
high_bound = total_mean + 3*theoretical_stdev
print(f"In the group of {number_individuals} individuals, I am 99.7% certain that " +
      f"there will be between {low_bound*number_individuals:.1f} and {high_bound*number_individuals:.1f} " +
      f"left-handed individuals.")

# actual number of times the experiment was strictly within 3-sigma
# (all_means holds the sample means simulated in the previous cell)
number_within_3sigma = np.count_nonzero( (all_means > low_bound) & (all_means < high_bound) )
# the percentage is rounded to one decimal so floating-point noise is not printed
print(f"After repeating the experiment {number_experiments} times, " +
      f"there were between {low_bound*number_individuals:.1f} and {high_bound*number_individuals:.1f} left-handed individuals " +
      f"{number_within_3sigma/number_experiments*100:.1f} % of the time.")

# plot the histogram of mu_1 ... mu_n
plt.hist(all_means,bins="auto",density=True)

# plot the theoretical normal distribution according to CLT
ax = plt.gca() # get current axes
x_limits = ax.get_xlim() # get the x limits of the axes
plot_normal(total_mean,theoretical_stdev,*x_limits) # unpack the limits in the gaussian

# add a box for 3-sigma confidence level, spanning the full current height of the axes
y_limits = ax.get_ylim() # get the y limits of the axes
confidence_3sigma_rectangle = patches.Rectangle((low_bound,y_limits[0]),high_bound-low_bound,y_limits[1]-y_limits[0],
                              edgecolor="None",
                              facecolor='red',
                              alpha=0.2)
ax.add_patch(confidence_3sigma_rectangle)

plt.xlabel("Average fraction of left-handed people in one sample")
plt.ylabel("Probability")
plt.show()

### Example 3: continuous case

Previous experiments were discrete: there are only 2 possible outcomes.

Here, let's consider an experiment where the random variable is drawn from a continuous probability distribution $P(x)$: the beta distribution.

In [None]:
# first, let's just plot our probability density P(x)
# shape parameters of the beta distribution P(x) used in this example
alpha = 1
beta = 10

# first, let's just plot our probability density P(x), estimated from a large number of draws
draws = np.random.beta(alpha,beta,size=1_000_000)
# density=True normalises the histogram to unit area, so it shows a density rather than raw counts
plt.hist(draws,bins="auto",density=True)
plt.xlabel("x")
plt.ylabel("P(x)")
plt.show()

In [None]:
number_draws = 2
number_experiments = 100_000
# closed-form mean and stdev of a single draw from the beta distribution with shapes (alpha, beta)
shape_sum = alpha + beta
experiment_probability = alpha/shape_sum
experiment_stdev = np.sqrt( (alpha*beta)/(shape_sum**2*(shape_sum+1)) )

# define a function making p draws from the beta distribution and returning their average
def mean_draws(p):
    """Return the mean of p random draws from the beta distribution.

    The shape parameters are read from the module-level `alpha` and `beta`.
    """
    sample = np.random.beta(alpha, beta, size=p)
    return np.mean(sample)

# run the p-draw experiment n times and keep the mean of each run
all_means = np.array([mean_draws(number_draws) for _ in range(number_experiments)])

# histogram of the experimental means mu_1 ... mu_n, normalised to unit area
fig, ax = plt.subplots()
ax.hist(all_means, bins="auto", density=True)

# CLT prediction, drawn over exactly the x-range the histogram occupies
x_limits = ax.get_xlim()
theoretical_stdev = experiment_stdev/np.sqrt(number_draws)
plot_normal(experiment_probability, theoretical_stdev, xmin=x_limits[0], xmax=x_limits[1])

ax.set_xlabel("Average of draws")
ax.set_ylabel("Probability")
plt.show()