In [None]:
import numpy as np
from matplotlib import pyplot as plt
import scipy.stats as stats

### Generate a "population" (very large N) from which we'll sample

In [None]:
# Parameters of the underlying ("true") normal distribution, plus the number
# of draws used to build the population that later cells sample from.
mean = 12.0
sdev = 1.5
Ntotal = 100000
population = np.random.normal(loc=mean, scale=sdev, size=Ntotal)

# Statistics actually realised by this finite population (printed so they can
# be compared against the requested mean/sdev above).
pop_mean = population.mean()
pop_std = population.std()
print(f"mean = {pop_mean:.2f}, sdev = {pop_std:.2f}")

### Plot "perfect" histogram:

In [None]:
# Plot window: 4 standard deviations either side of the true mean.
x_lo, x_hi = mean - 4 * sdev, mean + 4 * sdev

# Evaluation grid for the analytic curve; `x` is reused by later plotting cells.
x = np.linspace(x_lo, x_hi, 100)

plt.xlim(x_lo, x_hi)
plt.hist(population, bins=100, density=True)

# Overlay the analytic normal pdf for comparison with the histogram
true_pdf = stats.norm.pdf(x, mean, sdev)
plt.plot(x, true_pdf, label="Gaussian", linewidth=3)

plt.ylabel("Probability density (normalised)")
plt.legend()
plt.show()

### Simulate reality: Randomly sample from the population:

In [None]:
# Draw one finite sample from the population.
sample_size = 50
sample = np.random.choice(population, size=sample_size)

# Estimates obtained from this one sample only.
s_mean = sample.mean()
s_sdev = sample.std()  # default (uncorrected) standard deviation
s_sdev_c = sample.std(ddof=1)  # "corrected" standard deviation (ddof=1)
sem = stats.sem(sample)  # standard error of the mean

# Report the sample estimates next to the true parameters used to generate
# the population.
sample_line = f"Sample: mean = {s_mean:.2f}, S.E.M = {sem:.2f}, sdev = {s_sdev:.2f}, sdev(cor.) = {s_sdev_c:.2f}"
true_line = f"True  : mean = {mean:.2f},               sdev = {sdev:.2f}"

print(f"Random sample of size: {sample_size}")
print(sample_line)
print(true_line)

### For instruction: manual calculations of the same statistics

In [None]:
# Recompute the sample statistics directly from their defining formulas, so
# the printed line can be compared with the library-computed values.
n = len(sample)
mean_2 = np.sum(sample) / n

# Sum of squared deviations from the sample mean; shared by both estimators.
sum_sq_dev = np.sum((sample - mean_2) ** 2)
sdev2 = np.sqrt(sum_sq_dev / n)  # uncorrected: divide by n
sdev_c_2 = np.sqrt(sum_sq_dev / (n - 1))  # corrected: divide by n - 1

# Standard error of the mean. Uses n (the length of the sample actually being
# analysed) so every formula in this cell refers to the same count.
sem_2 = sdev_c_2 / np.sqrt(n)

print(
    f"Sample: mean = {mean_2:.2f}, S.E.M = {sem_2:.2f}, sdev = {sdev2:.2f}, sdev(cor.) = {sdev_c_2:.2f}"
)

### Histogram:
  * Run this cell multiple times to see how random sampling affects the histogram and the estimated statistics

In [None]:
# A new random sample is drawn every time this cell is executed.
sample = np.random.choice(population, size=sample_size)

s_mean = sample.mean()
s_sdev = sample.std()
s_sdev_c = sample.std(ddof=1)
sem = stats.sem(sample)

plt.title(f"mean = {s_mean:.2f}$\\pm${sem:.2f}, $\\sigma$ = {s_sdev_c:.2f}")

# Axis limits depend only on the true parameters, so they stay the same from
# one run to the next.
plt.xlim(mean - 4 * sdev, mean + 4 * sdev)
plt.ylim(0, 1.6 / (sdev * np.sqrt(2 * np.pi)))
plt.hist(sample, bins=15, density=True)

# Two Gaussians for comparison: one built from the sample estimates, one from
# the true parameters.
sample_pdf = stats.norm.pdf(x, s_mean, s_sdev_c)
true_pdf = stats.norm.pdf(x, mean, sdev)
plt.plot(x, sample_pdf, label="Sample", linewidth=3)
plt.plot(x, true_pdf, "k-", label="True", linewidth=2)

plt.ylabel("Probability density (normalised)")
plt.legend()
plt.show()

# Skewness and kurtosis of the sample, as returned by scipy with default settings
print(f"k3 = {stats.skew(sample):.2f}")
print(f"k4 = {stats.kurtosis(sample):.2f}")

### Kernel density estimation (kde)

* Smooths out histogram
* Be careful - looks nice, but often unhelpful!
* Great when there is a very good/dense histogram
  * Misleading when not!
* Often a good idea to plot the actual histogram too, so as not to be misleading!

In [None]:
# Draw a sample three times the usual size for the KDE demonstration.
t_sample = np.random.choice(population, 3 * sample_size)

# Statistics of the sample plotted in this cell. They are computed from
# t_sample so that the title describes the data shown below rather than
# whatever values `s_mean` / `sem` / `s_sdev_c` hold from other cells.
t_mean = np.mean(t_sample)
t_sdev_c = np.std(t_sample, ddof=1)
t_sem = stats.sem(t_sample)

plt.title(f"mean = {t_mean:.2f}$\\pm${t_sem:.2f}, $\\sigma$ = {t_sdev_c:.2f}")
plt.ylim(0, 1.6 / (sdev * np.sqrt(2 * np.pi)))
plt.xlim(mean - 4 * sdev, mean + 4 * sdev)

# Keep the raw histogram visible underneath the smoothed estimate.
plt.hist(t_sample, bins=15, density=True)

# Gaussian kernel density estimate of the sample, evaluated on the grid `x`.
kde = stats.gaussian_kde(t_sample)
plt.plot(x, kde(x), label="KDE", linewidth=2.5)
plt.plot(x, stats.norm.pdf(x, mean, sdev), "k-", label="True", linewidth=2)

plt.ylabel("Probability density (normalised)")
plt.legend()
plt.show()

### Monte-Carlo check of Standard Error (in the mean)
  * Use "Monte Carlo" method to check Standard Error (in the mean)
  * For a large number of random samples, calculate the mean
  * Plot the distribution of the means
  * Check that the standard deviation of this matches S.E.M

In [None]:
def random_sample(t_sample_size):
    """Return a random draw of `t_sample_size` values from the global `population`.

    The draw is made with np.random.choice using its default settings, so
    each call produces a new sample.
    """
    drawn = np.random.choice(population, size=t_sample_size)
    return drawn


# Number of independent random samples whose means are collected.
n_trials = 1000
mean_list = [np.mean(random_sample(sample_size)) for _ in range(n_trials)]

# Centre and spread of the distribution of sample means.
mean_mean = np.mean(mean_list)
sdev_mean = np.std(mean_list, ddof=1)

# get standard error from *just one* sample:
sem = stats.sem(random_sample(sample_size))

plt.title(f"Distribution of sample (N={sample_size}) means: S.E.M={sem:.2f}")
plt.xlim(mean - 4 * sem, mean + 4 * sem)

# The histogram carries the measured spread of the means in its label ...
plt.hist(
    mean_list,
    bins=30,
    density=True,
    label=f"Sample means ($\\sigma$={sdev_mean:.2f})",
)
x2 = np.linspace(mean - 4 * sem, mean + 4 * sem, 100)
plt.ylabel("Probability density (normalised)")
plt.xlabel("Sample mean")

# ... while the curve is a Gaussian whose width is the single-sample S.E.M,
# and is labelled as such, so the legend shows the two numbers being compared.
plt.plot(
    x2,
    stats.norm.pdf(x2, mean_mean, sem),
    label=f"Gaussian (width = S.E.M = {sem:.2f})",
    linewidth=3,
)
plt.legend()
plt.show()