## Ordinary python lists

In [None]:
# A list literal: comma-separated items in square brackets; items may mix types
# (three ints and a string here)
[3,4,5,"Bob"]

In [None]:
# Bind a four-item list to the name mylist, then evaluate the bare name so the
# notebook shows its value
mylist = [2,5,7,"Alice"]
mylist

In [None]:
# Indexing starts at 0, so index 3 selects the fourth item of mylist
mylist[3]

In [None]:
# mylist was given four items (indices 0-3), so index 4 is out of range and
# raises IndexError -- presumably shown on purpose to demonstrate the error
mylist[4]

### Building lists

With a `for` loop:

In [None]:
# Walk through 0..199 and keep only the numbers that divide evenly by 2.
evennumbers = []
for candidate in range(200):
    if candidate % 2 != 0:
        continue  # odd number: skip it
    evennumbers.append(candidate)
len(evennumbers)

With a `while` loop:

In [None]:
# Start from 0 and keep appending "most recent entry + 2" until the list
# holds 100 numbers.
evennumbers = [0]
while len(evennumbers) < 100:
    latest = evennumbers[-1]
    evennumbers.append(latest + 2)
len(evennumbers)

Another `for` loop:

In [None]:
# Double each of 0..99 to get the first hundred even numbers.
evennumbers = []
for k in range(100):
    evennumbers.append(k * 2)

# A slice [start:stop] returns the items from index start up to, but not
# including, index stop.
evennumbers[3:7]

With a *list comprehension*:

In [None]:
# Build the whole list in a single expression: double every k in 0..99.
evennumbers = [k * 2 for k in range(100)]
# Negative indices count from the end; the stop index -1 is excluded, so this
# gives the three items just before the last one.
evennumbers[-4:-1]

With a different list comprehension:

In [None]:
# Filter rather than multiply: keep each k below 200 that leaves no remainder
# when divided by 2.
evennumbers = [k for k in range(200) if not k % 2]
# Start 50 items from the front and stop 48 items before the end.
evennumbers[50:-48]

In [None]:
# The built-in sum() adds up every item of evennumbers
sum(evennumbers) 

### Nested clauses

In [None]:
# Count the items of evennumbers that are greater than 50 and divisible by 7.
# Each extra level of indentation only runs when every enclosing condition
# holds; uncomment the print calls to see how often each level is reached.
v = 0
for e in evennumbers:
    #print("This will happen every time")
    if e > 50:
        #print("This will happen 74 times (for 52, 54, ..., 198)")
        if e%7 == 0:
            #print("This will happen only for even multiples of 7 bigger than 50")
            v += 1
v

## About numpy

In [None]:
import numpy as np

A `numpy` `array` is just a dressed-up list:

In [None]:
# Every second integer from 0 up to, but not including, 200, as a numpy array.
evennumbers = np.arange(start=0, stop=200, step=2)
evennumbers

With some extra functionality:

In [None]:
# Arrays carry their own methods, e.g. the arithmetic mean of all elements
evennumbers.mean()

In [None]:
# `evennumbers < 10` compares every element at once and yields a boolean array;
# indexing with that array keeps only the elements where it is True
evennumbers[evennumbers < 10]

### and pandas

In [None]:
import pandas as pd

`pandas` just dresses `numpy` in some more functionality:

In [None]:
# Two equal-length arrays become the two named columns of a table.
a = np.array([1, 2, 3])
b = np.array(["bob", "gene", "tina"])
columns = {"number": a, "name": b}
df = pd.DataFrame(data=columns)
df

## Back to numpy

Let's generate a normally distributed population:

In [None]:
# Draw 10000 values from a normal distribution with mean 50 and standard
# deviation 3.
# NOTE(review): no random seed is set in the visible cells, so the numbers
# differ from run to run.
population = np.random.normal(loc=50, scale=3, size=10000)

`loc` is the mean $\mu$, `scale` is the standard deviation $\sigma$, and `size` is the population size $N$.

Let's take a sample from this population:

In [None]:
# Pick 100 members of the population; replace=False means no element is
# drawn twice.
sample = np.random.choice(population, size=100, replace=False)

# Report both means, rounded to 2 decimal places, for comparison.
print(f"Population mean = {population.mean().round(2)}")
print(f"Sample mean = {sample.mean().round(2)}")

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt
import warnings
# Ignore every FutureWarning from here on.
# NOTE(review): presumably meant to hide library deprecation notices in the
# plotting cells -- confirm; it also hides any other FutureWarning.
warnings.simplefilter(action='ignore', category=FutureWarning)

In [None]:
# Overlay the population and the sample on one set of axes.
# histplot(kde=True, stat="density") replaces the deprecated sns.distplot;
# alpha=0.4 keeps both histograms visible where they overlap.
fig, axs = plt.subplots()
sns.histplot(population, kde=True, stat="density", alpha=0.4, color="C0", ax=axs)
sns.histplot(sample, kde=True, stat="density", alpha=0.4, color="C1", ax=axs)
# Mark each mean with a vertical line in the same colour as its distribution,
# so the two lines can be told apart.
axs.axvline(population.mean(), color="C0")
axs.axvline(sample.mean(), color="C1")

In [None]:
from ipywidgets import interact

In [None]:
def update(n):
    """Print the current slider value ``n``."""
    print(n)

# n=(1,100) asks interact for a slider running from 1 to 100; update is
# called with the slider's value
interact(update,n=(1,100))

The continuous updating is annoying, so we build the slider ourselves and pass `continuous_update=False`:

In [None]:
from ipywidgets import IntSlider

In [None]:
def update(n):
    """Print the current slider value ``n``."""
    print(n)

# Pass an explicit IntSlider (1 to 100 in steps of 1) instead of a tuple.
# NOTE(review): per the ipywidgets docs, continuous_update=False defers the
# callback until the slider is released -- confirm against the installed version.
interact(update,n=IntSlider(min=1, max=100, step=1, continuous_update=False))

In [None]:

def update(n):
    """Plot the population together with a fresh random sample of size ``n``.

    Reads the notebook-level ``population`` array, draws ``n`` of its values
    without replacement, and overlays both distributions and their means on
    a new figure.

    Parameters
    ----------
    n : int
        Number of values to draw; must not exceed ``len(population)``
        because the draw is made without replacement.
    """
    fig, axs = plt.subplots()
    sample = np.random.choice(population, size=n, replace=False)
    # histplot(kde=True, stat="density") replaces the deprecated sns.distplot;
    # alpha=0.4 keeps both histograms visible where they overlap.
    sns.histplot(population, kde=True, stat="density", alpha=0.4, color="C0", ax=axs)
    sns.histplot(sample, kde=True, stat="density", alpha=0.4, color="C1", ax=axs)
    # Colour each mean line like its distribution so they can be told apart.
    axs.axvline(population.mean(), color="C0")
    axs.axvline(sample.mean(), color="C1")
    # Fix the view last so it is the same for every n, whatever was drawn.
    axs.set_ylim(0, 0.2)
    axs.set_xlim(30, 70)

interact(update,n=IntSlider(value=10, min=2, max=1000, step=1, continuous_update=False))

So the sample mean $\bar{x}$ is a good (unbiased) estimator for the population mean $\mu$.

The same is **not** true of the standard deviation.

In [None]:

def update(n):
    """Plot population and a size-``n`` sample with one-sd bands around each mean.

    Reads the notebook-level ``population`` array, draws ``n`` of its values
    without replacement, shades mean +/- one standard deviation for both the
    population and the sample, and prints the two standard deviations.

    Parameters
    ----------
    n : int
        Number of values to draw; must not exceed ``len(population)``
        because the draw is made without replacement.
    """
    # make a set of axes
    fig, axs = plt.subplots()
    # plot the population
    # (histplot with kde=True, stat="density" replaces the deprecated sns.distplot)
    sns.histplot(population, kde=True, stat="density", alpha=0.4, color="C0", ax=axs)
    # find the mean and sd for the population
    mu = population.mean()
    sigma = population.std()
    # color one sd either side of the mean
    axs.axvspan(mu - sigma, mu + sigma, facecolor="lightsteelblue", alpha=0.4)
    # draw a sample
    sample = np.random.choice(population, size=n, replace=False)
    # plot the sample
    sns.histplot(sample, kde=True, stat="density", alpha=0.4, color="C1", ax=axs)
    # find the sample mean and the 'wrong' sd: ndarray.std() divides by n
    # (ddof=0) by default, which is the estimator this cell sets out to examine
    xbar = sample.mean()
    s = sample.std()
    axs.axvspan(xbar - s, xbar + s, facecolor="wheat", alpha=0.2)
    # fix the view last so it is the same for every n, whatever was drawn
    axs.set_ylim(0, 0.2)
    axs.set_xlim(30, 70)
    print("Standard deviation of sample with /n = {}".format(s))
    print("Population standard deviation = {}".format(sigma))


interact(update,n=IntSlider(value=10, min=2, max=1000, step=1, continuous_update=False))

**This is not quite as compelling a visualisation as we wanted.**

We want to show that using $\sqrt{\dfrac{\sum(x-\bar{x})^2}{n}}$ tends to *underestimate* the standard deviation of the population, which is why we use $n-1$ instead.