# Code for lecture 10: the normal distribution
---------
### February 20th, 2020


In [1]:
import sys
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import scipy.io as sio
from scipy import stats
import seaborn as sns
import bokeh as bk
from bokeh.palettes import Category20b
from bokeh.palettes import Magma
from bokeh.palettes import Viridis
from bokeh.plotting import figure, output_file, show
from bokeh.layouts import gridplot
from bokeh.io import output_notebook
# Ask Bokeh to render figures inline in the notebook (rather than writing an HTML file).
output_notebook()

# Record the Bokeh version this notebook was executed with; the plotting
# keyword arguments used below (e.g. `legend=`, `plot_width=`) follow that version's API.
print(bk.__version__)

1.3.4


# What does the normal distribution look like?

### Let's start from the exponential and see how adding terms changes the shape.
Note all we need to define is a mean $\mu$ and a standard deviation $\sigma$ (the variance is $\sigma^2$).

In [11]:
# Define mu (mean) and sigma (standard deviation; the variance is sigma**2)
mu = 0
sigma = 1

# We'll pick a standard range for x values.
x_axis = np.arange(-5,5,0.01)

# Build the normal pdf up one term at a time, starting from a plain exponential.
fn_1 = np.exp(x_axis)                            # growing exponential
fn_2 = np.exp(-(x_axis - mu))                    # flip the sign and shift by mu
fn_3 = np.exp(-(x_axis - mu)**2)                 # square the distance -> symmetric bell
fn_4 = np.exp((-(x_axis - mu)**2)/(2*sigma**2))  # let sigma control the width
# Divide by sqrt(2*pi*sigma^2) so that the area under the curve is 1.
fn_5 = (1/(np.sqrt(2*np.pi*sigma**2)))* np.exp((-(x_axis - mu)**2)/(2*sigma**2))


# y_range is capped at 1.5, so the fast-growing exponentials are clipped on purpose.
p = figure(plot_width=800, plot_height=500, y_range=(0, 1.5))
p.title.text = 'Click on legend entries to hide the corresponding lines'

colors = Viridis[6]

# (curve, colour, legend label) for each step of the build-up. The labels all
# use `x` and have balanced parentheses so they read as valid formulas.
curves = [
    (fn_1, colors[1], 'exp(x)'),
    (fn_2, colors[2], 'exp(-(x - mu))'),
    (fn_3, colors[3], 'exp(-(x - mu)**2)'),
    (fn_4, colors[4], 'exp(-(x - mu)**2 / (2*sigma**2))'),
    (fn_5, colors[0], 'Normal distribution'),
]
# `legend=` is the keyword used by the Bokeh version printed above (1.3.4).
for curve, color, label in curves:
    p.line(x_axis, curve, line_width=2, color=color, alpha=0.8, legend=label)

p.legend.location = "top_right"
p.legend.click_policy="hide"
p.xaxis.axis_label = 'X'
p.yaxis.axis_label = 'Y'

show(p)

# Calculate the normal distribution from scipy.stats.norm

### Plot five different normal distributions with different means and variances.

In [12]:
# Five means and five standard deviations, paired up position by position.
mu_values = [-2, -1, 0, 1, 2]
sigma_values = [1, 0.5, 0.4, 0.7, 1.2]

# Look at a single normal distribution first: freeze scipy's stats.norm at the
# chosen (mu, sigma) pair and evaluate its density over the shared x grid.
example_value = 2
example_dist = stats.norm(loc=mu_values[example_value],
                          scale=sigma_values[example_value])
norm_1 = example_dist.pdf(x_axis)
norm_1

array([1.17379884e-34, 1.60389149e-34, 2.19020547e-34, 2.98898200e-34,
       4.07652684e-34, 5.55630245e-34, 7.56850351e-34, 1.03029768e-33,
       1.40166423e-33, 1.90569690e-33, 2.58935877e-33, 3.51608364e-33,
       4.77149784e-33, 6.47111073e-33, 8.77064428e-33, 1.18798985e-32,
       1.60813508e-32, 2.17550903e-32, 2.94122210e-32, 3.97395839e-32,
       5.36595934e-32, 7.24102430e-32, 9.76520126e-32, 1.31610627e-31,
       1.77267567e-31, 2.38614139e-31, 3.20990095e-31, 4.31534632e-31,
       5.79786694e-31, 7.78483477e-31, 1.04462189e-30, 1.40086862e-30,
       1.87743219e-30, 2.51454655e-30, 3.36576403e-30, 4.50231851e-30,
       6.01890283e-30, 8.04131389e-30, 1.07365627e-29, 1.43262352e-29,
       1.91041385e-29, 2.54595900e-29, 3.39081361e-29, 4.51320433e-29,
       6.00336350e-29, 7.98055051e-29, 1.06022887e-28, 1.40765092e-28,
       1.86775057e-28, 2.47668829e-28, 3.28210440e-28, 4.34672332e-28,
       5.75307677e-28, 7.60968899e-28, 1.00591715e-27, 1.32888097e-27,
      

## Plot all five normal distributions using bokeh

In [13]:
# One density curve per (mu, sigma) pair, each in its own palette colour.
p = figure(plot_width=800, plot_height=500)
p.title.text = 'Click on legend entries to hide the corresponding lines'

palette = Category20b[5]
for mu, sigma, color in zip(mu_values, sigma_values, palette):
    curve_label = f'mu={mu}, sigma={sigma}'
    norm_data = stats.norm(loc=mu, scale=sigma).pdf(x_axis)
    p.line(x_axis, norm_data,
           line_width=2,
           color=color,
           alpha=0.8,
           legend=curve_label)

# Axis labels first, then the interactive legend settings.
p.xaxis.axis_label = 'X'
p.yaxis.axis_label = 'Probability density'
p.legend.location = "top_right"
p.legend.click_policy = "hide"

show(p)

## Get fancier. Use a slider to explore more values for mu and sigma.

In [14]:
from bokeh.layouts import row, column
from bokeh.models import CustomJS, Slider
from bokeh.plotting import figure, output_file, show, ColumnDataSource

# Note we make a different x axis. Doesn't have to be centered at 0.
# The starting parameters are named once so the initial curve and the slider
# start positions cannot drift apart.
initial_mu = 25
initial_sigma = 5
x = np.linspace(0, 100, 1000)
y = stats.norm.pdf(x, loc = initial_mu, scale = initial_sigma)
source = ColumnDataSource(data=dict(x=x, y=y))

# Start figure. The y range is fixed, so curves with a peak density above 0.2
# (small sigma) are clipped at the top.
plot = figure(y_range=(0, 0.2), plot_width=600, plot_height=400)

# Plot initial line
plot.line('x', 'y', source=source, line_width=3, line_alpha=0.6)

# Define sliders. The second slider sets sigma, which both stats.norm.pdf
# (`scale=`) and the JavaScript formula below treat as the standard deviation,
# so it is labelled accordingly (not "Variance").
mu_slider = Slider(start=1, end=100, value=initial_mu, step=1, title="Mean")
sigma_slider = Slider(start=1, end=40, value=initial_sigma, step=1, title="Standard deviation")

# Javascript lets us make interactive sliders: the callback runs in the
# browser and overwrites the y column with the normal pdf for the current values.
callback = CustomJS(args=dict(source=source, mu=mu_slider, sigma=sigma_slider),
                    code="""
    const data = source.data;
    const m = mu.value;
    const s = sigma.value;
    const x = data['x'];
    const y = data['y'];

    for (let i = 0; i < x.length; i++) {
        y[i] = (1/(s*Math.sqrt(2*Math.PI)))*Math.exp(-Math.pow((x[i]-m),2)/(2*Math.pow(s,2)));
    }
    source.change.emit();
""")

# Defining what happens when a slider value changes. In this case, the callback function evaluates.
mu_slider.js_on_change('value', callback)
sigma_slider.js_on_change('value', callback)

# Final plot labels and things
plot.xaxis.axis_label = 'X'
plot.yaxis.axis_label = 'Probability Density'
layout = row(
    plot,
    column(mu_slider, sigma_slider),
)


show(layout)


# Mean = median = mode

Let's investigate just in case.

In [76]:
# Sample size and the parameters of the distribution we draw from
n_observations = 100
mu = 0
sigma = 1

# Draw the sample (unseeded, so the numbers differ from run to run).
random_normal_data = np.random.normal(loc=mu, scale=sigma, size=n_observations)

# Compare the two location estimates of the sample.
mean_of_sample = random_normal_data.mean()
median_of_sample = np.median(random_normal_data)
print(f'The sample mean is {mean_of_sample}')
print(f'The median of the sample is {median_of_sample}')

# If you'd rather use seaborn
# fig_0 = sns.distplot(random_normal_data)
# fig_0.axvline(random_normal_data.mean(), label = 'Sample mean')
# fig_0.axvline(np.median(random_normal_data), label = 'Sample median', dashes = (2,1,2,1))
# fig_0.legend()
# fig_0.set(xlabel = 'X', ylabel = 'Relative frequency', title = 'Sampling random points from the normal distribution');

# Figure that the histogram helper defined below draws onto (it looks up `p`).
p = figure(
    title='Random data from the normal distribution',
    tools='',
    background_fill_color="#fafafa",
)
def make_plot(hist, edges, legend, color, fig=None):
    """Draw one histogram as a layer of bars on a Bokeh figure.

    Parameters
    ----------
    hist : sequence
        Bar heights, one per bin (e.g. the first value returned by np.histogram).
    edges : sequence
        Bin edges, one more than ``hist``; ``edges[:-1]`` / ``edges[1:]`` are
        used as the left / right sides of the bars.
    legend : str
        Legend entry for this layer.
    color : str
        Fill colour of the bars.
    fig : figure, optional
        Figure to draw on. When omitted, the notebook-global ``p`` is looked up
        at call time, which is how the existing four-argument calls work.

    Returns
    -------
    The figure that was drawn on.
    """
    if fig is None:
        # Backward-compatible default: whatever `p` currently refers to.
        fig = p
    fig.quad(top=hist, bottom=0, left=edges[:-1], right=edges[1:],
             fill_color=color, line_color="white", line_alpha = 0.2, alpha=0.6, legend = legend)
    fig.xaxis.axis_label = 'x'
    fig.yaxis.axis_label = 'Relative frequency'
    fig.grid.grid_line_color="white"
    return fig


# Histogram of the sample, normalised to a density (np.histogram's default binning).
hist, edges = np.histogram(random_normal_data, density=True)
# Take the bar colour straight from the Viridis palette. The global `colors`
# is rebound by several other cells, so indexing it here would make the colour
# depend on which cell happened to run last.
p1 = make_plot(hist, edges, f'{n_observations} random points', Viridis[6][1])
show(p1)

The sample mean is -0.02827842632903195
The median of the sample is -0.08822115632582962


# Probability 

Approximately 2/3 of the area under the curve falls between mu-sigma and mu+sigma. In other words, the probability that a randomly chosen observation sampled from a normal distribution falls between mu-sigma and mu+sigma is roughly 2/3 (more precisely, about 68.3%).

Approximately 95% of the area under the curve falls within two standard deviations of the mean, and about 99.7% falls within three.

In [87]:
# Draw figure showing the area within 1, 2, or 3 sigma of the mean.

# Feel free to change mu and sigma
mu = 0
sigma = 2

# Evaluate the density on a grid that follows mu and sigma (mu +/- 4 sigma).
# The shared x_axis only spans [-5, 5), which clips the 3-sigma band whenever
# mu +/- 3*sigma falls outside that window (as it does for sigma = 2).
x_grid = np.linspace(mu - 4*sigma, mu + 4*sigma, 801)

# plot
p = figure(plot_width=800, plot_height=500)
p.title.text = 'Click on legend entries to hide the corresponding lines'


colors = Category20b[5]
norm_data = stats.norm.pdf(x_grid, loc = mu, scale = sigma)
p.line(x_grid, norm_data, line_width=2,alpha=0.8, legend = f'mu={mu}, sigma={sigma}', color = colors[0])

# Shade the area under the curve within k standard deviations of the mean for
# k = 1, 2, 3. The boolean mask alone selects each band; the unused index
# look-ups this cell used to compute (all of them against mu + sigma) are gone.
for k in (1, 2, 3):
    in_band = (x_grid > (mu - k*sigma)) & (x_grid < (mu + k*sigma))
    p.varea(x=x_grid[in_band],
            y1=np.zeros(in_band.sum()),   # lower edge of the shaded area: the x axis
            y2=norm_data[in_band],        # upper edge: the density curve
            fill_alpha = 0.20,
            legend = f'pm {k} sigma',
            color = colors[k])

p.legend.location = "top_right"
p.legend.click_policy="hide"
p.xaxis.axis_label = 'X'
p.yaxis.axis_label = 'Probability density'


show(p)

# All normal distributions kind of look the same, huh?

In [82]:
## Generate 5 random data samples with different mu and sigma values.
mu_values = [-2, 5, 10, 40, 0]
sigma_values = [1, 3, 2, 15, 2]
n_observations = 10000


# One column of samples per (mu, sigma) pair.
random_data_array = np.zeros((n_observations, len(mu_values)))
for i, (mu, sigma) in enumerate(zip(mu_values, sigma_values)):
    print(mu, sigma)
    random_data_array[:, i] = np.random.normal(mu, sigma, n_observations)


# Plot all five histograms
# make_plot is called without a figure argument, so it draws onto the
# notebook-global `p`; the new figure must be bound to that name first.
p = figure(title=f'Random data from different normal distributions, {n_observations} points',
            tools='',
            background_fill_color="#fafafa",
            plot_width = 800)

colors = Magma[6]
# Square-root rule for the number of bins; it does not depend on the column,
# so compute it once instead of inside the loop.
n_bins = int(np.ceil(np.sqrt(n_observations)))
for i, (mu, sigma) in enumerate(zip(mu_values, sigma_values)):
    hist, edges = np.histogram(random_data_array[:, i], density=True, bins=n_bins)
    make_plot(hist, edges, f'mu = {mu}, sigma = {sigma}', colors[i])

p.legend.click_policy="hide"
show(p)




-2 1
5 3
10 2
40 15
0 2


## Let's rescale...

First let's make all the means the same.

In [83]:
# Shift all data of random_data_array so that it is centered around a mean of 0

# Subtract each column's own sample mean; broadcasting applies the length-5
# vector of means across every row, so no explicit per-column loop is needed.
column_means = np.mean(random_data_array, axis=0)
shifted_random_data_array = random_data_array - column_means


# Now plot to see what has changed...
# Plot all five histograms
# make_plot is called without a figure argument, so it draws onto the
# notebook-global `p`; the new figure must be bound to that name first.
p = figure(title=f'Random MEAN CENTERED data from different normal distributions, {n_observations} points',
            tools='',
            background_fill_color="#fafafa",
            plot_width = 800)

colors = Magma[6]
# Square-root rule for the number of bins, computed once for all columns.
n_bins = int(np.ceil(np.sqrt(n_observations)))
for i, (mu, sigma) in enumerate(zip(mu_values, sigma_values)):
    hist, edges = np.histogram(shifted_random_data_array[:, i], density=True, bins=n_bins)
    make_plot(hist, edges, f'Original mu = {mu}, sigma = {sigma}', colors[i])

p.legend.click_policy="hide"
show(p)


## Notice all distributions have aligned means, but their spread is vastly different.

Can we normalize them all?

In [85]:
# Standardize (z-score) every column: subtract its sample mean and divide by
# its sample standard deviation (ddof = 1). Broadcasting applies the per-column
# statistics across all rows.
column_means = np.mean(random_data_array, axis=0)
column_stdevs = np.std(random_data_array, axis=0, ddof = 1)
normalized_random_data_array = (random_data_array - column_means) / column_stdevs


# Now plot to see what has changed...
# Plot all five histograms
# The title says NORMALIZED: this data is also rescaled, not just mean-centered.
# make_plot is called without a figure argument, so it draws onto the
# notebook-global `p`; the new figure must be bound to that name first.
p = figure(title=f'Random NORMALIZED (z-scored) data from different normal distributions, {n_observations} points',
            tools='',
            background_fill_color="#fafafa",
            plot_width = 800)

colors = Magma[6]
# Square-root rule for the number of bins, computed once for all columns.
n_bins = int(np.ceil(np.sqrt(n_observations)))
for i, (mu, sigma) in enumerate(zip(mu_values, sigma_values)):
    hist, edges = np.histogram(normalized_random_data_array[:, i], density=True, bins=n_bins)
    make_plot(hist, edges, f'Original mu = {mu}, sigma = {sigma}', colors[i])

p.legend.click_policy="hide"
show(p)


# The standard normal distribution

### The normal distribution with mu = 0 and sigma = 1.

In [88]:
# Draw the same figure as before but for the standard normal distribution
mu = 0
sigma = 1

# Evaluate the density on a grid spanning mu +/- 4 sigma so that all three
# shaded bands fit inside the plotted range whatever mu and sigma are set to.
x_grid = np.linspace(mu - 4*sigma, mu + 4*sigma, 801)

# plot
p = figure(plot_width=800, plot_height=500)
p.title.text = 'Click on legend entries to hide the corresponding lines'


colors = Category20b[5]
norm_data = stats.norm.pdf(x_grid, loc = mu, scale = sigma)
p.line(x_grid, norm_data, line_width=2,alpha=0.8, legend = f'mu={mu}, sigma={sigma}', color = colors[0])

# Shade the area under the curve within k standard deviations of the mean for
# k = 1, 2, 3. The boolean mask alone selects each band; the unused index
# look-ups this cell used to compute (all of them against mu + sigma) are gone.
for k in (1, 2, 3):
    in_band = (x_grid > (mu - k*sigma)) & (x_grid < (mu + k*sigma))
    p.varea(x=x_grid[in_band],
            y1=np.zeros(in_band.sum()),   # lower edge of the shaded area: the x axis
            y2=norm_data[in_band],        # upper edge: the density curve
            fill_alpha = 0.20,
            legend = f'pm {k} sigma',
            color = colors[k])

p.legend.location = "top_right"
p.legend.click_policy="hide"
p.xaxis.axis_label = 'X'
p.yaxis.axis_label = 'Probability density'


show(p)