# [Python Reference Link](http://www.data8.org/sp20/python-reference.html)
*Run the cell below so that we can set our modules up.*

# Importing our modules

In [None]:
import numpy as np
from datascience import *
import math as m

# These lines do some fancy plotting magic.
import matplotlib
%matplotlib inline
import matplotlib.pyplot as plt
plt.style.use('fivethirtyeight')
import warnings
warnings.simplefilter('ignore', FutureWarning)

In [None]:
# Load the births data from baby.csv into a Table
births = Table.read_table('baby.csv')

# Examining the Central Limit Theorem

In [None]:
# Bin edges 50, 60, ..., 170 for the "Birth Weight" histogram
weight_bins = np.arange(50,180,10)
births.hist("Birth Weight",bins=weight_bins)

In [None]:
# Build a right-skewed set of values: evaluate 0.25 * e^(0.25 * x)
# on x = 0, 0.1, ..., 14.9
input_values = np.arange(0,15,0.1)
exponential_curve = 0.25*m.e**(0.25*input_values)

# Histogram of those values with bin edges 0, 1, ..., 11
r_skew_bins = np.arange(0,12,1)
r_skew_table = Table().with_columns('values',exponential_curve)
r_skew_table.hist(bins=r_skew_bins)

In [None]:
# Mirror the right-skewed values around 130 to get a left-skewed set of values
reverse_exponential = 130 - exponential_curve

# Bin edges 119, 120, ..., 130: the mirror image of r_skew_bins (0..11).
# The largest values are just under 130 (130 - 0.25 = 129.75), so the top edge
# must reach 130 or the most populated part of the data is left off the plot.
l_skew_bins = np.arange(119,131,1)
l_skew_table = Table().with_columns('values',reverse_exponential)
l_skew_table.hist(bins = l_skew_bins)

### The Distribution of sample averages

In [None]:
# Let's look at a few columns in the births table:
# draw many samples (with replacement) and plot the distribution of their means
sample_size = 100
number_of_resamples = 1000
column_name = ...  # placeholder: fill in the label of the births column to use

means = make_array()

for i in np.arange(number_of_resamples):
    one_small_sample = births.sample(sample_size,with_replacement=True)
    mean_of_one_small_sample = np.mean(one_small_sample.column(column_name))
    means = np.append(means, mean_of_one_small_sample)

# Trailing space after "of" keeps the label readable ("Means of <column>")
Table().with_columns('Means of '+column_name,means).hist()

In [None]:
# Sampling from the right-skewed table: draw `number_of_resamples` samples of
# `sample_size` rows each (with replacement) and plot the sample means.
number_of_resamples = 1000
sample_size = 100

means = make_array()
for i in np.arange(number_of_resamples):
    one_small_sample = r_skew_table.sample(sample_size, with_replacement=True)
    mean_of_one_small_sample = one_small_sample.column('values').mean()
    means = np.append(means, mean_of_one_small_sample)

Table().with_columns('Means', means).hist()

In [None]:
# Sampling from the left-skewed table: draw `number_of_resamples` samples of
# `sample_size` rows each (with replacement) and plot the sample means.
number_of_resamples = 1000
sample_size = 100

means = make_array()
for i in np.arange(number_of_resamples):
    one_small_sample = l_skew_table.sample(sample_size, with_replacement=True)
    mean_of_one_small_sample = one_small_sample.column('values').mean()
    means = np.append(means, mean_of_one_small_sample)

Table().with_columns('Means', means).hist()

In [None]:
# Let's look at a few columns in the births table, but now with increased sample size
# The three values below are placeholders to fill in; `...` keeps the cell
# syntactically valid until they are replaced.
sample_size = ...
number_of_resamples = ...
column_name = ...

means = make_array()

for i in np.arange(number_of_resamples):
    one_small_sample = births.sample(sample_size,with_replacement=True)
    mean_of_one_small_sample = np.mean(one_small_sample.column(column_name))
    means = np.append(means, mean_of_one_small_sample)

# Trailing space after "of" keeps the label readable ("Means of <column>")
Table().with_columns('Means of '+column_name,means).hist()

In [None]:
# Let's look at a few columns in the births table, but now with increased number of resamples
# The three values below are placeholders to fill in.
sample_size = ...
number_of_resamples = ...
column_name = ...

means = make_array()

for i in np.arange(number_of_resamples):
    one_small_sample = births.sample(sample_size,with_replacement=True)
    mean_of_one_small_sample = np.mean(one_small_sample.column(column_name))
    means = np.append(means, mean_of_one_small_sample)

# Use the default bins, like the other resampling cells; pass bins=<array of
# edges> to hist() here if a fixed axis is wanted for comparing plots.
Table().with_columns('Means of '+column_name,means).hist()

# Linear Regression

## Standard Units

In [None]:
# Load the snowy plover data from snowy_plover.csv and display the table
birds = Table.read_table('snowy_plover.csv')
birds

Let's take a look at the relationship between Egg Weight and Bird Weight:

In [None]:
# Scatter plot of 'Bird Weight' against 'Egg Weight', first on its own and
# then again with fit_line=True
birds.scatter('Egg Weight','Bird Weight')
birds.scatter('Egg Weight','Bird Weight',fit_line=True)

Now let's calculate the standard units for each weight variable:

In [None]:
# Placeholder to fill in — presumably the 'Egg Weight' column in standard units
e_weight_standard_units = ...

In [None]:
# Placeholder to fill in — presumably the 'Bird Weight' column in standard units
# (it is compared against standard_units_convert(birds.column('Bird Weight')) later)
b_weight_standard_units = ...

In [None]:
# to simplify, let's turn this process into a function
def standard_units_convert(input_data):
    """Convert ``input_data`` to standard units (not yet implemented).

    The body is a placeholder to fill in; as written the function returns
    ``Ellipsis``. Intended result, going by the surrounding notebook text
    (TODO confirm): each value minus the mean of ``input_data``, divided by
    the standard deviation of ``input_data``.
    """
    ...
    return ...

In [None]:
# Compare the function's output for 'Bird Weight' with b_weight_standard_units
standard_units_convert(birds.column('Bird Weight')) == b_weight_standard_units

## Correlation Coefficient `r`

In [None]:
# Placeholder to fill in — the correlation coefficient r (printed in a later cell)
correlation_r = ...

In [None]:
def calculate_r(x_value_array, y_value_array):
    """Compute the correlation coefficient r of two arrays (not yet implemented).

    The body is a placeholder to fill in; as written the function returns
    ``Ellipsis``. Presumably r is meant to be the mean of the products of the
    two arrays in standard units — TODO confirm when implementing.
    """
    ...
    return ... 

In [None]:
# r between 'Egg Weight' and 'Bird Weight'
calculate_r(birds.column('Egg Weight'),birds.column('Bird Weight'))

In [None]:
# r between 'Egg Weight' and 'Egg Length'
calculate_r(birds.column('Egg Weight'),birds.column('Egg Length'))

## Linear Regression (Slope and Intercept)

In [None]:
# finding the standard deviations for our x-value & y-value arrays
# (both are placeholders to fill in)
x_val_sd = ...
y_val_sd = ...
# Show r together with the two standard deviations
print(correlation_r, x_val_sd, y_val_sd)

In [None]:
# Placeholder to fill in with the slope of the regression line; `...` keeps the
# cell syntactically valid until it is replaced.
slope_of_regression = ...

In [None]:
# finding the means for our x-value & y-value arrays
# (both are placeholders to fill in)
x_val_mean = ...
y_val_mean = ...
# Show the slope together with the two means
print(slope_of_regression, x_val_mean, y_val_mean)

In [None]:
# Placeholder to fill in with the intercept of the regression line
intercept = ...

In [None]:
# Placeholders to fill in: the x values and the regression estimates for them
x_val_array = ...
y_estimates_array = ...
# Add the estimates to the table as an 'Estimates' column and display it
birds = birds.with_columns('Estimates',y_estimates_array)
birds

In [None]:
# Scatter the selected columns with 'Egg Weight' on the x-axis (presumably
# overlaying 'Bird Weight' and 'Estimates'), then the earlier fit_line plot
# again for comparison
birds.select('Egg Weight','Bird Weight','Estimates').scatter('Egg Weight')
birds.scatter('Egg Weight','Bird Weight',fit_line=True)

## The Residual - Evaluating your Linear Regression Accuracy

In [None]:
# Placeholder to fill in with the residuals (added to the table as 'Residual' below)
residual = ...

In [None]:
# Add the residuals to the table as a 'Residual' column and display it
birds = birds.with_columns('Residual',residual)
birds

In [None]:
# Residual plot: 'Residual' against 'Egg Weight', drawn with color 'r'
birds.scatter('Egg Weight','Residual',color = 'r')