In [1]:
import pandas as pd
import numpy as np
import scipy.stats as stats
import matplotlib.pyplot as plt
import math

# Null Hypothesis

* Assumption that the sample data that you have is from the population you are comparing it to
* Null Hypothesis states that there is no difference between the sample data and the population it is being compared against

# Significance Level
* Denoted by the greek letter $\alpha$
* The most common choice is $\alpha = 0.05$ (a 5% significance level, which corresponds to a 95% confidence level), but a higher or lower value may be appropriate depending on what is being tested and the associated risk factors
* This value is the threshold used to decide whether to reject the null hypothesis: reject it when the p-value is less than $\alpha$, otherwise fail to reject it

# One Sample T-Test
* description 1: determines whether a sample mean differs from the population mean
* description 2: determines whether a sample differs from a known value or standard
  * An example of this in chemistry is when a standard is measured in the lab and compared to its stated value
* Conditions:
  * The dependent variable should be measured at the interval level (ordered values on an equally spaced scale, but with no true zero) or the ratio level (has a true zero, a continuous equidistant scale, and shows order, direction, and precise differences in values)
  * The data should be independent
  * There should be no significant outliers
  * The dependent variable should be normally distributed

In [5]:
# Seed NumPy's legacy global generator so every draw below is repeatable.
np.random.seed(6)

# Synthetic population: two Poisson groups, each offset by loc=18,
# pooled into a single array of 250,000 ages.
population_ages1 = stats.poisson.rvs(mu=35, loc=18, size=150000)
population_ages2 = stats.poisson.rvs(mu=10, loc=18, size=100000)
population_ages = np.concatenate([population_ages1, population_ages2])

# Synthetic Minnesota sample: 50 ages built the same way, except the
# first group is drawn with mu=30 rather than mu=35.
minnesota_ages1 = stats.poisson.rvs(mu=30, loc=18, size=30)
minnesota_ages2 = stats.poisson.rvs(mu=10, loc=18, size=20)
minnesota_ages = np.concatenate([minnesota_ages1, minnesota_ages2])

# Show the population mean first, then the sample mean, one per line.
print(population_ages.mean(), minnesota_ages.mean(), sep="\n")

43.000112
39.26


In [6]:
# One-sample t-test of the Minnesota sample against the population mean.
# Fields of the returned TtestResult:
#   statistic: the t statistic, i.e. how far the sample mean lies from the
#              population mean, measured in standard errors of the sample mean
#   pvalue:    probability, assuming the null hypothesis is true, of getting a
#              t statistic at least this extreme (two-sided by default);
#              here about 1.3% (pvalue=0.013)
#   df:        degrees of freedom (number of sample points - 1)
stats.ttest_1samp(
    minnesota_ages,          # sample data
    population_ages.mean(),  # population mean under the null hypothesis
)

TtestResult(statistic=-2.5742714883655027, pvalue=0.013118685425061675, df=49)

In [9]:
stats.t.ppf(q=0.025, df=49)

-2.0095752371292397

$t = \Large\frac{\bar{x}-\mu}{\Large\frac{s}{\sqrt{n}}}$

## Variables
* t is the test statistic
* $\bar{x}$ is the sample mean
* $\mu$ is the population mean
* s is the sample standard deviation
* n is the sample size (the degrees of freedom are $n - 1$)