In [22]:
# Core data-handling libraries.
import pandas as pd
import numpy as np
# Lift pandas' column-display limit so wide DataFrames are shown without truncated columns.
pd.set_option('display.max_columns', None)
import warnings
# Suppress every warning from this point on (note: this can also hide relevant deprecation notices).
warnings.filterwarnings('ignore')
# Plotting libraries.
import matplotlib.pyplot as plt
import seaborn as sns 
# SciPy's t-test for the means of two independent samples.
from scipy.stats import ttest_ind 

# Lesson 4

## Hypothesis testing

We want to test whether our **sample mean** differs from the **population mean** = 120. We also know that our **sample** has a size of 100 individuals. The test statistic for this comparison is the one-sample t-statistic:

$t = \frac{(\bar{X}-\mu)}{\hat{\sigma}/\sqrt{n}}$

where:

* $\bar{X}$ is the **sample mean**
* $\mu$ is the **population mean**
* $\hat{\sigma}$ is the **sample standard deviation**
* $n$ is the number of measurements (observations) in our sample

In [1]:
import math

# Summary statistics of the worked example (sample vs. population).
sample_mean, pop_mean = 130.1, 120
sample_std, n = 21.21, 100

# Standard error of the mean: sample standard deviation divided by sqrt(n).
standard_error = sample_std / math.sqrt(n)

# One-sample t statistic: distance between the sample mean and the population
# mean, expressed in standard errors.
statistic = (sample_mean - pop_mean) / standard_error

print("Statistic is: ", statistic)

Statistic is:  4.761904761904759


In [13]:
from scipy import stats
from numpy.random import normal


# Simulate 10 random samples and compute the one-sample t-statistic of each one
# against the population mean (`pop_mean`, defined in the worked example cell).
#
# Layout of `samples` (the p-value cell looks the raw draws up by "sample_<i>"):
#   "sample_<i>"              -> array of random draws
#   "sample_<i>_mean"         -> sample mean
#   "sample_<i>_std"          -> sample standard deviation (ddof=1)
#   "sample_<i>_t-statistic"  -> t-statistic against pop_mean

# Seed NumPy's global RNG (the one `normal` draws from) so that
# Restart & Run All reproduces the same samples and outputs.
np.random.seed(42)

samples = {}

for i in range(10):
    sample_name = "sample_" + str(i)
    # Dedicated key names: the original rebound `sample_mean` / `sample_std`
    # to these strings, clobbering the numeric globals of the worked example.
    mean_key = sample_name + "_mean"
    std_key = sample_name + "_std"
    stat_key = sample_name + "_t-statistic"

    draws = normal(loc=130.1, scale=21.21, size=100)
    samples[sample_name] = draws
    samples[mean_key] = np.mean(draws)
    samples[std_key] = np.std(draws, ddof=1)

    # Derive the standard error from the actual number of draws rather than
    # from the unrelated global `n`, so the two can never drift apart.
    standard_error = samples[std_key] / math.sqrt(len(draws))
    samples[stat_key] = (samples[mean_key] - pop_mean) / standard_error

    print("The t-statistic for the sample {} is: {}".format(i,samples[stat_key]))


The t-statistic for the sample 0 is: 5.368003783320794
The t-statistic for the sample 1 is: 3.956477446125988
The t-statistic for the sample 2 is: 5.889698858852175
The t-statistic for the sample 3 is: 4.475653506813489
The t-statistic for the sample 4 is: 4.619747471092248
The t-statistic for the sample 5 is: 6.63010134697367
The t-statistic for the sample 6 is: 5.827357606089939
The t-statistic for the sample 7 is: 4.301238035986397
The t-statistic for the sample 8 is: 4.3600456799175715
The t-statistic for the sample 9 is: 5.683266933141984


Now that we have the t-statistic for each random sample, let's perform a two-tailed test. Why two tails? Because we want to know the probability of getting a **sample mean** that deviates from the **population mean** by more than our t-statistic. We don't care whether our **sample mean** is bigger or smaller than the **population mean**.

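Therefore, we can ask ourselves what is the probability of having a deviation within $-t$ and $t$. The p-value is the complement of that probability, i.e. the probability of a deviation at least as extreme as ours (outside the interval $[-t, t]$).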


In [16]:
# One-sample t-test of every simulated sample against the population mean
# (`pop_mean` is 120, defined in the worked example cell; ttest_1samp is
# two-sided by default).
alpha = 0.05  # significance level

print("Assuming a significance level of {}".format(alpha))

for i in range(10):
    sample_name = "sample_" + str(i)

    # Run the test once per sample and reuse its p-value for both the report
    # and the decision (the original recomputed the test for the comparison).
    p_value = stats.ttest_1samp(samples[sample_name], pop_mean).pvalue

    print("The p-value of sample {} is: {:-5.3}".format(i, p_value))

    if p_value < alpha:
        print("Therefore we discard the null hypothesis Ho, as it's very unlikely to get sample {} given Ho.".format(i))
    else:
        # Previously nothing was printed here, leaving the verdict implicit.
        print("Therefore we cannot discard the null hypothesis Ho for sample {}.".format(i))
    print()
    
    

Assuming a significance level of 0.05
The p-value of sample 0 is: 5.26e-07
Therefore we discard the null hypothesis Ho, as it's very unlikely to get sample 0 given Ho.

The p-value of sample 1 is: 0.000143
Therefore we discard the null hypothesis Ho, as it's very unlikely to get sample 1 given Ho.

The p-value of sample 2 is: 5.33e-08
Therefore we discard the null hypothesis Ho, as it's very unlikely to get sample 2 given Ho.

The p-value of sample 3 is: 2.04e-05
Therefore we discard the null hypothesis Ho, as it's very unlikely to get sample 3 given Ho.

The p-value of sample 4 is: 1.16e-05
Therefore we discard the null hypothesis Ho, as it's very unlikely to get sample 4 given Ho.

The p-value of sample 5 is: 1.78e-09
Therefore we discard the null hypothesis Ho, as it's very unlikely to get sample 5 given Ho.

The p-value of sample 6 is: 7.04e-08
Therefore we discard the null hypothesis Ho, as it's very unlikely to get sample 6 given Ho.

The p-value of sample 7 is: 3.99e-05
Therefor

In [16]:
# Inferential statistics - T-test & P-value

# Tab-separated text file for the lab; the displayed table has
# 'New machine' and 'Old machine' columns.
machine_file = '../lab-t-tests-p-values/files_for_lab/machine.txt'
data = pd.read_csv(machine_file, sep='\t')

In [17]:
# Display the loaded DataFrame (a bare last expression is rendered as the cell output).
data


Unnamed: 0,New machine,Old machine
0,42.1,42.7
1,41.0,43.6
2,41.3,43.8
3,41.8,43.3
4,42.4,42.5
5,42.8,43.5
6,43.2,43.1
7,42.3,41.7
8,41.8,44.0
9,42.7,44.1


In [24]:


# Two-sample comparison of the machines. In the displayed table the
# 'New machine' readings are lower than the 'Old machine' ones in most rows.
new_machine = data['New machine']
old_machine = data['Old machine']

# With its default settings ttest_ind assumes equal variances, i.e. it is the
# pooled-variance test:
#   pooled_std = sqrt(((n1-1)*s1**2 + (n2-1)*s2**2) / (n1+n2-2))
#   t          = (mean1 - mean2) / (pooled_std * sqrt(1/n1 + 1/n2))
# NOTE(review): the p-value returned by default is two-sided, whereas the
# original notes mention a one-sided test at alpha = 0.05 (critical value
# t.ppf(0.05, n1+n2-2)) -- confirm which alternative is intended.
statistic, pvalue = ttest_ind(new_machine, old_machine)

print("T Statistic is: ", statistic)
print("P value is: ", pvalue)


T Statistic is:  -3.3972307061176026
P value is:  0.0032111425007745158


In [None]:
# The p-value (~0.003) is smaller than alpha (0.05), so we reject Ho: the mean values of the new and old machines differ significantly.