#### import packages

In [14]:
import pandas as pd
import numpy as np
from scipy.stats import ttest_1samp, ttest_ind, mannwhitneyu, levene, shapiro
from statsmodels.stats.power import ttest_power
from statsmodels.stats.weightstats import ztest

In [2]:
# Each row is (energy expenditure, stature flag); flag 0 = obese, 1 = lean.
# NOTE(review): the unit was originally written as "mJ" (millijoule);
# presumably megajoules (MJ) are meant - confirm against the data source.
energ = np.array(
    [
        (9.21, 0),
        (7.53, 1),
        (7.48, 1),
        (8.08, 1),
        (8.09, 1),
        (10.15, 1),
        (8.40, 1),
        (10.88, 1),
        (6.13, 1),
        (7.90, 1),
        (11.51, 0),
        (12.79, 0),
        (7.05, 1),
        (11.85, 0),
        (9.97, 0),
        (7.48, 1),
        (8.79, 0),
        (9.69, 0),
        (9.68, 0),
        (7.58, 1),
        (9.19, 0),
        (8.11, 1),
    ],
    dtype=float,
)

In [3]:
# Separating the data into 2 groups by the stature flag in column 1,
# keeping only the expenditure values from column 0
group1 = energ[energ[:, 1] == 0, 0]  # rows flagged 0
group2 = energ[energ[:, 1] == 1, 0]  # rows flagged 1

In [4]:
# Independent two-sample t-test.
#   H0: the two groups have the same mean.
# The pooled-variance form used here assumes equal group variances
# (Levene's test is one way to check that assumption).
# "Independent" groups: e.g., how boys and girls fare at one exam.
# "Dependent" groups: e.g., how the same class fares at 2 different exams.
t_statistic, p_value = ttest_ind(group1, group2, equal_var=True)
print(t_statistic, p_value)

3.9455649161549835 0.0007989982111700593


In [5]:
# p_value < 0.05, so H0 is rejected in favour of the alternative:
# the group means differ at the 5% significance level
print("two-sample t-test p-value=", p_value)

two-sample t-test p-value= 0.0007989982111700593


In [6]:
# Energy intake per subject: each row is (pre-surgery, post-surgery)
intake = np.array(
    [
        (5260, 3910),
        (5470, 4220),
        (5640, 3885),
        (6180, 5160),
        (6390, 5645),
        (6515, 4680),
        (6805, 5265),
        (7515, 5975),
        (7515, 6790),
        (8230, 6900),
        (8770, 7335),
    ]
)

In [7]:
# Separating the data into 2 groups: column 0 is pre, column 1 is post
pre, post = intake[:, 0], intake[:, 1]

In [8]:
# Paired t-test: two measurements on the same experimental unit
# (e.g., before and after a treatment). It is carried out here as a
# one-sample t-test of the per-unit differences against a mean of 0.
t_statistic, p_value = ttest_1samp(post - pre, popmean=0)
print(t_statistic, p_value)

-11.941392877647603 3.059020942934875e-07


In [9]:
# p < 0.05, so H0 is rejected in favour of the alternative:
# the mean difference is not equal to 0
print("paired t-test p-value=", p_value)

paired t-test p-value= 3.059020942934875e-07


In [10]:
# Power of a two-sided t-test for effect size 0.587, 22 observations, alpha = 0.10.
# NOTE(review): the origin of 0.587 and nobs=22 is not shown in this notebook;
# also confirm that ttest_power models the intended (one- vs. two-sample) design.
print(ttest_power(effect_size=0.587, nobs=22, alpha=0.10, alternative='two-sided'))

0.8456736280306766


#### Workout


In [13]:
# Load the insurance data and split it on the "sex" column.
# (A bare `data["sex"].unique()` used to sit here; mid-cell its result was
# neither displayed nor stored, so it has been dropped.)
data = pd.read_csv("insurance.csv")
m_data = data[data["sex"] == "male"]
f_data = data[data["sex"] == "female"]

# Mean age of the male rows, then of the female rows
print(m_data["age"].mean(), f_data["age"].mean())

38.917159763313606 39.503021148036254


##### Two tailed test

Ho - Average age of male = Average age of Female / Ha - Average age of Male <> Average age of Female

In [16]:
# Two-sided comparison of mean age (male vs. female), once with the
# t-test and once with the z-test against a difference of 0
print(ttest_ind(m_data["age"], f_data["age"]))
print(ztest(m_data["age"], x2=f_data["age"], value=0))

# p-value greater than 0.05: we fail to reject H0


Ttest_indResult(statistic=-0.7624757892095714, pvalue=0.4459106812355743)
(-0.7624757892095714, 0.4457760876380684)


##### One Tailed test

Ho - Difference in age is >= 5 / Ha - Difference in age is less than 5

In [18]:
def t_test(x, y, alternative='both-sided'):
    """Return the p-value of a two-sample t-test for a chosen alternative.

    The test is run with ``equal_var=False`` (unequal-variance form), and the
    one-sided p-values are derived from the two-sided one.

    Parameters
    ----------
    x, y : array-like
        The two samples to compare.
    alternative : str, optional
        Which alternative hypothesis to evaluate:

        * ``'both-sided'`` (default) or ``'two-sided'``: mean(x) != mean(y)
        * ``'larger'``:  mean(x) > mean(y)
        * ``'smaller'``: mean(x) < mean(y)

    Returns
    -------
    float
        The p-value for the requested alternative.

    Raises
    ------
    ValueError
        If ``alternative`` is not one of the names listed above. (Previously
        an unknown name surfaced as an UnboundLocalError on ``pval``.)
    """
    two_sided_names = ('both-sided', 'two-sided')
    if alternative not in two_sided_names + ('larger', 'smaller'):
        raise ValueError(
            "alternative must be 'both-sided', 'two-sided', 'larger' or "
            "'smaller', got %r" % (alternative,)
        )

    t_stat, double_p = ttest_ind(x, y, equal_var=False)
    if alternative in two_sided_names:
        return double_p

    # The sign of the t statistic is the sign of mean(x) - mean(y); it tells
    # us whether the observed difference points in the hypothesised direction.
    if alternative == 'larger':
        in_hypothesised_direction = t_stat > 0
    else:  # 'smaller'
        in_hypothesised_direction = t_stat < 0

    # Half of the two-sided p-value lies in each tail: take that half when the
    # data agree with the alternative, otherwise take the complement.
    if in_hypothesised_direction:
        return double_p / 2.
    return 1.0 - double_p / 2.

In [34]:
# One-sided ("smaller") comparison of female vs. male age: z-test first,
# then the t_test helper.
# NOTE(review): the hypothesis text for this section mentions a difference
# of 5, but neither call passes a shift of 5 (ztest's `value` is left at its
# default and t_test takes no such parameter) - confirm the intended null.
print(ztest(f_data["age"], x2=m_data["age"], alternative="smaller"))
print(t_test(f_data["age"], m_data["age"], alternative='smaller'))

(0.7624757892095714, 0.7771119561809658)
0.7770439374895788
