Data preparation

In [34]:
# Book Figure 9-1

import seaborn as sns

# Load seaborn's bundled Titanic sample data.
titanic = sns.load_dataset('titanic')

# Split the `fare` column by outcome: rows whose `survived` value is 0 go to
# one series, rows whose value is 1 go to the other. Row mask and column are
# selected together in a single .loc lookup.
not_survived_fare = titanic.loc[titanic['survived'] == 0, 'fare']
survived_fare = titanic.loc[titanic['survived'] == 1, 'fare']

Normality and t-test

In [None]:
# Book Figure 9-2

from scipy import stats
from scipy.stats import ttest_ind, shapiro, norm

# Significance level shared by every test in this cell.
alpha = 0.05

# Shapiro-Wilk normality check for each fare sample.
# H0: the sample was drawn from a normal distribution.
# The decision is made on the p-value. The W statistic returned by shapiro()
# lies in (0, 1] and is not a z-score, so comparing it with a normal critical
# value (about 1.96) would always "fail to reject" regardless of the data.
for label, sample in (('not survived', not_survived_fare),
                      ('survived', survived_fare)):
    stat, p = shapiro(sample)
    if p > alpha:
        print('Fail to Reject H0 (%s fare data distributed Normal)' % label)
    else:
        print('Reject H0 (%s fare data not distributed Normal)' % label)

# Run t-Test
# Independent two-sample t-test on the two fare groups, using ttest_ind's
# default settings. H0: both groups have the same mean fare.
t_stat, p = ttest_ind(survived_fare, not_survived_fare)

print('Statistics=%.3f, p=%.3f' % (t_stat, p))
if p > alpha:
    print('Fail to Reject H0 (difference is non-significant)')
else:
    print('Reject H0 (difference is significant)')


Data generation for non-parametric tests

In [None]:
# generate uniform data samples
from numpy.random import seed
from numpy.random import rand
from numpy import mean
from numpy import std

# Pin NumPy's global random state so both samples are reproducible.
seed(1)

# Draw two uniform samples of 100 points each. rand() yields values in
# [0, 1), so data1 lies in [50, 55) and data2 in [51, 56): same spread,
# shifted up by one.
data1 = 50 + 5 * rand(100)
data2 = 51 + 5 * rand(100)

# Report the mean and (population) standard deviation of each sample.
print('data1: mean=%.3f stdv=%.3f' % (data1.mean(), data1.std()))
print('data2: mean=%.3f stdv=%.3f' % (data2.mean(), data2.std()))


Mann-Whitney U test

In [None]:
from scipy.stats import mannwhitneyu
# Run the Mann-Whitney U test on the two samples and report the result.
stat, p = mannwhitneyu(data1, data2)
print('Statistics=%.3f, p=%.3f' % (stat, p))

# Decide at the 5% level: H0 is kept only when the p-value exceeds alpha.
alpha = 0.05
print(
    'Same distribution (fail to reject H0)'
    if p > alpha
    else 'Different distribution (reject H0)'
)


Wilcoxon test

In [None]:
from scipy.stats import wilcoxon
# Wilcoxon signed-rank test, passing the two samples as separate arguments
# (they are paired element by element).
stat, p = wilcoxon(data1, data2)
print('Statistics=%.3f, p=%.3f' % (stat, p))

# 5% significance level; handle the rejection branch first.
alpha = 0.05
if not p > alpha:
    print('Different distribution (reject H0)')
else:
    print('Same distribution (fail to reject H0)')


Wilcoxon paired test

In [None]:
# Wilcoxon signed-rank test applied directly to the element-wise differences
# between the two samples.
stat, p = wilcoxon(data1 - data2)
print('Statistics=%.3f, p=%.3f' % (stat, p))

# Interpret the p-value at the 5% level.
alpha = 0.05
print(
    'Same distribution (fail to reject H0)'
    if p > alpha
    else 'Different distribution (reject H0)'
)


Kruskal-Wallis test

In [None]:
import numpy as np
from scipy.stats import kruskal

# Seed NumPy's global generator so this cell gives the same samples (and the
# same test result) on every run.
np.random.seed(1)

# Three uniform samples of 100 points each; data3 is shifted up by 2
# relative to data1 and data2.
data1 = np.random.rand(100)*5 + 50
data2 = np.random.rand(100)*5 + 50
data3 = np.random.rand(100)*5 + 52

# Kruskal-Wallis H-test across the three samples.
# The result is unpacked into `stat` (not `stats`) so that the value printed
# below is the one just computed, and so that a `stats` module imported
# earlier in the notebook is not overwritten.
stat, p = kruskal(data1, data2, data3)
print('Statistics=%.3f, p=%.3f' % (stat, p))

# Interpret at the 5% significance level.
alpha = 0.05

if p > alpha:
    print('Same distribution (fail to reject H0)')
else:
    print('Different distribution (reject H0)')


Friedman test

In [41]:
from scipy.stats import friedmanchisquare

# Reset NumPy's global random state so the three samples are reproducible.
seed(1)

# Three uniform samples of 100 points each; data3 is shifted up by 2
# relative to data1 and data2.
data1 = 50 + 5 * np.random.rand(100)
data2 = 50 + 5 * np.random.rand(100)
data3 = 52 + 5 * np.random.rand(100)

# Friedman chi-square test across the three samples.
stat, p = friedmanchisquare(data1, data2, data3)
print('Statistics=%.3f, p=%.3f' % (stat, p))

# Interpret at the 5% significance level.
alpha = 0.05
print(
    'Same distributions (fail to reject H0)'
    if p > alpha
    else 'Different distributions (reject H0)'
)


Statistics=76.020, p=0.000
Different distributions (reject H0)
