Data preparation

In [None]:
# Book Figure 9-1

import seaborn as sns

# Load the 'titanic' dataset through seaborn's dataset loader.
titanic = sns.load_dataset('titanic')

# Split the 'fare' column into two samples keyed on the 0/1 'survived' column.
not_survived_fare = titanic.loc[titanic['survived'] == 0, 'fare']
survived_fare = titanic.loc[titanic['survived'] == 1, 'fare']

Normality and t-test

In [None]:
# Book Figure 9-2

from scipy import stats
from scipy.stats import ttest_ind, shapiro, norm

# Significance level shared by every test in this cell.
alpha = 0.05

# Shapiro-Wilk normality check on each fare sample.
# The decision is made on the p-value: the W statistic never exceeds 1, so
# comparing it with a normal critical value (about 1.96) can never reject H0.
for label, fares in (('not survived', not_survived_fare),
                     ('survived', survived_fare)):
    stat, p = shapiro(fares)
    if p > alpha:
        print('Fail to Reject H0 (%s fare data distributed Normal)' % label)
    else:
        print('Reject H0 (%s fare data not distributed Normal)' % label)

# Run t-Test
# Independent two-sample t-test with scipy's default settings
# (equal variances assumed, two-sided alternative).
t_stat, p = ttest_ind(survived_fare, not_survived_fare)

print('Statistics=%.3f, p=%.3f' % (t_stat, p))
if p > alpha:
    print('Fail to Reject H0 (difference is non-significant)')
else:
    print('Reject H0 (difference is significant)')


Data generation for non-parametric tests

In [None]:
# generate uniform data samples
from numpy.random import rand, seed
from numpy import mean, std

# Fix the state of NumPy's legacy global generator so the draws are repeatable.
seed(1)

# Two univariate samples of 100 draws each, spanning roughly 50-55 and 51-56.
data1 = 50 + 5 * rand(100)
data2 = 51 + 5 * rand(100)

# Summarize each sample by its mean and standard deviation.
print('data1: mean=%.3f stdv=%.3f' % (mean(data1), std(data1)))
print('data2: mean=%.3f stdv=%.3f' % (mean(data2), std(data2)))


Chi-square test (one sample for a nominal variable)

In [26]:
# pandas is imported in this cell because it is the first one that needs it;
# without it the cell raises NameError when the notebook is run top to bottom.
import pandas as pd
from scipy.stats import chisquare

df = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/CSV_Files/09_Parametric_Nonparametric_Tests.csv')

# Observed frequency of each distinct Veh_Type value.
# NOTE(review): value_counts() orders categories by descending count, while
# f_exp is matched to it purely by position - confirm that the expected
# frequencies below are listed in that same category order.
f_obs = df['Veh_Type'].value_counts()
f_exp = [15, 12, 13, 16, 16, 17, 6, 5]

# One-sample chi-square test of the observed against the expected frequencies;
# left as the last expression so the notebook displays the result.
chisquare(f_obs=f_obs, f_exp=f_exp)



Power_divergenceResult(statistic=8.900150829562595, pvalue=0.25990456040611837)

Wilcoxon test (one sample for an ordinal variable)

In [3]:
from scipy.stats import wilcoxon
import pandas as pd
import numpy as np

# Load the data table from the CSV file on the mounted drive.
df = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/CSV_Files/09_Parametric_Nonparametric_Tests.csv')

# The single value the PCI column is compared against.
reference_value = 3

# Per-row offset of PCI from the reference value.
differences = (df.PCI - reference_value).tolist()

# One-sample Wilcoxon signed-rank test on those offsets.
statistic, p_value = wilcoxon(differences)

# Report the test statistic and p-value.
print(f"Wilcoxon Statistic: {statistic}")
print(f"P-value: {p_value}")

# Interpret the p-value at the conventional 0.05 level.
print(
    "The difference is statistically significant."
    if p_value < 0.05
    else "The difference is not statistically significant."
)


Wilcoxon Statistic: 1748.0
P-value: 0.8669500276906402
The difference is not statistically significant.


Wilcoxon test (one sample for a continuous variable)

In [64]:
# One-sample Wilcoxon signed-rank test on TT_Bef offset by the reference value 20.
stat, p = wilcoxon(df.TT_Bef - 20)

print('Statistics=%.3f, p=%.3f' % (stat, p))

# Interpret the p-value at the 5% significance level.
alpha = 0.05
print('Same distribution (fail to reject H0)' if p > alpha
      else 'Different distribution (reject H0)')


Statistics=2292.000, p=0.423
Same distribution (fail to reject H0)


Wilcoxon test (two paired samples)

In [29]:
from scipy.stats import wilcoxon

# Paired Wilcoxon signed-rank test between the TT_Bef and TT_Aft columns.
# (The synthetic data1/data2 arrays could be passed instead for a dry run.)
stat, p = wilcoxon(df.TT_Bef, df.TT_Aft)
print('Statistics=%.3f, p=%.3f' % (stat, p))

# Interpret the p-value at the 5% significance level.
alpha = 0.05
print('Same distribution (fail to reject H0)' if p > alpha
      else 'Different distribution (reject H0)')


Statistics=208.000, p=0.000
Different distribution (reject H0)


Mann-Whitney test (two unpaired samples)

In [27]:
from scipy.stats import mannwhitneyu

# Mann-Whitney U test between the Ma_Acc and Fe_Acc columns.
# (The synthetic data1/data2 arrays could be passed instead for a dry run.)
stat, p = mannwhitneyu(df.Ma_Acc, df.Fe_Acc)
print('Statistics=%.3f, p=%.3f' % (stat, p))

# Interpret the p-value at the 5% significance level.
alpha = 0.05
print('Same distribution (fail to reject H0)' if p > alpha
      else 'Different distribution (reject H0)')


Statistics=4256.000, p=0.068
Same distribution (fail to reject H0)


Kruskal-Wallis test (more than two unpaired samples)

In [34]:
import numpy as np
from scipy.stats import kruskal

# Synthetic samples kept from the original cell; they are not passed to the
# test below.
data1 = np.random.rand(100)*5 + 50
data2 = np.random.rand(100)*5 + 50
data3 = np.random.rand(100)*5 + 52

# Kruskal-Wallis H test across the Teh, Shi and Isf columns.
# The result is unpacked into `stat` (not `stats`) so that the value printed
# below belongs to this test rather than to an earlier cell, and so that the
# `stats` module imported from scipy earlier in the notebook is not shadowed.
stat, p = kruskal(df.Teh, df.Shi, df.Isf)
print('Statistics=%.3f, p=%.3f' % (stat, p))

# Interpret the p-value at the 5% significance level.
alpha = 0.05

if p > alpha:
    print('Same distribution (fail to reject H0)')
else:
    print('Different distribution (reject H0)')


Statistics=56.420, p=0.000
Different distribution (reject H0)


Friedman test (more than two paired samples)

In [61]:
from scipy.stats import friedmanchisquare

# Reload the data table from the CSV file on the mounted drive.
df = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/CSV_Files/09_Parametric_Nonparametric_Tests.csv')

# Three unseeded synthetic samples; they are not passed to the test below
# but could be substituted for the CSV columns for a dry run.
data1 = 50 + 5 * np.random.rand(100)
data2 = 50 + 5 * np.random.rand(100)
data3 = 52 + 5 * np.random.rand(100)

# Friedman test across the EM, MD and LN columns.
stat, p = friedmanchisquare(df.EM, df.MD, df.LN)
print('Statistics=%.3f, p=%.3f' % (stat, p))

# Interpret the p-value at the 5% significance level.
alpha = 0.05
print('Same distributions (fail to reject H0)' if p > alpha
      else 'Different distributions (reject H0)')


Statistics=73.860, p=0.000
Different distributions (reject H0)


In [54]:
# Mount Google Drive at /content/drive. The read_csv calls in this notebook
# load their file from under that path, so this cell has to run before them.
from google.colab import drive

drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
