Hypothesis testing 

In [None]:
# Two-sided Z-test on the mean of x1; sigma / sqrt(n) is used as the
# standard error of the sample mean.
mu0 = 10  # mean under the null hypothesis
alpha = 0.05  # significance level

# Test statistic: distance of the sample mean from mu0 in standard-error units
Z_0 = (np.mean(x1) - mu0) / (sigma / np.sqrt(n))

# Critical value: ppf is the inverse CDF, evaluated at the upper alpha/2 point
z_alpha2 = stats.norm.ppf(1 - alpha / 2)

# H0 is rejected when |Z_0| exceeds the critical value (two-sided region)
reject_h0 = np.abs(Z_0) > z_alpha2
message = ('Reject the null hypothesis at alpha = %.2f' if reject_h0
           else 'Accept the null hypothesis at alpha = %.2f')
print(message % alpha)

In [None]:
# Two-sided confidence interval on the mean, built by hand:
# sample mean +/- critical value * standard error
sample_mean = np.mean(x1)
half_width = z_alpha2 * sigma/np.sqrt(n)
CI = [sample_mean - half_width, sample_mean + half_width]
print('Confidence interval: %.3f, %.3f' % (CI[0],CI[1]))

# Same interval from scipy: a normal centred on the sample mean whose
# scale is the standard error, at confidence level 1 - alpha
CI = stats.norm.interval(
    1-alpha,
    loc=np.mean(x1),
    scale=sigma/np.sqrt(n),
)
print('Confidence interval: %.3f, %.3f' % (CI[0],CI[1]))

In [None]:
# p-value of the Z-test. The rejection region is two-sided, so the
# probability beyond |Z_0| in one tail is doubled.
z_tail_prob = 1 - stats.norm.cdf(np.abs(Z_0))
pval = 2 * z_tail_prob
print('p-value = %.3f' % pval)

In [None]:
# Power curve of the two-sided Z-test as a function of the mean shift delta.
# Requires mu0, alpha and sigma (the standard deviation used by the Z-test)
# to be defined already.

delta = np.linspace(0, 30, 100)  # shifts of the true mean away from mu0
mu1 = mu0 + delta  # true means under the alternative hypothesis
Z_alpha2 = stats.norm.ppf(1 - alpha / 2)  # two-sided critical value

# Under the alternative, Z_0 ~ N(delta * sqrt(n) / sigma, 1), so
#   beta  = P(-Z_alpha2 <= Z_0 <= Z_alpha2)
#         = Phi(Z_alpha2 - shift) - Phi(-Z_alpha2 - shift)
#   power = 1 - beta
# The sample sizes 20 and 40 are the ones named in the plot labels below.
shift_20 = delta * np.sqrt(20) / sigma
power_20 = 1 - (stats.norm.cdf(Z_alpha2 - shift_20)
                - stats.norm.cdf(-Z_alpha2 - shift_20))

shift_40 = delta * np.sqrt(40) / sigma
power_40 = 1 - (stats.norm.cdf(Z_alpha2 - shift_40)
                - stats.norm.cdf(-Z_alpha2 - shift_40))

plt.plot(delta, power_20, label = "power (n = 20)")
plt.plot(delta, power_40, label = "power (n = 40)")
plt.xlabel("delta")
plt.ylabel("power")
plt.grid(True)
plt.legend()
plt.show()

In [None]:
# Dot plot of the values in data['d'], with the null-hypothesis value and
# the one-sided confidence interval drawn on a row just below the points.
ci_row_y = -0.01  # vertical position of the H_0 / C.I. annotations
d_max = np.max(data['d'])  # right-hand end of the drawn interval
red_marker = dict(color='r', s=100)  # shared look of the annotation markers

plt.title('One-sided confidence interval for the mean with CL = %.2f' % CL)

# Observations, all placed at y = 0
plt.scatter(data['d'], np.zeros(n), label='difference')

# Value under the null hypothesis
plt.scatter(delta0, ci_row_y, label='H_0', marker='x', **red_marker)

# Confidence interval: a bar at the lower bound, a line up to the largest
# observation, and an arrow head marking that the interval is open-ended
plt.scatter(CI_lower, ci_row_y, label='C.I.', marker='|', **red_marker)
plt.plot([CI_lower, d_max], [ci_row_y, ci_row_y], color='r')
plt.scatter(d_max, ci_row_y, marker='>', **red_marker)

# Axes cosmetics: the y axis carries no information, so hide its ticks
plt.ylim(-0.03, 0.03)
plt.xlabel('Difference (lbs)')
plt.yticks([])
plt.legend()
plt.grid()
plt.show()


In [None]:
# Power of the two-sided F-test when the variances differ by `ratio`
ratio = 1.5 # ratio between the variances of the two samples

# Critical values of the test under H0
f_crit_hi = stats.f.ppf(1-alpha/2, df1, df2)
f_crit_lo = stats.f.ppf(alpha/2, df1, df2)

# beta: probability that the statistic stays between the critical values,
# each of which is divided by the variance ratio before evaluating the CDF
beta = (stats.f.cdf(f_crit_hi/ratio, df1, df2)
        - stats.f.cdf(f_crit_lo/ratio, df1, df2))
print('Power of the test: %.3f' % (1-beta))

t-statistics

In [None]:
# One-sided (upper-tail) t-test on the mean of x1 with n - 1 degrees of freedom
alpha = 0.05   # significance level
s_x1 = np.std(x1, ddof=1)  # sample standard deviation (ddof=1)
root_n = np.sqrt(n)

# Test statistic
t_0 = (np.mean(x1) - mu0) / (s_x1 / root_n)

# Critical value: the whole alpha sits in the upper tail
t_alpha = stats.t.ppf(1-alpha, n-1)

# H0 is rejected only for large positive values of t_0
reject_h0 = t_0 > t_alpha
message = ('Reject the null hypothesis at alpha = %.2f' if reject_h0
           else 'Accept the null hypothesis at alpha = %.2f')
print(message % alpha)

# One-sided confidence interval: only a lower bound on the mean
CI = np.mean(x1) - t_alpha * s_x1 / root_n
print('Confidence interval (lower bound): %.3f' % (CI))

# p-value: upper-tail probability beyond t_0
pval = 1 - stats.t.cdf(t_0,n-1)
print('p-value = %.3f' % pval)

In [None]:
# Confidence interval on the mean of data['d'] from stats.t.interval:
# centre = sample mean, scale = sample standard deviation / sqrt(n)
d_mean = data['d'].mean()
d_std_err = data['d'].std() / np.sqrt(n)
CI = stats.t.interval(CL, df, loc=d_mean, scale=d_std_err)
print('Confidence interval: (%.3f, %.3f)' % (CI[0], CI[1]))

In [None]:
# One-sample t-test through scipy; alternative='greater' selects the
# upper-tail alternative hypothesis (mean of x1 greater than mu0)
ttest_res = stats.ttest_1samp(x1, popmean=mu0, alternative='greater')
t_0, pval = ttest_res
print('Test statistic t_0 = %.3f' % t_0)
print('p-value = %.3f' % pval)

Chi-squared test

In [None]:
# Confidence intervals on the variance of data['Water content'],
# based on chi-squared quantiles with n - 1 degrees of freedom
df = n - 1      # Degrees of freedom
water_var = data['Water content'].var()

# One-sided interval: dividing by the lower alpha quantile gives the upper bound
chi2 = stats.chi2.ppf(alpha, df)
CI_upper = df * water_var / chi2
print('Upper bound of the one-sided CI on the variance: %.3f' % CI_upper)

# Two-sided interval: alpha is split evenly between the two tails
chi2_1 = stats.chi2.ppf(alpha / 2, df)
chi2_2 = stats.chi2.ppf(1 - alpha / 2, df)

# The larger quantile yields the lower limit and vice versa
CI_var = [df * water_var / quantile for quantile in (chi2_2, chi2_1)]

# Limits on the standard deviation are the square roots of the variance limits
CI_stdev_d = np.sqrt(CI_var)
print('Two-sided CI on the standard deviation (CL = %.2f): [%.3f, %.3f]' % (CL, CI_stdev_d[0], CI_stdev_d[1]))

Normality assumption tests

In [None]:
# Shapiro-Wilk normality test on data['x1']; only the p-value is reported
shapiro_res = stats.shapiro(data['x1'])
p_value_SW = shapiro_res.pvalue
print('p-value of the Shapiro-Wilk test: %.3f' % p_value_SW)

# Normal probability (QQ) plot of the same data, drawn with matplotlib
stats.probplot(data['x1'], dist='norm', plot=plt)
plt.show()

In [None]:
# Anderson-Darling test
def ADpvalue(data):
    """
    Compute an approximate p-value for the Anderson-Darling normality test.

    scipy's ``stats.anderson`` returns the raw statistic A^2 and no p-value.
    The p-value is obtained from the piecewise-exponential approximation of
    D'Agostino & Stephens, which is defined on the small-sample-adjusted
    statistic

        A*^2 = A^2 * (1 + 0.75/n + 2.25/n^2)

    so the adjustment is applied before the formulas are evaluated.

    Input:
        data: one-dimensional sample to be tested (array-like or Series)
    Output:
        p_value_AD: approximate p-value of the Anderson-Darling test

    """
    anderson = stats.anderson(data, dist='norm')

    # Small-sample correction for the case where mean and variance are both
    # estimated from the data; without it the p-value is biased upwards.
    n_obs = np.size(data)
    a2_adj = anderson.statistic * (1 + 0.75 / n_obs + 2.25 / n_obs**2)

    # Piecewise approximation of the p-value as a function of A*^2
    if a2_adj >= 0.6:
        p_value_AD = np.exp(1.2937 - 5.709*a2_adj + 0.0186*(a2_adj**2))
    elif a2_adj >= 0.34:
        p_value_AD = np.exp(0.9177 - 4.279*a2_adj - 1.38*(a2_adj**2))
    elif a2_adj >= 0.2:
        p_value_AD = 1 - np.exp(-8.318 + 42.796*a2_adj - 59.938*(a2_adj**2))
    else:
        p_value_AD = 1 - np.exp(-13.436 + 101.14*a2_adj - 223.73*(a2_adj**2))

    return p_value_AD
    
# Apply the Anderson-Darling p-value helper to the 'Water content' column
water_content = data['Water content']
p_value_AD = ADpvalue(water_content)
print('p-value of the Anderson-Darling test: %.3f' % p_value_AD)

Variance assumptions

In [None]:
# Equality of variances: F statistic and confidence interval on the
# ratio of the two variances
df1 = n1 - 1 # degrees of freedom for supplier 1
df2 = n2 - 1 # degrees of freedom for supplier 2

# Test statistic: ratio of the two sample variances
F0 = data1.var()/data2.var()

# Interval limits: F0 scaled by the alpha/2 and 1 - alpha/2 quantiles of
# the F distribution with the degrees of freedom in swapped order (df2, df1)
CI = [F0 * stats.f.ppf(prob, df2, df1) for prob in (alpha/2, 1-alpha/2)]
print('Confidence interval on the ratio of variances (CL = %.2f): [%.3f, %.3f]' % (CL, CI[0], CI[1]))

In [None]:
# Plot the F(df1, df2) density, shade both rejection tails and mark F0
x = np.linspace(0, 4, 100)
pdf_h0 = stats.f.pdf(x, df1, df2)

# Critical values that delimit the two-sided rejection region
F_1 = stats.f.ppf(alpha/2, df1, df2)
F_2 = stats.f.ppf(1-alpha/2, df1, df2)

# Density curve
plt.plot(x, pdf_h0, label='Distribution under H_0')

# Title, axis labels and grid
plt.title("F Distribution, %d, %d" % (df1, df2))
plt.xlabel("Values")
plt.ylabel("Probability Density")
plt.grid(True)

# Lower tail: from 0 up to F_1
x_fill = np.linspace(0, F_1, 100)
y_fill = stats.f.pdf(x_fill, df1, df2)
plt.fill_between(x_fill, y_fill, color='red', alpha=0.5)

# Upper tail: from F_2 to the end of the plotted range; only this patch
# carries the legend label, so the legend shows a single entry for both
x_fill = np.linspace(F_2, np.max(x), 100)
y_fill = stats.f.pdf(x_fill, df1, df2)
plt.fill_between(x_fill, y_fill, color='red', alpha=0.5, label='Critical Regions')

# Annotate the two critical F values next to the shaded regions
plt.text(F_1, 0.1, r'$F_{%.3f} = {%.3f}$' % (alpha/2, F_1), fontsize=10)
plt.text(F_2, 0.1, r'$F_{%.3f} = {%.3f}$' % (1-alpha/2, F_2), fontsize=10)

# Dashed vertical line at the observed statistic, as tall as the density peak
plt.vlines(F0, 0, np.max(pdf_h0), color='r', linestyle='--', label='F_0')
plt.legend()
plt.show()

In [None]:
# Two-sided p-value of the F-test for equal variances.
# F0 may fall in either tail of the F(df1, df2) distribution, so the
# p-value is twice the smaller tail probability. Doubling the lower-tail
# probability alone is only valid when F0 lies below the median and gives
# values above 1 otherwise.
lower_tail = stats.f.cdf(F0, df1, df2)
upper_tail = stats.f.sf(F0, df1, df2)  # survival function = 1 - cdf
p_value_F0 = 2 * min(lower_tail, upper_tail)
print('p-value for F-test for equal variances: %.3f' % p_value_F0)