Question 2: One-Sample Hypothesis Test

Testing whether the mean pH of red wines differs from 3.5

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import scipy.stats as stats
from scipy.stats import t

#load the data; the file is read with ';' as its field separator
#the frame is named wine_df (not df) because a later cell assigns the degrees
#of freedom to `df`, which would otherwise overwrite the loaded DataFrame
wine_df = pd.read_csv('winequality-red.csv', sep=';')

#variable for analysis
variable = 'pH'
#drop missing values so that n, the mean and the standard deviation all
#describe the same set of observations
data = wine_df[variable].dropna()

print(f"Variable selected: {variable}")
print(f"Sample size (n): {len(data)}")
print(f"Sample mean: {data.mean():.4f}")
#ddof=1 -> sample standard deviation (n - 1 in the denominator)
print(f"Sample standard deviation: {data.std(ddof=1):.4f}")

Hypothesis Statement

Research Question: Is the mean pH of red wines different from 3.5?

Null Hypothesis (H₀): μ = 3.5  
Alternative Hypothesis (H₁): μ ≠ 3.5

This is a two-tailed test at the α = 0.05 significance level.

In [None]:
#significance level used for the two-tailed test
alpha: float = 0.05
#population mean stated by the null hypothesis
mu_0: float = 3.5

Checking Assumptions

Assumption 1 - Independence:
The wine samples are assumed to be independently collected. Each observation represents a different wine sample.

Assumption 2 - Sample Size:
Sample size n = 1599  
The sample size is large (n ≥ 30), so by the Central Limit Theorem the sampling distribution of the sample mean is approximately normal.

Assumption 3 - Normality:
Visual checks using histogram and QQ plot

In [None]:
#one figure, two panels: histogram on the left, normal QQ plot on the right
fig, axes = plt.subplots(1, 2, figsize=(12, 5))
hist_ax, qq_ax = axes

#both reference lines on the histogram share the same dashed style
marker_style = dict(linestyle='--', linewidth=2)

#left panel: distribution of the sample, with the sample mean and the
#hypothesized mean marked as vertical lines
hist_ax.hist(data, bins=30, edgecolor='black', alpha=0.7)
hist_ax.axvline(data.mean(), color='red', label=f'Mean = {data.mean():.3f}', **marker_style)
hist_ax.axvline(mu_0, color='green', label=f'H₀: μ = {mu_0}', **marker_style)
hist_ax.set(
    xlabel=variable,
    ylabel='Frequency',
    title=f'Histogram of {variable}',
)
hist_ax.legend()

#right panel: sample quantiles against the quantiles of a normal distribution
stats.probplot(data, dist="norm", plot=qq_ax)
qq_ax.set_title(f'QQ Plot of {variable}')

#same light grid on both panels
for panel in axes:
    panel.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

Visual Assessment:
- From the histogram: The distribution appears roughly bell-shaped and approximately symmetric, though with some slight left skewness.
- From the QQ plot: Most points fall close to the reference line, suggesting the data is approximately normally distributed.

Conclusion: The normality assumption is reasonably satisfied.

Note: With a large sample size (n = 1599), the Central Limit Theorem implies that the sampling distribution of the sample mean is approximately normal, so the t-test remains reliable even if the data are not perfectly normal.

Hypothesis Test

In [None]:
#one-sample t-test computed by hand from the sample summary statistics
n = len(data)
if n < 2:
    #with fewer than 2 observations the sample standard deviation is undefined
    raise ValueError("A one-sample t-test needs at least 2 observations.")

sample_mean = data.mean()
sample_std = data.std(ddof=1)  #sample standard deviation (n - 1 denominator)
se = sample_std / np.sqrt(n)   #standard error of the mean
if se == 0:
    #zero spread would make the division below meaningless
    raise ValueError("All observations are identical; the t statistic is undefined.")

#test statistic
t_stat = (sample_mean - mu_0) / se

#degrees of freedom
#the name `df` is kept because the later cells read the degrees of freedom from it
df = n - 1

#p-value (two-tailed)
#the survival function sf(x) = 1 - cdf(x) is evaluated directly, so a very
#small tail probability is not rounded to exactly 0 as it can be with 1 - cdf(x)
p_value = 2 * t.sf(abs(t_stat), df)

#critical value at the chosen alpha (upper alpha/2 quantile of the t-distribution)
t_critical = t.ppf(1 - alpha/2, df)

print("Test: One-sample t-test")
print(f"Test statistic (t): {t_stat:.4f}")
print(f"Degrees of freedom: {df}")
print(f"p-value: {p_value:.4f}")
print(f"Critical value (α = {alpha}): ±{t_critical:.4f}")
print(f"Rejection region: |t| > {t_critical:.4f}")

Confidence Interval

In [None]:
#confidence interval that matches the two-tailed test
#the level is derived from alpha because t_critical was computed from alpha;
#this keeps the printed percentage in step with the interval actually built
#(alpha = 0.05 gives the 95% interval)
confidence_level = 1 - alpha
margin_of_error = t_critical * se
ci_lower = sample_mean - margin_of_error
ci_upper = sample_mean + margin_of_error

print(f"{confidence_level*100:.0f}% Confidence Interval for μ:")
print(f"({ci_lower:.4f}, {ci_upper:.4f})")
print("\nInterpretation:")
print(f"We are {confidence_level*100:.0f}% confident that the true mean {variable} of red wines")
print(f"lies between {ci_lower:.4f} and {ci_upper:.4f}.")

#check if mu_0 is in the CI
#for a two-tailed test at the same alpha this agrees with the test decision:
#mu_0 lies outside the interval exactly when |t| exceeds the critical value
if ci_lower <= mu_0 <= ci_upper:
    print(f"\nNote: The hypothesized value μ₀ = {mu_0} IS within the confidence interval.")
else:
    print(f"\nNote: The hypothesized value μ₀ = {mu_0} is NOT within the confidence interval.")
Conclusion

In [None]:
#state the decision rule and the observed p-value
print(f"Decision rule: Reject H₀ if p-value < α = {alpha}")
print(f"p-value = {p_value:.4f}")

#the null hypothesis is rejected only when the p-value is below alpha
reject_null = p_value < alpha

#build the report for whichever outcome applies, then print it line by line
if not reject_null:
    report = [
        f"\nDecision: FAIL TO REJECT H₀ (p-value = {p_value:.4f} ≥ {alpha})",
        f"\nConclusion in context:",
        f"At the {alpha} significance level, there is insufficient evidence to conclude",
        f"that the mean {variable} of red wines is different from {mu_0}.",
        f"The data is consistent with the hypothesis that the mean {variable} is {mu_0}.",
    ]
else:
    report = [
        f"\nDecision: REJECT H₀ (p-value = {p_value:.4f} < {alpha})",
        f"\nConclusion in context:",
        f"At the {alpha} significance level, there is sufficient evidence to conclude",
        f"that the mean {variable} of red wines is significantly different from {mu_0}.",
        f"The sample data suggests the true mean {variable} is approximately {sample_mean:.4f},",
        f"which is {'higher' if sample_mean > mu_0 else 'lower'} than the hypothesized value of {mu_0}.",
    ]

for line in report:
    print(line)

Visualization of Hypothesis Test

In [None]:
#t-distribution plot: density curve, shaded rejection regions and the
#observed test statistic
fig, ax = plt.subplots(figsize=(10, 6))

#the x-range defaults to ±5 and is widened whenever the test statistic or the
#critical values lie further out, so every marker that is drawn sits on the curve
stat_extent = abs(t_stat) if np.isfinite(t_stat) else 0.0
x_limit = max(5.0, stat_extent + 1, t_critical + 1)
#100 points per unit of x, i.e. 1000 points for the default ±5 range
num_points = int(round(200 * x_limit))

x = np.linspace(-x_limit, x_limit, num_points)
y = t.pdf(x, df)

#boolean masks selecting the two tails beyond the critical values
lower_tail = x <= -t_critical
upper_tail = x >= t_critical

ax.plot(x, y, 'b-', linewidth=2, label=f't-distribution (df={df})')
#only the lower tail carries a legend label so the legend shows a single entry
ax.fill_between(x[lower_tail], 0, y[lower_tail],
                alpha=0.3, color='red', label=f'Rejection region (α/2 = {alpha/2})')
ax.fill_between(x[upper_tail], 0, y[upper_tail],
                alpha=0.3, color='red')
ax.axvline(t_stat, color='green', linestyle='--', linewidth=2,
           label=f'Test statistic = {t_stat:.3f}')
ax.axvline(-t_critical, color='red', linestyle=':', linewidth=1.5)
ax.axvline(t_critical, color='red', linestyle=':', linewidth=1.5)
ax.set_xlabel('t-value')
ax.set_ylabel('Probability Density')
ax.set_title(f'One-Sample t-Test: H₀: μ = {mu_0} vs H₁: μ ≠ {mu_0}')
ax.legend()
ax.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()