Question 2 (One Sample Test)
Testing whether the mean pH of red wines differs from 3.5

Importing the Python Libraries & Loading the Dataset

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import scipy.stats as stats
from scipy.stats import t

#load the data
#note: file uses semicolon as delimiter
#the frame is named wine_df rather than df because the hypothesis-test cell
#further down binds the name df to the degrees of freedom
wine_df = pd.read_csv('winequality-red.csv', sep=';')

#variable for analysis
variable = 'pH'
data = wine_df[variable].dropna()  #missing values are dropped before any statistics
mu_0 = 3.5     #hypothesized mean under H0
alpha = 0.05   #significance level used for the critical value

#descriptive summary of the analysed column
print(f"Variable selected: {variable}")
print(f"Sample size (n): {len(data)}")
print(f"Sample mean: {data.mean():.4f}")
print(f"Sample standard deviation: {data.std(ddof=1):.4f}")

Hypotheses
Research Question:  
Is the average pH of red wines different from 3.5?

- Null hypothesis (H₀): μ = 3.5  
- Alternative hypothesis (H₁): μ ≠ 3.5  

This is a two-tailed one-sample t-test with α = 0.05.


Assumption Checks

Independence: Each observation represents a different wine sample, so independence is reasonable.

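Sample Size: n = 1599 ≥ 30, so by the Central Limit Theorem the sampling distribution of the sample mean is approximately normal, even if the individual pH values are not exactly normally distributed.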

Normality:
- The histogram is roughly bell-shaped.  
- The QQ plot shows points mostly along the reference line.  

Conclusion: The normality assumption is reasonably satisfied.


In [None]:
#code for part c, normality check
#draws a histogram and a normal QQ plot of `data` side by side

#create figure with two subplots
fig, axes = plt.subplots(1, 2, figsize=(12, 5))

#histogram, with the sample mean and the hypothesized mean marked for comparison
axes[0].hist(data, bins=30, edgecolor='black', alpha=0.7)
axes[0].axvline(data.mean(), color='red', linestyle='--', linewidth=2, label=f'Mean = {data.mean():.3f}')
axes[0].axvline(mu_0, color='green', linestyle='--', linewidth=2, label=f'H₀: μ = {mu_0}')
axes[0].set_xlabel(variable)
axes[0].set_ylabel('Frequency')
axes[0].set_title(f'Histogram of {variable}')
axes[0].legend()
axes[0].grid(True, alpha=0.3)

#QQ plot against the normal distribution, drawn on the second axes
stats.probplot(data, dist="norm", plot=axes[1])
axes[1].set_title(f'QQ Plot of {variable}')
axes[1].grid(True, alpha=0.3)

plt.tight_layout()
#show the figure; closing it here would discard it before it is rendered
plt.show()

Hypothesis Test Results

In [None]:
#conduct one-sample t-test of H0: mean == mu_0 (two-sided)
n = len(data)
sample_mean = data.mean()
sample_std = data.std(ddof=1)  #sample standard deviation (n - 1 denominator)
se = sample_std / np.sqrt(n)   #standard error of the sample mean

t_stat = (sample_mean - mu_0) / se
#degrees of freedom; the name df is kept because the t-distribution plot cell reads it
df = n - 1
#two-sided p-value from the survival function: 1 - cdf(...) evaluates to exactly 0
#once the cdf rounds to 1.0, whereas sf computes the upper tail directly
p_value = 2 * t.sf(abs(t_stat), df)
t_critical = t.ppf(1 - alpha/2, df)  #upper alpha/2 quantile; rejection region is |t| beyond it

print(f"Test statistic (t): {t_stat:.4f}")
print(f"Degrees of freedom: {df}")
#general format so that a very small p-value is not displayed as 0.0000
print(f"p-value: {p_value:.4g}")
print(f"Critical value (α = {alpha}): ±{t_critical:.4f}")
print(f"Rejection region: |t| > {t_critical:.4f}")

Confidence Interval

In [None]:
#confidence interval for the mean at level 1 - alpha (95% when alpha = 0.05)
#the level is derived from alpha because t_critical, used for the margin below,
#was computed from alpha; a hard-coded level could disagree with the interval
confidence_level = 1 - alpha
margin_of_error = t_critical * se
ci_lower = sample_mean - margin_of_error
ci_upper = sample_mean + margin_of_error

print(f"{confidence_level*100:.0f}% Confidence Interval for μ:")
print(f"({ci_lower:.4f}, {ci_upper:.4f})")
print(f"\nInterpretation:")
print(f"We are {confidence_level*100:.0f}% confident that the true mean {variable} of red wines")
print(f"lies between {ci_lower:.4f} and {ci_upper:.4f}.")

#report whether the hypothesized mean falls inside the interval
if ci_lower <= mu_0 <= ci_upper:
    print(f"\nThe hypothesized value μ₀ = {mu_0} IS within the confidence interval.")
else:
    print(f"\nThe hypothesized value μ₀ = {mu_0} is NOT within the confidence interval.")

Conclusion

Decision: REJECT H₀ (p-value < 0.05)

At the 0.05 significance level, there is sufficient evidence to conclude that the mean pH of red wines differs from 3.5. The sample mean pH is approximately 3.31, which suggests that the true mean is lower than the hypothesized value of 3.5.

In [None]:
#t-distribution visualization
#plots the reference t density, shades both rejection tails and marks the
#observed test statistic and the two critical values
fig, ax = plt.subplots(figsize=(10, 6))

#density curve evaluated on a fixed grid from -5 to 5
x = np.linspace(-5, 5, 1000)
y = t.pdf(x, df)
curve_label = f"t-distribution (df={df})"
ax.plot(x, y, "b-", linewidth=2, label=curve_label)

#grid points lying in each rejection tail
lower_tail = x[x <= -t_critical]
upper_tail = x[x >= t_critical]

#only the first shaded region carries a label, so the legend lists it once
ax.fill_between(
    lower_tail,
    0,
    t.pdf(lower_tail, df),
    alpha=0.3,
    color="red",
    label=f"Rejection region (α/2 = {alpha/2})",
)
ax.fill_between(upper_tail, 0, t.pdf(upper_tail, df), alpha=0.3, color="red")

#observed test statistic
ax.axvline(
    t_stat,
    color="green",
    linestyle="--",
    linewidth=2,
    label=f"Test statistic = {t_stat:.3f}",
)

#critical values on both sides
for critical_bound in (-t_critical, t_critical):
    ax.axvline(critical_bound, color="red", linestyle=":", linewidth=1.5)

ax.set_xlabel("t-value")
ax.set_ylabel("Probability Density")
ax.set_title(f"One-Sample t-Test: H₀: μ = {mu_0} vs H₁: μ ≠ {mu_0}")
ax.legend()
ax.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()