In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import ttest_ind, f_oneway, shapiro, levene, kruskal, probplot
from statsmodels.stats.multicomp import pairwise_tukeyhsd

# Introduction
# The PlantGrowth dataset is a widely recognized dataset in statistical analysis and agricultural research, often used to teach experimental design and hypothesis testing [1].
# It consists of 30 observations, where plant weights are measured under three different conditions: control ('ctrl'), treatment 1 ('trt1'), and treatment 2 ('trt2').
# This project aims to:
# - Explore the dataset visually and statistically to understand its structure and key characteristics.
# - Use statistical tests such as t-tests and ANOVA to evaluate whether there are significant differences in plant weights across groups.
# - Validate assumptions of normality, variance equality, and independence to ensure the reliability of conclusions.
# - Discuss practical and theoretical implications of the findings in the context of agricultural experimentation.
# The methods used align with principles introduced by Fisher [1] and are widely applied in modern experimental designs [3, 4].
# The tools and libraries used, such as Pandas and Seaborn, are essential for data analysis in Python [5]. Online tutorials and documentation, such as those on Datacamp and Seaborn, provide further resources for learning and applying these methods [6, 7].

In [None]:
# 1. Load and Explore the Dataset
# Loading the dataset is the first step to familiarize ourselves with its
# structure. This includes examining missing values, understanding group
# distributions, and summarizing key statistics.
df = pd.read_csv('PlantGrowth.csv')

# Every later cell indexes df['group'] and df['weight'], so fail here with a
# clear message instead of a bare KeyError further down the notebook.
missing_columns = {'group', 'weight'} - set(df.columns)
if missing_columns:
    raise KeyError(
        f"PlantGrowth.csv is missing expected column(s): {sorted(missing_columns)}"
    )

# Dataset overview
print("Dataset Overview:")
print(df.head())

print("\nDataset Information:")
# DataFrame.info() writes its report to stdout itself and returns None, so
# wrapping it in print() would append a stray "None" line to the output.
df.info()

# Check for missing values (per-column count of nulls)
missing_values = df.isnull().sum()
print("\nMissing Values:")
print(missing_values if missing_values.any() else "No missing values detected.")

# Summary statistics of the numeric columns
print("\nSummary Statistics:")
print(df.describe())

# Count observations in each group
print("\nGroup Distribution:")
print(df['group'].value_counts())

In [None]:
# Visualizations: boxplot and histograms of weight, split by group.

# Figure 1: one box per value of the 'group' column.
fig_box, ax_box = plt.subplots(figsize=(12, 6))
sns.boxplot(data=df, x='group', y='weight', palette='Set2', ax=ax_box)
ax_box.set_title('Boxplot: Weight Distribution by Group')
ax_box.set_xlabel('Group')
ax_box.set_ylabel('Weight (grams)')
plt.show()

# Figure 2: one histogram (with KDE overlay) per group, drawn on shared axes.
fig_hist, ax_hist = plt.subplots(figsize=(12, 8))
for group_name in df['group'].unique():
    group_weights = df.loc[df['group'] == group_name, 'weight']
    sns.histplot(
        group_weights,
        kde=True,
        label=group_name,
        alpha=0.6,
        binwidth=0.2,
        ax=ax_hist,
    )
ax_hist.set_title('Histogram: Weight Distribution by Group')
ax_hist.set_xlabel('Weight (grams)')
ax_hist.set_ylabel('Frequency')
ax_hist.legend(title='Group')
plt.show()

In [None]:
# Visualization: density plot with all groups on one set of axes.
fig_kde, ax_kde = plt.subplots(figsize=(10, 6))
sns.kdeplot(
    data=df,
    x='weight',
    hue='group',
    fill=True,
    common_norm=False,  # passed through unchanged from the original call
    alpha=0.5,
    palette='muted',
    ax=ax_kde,
)
ax_kde.set_title('Density Plot: Weight by Group')
ax_kde.set_xlabel('Weight (grams)')
ax_kde.set_ylabel('Density')
plt.show()

In [None]:
# Validation of Statistical Assumptions
# Run the Shapiro-Wilk test on the weights of each group and report the
# W statistic and p-value per group.
print("\nShapiro-Wilk Test for Normality:")
for group in df['group'].unique():
    group_weights = df.loc[df['group'] == group, 'weight']
    stat, p_value = shapiro(group_weights)
    print(f"Group {group} - W-statistic: {stat:.4f}, P-value: {p_value:.4f}")

In [None]:
# Visualization: Q-Q plots for normality
# One panel per group, laid out in a single row. The grid is sized from the
# number of groups actually present, so there is no empty panel and the cell
# keeps working if the data contains more (or fewer) groups.
qq_groups = df['group'].unique()
n_panels = max(len(qq_groups), 1)  # subplots() needs at least one column
fig, axes = plt.subplots(1, n_panels, figsize=(4 * n_panels, 4), squeeze=False)

for ax, group in zip(axes.ravel(), qq_groups):
    # probplot draws the sample quantiles and fitted line onto the given Axes.
    probplot(df.loc[df['group'] == group, 'weight'], dist="norm", plot=ax)
    # Set after probplot so the per-group title is the one left on the panel.
    ax.set_title(f'Q-Q Plot: {group}')

fig.tight_layout()
plt.show()

In [None]:
# Check homogeneity of variances
# Collect the weight samples for the three groups in a fixed order, then hand
# them to Levene's test (called with its default settings).
levene_samples = [
    df.loc[df['group'] == group_name, 'weight']
    for group_name in ('ctrl', 'trt1', 'trt2')
]
levene_stat, levene_p = levene(*levene_samples)
print(f"\nLevene's Test for Homogeneity of Variances:\nStatistic: {levene_stat:.4f}, P-value: {levene_p:.4f}")

In [None]:
# 2. T-Test Analysis
# Independent two-sample t-test between the two treatment groups, run with
# equal_var=True (pooled variance). The trt1 / trt2 samples defined here are
# reused by later cells.
is_trt1 = df['group'] == 'trt1'
is_trt2 = df['group'] == 'trt2'
trt1 = df.loc[is_trt1, 'weight']
trt2 = df.loc[is_trt2, 'weight']

t_stat, p_val = ttest_ind(trt1, trt2, equal_var=True)

print("\nT-Test Results:")
print(f"T-statistic: {t_stat:.4f}, P-value: {p_val:.4f}")

# Report the outcome at the 5% significance level.
print(
    "\nSignificant difference found between 'trt1' and 'trt2'."
    if p_val < 0.05
    else "\nNo significant difference found between 'trt1' and 'trt2'."
)

In [None]:
# Visualization: bar chart of group means with error bars.
# The error bars show each group's sample standard deviation (Series.std()).
weight_by_group = df.groupby('group')['weight']
mean_weights = weight_by_group.mean()
std_weights = weight_by_group.std()

fig_bar, ax_bar = plt.subplots(figsize=(8, 6))
ax_bar.bar(
    mean_weights.index,
    mean_weights,
    yerr=std_weights,
    capsize=5,
    color=['blue', 'green', 'orange'],
    alpha=0.7,
)
ax_bar.set_title('Mean Weight with Error Bars')
ax_bar.set_xlabel('Group')
ax_bar.set_ylabel('Weight (grams)')
plt.show()

In [None]:
# 3. ANOVA Analysis
# One-way ANOVA across the control group and both treatment groups.
# trt1 and trt2 are the samples created in the t-test cell; ctrl is built
# here and reused by the Kruskal-Wallis cell.
ctrl = df.loc[df['group'] == 'ctrl', 'weight']

f_stat, p_val_anova = f_oneway(ctrl, trt1, trt2)

print("\nANOVA Results:")
print(f"F-statistic: {f_stat:.4f}, P-value: {p_val_anova:.4f}")

# Report the outcome at the 5% significance level.
print(
    "\nSignificant differences found among the groups."
    if p_val_anova < 0.05
    else "\nNo significant differences found among the groups."
)

In [None]:
# Post-hoc analysis using Tukey's HSD
# Pairwise comparison of group means at alpha = 0.05, printed as a table and
# then drawn with statsmodels' built-in simultaneous-comparison plot.
posthoc = pairwise_tukeyhsd(
    endog=df['weight'],
    groups=df['group'],
    alpha=0.05,
)

print("\nPost-hoc Results:")
print(posthoc)

posthoc.plot_simultaneous()
# Applied to the current axes, i.e. after the plot has been drawn.
plt.title('Tukey HSD: Pairwise Comparisons')
plt.show()

In [None]:
# Advanced Analysis: Non-parametric Test (Kruskal-Wallis)
# Uses the ctrl / trt1 / trt2 samples created in the earlier cells.
kruskal_samples = (ctrl, trt1, trt2)
kruskal_stat, kruskal_p = kruskal(*kruskal_samples)

print("\nKruskal-Wallis Test:")
print(f"Statistic: {kruskal_stat:.4f}, P-value: {kruskal_p:.4f}")

In [None]:
# Visualization: violin plot of weight by group, with quartile lines drawn
# inside each violin (inner='quartile').
fig_violin, ax_violin = plt.subplots(figsize=(8, 6))
sns.violinplot(
    data=df,
    x='group',
    y='weight',
    palette='coolwarm',
    inner='quartile',
    ax=ax_violin,
)
ax_violin.set_title('Violin Plot: Weight by Group')
ax_violin.set_xlabel('Group')
ax_violin.set_ylabel('Weight (grams)')
plt.show()

# References

[1] Fisher, R. A. (1935). The Design of Experiments. Edinburgh: Oliver and Boyd.

[2] Montgomery, D. C. (2019). Design and Analysis of Experiments. Wiley.

[3] Tukey, J. W. (1977). Exploratory Data Analysis. Addison-Wesley.

[4] Sheskin, D. J. (2004). Handbook of Parametric and Nonparametric Statistical Procedures. CRC Press.

[5] Pandas Documentation. https://pandas.pydata.org/docs/

[6] Seaborn Documentation. https://seaborn.pydata.org/

[7] Datacamp. "T-Test in Python." https://www.datacamp.com/

[8] Statistics How To. "One-Way ANOVA." https://www.statisticshowto.com/probability-and-statistics/anova/

[9] Scipy Documentation. https://docs.scipy.org/doc/scipy/

[10] Wikipedia contributors. "Shapiro-Wilk Test." https://en.wikipedia.org/wiki/Shapiro%E2%80%93Wilk_test

[11] Wikipedia contributors. "Levene's Test." https://en.wikipedia.org/wiki/Levene%27s_test

[12] Statistics By Jim. "Understanding T-Tests." https://statisticsbyjim.com/hypothesis-testing/t-tests/

[13] Wikipedia contributors. "Tukey's range test." https://en.wikipedia.org/wiki/Tukey%27s_range_test

[14] Datacamp. "Kruskal-Wallis Test."