In [1]:
# Import necessary libraries
import pandas as pd
from sklearn.datasets import load_iris
import scipy.stats as stats

# --- Introduction to ANOVA F-test ---
# The Analysis of Variance (ANOVA) F-test is a statistical test used to determine
# whether there are any statistically significant differences between the means of two or more
# independent groups.

# When to use it:
# - You have one categorical independent variable (with two or more levels/groups) and
#   one continuous dependent variable.
# - You want to test if the means of the dependent variable are equal across the groups.

# Key Assumptions:
# 1. Independence: The observations in each group are independent of each other.
# 2. Normality: The dependent variable is approximately normally distributed for each group.
# 3. Homogeneity of Variances (Homoscedasticity): The variance of the dependent variable
#    is equal across all groups.

# --- 1. Load a Relevant Dataset ---
# The Iris dataset is a classic choice for this test. It contains measurements for
# three species of iris flowers (the groups) and four continuous features.
# We want to see if the means of these features differ significantly across the species.
# Fetch the Iris dataset object; its .data, .feature_names, .target and
# .target_names attributes are used below.
iris = load_iris()

# Build the working DataFrame in a single expression: the measurements become
# columns named after iris.feature_names, and the target codes are mapped to
# their names as a categorical 'species' column (the grouping variable).
iris_df = pd.DataFrame(iris.data, columns=iris.feature_names).assign(
    species=pd.Categorical.from_codes(iris.target, iris.target_names)
)

# Preview the first rows, then emit a blank-line separator.
print("--- Head of the Iris Dataset ---")
print(iris_df.head())
print("\n")


# --- 2. Select Features and Prepare Data for the Test ---
# For the one-way ANOVA, we need to separate the measurements (our dependent variable)
# for each group (species). We will test each of the four features.

# Let's select the 'petal length (cm)' feature as an example to see the structure.
# The groups are the three species: 'setosa', 'versicolor', and 'virginica'.
# Distinct species labels present in the 'species' column.
species_groups = iris_df['species'].unique()

# Map each species label to that species' petal-length measurements, picking
# the matching rows and the column in one .loc lookup.
grouped_data = {
    label: iris_df.loc[iris_df['species'] == label, 'petal length (cm)']
    for label in species_groups
}

# The f_oneway function from scipy.stats takes each group's data as a separate argument.
# For example: stats.f_oneway(group1_data, group2_data, group3_data)
# Let's see what the data for each group looks like:
# print("Petal length for Setosa:", grouped_data['setosa'].head().values)
# print("Petal length for Versicolor:", grouped_data['versicolor'].head().values)
# print("Petal length for Virginica:", grouped_data['virginica'].head().values)


# --- 3. Apply the ANOVA F-test to the Selected Data ---
# We will now loop through each of the four features and perform the ANOVA test
# to see if its mean significantly differs across the three species.

# The null hypothesis (H0) for each test is that the means of the feature
# are the same across all three species.
# The alternative hypothesis (H1) is that at least one species has a different mean.

# We'll store the results in a list
# The row mask for a species does not depend on the feature being tested, so
# build the masks once here instead of recomputing them for every feature.
species_masks = [iris_df['species'] == s for s in species_groups]

# One record (feature name, F-statistic, p-value) is appended per feature.
anova_results = []

# Iterate over each feature column
for feature in iris.feature_names:
    # One sample per species: that species' values for the current feature,
    # selected with a single .loc lookup rather than chained indexing.
    samples = [iris_df.loc[mask, feature] for mask in species_masks]

    # Perform the one-way ANOVA across the per-species samples
    f_statistic, p_value = stats.f_oneway(*samples)

    anova_results.append({
        'Feature': feature,
        'F-Statistic': f_statistic,
        'P-Value': p_value
    })

# --- 4. Show the Result of Applying the Technique ---
# We will display the results in a new DataFrame.
# A small p-value (typically < 0.05) suggests that we can reject the null hypothesis.
# This means there is strong evidence that the feature's mean is not the same for all species.
# A large F-statistic means the variation between group means is large relative to the variation within groups.

# Collect the per-feature records into a table with one row per feature.
results_df = pd.DataFrame(anova_results)

print("--- ANOVA F-test Results for Each Feature ---")
# Print the whole table (no head() needed): it has only one row per feature.
print(results_df)

print("\n--- Interpretation ---")
# NOTE: a p-value is computed under the assumption that the null hypothesis is
# true. It is NOT the probability that the group means differ, so the message
# below is worded in terms of evidence against the null hypothesis.
print("A low P-Value (e.g., < 0.05) means that differences in group means this large would be unlikely if the true means were equal, so we reject the null hypothesis for that feature.")
print("Based on the results, all features show a very low P-Value, suggesting they are all significant for distinguishing between iris species.")



--- Head of the Iris Dataset ---
   sepal length (cm)  sepal width (cm)  petal length (cm)  petal width (cm)  \
0                5.1               3.5                1.4               0.2   
1                4.9               3.0                1.4               0.2   
2                4.7               3.2                1.3               0.2   
3                4.6               3.1                1.5               0.2   
4                5.0               3.6                1.4               0.2   

  species  
0  setosa  
1  setosa  
2  setosa  
3  setosa  
4  setosa  


--- ANOVA F-test Results for Each Feature ---
             Feature  F-Statistic       P-Value
0  sepal length (cm)   119.264502  1.669669e-31
1   sepal width (cm)    49.160040  4.492017e-17
2  petal length (cm)  1180.161182  2.856777e-91
3   petal width (cm)   960.007147  4.169446e-85

--- Interpretation ---
A low P-Value (e.g., < 0.05) indicates a high probability that the means for that feature are different acro