In [1]:
# Import necessary libraries
import pandas as pd
from sklearn.datasets import load_diabetes

# The factor_analyzer library is required for these tests.
# You can install it via pip: pip install factor-analyzer
from factor_analyzer.factor_analyzer import calculate_bartlett_sphericity, calculate_kmo

# --- 1. Load and Prepare the Data ---
# Fetch scikit-learn's bundled diabetes dataset and wrap its feature matrix in
# a DataFrame whose columns carry the dataset's own feature names.
diabetes = load_diabetes()
df = pd.DataFrame(diabetes.data, columns=diabetes.feature_names)

# Preview the first rows as a sanity check, then print a divider so the
# output of the tests that follow is visually separated.
print("--- Sample Data Head ---", df.head(), sep="\n")
print(f"\n{'=' * 50}\n")


# --- 2. Bartlett's Test of Sphericity ---
# Null hypothesis: the correlation matrix is an identity matrix (variables are uncorrelated).
# A significant result (p-value < 0.05) is desired before attempting Factor Analysis.
chi_square_value, p_value = calculate_bartlett_sphericity(df)

print("--- Bartlett's Test of Sphericity ---")
print(f"Chi-Square Value: {chi_square_value:.3f}")
# Fixed three-decimal formatting renders every very small p-value as "0.000",
# which reads as an exact zero. Report such values as "< 0.001" instead.
p_value_text = "< 0.001" if p_value < 0.001 else f"{p_value:.3f}"
print(f"P-value: {p_value_text}")

# Interpretation of the result
if p_value < 0.05:
    print("\nResult: The p-value is significant (p < 0.05). We can reject the null hypothesis.")
    print("This indicates that the variables are correlated and the data is likely suitable for Factor Analysis.")
else:
    print("\nResult: The p-value is not significant (p >= 0.05).")
    print("The variables may be uncorrelated, making the data unsuitable for Factor Analysis.")

print("\n" + "="*50 + "\n")


# --- 3. Kaiser-Meyer-Olkin (KMO) Test ---
# KMO gauges how much of the variance among the variables might be common
# variance. Scores nearer 1 are better; 0.6 is used here as the cut-off for
# "acceptable".
# calculate_kmo returns the per-variable scores first and the overall score second.
kmo_all, kmo_model = calculate_kmo(df)

print("--- Kaiser-Meyer-Olkin (KMO) Test ---")
print(f"Overall KMO Score: {kmo_model:.3f}")

# Pick the two-line verdict that matches the overall score, then print it.
kmo_is_adequate = kmo_model >= 0.6
kmo_verdict_lines = (
    (
        "\nResult: The overall KMO score is adequate (>= 0.6).",
        "The data has sufficient sampling adequacy for Factor Analysis.",
    )
    if kmo_is_adequate
    else (
        "\nResult: The overall KMO score is less than 0.6.",
        "The data may not be suitable for Factor Analysis due to low sampling adequacy.",
    )
)
for verdict_line in kmo_verdict_lines:
    print(verdict_line)

# Per-variable KMO scores, labelled with the DataFrame's column names.
kmo_per_variable = pd.Series(data=kmo_all, index=df.columns)
print("\nKMO Per Variable:", kmo_per_variable, sep="\n")


--- Sample Data Head ---
        age       sex       bmi        bp        s1        s2        s3  \
0  0.038076  0.050680  0.061696  0.021872 -0.044223 -0.034821 -0.043401   
1 -0.001882 -0.044642 -0.051474 -0.026328 -0.008449 -0.019163  0.074412   
2  0.085299  0.050680  0.044451 -0.005670 -0.045599 -0.034194 -0.032356   
3 -0.089063 -0.044642 -0.011595 -0.036656  0.012191  0.024991 -0.036038   
4  0.005383 -0.044642 -0.036385  0.021872  0.003935  0.015596  0.008142   

         s4        s5        s6  
0 -0.002592  0.019907 -0.017646  
1 -0.039493 -0.068332 -0.092204  
2 -0.002592  0.002861 -0.025930  
3  0.034309  0.022688 -0.009362  
4 -0.002592 -0.031988 -0.046641  


--- Bartlett's Test of Sphericity ---
Chi-Square Value: 3385.309
P-value: 0.000

Result: The p-value is significant (p < 0.05). We can reject the null hypothesis.
This indicates that the variables are correlated and the data is likely suitable for Factor Analysis.


--- Kaiser-Meyer-Olkin (KMO) Test ---
Overall KMO S

