In [1]:
import pandas as pd
from sklearn.datasets import load_breast_cancer
from factor_analyzer.factor_analyzer import calculate_bartlett_sphericity, calculate_kmo
import numpy as np

# --- Introduction to Data Suitability Tests for PCA ---
# Before applying Principal Component Analysis (PCA), it's crucial to determine
# if the dataset is suitable for this technique. We use two statistical tests:
#
# 1. Bartlett's Test of Sphericity: This test checks whether the observed
#    variables are correlated with one another at all. The null hypothesis is
#    that the correlation matrix is an identity matrix (i.e., the variables are
#    uncorrelated). For PCA to be useful, we want to REJECT this null hypothesis:
#    a significant p-value (e.g., < 0.05) indicates that the data is likely
#    suitable for PCA.
#    Data Requirements: The variables should be approximately normally distributed.
#
# 2. Kaiser-Meyer-Olkin (KMO) Test: This test measures the proportion of variance
#    among variables that might be common variance. A higher value, closer to 1,
#    indicates that the correlations between pairs of variables can be explained
#    by other variables, making the data suitable for factor analysis or PCA.
#    KMO values between 0.8 and 1 indicate the sampling is adequate, values
#    between 0.6 and 0.8 are acceptable, and values less than 0.6 indicate the
#    sampling is not adequate.
#    Data Requirements: Requires a dataset with more than two variables. The variables
#    should be at a metric level (interval or ratio).

# --- 1. Load a More Suitable Sample Dataset ---
# Source: the Breast Cancer Wisconsin dataset bundled with scikit-learn
# (30 features, which makes it a reasonable candidate for PCA).
# The feature matrix is wrapped in a DataFrame whose columns carry the
# feature names (X); the target labels are kept separately (y).
cancer = load_breast_cancer()
y = cancer.target
X = pd.DataFrame(data=cancer.data, columns=cancer.feature_names)

# Show the banner, the first rows of X, and trailing blank lines in one call;
# sep="\n" puts each argument on its own line.
print("--- Breast Cancer Dataset Head ---", X.head(), "\n", sep="\n")


# --- 2. Perform Bartlett's Test ---
# calculate_bartlett_sphericity yields a chi-square statistic and a p-value.
# The null hypothesis under test is that the inter-correlation matrix is an
# identity matrix.
chi_square_value, p_value = calculate_bartlett_sphericity(X)
print(
    "--- Bartlett's Test of Sphericity ---",
    f"Chi-Square Value: {chi_square_value:.2f}",
    f"P-value: {p_value}",
    sep="\n",
)

# Interpretation of Bartlett's Test: choose the message from the p-value,
# using the 0.05 significance level, and print it.
print(
    (
        "Interpretation: The p-value is significant (p < 0.05). "
        "We reject the null hypothesis, which suggests that the variables "
        "are correlated and the data is suitable for PCA.\n"
    )
    if p_value < 0.05
    else (
        "Interpretation: The p-value is not significant (p >= 0.05). "
        "We fail to reject the null hypothesis, which suggests the variables "
        "are not correlated enough for PCA to be useful.\n"
    )
)


# --- 3. Perform KMO Test ---
# calculate_kmo gives back the per-variable KMO values together with the
# overall KMO value; only the overall value is reported in this section.
kmo_per_variable, kmo_overall = calculate_kmo(X)
print(
    "--- Kaiser-Meyer-Olkin (KMO) Test ---",
    f"Overall KMO for Dataset: {kmo_overall:.2f}",
    sep="\n",
)

# Interpretation of KMO Test: thresholds are checked from highest to lowest
# (>= 0.8, then >= 0.6); anything else falls through to the last message.
print(
    "Interpretation: The overall KMO score is excellent, indicating the data is highly suitable for PCA."
    if kmo_overall >= 0.8
    else "Interpretation: The overall KMO score is acceptable. The data is suitable for PCA."
    if kmo_overall >= 0.6
    else "Interpretation: The KMO score is below the acceptable limit. The data may not be suitable for PCA."
)


--- Breast Cancer Dataset Head ---
   mean radius  mean texture  mean perimeter  mean area  mean smoothness  \
0        17.99         10.38          122.80     1001.0          0.11840   
1        20.57         17.77          132.90     1326.0          0.08474   
2        19.69         21.25          130.00     1203.0          0.10960   
3        11.42         20.38           77.58      386.1          0.14250   
4        20.29         14.34          135.10     1297.0          0.10030   

   mean compactness  mean concavity  mean concave points  mean symmetry  \
0           0.27760          0.3001              0.14710         0.2419   
1           0.07864          0.0869              0.07017         0.1812   
2           0.15990          0.1974              0.12790         0.2069   
3           0.28390          0.2414              0.10520         0.2597   
4           0.13280          0.1980              0.10430         0.1809   

   mean fractal dimension  ...  worst radius  worst textu

