In [None]:
## KMO Test Requirements

# Variables: Requires 3+ related columns.

# Scale: Variables must be numeric (interval/ratio), not categorical.

# Sample Size: Needs enough rows (observations) for a stable correlation matrix.
#     Guideline: Aim for 5-10 rows per variable & a minimum of 100+ total rows.

# Outliers: Avoid significant outliers, as they can distort the KMO score.

# Factorability: Variables should be conceptually related and expected to share underlying factors.

In [10]:
import pandas as pd
import numpy as np
from sklearn.datasets import load_iris
from factor_analyzer.factor_analyzer import calculate_kmo

In [26]:
iris = load_iris()
X = iris.data
feature_names = iris.feature_names

In [27]:
df = pd.DataFrame(X, columns=feature_names)

In [28]:
print("Iris Dataset Features:")
print(df.head())
print("-" * 30)

Iris Dataset Features:
   sepal length (cm)  sepal width (cm)  petal length (cm)  petal width (cm)
0                5.1               3.5                1.4               0.2
1                4.9               3.0                1.4               0.2
2                4.7               3.2                1.3               0.2
3                4.6               3.1                1.5               0.2
4                5.0               3.6                1.4               0.2
------------------------------


In [29]:
kmo_all, kmo_model = calculate_kmo(df)

In [30]:
print(f"Overall KMO for the dataset is: {kmo_model:.2f}")
print("-" * 30)

Overall KMO for the dataset is: 0.54
------------------------------


In [31]:
if kmo_model < 0.6:
    print("KMO value is low (< 0.6). The data may not be suitable for factor analysis.")
else:
    print("KMO value is adequate (> 0.6). The data is suitable for factor analysis.")

print("\nKMO for each variable:")
# The kmo_all object is a series with the KMO value for each variable
print(kmo_all)

KMO value is low (< 0.6). The data may not be suitable for factor analysis.

KMO for each variable:
[0.58406029 0.26957462 0.53074842 0.63420655]


In [None]:
# In this case, the KMO value is 0.536, indicating that the data is fairly suitable for PCA. 
# However, this value is somewhat low and could be considered sub-optimal, as typically KMO values above 0.6 are considered supportive of PCA.

In [32]:
df = pd.read_csv(r"C:\Users\basde\Downloads\playground-series-s3e24\train.csv")

In [34]:
features = df.drop(columns=['id', 'smoking'])

In [35]:
kmo_all, kmo_model = calculate_kmo(df)

In [36]:
print(f"Overall KMO for the dataset is: {kmo_model:.2f}")
print("-" * 30)

Overall KMO for the dataset is: 0.64
------------------------------


In [37]:
if kmo_model < 0.6:
    print("KMO value is low (< 0.6). The data may not be suitable for factor analysis.")
else:
    print("KMO value is adequate (> 0.6). The data is suitable for factor analysis.")

print("\nKMO for each variable:")
# The kmo_all object is a series with the KMO value for each variable
print(kmo_all)

KMO value is adequate (> 0.6). The data is suitable for factor analysis.

KMO for each variable:
[0.53451498 0.73207032 0.80756381 0.73397875 0.72377288 0.7040899
 0.71187704 0.57178088 0.57196206 0.62695974 0.64573053 0.84442995
 0.30703716 0.461931   0.4422162  0.3004782  0.91514953 0.76449083
 0.89712203 0.62364235 0.7505038  0.89748218 0.83494919 0.89583762]


In [39]:
print("\nKMO for each variable:")
# Loop through the KMO values and feature names to print them together
for i, feature in enumerate(features):
    print(f"{feature}: {kmo_all[i]:.2f}")


KMO for each variable:
age: 0.53
height(cm): 0.73
weight(kg): 0.81
waist(cm): 0.73
eyesight(left): 0.72
eyesight(right): 0.70
hearing(left): 0.71
hearing(right): 0.57
systolic: 0.57
relaxation: 0.63
fasting blood sugar: 0.65
Cholesterol: 0.84
triglyceride: 0.31
HDL: 0.46
LDL: 0.44
hemoglobin: 0.30
Urine protein: 0.92
serum creatinine: 0.76
AST: 0.90
ALT: 0.62
Gtp: 0.75
dental caries: 0.90


In [41]:
corr_matrix = df.corr()

In [42]:
print("Pairs of variables with correlation > 0.5 or < -0.5:\n")

Pairs of variables with correlation > 0.5 or < -0.5:



In [43]:
printed_pairs = set()

In [44]:
# i can see that i have a lot of correlated columns here. i can proceed with parallel analysis to find the amount of features i need.
for i in range(len(corr_matrix.columns)):
    for j in range(i): # This avoids comparing a variable with itself and avoids duplicates
        # Get the correlation value
        correlation = corr_matrix.iloc[i, j]

        # Check if the absolute correlation is greater than 0.5
        if abs(correlation) > 0.5:
            var1 = corr_matrix.columns[i]
            var2 = corr_matrix.columns[j]
            print(f"- {var1} and {var2}: {correlation:.2f}")

- weight(kg) and height(cm): 0.69
- waist(cm) and weight(kg): 0.83
- hearing(right) and hearing(left): 0.55
- relaxation and systolic: 0.75
- HDL and triglyceride: -0.51
- LDL and Cholesterol: 0.81
- hemoglobin and height(cm): 0.57
- hemoglobin and weight(kg): 0.53
- ALT and AST: 0.62
