In [1]:
import pandas as pd
import numpy as np
from sklearn.decomposition import PCA

### PCA on the original (raw) dataset

In [2]:
from ucimlrepo import fetch_ucirepo

# Download the CDC Diabetes Health Indicators dataset (UCI repository id 891).
cdc_diabetes_health_indicators = fetch_ucirepo(id=891)

# Split the payload into the feature table and the target table
# (both are pandas dataframes).
X, y = (
    cdc_diabetes_health_indicators.data.features,
    cdc_diabetes_health_indicators.data.targets,
)

# Print the dataset-level metadata, then the per-variable information.
print(cdc_diabetes_health_indicators.metadata)
print(cdc_diabetes_health_indicators.variables)

{'uci_id': 891, 'name': 'CDC Diabetes Health Indicators', 'repository_url': 'https://archive.ics.uci.edu/dataset/891/cdc+diabetes+health+indicators', 'data_url': 'https://archive.ics.uci.edu/static/public/891/data.csv', 'abstract': 'The Diabetes Health Indicators Dataset contains healthcare statistics and lifestyle survey information about people in general along with their diagnosis of diabetes. The 35 features consist of some demographics, lab test results, and answers to survey questions for each patient. The target variable for classification is whether a patient has diabetes, is pre-diabetic, or healthy. ', 'area': 'Health and Medicine', 'tasks': ['Classification'], 'characteristics': ['Tabular', 'Multivariate'], 'num_instances': 253680, 'num_features': 21, 'feature_types': ['Categorical', 'Integer'], 'demographics': ['Sex', 'Age', 'Education Level', 'Income'], 'target_col': ['Diabetes_binary'], 'index_col': ['ID'], 'has_missing_values': 'no', 'missing_values_symbol': None, 'year_

In [3]:
# Fit a full PCA (no component limit) on the raw feature table, then build the
# running total of the per-component explained-variance ratios.
pca = PCA().fit(X)
cumsum = pca.explained_variance_ratio_.cumsum()

In [4]:
# Position of the first cumulative ratio that reaches 95%, converted from a
# 0-based index to a component count.
reaches_target = cumsum >= 0.95
d = reaches_target.argmax() + 1
d

4

In [5]:
# Rank the features of each principal component by the magnitude of their
# loading and keep the strongest one. The sign of a component is only a
# convention, so a large negative loading is as influential as a large positive
# one; sorting the signed values would never select it.
top_feature_indices = [
    np.abs(component).argsort()[::-1][:1] for component in pca.components_
]

# X holds only feature columns (the targets were loaded separately into y),
# so every column name is a candidate.
feature_names = list(X.columns)

# One single-element list of feature names per component, in component order.
top_features_names = [
    [feature_names[i] for i in indices] for indices in top_feature_indices
]

In [6]:
# One entry per principal component: the feature selected as its top loading.
top_features_names

[['PhysHlth'],
 ['BMI'],
 ['MentHlth'],
 ['Income'],
 ['GenHlth'],
 ['GenHlth'],
 ['Income'],
 ['Sex'],
 ['HighChol'],
 ['Smoker'],
 ['Sex'],
 ['HighChol'],
 ['Fruits'],
 ['Fruits'],
 ['DiffWalk'],
 ['NoDocbcCost'],
 ['HeartDiseaseorAttack'],
 ['HvyAlcoholConsump'],
 ['DiffWalk'],
 ['Stroke'],
 ['AnyHealthcare']]

### PCA on the preprocessed dataset

In [7]:
# Load the preprocessed dataset; the first CSV column is used as the row index.
df = pd.read_csv('Preprocessed_data.csv', index_col = 0)
# Display the frame as the cell output.
df

Unnamed: 0,HighBP,HighChol,CholCheck,BMI,Smoker,Stroke,HeartDiseaseorAttack,PhysActivity,Fruits,Veggies,...,NoDocbcCost,GenHlth,MentHlth,PhysHlth,DiffWalk,Sex,Age,Education,Income,Diabetes_binary
0,0,0,1,23,0,0,0,1,0,0,...,0,1,0,0,0,0,11,5,7,0
1,1,0,1,19,0,0,0,0,1,1,...,0,3,0,0,0,0,6,6,8,0
2,0,0,1,26,1,0,0,1,1,1,...,0,2,0,0,0,0,1,4,4,0
3,0,1,1,22,0,0,0,1,1,1,...,0,1,0,0,0,1,12,4,2,0
4,0,0,1,22,0,0,0,0,1,1,...,0,1,0,0,0,0,4,6,8,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
253659,0,1,1,37,0,0,0,0,0,1,...,0,4,0,0,0,0,6,4,1,1
253668,0,1,1,29,1,0,1,0,1,1,...,0,2,0,0,1,1,10,3,6,1
253670,1,1,1,25,0,0,1,0,1,0,...,0,5,15,0,1,0,13,6,4,1
253676,1,1,1,18,0,0,0,0,0,0,...,0,4,0,0,1,0,11,2,4,1


In [8]:
# Full PCA on the preprocessed data, leaving out the last column (the
# Diabetes_binary target), followed by the running total of the
# explained-variance ratios.
pca = PCA().fit(df.iloc[:, :-1])
cumsum = pca.explained_variance_ratio_.cumsum()

In [9]:
# Cumulative explained-variance ratio after 1, 2, ... components.
cumsum

array([0.51124556, 0.72313009, 0.92760781, 0.96572274, 0.98469077,
       0.98847921, 0.99152562, 0.99283929, 0.99391991, 0.9949791 ,
       0.99592619, 0.99669141, 0.99741631, 0.99801865, 0.99853339,
       0.99895518, 0.99930735, 0.99952092, 0.99971096, 0.99988446,
       1.        ])

In [10]:
# Number of leading components needed for the cumulative explained variance
# to reach 95% (first qualifying 0-based index, plus one).
meets_threshold = np.greater_equal(cumsum, 0.95)
d = meets_threshold.argmax() + 1
d

4

In [11]:
# Feature matrix for the preprocessed data: every column except the last one.
# NOTE: this rebinds X, which earlier in the notebook held the raw UCI features.
X = df.iloc[:,:-1]

In [12]:
# Keep exactly as many components as the 95% explained-variance search found
# (d) instead of repeating that count as a literal, so the two cannot drift
# apart. random_state pins the result in case scikit-learn's automatic solver
# choice falls on the randomized SVD.
pca = PCA(n_components=int(d), random_state=0)
pca.fit(X)

In [13]:
# Share of the total variance captured by each retained component.
pca.explained_variance_ratio_

array([0.51124556, 0.21188453, 0.20447772, 0.03811492])

In [14]:
# Get the indices of the top features for each component
top_feature_indices = [component.argsort()[-1::-1][:1] for component in pca.components_]

# Get the feature names for the top features of each component
feature_names = list(df.columns[:-1])  # Assuming the last column is the target variable
top_features_names = []

for indices in top_feature_indices:
    top_feature_names = [feature_names[i] for i in indices]
    top_features_names.append(top_feature_names)

In [15]:
# One entry per retained component: the feature selected as its top loading.
top_features_names

[['PhysHlth'], ['BMI'], ['MentHlth'], ['Income']]