In [1]:
import pandas as pd
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler, MinMaxScaler
import numpy as np

In [2]:
# Read the dataset from the CSV file in the current working directory.
df = pd.read_csv(filepath_or_buffer="cleaned_augmented.csv")

# Show the first five rows as the cell's rich output.
df.head(n=5)

Unnamed: 0,HighBP,HighChol,CholCheck,BMI,Smoker,Stroke,HeartDiseaseorAttack,PhysActivity,Fruits,Veggies,...,number_diagnoses,race_AfricanAmerican,race_Asian,race_Caucasian,race_Hispanic,race_Other,metformin_Down,readmitted_<30,readmitted_>30,Diabetes_binary
0,1,1,1,40,1,0,0,0,0,1,...,9,0,0,1,0,0,0,0,1,0
1,0,0,0,25,1,0,0,1,0,0,...,4,0,0,1,0,0,0,0,0,0
2,1,1,1,28,0,0,0,0,1,0,...,4,0,0,1,0,0,0,0,0,0
3,1,0,1,27,0,0,0,1,1,1,...,9,0,0,1,0,0,0,0,0,0
4,1,1,1,24,0,0,0,1,1,1,...,4,0,0,0,0,1,0,0,0,0


In [3]:
# Display the column labels of the loaded frame.
df.columns

Index(['HighBP', 'HighChol', 'CholCheck', 'BMI', 'Smoker', 'Stroke',
       'HeartDiseaseorAttack', 'PhysActivity', 'Fruits', 'Veggies',
       'HvyAlcoholConsump', 'AnyHealthcare', 'NoDocbcCost', 'GenHlth',
       'MentHlth', 'PhysHlth', 'DiffWalk', 'Sex', 'Education', 'Income',
       'age_[20-30)', 'age_[30-40)', 'age_[40-50)', 'age_[50-60)',
       'age_[60-70)', 'age_[70-80)', 'age_[80-90)', 'time_in_hospital',
       'num_lab_procedures', 'num_procedures', 'num_medications',
       'number_outpatient', 'number_emergency', 'number_inpatient',
       'number_diagnoses', 'race_AfricanAmerican', 'race_Asian',
       'race_Caucasian', 'race_Hispanic', 'race_Other', 'metformin_Down',
       'readmitted_<30', 'readmitted_>30', 'Diabetes_binary'],
      dtype='object')

In [4]:
# Display the (rows, columns) dimensions of the loaded frame.
df.shape

(253680, 44)

In [5]:
# Drop the 'metformin_Down' column. Because this cell rebinds `df` to its own
# result, a second run would find the column already gone; errors='ignore'
# makes the drop a no-op in that case instead of raising KeyError, so the cell
# is safe to re-run.
df = df.drop(columns=['metformin_Down'], errors='ignore')

# Display the remaining column labels to confirm the drop.
df.columns

Index(['HighBP', 'HighChol', 'CholCheck', 'BMI', 'Smoker', 'Stroke',
       'HeartDiseaseorAttack', 'PhysActivity', 'Fruits', 'Veggies',
       'HvyAlcoholConsump', 'AnyHealthcare', 'NoDocbcCost', 'GenHlth',
       'MentHlth', 'PhysHlth', 'DiffWalk', 'Sex', 'Education', 'Income',
       'age_[20-30)', 'age_[30-40)', 'age_[40-50)', 'age_[50-60)',
       'age_[60-70)', 'age_[70-80)', 'age_[80-90)', 'time_in_hospital',
       'num_lab_procedures', 'num_procedures', 'num_medications',
       'number_outpatient', 'number_emergency', 'number_inpatient',
       'number_diagnoses', 'race_AfricanAmerican', 'race_Asian',
       'race_Caucasian', 'race_Hispanic', 'race_Other', 'readmitted_<30',
       'readmitted_>30', 'Diabetes_binary'],
      dtype='object')

# PCA

## Scaled

In [6]:
# Z-score version of the frame: StandardScaler is fitted on `df` and applied
# to it in one step; the result is a NumPy array, not a DataFrame.
df_sd_normalized = StandardScaler().fit_transform(df)

# Min-max version of the frame (each column rescaled to the 0..1 range).
# `scaler` stays bound to this fitted MinMaxScaler after the cell runs.
scaler = MinMaxScaler()
df_mm_scaled = scaler.fit_transform(df)

# NOTE(review): neither df_sd_normalized nor df_mm_scaled is referenced again
# in the cells visible here -- confirm whether they are still needed.

In [7]:
# Rescale every column of `df` with MinMaxScaler and keep the column labels,
# so PCA loadings can be reported by feature name.
scaler = MinMaxScaler()
df_scaled = pd.DataFrame(scaler.fit_transform(df), columns=df.columns)

# For each requested number of components, fit PCA on the scaled frame and
# report, per component, the five features with the largest absolute loading.
# The same report goes to the text file (one comma-separated line per
# component) and to the cell output (one feature per line).
# The `with` block closes the file on exit, so no explicit close() is needed.
with open('PCA_Breakdown_Scaled.txt', 'w') as file:
    for n in [3, 4, 5, 6, 7, 8, 9, 10]:
        # A fresh model is fitted for every n; random_state pins the solver's
        # randomness so repeated runs give the same numbers.
        pca = PCA(n_components=n, random_state=69)
        pca.fit(df_scaled)

        # Section header for this value of n.
        print("=============================")
        print(f"Top {n} Principal Components:")
        file.write("=============================\n")
        file.write(f"Top {n} Principal Components:\n")

        for i in range(n):
            loadings = pca.components_[i]

            # Feature indices ordered by descending absolute loading.
            sorted_indices = np.argsort(np.abs(loadings))[::-1]

            # Build the "feature: signed loading" labels once and reuse them
            # for both destinations. Labels come from the frame PCA was
            # actually fitted on.
            top_features = [
                f"{df_scaled.columns[j]}: {loadings[j]}"
                for j in sorted_indices[:5]
            ]

            file.write(f"Principal Component {i+1}:\n")
            file.write(", ".join(top_features) + "\n")
            file.write("\n")

            print(f"Principal Component {i+1}:")
            print(",\n".join(top_features))
            print()

Top 3 Principal Components:
Principal Component 1:
HighBP: 0.48857082501568516,
HighChol: 0.4174416662689612,
DiffWalk: 0.2888728084499495,
Smoker: 0.277415725271168,
PhysActivity: -0.24876166248903422

Principal Component 2:
race_Caucasian: -0.6977904693586892,
race_AfricanAmerican: 0.6456846233830067,
Sex: -0.15563669328490548,
PhysActivity: -0.10928993016477076,
DiffWalk: 0.10056040390211256

Principal Component 3:
Fruits: -0.6147703168290511,
Sex: 0.47650783275512426,
Smoker: 0.30574259683294697,
Veggies: -0.2780717466917683,
HighChol: -0.23654398192710027

Top 4 Principal Components:
Principal Component 1:
HighBP: 0.48856871670266416,
HighChol: 0.41743897832979,
DiffWalk: 0.2888712878481072,
Smoker: 0.27741650274494867,
PhysActivity: -0.2487714196335959

Principal Component 2:
race_Caucasian: -0.6977887906297127,
race_AfricanAmerican: 0.6456825701133937,
Sex: -0.1556380974724956,
PhysActivity: -0.10925089410480174,
DiffWalk: 0.10057248576273338

Principal Component 3:
Fruits: -0.6

## Not scaled

In [8]:
# Same PCA breakdown as for the scaled data, but fitted directly on the raw
# (unscaled) `df`. For each requested number of components, report per
# component the five features with the largest absolute loading, both to the
# text file (one comma-separated line per component) and to the cell output
# (one feature per line).
# The `with` block closes the file on exit, so no explicit close() is needed.
with open('PCA_Breakdown_Not_Scaled.txt', 'w') as file:
    for n in [3, 4, 5, 6, 7, 8, 9, 10]:
        # A fresh model is fitted for every n; random_state pins the solver's
        # randomness so repeated runs give the same numbers.
        pca = PCA(n_components=n, random_state=69)
        pca.fit(df)

        # Section header for this value of n.
        print("=============================")
        print(f"Top {n} Principal Components:")
        file.write("=============================\n")
        file.write(f"Top {n} Principal Components:\n")

        for i in range(n):
            loadings = pca.components_[i]

            # Feature indices ordered by descending absolute loading.
            sorted_indices = np.argsort(np.abs(loadings))[::-1]

            # Build the "feature: signed loading" labels once and reuse them
            # for both destinations.
            top_features = [
                f"{df.columns[j]}: {loadings[j]}"
                for j in sorted_indices[:5]
            ]

            file.write(f"Principal Component {i+1}:\n")
            file.write(", ".join(top_features) + "\n")
            file.write("\n")

            print(f"Principal Component {i+1}:")
            print(",\n".join(top_features))
            print()

Top 3 Principal Components:
Principal Component 1:
num_lab_procedures: 0.9923696098787408,
num_medications: 0.11098506612320637,
time_in_hospital: 0.04983944331996269,
number_diagnoses: 0.017439161809695247,
number_inpatient: 0.0060385325235484365

Principal Component 2:
PhysHlth: 0.8245968800115989,
MentHlth: 0.5246915276419375,
BMI: 0.16883848114841585,
num_medications: 0.07926385863073222,
Income: -0.06768405878855088

Principal Component 3:
num_medications: 0.958492565393266,
time_in_hospital: 0.16560082660518544,
num_lab_procedures: -0.11720208229700427,
BMI: 0.11607634467191064,
MentHlth: -0.11201444199861704

Top 4 Principal Components:
Principal Component 1:
num_lab_procedures: 0.9923696098787393,
num_medications: 0.110985066123206,
time_in_hospital: 0.04983944331996254,
number_diagnoses: 0.01743916180969518,
number_inpatient: 0.00603853252354842

Principal Component 2:
PhysHlth: 0.8245968800115979,
MentHlth: 0.5246915276419389,
BMI: 0.16883848114841507,
num_medications: 0.0792

In [9]:
# Write the current `df` to a new CSV file; index=False leaves the row index
# out of the file.
df.to_csv(path_or_buf="cleaned_augmented_2.csv", index=False)