In [None]:
import pyforest

# NOTE(review): `pd` is never imported explicitly in this cell; presumably it is supplied by pyforest's lazy imports — confirm.
from IPython.core.interactiveshell import InteractiveShell  # Needed to change how cell results are echoed
InteractiveShell.ast_node_interactivity = "all"  # Echo the value of every expression in a cell, not only the last one
pd.set_option('display.max_columns', None)  # No limit on the number of columns shown for a DataFrame
pd.set_option('display.max_rows', 100)  # Show up to 100 rows before the display is truncated
import warnings
warnings.filterwarnings('ignore')  # Silence all warnings for the rest of the notebook


# ▶  Data Import

In [None]:
# Load the raw dataset.
# The path is a raw string so the Windows backslashes are taken literally:
# sequences such as "\G", "\W", "\D" and "\m" are invalid escapes in a normal
# string literal and trigger a warning that newer Python versions plan to turn into an error.
df = pd.read_csv(r'D:\GitHub Repos\WGU_MSDA\D212_Data Mining\medical_clean.csv')

In [None]:
# Preview the first rows of the imported data
df.head()

In [None]:
# Summarize the columns of the imported data (names, dtypes, non-null counts)
df.info()

In [None]:
# Keep every column whose dtype is not `object` (drops the text columns).
# NOTE(review): this keeps any non-object dtype, not strictly numeric ones —
# presumably all remaining columns are numeric here; confirm with df1.info().
df1 = df.select_dtypes(exclude=['object'])

In [None]:
# Preview the non-object subset
df1.head()

In [None]:
# Dropping unrelated columns
# The result is rebuilt from `df` (repeating the non-object selection) instead of
# dropping in place, so this cell is idempotent: re-running it no longer raises
# KeyError for columns that were already removed from df1.
unrelated_columns = ['Population', 'CaseOrder', 'Zip', 'Lat', 'Lng',
                     'Item1', 'Item2', 'Item3', 'Item4',
                     'Item5', 'Item6', 'Item7', 'Item8']
df1 = df.select_dtypes(exclude=['object']).drop(columns=unrelated_columns)

In [None]:
# Preview df1 after the unrelated columns were dropped
df1.head()

In [None]:
# Check which columns remain in df1 and their dtypes
df1.info()

In [None]:
# Keep df1's column labels; they are reused to label the scaled data,
# because the scaler output is later wrapped back into a DataFrame
df1_columns = df1.columns

In [None]:
# Display the saved column labels
df1_columns

# ▶  Data Preparation

### Standardization

In [None]:
# Preview df1 before scaling (for comparison with the scaled values shown later)
df1.head()

In [None]:
# Standardize df1 with StandardScaler and wrap the result back into a
# DataFrame that carries the original column labels
scaler = StandardScaler()
df1_scaled = pd.DataFrame(scaler.fit_transform(df1), columns=df1_columns)

# Save the scaled dataframe 'df1_scaled' as CSV text (row index omitted)
df1_scaled.to_csv('Medical Clean-Task2', index=False)

In [None]:
# Preview the data after scaling
df1_scaled.head()

# ▶  Principal Component Analysis (PCA)

In [None]:
# Performing PCA on the scaled data; n_components is left at its default here
pca = PCA()  # You can change the number of components as needed
principal_components = pca.fit_transform(df1_scaled)

# Label the component columns PC1..PCk from the number of components actually
# returned, so this cell keeps working if the input columns or n_components change
# (a fixed list of ten names raises ValueError on any other width)
pc_labels = [f'PC{i+1}' for i in range(principal_components.shape[1])]

# Creating a DataFrame with the principal components
df1_pca = pd.DataFrame(data=principal_components, columns=pc_labels)

# Displaying the first few rows of the PCA result
df1_pca.head()

### Covariance Matrix  
The covariance matrix describes the pairwise linear relationships between the original (scaled) variables.

In [None]:
# Row/column labels for the matrix: the feature names of the scaled dataframe
feature_names = df1_scaled.columns

# Covariance matrix as reported by the fitted PCA object
covariance_matrix = pca.get_covariance()

# Labelled view of the matrix, feature by feature
cov_df = pd.DataFrame(data=covariance_matrix, columns=feature_names, index=feature_names)

# Show the top rows only (head() does not print the whole matrix)
cov_df.head()

### Loading Matrix  
The loading matrix contains the eigenvectors, which represent how much each original variable contributes to each principal component.

In [None]:
# One row per principal component, one column per original feature
loading_matrix = pd.DataFrame(
    data=pca.components_,
    index=df1_pca.columns,
    columns=df1_scaled.columns,
)
# Show up to 20 rows
loading_matrix.head(20)

### Variance of each Principal Component

In [None]:
# Explained variance (eigenvalue) of every principal component
explained_variance = pca.explained_variance_

# Print one component per line, numbered from 1, to four decimals
for component_number, eigenvalue in enumerate(explained_variance, start=1):
    print(f"PC{component_number}: {eigenvalue:.4f}")


### Total Number of Principal Components - Kaiser Criterion

In [None]:
# Kaiser criterion: count the components whose eigenvalue is greater than 1
eigenvalues = pca.explained_variance_
above_one = eigenvalues > 1
num_components = sum(above_one)
print("Number of components (Kaiser Criterion):", num_components)

In [None]:
# Table with each component's label, eigenvalue and explained-variance ratio in percent
pc_count = len(pca.explained_variance_)
components_df = pd.DataFrame({
    'Principal Component': [f'PC{k}' for k in range(1, pc_count + 1)],
    'Eigenvalue': pca.explained_variance_,
    'Explained Variance Ratio (%)': pca.explained_variance_ratio_ * 100
})

# Keep only the rows that satisfy the Kaiser criterion (eigenvalue above 1)
passes_kaiser = components_df['Eigenvalue'].gt(1)
retained_components_df = components_df[passes_kaiser]

print("Retained Components (Kaiser Criterion):")
print(retained_components_df)


In [None]:
# Scree plot: eigenvalue of each component, with the Kaiser cutoff drawn at 1
component_numbers = range(1, len(pca.explained_variance_) + 1)

fig, ax = plt.subplots(figsize=(8, 5))
ax.plot(component_numbers, pca.explained_variance_, marker='o', linestyle='--')
# Horizontal reference line marking the eigenvalue = 1 threshold
ax.axhline(y=1, color='r', linestyle='--', label='Kaiser Criterion (Eigenvalue = 1)')
ax.set_xlabel('Principal Component')
ax.set_ylabel('Eigenvalue')
ax.set_title('Scree Plot with Kaiser Criterion')
ax.legend()
ax.grid()
plt.show();


### Total Variance

In [None]:
# Number of components whose eigenvalue exceeds 1 (Kaiser criterion)
num_components_kaiser = sum(pca.explained_variance_ > 1)

# Leading slices of the eigenvalues and variance ratios, one entry per retained component
retained_eigenvalues = pca.explained_variance_[:num_components_kaiser]
retained_ratios = pca.explained_variance_ratio_[:num_components_kaiser]

# Variance captured by the retained components: absolute value and percentage
total_variance_kaiser_numeric = sum(retained_eigenvalues)
total_variance_kaiser_percentage = sum(retained_ratios) * 100

print(f"Number of components retained (Kaiser Criterion): {num_components_kaiser}")
print(f"Total variance captured (numerical): {total_variance_kaiser_numeric:.4f}")
print(f"Total variance captured (percentage): {total_variance_kaiser_percentage:.2f}%")
