In [48]:
import pandas as pd
import numpy as np
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
import pickle
import matplotlib.pyplot as plt

In [49]:
# load cleaned data from analyze_quality-sites.ipynb
water_site_1 = pickle.load(open("water_quality_df_1.pkl", 'rb'))
water_site_2 = pickle.load(open("water_quality_df_2.pkl", 'rb'))
water_site_3 = pickle.load(open("water_quality_df_3.pkl", 'rb'))
water_site_4 = pickle.load(open("water_quality_df_4.pkl", 'rb'))

cols = list(water_site_1.columns)
print(cols)

#For now, remove WQI
water_site_1 = water_site_1.drop("WQI", axis=1)
water_site_2 = water_site_2.drop("WQI", axis=1)
water_site_3 = water_site_3.drop("WQI", axis=1)
water_site_4 = water_site_4.drop("WQI", axis=1)


['Temp (air) (F)', 'Temp (water) (C)', 'Salinity (ppt)', 'pH', 'DO (mg/L)', 'DO (%)', 'Turbidty (NTU)', 'Flow rate (m/s)', 'Discharge (m^3/s)', 'Nitrate (mg/L)', 'Ammonium (mg/L)', 'Phosphate (mg/L)', 'BOD (mg/L)', 'TSS (mg/L)', 'WQI']


In [50]:
def perform_pca(data, threshold=0.9, top_columns=3):
    feature_names = data.columns
    
    # Step 1: Scale the variables to have zero mean
    x1 = StandardScaler().fit_transform(data)
    
    # Initialize PCA with the input feature names
    pca = PCA(n_components=None)
    pca.fit(x1)
    
    # Get the principal components
    principal_components = pca.transform(x1)
    
    # Create a DataFrame with the principal components and their names
    PCDF = pd.DataFrame(data=principal_components, columns=[f'PC{i+1}' for i in range(principal_components.shape[1])])
    
    # Get the explained variance ratio for each principal component
    explained_variance_ratio = pca.explained_variance_ratio_
    
    # Determine the key principal components based on a threshold
    cumulative_variance_ratio = np.cumsum(explained_variance_ratio)
    key_components = np.where(cumulative_variance_ratio >= threshold)[0] + 1
    
    # Get the component loadings (eigenvectors) for the key principal components
    key_component_loadings = pca.components_[key_components - 1]
    
    # Create a dictionary to store the connection between key principal components and original column names
    key_component_column_mapping = {}
    
    for i, component in enumerate(key_components):
        pc_name = f'PC{component}'
        loadings = key_component_loadings[i]
        top_cols = feature_names[np.argsort(-np.abs(loadings))][:top_columns]
        key_component_column_mapping[pc_name] = top_cols
    
    return PCDF, key_component_column_mapping, pca

ws1_pcdf, ws_1_key_components_map, ws1_pca= perform_pca(water_site_1)
ws2_pcdf, ws_2_key_components_map, ws2_pca= perform_pca(water_site_2)
ws3_pcdf, ws_3_key_components_map, ws3_pca= perform_pca(water_site_3)
ws4_pcdf, ws_4_key_components_map, ws4_pca= perform_pca(water_site_4)

In [52]:
def get_loadings(pca, water_site):
    PCloadings = pca.components_.T * np.sqrt(pca.explained_variance_)
    components=water_site.columns.tolist()
    loadingdf=pd.DataFrame(PCloadings,columns=pca.get_feature_names_out())
    loadingdf["variable"]=components
    return loadingdf

ws1_loadings = get_loadings(ws1_pca, water_site_1)
ws_1_key_components_map

{'PC8': Index(['Phosphate (mg/L)', 'pH', 'Nitrate (mg/L)'], dtype='object', name=7),
 'PC9': Index(['Phosphate (mg/L)', 'Turbidty (NTU)', 'TSS (mg/L)'], dtype='object', name=7),
 'PC10': Index(['Turbidty (NTU)', 'Ammonium (mg/L)', 'TSS (mg/L)'], dtype='object', name=7),
 'PC11': Index(['Flow rate (m/s)', 'Temp (air) (F)', 'TSS (mg/L)'], dtype='object', name=7),
 'PC12': Index(['Salinity (ppt)', 'TSS (mg/L)', 'Ammonium (mg/L)'], dtype='object', name=7),
 'PC13': Index(['Temp (air) (F)', 'Flow rate (m/s)', 'Turbidty (NTU)'], dtype='object', name=7),
 'PC14': Index(['Temp (water) (C)', 'DO (%)', 'DO (mg/L)'], dtype='object', name=7)}