In [1]:
import pandas as pd
import seaborn as sns
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import silhouette_score
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import geopandas as gpd
import rasterio
import requests
import zipfile
import io
import os
import shutil
from shapely.geometry import Point



df = pd.read_csv('data_states_processed.csv')
#print(df.head())

# Filter for wells with depth less than or equal to 3000 ft
df = df[df['well_depth_ft'] <= 3000]

# Define ions by group

cations = ['model_Ba_molL', 'model_Ca_molL', 'model_Fe_molL', 'model_K_molL',
           'model_Li_molL', 'model_Mg_molL', 'model_Mn_molL', 'model_Na_molL', 'model_Sr_molL']
# Include Fe2+ as cation

anions = ['model_Br_molL', 'model_Cl_molL', 'model_C_molL', 'model_SO4_molL']  # Treat model_C as HCO3-

neutral_ions = ['model_B_molL', 'model_Si_molL']  # assumed neutral species

# Charges dictionary for equivalent calculation (charge per mole)
charges = {
    'model_Ba_molL': 2,
    'model_Ca_molL': 2,
    'model_Fe_molL': 2,
    'model_K_molL': 1,
    'model_Li_molL': 1,
    'model_Mg_molL': 2,
    'model_Mn_molL': 2,
    'model_Na_molL': 1,
    'model_Sr_molL': 2,
    'model_Br_molL': 1,
    'model_Cl_molL': 1,
    'model_C_molL': 1,
    'model_SO4_molL': 2
}

# --- Calculate equivalents for cations and anions ---

for ion in cations + anions:
    df[ion + '_eq'] = df[ion] * charges[ion]

# --- Calculate totals ---

df['total_cations_eq'] = df[[c + '_eq' for c in cations]].sum(axis=1)
df['total_anions_eq'] = df[[a + '_eq' for a in anions]].sum(axis=1)

# Total ions in mol/L from all charged equivalents converted back to mol/L plus neutral ions
    
df['total_ion_molL'] = (
    df[[ion + '_eq' for ion in cations + anions]].div(
        [charges[i] for i in cations + anions], axis=1
    ).sum(axis=1)
    + df[neutral_ions].sum(axis=1)
)

# --- Calculate fractions ---

# For ions belonging to cations and anions, fraction based on total equivalents
for ion in cations:
    df[ion + '_frac'] = np.where(df['total_cations_eq'] != 0,
                                df[ion + '_eq'] / df['total_cations_eq'], 
                                np.nan)

for ion in anions:
    df[ion + '_frac'] = np.where(df['total_anions_eq'] != 0,
                                df[ion + '_eq'] / df['total_anions_eq'], 
                                np.nan)

# For neutral ions (B, Si), fraction based on total ions in mol/L (including neutral)
for ion in neutral_ions:
    df[ion + '_frac'] = np.where(df['total_ion_molL'] != 0,
                                df[ion] / df['total_ion_molL'], 
                                np.nan)

# --- Log10 transform TDS ---

df['TDS_log10'] = np.log10(df['model_TDS_mgL'])

# --- Select only requested columns in the new df_ions DataFrame ---

frac_columns = [ion + '_frac' for ion in cations + anions + neutral_ions]
selected_columns = frac_columns + ['TDS_log10'] + ['well_depth_ft']

df_ions = df[selected_columns].copy()
# Print columns of df_ions
print("Columns in df_ions:", df_ions.columns.tolist()) 

print("DataFrame df_ions shape:", df_ions.shape)

# Columns you want to add and standardize
extra_si_cols = ['model_si_Calcite', 'model_si_Barite', 'model_si_Chalcedony', 'model_si_Gypsum']

# Extract these columns from df
df_extra = df[extra_si_cols].copy()

# Combine df_ions and the standardized extra columns into df_ion_si
df_ion_si = pd.concat([df_ions, df_extra], axis=1)

print("Columns in df_ion_si:", df_ion_si.columns.tolist()) 

print("DataFrame df_ion_si shape:", df_ion_si.shape)

# List of the new "pp" columns to add and standardize
extra_pp_cols = [
    'model_pp_Calcite_open_2x',
    'model_pp_Calcite_open_4x',
    'model_pp_Calcite_open_8x',
    'model_pp_Barite_open_2x',
    'model_pp_Barite_open_4x',
    'model_pp_Barite_open_8x',
    'model_pp_Chalcedony_open_2x',
    'model_pp_Chalcedony_open_4x',
    'model_pp_Chalcedony_open_8x',
    'model_pp_Gypsum_open_2x',
    'model_pp_Gypsum_open_4x',
    'model_pp_Gypsum_open_8x'
]

# Extract these columns from df
df_pp_extra = df[extra_pp_cols].copy()

# Concatenate df_ions with the standardized pp columns to form df_ion_pp
df_ion_pp = pd.concat([df_ions, df_pp_extra], axis=1)

print("Columns in df_ion_pp:", df_ion_pp.columns.tolist()) 

print("DataFrame df_ion_pp shape:", df_ion_pp.shape)


  from scipy.stats import gaussian_kde


Columns in df_ions: ['model_Ba_molL_frac', 'model_Ca_molL_frac', 'model_Fe_molL_frac', 'model_K_molL_frac', 'model_Li_molL_frac', 'model_Mg_molL_frac', 'model_Mn_molL_frac', 'model_Na_molL_frac', 'model_Sr_molL_frac', 'model_Br_molL_frac', 'model_Cl_molL_frac', 'model_C_molL_frac', 'model_SO4_molL_frac', 'model_B_molL_frac', 'model_Si_molL_frac', 'TDS_log10', 'well_depth_ft']
DataFrame df_ions shape: (4733, 17)
Columns in df_ion_si: ['model_Ba_molL_frac', 'model_Ca_molL_frac', 'model_Fe_molL_frac', 'model_K_molL_frac', 'model_Li_molL_frac', 'model_Mg_molL_frac', 'model_Mn_molL_frac', 'model_Na_molL_frac', 'model_Sr_molL_frac', 'model_Br_molL_frac', 'model_Cl_molL_frac', 'model_C_molL_frac', 'model_SO4_molL_frac', 'model_B_molL_frac', 'model_Si_molL_frac', 'TDS_log10', 'well_depth_ft', 'model_si_Calcite', 'model_si_Barite', 'model_si_Chalcedony', 'model_si_Gypsum']
DataFrame df_ion_si shape: (4733, 21)
Columns in df_ion_pp: ['model_Ba_molL_frac', 'model_Ca_molL_frac', 'model_Fe_molL_fra

  df = pd.read_csv('data_states_processed.csv')


In [2]:
########## ION CLUSTERING ##########
##################################### which features are used the most for clustering? find out which features are most important for clustering

state_col = 'state_alpha' 
output_folder = "ion"
# 1. Clustering across all states together
print("=== CLUSTERING ACROSS ALL STATES ===")

df_ions = df_ions.replace([np.inf, -np.inf], np.nan).dropna()
# remove well_depth_ft from df_ions
df_cluster = df_ions.drop(columns=['well_depth_ft'], errors='ignore')


# Standardization
scaler = StandardScaler()
X_scaled_all = scaler.fit_transform(df_cluster)

# Elbow method (R²)
r2_values = []
k_max = min(12, len(X_scaled_all) - 1)
elbow_k_range = range(1, k_max+1)
tot_ss = np.sum((X_scaled_all - X_scaled_all.mean(axis=0)) ** 2)
for k in elbow_k_range:
    km = KMeans(n_clusters=k, random_state=42, n_init=10)
    km.fit(X_scaled_all)
    between_ss = tot_ss - km.inertia_
    r2_values.append(between_ss / tot_ss)

plt.figure()
plt.plot(list(elbow_k_range), r2_values, marker='o')
plt.xlabel('k')
plt.ylabel('R²')
plt.title('Elbow method for k-means: All States')
elbow_path = os.path.join(output_folder, "all_states_elbow.png")
plt.savefig(elbow_path, dpi=300)
plt.close()

# Silhouette method
sil_scores = []
sil_k_range = range(2, min(13, len(X_scaled_all)))
for k in sil_k_range:
    km = KMeans(n_clusters=k, random_state=42, n_init=10)
    labels = km.fit_predict(X_scaled_all)
    score = silhouette_score(X_scaled_all, labels)
    sil_scores.append(score)
    print(f"All States, k={k}, silhouette score={score:.3f}")

plt.figure()
plt.plot(list(sil_k_range), sil_scores, marker='o')
plt.xlabel('k')
plt.ylabel('Silhouette Score')
plt.title('Silhouette Score vs k: All States')
silhouette_path = os.path.join(output_folder, "all_states_silhouette.png")
plt.savefig(silhouette_path, dpi=300)
plt.close()


########## CLUSTER ASSIGNMENT FOR ALL STATES ##########

k_general = 3
km = KMeans(n_clusters=k_general, random_state=42, n_init=10)
cluster_labels = km.fit_predict(X_scaled_all)

df_plot = df_ions.copy()
df_plot['Cluster'] = cluster_labels.astype(str)  # String for plotting labels
df_plot['model_Ba_molL_frac'] = np.log10(df_plot['model_Ba_molL_frac'])

# 1. BOX PLOTS FOR EACH ION FEATURE BY CLUSTER
cluster_counts = df_plot['Cluster'].value_counts().sort_index()
# Make new labels in the form "0\n(n=73)" etc.
new_labels = [f"{cl}\n(n={cluster_counts[cl]})" for cl in sorted(cluster_counts.index, key=lambda x: int(x))]

# Make a copy of df_plot columns for plotting features
plot_features = list(df_ions.columns)


n_features = len(plot_features)
n_cols = 3  
n_rows = int(np.ceil(n_features / n_cols))

fig, axes = plt.subplots(n_rows, n_cols, figsize=(5 * n_cols, 4 * n_rows))
axes = axes.flatten()

for idx, feature in enumerate(df_ions.columns):
    ax = axes[idx]

    if feature == "model_Ba_molL_frac":
        df_filtered = df_plot[df_plot[feature] > 0].copy()
        if len(df_filtered) > 0:
            print(f"Feature {feature} has {len(df_filtered)} valid points for log10 transformation.")
            df_filtered[feature + "_log"] = np.log10(df_filtered[feature])
            do_log_plot = True

            sns.boxplot(
                x='Cluster',
                y=feature + "_log",
                data=df_filtered,
                ax=ax,
                order=sorted(cluster_counts.index, key=lambda x: int(x))
            )
            ax.set_title(f"log10({feature}) by Cluster")
            ax.set_ylabel(f"log10({feature})")

        else:
            sns.boxplot(
                x='Cluster',
                y=feature,
                data=df_plot,
                ax=ax,
                order=sorted(cluster_counts.index, key=lambda x: int(x))
            )
            ax.set_title(f"{feature} by Cluster")
            ax.set_ylabel(feature)
    else:
        sns.boxplot(
            x='Cluster',
            y=feature,
            data=df_plot,
            ax=ax,
            order=sorted(cluster_counts.index, key=lambda x: int(x))
        )
        ax.set_title(f"{feature} by Cluster")
        ax.set_ylabel(feature)
    ax.set_xlabel("Cluster\n(sample count)")
    # Set new x-tick labels with sample sizes
    tick_locs = range(len(new_labels))
    ax.set_xticks(tick_locs)
    ax.set_xticklabels(new_labels)
for i in range(n_features, len(axes)):
    fig.delaxes(axes[i])

plt.tight_layout()
plt.savefig(os.path.join(output_folder, "Boxplots_by_Cluster_AllStates.png"), dpi=300)
plt.close(fig)
print("Boxplots saved at:", os.path.join(output_folder, "Boxplots_by_Cluster_AllStates.png"))





=== CLUSTERING ACROSS ALL STATES ===
All States, k=2, silhouette score=0.208
All States, k=3, silhouette score=0.181
All States, k=4, silhouette score=0.166
All States, k=5, silhouette score=0.166
All States, k=6, silhouette score=0.175
All States, k=7, silhouette score=0.185
All States, k=8, silhouette score=0.174
All States, k=9, silhouette score=0.182
All States, k=10, silhouette score=0.182
All States, k=11, silhouette score=0.187
All States, k=12, silhouette score=0.189


  result = getattr(ufunc, method)(*inputs, **kwargs)


Boxplots saved at: ion/Boxplots_by_Cluster_AllStates.png


In [3]:
################### FEATURE IMPORTANCE ###################

centers = pd.DataFrame(km.cluster_centers_, columns=df_cluster.columns)
# Feature importance: range (max-min) across cluster centers
feature_importance = (centers.max(axis=0) - centers.min(axis=0)).sort_values(ascending=False)


plt.figure(figsize=(10,5))
feature_importance.plot(kind='bar')
plt.ylabel('Range across cluster centers (standardized)')
plt.title('Feature Importance in Clustering')
plt.tight_layout()
importance_path = os.path.join(output_folder, "Feature_Importance_AllStates.png")
plt.savefig(importance_path, dpi=300)
plt.close()
print("Feature importance plot saved at:", importance_path)

# Calculate within-cluster variance for each feature and each cluster
variances = []
cluster_ids = range(km.n_clusters)
for cluster_id in cluster_ids:
    cluster_data = df_cluster[cluster_labels == cluster_id]
    cluster_variance = cluster_data.var(axis=0)
    variances.append(cluster_variance)

# Combine variances into a DataFrame, indexed by features, columns by cluster
within_cluster_variance_df = pd.DataFrame(variances, index=[f"Cluster {c}" for c in cluster_ids]).T

# Plot grouped bar chart of within-cluster variances
fig, ax = plt.subplots(figsize=(15, 7))

within_cluster_variance_df.plot(kind='bar', ax=ax)

ax.set_ylabel('Variance (standardized units)')
ax.set_title('Feature Variability Within Each Cluster')
ax.legend(title='Cluster')
plt.xticks(rotation=45, ha='right')
plt.tight_layout()

variance_path = os.path.join(output_folder, "Variance_Per_Feature_AllStates.png")
plt.savefig(variance_path, dpi=300)
plt.close()

print(f"Within-cluster variance per cluster plot saved at: {variance_path}")




Feature importance plot saved at: ion/Feature_Importance_AllStates.png
Within-cluster variance per cluster plot saved at: ion/Variance_Per_Feature_AllStates.png


In [4]:
################### MAP CLUSTERS ####################
import pyproj
# Initialize 'cluster_ion' column in df as NaN or empty
df['cluster_ion'] = np.nan



df_plot['dec_lat_va'] = df.loc[df_plot.index, 'dec_lat_va']
df_plot['dec_long_va'] = df.loc[df_plot.index, 'dec_long_va']

# Remove all NaN lat/long rows
df_plot = df_plot.dropna(subset=['dec_lat_va', 'dec_long_va'])


#gdf = lat_long_to_point(df_plot, lat_col='dec_lat_va', long_col='dec_long_va')
df_geo = [Point(lon, lat) for lon, lat in zip(df_plot['dec_long_va'], df_plot['dec_lat_va'])]
gdf = gpd.GeoDataFrame(df_plot, geometry=df_geo, crs='EPSG:4326')


# Now plot coloring by cluster
fig, ax = plt.subplots(figsize=(10, 10))
gdf.plot(
    ax=ax,
    column='Cluster',
    categorical=True,
    legend=True,
    cmap='tab10',   # Choose any categorical colormap
    markersize=30
)
ax.set_title('Clustered Points Map - Across All States')
ax.set_axis_off()
# Save the map
map_path = os.path.join(output_folder, "Clustered_Map_All_States.png")
plt.savefig(map_path, dpi=300)
plt.close()
print("Clustered map saved at:", map_path)

Clustered map saved at: ion/Clustered_Map_All_States.png


In [None]:
states = df['state_alpha'].unique()
print("States found in the dataset:", states)

n_states = len(states)

output_folder = "ion"

# Create one figure for all elbow plots and one for all silhouette plots
fig_elbow, axes_elbow = plt.subplots(n_states, 1, figsize=(7, 4 * n_states), constrained_layout=True)
fig_sil, axes_sil = plt.subplots(n_states, 1, figsize=(7, 4 * n_states), constrained_layout=True)



for idx, state in enumerate(states):
    X_state = X_scaled_all[df['state_alpha'] == state]
    print(f"\nState: {state}, Shape: {X_state.shape}")

    # Standardize per state
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X_state)

    # Elbow
    r2_values = []
    k_max = min(12, len(X_state) - 1)  # Note: use X_state!
    elbow_k_range = range(1, k_max+1)
    tot_ss = np.sum((X_scaled - X_scaled.mean(axis=0)) ** 2)
    for k in elbow_k_range:
        km = KMeans(n_clusters=k, random_state=42, n_init=10)
        km.fit(X_scaled)
        between_ss = tot_ss - km.inertia_
        r2_values.append(between_ss / tot_ss)

    axes_elbow[idx].plot(list(elbow_k_range), r2_values, marker='o')
    axes_elbow[idx].set_xlabel('k')
    axes_elbow[idx].set_ylabel('R²')
    axes_elbow[idx].set_title(f'Elbow: {state}')

    # Silhouette
    sil_scores = []
    sil_k_range = range(2, min(13, len(X_state)))  # Note: use X_state!
    for k in sil_k_range:
        km = KMeans(n_clusters=k, random_state=42, n_init=10)
        labels = km.fit_predict(X_scaled)
        score = silhouette_score(X_scaled, labels)
        sil_scores.append(score)
        print(f"State={state}, k={k}, silhouette score={score:.3f}")

    axes_sil[idx].plot(list(sil_k_range), sil_scores, marker='o')
    axes_sil[idx].set_xlabel('k')
    axes_sil[idx].set_ylabel('Silhouette Score')
    axes_sil[idx].set_title(f'Silhouette: {state}')

# Save the combined figures
fig_elbow.savefig(os.path.join(output_folder, "states_elbow.png"), dpi=300)
fig_sil.savefig(os.path.join(output_folder, "states_silhouette.png"), dpi=300)
plt.close(fig_elbow)
plt.close(fig_sil)


In [None]:
########## ION CLUSTERING BY STATE PART 2 ##########

state_k_map = {
    "NM": 3,
    "TX": 6,
    "FL": 5,
    "CA": 3,
    "AZ": 4
}

for state, k_state in state_k_map.items():
    df_state = df_cluster[df['state_alpha'] == state].copy()
    X_scaled_state = X_scaled_all[df['state_alpha'] == state]

    # Cluster assignment
    km_state = KMeans(n_clusters=k_state, random_state=42, n_init=10)
    cluster_labels = km_state.fit_predict(X_scaled_state)
    df_plot_state = df_ions[df['state_alpha'] == state].copy()
    df_plot_state['Cluster'] = cluster_labels.astype(str)

    # 1. BOX PLOTS FOR EACH ION FEATURE BY CLUSTER
    cluster_counts = df_plot_state['Cluster'].value_counts().sort_index()
    # Make new labels in the form "0\n(n=73)" etc.
    new_labels = [f"{cl}\n(n={cluster_counts[cl]})" for cl in sorted(cluster_counts.index, key=lambda x: int(x))]

    # Make a copy of df_plot columns for plotting features
    plot_features = list(df_ions.columns)
    
    # Boxplot grid
    n_features = len(df_ions.columns)
    n_cols = 3
    n_rows = int(np.ceil(n_features / n_cols))
    

    fig, axes = plt.subplots(n_rows, n_cols, figsize=(5 * n_cols, 4 * n_rows))
    axes = axes.flatten()
    for idx, feature in enumerate(df_ions.columns):
        ax = axes[idx]
        # Add count of points in each cluster
        if feature == "model_Ba_molL_frac":
            # Plot log10 of the feature "on the fly" filtering out <= 0 values to avoid log errors
            df_filtered = df_plot_state[df_plot_state[feature] > 0].copy()
            if len(df_filtered) > 0:
                print(f"Feature {feature} in state {state} has {len(df_filtered)} valid points for log10 transformation.")
                df_filtered[feature + "_log"] = np.log10(df_filtered[feature])
                do_log_plot = True

                sns.boxplot(
                    x='Cluster',
                    y=feature + "_log",
                    data=df_filtered,
                    ax=ax,
                    order=sorted(cluster_counts.index, key=lambda x: int(x))
                )
                ax.set_title(f"log10({feature}) by Cluster")
                ax.set_ylabel(f"log10({feature})")

            else:
                sns.boxplot(
                    x='Cluster',
                    y=feature,
                    data=df_plot_state,
                    ax=ax,
                    order=sorted(cluster_counts.index, key=lambda x: int(x))
                )
                ax.set_title(f"{feature} by Cluster")
                ax.set_ylabel(feature)

        else:
            sns.boxplot(
                x='Cluster',
                y=feature,
                data=df_plot_state,
                ax=ax,
                order=sorted(cluster_counts.index, key=lambda x: int(x))
            )
            ax.set_title(f"{feature} by Cluster")
        #sns.boxplot(x='Cluster', y=feature, data=df_plot_state, ax=ax)

        ax.set_xlabel("Cluster\n(sample count)")
        # Explicitly set tick locations and labels to avoid warning
        tick_locs = range(len(new_labels))
        ax.set_xticks(tick_locs)
        ax.set_xticklabels(new_labels)
        
    for i in range(n_features, len(axes)):
        fig.delaxes(axes[i])

    plt.tight_layout()
    plt.savefig(os.path.join(output_folder, f"Boxplots_by_Cluster_{state}.png"), dpi=300)
    plt.close(fig)
    print(f"Boxplots saved at: {output_folder}/Boxplots_by_Cluster_{state}.png")
    print(f"Number of cluster points: {state}, k={k_state}, Cluster labels shape: {cluster_labels.shape}")
    # Print out number of points in each cluster
    unique, counts = np.unique(cluster_labels, return_counts=True)
    cluster_counts = dict(zip(unique, counts))
    print(f"Cluster counts for {state}: {cluster_counts}")

    centers = pd.DataFrame(km_state.cluster_centers_, columns=df_state.columns)
    # Feature importance: range (max-min) across cluster centers
    feature_importance = (centers.max(axis=0) - centers.min(axis=0)).sort_values(ascending=False)
    plt.figure(figsize=(10,5))
    feature_importance.plot(kind='bar')
    plt.ylabel('Range across cluster centers (standardized)')
    plt.title(f'Feature Importance in Clustering for {state}')
    plt.tight_layout()
    importance_path = os.path.join(output_folder, f"Feature_Importance_{state}.png")
    plt.savefig(importance_path, dpi=300)
    plt.close()
    print("Feature importance plot saved at:", importance_path)
    # Calculate within-cluster variance for each feature and each cluster
    variances = []
    within_cluster_variance_df = pd.DataFrame()
    cluster_ids = range(km_state.n_clusters)
    for cluster_id in cluster_ids:
        cluster_data = df_state[cluster_labels == cluster_id]
        cluster_variance = cluster_data.var(axis=0)
        variances.append(cluster_variance)

    # Combine variances into a DataFrame, indexed by features, columns by cluster
    within_cluster_variance_df = pd.DataFrame(variances, index=[f"Cluster {c}" for c in cluster_ids]).T

    # Plot grouped bar chart of within-cluster variances
    fig, ax = plt.subplots(figsize=(15, 7))

    within_cluster_variance_df.plot(kind='bar', ax=ax)

    ax.set_ylabel('Variance (standardized units)')
    ax.set_title(f'Feature Variability Within Each Cluster - {state}')
    ax.legend(title='Cluster')
    plt.xticks(rotation=45, ha='right')
    plt.tight_layout()

    variance_path = os.path.join(output_folder,f"Variance_Per_Feature_{state}.png")
    plt.savefig(variance_path, dpi=300)
    plt.close()

    print(f"Within-cluster variance per cluster plot saved at: {variance_path}")

    df_plot_state['dec_lat_va'] = df.loc[df[df['state_alpha'] == state].index, 'dec_lat_va']
    df_plot_state['dec_long_va'] = df.loc[df[df['state_alpha'] == state].index, 'dec_long_va']

    # Remove all NaN lat/long rows
    df_plot_state = df_plot_state.dropna(subset=['dec_lat_va', 'dec_long_va'])


    #gdf = lat_long_to_point(df_plot, lat_col='dec_lat_va', long_col='dec_long_va')
    df_geo = [Point(lon, lat) for lon, lat in zip(df_plot_state['dec_long_va'], df_plot_state['dec_lat_va'])]
    gdf = gpd.GeoDataFrame(df_plot_state, geometry=df_geo, crs='EPSG:4326')


    # Now plot coloring by cluster
    fig, ax = plt.subplots(figsize=(10, 10))
    gdf.plot(
        ax=ax,
        column='Cluster',
        categorical=True,
        legend=True,
        cmap='tab10',   # Choose any categorical colormap
        markersize=30
    )
    ax.set_title(f'Clustered Points Map - {state}')
    ax.set_axis_off()
    # Save the map
    map_path = os.path.join(output_folder, f"Clustered_Map_{state}.png")
    plt.savefig(map_path, dpi=300)
    plt.close()
    print("Clustered map saved at:", map_path)


    

Boxplots saved at: ion/Boxplots_by_Cluster_NM.png
Number of cluster points: NM, k=3, Cluster labels shape: (159,)
Cluster counts for NM: {np.int32(0): np.int64(106), np.int32(1): np.int64(50), np.int32(2): np.int64(3)}
Feature importance plot saved at: ion/Feature_Importance_NM.png
Within-cluster variance per cluster plot saved at: ion/Variance_Per_Feature_NM.png
Clustered map saved at: ion/Clustered_Map_NM.png
Feature model_Ba_molL_frac in state TX has 18 valid points for log10 transformation.
Boxplots saved at: ion/Boxplots_by_Cluster_TX.png
Number of cluster points: TX, k=6, Cluster labels shape: (2327,)
Cluster counts for TX: {np.int32(0): np.int64(788), np.int32(1): np.int64(492), np.int32(2): np.int64(689), np.int32(3): np.int64(347), np.int32(4): np.int64(1), np.int32(5): np.int64(10)}
Feature importance plot saved at: ion/Feature_Importance_TX.png
Within-cluster variance per cluster plot saved at: ion/Variance_Per_Feature_TX.png
Clustered map saved at: ion/Clustered_Map_TX.png


In [7]:
############### ION + SI CLUSTERING ACROSS ALL STATES ###############


output_folder = "ion_si"

# Prepare data across all states

df_features = df_ion_si.replace([np.inf, -np.inf], np.nan).dropna()
# remove well_depth_ft from df_ions
df_cluster = df_features.drop(columns=['well_depth_ft'], errors='ignore')


X_all = df_cluster.values
scaler = StandardScaler()
X_scaled_all = scaler.fit_transform(X_all)

# 1. Elbow method (R²)
r2_values = []
k_max = min(12, len(X_scaled_all) - 1)
elbow_k_range = range(1, k_max+1)
tot_ss = np.sum((X_scaled_all - X_scaled_all.mean(axis=0)) ** 2)
for k in elbow_k_range:
    km = KMeans(n_clusters=k, random_state=42, n_init=10)
    km.fit(X_scaled_all)
    between_ss = tot_ss - km.inertia_
    r2_values.append(between_ss / tot_ss)


plt.figure()
plt.plot(list(elbow_k_range), r2_values, marker='o')
plt.xlabel('k')
plt.ylabel('R²')
plt.title('Elbow method for k-means: All States (Ion+SI)')
plt.tight_layout()
plt.savefig(os.path.join(output_folder, "Elbow_AllStates.png"), dpi=300)
plt.close()

# 2. Silhouette method
sil_scores = []
sil_k_range = range(2, min(13, len(X_scaled_all)))
for k in sil_k_range:
    km = KMeans(n_clusters=k, random_state=42, n_init=10)
    labels = km.fit_predict(X_scaled_all)
    score = silhouette_score(X_scaled_all, labels)
    sil_scores.append(score)
    print(f"All States (Ion+SI), k={k}, silhouette score={score:.3f}")

plt.figure()
plt.plot(list(sil_k_range), sil_scores, marker='o')
plt.xlabel('k')
plt.ylabel('Silhouette Score')
plt.title('Silhouette Score vs k: All States (Ion+SI)')
plt.tight_layout()
plt.savefig(os.path.join(output_folder, "Silhouette_AllStates.png"), dpi=300)
plt.close()

k_use = 3

# 3. Final clustering and labeling for plotting
km = KMeans(n_clusters=k_use, random_state=42, n_init=10)
cluster_labels = km.fit_predict(X_scaled_all)
df_plot = df_features.copy()
df_plot['Cluster'] = cluster_labels.astype(str)

# 4. Multi-panel boxplots of ion+SI by cluster
n_features = len(df_ion_si.columns)
n_cols = 3
n_rows = int(np.ceil(n_features / n_cols))

cluster_counts = df_plot['Cluster'].value_counts().sort_index()
# Make new labels in the form "0\n(n=73)" etc.
new_labels = [f"{cl}\n(n={cluster_counts[cl]})" for cl in sorted(cluster_counts.index, key=lambda x: int(x))]


fig, axes = plt.subplots(n_rows, n_cols, figsize=(5 * n_cols, 4 * n_rows))
axes = axes.flatten()

for idx, feature in enumerate(df_ion_si.columns):
    ax = axes[idx]
    if feature == "model_Ba_molL_frac":
        df_filtered = df_plot[df_plot[feature] > 0].copy()
        if len(df_filtered) > 0:
            print(f"Feature {feature} has {len(df_filtered)} valid points for log10 transformation.")
            df_filtered[feature + "_log"] = np.log10(df_filtered[feature])
            sns.boxplot(
                x='Cluster',
                y=feature + "_log",
                data=df_filtered,
                ax=ax,
                order=sorted(cluster_counts.index, key=lambda x: int(x))
            )
            ax.set_title(f"log10({feature}) by Cluster")
            ax.set_ylabel(f"log10({feature})")
        else:
            sns.boxplot(x='Cluster', y=feature, data=df_plot, ax=ax, order=sorted(cluster_counts.index, key=lambda x: int(x)))
            ax.set_title(f"{feature} by Cluster")
            ax.set_ylabel(feature)
    else:
        sns.boxplot(x='Cluster', y=feature, data=df_plot, ax=ax, order=sorted(cluster_counts.index, key=lambda x: int(x)))
        ax.set_title(f"{feature} by Cluster")
        ax.set_ylabel(feature)
    ax.set_xlabel("Cluster\n(sample count)")   
    # Explicitly set tick locations and labels to avoid warning
    tick_locs = range(len(new_labels))
    ax.set_xticks(tick_locs)
    ax.set_xticklabels(new_labels)
for i in range(n_features, len(axes)):
    fig.delaxes(axes[i])

plt.tight_layout()
plt.savefig(os.path.join(output_folder, "Boxplots_by_Cluster_AllStates.png"), dpi=300)
plt.close(fig)
print("Boxplots saved at:", os.path.join(output_folder, "Boxplots_by_Cluster_AllStates.png"))

centers = pd.DataFrame(km.cluster_centers_, columns=df_cluster.columns)
# Feature importance: range (max-min) across cluster centers
feature_importance = (centers.max(axis=0) - centers.min(axis=0)).sort_values(ascending=False)
plt.figure(figsize=(10,5))
feature_importance.plot(kind='bar')
plt.ylabel('Range across cluster centers (standardized)')
plt.title('Feature Importance in Clustering')
plt.tight_layout()
importance_path = os.path.join(output_folder, "Feature_Importance_AllStates.png")
plt.savefig(importance_path, dpi=300)
plt.close()
print("Feature importance plot saved at:", importance_path)

# Calculate within-cluster variance for each feature and each cluster
variances = []
cluster_ids = range(km.n_clusters)
for cluster_id in cluster_ids:
    cluster_data = df_cluster[cluster_labels == cluster_id]
    cluster_variance = cluster_data.var(axis=0)
    variances.append(cluster_variance)

# Combine variances into a DataFrame, indexed by features, columns by cluster
within_cluster_variance_df = pd.DataFrame(variances, index=[f"Cluster {c}" for c in cluster_ids]).T

# Plot grouped bar chart of within-cluster variances
fig, ax = plt.subplots(figsize=(15, 7))

within_cluster_variance_df.plot(kind='bar', ax=ax)

ax.set_ylabel('Variance (standardized units)')
ax.set_title('Feature Variability Within Each Cluster')
ax.legend(title='Cluster')
plt.xticks(rotation=45, ha='right')
plt.tight_layout()

variance_path = os.path.join(output_folder, "Variance_Per_Feature_AllStates.png")
plt.savefig(variance_path, dpi=300)
plt.close()

print(f"Within-cluster variance per cluster plot saved at: {variance_path}")


# 5. Map of clustered points
df_plot['dec_lat_va'] = df.loc[df_plot.index, 'dec_lat_va']
df_plot['dec_long_va'] = df.loc[df_plot.index, 'dec_long_va']

# Remove all NaN lat/long rows
df_plot = df_plot.dropna(subset=['dec_lat_va', 'dec_long_va'])


#gdf = lat_long_to_point(df_plot, lat_col='dec_lat_va', long_col='dec_long_va')
df_geo = [Point(lon, lat) for lon, lat in zip(df_plot['dec_long_va'], df_plot['dec_lat_va'])]
gdf = gpd.GeoDataFrame(df_plot, geometry=df_geo, crs='EPSG:4326')


# Now plot coloring by cluster
fig, ax = plt.subplots(figsize=(10, 10))
gdf.plot(
    ax=ax,
    column='Cluster',
    categorical=True,
    legend=True,
    cmap='tab10',   # Choose any categorical colormap
    markersize=30
)
ax.set_title('Clustered Points Map - Across All States')
ax.set_axis_off()
# Save the map
map_path = os.path.join(output_folder, "Clustered_Map_All_States.png")
plt.savefig(map_path, dpi=300)
plt.close()
print("Clustered map saved at:", map_path)



All States (Ion+SI), k=2, silhouette score=0.181
All States (Ion+SI), k=3, silhouette score=0.163
All States (Ion+SI), k=4, silhouette score=0.152
All States (Ion+SI), k=5, silhouette score=0.159
All States (Ion+SI), k=6, silhouette score=0.155
All States (Ion+SI), k=7, silhouette score=0.162
All States (Ion+SI), k=8, silhouette score=0.142
All States (Ion+SI), k=9, silhouette score=0.146
All States (Ion+SI), k=10, silhouette score=0.150
All States (Ion+SI), k=11, silhouette score=0.150
All States (Ion+SI), k=12, silhouette score=0.150
Feature model_Ba_molL_frac has 24 valid points for log10 transformation.
Boxplots saved at: ion_si/Boxplots_by_Cluster_AllStates.png
Feature importance plot saved at: ion_si/Feature_Importance_AllStates.png
Within-cluster variance per cluster plot saved at: ion_si/Variance_Per_Feature_AllStates.png
Clustered map saved at: ion_si/Clustered_Map_All_States.png


In [None]:
############### ION + SI CLUSTERING BY STATE PART 1 ###############
n_states = len(states)
elbow_fig, elbow_axes = plt.subplots(n_states, 1, figsize=(7, 4 * n_states), constrained_layout=True)
sil_fig, sil_axes = plt.subplots(n_states, 1, figsize=(7, 4 * n_states), constrained_layout=True)

plot_idx = 0
for state in states:
    # Prepare data for this state
    df_state = df[df[state_col] == state][df_cluster.columns].replace([np.inf, -np.inf], np.nan).dropna()

    print(f"\n--- State: {state}, Shape: {df_state.shape} ---")
    if len(df_state) < 10:
        print(f"  Not enough samples, skipping.")
        continue

    X = df_state.values
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X)


    # 1. Elbow method (R²)
    r2_values = []
    k_max = min(12, len(X_scaled) - 1)
    elbow_k_range = range(1, k_max + 1)
    tot_ss = np.sum((X_scaled - X_scaled.mean(axis=0)) ** 2)
    for k in elbow_k_range:
        km = KMeans(n_clusters=k, random_state=42, n_init=10)
        km.fit(X_scaled)
        between_ss = tot_ss - km.inertia_
        r2_values.append(between_ss / tot_ss)

    elbow_axes[plot_idx].plot(list(elbow_k_range), r2_values, marker='o')
    elbow_axes[plot_idx].set_xlabel('k')
    elbow_axes[plot_idx].set_ylabel('R²')
    elbow_axes[plot_idx].set_title(f'Elbow: {state} (Ion+SI)')

    # 2. Silhouette method
    sil_scores = []
    sil_k_range = range(2, min(13, len(X_scaled)))
    for k in sil_k_range:
        km = KMeans(n_clusters=k, random_state=42, n_init=10)
        labels = km.fit_predict(X_scaled)
        score = silhouette_score(X_scaled, labels)
        sil_scores.append(score)
        print(f"  k={k}, silhouette score={score:.3f}")

    sil_axes[plot_idx].plot(list(sil_k_range), sil_scores, marker='o')
    sil_axes[plot_idx].set_xlabel('k')
    sil_axes[plot_idx].set_ylabel('Silhouette Score')
    sil_axes[plot_idx].set_title(f'Silhouette: {state} (Ion+SI)')

    plot_idx += 1

# Save the combined figures
elbow_fig.savefig(os.path.join(output_folder, "elbow_states.png"), dpi=300)
sil_fig.savefig(os.path.join(output_folder, "silhouette_states.png"), dpi=300)
plt.close(elbow_fig)
plt.close(sil_fig)


In [8]:
############### ION + SI CLUSTERING BY STATE PART 2 ###############

state_k_map = {
    'NM': 4,
    'TX': 5,
    'FL': 4,
    'CA': 4,
    'AZ': 4
}

for state, k_state in state_k_map.items():
    print(f"\n--- Processing {state} (k={k_state}) ---")
    df_state = df[df[state_col] == state][df_cluster.columns].replace([np.inf, -np.inf], np.nan).dropna()
    if len(df_state) < k_state:
        print(f"  Not enough samples for k={k_state}, skipping.")
        continue
    
    # Standardize features
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(df_state)

    # Cluster assignment
    km = KMeans(n_clusters=k_state, random_state=42, n_init=10)
    cluster_labels = km.fit_predict(X_scaled)
    df_plot = df_features[df['state_alpha'] == state].copy()
    df_plot['Cluster'] = cluster_labels.astype(str)
    cluster_counts = df_plot['Cluster'].value_counts().sort_index()
    # Make new labels in the form "0\n(n=73)" etc.
    new_labels = [f"{cl}\n(n={cluster_counts[cl]})" for cl in sorted(cluster_counts.index, key=lambda x: int(x))]
    # Multi-panel boxplots of features by cluster
    n_features = len(df_ion_si.columns)
    n_cols = 3
    n_rows = int(np.ceil(n_features / n_cols))
    
    fig, axes = plt.subplots(n_rows, n_cols, figsize=(5 * n_cols, 4 * n_rows))
    axes = axes.flatten()
    for idx, feature in enumerate(df_ion_si.columns):
        ax = axes[idx]
        if feature == "model_Ba_molL_frac":
            df_filtered = df_plot[df_plot[feature] > 0].copy()
            if len(df_filtered) > 0:
                print(f"Feature {feature} in state {state} has {len(df_filtered)} valid points for log10 transformation.")
                df_filtered[feature + "_log"] = np.log10(df_filtered[feature])
                sns.boxplot(
                    x='Cluster',
                    y=feature + "_log",
                    data=df_filtered,
                    ax=ax,
                    order=sorted(cluster_counts.index, key=lambda x: int(x))
                )
                ax.set_title(f"log10({feature}) by Cluster")
                ax.set_ylabel(f"log10({feature})")
            else:
                sns.boxplot(x='Cluster', y=feature, data=df_plot, ax=ax, order=sorted(cluster_counts.index, key=lambda x: int(x)))
                ax.set_title(f"{feature} by Cluster")
                ax.set_ylabel(feature)
        else:
            sns.boxplot(x='Cluster', y=feature, data=df_plot, ax=ax, order=sorted(cluster_counts.index, key=lambda x: int(x)))
            ax.set_title(f"{feature} by Cluster")
        ax.set_xlabel("Cluster\n(sample count)")
        # Explicitly set tick locations and labels to avoid warning
        tick_locs = range(len(new_labels))
        ax.set_xticks(tick_locs)
        ax.set_xticklabels(new_labels)
    for i in range(n_features, len(axes)):
        fig.delaxes(axes[i])
    plt.tight_layout()
    plt.savefig(os.path.join(output_folder, f"Boxplots_by_Cluster_{state}.png"), dpi=300)
    plt.close(fig)
    print(f"  Boxplots saved at: {output_folder}/Boxplots_by_Cluster_{state}.png")

    centers = pd.DataFrame(km.cluster_centers_, columns=df_state.columns)
    # Feature importance: range (max-min) across cluster centers
    feature_importance = (centers.max(axis=0) - centers.min(axis=0)).sort_values(ascending=False)
    plt.figure(figsize=(10,5))
    feature_importance.plot(kind='bar')
    plt.ylabel('Range across cluster centers (standardized)')
    plt.title(f'Feature Importance in Clustering for {state}')
    plt.tight_layout()
    importance_path = os.path.join(output_folder, f"Feature_Importance_{state}.png")
    plt.savefig(importance_path, dpi=300)
    plt.close()
    print("Feature importance plot saved at:", importance_path)
    
    # Calculate within-cluster variance for each feature and each cluster
    variances = []
    within_cluster_variance_df = pd.DataFrame()
    cluster_ids = range(km.n_clusters)
    for cluster_id in cluster_ids:
        cluster_data = df_state[cluster_labels == cluster_id]
        cluster_variance = cluster_data.var(axis=0)
        variances.append(cluster_variance)

    # Combine variances into a DataFrame, indexed by features, columns by cluster
    within_cluster_variance_df = pd.DataFrame(variances, index=[f"Cluster {c}" for c in cluster_ids]).T

    # Plot grouped bar chart of within-cluster variances
    fig, ax = plt.subplots(figsize=(15, 7))

    within_cluster_variance_df.plot(kind='bar', ax=ax)

    ax.set_ylabel('Variance (standardized units)')
    ax.set_title(f'Feature Variability Within Each Cluster - {state}')
    ax.legend(title='Cluster')
    plt.xticks(rotation=45, ha='right')
    plt.tight_layout()

    variance_path = os.path.join(output_folder,f"Variance_Per_Feature_{state}.png")
    plt.savefig(variance_path, dpi=300)
    plt.close()

    print(f"Within-cluster variance per cluster plot saved at: {variance_path}")


    df_plot['dec_lat_va'] = df.loc[df[df['state_alpha'] == state].index, 'dec_lat_va']
    df_plot['dec_long_va'] = df.loc[df[df['state_alpha'] == state].index, 'dec_long_va']

    # Remove all NaN lat/long rows
    df_plot = df_plot.dropna(subset=['dec_lat_va', 'dec_long_va'])


    #gdf = lat_long_to_point(df_plot, lat_col='dec_lat_va', long_col='dec_long_va')
    df_geo = [Point(lon, lat) for lon, lat in zip(df_plot['dec_long_va'], df_plot['dec_lat_va'])]
    gdf = gpd.GeoDataFrame(df_plot, geometry=df_geo, crs='EPSG:4326')


    # Now plot coloring by cluster
    fig, ax = plt.subplots(figsize=(10, 10))
    gdf.plot(
        ax=ax,
        column='Cluster',
        categorical=True,
        legend=True,
        cmap='tab10',   # Choose any categorical colormap
        markersize=30
    )
    ax.set_title(f'Clustered Points Map - {state}')
    ax.set_axis_off()
    # Save the map
    map_path = os.path.join(output_folder, f"Clustered_Map_{state}.png")
    plt.savefig(map_path, dpi=300)
    plt.close()
    print("Clustered map saved at:", map_path)




--- Processing NM (k=4) ---


  df_plot = df_features[df['state_alpha'] == state].copy()


  Boxplots saved at: ion_si/Boxplots_by_Cluster_NM.png
Feature importance plot saved at: ion_si/Feature_Importance_NM.png
Within-cluster variance per cluster plot saved at: ion_si/Variance_Per_Feature_NM.png
Clustered map saved at: ion_si/Clustered_Map_NM.png

--- Processing TX (k=5) ---


  df_plot = df_features[df['state_alpha'] == state].copy()


Feature model_Ba_molL_frac in state TX has 18 valid points for log10 transformation.
  Boxplots saved at: ion_si/Boxplots_by_Cluster_TX.png
Feature importance plot saved at: ion_si/Feature_Importance_TX.png
Within-cluster variance per cluster plot saved at: ion_si/Variance_Per_Feature_TX.png
Clustered map saved at: ion_si/Clustered_Map_TX.png

--- Processing FL (k=4) ---
Feature model_Ba_molL_frac in state FL has 1 valid points for log10 transformation.


  df_plot = df_features[df['state_alpha'] == state].copy()


  Boxplots saved at: ion_si/Boxplots_by_Cluster_FL.png
Feature importance plot saved at: ion_si/Feature_Importance_FL.png
Within-cluster variance per cluster plot saved at: ion_si/Variance_Per_Feature_FL.png
Clustered map saved at: ion_si/Clustered_Map_FL.png

--- Processing CA (k=4) ---
Feature model_Ba_molL_frac in state CA has 3 valid points for log10 transformation.


  df_plot = df_features[df['state_alpha'] == state].copy()


  Boxplots saved at: ion_si/Boxplots_by_Cluster_CA.png
Feature importance plot saved at: ion_si/Feature_Importance_CA.png
Within-cluster variance per cluster plot saved at: ion_si/Variance_Per_Feature_CA.png
Clustered map saved at: ion_si/Clustered_Map_CA.png

--- Processing AZ (k=4) ---


  df_plot = df_features[df['state_alpha'] == state].copy()


Feature model_Ba_molL_frac in state AZ has 2 valid points for log10 transformation.
  Boxplots saved at: ion_si/Boxplots_by_Cluster_AZ.png
Feature importance plot saved at: ion_si/Feature_Importance_AZ.png
Within-cluster variance per cluster plot saved at: ion_si/Variance_Per_Feature_AZ.png
Clustered map saved at: ion_si/Clustered_Map_AZ.png


In [9]:
################# ION + PP CLUSTERING ACROSS ALL STATES ############

output_folder = "ion_pp"

# Prepare data across all states
df_features = df_ion_pp.replace([np.inf, -np.inf], np.nan).dropna()
df_cluster = df_features.drop(columns=['well_depth_ft'], errors='ignore')

X_all = df_cluster.values
scaler = StandardScaler()
X_scaled_all = scaler.fit_transform(X_all)

# 1. Elbow method (R²)
r2_values = []
k_max = min(12, len(X_scaled_all) - 1)
elbow_k_range = range(1, k_max + 1)
tot_ss = np.sum((X_scaled_all - X_scaled_all.mean(axis=0)) ** 2)
for k in elbow_k_range:
    km = KMeans(n_clusters=k, random_state=42, n_init=10)
    km.fit(X_scaled_all)
    between_ss = tot_ss - km.inertia_
    r2_values.append(between_ss / tot_ss)

plt.figure()
plt.plot(list(elbow_k_range), r2_values, marker='o')
plt.xlabel('k')
plt.ylabel('R²')
plt.title('Elbow method for k-means: All States (Ion+PP)')
plt.tight_layout()
plt.savefig(os.path.join(output_folder, "Elbow_AllStates.png"), dpi=300)
plt.close()

# 2. Silhouette method
sil_scores = []
sil_k_range = range(2, min(13, len(X_scaled_all)))
for k in sil_k_range:
    km = KMeans(n_clusters=k, random_state=42, n_init=10)
    labels = km.fit_predict(X_scaled_all)
    score = silhouette_score(X_scaled_all, labels)
    sil_scores.append(score)
    print(f"All States (Ion+PP), k={k}, silhouette score={score:.3f}")

plt.figure()
plt.plot(list(sil_k_range), sil_scores, marker='o')
plt.xlabel('k')
plt.ylabel('Silhouette Score')
plt.title('Silhouette Score vs k: All States (Ion+PP)')
plt.tight_layout()
plt.savefig(os.path.join(output_folder, "Silhouette_AllStates.png"), dpi=300)
plt.close()

k_use = 4

# 3. Final clustering and labeling for plotting
km = KMeans(n_clusters=k_use, random_state=42, n_init=10)
cluster_labels = km.fit_predict(X_scaled_all)
df_plot = df_features.copy()
df_plot['Cluster'] = cluster_labels.astype(str)

# 4. Multi-panel boxplots of ion+PP by cluster
n_features = len(df_ion_pp.columns)
n_cols = 3
n_rows = int(np.ceil(n_features / n_cols))

cluster_counts = df_plot['Cluster'].value_counts().sort_index()
# Make new labels in the form "0\n(n=73)" etc.
new_labels = [f"{cl}\n(n={cluster_counts[cl]})" for cl in sorted(cluster_counts.index, key=lambda x: int(x))]

fig, axes = plt.subplots(n_rows, n_cols, figsize=(5 * n_cols, 4 * n_rows))
axes = axes.flatten()

for idx, feature in enumerate(df_ion_pp.columns):
    ax = axes[idx]
    if feature == "model_Ba_molL_frac":
        df_filtered = df_plot[df_plot[feature] > 0].copy()
        if len(df_filtered) > 0:
            print(f"Feature {feature} has {len(df_filtered)} valid points for log10 transformation.")
            df_filtered[feature + "_log"] = np.log10(df_filtered[feature])
            sns.boxplot(
                x='Cluster',
                y=feature + "_log",
                data=df_filtered,
                ax=ax,
                order=sorted(cluster_counts.index, key=lambda x: int(x))
            )
            ax.set_title(f"log10({feature}) by Cluster")
            ax.set_ylabel(f"log10({feature})")
        else:
            sns.boxplot(x='Cluster', y=feature, data=df_plot, ax=ax, order=sorted(cluster_counts.index, key=lambda x: int(x)))
            ax.set_title(f"{feature} by Cluster")
            ax.set_ylabel(feature)
    else:
        sns.boxplot(x='Cluster', y=feature, data=df_plot, ax=ax, order=sorted(cluster_counts.index, key=lambda x: int(x)))
        ax.set_title(f"{feature} by Cluster")
        ax.set_ylabel(feature)
    ax.set_xlabel("Cluster\n(sample count)")
    # Explicitly set tick locations and labels to avoid warning
    tick_locs = range(len(new_labels))
    ax.set_xticks(tick_locs)
    ax.set_xticklabels(new_labels)
# Remove any unused axes
for i in range(n_features, len(axes)):
    fig.delaxes(axes[i])

plt.tight_layout()
plt.savefig(os.path.join(output_folder, "Boxplots_by_Cluster_AllStates.png"), dpi=300)
plt.close(fig)
print("Boxplots saved at:", os.path.join(output_folder, "Boxplots_by_Cluster_AllStates.png"))

centers = pd.DataFrame(km.cluster_centers_, columns=df_cluster.columns)
# Feature importance: range (max-min) across cluster centers
feature_importance = (centers.max(axis=0) - centers.min(axis=0)).sort_values(ascending=False)
plt.figure(figsize=(10,5))
feature_importance.plot(kind='bar')
plt.ylabel('Range across cluster centers (standardized)')
plt.title('Feature Importance in Clustering')
plt.tight_layout()
importance_path = os.path.join(output_folder, "Feature_Importance_AllStates.png")
plt.savefig(importance_path, dpi=300)
plt.close()
print("Feature importance plot saved at:", importance_path)

# Calculate within-cluster variance for each feature and each cluster
variances = []
cluster_ids = range(km.n_clusters)
for cluster_id in cluster_ids:
    cluster_data = df_cluster[cluster_labels == cluster_id]
    cluster_variance = cluster_data.var(axis=0)
    variances.append(cluster_variance)

# Combine variances into a DataFrame, indexed by features, columns by cluster
within_cluster_variance_df = pd.DataFrame(variances, index=[f"Cluster {c}" for c in cluster_ids]).T

# Plot grouped bar chart of within-cluster variances
fig, ax = plt.subplots(figsize=(15, 7))

within_cluster_variance_df.plot(kind='bar', ax=ax)

ax.set_ylabel('Variance (standardized units)')
ax.set_title('Feature Variability Within Each Cluster')
ax.legend(title='Cluster')
plt.xticks(rotation=45, ha='right')
plt.tight_layout()

variance_path = os.path.join(output_folder, "Variance_Per_Feature_AllStates.png")
plt.savefig(variance_path, dpi=300)
plt.close()

print(f"Within-cluster variance per cluster plot saved at: {variance_path}")


# 5. Map of clustered points
df_plot['dec_lat_va'] = df.loc[df_plot.index, 'dec_lat_va']
df_plot['dec_long_va'] = df.loc[df_plot.index, 'dec_long_va']

# Remove all NaN lat/long rows
df_plot = df_plot.dropna(subset=['dec_lat_va', 'dec_long_va'])


#gdf = lat_long_to_point(df_plot, lat_col='dec_lat_va', long_col='dec_long_va')
df_geo = [Point(lon, lat) for lon, lat in zip(df_plot['dec_long_va'], df_plot['dec_lat_va'])]
gdf = gpd.GeoDataFrame(df_plot, geometry=df_geo, crs='EPSG:4326')


# Now plot coloring by cluster
fig, ax = plt.subplots(figsize=(10, 10))
gdf.plot(
    ax=ax,
    column='Cluster',
    categorical=True,
    legend=True,
    cmap='tab10',   # Choose any categorical colormap
    markersize=30
)
ax.set_title('Clustered Points Map - Across All States')
ax.set_axis_off()
# Save the map
map_path = os.path.join(output_folder, "Clustered_Map_All_States.png")
plt.savefig(map_path, dpi=300)
plt.close()
print("Clustered map saved at:", map_path)

All States (Ion+PP), k=2, silhouette score=0.177
All States (Ion+PP), k=3, silhouette score=0.196
All States (Ion+PP), k=4, silhouette score=0.197
All States (Ion+PP), k=5, silhouette score=0.180
All States (Ion+PP), k=6, silhouette score=0.179
All States (Ion+PP), k=7, silhouette score=0.186
All States (Ion+PP), k=8, silhouette score=0.159
All States (Ion+PP), k=9, silhouette score=0.175
All States (Ion+PP), k=10, silhouette score=0.163
All States (Ion+PP), k=11, silhouette score=0.163
All States (Ion+PP), k=12, silhouette score=0.190
Feature model_Ba_molL_frac has 24 valid points for log10 transformation.
Boxplots saved at: ion_pp/Boxplots_by_Cluster_AllStates.png
Feature importance plot saved at: ion_pp/Feature_Importance_AllStates.png
Within-cluster variance per cluster plot saved at: ion_pp/Variance_Per_Feature_AllStates.png
Clustered map saved at: ion_pp/Clustered_Map_All_States.png


In [None]:
############### ION + PP CLUSTERING BY STATE PART 1 ###############

output_folder = "ion_pp"

n_states = len(states)
elbow_fig, elbow_axes = plt.subplots(n_states, 1, figsize=(7, 4 * n_states), constrained_layout=True)
sil_fig, sil_axes = plt.subplots(n_states, 1, figsize=(7, 4 * n_states), constrained_layout=True)


plot_idx = 0
for state in states:
    df_state = df[df[state_col] == state][df_cluster.columns].replace([np.inf, -np.inf], np.nan).dropna()

    print(f"\n--- State: {state}, Shape: {df_state.shape} ---")
    if len(df_state) < 10:
        print(f"  Not enough samples, skipping.")
        continue

    X = df_state.values
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X)


    # Elbow method (R²)
    r2_values = []
    k_max = min(12, len(X_scaled)-1)
    elbow_k_range = range(1, k_max+1)
    tot_ss = np.sum((X_scaled - X_scaled.mean(axis=0)) ** 2)
    for k in elbow_k_range:
        km = KMeans(n_clusters=k, random_state=42, n_init=10)
        km.fit(X_scaled)
        between_ss = tot_ss - km.inertia_
        r2_values.append(between_ss / tot_ss)

    elbow_axes[plot_idx].plot(list(elbow_k_range), r2_values, marker='o')
    elbow_axes[plot_idx].set_xlabel('k')
    elbow_axes[plot_idx].set_ylabel('R²')
    elbow_axes[plot_idx].set_title(f'Elbow: {state} (Ion+PP)')

    # Silhouette method
    sil_scores = []
    sil_k_range = range(2, min(13, len(X_scaled)))
    for k in sil_k_range:
        km = KMeans(n_clusters=k, random_state=42, n_init=10)
        labels = km.fit_predict(X_scaled)
        score = silhouette_score(X_scaled, labels)
        sil_scores.append(score)
        print(f"  k={k}, silhouette score={score:.3f}")

    sil_axes[plot_idx].plot(list(sil_k_range), sil_scores, marker='o')
    sil_axes[plot_idx].set_xlabel('k')
    sil_axes[plot_idx].set_ylabel('Silhouette Score')
    sil_axes[plot_idx].set_title(f'Silhouette: {state} (Ion+PP)')

    plot_idx += 1


# Save the combined figures
elbow_fig.savefig(os.path.join(output_folder, "elbow_states.png"), dpi=300)
sil_fig.savefig(os.path.join(output_folder, "silhouette_states.png"), dpi=300)
plt.close(elbow_fig)
plt.close(sil_fig)




In [10]:
############### ION + PP CLUSTERING BY STATE PART 2 ###############

output_folder = "ion_pp"

state_k_map = {
    'NM': 6,
    'TX': 4,
    'FL': 4,
    'CA': 5,
    'AZ': 3
}

for state, k_state in state_k_map.items():
    print(f"\n--- Processing {state} (k={k_state}) ---")
    df_state = df[df[state_col] == state][df_cluster.columns].replace([np.inf, -np.inf], np.nan).dropna()
    if len(df_state) < k_state:
        print(f"  Not enough samples for k={k_state}, skipping.")
        continue
    
    # Standardize features
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(df_state)

    # Cluster assignment
    km = KMeans(n_clusters=k_state, random_state=42, n_init=10)
    cluster_labels = km.fit_predict(X_scaled)
    df_plot = df_features[df['state_alpha'] == state].copy()
    df_plot['Cluster'] = cluster_labels.astype(str)
    cluster_counts = df_plot['Cluster'].value_counts().sort_index()
    # Make new labels in the form "0\n(n=73)" etc.
    new_labels = [f"{cl}\n(n={cluster_counts[cl]})" for cl in sorted(cluster_counts.index, key=lambda x: int(x))]

    # Multi-panel boxplots of features by cluster
    n_features = len(df_ion_pp.columns)
    n_cols = 3
    n_rows = int(np.ceil(n_features / n_cols))
    
    fig, axes = plt.subplots(n_rows, n_cols, figsize=(5 * n_cols, 4 * n_rows))
    axes = axes.flatten()
    for idx, feature in enumerate(df_ion_pp.columns):
        ax = axes[idx]
        if feature == "model_Ba_molL_frac":
            df_filtered = df_plot[df_plot[feature] > 0].copy()
            if len(df_filtered) > 0:
                print(f"Feature {feature} in state {state} has {len(df_filtered)} valid points for log10 transformation.")
                df_filtered[feature + "_log"] = np.log10(df_filtered[feature])
                sns.boxplot(
                    x='Cluster',
                    y=feature + "_log",
                    data=df_filtered,
                    ax=ax,
                    order=sorted(cluster_counts.index, key=lambda x: int(x))
                )
                ax.set_title(f"log10({feature}) by Cluster")
                ax.set_ylabel(f"log10({feature})")
            else:
                sns.boxplot(x='Cluster', y=feature, data=df_plot, ax=ax, order=sorted(cluster_counts.index, key=lambda x: int(x)))
                ax.set_title(f"{feature} by Cluster")
                ax.set_ylabel(feature)
        else:
            sns.boxplot(x='Cluster', y=feature, data=df_plot, ax=ax, order=sorted(cluster_counts.index, key=lambda x: int(x)))
            ax.set_title(f"{feature} by Cluster")
        ax.set_xlabel("Cluster\n(sample count)")
        # Explicitly set tick locations and labels to avoid warning
        tick_locs = range(len(new_labels))
        ax.set_xticks(tick_locs)
        ax.set_xticklabels(new_labels)
    for i in range(n_features, len(axes)):
        fig.delaxes(axes[i])
    plt.tight_layout()
    plt.savefig(os.path.join(output_folder, f"Boxplots_by_Cluster_{state}.png"), dpi=300)
    plt.close(fig)
    print(f"  Boxplots saved at: {output_folder}/Boxplots_by_Cluster_{state}.png")

    centers = pd.DataFrame(km.cluster_centers_, columns=df_state.columns)
    # Feature importance: range (max-min) across cluster centers
    feature_importance = (centers.max(axis=0) - centers.min(axis=0)).sort_values(ascending=False)
    plt.figure(figsize=(10,5))
    feature_importance.plot(kind='bar')
    plt.ylabel('Range across cluster centers (standardized)')
    plt.title(f'Feature Importance in Clustering for {state}')
    plt.tight_layout()
    importance_path = os.path.join(output_folder, f"Feature_Importance_{state}.png")
    plt.savefig(importance_path, dpi=300)
    plt.close()
    print("Feature importance plot saved at:", importance_path)

    # Calculate within-cluster variance for each feature and each cluster
    variances = []
    within_cluster_variance_df = pd.DataFrame()
    cluster_ids = range(km.n_clusters)
    for cluster_id in cluster_ids:
        cluster_data = df_state[cluster_labels == cluster_id]
        cluster_variance = cluster_data.var(axis=0)
        variances.append(cluster_variance)

    # Combine variances into a DataFrame, indexed by features, columns by cluster
    within_cluster_variance_df = pd.DataFrame(variances, index=[f"Cluster {c}" for c in cluster_ids]).T

    # Plot grouped bar chart of within-cluster variances
    fig, ax = plt.subplots(figsize=(15, 7))

    within_cluster_variance_df.plot(kind='bar', ax=ax)

    ax.set_ylabel('Variance (standardized units)')
    ax.set_title(f'Feature Variability Within Each Cluster - {state}')
    ax.legend(title='Cluster')
    plt.xticks(rotation=45, ha='right')
    plt.tight_layout()

    variance_path = os.path.join(output_folder,f"Variance_Per_Feature_{state}.png")
    plt.savefig(variance_path, dpi=300)
    plt.close()

    print(f"Within-cluster variance per cluster plot saved at: {variance_path}")
  


    df_plot['dec_lat_va'] = df.loc[df[df['state_alpha'] == state].index, 'dec_lat_va']
    df_plot['dec_long_va'] = df.loc[df[df['state_alpha'] == state].index, 'dec_long_va']

    # Remove all NaN lat/long rows
    df_plot = df_plot.dropna(subset=['dec_lat_va', 'dec_long_va'])


    #gdf = lat_long_to_point(df_plot, lat_col='dec_lat_va', long_col='dec_long_va')
    df_geo = [Point(lon, lat) for lon, lat in zip(df_plot['dec_long_va'], df_plot['dec_lat_va'])]
    gdf = gpd.GeoDataFrame(df_plot, geometry=df_geo, crs='EPSG:4326')


    # Now plot coloring by cluster
    fig, ax = plt.subplots(figsize=(10, 10))
    gdf.plot(
        ax=ax,
        column='Cluster',
        categorical=True,
        legend=True,
        cmap='tab10',   # Choose any categorical colormap
        markersize=30
    )
    ax.set_title(f'Clustered Points Map - {state}')
    ax.set_axis_off()
    # Save the map
    map_path = os.path.join(output_folder, f"Clustered_Map_{state}.png")
    plt.savefig(map_path, dpi=300)
    plt.close()
    print("Clustered map saved at:", map_path)





--- Processing NM (k=6) ---
  Boxplots saved at: ion_pp/Boxplots_by_Cluster_NM.png
Feature importance plot saved at: ion_pp/Feature_Importance_NM.png
Within-cluster variance per cluster plot saved at: ion_pp/Variance_Per_Feature_NM.png
Clustered map saved at: ion_pp/Clustered_Map_NM.png

--- Processing TX (k=4) ---
Feature model_Ba_molL_frac in state TX has 18 valid points for log10 transformation.
  Boxplots saved at: ion_pp/Boxplots_by_Cluster_TX.png
Feature importance plot saved at: ion_pp/Feature_Importance_TX.png
Within-cluster variance per cluster plot saved at: ion_pp/Variance_Per_Feature_TX.png
Clustered map saved at: ion_pp/Clustered_Map_TX.png

--- Processing FL (k=4) ---
Feature model_Ba_molL_frac in state FL has 1 valid points for log10 transformation.
  Boxplots saved at: ion_pp/Boxplots_by_Cluster_FL.png
Feature importance plot saved at: ion_pp/Feature_Importance_FL.png
Within-cluster variance per cluster plot saved at: ion_pp/Variance_Per_Feature_FL.png
Clustered map sa