In [3]:
import pandas as pd
import seaborn as sns
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import silhouette_score
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import geopandas as gpd
import rasterio
import requests
import zipfile
import io
import os
import shutil
from shapely.geometry import Point
import pyproj


################### DATA PROCESSING ###################

df = pd.read_csv('data_states_processed.csv')
#print(df.head())

# Filter for wells with depth less than or equal to 3000 ft
df = df[df['well_depth_ft'] <= 3000]

# Define ions by group

cations = ['model_Ba_molL', 'model_Ca_molL', 'model_Fe_molL', 'model_K_molL',
           'model_Li_molL', 'model_Mg_molL', 'model_Mn_molL', 'model_Na_molL', 'model_Sr_molL']
# Include Fe2+ as cation

anions = ['model_Br_molL', 'model_Cl_molL', 'model_C_molL', 'model_SO4_molL']  # Treat model_C as HCO3-

neutral_ions = ['model_B_molL', 'model_Si_molL']  # assumed neutral species

# Charges dictionary for equivalent calculation (charge per mole)
charges = {
    'model_Ba_molL': 2,
    'model_Ca_molL': 2,
    'model_Fe_molL': 2,
    'model_K_molL': 1,
    'model_Li_molL': 1,
    'model_Mg_molL': 2,
    'model_Mn_molL': 2,
    'model_Na_molL': 1,
    'model_Sr_molL': 2,
    'model_Br_molL': 1,
    'model_Cl_molL': 1,
    'model_C_molL': 1,
    'model_SO4_molL': 2
}

# --- Calculate equivalents for cations and anions ---

for ion in cations + anions:
    df[ion + '_eq'] = df[ion] * charges[ion]

# --- Calculate totals ---

df['total_cations_eq'] = df[[c + '_eq' for c in cations]].sum(axis=1)
df['total_anions_eq'] = df[[a + '_eq' for a in anions]].sum(axis=1)

# Total ions in mol/L from all charged equivalents converted back to mol/L plus neutral ions
    
df['total_ion_molL'] = (
    df[[ion + '_eq' for ion in cations + anions]].div(
        [charges[i] for i in cations + anions], axis=1
    ).sum(axis=1)
    + df[neutral_ions].sum(axis=1)
)

# --- Calculate fractions ---

# For ions belonging to cations and anions, fraction based on total equivalents
for ion in cations:
    df[ion + '_frac'] = np.where(df['total_cations_eq'] != 0,
                                df[ion + '_eq'] / df['total_cations_eq'], 
                                np.nan)

for ion in anions:
    df[ion + '_frac'] = np.where(df['total_anions_eq'] != 0,
                                df[ion + '_eq'] / df['total_anions_eq'], 
                                np.nan)

# For neutral ions (B, Si), fraction based on total ions in mol/L (including neutral)
for ion in neutral_ions:
    df[ion + '_frac'] = np.where(df['total_ion_molL'] != 0,
                                df[ion] / df['total_ion_molL'], 
                                np.nan)

# --- Log10 transform TDS ---

df['TDS_log10'] = np.log10(df['model_TDS_mgL'])

# --- Select only requested columns in the new df_ions DataFrame ---

frac_columns = [ion + '_frac' for ion in cations + anions + neutral_ions]
selected_columns = frac_columns + ['TDS_log10'] + ['well_depth_ft'] + ['total_ion_molL'] + ['total_cations_eq'] + ['total_anions_eq']

df_ions = df[selected_columns].copy()
# Print columns of df_ions
print("Columns in df_ions:", df_ions.columns.tolist()) 

print("DataFrame df_ions shape:", df_ions.shape)

# Columns you want to add and standardize
extra_si_cols = ['model_si_Calcite', 'model_si_Barite', 'model_si_Chalcedony', 'model_si_Gypsum']

# Extract these columns from df
df_extra = df[extra_si_cols].copy()

# Combine df_ions and the standardized extra columns into df_ion_si
df_ion_si = pd.concat([df_ions, df_extra], axis=1)

print("Columns in df_ion_si:", df_ion_si.columns.tolist()) 

print("DataFrame df_ion_si shape:", df_ion_si.shape)

# List of the new "pp" columns to add and standardize
extra_pp_cols = [
    'model_pp_Calcite_open_2x',
    'model_pp_Calcite_open_4x',
    'model_pp_Calcite_open_8x',
    'model_pp_Barite_open_2x',
    'model_pp_Barite_open_4x',
    'model_pp_Barite_open_8x',
    'model_pp_Chalcedony_open_2x',
    'model_pp_Chalcedony_open_4x',
    'model_pp_Chalcedony_open_8x',
    'model_pp_Gypsum_open_2x',
    'model_pp_Gypsum_open_4x',
    'model_pp_Gypsum_open_8x'
]

# Extract these columns from df
df_pp_extra = df[extra_pp_cols].copy()

# Concatenate df_ions with the standardized pp columns to form df_ion_pp
df_ion_pp = pd.concat([df_ions, df_pp_extra], axis=1)

print("Columns in df_ion_pp:", df_ion_pp.columns.tolist()) 

print("DataFrame df_ion_pp shape:", df_ion_pp.shape)


Columns in df_ions: ['model_Ba_molL_frac', 'model_Ca_molL_frac', 'model_Fe_molL_frac', 'model_K_molL_frac', 'model_Li_molL_frac', 'model_Mg_molL_frac', 'model_Mn_molL_frac', 'model_Na_molL_frac', 'model_Sr_molL_frac', 'model_Br_molL_frac', 'model_Cl_molL_frac', 'model_C_molL_frac', 'model_SO4_molL_frac', 'model_B_molL_frac', 'model_Si_molL_frac', 'TDS_log10', 'well_depth_ft', 'total_ion_molL', 'total_cations_eq', 'total_anions_eq']
DataFrame df_ions shape: (4733, 20)
Columns in df_ion_si: ['model_Ba_molL_frac', 'model_Ca_molL_frac', 'model_Fe_molL_frac', 'model_K_molL_frac', 'model_Li_molL_frac', 'model_Mg_molL_frac', 'model_Mn_molL_frac', 'model_Na_molL_frac', 'model_Sr_molL_frac', 'model_Br_molL_frac', 'model_Cl_molL_frac', 'model_C_molL_frac', 'model_SO4_molL_frac', 'model_B_molL_frac', 'model_Si_molL_frac', 'TDS_log10', 'well_depth_ft', 'total_ion_molL', 'total_cations_eq', 'total_anions_eq', 'model_si_Calcite', 'model_si_Barite', 'model_si_Chalcedony', 'model_si_Gypsum']
DataFrame

  df = pd.read_csv('data_states_processed.csv')


In [4]:
################### ION CLUSTERING PER STATE ###################

states = df['state_alpha'].unique()
print("States found in the dataset:", states)

n_states = len(states)

output_folder = "ion"

df_cluster = df_ions.copy()
df_cluster = df_cluster.replace([np.inf, -np.inf], np.nan).dropna()
# remove well_depth_ft from df_ions
df_cluster = df_cluster.drop(columns=['well_depth_ft', 'total_ion_molL', 'total_cations_eq', 'total_anions_eq'], errors='ignore')
 
# Standardization
scaler = StandardScaler()
X_scaled_all = scaler.fit_transform(df_cluster)

# Create one figure for all elbow plots and one for all silhouette plots
fig_elbow, axes_elbow = plt.subplots(n_states, 1, figsize=(7, 4 * n_states), constrained_layout=True)
fig_sil, axes_sil = plt.subplots(n_states, 1, figsize=(7, 4 * n_states), constrained_layout=True)



for idx, state in enumerate(states):
    X_state = X_scaled_all[df['state_alpha'] == state]
    print(f"\nState: {state}, Shape: {X_state.shape}")

    # Standardize per state
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X_state)

    # Elbow
    r2_values = []
    k_max = min(12, len(X_state) - 1) 
    elbow_k_range = range(1, k_max+1)
    tot_ss = np.sum((X_scaled - X_scaled.mean(axis=0)) ** 2)
    for k in elbow_k_range:
        km = KMeans(n_clusters=k, random_state=42, n_init=10)
        km.fit(X_scaled)
        between_ss = tot_ss - km.inertia_
        r2_values.append(between_ss / tot_ss)

    axes_elbow[idx].plot(list(elbow_k_range), r2_values, marker='o')
    axes_elbow[idx].set_xlabel('k')
    axes_elbow[idx].set_ylabel('R²')
    axes_elbow[idx].set_title(f'Elbow: {state}')

    # Silhouette
    sil_scores = []
    sil_k_range = range(2, min(13, len(X_state)))  
    for k in sil_k_range:
        km = KMeans(n_clusters=k, random_state=42, n_init=10)
        labels = km.fit_predict(X_scaled)
        score = silhouette_score(X_scaled, labels)
        sil_scores.append(score)
        print(f"State={state}, k={k}, silhouette score={score:.3f}")

    axes_sil[idx].plot(list(sil_k_range), sil_scores, marker='o')
    axes_sil[idx].set_xlabel('k')
    axes_sil[idx].set_ylabel('Silhouette Score')
    axes_sil[idx].set_title(f'Silhouette: {state}')

# Save the combined figures
fig_elbow.savefig(os.path.join(output_folder, "states_elbow.png"), dpi=300)
fig_sil.savefig(os.path.join(output_folder, "states_silhouette.png"), dpi=300)
plt.close(fig_elbow)
plt.close(fig_sil)


States found in the dataset: ['NM' 'TX' 'FL' 'CA' 'AZ']

State: NM, Shape: (159, 16)
State=NM, k=2, silhouette score=0.266
State=NM, k=3, silhouette score=0.268
State=NM, k=4, silhouette score=0.257
State=NM, k=5, silhouette score=0.268
State=NM, k=6, silhouette score=0.262
State=NM, k=7, silhouette score=0.271
State=NM, k=8, silhouette score=0.315
State=NM, k=9, silhouette score=0.274
State=NM, k=10, silhouette score=0.299
State=NM, k=11, silhouette score=0.306
State=NM, k=12, silhouette score=0.271

State: TX, Shape: (2327, 16)
State=TX, k=2, silhouette score=0.255
State=TX, k=3, silhouette score=0.213
State=TX, k=4, silhouette score=0.235
State=TX, k=5, silhouette score=0.237
State=TX, k=6, silhouette score=0.240
State=TX, k=7, silhouette score=0.249
State=TX, k=8, silhouette score=0.206
State=TX, k=9, silhouette score=0.213
State=TX, k=10, silhouette score=0.213
State=TX, k=11, silhouette score=0.204
State=TX, k=12, silhouette score=0.210

State: FL, Shape: (376, 16)
State=FL, k=2,

In [5]:
########## ION CLUSTERING BY STATE PART 2 ##########

state_k_map = {
    "NM": 3,
    "TX": 6,
    "FL": 5,
    "CA": 3,
    "AZ": 4
}
additional_columns = [
    'ph', 'Ca_mgL', 'Mg_mgL', 'SO4_mgL', 'Na_mgL', 'Cl_mgL', 'K_mgL', 'Si_mgL'
]
# Select these columns from the original df
df_additional = df[additional_columns]


df_NM = df[df['state_alpha'] == "NM"].copy()
df_NM['Cluster'] = np.nan
df_TX = df[df['state_alpha'] == "TX"].copy()
df_TX['Cluster'] = np.nan
df_FL = df[df['state_alpha'] == "FL"].copy()
df_FL['Cluster'] = np.nan
df_CA = df[df['state_alpha'] == "CA"].copy()
df_CA['Cluster'] = np.nan
df_AZ = df[df['state_alpha'] == "AZ"].copy()
df_AZ['Cluster'] = np.nan



for state, k_state in state_k_map.items():
    df_state = df_cluster[df['state_alpha'] == state].copy()
    X_scaled_state = X_scaled_all[df['state_alpha'] == state]

    # Cluster assignment
    km_state = KMeans(n_clusters=k_state, random_state=42, n_init=10)
    cluster_labels = km_state.fit_predict(X_scaled_state)
    df_plot_state = df_ions[df['state_alpha'] == state].copy()
    df_plot_state['Cluster'] = cluster_labels.astype(str)

    # 1. BOX PLOTS FOR EACH ION FEATURE BY CLUSTER
    cluster_counts = df_plot_state['Cluster'].value_counts().sort_index()
    # Make new labels in the form "0\n(n=73)" etc.
    new_labels = [f"{cl}\n(n={cluster_counts[cl]})" for cl in sorted(cluster_counts.index, key=lambda x: int(x))]

    # Make a copy of df_plot columns for plotting features
    plot_features = list(df_ions.columns)
    
    # Boxplot grid
    n_features = len(df_ions.columns)
    n_cols = 3
    n_rows = int(np.ceil(n_features / n_cols))
    

    fig, axes = plt.subplots(n_rows, n_cols, figsize=(5 * n_cols, 4 * n_rows))
    axes = axes.flatten()
    for idx, feature in enumerate(df_ions.columns):
        ax = axes[idx]
        # Add count of points in each cluster
        if feature == "model_Ba_molL_frac":
            # Plot log10 of the feature "on the fly" filtering out <= 0 values to avoid log errors
            df_filtered = df_plot_state[df_plot_state[feature] > 0].copy()
            if len(df_filtered) > 0:
                print(f"Feature {feature} in state {state} has {len(df_filtered)} valid points for log10 transformation.")
                df_filtered[feature + "_log"] = np.log10(df_filtered[feature])
                do_log_plot = True

                sns.boxplot(
                    x='Cluster',
                    y=feature + "_log",
                    data=df_filtered,
                    ax=ax,
                    order=sorted(cluster_counts.index, key=lambda x: int(x))
                )
                ax.set_title(f"log10({feature}) by Cluster")
                ax.set_ylabel(f"log10({feature})")

            else:
                sns.boxplot(
                    x='Cluster',
                    y=feature,
                    data=df_plot_state,
                    ax=ax,
                    order=sorted(cluster_counts.index, key=lambda x: int(x))
                )
                ax.set_title(f"{feature} by Cluster")
                ax.set_ylabel(feature)

        else:
            sns.boxplot(
                x='Cluster',
                y=feature,
                data=df_plot_state,
                ax=ax,
                order=sorted(cluster_counts.index, key=lambda x: int(x))
            )
            ax.set_title(f"{feature} by Cluster")
        #sns.boxplot(x='Cluster', y=feature, data=df_plot_state, ax=ax)

        ax.set_xlabel("Cluster\n(sample count)")
        # Explicitly set tick locations and labels to avoid warning
        tick_locs = range(len(new_labels))
        ax.set_xticks(tick_locs)
        ax.set_xticklabels(new_labels)
        
    for i in range(n_features, len(axes)):
        fig.delaxes(axes[i])

    plt.tight_layout()
    plt.savefig(os.path.join(output_folder, f"Boxplots_by_Cluster_{state}.png"), dpi=300)
    plt.close(fig)
    print(f"Boxplots saved at: {output_folder}/Boxplots_by_Cluster_{state}.png")
    print(f"Number of cluster points: {state}, k={k_state}, Cluster labels shape: {cluster_labels.shape}")
    # Print out number of points in each cluster
    unique, counts = np.unique(cluster_labels, return_counts=True)
    cluster_counts = dict(zip(unique, counts))
    print(f"Cluster counts for {state}: {cluster_counts}")

    centers = pd.DataFrame(km_state.cluster_centers_, columns=df_state.columns)
    # Feature importance: range (max-min) across cluster centers
    feature_importance = (centers.max(axis=0) - centers.min(axis=0)).sort_values(ascending=False)
    plt.figure(figsize=(10,5))
    feature_importance.plot(kind='bar')
    plt.ylabel('Range across cluster centers (standardized)')
    plt.title(f'Feature Importance in Clustering for {state}')
    plt.tight_layout()
    importance_path = os.path.join(output_folder, f"Feature_Importance_{state}.png")
    plt.savefig(importance_path, dpi=300)
    plt.close()
    print("Feature importance plot saved at:", importance_path)
    # Calculate within-cluster variance for each feature and each cluster
    variances = []
    within_cluster_variance_df = pd.DataFrame()
    cluster_ids = range(km_state.n_clusters)
    for cluster_id in cluster_ids:
        cluster_data = df_state[cluster_labels == cluster_id]
        cluster_variance = cluster_data.var(axis=0)
        variances.append(cluster_variance)

    # Combine variances into a DataFrame, indexed by features, columns by cluster
    within_cluster_variance_df = pd.DataFrame(variances, index=[f"Cluster {c}" for c in cluster_ids]).T

    # Plot grouped bar chart of within-cluster variances
    fig, ax = plt.subplots(figsize=(15, 7))

    within_cluster_variance_df.plot(kind='bar', ax=ax)

    ax.set_ylabel('Variance (standardized units)')
    ax.set_title(f'Feature Variability Within Each Cluster - {state}')
    ax.legend(title='Cluster')
    plt.xticks(rotation=45, ha='right')
    plt.tight_layout()

    variance_path = os.path.join(output_folder,f"Variance_Per_Feature_{state}.png")
    plt.savefig(variance_path, dpi=300)
    plt.close()

    print(f"Within-cluster variance per cluster plot saved at: {variance_path}")

    df_plot_state['dec_lat_va'] = df.loc[df[df['state_alpha'] == state].index, 'dec_lat_va']
    df_plot_state['dec_long_va'] = df.loc[df[df['state_alpha'] == state].index, 'dec_long_va']

    # Remove all NaN lat/long rows
    df_plot_state = df_plot_state.dropna(subset=['dec_lat_va', 'dec_long_va'])


    #gdf = lat_long_to_point(df_plot, lat_col='dec_lat_va', long_col='dec_long_va')
    df_geo = [Point(lon, lat) for lon, lat in zip(df_plot_state['dec_long_va'], df_plot_state['dec_lat_va'])]
    gdf = gpd.GeoDataFrame(df_plot_state, geometry=df_geo, crs='EPSG:4326')


    # Now plot coloring by cluster
    fig, ax = plt.subplots(figsize=(10, 10))
    gdf.plot(
        ax=ax,
        column='Cluster',
        categorical=True,
        legend=True,
        cmap='tab10',   # Choose any categorical colormap
        markersize=30
    )
    ax.set_title(f'Clustered Points Map - {state}')
    ax.set_axis_off()
    # Save the map
    map_path = os.path.join(output_folder, f"Clustered_Map_{state}.png")
    plt.savefig(map_path, dpi=300)
    plt.close()
    print("Clustered map saved at:", map_path)

    # Print df_plot_state columns for debugging
    print("Columns in df_plot_state:", df_plot_state.columns.tolist())

    df_plot_state['Cluster'] = df_plot_state['Cluster'].astype(str)
    #Remove columns dec_lat_va and dec_long_va to avoid clutter
    df_plot_state = df_plot_state.drop(columns=['dec_lat_va', 'dec_long_va'])

    # Create the pairplot
    #pairplot = sns.pairplot(
    #    df_plot_state[pairplot_features + ['Cluster']],  # Features plus cluster column
    #    hue='Cluster',
    #    corner=True,             # Show only lower triangle
    #    plot_kws={'alpha': 0.7, 's': 25},  # Adjust point transparency/size
    #    palette='tab10'
    #)

    #pairplot.fig.suptitle('Cluster Pairplot: Ion Dataset', y=1.01)
    #pairplot_path = os.path.join(output_folder, f"Cluster_Pairplot_{state}.png")
    #plt.savefig(pairplot_path, dpi=300, bbox_inches='tight')
    #plt.close()

    # Assign cluster labels as integers (not as strings) for clarity
    df_ions_with_cluster = df_plot_state.copy()
    # Join additional columns by index to preserve alignment
    df_ions_with_cluster = df_ions_with_cluster.join(df_additional, how='left')
    # Save to CSV
    csv_path = os.path.join(output_folder, f"df_ions_{state}.csv")
    df_ions_with_cluster.to_csv(csv_path, index=False)



Boxplots saved at: ion/Boxplots_by_Cluster_NM.png
Number of cluster points: NM, k=3, Cluster labels shape: (159,)
Cluster counts for NM: {np.int32(0): np.int64(106), np.int32(1): np.int64(50), np.int32(2): np.int64(3)}
Feature importance plot saved at: ion/Feature_Importance_NM.png
Within-cluster variance per cluster plot saved at: ion/Variance_Per_Feature_NM.png
Clustered map saved at: ion/Clustered_Map_NM.png
Columns in df_plot_state: ['model_Ba_molL_frac', 'model_Ca_molL_frac', 'model_Fe_molL_frac', 'model_K_molL_frac', 'model_Li_molL_frac', 'model_Mg_molL_frac', 'model_Mn_molL_frac', 'model_Na_molL_frac', 'model_Sr_molL_frac', 'model_Br_molL_frac', 'model_Cl_molL_frac', 'model_C_molL_frac', 'model_SO4_molL_frac', 'model_B_molL_frac', 'model_Si_molL_frac', 'TDS_log10', 'well_depth_ft', 'total_ion_molL', 'total_cations_eq', 'total_anions_eq', 'Cluster', 'dec_lat_va', 'dec_long_va']
Feature model_Ba_molL_frac in state TX has 18 valid points for log10 transformation.
Boxplots saved at:

In [None]:
########### MEDOID CALCULATION ###########

from scipy.spatial.distance import cdist
import pandas as pd
import numpy as np
import os



output_folder = "ion"
states = ["NM", "TX", "FL", "CA", "AZ"]

# Define only desired clusters per state
state_clusters_map = {
    "AZ": [1, 2],
    "CA": [0, 1, 2],
    "FL": [0, 1, 2, 4],
    "NM": [0, 1],
    "TX": [0, 1, 2, 3]
}
medoid_indices = []
medoid_compositions = []
for state in states:
    csv_path = os.path.join(output_folder, f"df_ions_{state}.csv")
    df_state = pd.read_csv(csv_path)

    cluster_labels = df_state['Cluster'].astype(int).values
    desired_clusters = state_clusters_map[state]

    for cluster_id in desired_clusters:
        mask = (cluster_labels == cluster_id)
        cluster_points = df_state.loc[mask]
        if cluster_points.shape[0] == 0:
            print(f"Cluster {cluster_id} in state {state} has 0 points, skipping.")
            continue

        numeric_columns = cluster_points.select_dtypes(include=[np.number]).columns.difference(additional_columns)
        features = cluster_points[numeric_columns].values

        distances = cdist(features, features, metric='euclidean')
        total_distances = distances.sum(axis=1)
        medoid_idx_local = np.argmin(total_distances)
        medoid_idx_global = cluster_points.index[medoid_idx_local]

        medoid_indices.append({'state': state, 'cluster': cluster_id, 'index': medoid_idx_global})
        medoid_compositions.append({
            'state': state,
            'cluster': cluster_id,
            **df_state.loc[medoid_idx_global].to_dict()
        })

    medoids_df = pd.DataFrame(medoid_compositions)
    medoids_df['model_TDS_mgL'] = 10 ** medoids_df['TDS_log10']
    # For cations
    for ion in cations:
        medoids_df[ion + '_eq'] = medoids_df[ion + '_frac'] * medoids_df['total_cations_eq']
        medoids_df[ion] = medoids_df[ion + '_eq'] / charges[ion]

    # For anions
    for ion in anions:
        medoids_df[ion + '_eq'] = medoids_df[ion + '_frac'] * medoids_df['total_anions_eq']
        medoids_df[ion] = medoids_df[ion + '_eq'] / charges[ion]

    # For neutrals
    for ion in neutral_ions:
        medoids_df[ion] = medoids_df[ion + '_frac'] * medoids_df['total_ion_molL']

    cols_to_drop = [col for col in medoids_df.columns if col.endswith('_eq') or col.endswith('_frac')]
    medoids_df = medoids_df.drop(columns=cols_to_drop)
    
    medoids_csv = os.path.join(output_folder, f"medoids_{state}.csv")
    medoids_df.to_csv(medoids_csv, index=False)
    print(f"Medoids for {state} saved at: {medoids_csv}")


Medoids for NM saved at: ion/medoids_NM.csv
Medoids for TX saved at: ion/medoids_TX.csv
Medoids for FL saved at: ion/medoids_FL.csv
Medoids for CA saved at: ion/medoids_CA.csv
Medoids for AZ saved at: ion/medoids_AZ.csv
