In [2]:
import h5py
import matplotlib
import matplotlib.pyplot as plt
import os
import glob
import mdtraj as md
import numpy as np
from tqdm import tqdm
from westpa_helpers import *

In [3]:
# Name of the protein system under analysis.
protein = "chignolin"
# Later cells build the run path and the trajectory file-name pattern from
# `protein_name`; keep both spellings bound to the same value so the notebook
# survives Restart & Run All.
protein_name = protein

In [None]:
# Change this to the root of the run
# NOTE: the path is built from `protein_name`, which must already be defined
# in the notebook namespace when this cell runs.
home_path=f"/media/DATA_18_TB_1/awaghili/WESTPA_CG/harmonic_mix_{protein_name}_2d/westpa_prop"
# All subsequent relative paths are resolved against the run root.
os.chdir(home_path)

sim_config = extract_simulation_config()

# Will be needed to unpickle the TICA models
import sys
# Guard the append so re-running this cell does not pile duplicate entries
# onto sys.path.
if sim_config['cgschnet_path'] not in sys.path:
    sys.path.append(sim_config['cgschnet_path'])

In [None]:
import re

# Paths of the trajectory files to load; starts empty each time the cell runs.
coordinate_files = list()

def extract_number(filename):
    """Return the numeric index embedded in a ``<protein_name>_<N>.h5`` file name.

    Intended as a sort key: the pattern is searched anywhere in ``filename``
    (so full paths work), using the notebook-level ``protein_name``.

    Parameters:
    - filename (str): File name or path to inspect.

    Returns:
    - int: The captured index ``N`` when the pattern matches.
    - float: ``float('inf')`` when it does not match, so such files sort last.
    """
    # re.escape makes the protein name match literally even when it contains
    # regex metacharacters such as '+' or '.'.
    match = re.search(fr'{re.escape(protein_name)}_(\d+)\.h5', filename)
    return int(match.group(1)) if match else float('inf')


# Expand the glob pattern into a list, ordered by the numeric index that
# extract_number pulls out of each file name (non-matching names sort last).
expanded_files = sorted(glob.glob(
    home_path + "/combined_trajs/*.h5" #Change this to *_all.h5 if you only want the final result
), key=extract_number)

# Fail fast with a clear message instead of passing an empty file list on to
# the loading and analysis cells.
if not expanded_files:
    raise FileNotFoundError(
        f"No .h5 trajectory files found in {home_path}/combined_trajs"
    )

# Append the matched files to the coordinate_files list
coordinate_files.extend(expanded_files)

In [None]:
# Load the TICA model from the path recorded in the simulation config.
model = load_tica_model(sim_config["tica_model_path"])

# Load the trajectories listed in coordinate_files.
# NOTE(review): size_limit presumably caps how much data is read; confirm
# its exact meaning in westpa_helpers.load_trajectories.
coordinate_list, label_list = load_trajectories(
    coordinate_files,
    size_limit=1_000_000,
)

In [None]:
import numpy as np
from sklearn.neighbors import KernelDensity

def generate_energy_based_bins(data, num_bins=15, bandwidth=0.2, oversample=5, alpha=0.5):
    """
    Generate rectilinear bin edges from a KDE-weighted partition of the data range.

    A Gaussian KDE is evaluated on a uniform grid spanning [min(data), max(data)].
    Each grid point gets the weight ``alpha / (density + 1e-12) + (1 - alpha) * density``;
    edges are placed where the cumulative weight crosses equally spaced levels,
    so regions with larger weight receive more edges.

    Parameters:
    - data (array-like): 1D TICA projection (flattened to a single column).
    - num_bins (int): Target number of bins. Yields at most ``num_bins - 1``
      finite interior edges; fewer when several cutoffs fall on the same
      (rounded) grid value.
    - bandwidth (float): KDE bandwidth for density estimation.
    - oversample (int): Multiplier for intermediate binning resolution; the
      grid has ``num_bins * oversample`` points.
    - alpha (float): Blend factor between inverse density (``alpha = 1``,
      favours sparsely sampled regions) and density (``alpha = 0``, favours
      densely sampled regions).

    Returns:
    - bin_edges (list): Strictly increasing edges, starting with ``-np.inf``
      and ending with ``np.inf``; interior edges are rounded to 2 decimals.

    Raises:
    - ValueError: If ``data`` is empty, or ``num_bins`` / ``oversample`` is < 1.
    """
    # Accept lists/tuples as well as arrays; KDE expects a column vector.
    samples = np.asarray(data, dtype=float).reshape(-1, 1)
    if samples.size == 0:
        raise ValueError("data must contain at least one sample")
    if num_bins < 1 or oversample < 1:
        raise ValueError("num_bins and oversample must both be >= 1")

    # 1. KDE estimate on a uniform grid over the observed range
    kde = KernelDensity(kernel='gaussian', bandwidth=bandwidth).fit(samples)
    grid = np.linspace(samples.min(), samples.max(), num_bins * oversample).reshape(-1, 1)
    log_density = kde.score_samples(grid)
    density = np.exp(log_density)

    # 2. Blended weights: combine density and inverse-density
    #    (the 1e-12 offset keeps the inverse finite where density is ~0)
    inv_density = 1 / (density + 1e-12)
    blend_weights = alpha * inv_density + (1 - alpha) * density
    norm_weights = blend_weights / blend_weights.sum()

    # 3. Compute cumulative weights and find equally spaced cutoffs
    cdf = np.cumsum(norm_weights)
    cdf /= cdf[-1]
    edges_idx = np.searchsorted(cdf, np.linspace(0, 1, num_bins + 1))
    # Defensive clamp so an index can never run past the end of the grid.
    edges_idx = np.minimum(edges_idx, len(grid) - 1)

    # 4. Map those indices back to bin edges; the first and last cutoffs are
    #    replaced by the open-ended -inf/inf boundaries.
    interior_edges = np.round(grid[edges_idx].flatten()[1:-1], 2)
    # Several cutoffs can land on the same grid point (or collapse after
    # rounding); drop repeats so the edges stay strictly increasing.
    interior_edges = np.unique(interior_edges)

    return [-np.inf] + list(interior_edges) + [np.inf]


In [None]:
# TICA component indices to evaluate (0 and 1).
components_to_calculate = range(0, 2)
# Padding value for the TICA plots.
tica_plot_pad = 0.2

# Thermodynamic constants.
kB = 1.9872041e-3 # kcal/mol/Kelvin
Temp = 300 # Kelvin

In [None]:
# Compute the selected TICA component values for every loaded trajectory.
component_values = [
    calculate_component_values(model, coords, components_to_calculate)
    for coords in tqdm(coordinate_list)
]
# Inspection expression: keys of the first result and per-trajectory lengths.
# NOTE(review): as it is not the last expression of the cell, it is not
# displayed under default notebook settings.
component_values[0].keys(), [[len(v) for v in c.values()] for c in component_values]

# Global extrema of each component across all trajectories.
component_maxs = [
    max(max(per_traj[comp]) for per_traj in component_values)
    for comp in components_to_calculate
]
component_mins = [
    min(min(per_traj[comp]) for per_traj in component_values)
    for comp in components_to_calculate
]
# One (min, max) pair per component.
component_range = list(zip(component_mins, component_maxs))