In [None]:
%load_ext autoreload
%autoreload 2
%matplotlib inline
import pandas as pd
from pathlib import Path
from mdgraph.data.preprocess import parallel_preprocess
from mdgraph.data.utils import concatenate_h5, parse_h5

In [None]:
# Input trajectories: every entry in the directory whose name ends in "dcd",
# in sorted order so the index-based output names below are stable.
traj_dir = Path("/homes/heng.ma/Research/FoldingTraj/1FME-0/1FME-0-protein/")
traj_files = list(traj_dir.glob("*dcd"))
traj_files.sort()

# The same topology file is paired with every trajectory.
topology_files = [
    "/homes/abrace/src/pytorch-geometric-sandbox/test/data/1FME-unfolded.pdb"
    for _ in traj_files
]
ref_topology = "/homes/abrace/src/pytorch-geometric-sandbox/test/data/1FME-folded.pdb"

# One output H5 path per trajectory, numbered by position in traj_files.
save_files = [
    f"/homes/abrace/tmp/test_pyG_preprocess/test_{i}.h5"
    for i, _ in enumerate(traj_files)
]
concatenated_save_file = "/homes/abrace/src/pytorch-geometric-sandbox/test/data/BBA-full.h5"

In [None]:
# Preprocess all trajectories with 20 workers. The four positional lists/paths
# are the topology, trajectory, reference-topology and output-file arguments
# built in the previous cell; presumably one H5 file is written per trajectory
# (save_files has one entry per trajectory) — confirm against parallel_preprocess.
parallel_preprocess(
    topology_files,
    traj_files,
    ref_topology,
    save_files,
    cutoff=8.0,  # NOTE(review): looks like a contact distance cutoff; units not visible here — confirm
    selection="protein and name CA",  # atom selection string; presumably alpha carbons only — confirm
    print_every=10000,
    num_workers=20,
)

In [None]:
# Combine the per-trajectory H5 files into the single file at concatenated_save_file
# (behavior inferred from the helper's name and arguments — confirm in mdgraph.data.utils).
concatenate_h5(save_files, concatenated_save_file)

In [None]:
# Load only the "contact_map_values" field from the concatenated file.
# Field names that were previously listed here and are currently disabled:
# "contact_map", "rmsd", "fnc", "point_cloud", "amino_acids".
data = parse_h5(
    concatenated_save_file,
    fields=["contact_map_values"]
)

In [None]:
# Pull the loaded contact-map values out of the parsed result.
vals = data["contact_map_values"]

In [None]:
import math
import numpy as np
import pandas as pd
# Join the sequence of contact-map value arrays into a single array.
# Read from `data` instead of rebinding `vals` from itself: the previous form
# failed on a second run of this cell because np.concatenate was then applied
# to the already-flattened array.
vals = np.concatenate(data["contact_map_values"])

In [None]:
# Swap exact zeros for 1 so the reciprocal and the logarithm below never see a zero.
nonzero_vals = np.where(vals == 0, 1, vals)

inverse_vals = 1 / nonzero_vals
log_inverse_vals = np.log(inverse_vals)
neglog = -1 * np.log(nonzero_vals)

# Collect the raw values and each transform as columns for side-by-side plotting.
df = pd.DataFrame(
    {
        "vals": vals,
        "inverse_vals": inverse_vals,
        "log_inverse_vals": log_inverse_vals,
        "neglog": neglog,
    }
)

In [None]:
# Histogram of the raw contact-map values (default binning).
df["vals"].hist()

In [None]:
# Histogram of the reciprocal values, 100 bins.
df["inverse_vals"].hist(bins=100)

In [None]:
# Histogram of log(1 / vals), 100 bins.
df["log_inverse_vals"].hist(bins=100)

In [None]:
# Plot the reciprocal values against the DataFrame row index.
df["inverse_vals"].plot()

In [None]:
# Histogram of -log(vals), 100 bins. Mathematically this is the same quantity
# as the "log_inverse_vals" column, computed without the intermediate reciprocal.
df["neglog"].hist(bins=100)

# Test Data

- Individual preprocessed H5 files: `/lambda_stor/homes/heng.ma/Research/FoldingTraj/biophys_analysis/traj_analysis/h5_save/1FME-1/`

In [None]:
# Test split (1FME-1): directory of per-file H5 inputs and the combined output path.
path = "/lambda_stor/homes/heng.ma/Research/FoldingTraj/biophys_analysis/traj_analysis/h5_save/1FME-1/"
concatenated_save_file = "/homes/abrace/src/pytorch-geometric-sandbox/test/data/1FME-1.h5"

# Gather every *.h5 file in the directory, in sorted order.
h5_files = list(Path(path).glob("*.h5"))
h5_files.sort()

In [None]:
# Combine the H5 files collected in h5_files into concatenated_save_file
# (behavior inferred from the helper's name and arguments — confirm in mdgraph.data.utils).
concatenate_h5(h5_files, concatenated_save_file)

# Train Data

- Individual preprocessed H5 files: `/lambda_stor/homes/heng.ma/Research/FoldingTraj/biophys_analysis/traj_analysis/h5_save/1FME-0/`

In [None]:
# Train split (1FME-0): directory of per-file H5 inputs and the combined output path.
path = "/lambda_stor/homes/heng.ma/Research/FoldingTraj/biophys_analysis/traj_analysis/h5_save/1FME-0/"
concatenated_save_file = "/homes/abrace/src/pytorch-geometric-sandbox/test/data/1FME-0.h5"

# Gather every *.h5 file in the directory, in sorted order.
h5_files = list(Path(path).glob("*.h5"))
h5_files.sort()

In [None]:
# Combine the H5 files collected in h5_files into concatenated_save_file
# (behavior inferred from the helper's name and arguments — confirm in mdgraph.data.utils).
concatenate_h5(h5_files, concatenated_save_file)

# Stream-AI-MD Data

In [None]:
# Stream-AI-MD run: directory of per-file H5 inputs and the combined output path.
path = "/homes/abrace/data/bba/deepdrivemd_runs/bba_28_cs1.2/bba_28_cs1.2_h5"
concatenated_save_file = "/homes/abrace/src/pytorch-geometric-sandbox/test/data/1FME-stream-ai-md.h5"

# Gather every *.h5 file in the directory, in sorted order.
h5_files = list(Path(path).glob("*.h5"))
h5_files.sort()

In [None]:
# Combine the H5 files collected in h5_files into concatenated_save_file
# (behavior inferred from the helper's name and arguments — confirm in mdgraph.data.utils).
concatenate_h5(h5_files, concatenated_save_file)

In [None]:
import h5py
from mdtools.writers import write_contact_map, write_point_cloud, write_rmsd, write_fraction_of_contacts
def compute_subset(data: dict, every_n_frames: int = 1) -> dict:
    """Subsample every field of ``data`` with a fixed stride.

    Parameters
    ----------
    data : dict
        Mapping of field name to a sliceable sequence (anything that
        supports ``seq[::step]``).
    every_n_frames : int, default 1
        Stride to keep; ``1`` keeps every entry, ``5`` keeps entries
        0, 5, 10, ...

    Returns
    -------
    dict
        New dict with the same keys, each value sliced as
        ``value[::every_n_frames]``.

    Raises
    ------
    ValueError
        If ``every_n_frames`` is less than 1. A zero stride is invalid for
        slicing, and a negative stride would silently reverse the frames.
    """
    if every_n_frames < 1:
        raise ValueError(f"every_n_frames must be >= 1, got {every_n_frames}")
    return {field: values[::every_n_frames] for field, values in data.items()}

def write_h5(data: dict, path: str):
    """Write the contact maps and RMSD values in ``data`` to an H5 file at ``path``.

    ``data`` must provide the keys ``"contact_map"`` and ``"rmsd"``. Each
    contact map is reshaped to two rows; the first row is passed on as the
    row indices and the second as the column indices. Point-cloud and
    fraction-of-contacts output is currently not written.
    """
    # Split every contact map into its (row, col) halves.
    split_maps = [contact_map.reshape(2, -1) for contact_map in data["contact_map"]]
    rows = [halves[0] for halves in split_maps]
    cols = [halves[1] for halves in split_maps]

    # Open in write mode with SWMR explicitly disabled.
    with h5py.File(path, "w", swmr=False) as h5_file:
        write_contact_map(h5_file, rows, cols)
        write_rmsd(h5_file, data["rmsd"])

In [None]:
# Reload the contact maps and RMSD from the most recently assigned
# concatenated_save_file (the Stream-AI-MD file when cells run top to bottom).
# Note that this rebinds `data`, replacing the dict loaded earlier in the notebook.
data = parse_h5(concatenated_save_file, fields=["contact_map", "rmsd"])

In [None]:
# Keep every 5th entry of each loaded field.
subset_5 = compute_subset(data, 5)

In [None]:
# Save the subsampled contact maps and RMSD values to their own H5 file.
write_h5(subset_5, "/homes/abrace/src/pytorch-geometric-sandbox/test/data/1FME-stream-ai-md-subset-5.h5")