In [4]:
### IMPORTS & SETUP ###
import numpy as np
import mdtraj as md
import os

In [7]:
# User-defined parameters
raw_directory_path = '/home/bfd21/rds/hpc-work/sample_macrocycle_md/raw/N-Cap2'  # Raw trajectory directory with .xtc files
gro_file_path = '/home/bfd21/rds/hpc-work/sample_macrocycle_md/raw/simulation_prep/N-Cap2/9.gro'  # Path to the .gro file
splits = {'train': 0.8, 'test': 0.1}  # Train and test fractions; the validation set receives whatever remains
equilibrium_frac = 0.2  # Trailing fraction of each trajectory that is kept (treated as equilibrated)

save_dir = '/home/bfd21/rds/hpc-work/sample_macrocycle_md/N-Cap2' # where to save the final, processed, data.

# The per-frame random rotations and the shuffle before splitting both draw from
# NumPy's global RNG. Seeding it here makes a fresh "Restart & Run All" reproduce
# the same processed data set and the same train/test/val partition.
random_seed = 0
np.random.seed(random_seed)

In [8]:
# Location of the topology PDB that is written below and then reused when loading every .xtc file
pdb_file_path = os.path.join(save_dir, 'system.pdb')
# Create the output directory up front; exist_ok avoids an error when it is already there
os.makedirs(save_dir, exist_ok=True)

# Function to generate a random rotation matrix
def random_rotation_matrix(rng=None):
    """Return a 3x3 rotation matrix drawn uniformly from SO(3).

    A 4D standard-normal vector is normalised to a unit quaternion, which is
    uniformly distributed on the 3-sphere, and then converted to a rotation
    matrix. Sampling a uniform axis together with a uniform angle in
    [0, 2*pi) is *not* uniform over rotations: it over-represents small
    rotations (the expected trace is 1 instead of 0).

    Parameters
    ----------
    rng : numpy.random.Generator or numpy.random.RandomState, optional
        Source of randomness. When omitted, NumPy's global RNG is used, so
        ``np.random.seed(...)`` still controls the output.

    Returns
    -------
    numpy.ndarray
        Array of shape (3, 3), orthogonal with determinant +1.
    """
    sampler = np.random if rng is None else rng

    quat = sampler.normal(size=4)
    norm = np.linalg.norm(quat)
    # A (near-)zero vector cannot be normalised; redraw in that vanishingly rare case.
    while norm < 1e-12:
        quat = sampler.normal(size=4)
        norm = np.linalg.norm(quat)
    w, x, y, z = quat / norm

    # Standard unit-quaternion -> rotation-matrix conversion.
    rot_matrix = np.array([
        [1 - 2 * (y * y + z * z), 2 * (x * y - z * w), 2 * (x * z + y * w)],
        [2 * (x * y + z * w), 1 - 2 * (x * x + z * z), 2 * (y * z - x * w)],
        [2 * (x * z - y * w), 2 * (y * z + x * w), 1 - 2 * (x * x + y * y)],
    ])
    return rot_matrix

# Step 1: Convert .gro file to .pdb
# The structure is re-saved as a PDB so that one file can serve as the topology
# for every .xtc trajectory loaded below.
print("Converting .gro file to .pdb...")
traj = md.load(gro_file_path)
traj.save_pdb(pdb_file_path)
print(f"Saved PDB file to {pdb_file_path}")

# Step 2: Process .xtc files using the PDB topology
all_data = []  # One (n_kept_frames, n_atoms, 3) array per trajectory

# os.listdir returns entries in arbitrary order; sorting gives a stable
# processing order, which is required for seeded runs to be reproducible.
for file_name in sorted(os.listdir(raw_directory_path)):
    if not file_name.endswith(".xtc"):
        continue

    file_path = os.path.join(raw_directory_path, file_name)
    print(f"Processing file: {file_name}")

    # Load trajectory data using PDB topology
    traj = md.load(file_path, top=pdb_file_path)  # Load .xtc with PDB topology
    num_frames = traj.xyz.shape[0]  # traj.xyz has shape (n_frames, n_atoms, 3)

    # Select only equilibrium portion: keep the last equilibrium_frac of the frames.
    # The slice is copied on purpose: a plain view would keep the whole
    # trajectory array alive for as long as it sits in all_data, even though
    # most of its frames are discarded.
    start_idx = int((1 - equilibrium_frac) * num_frames)
    MD_positions = traj.xyz[start_idx:].copy()

    # Centre every frame on the unweighted mean atom position (the geometric
    # centroid; atomic masses are not used, so this is not a mass-weighted CoM).
    MD_positions -= MD_positions.mean(axis=1, keepdims=True)

    # Apply an independent random rotation to each frame. Coordinates are row
    # vectors, so rotating every atom by R is X @ R.T. Writing through frame[:]
    # updates MD_positions in place and keeps its dtype.
    for frame in MD_positions:
        frame[:] = frame @ random_rotation_matrix().T

    # Append processed positions (no flattening!)
    all_data.append(MD_positions)

# Fail here with a clear message instead of later inside np.concatenate.
if not all_data:
    raise FileNotFoundError(f"No .xtc files found in {raw_directory_path}")



Converting .gro file to .pdb...
Saved PDB file to /home/bfd21/rds/hpc-work/sample_macrocycle_md/N-Cap2/system.pdb
Processing file: traj883.xtc
Processing file: traj611.xtc
Processing file: traj479.xtc
Processing file: traj532.xtc
Processing file: traj524.xtc
Processing file: traj938.xtc
Processing file: traj224.xtc
Processing file: traj405.xtc
Processing file: traj450.xtc
Processing file: traj484.xtc
Processing file: traj787.xtc
Processing file: traj53.xtc
Processing file: traj384.xtc
Processing file: traj144.xtc
Processing file: traj956.xtc
Processing file: traj243.xtc
Processing file: traj71.xtc
Processing file: traj464.xtc
Processing file: traj195.xtc
Processing file: traj139.xtc
Processing file: traj695.xtc
Processing file: traj894.xtc
Processing file: traj656.xtc
Processing file: traj21.xtc
Processing file: traj580.xtc
Processing file: traj212.xtc
Processing file: traj862.xtc
Processing file: traj749.xtc
Processing file: traj789.xtc
Processing file: traj891.xtc
Processing file: tr

In [9]:
# Concatenate all data
# After the processing cell, all_data is a list of per-trajectory arrays. The
# isinstance guard keeps this cell safe to re-run: concatenating the already
# combined 3D array a second time would treat it as a sequence of 2D frames and
# silently produce a (n_samples * n_atoms, 3) array.
if isinstance(all_data, list):
    all_data = np.concatenate(all_data, axis=0)  # Combine all processed data
print(f"Total processed data shape: {all_data.shape}")  # Shape: (n_samples, n_atoms, 3)

# Shuffle data (in place, along the first axis only, so each frame stays intact)
np.random.shuffle(all_data)

# Split data into train, test, and val
train_frac = splits['train']
test_frac = splits['test']
val_frac = 1 - (train_frac + test_frac)  # Validation receives the remainder
# Small tolerance so fractions that sum to 1 up to float rounding are accepted.
if train_frac < 0 or test_frac < 0 or val_frac < -1e-9:
    raise ValueError(
        f"Invalid split fractions: train={train_frac}, test={test_frac} "
        "(both must be non-negative and sum to at most 1)"
    )

num_samples = all_data.shape[0]
train_end = int(train_frac * num_samples)
test_end = train_end + int(test_frac * num_samples)

train_data = all_data[:train_end]  # Shape: (train_samples, n_atoms, 3)
test_data = all_data[train_end:test_end]  # Shape: (test_samples, n_atoms, 3)
val_data = all_data[test_end:]  # Shape: (val_samples, n_atoms, 3)

# The three slices must partition the data set exactly.
assert len(train_data) + len(test_data) + len(val_data) == num_samples

# Save the splits as .npz files (positions stored under the key 'positions')
split_data = {'train': train_data, 'test': test_data, 'val': val_data}
for role, data in split_data.items():
    save_path = os.path.join(save_dir, f'processed_{role}.npz')
    np.savez(save_path, positions=data)
    print(f"Saved {role} data to {save_path} with shape {data.shape}")

Total processed data shape: (517409, 206, 3)
Saved train data to /home/bfd21/rds/hpc-work/sample_macrocycle_md/N-Cap2/processed_train.npz with shape (413927, 206, 3)
Saved test data to /home/bfd21/rds/hpc-work/sample_macrocycle_md/N-Cap2/processed_test.npz with shape (51740, 206, 3)
Saved val data to /home/bfd21/rds/hpc-work/sample_macrocycle_md/N-Cap2/processed_val.npz with shape (51742, 206, 3)
