In [1]:
%load_ext lab_black
# python internal
import os
import socket
import sys
import pyrosetta

# Record where this notebook was executed: working directory first, then host name.
for describe in (os.getcwd, socket.gethostname):
    print(describe())

/mnt/home/cdemakis/share/pymatcher/example_notebooks
dig64


In [2]:
from pyrosetta.distributed import requires_init
from pyrosetta.distributed.packed_pose.core import PackedPose


@requires_init
def wrapper(packed_pose_in: PackedPose, **kwargs):
    """Run ``pymatcher.matcher`` with the task keyword arguments.

    Args:
        packed_pose_in: Input pose supplied by the caller. It is accepted so
            the function has the protocol signature, but it is not used here.
        **kwargs: Task keyword arguments, forwarded unchanged to
            ``pm.matcher``.

    Returns:
        Whatever ``pm.matcher(**kwargs)`` returns (per the notebook text, a
        list of poses).
    """
    # Imported locally so the function does not depend on notebook globals
    # being present in the process that executes it.
    import os
    import sys

    # pymatcher is expected to live one directory above the working directory.
    # Only register that directory once: this function runs once per task, and
    # appending unconditionally would grow sys.path with duplicates.
    pymatcher_parent = os.path.abspath("..")
    if pymatcher_parent not in sys.path:
        sys.path.append(pymatcher_parent)
    import pymatcher as pm

    return pm.matcher(**kwargs)

`pyrosetta.distributed` writes each output pose as a `.pdb.bz2` file in the `output_path` directory.  Each `.pdb.bz2` file, as well as `scores.json`, contains an extra score called `matcher_data`: a dict holding all of the information normally available in the Matcher output filenames.

To reduce disk I/O, consider extending the `wrapper` function so that it manipulates the list of poses that `pymatcher` returns.  Alternatively, define additional functions that each take a `PackedPose` and `**kwargs`, and add them to the `protocols` list that is distributed.  `pyrosetta.distributed` will pass each pose in the list returned by `pymatcher`, together with the kwargs defined in `create_tasks`, to the second protocol.

In [3]:
from dask.distributed import Client
from dask_jobqueue import SLURMCluster
import logging
import pwd
from pyrosetta.distributed.cluster.core import PyRosettaCluster


# Show the SSH port-forwarding command for the current user and host
# (remote port 8787 is presumably the Dask dashboard -- confirm for your setup).
print(
    "run the following from your local terminal:",
    f"ssh -L 8000:localhost:8787 {pwd.getpwuid(os.getuid()).pw_name}@{socket.gethostname()}",
    sep="\n",
)


def create_tasks(selected, options):
    """Yield one task dictionary per scaffold name in ``selected``.

    Args:
        selected: Iterable of scaffold base names. For each name ``scaf`` the
            task points at ``in/{scaf}.pos`` and ``in/{scaf}.pdb``.
        options: General options shared by every task; stored unchanged (the
            same object, not a copy) under the ``"extra_options"`` key.

    Yields:
        dict: A task with the keys ``"options"``, ``"extra_options"``,
        ``"-s"`` and ``"-lig_name"``.
    """
    # Iterate over the argument rather than a notebook-level global so the
    # generator honours whatever scaffold list the caller passes in.
    for scaf in selected:
        yield {
            # job-specific options
            "options": f"-match:scaffold_active_site_residues in/{scaf}.pos",
            # general options
            "extra_options": options,
            # other kwargs
            "-s": f"in/{scaf}.pdb",
            "-lig_name": "lig",
        }


logging.basicConfig(level=logging.INFO)

# Scaffold base names; create_tasks derives in/<name>.pos and in/<name>.pdb from them.
scafs = ["test1", "test2"]

# General Rosetta flags shared by every task (passed to create_tasks below).
options = {
    "-out:level 300",
    "-in:file:extra_res_fa in/lig.params",
    "-geometric_constraint_file in/test.cst",
    "-match::dynamic_grid_refinement true",
    "-match::enumerate_ligand_rotamers true",
    "-match::consolidate_matches true",
    "-match::output_matches_per_group 10",
    "-in:ignore_unrecognized_res",
    "-ex1",
    "-ex2",
    "-chemical:exclude_patches D_AA",
    "-match::euclid_bin_size 0.5",
    "-match::euler_bin_size 5.0",
}

output_path = os.path.join(os.getcwd(), "out")

# Work on a copy: `client_opts = options` would alias the set, so adding the
# client-only seed flag would also change the `options` handed to the tasks.
client_opts = set(options)
client_opts.add(
    "-run:constant_seed 1",
)
# A set of strings has no stable iteration order across interpreter runs;
# sorting keeps the init flag string reproducible.
pyrosetta.distributed.init(" ".join(sorted(client_opts)))

if __name__ == "__main__":
    # Configure the SLURM cluster as a context manager so it is shut down on exit.
    # Each job requests 1 core / 1 process / 1 CPU and 8GB of memory on the
    # "short" queue with a 2 hour walltime; logs are written to ./logs.
    with SLURMCluster(
        cores=1,
        processes=1,
        job_cpu=1,
        memory="8GB",
        queue="short",
        walltime="02:00:00",
        death_timeout=120,
        local_directory="$TMPDIR/dask",
        log_directory="logs",
        extra=["--lifetime", "2h", "--lifetime-stagger", "4m"],
    ) as cluster:
        # Print the generated SLURM submission script for inspection.
        print(cluster.job_script())
        # Adaptive scaling; with minimum == maximum == 1 this is currently
        # pinned to exactly one worker.
        cluster.adapt(
            minimum=1,
            maximum=1,  # TODO: still 1; the original note mentioned scaling up to 1020 workers
            wait_count=999,  # Number of consecutive times that a worker should be suggested for removal before it is removed
            interval="5s",  # Time between checks
        )
        # Set up a client to interact with the cluster, also as a context manager.
        with Client(cluster) as client:
            print(client)
            # One task per scaffold (see create_tasks); `wrapper` is the only protocol.
            # Note that scratch_dir and output_path point at the same directory.
            PyRosettaCluster(
                tasks=create_tasks(scafs, options),
                client=client,
                scratch_dir=output_path,
                output_path=output_path,
                sha1=None,  # ignore git status for non-production runs
            ).distribute(protocols=[wrapper])

INFO:pyrosetta.rosetta:Found rosetta database at: /home/cdemakis/.conda/envs/from_phil/lib/python3.8/site-packages/pyrosetta/database; using it....
INFO:pyrosetta.rosetta:PyRosetta-4 2021 [Rosetta PyRosetta4.conda.linux.cxx11thread.serialization.CentOS.python38.Release 2021.27+release.7ce64884a77d606b7b667c363527acc846541030 2021-07-09T18:10:05] retrieved from: http://www.pyrosetta.org
(C) Copyright Rosetta Commons Member Institutions. Created in JHU by Sergey Lyskov and PyRosetta Team.


https://docs.anaconda.com/anaconda/install

run the following from your local terminal:
ssh -L 8000:localhost:8787 cdemakis@dig64


`conda env export --prefix /home/cdemakis/.conda/envs/from_phil > environment.yml`
to reproduce this simulation later.


#!/usr/bin/env bash

#SBATCH -J dask-worker
#SBATCH -e logs/dask-worker-%J.err
#SBATCH -o logs/dask-worker-%J.out
#SBATCH -p short
#SBATCH -n 1
#SBATCH --cpus-per-task=1
#SBATCH --mem=8G
#SBATCH -t 02:00:00

JOB_ID=${SLURM_JOB_ID%;*}

/home/cdemakis/.conda/envs/from_phil/bin/python -m distributed.cli.dask_worker tcp://172.16.131.94:39609 --nthreads 1 --memory-limit 7.45GiB --name name --nanny --death-timeout 120 --local-directory $TMPDIR/dask --lifetime 2h --lifetime-stagger 4m

<Client: 'tcp://172.16.131.94:39609' processes=0 threads=0, memory=0 B>


Task was destroyed but it is pending!
task: <Task pending name='Task-999' coro=<AdaptiveCore.adapt() done, defined at /home/cdemakis/.conda/envs/from_phil/lib/python3.8/site-packages/distributed/deploy/adaptive_core.py:178> wait_for=<Future pending cb=[<TaskWakeupMethWrapper object at 0x7fc0a4cd3c70>()]> cb=[IOLoop.add_future.<locals>.<lambda>() at /home/cdemakis/.conda/envs/from_phil/lib/python3.8/site-packages/tornado/ioloop.py:688]>


In [4]:
def read_scorefile(scores):
    """Load a line-delimited JSON scorefile into a single DataFrame.

    Every non-blank line is parsed on its own with ``pandas.read_json`` and
    transposed, so the top-level keys of a line become row labels and the
    nested keys become columns. The per-line frames are concatenated in file
    order.

    Args:
        scores: Path to the scorefile (one JSON document per line).

    Returns:
        pandas.DataFrame: The concatenated records, or an empty DataFrame when
        the file contains no records.
    """
    import io

    import pandas as pd

    try:
        from tqdm import tqdm
    except ImportError:
        # The progress bar is cosmetic; parsing should still work without it.
        def tqdm(iterable):
            return iterable

    frames = []
    with open(scores, "r") as f:
        for line in tqdm(f):
            # Skip blank lines (e.g. a trailing newline) instead of failing to parse them.
            if not line.strip():
                continue
            # Wrap in StringIO: passing a literal JSON string to read_json is
            # deprecated in recent pandas releases.
            frames.append(pd.read_json(io.StringIO(line)).T)
    if not frames:
        # pd.concat raises on an empty list; an empty frame is the natural result.
        return pd.DataFrame()
    return pd.concat(frames)


# Tabulate the scorefile from the "out" directory (relative to the working directory).
scores = read_scorefile("out/scores.json")

27it [00:00, 130.35it/s]


In [10]:
scores

Unnamed: 0,matcher_data
/mnt/home/cdemakis/share/pymatcher/example_notebooks/out/decoys/0000/2021.08.17.15.08.55.853153_dab46edb325849fb90423ed11a2c87cb.pdb.bz2,"{""match_group"": 1, ""theozyme"": {""16"": ""ARG"", ""..."
/mnt/home/cdemakis/share/pymatcher/example_notebooks/out/decoys/0000/2021.08.17.15.08.55.853153_37d06cb6c83b44a3a266503b4306a69f.pdb.bz2,"{""match_group"": 2, ""theozyme"": {""27"": ""ARG"", ""..."
/mnt/home/cdemakis/share/pymatcher/example_notebooks/out/decoys/0000/2021.08.17.15.08.55.853153_5b5d3002049e4d1182943bcd71c0a1b1.pdb.bz2,"{""match_group"": 3, ""theozyme"": {""28"": ""ARG"", ""..."
/mnt/home/cdemakis/share/pymatcher/example_notebooks/out/decoys/0000/2021.08.17.15.08.55.853153_3909375fb4d144bf91ad84b376b41ab9.pdb.bz2,"{""match_group"": 4, ""theozyme"": {""28"": ""ARG"", ""..."
/mnt/home/cdemakis/share/pymatcher/example_notebooks/out/decoys/0000/2021.08.17.15.08.55.853153_c6d6990c88544e639e0ba71612e12635.pdb.bz2,"{""match_group"": 5, ""theozyme"": {""36"": ""ARG"", ""..."
/mnt/home/cdemakis/share/pymatcher/example_notebooks/out/decoys/0000/2021.08.17.15.08.55.853153_0ea3e775f5ef4d9cbb339bf03b73be6e.pdb.bz2,"{""match_group"": 6, ""theozyme"": {""51"": ""ARG"", ""..."
/mnt/home/cdemakis/share/pymatcher/example_notebooks/out/decoys/0000/2021.08.17.15.08.55.853153_930627e1ec234470a116c537d3797068.pdb.bz2,"{""match_group"": 7, ""theozyme"": {""55"": ""ARG"", ""..."
/mnt/home/cdemakis/share/pymatcher/example_notebooks/out/decoys/0000/2021.08.17.15.08.55.853153_875d9a54f9d842418b9bece8f17eabad.pdb.bz2,"{""match_group"": 8, ""theozyme"": {""55"": ""ARG"", ""..."
/mnt/home/cdemakis/share/pymatcher/example_notebooks/out/decoys/0000/2021.08.17.15.08.55.853153_72d6eddf8c8c45f1bccff6496e694c58.pdb.bz2,"{""match_group"": 9, ""theozyme"": {""60"": ""ARG"", ""..."
/mnt/home/cdemakis/share/pymatcher/example_notebooks/out/decoys/0000/2021.08.17.15.08.55.853153_933956f6cad742a69a79c93ba17a449c.pdb.bz2,"{""match_group"": 10, ""theozyme"": {""60"": ""ARG"", ..."
