Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

rf2aa as a Python module #78

Closed
wants to merge 4 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 6 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,12 @@ cd rf2aa/SE3Transformer/
pip3 install --no-cache-dir -r requirements.txt
python3 setup.py install
cd ../../

```
Change the default `checkpoint_path` and database paths in `RoseTTAFold-All-Atom/rf2aa/config/inference/base.yaml`,
then install `rf2aa` as a Python module that can be called from anywhere.
```shell
pip install . --no-dependencies --no-cache-dir
```
4. Configure signalp6 after downloading a licensed copy of it from https://services.healthtech.dtu.dk/services/SignalP-6.0/
```
Expand Down
14 changes: 14 additions & 0 deletions install_dependencies.sh
Original file line number Diff line number Diff line change
Expand Up @@ -20,3 +20,17 @@ echo "Downloading cs-blast ..."
wget http://wwwuser.gwdg.de/~compbiol/data/csblast/releases/csblast-2.2.3_${platform}.tar.gz -O csblast-2.2.3.tar.gz
mkdir -p csblast-2.2.3
tar xf csblast-2.2.3.tar.gz -C csblast-2.2.3 --strip-components=1

mv csblast-2.2.3 $CONDA_PREFIX/share/

rm -f csblast-2.2.3.tar.gz

# https://github.com/baker-laboratory/RoseTTAFold-All-Atom/issues/5#issuecomment-1990991606
# https://github.com/RosettaCommons/RoseTTAFold/issues/13#issuecomment-1405850297
echo "Downloading blast ..."
wget https://ftp.ncbi.nlm.nih.gov/blast/executables/legacy.NOTSUPPORTED/2.2.26/blast-2.2.26-x64-linux.tar.gz
mkdir -p blast-2.2.26
tar -xf blast-2.2.26-x64-linux.tar.gz -C blast-2.2.26

mv blast-2.2.26 $CONDA_PREFIX/share/
rm -f blast-2.2.26-x64-linux.tar.gz
22 changes: 22 additions & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
[build-system]
requires = ["poetry-core>=1.0.0,<2.0.0"]
build-backend = "poetry.core.masonry.api"

[tool.poetry]
name = "rfaa"
version = "1.0.0"
description = "RoseTTAFold-All-Atom code"
authors = ["Name <email@address>"]

readme = "README.md"
license = "MIT"
repository = "https://github.com/baker-laboratory/RoseTTAFold-All-Atom"
classifiers = [
"Topic :: Scientific/Engineering :: Biochemistry",
"Topic :: Scientific/Engineering :: Protein Engineering"
]

packages = [
{ include = "rf2aa" },
{ include = "rf2aa/*.py" },
]
11 changes: 7 additions & 4 deletions rf2aa/config/inference/base.yaml
Original file line number Diff line number Diff line change
@@ -1,12 +1,15 @@
job_name: "structure_prediction"
output_path: ""
checkpoint_path: RFAA_paper_weights.pt
output_path: "output/structure_prediction"
checkpoint_path: /mnt/db/weights/RoseTTAFoldAA/RFAA_paper_weights.pt
database_params:
sequencedb: ""
hhdb: "pdb100_2021Mar03/pdb100_2021Mar03"
DB_UR30: "/mnt/db/uniref30_uc30/UniRef30_2022_02/UniRef30_2022_02"
DB_BFD: "/mnt/db/bfd/bfd_metaclust_clu_complete_id30_c90_final_seq.sorted_opt"
DB_PDB100: "/mnt/db/pdb100/pdb100_2021Mar03/pdb100_2021Mar03"
command: make_msa.sh
num_cpus: 4
mem: 64

force_cpu: False # set to True to run on the CPU even when CUDA is available (e.g. after a CUDA out-of-memory error)
protein_inputs: null
na_inputs: null
sm_inputs: null
Expand Down
14 changes: 11 additions & 3 deletions rf2aa/data/preprocessing.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,8 @@
import subprocess

#from rf2aa.run_inference import ModelRunner
script_dir=os.path.dirname(os.path.abspath(__file__)) #RoseTTAFold-All-Atom/rf2aa/data
msa_script_dir=os.path.abspath(os.path.join(script_dir, '..','input_prep')) #RoseTTAFold-All-Atom/rf2aa/input_prep


def make_msa(
Expand All @@ -17,19 +19,25 @@ def make_msa(
out_dir.mkdir(parents=True, exist_ok=True)

command = model_runner.config.database_params.command
search_base = model_runner.config.database_params.sequencedb

# sequence databases
DB_UR30=model_runner.config.database_params.DB_UR30
DB_BFD=model_runner.config.database_params.DB_BFD
num_cpus = model_runner.config.database_params.num_cpus
ram_gb = model_runner.config.database_params.mem
template_database = model_runner.config.database_params.hhdb
template_database = model_runner.config.database_params.DB_PDB100

out_a3m = out_dir / "t000_.msa0.a3m"
out_atab = out_dir / "t000_.atab"
out_hhr = out_dir / "t000_.hhr"
if out_a3m.exists() and out_atab.exists() and out_hhr.exists():
return out_a3m, out_hhr, out_atab

search_command = f"./{command} {fasta_file} {out_dir} {num_cpus} {ram_gb} {search_base} {template_database}"
search_command = f"{msa_script_dir}/{command} {os.path.abspath(fasta_file)} {os.path.abspath(out_dir)} {num_cpus} {ram_gb} {DB_UR30} {DB_BFD} {template_database}"
print(search_command)
_ = subprocess.run(search_command, shell=True)

if _.returncode != 0:
raise RuntimeError(f"Failed to execute command {search_command}")
return out_a3m, out_hhr, out_atab

21 changes: 14 additions & 7 deletions make_msa.sh → rf2aa/input_prep/make_msa.sh
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
#!/bin/bash
set -e

# inputs
in_fasta="$1"
Expand All @@ -8,17 +9,23 @@ out_dir="$2"
CPU="$3"
MEM="$4"

export BLAST_LEGACY=$CONDA_PREFIX/share/blast-2.2.26/blast-2.2.26
export BLASTMAT=$BLAST_LEGACY/data/
export BLAST_LEGACY_BIN=$BLAST_LEGACY/bin

export PATH=$BLAST_LEGACY_BIN:$PATH

# sequence databases
DB_UR30="$5"
DB_BFD="$6"

# template database
DB_TEMPL="$5"
DB_TEMPL="$7"

# current script directory (i.e., pipe directory)
SCRIPT=`realpath -s $0`
export PIPE_DIR=`dirname $SCRIPT`

# sequence databases
DB_UR30="$PIPE_DIR/uniclust/UniRef30_2021_06"
DB_BFD="$PIPE_DIR/bfd/bfd_metaclust_clu_complete_id30_c90_final_seq.sorted_opt"

# Running signalP 6.0
mkdir -p $out_dir/signalp
tmp_dir="$out_dir/signalp"
Expand All @@ -30,7 +37,7 @@ then
fi

# setup hhblits command
export HHLIB=/software/hhsuite/build/bin/
export HHLIB=$(dirname $(which hhblits))
export PATH=$HHLIB:$PATH
HHBLITS_UR30="hhblits -o /dev/null -mact 0.35 -maxfilt 100000000 -neffmax 20 -cov 25 -cpu $CPU -nodiff -realign_max 100000000 -maxseq 1000000 -maxmem $MEM -n 4 -d $DB_UR30"
HHBLITS_BFD="hhblits -o /dev/null -mact 0.35 -maxfilt 100000000 -neffmax 20 -cov 25 -cpu $CPU -nodiff -realign_max 100000000 -maxseq 1000000 -maxmem $MEM -n 4 -d $DB_BFD"
Expand Down Expand Up @@ -113,7 +120,7 @@ fi

echo "Running PSIPRED"
mkdir -p $out_dir/log
$PIPE_DIR/input_prep/make_ss.sh $out_dir/t000_.msa0.a3m $out_dir/t000_.ss2 > $out_dir/log/make_ss.stdout 2> $out_dir/log/make_ss.stderr
$PIPE_DIR/make_ss.sh $out_dir/t000_.msa0.a3m $out_dir/t000_.ss2 > $out_dir/log/make_ss.stdout 2> $out_dir/log/make_ss.stderr

if [ ! -s $out_dir/t000_.hhr ]
then
Expand Down
6 changes: 5 additions & 1 deletion input_prep/make_ss.sh → rf2aa/input_prep/make_ss.sh
100644 → 100755
Original file line number Diff line number Diff line change
@@ -1,21 +1,25 @@
#!/bin/bash
# From: https://github.com/RosettaCommons/RoseTTAFold

set -e

DATADIR="$CONDA_PREFIX/share/psipred_4.01/data"
csblast_dir="$CONDA_PREFIX/share/csblast-2.2.3"
echo $DATADIR

i_a3m="$1"
o_ss="$2"

ID=$(basename $i_a3m .a3m).tmp

$PIPE_DIR/csblast-2.2.3/bin/csbuild -i $i_a3m -I a3m -D $PIPE_DIR/csblast-2.2.3/data/K4000.crf -o $ID.chk -O chk
$csblast_dir/bin/csbuild -i $i_a3m -I a3m -D $csblast_dir/data/K4000.crf -o $ID.chk -O chk

head -n 2 $i_a3m > $ID.fasta
echo $ID.chk > $ID.pn
echo $ID.fasta > $ID.sn

makemat -P $ID
echo "psipred $ID.mtx $DATADIR/weights.dat $DATADIR/weights.dat2 $DATADIR/weights.dat3 > $ID.ss"
psipred $ID.mtx $DATADIR/weights.dat $DATADIR/weights.dat2 $DATADIR/weights.dat3 > $ID.ss
psipass2 $DATADIR/weights_p2.dat 1 1.0 1.0 $i_a3m.csb.hhblits.ss2 $ID.ss > $ID.horiz

Expand Down
6 changes: 3 additions & 3 deletions rf2aa/run_inference.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,9 +24,9 @@ def __init__(self, config) -> None:
self.config = config
initialize_chemdata(self.config.chem_params)
FFindexDB = namedtuple("FFindexDB", "index, data")
self.ffdb = FFindexDB(read_index(config.database_params.hhdb+'_pdb.ffindex'),
read_data(config.database_params.hhdb+'_pdb.ffdata'))
self.device = "cuda:0" if torch.cuda.is_available() else "cpu"
self.ffdb = FFindexDB(read_index(config.database_params.DB_PDB100+'_pdb.ffindex'),
read_data(config.database_params.DB_PDB100+'_pdb.ffdata'))
self.device = "cuda:0" if torch.cuda.is_available() and not config.force_cpu else "cpu"
self.xyz_converter = XYZConverter()
self.deterministic = config.get("deterministic", False)
self.molecule_db = load_pdb_ideal_sdf_strings()
Expand Down