# RGN2 Colab
## Instructions
1. Paste your protein sequence into the `sequence` input field (and optionally set a `jobname`).
2. Run the cells in the Colab individually with the play button on the left, or all at once via _Runtime_ > _Run all_.
3. The predicted protein structure will be downloaded automatically once the final "Refinement and Structure Download" cell has finished executing.

In [None]:
#@title Download RGN2 and Install Dependencies

# Setup cell: installs Miniconda under /opt/conda, clones the RGN2 and
# AlphaFold repositories into /content, creates the conda environments used by
# the later cells, and downloads the AlphaFold parameters, the AminoBERT/RGN2
# weights and ModRefiner. All command output is captured and is only printed
# if a command raises subprocess.CalledProcessError.

import os
import sys
import subprocess
from IPython import get_ipython
from IPython.utils import io

# RGN2 checkout location and the paths derived from it.
WORKDIR = './rgn2'
GIT_REPO = 'https://github.com/aqlaboratory/rgn2.git'
ENV_CONFIG = os.path.join(WORKDIR, 'environment.yml')
RGN2_PARAM_SOURCE_URL = 'https://huggingface.co/christinafl/rgn2'
RGN2_PARAMS_DIR = os.path.join(WORKDIR, 'resources')
RGN2_PARAM_RUN_DIR = os.path.join(RGN2_PARAMS_DIR, 'rgn2_runs')
RGN2_RUN_DIR = os.path.join(WORKDIR, 'runs')

# AlphaFold checkout and parameter archive; the archive is stored inside
# AF2_PARAMS_DIR under its original file name.
AF2_GIT_REPO = 'https://github.com/deepmind/alphafold.git'
AF2_SOURCE_URL = 'https://storage.googleapis.com/alphafold/alphafold_params_2022-03-02.tar'
AF2_PARAMS_DIR = './alphafold/data/params'
AF2_PARAMS_PATH = os.path.join(AF2_PARAMS_DIR, os.path.basename(AF2_SOURCE_URL))

# ModRefiner is unpacked into the ter2pdb directory of the RGN2 checkout.
REFINER_DIR = os.path.join(WORKDIR, 'ter2pdb')
REFINER_PATH = os.path.join(REFINER_DIR, 'ModRefiner-l.zip')
REFINER_URL = 'https://zhanggroup.org/ModRefiner/ModRefiner-l.zip'

try:
  with io.capture_output() as captured:
    # All relative paths above are resolved against /content.
    %cd '/content'

    # Different conda envs necessary due to conflicting dependencies.
    # Start from a clean Miniconda install; the installer is deleted afterwards.
    %shell rm -rf /opt/conda
    %shell wget -q -P /tmp \
      https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh \
        && bash /tmp/Miniconda3-latest-Linux-x86_64.sh -b -p /opt/conda \
        && rm /tmp/Miniconda3-latest-Linux-x86_64.sh

    # Put the freshly installed conda binaries first on PATH.
    PATH=%env PATH
    %env PATH=/opt/conda/bin:{PATH}

    # Shell prefixes reused below. RGN2_ENV_INIT is defined here but is not
    # referenced again within this cell.
    CONDA_INIT = 'source /opt/conda/etc/profile.d/conda.sh && conda init'
    RGN2_ENV_INIT = f'{CONDA_INIT} && conda activate rgn2'

    # Download RGN2.
    # The environment is created from the repository's environment.yml
    # (presumably named "rgn2", since that is what later cells activate --
    # confirm against the file).
    %shell rm -rf {WORKDIR}
    %shell git clone {GIT_REPO} {WORKDIR}
    %shell {CONDA_INIT} && conda env create -f {ENV_CONFIG}

    # Download AF2 for AF2Rank-based refinement.
    AF2_ENV_INIT = f'{CONDA_INIT} && conda activate af2'

    # AlphaFold is pinned to tag v2.2.4 and installed into a dedicated
    # Python 3.7 environment with explicitly pinned dependency versions.
    %shell rm -rf alphafold
    %shell git clone --branch v2.2.4 {AF2_GIT_REPO} alphafold
    %shell {CONDA_INIT} && conda create -y -q --name af2 python=3.7
    %shell {AF2_ENV_INIT} && pip install ml-collections==0.1.0 numpy==1.21.6 pandas==1.3.4 protobuf==3.20.1 scipy==1.7.0 tensorflow-cpu==2.9.0
    # --no-dependencies: the AlphaFold package itself is installed without
    # letting pip resolve its requirements; they are installed explicitly
    # around this line instead.
    %shell {AF2_ENV_INIT} && pip install --no-dependencies ./alphafold
    # CUDA build of jaxlib, taken from the JAX release index.
    %shell {AF2_ENV_INIT} && pip install --upgrade jax==0.3.25 \
      jaxlib==0.3.25+cuda11.cudnn82 \
      -f https://storage.googleapis.com/jax-releases/jax_cuda_releases.html
    %shell {AF2_ENV_INIT} && pip install biopython==1.81 chex==0.1.5 dm-haiku==0.0.9 dm-tree==0.1.8 docker==6.1.3 immutabledict==2.2.5
    %shell {AF2_ENV_INIT} && pip install --upgrade typing_extensions

    # Fetch the AlphaFold parameter archive, unpack it in place and delete
    # the archive once extracted.
    %shell mkdir --parents "{AF2_PARAMS_DIR}"
    %shell wget -O "{AF2_PARAMS_PATH}" "{AF2_SOURCE_URL}"
    %shell tar --extract --verbose --file="{AF2_PARAMS_PATH}" \
      --directory="{AF2_PARAMS_DIR}" --preserve-permissions
    %shell rm "{AF2_PARAMS_PATH}"

    # Download AminoBERT/RGN2 weights.
    # The clone skips LFS smudging and the LFS objects are pulled in a
    # separate step; the rgn2_runs directory is then moved to <WORKDIR>/runs.
    %shell GIT_LFS_SKIP_SMUDGE=1 git clone "{RGN2_PARAM_SOURCE_URL}" "{RGN2_PARAMS_DIR}"
    %shell cd {RGN2_PARAMS_DIR} && git lfs pull
    %shell mv {RGN2_PARAM_RUN_DIR} {RGN2_RUN_DIR}

    # Download Modrefiner to initialize all atoms from CA trace.
    # -o lets unzip overwrite existing files without prompting; the zip is
    # removed after extraction.
    %shell wget -O {REFINER_PATH} {REFINER_URL}
    %shell unzip -o {REFINER_PATH} -d {REFINER_DIR}
    %shell rm {REFINER_PATH}

except subprocess.CalledProcessError:
  # Show the captured output of the setup commands, then re-raise so the
  # cell is marked as failed.
  print(captured)
  raise

In [None]:
#@title Import Python Packages

# Work from the RGN2 checkout: the later cells use paths relative to this
# directory (e.g. 'run.json', 'runs/...', 'output').
%cd '/content/rgn2'

import os
import sys
import re
import hashlib
import json
import subprocess
from pathlib import Path
from IPython.utils import io
from google.colab import files

# Enable IPython's autoreload extension in mode 2.
%reload_ext autoreload
%autoreload 2


# Add the AlphaFold checkout to the module search path.
sys.path.append('/content/alphafold')
# NOTE(review): ter2pdb is presumably resolved from the current directory
# (/content/rgn2), where the setup cell also unpacks ModRefiner -- confirm.
from ter2pdb import ter2pdb

In [None]:
#@title ### Enter the amino acid sequence to fold ⬇️
sequence = 'DEEEIQKAIEELLRKGVSEEEAAIIIVQRFNVAVVVVVQDERQGKHISEYIRRYIPEADVILFANLVVIKVETHELSTRVWEAAQKAY'  #@param {type:"string"}
jobname = 'test' #@param {type:"string"}

# Input cell: normalises and validates the user-supplied sequence, derives a
# job identifier from the job name and a short hash of the sequence, writes
# both to run.json for the AminoBERT cell, and defines the paths shared by the
# remaining cells.

MAX_SEQUENCE_LENGTH = 1023
aatypes = set('ACDEFGHIKLMNPQRSTVWY')  # 20 standard aatypes

# Remove every whitespace character (spaces, tabs, line breaks) and
# upper-case. str.split() with no argument splits on any whitespace, so one
# pass is sufficient.
sequence = "".join(sequence.split()).upper()

# Validate before deriving any identifiers or writing run.json, so that a
# rejected input leaves no new job state behind.
# NOTE(review): the two messages below mention AlphaFold; the wording looks
# carried over from another notebook -- confirm whether it should name RGN2.
if not sequence:
  # An empty string would otherwise pass the subset check below.
  raise ValueError('Input sequence is empty. Please enter an amino acid sequence.')
if not set(sequence).issubset(aatypes):
  raise ValueError(f'Input sequence contains non-amino acid letters: {set(sequence) - aatypes}. AlphaFold only supports 20 standard amino acids as inputs.')
if len(sequence) > MAX_SEQUENCE_LENGTH:
  raise ValueError(f'Input sequence is too long: {len(sequence)} amino acids, while the maximum is {MAX_SEQUENCE_LENGTH}. Please use the full AlphaFold system for long sequences.')

# Keep only word characters in the job name; \W also matches whitespace, so
# no separate whitespace strip is needed.
jobname = re.sub(r'\W+', '', jobname)
# 3-byte BLAKE2b digest -> 6 hex characters identifying the sequence.
seq_hash = hashlib.blake2b(sequence.encode(), digest_size=3).hexdigest()
seq_id = f'{jobname}_{seq_hash}'

# Hand the inputs to the AminoBERT cell, which reads run.json back.
run_inputs = {'sequence': sequence, 'seq_id': seq_id}
with open("run.json", "w") as f:
    json.dump(run_inputs, f)

# Paths (relative to the current working directory) used by later cells.
DATA_DIR = 'aminobert_output'
RUN_DIR = 'runs/15106000'
OUTPUT_DIR = 'output'
REFINE_DIR = 'output/refine_model1'
SEQ_PATH = os.path.join(DATA_DIR, f'{seq_id}.fa')
TER_PATH = os.path.join(RUN_DIR, '1', 'outputsTesting', f'{seq_id}.tertiary')

In [None]:
#@title Generate AminoBERT Embeddings

%%bash
# This cell runs as a bash script: it activates the rgn2 conda environment and
# then starts a Python interpreter.
# NOTE(review): the lines after `python` are presumably consumed by that
# interpreter from standard input, so they execute inside the rgn2
# environment rather than in the notebook kernel -- confirm.
source /opt/conda/etc/profile.d/conda.sh && conda init
conda activate rgn2
python

import os
import sys
import json
# Add the aminobert directory under the current working directory to the
# module search path.
sys.path.append(os.path.join(os.getcwd(), 'aminobert'))

import shutil
from aminobert.prediction import aminobert_predict_sequence
from data_processing.aminobert_postprocessing import aminobert_postprocess

# DATA_DIR repeats the value used by the sequence-input cell; notebook
# variables are not visible here, so it is redefined.
DATA_DIR = 'aminobert_output'
# NOTE(review): '1' appears to correspond to the '1' path component the
# notebook later uses to locate the RGN2 output -- confirm.
DATASET_NAME = '1'
# Passed unchanged to both AminoBERT calls below.
PREPEND_M = True
AMINOBERT_CHKPT_DIR = 'resources/aminobert_checkpoint/AminoBERT_runs_v2_uniparc_dataset_v2_5-1024_fresh_start_model.ckpt-1100000'

# Load the sequence and job id written to run.json by the sequence-input cell.
with open("run.json", "r") as f:
    run_inputs = json.load(f)

# Remove old data since AminoBERT combines entire directory to create dataset.
if os.path.exists(DATA_DIR):
  shutil.rmtree(DATA_DIR)
os.makedirs(DATA_DIR)

# Run AminoBERT on the single input sequence, then post-process the contents
# of DATA_DIR under the dataset name above.
aminobert_predict_sequence(seq=run_inputs['sequence'], header=run_inputs['seq_id'],
                           prepend_m=PREPEND_M, checkpoint=AMINOBERT_CHKPT_DIR,
                           data_dir=DATA_DIR)
aminobert_postprocess(data_dir=DATA_DIR, dataset_name=DATASET_NAME, prepend_m=PREPEND_M)

In [None]:
#@title Run RGN2
#@markdown This step generates the raw RGN2-predicted C-alpha trace.

rgn2_env_init = 'source /opt/conda/etc/profile.d/conda.sh && conda init && conda activate rgn2'
try:
  with io.capture_output() as captured:
    cmd = (f"python rgn/protling.py {os.path.join(RUN_DIR, 'configuration')} "
           f"-p -e 'weighted_testing' -a -g 0")
    %shell {rgn2_env_init} && {cmd}
except subprocess.CalledProcessError:
  print(captured)
  raise

print('Prediction completed!')

In [None]:
#@title Refinement and Structure Download
#@markdown Once this cell has been executed, a PDB file with the refined
#@markdown structure will be automatically downloaded to your computer.
#@markdown **Note**: Notebook refinement pipeline is ~2x slower compared
#@markdown to local execution.
recycles = 1 #@param {type:"integer"}

ter2pdb.run_ca_to_allatom(seq_path=SEQ_PATH, ter_path=TER_PATH,
                          output_dir=OUTPUT_DIR, seq_id=seq_id)

out_suffix = '_prediction'
af2_env_init = 'source /opt/conda/etc/profile.d/conda.sh && conda init && conda activate af2'
jax_env_vars = 'TF_FORCE_UNIFIED_MEMORY=1 XLA_PYTHON_CLIENT_MEM_FRACTION=2.0'
cmd = (f"{jax_env_vars} python ter2pdb/run_af2rank.py refine_model1 "
       f"--target_list {seq_id} --af2_dir /content/alphafold/ "
       f"--out_suffix {out_suffix} --seq_dir {Path(SEQ_PATH).parent} "
       f"--pdb_dir {OUTPUT_DIR} --output_dir {OUTPUT_DIR} --deterministic "
       f"--seq_replacement - --mask_sidechains_add_cb --model_num 1 "
       f"--recycles {recycles}")
try:
  with io.capture_output() as captured:
    %shell {af2_env_init} && {cmd}
except subprocess.CalledProcessError:
  print(captured)
  raise

print('Refinement completed!')

files.download(os.path.join(REFINE_DIR, f'{seq_id}{out_suffix}.pdb'))