<a href="https://colab.research.google.com/github/bramyeon/eval/blob/main/colab/main.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# DSLAB Synthetic Protein Structure Evaluation System v0.3.2

Developed by [Bryan Nathanael Wijaya](mailto:bramyeon@gmail.com) and [Luiz Felipe Vecchietti](mailto:lfelipesv@gmail.com)  
Contact us for inquiries or bug reports 🙌

```
python main.py [-h] [--input_dir INPUT_DIR] [--output_dir OUTPUT_DIR] [--contigs CONTIGS] [--ddg_pre] [--mpnn] [--af]
               [--ddg_fin] [--ddg_reference DDG_REFERENCE] [--ddg_quiet] [--ddg_filter_pre] [--ddg_filter_fin]
               [--ddg_max_pre DDG_MAX_PRE] [--ddg_max_fin DDG_MAX_FIN] [--mpnn_mode MPNN_MODE] [--mpnn_num MPNN_NUM]
               [--mpnn_temp MPNN_TEMP] [--mpnn_fr] [--af_init] [--af_multimer] [--af_custemp] [--af_recycle AF_RECYCLE]
               [--af_tolerance AF_TOLERANCE] [--af_seed AF_SEED] [--af_quiet] [--af_filter] [--af_model AF_MODEL]
               [--af_plddt AF_PLDDT] [--af_pae AF_PAE] [--af_ipae AF_IPAE] [--af_no_plddt] [--af_no_pae] [--af_no_ipae]
               [--af_all] [--af_best_avg] [--af_sep_csv] [--af_keep_all]
```

## 0. Preliminaries

In [None]:
#@title 0.1. Environment setup

import os, time, subprocess
# Environment variables exported into this process before the libraries below
# are installed or imported (the names suggest TensorFlow/XLA memory settings
# -- confirm against their documentation).
ENV = {"TF_FORCE_UNIFIED_MEMORY":"1", "XLA_PYTHON_CLIENT_MEM_FRACTION":"4.0"}
for k,v in ENV.items(): os.environ[k] = v
# The absence of a `params` directory is used as the "first run" marker: all
# installation steps below are skipped once it exists.
if not os.path.isdir("params"):
  # Start the AlphaFold parameter download in the background: the parenthesised
  # shell command ends with `&`, so os.system returns while the download and
  # extraction continue, and `params/done.txt` is touched at the end.
  # NOTE(review): the steps are chained with `;`, so `done.txt` is touched even
  # if the download or extraction fails.
  print("installing ColabDesign")
  os.system("(mkdir params; apt-get install aria2 -qq; \
  aria2c -q -x 16 https://storage.googleapis.com/alphafold/alphafold_params_2022-12-06.tar; \
  tar -xf alphafold_params_2022-12-06.tar -C params; touch params/done.txt )&")

  # Install ColabDesign (branch `gamma`), link the installed package into the
  # working directory, and fetch ColabFold's helper module as colabfold_utils.py.
  os.system("pip -q install git+https://github.com/sokrypton/ColabDesign.git@gamma")
  os.system("ln -s /usr/local/lib/python3.*/dist-packages/colabdesign colabdesign")
  os.system("wget https://raw.githubusercontent.com/sokrypton/ColabFold/main/colabfold/colabfold.py -O colabfold_utils.py")
  #os.system("wget https://raw.githubusercontent.com/sokrypton/ColabFold/beta/colabfold/mmseqs/api.py")

  # Download the HH-suite v3.3.0 release archive and unpack it into ./hhsuite.
  print("installing HHsuite")
  os.makedirs("hhsuite", exist_ok=True)
  os.system(f"curl -fsSL https://github.com/soedinglab/hh-suite/releases/download/v3.3.0/hhsuite-3.3.0-SSE2-Linux.tar.gz | tar xz -C hhsuite/")

  # Block until the background job started above has created its marker file,
  # polling every 5 seconds.
  # NOTE(review): there is no timeout; if the marker is never created this
  # loop does not terminate.
  if not os.path.isfile("params/done.txt"):
    print("downloading AlphaFold params")
    while not os.path.isfile("params/done.txt"):
      time.sleep(5)
# Append the HH-suite directories to PATH only when "hhsuite" is not already
# mentioned there, so re-running the cell does not add them again.
if "hhsuite" not in os.environ['PATH']:
  os.environ['PATH'] += ":hhsuite/bin:hhsuite/scripts"

import re, tempfile
from IPython.display import HTML
from google.colab import files
import pandas as pd
import py3Dmol

# Install ColabDesign v1.1.1 and link it into the working directory when no
# `colabdesign` entry exists here yet.
# NOTE(review): the first-run branch earlier in this cell installs the `gamma`
# branch and creates the same `colabdesign` symlink, so this branch appears to
# run only when that symlink is missing -- confirm which version is intended.
if not os.path.isdir("colabdesign"):
  print("installing ColabDesign...")
  os.system("pip -q install git+https://github.com/sokrypton/ColabDesign.git@v1.1.1")
  os.system("ln -s /usr/local/lib/python3.*/dist-packages/colabdesign colabdesign")

# git clone {our_eval_github}
# install pyrosetta4 and redirect in src/energy.py

installing ColabDesign
installing HHsuite
downloading AlphaFold params


In [None]:
#@title 0.2. Upload zip file of input PDBs
#@markdown <b>How to</b>: Put all your input PDB files into a folder, zip the folder, then upload the zip file here.

# Ask the user for the zip archive; the returned mapping is keyed by the
# uploaded file names, and the first one is used.
uploaded = files.upload()
if not uploaded:
    raise ValueError("No file was uploaded. Please upload a zip file of input PDBs.")
filename = next(iter(uploaded))

# Pass the archive path as its own argv entry instead of building a shell
# string, so names containing spaces or parentheses (e.g. "inputs (1).zip")
# are handled. `-o` overwrites without prompting, so re-running the cell
# cannot stall on unzip's interactive "replace?" question.
subprocess.run(["unzip", "-o", f"/content/{filename}"])

args = dict()
# Input directory = archive name without its extension. `splitext` removes only
# the final extension, whereas replacing ".zip" would also strip it from the
# middle of a name.
args['input_dir'] = os.path.splitext(filename)[0]
args['output_dir'] = "output"
# exist_ok lets the cell be re-run after the output directory has been created.
os.makedirs(args['output_dir'], exist_ok=True)

## 1. Argument Settings

In [None]:
#@title 1.1. Arguments for <s>I/O directories and</s> structure specifications

#@markdown | Argument                          | Type  | Description                                   | Default Value | Example Value |
#@markdown | :-------------------------------- | :---: | :-------------------------------------------- | :-----------: | :-----------: |
#@markdown | `-h`, `--help`                    |       | show the help message and exit                |               |               |
#@markdown | `--input_dir INPUT_DIR`           | str   | input directory                               | input         | data/foo      |
#@markdown | `--output_dir OUTPUT_DIR`         | str   | output directory                              | output        | data/boo      |
#@markdown | `--contigs CONTIGS`               | str   | specification of the structure                | None          | A1-113:B1-26/6-6/B33-106 |
#@markdown
#@markdown <br><b>Note</b>: `input_dir` and `output_dir` are set up automatically in 0.2.<br>

contigs = "" #@param {type:"string"}
# Store the structure specification under its CLI flag name.
args.update(contigs=contigs)

In [None]:
#@title 1.2. Arguments for service selection

#@markdown | Argument                          | Type  | Description                                              |
#@markdown | :-------------------------------- | :---: | :------------------------------------------------------- |
#@markdown | `--ddg_pre`                       | flag  | use PyRosetta to calculate ddG of preliminary structure  |
#@markdown | `--mpnn`                          | flag  | use ProteinMPNN to design sequence                       |
#@markdown | `--af`                            | flag  | use AF2 to predict structure                             |
#@markdown | `--ddg_fin`                       | flag  | use PyRosetta to calculate ddG of final structure        |
#@markdown
#@markdown <br>

ddg_pre = False #@param {type:"boolean"}
mpnn = False #@param {type:"boolean"}
af = False #@param {type:"boolean"}
ddg_fin = False #@param {type:"boolean"}

# Record every service toggle under its CLI flag name, in form order.
args.update(ddg_pre=ddg_pre, mpnn=mpnn, af=af, ddg_fin=ddg_fin)

In [None]:
#@title 1.3. Arguments for ddG calculation with PyRosetta

#@markdown | Argument                          | Type  | Description                                   | Default Value | Example Value |
#@markdown | :-------------------------------- | :---: | :-------------------------------------------- | :-----------: | :-----------: |
#@markdown | `--ddg_reference DDG_REFERENCE`   | str   | reference structure for the designs           | None          | data/ref.pdb  |
#@markdown | `--ddg_quiet`                     | flag  | print only warning and error messages         |               |               |
#@markdown | `--ddg_filter_pre`                | flag  | filter preliminary candidates based on ddG    |               |               |
#@markdown | `--ddg_filter_fin`                | flag  | filter final candidates based on ddG          |               |               |
#@markdown | `--ddg_max_pre DDG_MAX_PRE`       | float | <b>MAX</b>imum ddG value to pass the preliminary filter | -30.0    | 10.5     |
#@markdown | `--ddg_max_fin DDG_MAX_FIN`       | float | <b>MAX</b>imum ddG value to pass the final filter       | 2000.0   | 2023.5   |
#@markdown
#@markdown <br>

if ddg_pre or ddg_fin:
    ddg_reference = "" #@param {type:"string"}
    ddg_quiet = False #@param {type:"boolean"}
    ddg_filter_pre = False #@param {type:"boolean"}
    ddg_filter_fin = False #@param {type:"boolean"}
    ddg_max_pre = -30.0 #@param {type:"number"}
    ddg_max_fin = 2000.0 #@param {type:"number"}

    # Options shared by both ddG stages.
    args.update(ddg_reference=ddg_reference, ddg_quiet=ddg_quiet)

    # Preliminary stage: the threshold is recorded only when filtering is on.
    if ddg_pre:
        args.update(ddg_filter_pre=ddg_filter_pre)
        if ddg_filter_pre:
            args.update(ddg_max_pre=ddg_max_pre)

    # Final stage: same rule as the preliminary stage.
    if ddg_fin:
        args.update(ddg_filter_fin=ddg_filter_fin)
        if ddg_filter_fin:
            args.update(ddg_max_fin=ddg_max_fin)

In [None]:
#@title 1.4. Arguments for sequence design with ProteinMPNN (or MPNN-FR)

#@markdown | Argument                          | Type  | Description                                   | Default Value | Example Value |
#@markdown | :-------------------------------- | :---: | :-------------------------------------------- | :-----------: | :-----------: |
#@markdown | `--mpnn_mode MPNN_MODE`           | str   | ProteinMPNN mode: partial/binder/fixbb        | partial       | binder        |
#@markdown | `--mpnn_num MPNN_NUM`             | int   | number of sequence designs per structure      | 8             | 3             |
#@markdown | `--mpnn_temp MPNN_TEMP`           | float | MPNN sampling temperature                     | 0.1           | 0.01          |
#@markdown | `--mpnn_fr`                       | flag  | use Rosetta Fast Relax                        |               |               |
#@markdown
#@markdown <br>

if mpnn:
    mpnn_mode = "partial" #@param {type:"string"}
    mpnn_num = 8 #@param {type:"number"}
    mpnn_temp = 0.1 #@param {type:"number"}
    mpnn_fr = False #@param {type:"boolean"}

    # Each key is later emitted as `--<key>` on the command line, so it must
    # match the documented flag name exactly (`--mpnn_temp`, not `--mpn_temp`).
    args['mpnn_mode'] = mpnn_mode
    args['mpnn_num'] = mpnn_num
    args['mpnn_temp'] = mpnn_temp
    args['mpnn_fr'] = mpnn_fr

In [None]:
#@title 1.5. Arguments for structure prediction with AlphaFold2 Gamma

#@markdown | Argument                          | Type  | Description                                   | Default Value | Example Value |
#@markdown | :-------------------------------- | :---: | :-------------------------------------------- | :-----------: | :-----------: |
#@markdown | `--af_init`                       | flag  | use initial guess                             |               |               |
#@markdown | `--af_multimer`                   | flag  | use AF2 Multimer                              |               |               |
#@markdown | `--af_custemp`                    | flag  | use custom template for AF2 prediction        |               |               |
#@markdown | `--af_recycle AF_RECYCLE`         | int   | number of AF2 recycles                        | 20            | 3             |
#@markdown | `--af_tolerance AF_TOLERANCE`     | float | recycle early stop tolerance                  | 0.5           | 1.0           |
#@markdown | `--af_seed AF_SEED`               | int   | random model seed                             | 0             | 2023          |
#@markdown | `--af_quiet`                      | flag  | print only error messages                     |               |               |
#@markdown | `--af_filter`                     | flag  | filter with one model before using all models |               |               |
#@markdown | `--af_all`                        | flag  | use all 5 AF2 models to validate              |               |               |
#@markdown
#@markdown <br>

if af:
    af_init = False #@param {type:"boolean"}
    af_multimer = False #@param {type:"boolean"}
    af_custemp = False #@param {type:"boolean"}
    af_recycle = 20 #@param {type:"number"}
    af_tolerance = 0.5 #@param {type:"number"}
    af_seed = 0 #@param {type:"number"}
    af_quiet = False #@param {type:"boolean"}
    af_filter = False #@param {type:"boolean"}
    af_all = False #@param {type:"boolean"}

    # General AF2 options, stored in the same order as the form fields.
    args.update(
        af_init=af_init,
        af_multimer=af_multimer,
        af_custemp=af_custemp,
        af_recycle=af_recycle,
        af_tolerance=af_tolerance,
        af_seed=af_seed,
        af_quiet=af_quiet,
        af_filter=af_filter,
        af_all=af_all,
    )

#@markdown <br>
#@markdown
#@markdown | Argument for one-model AF2        | Type  | Description                                   | Default Value | Example Value |
#@markdown | :-------------------------------- | :---: | :-------------------------------------------- | :-----------: | :-----------: |
#@markdown | `--af_model AF_MODEL`             | int   | model number used for filtering/validation    | 1             | 5             |
#@markdown | `--af_plddt AF_PLDDT`             | float | <b>MIN</b>imum pLDDT value to pass the filter | 0.9           | 0.8           |
#@markdown | `--af_pae AF_PAE`                 | float | <b>MAX</b>imum pAE value to pass the filter   | 5.0           | 22.5          |
#@markdown | `--af_ipae AF_IPAE`               | float | <b>MAX</b>imum i-pAE value to pass the filter | 5.0           | 16.7          |
#@markdown | `--af_no_plddt`                   | flag  | do not use pLDDT to filter                    |               |               |
#@markdown | `--af_no_pae`                     | flag  | do not use pAE to filter                      |               |               |
#@markdown | `--af_no_ipae`                    | flag  | do not use i-pAE to filter                    |               |               |
#@markdown
#@markdown <br>

    if af_filter:
        af_model = 1 #@param {type:"number"}
        af_plddt = 0.9 #@param {type:"number"}
        af_pae = 5.0 #@param {type:"number"}
        af_ipae = 5.0 #@param {type:"number"}
        af_no_plddt = False #@param {type:"boolean"}
        af_no_pae = False #@param {type:"boolean"}
        af_no_ipae = False #@param {type:"boolean"}

        # One-model filter options: a threshold is recorded only when its
        # metric has not been switched off; the switches are always recorded.
        args.update(af_model=af_model)
        if af_no_plddt is False or not af_no_plddt:
            args.update(af_plddt=af_plddt)
        if af_no_pae is False or not af_no_pae:
            args.update(af_pae=af_pae)
        if af_no_ipae is False or not af_no_ipae:
            args.update(af_ipae=af_ipae)
        args.update(
            af_no_plddt=af_no_plddt,
            af_no_pae=af_no_pae,
            af_no_ipae=af_no_ipae,
        )

#@markdown <br>
#@markdown
#@markdown | Argument for five-model AF2       | Type  | Description                                            |
#@markdown | :-------------------------------- | :---: | :----------------------------------------------------- |
#@markdown | `--af_best_avg`                   | flag  | save CSV files for the best and average results        |
#@markdown | `--af_sep_csv`                    | flag  | save a separate CSV file for each model                |
#@markdown | `--af_keep_all`                   | flag  | keep all the AF2 results and details (not recommended) |
#@markdown
#@markdown <br>

    if af_all:
        af_best_avg = False #@param {type:"boolean"}
        af_sep_csv = False #@param {type:"boolean"}
        af_keep_all = False #@param {type:"boolean"}

        # Five-model output options.
        args.update(
            af_best_avg=af_best_avg,
            af_sep_csv=af_sep_csv,
            af_keep_all=af_keep_all,
        )

## 2. Executing Validation Script

In [None]:
#@title 2.1. Check your command

import shlex

# Assemble the `main.py` invocation from the collected arguments. Every entry
# after the first is later prefixed with `--` by the join below.
cmd = ["python main.py"]
for key, value in args.items():
    # bool is tested first because bool is a subclass of int.
    if isinstance(value, bool):
        # Flags are emitted only when switched on.
        if value:
            cmd.append(key)

    elif isinstance(value, str):
        # Blank strings are omitted. Non-blank values are shell-quoted because
        # the command is executed through the shell, where an unquoted space
        # or parenthesis (e.g. an input folder named "designs (1)") would
        # split or break the command. Plain values are left unchanged.
        stripped = value.strip()
        if stripped:
            cmd.append(f"{key} {shlex.quote(stripped)}")

    elif isinstance(value, (int, float)):
        cmd.append(f"{key} {value}")

    else:
        print(f"WARNING: {key} cannot be {type(value)}. This will be set to its default value intrinsically.")
cmd = ' --'.join(cmd)
print("\nYour command is:")
print(cmd)

In [None]:
#@title 2.2. Run validation script

import subprocess
# Execute the assembled command string through the shell; as the last
# expression of the cell, the returned process result is displayed.
subprocess.run(args=cmd, shell=True)

## 3. Results Analysis and Visualization

In [None]:
#@title 3.1. Manually choose or automatically get the output token

#@markdown Leave blank to get the latest output token automatically.
token = "" #@param {type:"string"}

# `os.listdir` returns entries in arbitrary order, so taking its last element
# does not give the latest run. Pick the most recently modified sub-directory
# of the output directory instead (each token is a directory holding a run).
output_root = args['output_dir']
run_dirs = [entry for entry in os.listdir(output_root)
            if os.path.isdir(os.path.join(output_root, entry))]
last_token = max(
    run_dirs,
    key=lambda entry: os.path.getmtime(os.path.join(output_root, entry)),
    default=None,
)

# A blank form field means "use the latest run"; fail with a clear message
# when there is no run to fall back on.
token = token.strip()
if not token:
    if last_token is None:
        raise FileNotFoundError(
            f"No output found in `{output_root}`. Run the validation script first or enter a token manually."
        )
    token = last_token
print(f"Your output token is `{token}`")
output_dir = os.path.join(args['output_dir'], token)


In [None]:
#@title 3.2. Analyze result CSV file

analyze = "ddG of preliminary structures" #@param ["ddG of preliminary structures", "ddG of preliminary structures - FILTERED", "Biopython-extracted sequences", "ProteinMPNN/MPNN-FR-designed sequences", "AF2 with one model", "AF2 with one model - FILTERED", "AF2 with five models", "AF2 with five models - MODEL 1", "AF2 with five models - MODEL 2", "AF2 with five models - MODEL 3", "AF2 with five models - MODEL 4", "AF2 with five models - MODEL 5", "AF2 with five models - BEST", "AF2 with five models - AVERAGE", "ddG of final structures", "ddG of final structures - FILTERED"]
head = 10 #@param {type:"number"}

# Read the one-model number without writing a placeholder back into `args`:
# storing `None` there would make a later rebuild of the command emit a
# warning for `af_model`.
one_model_number = args.get('af_model')

# Map each menu entry to the CSV file name expected inside `<output_dir>/csv`.
csv_dir = os.path.join(output_dir, "csv")
csv_file = {"ddG of preliminary structures": "pyrosetta1.csv",
            "ddG of preliminary structures - FILTERED": "pyrosetta1_filtered.csv",
            "Biopython-extracted sequences": "sequence.csv",
            "ProteinMPNN/MPNN-FR-designed sequences": "mpnn.csv",
            "AF2 with one model": f"af2model{one_model_number}.csv",
            "AF2 with one model - FILTERED": f"af2model{one_model_number}_filtered.csv",
            "AF2 with five models": "af2all.csv",
            "AF2 with five models - MODEL 1": "af2all_model1.csv",
            "AF2 with five models - MODEL 2": "af2all_model2.csv",
            "AF2 with five models - MODEL 3": "af2all_model3.csv",
            "AF2 with five models - MODEL 4": "af2all_model4.csv",
            "AF2 with five models - MODEL 5": "af2all_model5.csv",
            "AF2 with five models - BEST": "af2all_best.csv",
            "AF2 with five models - AVERAGE": "af2all_average.csv",
            "ddG of final structures": "pyrosetta2.csv",
            "ddG of final structures - FILTERED": "pyrosetta2_filtered.csv"}

csv_path = os.path.join(csv_dir, csv_file[analyze])
# Fail early with an actionable message when the selected result was not
# produced by this run (same exception type a missing file raises on read).
if not os.path.isfile(csv_path):
    raise FileNotFoundError(
        f"`{csv_path}` does not exist. Check that the selected analysis matches the services you ran."
    )
df = pd.read_csv(csv_path)
df.head(head)

In [None]:
#@title 3.2.1. Show results for a particular input PDB file `name`
#@markdown <b>Important</b>: Do not forget the `.pdb` extension.<br><b>Example</b>: `0000_rosetta` (X) `0000_rosetta.pdb` (O)
name = "example.pdb" #@param {type:"string"}
# Keep only the rows whose `name` column equals the requested file name, and
# display up to 1000 of them.
filtered_df = df.loc[df['name'] == name]
filtered_df.head(1000)

In [None]:
#@title 3.2.2. Closer look with more details

# One plain dict per matching row, in row order.
list_of_dicts = [row.to_dict() for _, row in filtered_df.iterrows()]

# Print every column of every row as `column: value`, with a blank line
# separating rows.
for record in list_of_dicts:
    for column, content in record.items():
        print(f"{column}: {content}")
    print("")

In [None]:
#@title 3.3. PyMOL PDB visualization
!sudo apt install pymol

version = "input" #@param ["input", "AF2 with one model", "AF2 with five models - MODEL 1", "AF2 with five models - MODEL 2", "AF2 with five models - MODEL 3", "AF2 with five models - MODEL 4", "AF2 with five models - MODEL 5"]

#@markdown <b>Important</b>: Do not forget the `.pdb` extension.<br><b>Example</b>: `0000_rosetta` (X) `0000_rosetta.pdb` (O)
name = "example.pdb" #@param {type:"string"}

if 'af_model' not in args:
    args['af_model'] = None
pdb_dir = {"input": args['input_dir'],
           "AF2 with one model": os.path.join(output_dir, f"af2model{args['af_model']}_pdb"),
           "AF2 with five models - MODEL 1": os.path.join(output_dir, f"af2all_pdb/model1"),
           "AF2 with five models - MODEL 2": os.path.join(output_dir, f"af2all_pdb/model2"),
           "AF2 with five models - MODEL 3": os.path.join(output_dir, f"af2all_pdb/model3"),
           "AF2 with five models - MODEL 4": os.path.join(output_dir, f"af2all_pdb/model4"),
           "AF2 with five models - MODEL 5": os.path.join(output_dir, f"af2all_pdb/model5")}

pdb_path = os.path.join(pdb_dir[version], name)

In [None]:
#@title Here goes your PDB file!

# Form-controlled rendering options. `plot_pdb` below reads `animate`,
# `show_as` and `color`.
# NOTE(review): `dpi` is not referenced by the code in this cell.
animate = "none" #@param ["none", "interactive"]
show_as = "cartoon" #@param ["cartoon", "stick", "line", "sphere", "cross"]
color = "chain" #@param ["rainbow", "chain", "plddt"]
dpi = 100 #@param ["100", "200", "400"] {type:"raw"}

import warnings
from Bio.PDB import PDBParser
from colabdesign.shared.plot import pymol_color_list
from colabdesign.rf.utils import get_ca, get_Ls, make_animation
from string import ascii_uppercase,ascii_lowercase
# Candidate chain IDs ("A".."Z" followed by "a".."z"); `plot_pdb` pairs them
# with `pymol_color_list` when colouring by chain.
alphabet_list = list(ascii_uppercase+ascii_lowercase)

def count_chains():
    """Return the number of distinct chain IDs in the PDB file at `pdb_path`.

    The file is parsed with Biopython's `PDBParser`; chain IDs are collected
    across every model of the structure. Warnings raised while parsing are
    suppressed.
    """
    with warnings.catch_warnings():
        warnings.simplefilter("ignore")
        parsed = PDBParser().get_structure('protein', pdb_path)
        chain_ids = {chain.id for model in parsed for chain in model}
    return len(chain_ids)

def plot_pdb(num=0):
    """Render the PDB file at `pdb_path` in an interactive py3Dmol viewer.

    The module-level form values control the rendering: `animate` selects
    whether the file is loaded as animation frames, `show_as` is the 3Dmol
    style name, and `color` selects the colouring ("rainbow", "chain", or
    anything else for colouring by the `b` property).

    Args:
        num: Unused; kept so existing calls keep working.
    """
    hbondCutoff = 4.0
    view = py3Dmol.view(js='https://3dmol.org/build/3Dmol.js')

    # Read the file once and close the handle deterministically; both loading
    # modes use the same text.
    with open(pdb_path, 'r') as pdb_file:
        pdb_str = pdb_file.read()
    if animate == "interactive":
        view.addModelsAsFrames(pdb_str,'pdb',{'hbondCutoff':hbondCutoff})
    else:
        view.addModel(pdb_str,'pdb',{'hbondCutoff':hbondCutoff})

    if color == "rainbow":
        view.setStyle({show_as: {'color':'spectrum'}})
    elif color == "chain":
        # NOTE(review): chains are addressed as the first N letters of
        # `alphabet_list`, which assumes the file's chain IDs are A, B, C, ...
        # -- chains with other IDs would not be styled here; confirm.
        for chain, c in zip(alphabet_list[:count_chains()], pymol_color_list):
            view.setStyle({'chain':chain},{show_as: {'color':c}})
    else:
        view.setStyle({show_as: {'colorscheme': {'prop':'b','gradient': 'roygb','min':0.5,'max':0.9}}})

    view.zoomTo()
    if animate == "interactive":
        view.animate({'loop': 'backAndForth'})
    view.show()

# Report which file is being rendered, then draw it with the form settings
# chosen at the top of this cell.
print(f"Visualizing: {pdb_path}")
plot_pdb()

Visualizing: 5t5w.pdb


## 4. Download Results

In [None]:
#@title Download all results for this `token` as a zip file

# Archive the selected run's output directory recursively into `<token>.zip`
# in the current working directory, then hand that archive to Colab's file
# download helper.
!zip -r {token}.zip {output_dir}
files.download(f"{token}.zip")

  adding: 5t5w.pdb (deflated 74%)


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

## Acknowledgements
A large portion of this evaluation system's source code is adapted from the following excellent Jupyter notebooks by the ColabDesign developers.
- [`predict.ipynb`](https://colab.research.google.com/github/sokrypton/ColabDesign/blob/gamma/af/examples/predict.ipynb) for structure prediction with AlphaFold2 Gamma
- [`diffusion.ipynb`](https://colab.research.google.com/github/sokrypton/ColabDesign/blob/v1.1.1/rf/examples/diffusion.ipynb) for contigs handling and sequence design with ProteinMPNN
