# MSA-Search NIM + BOLTZ-2 NIM Study

MSA-Search NIM: https://build.nvidia.com/colabfold/msa-search

BOLTZ-2 NIM: https://build.nvidia.com/mit/boltz2

06Aug2025

## 1.1 Set Up the Environment

In [None]:
!pip install pandas numpy matplotlib httpx "fastapi[standard]"

In [None]:
import json
import os
import requests
import re
import shutil
from google.colab import userdata

import asyncio
from typing import Any, Dict, Optional
from pathlib import Path
from enum import StrEnum
import logging
import sys

## 1.2 Set Up `output` Directory and `API_KEY`

**NOTE:** Be sure to follow the steps in the README to embed your NVIDIA `API_KEY` into your Google Colab environment.

In [None]:
def prepare_output_directory(output):
    """
    Create a fresh, empty output directory
    output: str, path of the directory to (re)create
    return: None

    Anything already present at `output` is deleted first, so results from
    a previous run are never mixed with the current one.
    """
    already_present = os.path.exists(output)
    if already_present:
        # Remove the old tree wholesale; it is recreated empty just below
        shutil.rmtree(output)

    os.makedirs(output)

In [None]:
# Read the NVIDIA API key stored in the Colab environment under the name 'API_KEY'
API_KEY = userdata.get('API_KEY')

# Output directory for this run; prepare_output_directory deletes any existing
# copy and recreates it empty
output_dir = "/content/output"
prepare_output_directory(output_dir)

#### Define Protein Sequence and Databases to use for MSA-Search

In [None]:
# Query protein sequence sent to MSA-Search (one-letter amino-acid codes)
sequence = "ASLTEIEHLVQSVCKSYRETCQLRLEDLLRQRSNIFSREEVTGYQRKSMWEMWERCAHHLTEAIQYVVEFAKRLSGFMELCQNDQIVLLKAGAMEVVLVRMCRAYNADNRTVFFEGKYGGMELFRALGCSELISSIFDFSHSLSALHFSEDEIALYTALVLINAHRPGLQEKRKVEQLQYNLELAFHHHLCKTHRQSILAKLPPKGKLRSLCSQHVERLQIFQHLHPIVVQAAF"
# Database names passed in the MSA-Search request; the same names are used as
# keys when the alignments are read back out of the response
databases = ['Uniref30_2302', 'colabfold_envdb_202108', 'PDB70_220313']


## 1.3 Set Up and Run `MSA-Search`

In [None]:
msa_search_url = "https://health.api.nvidia.com/v1/biology/colabfold/msa-search/predict"

# Value sent in the NVCF-POLL-SECONDS header, and a slightly larger
# client-side timeout so a stalled connection cannot hang the notebook forever
MSA_POLL_SECONDS = 300
MSA_TIMEOUT_SECONDS = 400

payload = {
    "sequence": sequence,
    "databases": databases,
    "e_value": 0.0001,
    "iterations": 1,
    "max_msa_sequences": 10000,
    "run_structural_template_search": False,
    "output_alignment_formats": ["a3m"],
}

headers = {
    "Authorization": f"Bearer {API_KEY}",
    "content-type": "application/json",
    "NVCF-POLL-SECONDS": f"{MSA_POLL_SECONDS}",
}

# Call MSA-Search NIM
response = requests.post(
    msa_search_url,
    json=payload,
    headers=headers,
    timeout=MSA_TIMEOUT_SECONDS,
)

# Fail here with the HTTP status (bad key, quota, server error, ...) instead of
# saving an error body to disk and hitting a KeyError when it is parsed later
response.raise_for_status()

msa_response_dict = response.json()
print(f"MSA response : \n {msa_response_dict}")

# Keep a copy of the raw response on disk
with open('msa_output.json', 'w') as json_file:
    json.dump(msa_response_dict, json_file, indent=4)

#### Merge and Sort Alignments From MSA-Search For BOLTZ-2 Input

In [None]:
def parse_sequences(input_string, n, query_seq):
    '''
    Parse the output of alignments from the MSA-Search NIM to be used downstream

    The first record in the text (the query itself) is skipped; callers that need
    the query in the final alignment add it back themselves.

    input_string: The output file of alignments (a3m text) in a string format
    n: The maximum number of aligned sequences to return
    query_seq: The query sequence for alignment (not used by the parser; kept so
        existing callers keep working)
    Returns: A list of at most n tuples `(">" + identifier, aligned_sequence, score)`
        in the order they appear in the input, where `score` is the integer taken
        from the second tab-separated field of the record header
    Raises: ValueError if a record header has no integer score in its second field
    '''

    if n <= 0:
        return []

    # A record is a `>` header line followed by its sequence text. Requiring a
    # newline before `>` is what skips the first record (the query), because
    # nothing precedes it once the input is stripped.
    pattern = re.compile(r'\n>(.*?)\n(.*?)(?=\n>|\Z)', re.DOTALL)

    parsed_records = []
    for match in pattern.finditer(input_string.strip()):
        # Header layout: identifier <TAB> score <TAB> further fields
        header_fields = match.group(1).split('\t')
        name_full = header_fields[0]

        if len(header_fields) < 2:
            raise ValueError(
                f"Alignment header '>{match.group(1)}' has no tab-separated score field"
            )
        # int() raises ValueError by itself when the field is not an integer
        score = int(header_fields[1])

        aligned_sequence = match.group(2).strip()
        parsed_records.append((f'>{name_full}', aligned_sequence, score))

        # One tuple per sequence, so the limit is n records (not n * 2 lines)
        if len(parsed_records) >= n:
            break

    return parsed_records

In [None]:
# Collect the parsed alignments from every database into one list so they can
# be ranked together
all_parsed_dataset_output = []
for database in databases:
    print(f"Parsing results from database: {database}")

    # The a3m text for this database sits under alignments -> <database> -> a3m -> alignment
    a3m_dict_msa_search = msa_response_dict['alignments'][database]['a3m']['alignment']
    a3m_dict_msa_search_parsed = parse_sequences(a3m_dict_msa_search, 10000, sequence)

    num_sequences_aligned = len(a3m_dict_msa_search_parsed)
    print(f"num_sequences aligned {num_sequences_aligned}")

    all_parsed_dataset_output.extend(a3m_dict_msa_search_parsed)

# Rank every alignment, regardless of source database, by its score (highest first)
all_parsed_dataset_output.sort(key=lambda x: x[2], reverse=True)

# Flatten each (name, sequence, score) entry into a name line followed by a sequence line
sorted_parsed_output_formatted = [
    line
    for name_line, sequence_line, _score in all_parsed_dataset_output
    for line in (name_line, sequence_line)
]

# The query record goes first, followed by the ranked alignments
merged_alignments_protein = [f">query_sequence\n{sequence}", *sorted_parsed_output_formatted]

print(f"Merged alignments: \n {merged_alignments_protein}")

## 1.4 Set Up and Run `BOLTZ-2`

In [None]:
# Ligand to co-fold with the protein, as a SMILES string (acetylsalicylic acid / aspirin)
ligand_smiles = "CC(=O)OC1=CC=CC=C1C(=O)O"

# NOTE: `sequence` is deliberately not redefined here. The alignments produced
# by MSA-Search were computed for the `sequence` defined earlier in the
# notebook; rebinding it to a different protein would make the polymer sent
# to BOLTZ-2 disagree with the query of its own MSA.

# File the BOLTZ-2 response is saved to
output_file = Path("boltz2_output.json")

In [None]:
# Check for required BOLTZ-2 dependencies
# Each import is attempted on its own so that every missing package is
# collected and reported in a single message.
missing_deps = []
try:
    import httpx
except ImportError:
    missing_deps.append("httpx")
try:
    # NOTE(review): HTTPException is not referenced anywhere else in the visible
    # notebook -- confirm whether fastapi is still required.
    from fastapi import HTTPException
except ImportError:
    missing_deps.append("fastapi")

if missing_deps:
    # Print a ready-to-run install command, then stop execution
    print("Error: Missing required dependencies. Please install them using:")
    print(f"pip install {' '.join(missing_deps)}")
    sys.exit(1)

# Root logger is configured at DEBUG level, so library debug output is shown too
logging.basicConfig(level=logging.DEBUG)
logger = logging.getLogger(__name__)


# URL template with a `{task_id}` placeholder to be filled via str.format.
# NOTE(review): not used in the visible cells; presumably meant for polling the
# status of a pending NVCF request -- confirm.
STATUS_URL = "https://api.nvcf.nvidia.com/v2/nvcf/pexec/status/{task_id}"
    

In [None]:
boltz2_url = "https://health.api.nvidia.com/v1/biology/mit/boltz2/predict"
NVCF_POLL_SECONDS = 300
MANUAL_TIMEOUT_SECONDS = 400

payload = {
    "polymers": [
        {
        "id": "A",
        "molecule_type": "protein", 
        "sequence": sequence,
        "msa": {
            "uniref90": {
                "a3m": {
                    "alignment": merged_alignments_protein[0],
                    "format": "a3m",
                    }
                }
            }
        }
    ],
    "ligands": [
        {
        "smiles": ligand_smiles,
        "id": "L1",
        "predict_affinity" : True
        }
    ],
    "recycling_steps": 3,
    "sampling_steps": 20,
    "diffusion_samples": 1,
    "step_scale": 1.64,
    "without_potentials": True,
}

headers = {
    "Authorization": "Bearer $API_KEY_REQUIRED_IF_EXECUTING_OUTSIDE_NGC",
    "NVCF-POLL-SECONDS": f"{NVCF_POLL_SECONDS}",
    "Content-Type": "application/json"
}

# Call BOLTZ-2 NIM
code, response = await client.post(boltz2_url,
                            json=payload,
                            headers=headers,
                            timeout=MANUAL_TIMEOUT_SECONDS)

if code == 200:
    print(f"Request succeeded, returned {code}")
    response_dict = response.json()
    with open(output_file, 'w') as json_file:
        json.dump(response_dict, json_file, indent=4)
    
    # Print information about the returned structures
    print(f"Number of structures returned: {len(response_dict['structures'])}")
    print(f"Number of confidence scores: {len(response_dict['confidence_scores'])}")
    
    # Print the first structure's format and length
    if response_dict['structures']:
        first_structure = response_dict['structures'][0]
        print(f"First structure format: {first_structure['format']}")
        print(f"First structure length: {len(first_structure['structure'])} characters")
    
    # Print confidence scores
    print(f"Confidence scores: {response_dict['confidence_scores']}")