Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
SPDX-License-Identifier: Apache-2.0

# Quick Start: Running Protein Folding on AWS Batch

## Table of Contents
0. [Install Dependencies](#0.-Install-Dependencies)
1. [Create Target](#1.-Create-Target)
2. [Submit Sequence Alignment and Folding Jobs](#2.-Submit-Sequence-Alignment-and-Folding-Jobs) 
3. [Download and Visualize Results](#3.-Download-and-Visualize-Results)

## 0. Install Dependencies

In [2]:
%pip install -U -q -r notebook-requirements.txt

[33m  DEPRECATION: batchfold is being installed using the legacy 'setup.py install' method, because it does not have a 'pyproject.toml' and the 'wheel' package is not installed. pip 23.1 will enforce this behaviour change. A possible replacement is to enable the '--use-pep517' option. Discussion can be found at https://github.com/pypa/pip/issues/8559[0m[33m
[0mNote: you may need to restart the kernel to use updated packages.


In [1]:
# Show which AWS identity (user, account, role ARN) the AWS CLI's current credentials resolve to.
!aws sts get-caller-identity

{
    "UserId": "AROASN64FTRLFIOXDZ6MI:bloyal-Isengard",
    "Account": "167428594774",
    "Arn": "arn:aws:sts::167428594774:assumed-role/Admin/bloyal-Isengard"
}


In [2]:
# Import required Python packages

import boto3
from datetime import datetime
import matplotlib.pyplot as plt
from batchfold.batchfold_environment import BatchFoldEnvironment
from batchfold.batchfold_target import BatchFoldTarget
from batchfold.jackhmmer_job import JackhmmerJob
from batchfold.openfold_job import OpenFoldJob
from batchfold.alphafold2_job import AlphaFold2Job
from batchfold.omegafold_job import OmegaFoldJob
from batchfold.esmfold_job import ESMFoldJob
from batchfold.utils import utils
from IPython import display
import numpy as np

# Create the AWS session.
# No profile name is hard-coded: boto3 resolves credentials through its default
# chain (environment variables, the AWS_PROFILE variable, the shared
# config/credentials files, or an attached role), so the notebook runs
# unchanged in any account. To use a specific named profile, set AWS_PROFILE
# before starting Jupyter or pass profile_name="<your-profile>" here.
boto_session = boto3.session.Session()

# Environment handle used by the cells below to list job queues and submit jobs.
batch_environment = BatchFoldEnvironment(boto_session=boto_session)

# Bucket reported by the environment; later cells pass it to BatchFoldTarget.
S3_BUCKET = batch_environment.default_bucket
print(f" S3 bucket name is {S3_BUCKET}")

 S3 bucket name is batchfold-221102-batchfolds3bucket-1byh6n52qfaov


## 1. Create Target

In [3]:
# Identifier reused for the target, its sequence, and the local/S3 paths built in later cells.
target_id = "7FCC"
target = BatchFoldTarget(
    target_id=target_id,
    s3_bucket=S3_BUCKET,
    boto_session=boto_session,
)

# Kept as the final expression so the value returned by add_sequence is shown as the cell output.
target.add_sequence(
    description="Chain A|Isoform 4 of Interleukin-1 receptor accessory protein|Homo sapiens (9606)",
    seq="KEYDIYVSYARNAEEEEFVLLTLRGVLENEFGYKLCIFDRDSLPGGNTVEAVFDFIQRSRRMIVVLSPDYVTEKSISMLEFKLGVMCQNSIATKLIVVEYRPLEHPHPGILQLKESVSFVSWKGEKSKHSGSKFWKALRLALPLRS",
    seq_id=target_id,
)

's3://batchfold-221102-batchfolds3bucket-1byh6n52qfaov/7FCC/fastas/7FCC.fasta'

## 2. Submit Sequence Alignment and Folding Jobs

List available job queues

In [4]:
# Display the job queue names known to the environment; the submit cells
# below pass one of these names as job_queue_name.
batch_environment.list_job_queue_names()

['G4dnJobQueue', 'GravitonOnDemandJobQueue', 'GravitonSpotJobQueue']

Submit jackHMMER job

In [5]:
# Unique job name: <target>_JackhmmerJob_<YYYYMMDD><epoch seconds>.
# The epoch part comes from timestamp() instead of strftime("%s"): "%s" is a
# glibc extension that is not among Python's documented format codes and is
# not available on every platform (e.g. Windows). Both parts are derived from
# a single clock reading.
submitted_at = datetime.now()
job_name = f"{target.target_id}_JackhmmerJob_{submitted_at:%Y%m%d}{int(submitted_at.timestamp())}"
jackhmmer_job = JackhmmerJob(
    job_name=job_name,
    target_id=target.target_id,
    fasta_s3_uri=target.get_fasta_s3_uri(),
    output_s3_uri=target.get_msas_s3_uri(),
    boto_session=boto_session,
    cpu=16,
    memory=31,  # presumably GB, 1 below 32 like the folding jobs' headroom - TODO confirm units
)
# The returned submission is what the OpenFold and AlphaFold2 cells pass in depends_on.
jackhmmer_submission = batch_environment.submit_job(
    jackhmmer_job, job_queue_name="GravitonSpotJobQueue"
)

Submit OpenFold job

In [6]:
# Unique job name: <target>_OpenFoldJob_<YYYYMMDD><epoch seconds>.
# The epoch part comes from timestamp() instead of strftime("%s"): "%s" is a
# glibc extension that is not among Python's documented format codes and is
# not available on every platform (e.g. Windows). Both parts are derived from
# a single clock reading.
submitted_at = datetime.now()
job_name = f"{target.target_id}_OpenFoldJob_{submitted_at:%Y%m%d}{int(submitted_at.timestamp())}"
openfold_job = OpenFoldJob(
    job_name=job_name,
    target_id=target.target_id,
    fasta_s3_uri=target.get_fasta_s3_uri(),
    msa_s3_uri=target.get_msas_s3_uri(),
    # Each run writes under its own job-named prefix of the predictions location.
    output_s3_uri=target.get_predictions_s3_uri() + "/" + job_name,
    boto_session=boto_session,
    cpu=4,
    memory=15,  # Why not 16? ECS needs about 1 GB for container services
    gpu=1,
)
# depends_on ties this job to the jackhmmer submission, whose output location
# (target.get_msas_s3_uri()) is this job's msa_s3_uri.
openfold_submission = batch_environment.submit_job(
    openfold_job, job_queue_name="G4dnJobQueue", depends_on=[jackhmmer_submission]
)

Submit AlphaFold2 job

In [7]:
# Unique job name: <target>_AlphaFold2Job_<YYYYMMDD><epoch seconds>.
# The epoch part comes from timestamp() instead of strftime("%s"): "%s" is a
# glibc extension that is not among Python's documented format codes and is
# not available on every platform (e.g. Windows). Both parts are derived from
# a single clock reading.
submitted_at = datetime.now()
job_name = f"{target.target_id}_AlphaFold2Job_{submitted_at:%Y%m%d}{int(submitted_at.timestamp())}"
alphafold2_job = AlphaFold2Job(
    job_name=job_name,
    target_id=target.target_id,
    fasta_s3_uri=target.get_fasta_s3_uri(),
    msa_s3_uri=target.get_msas_s3_uri(),
    # Each run writes under its own job-named prefix of the predictions location.
    output_s3_uri=target.get_predictions_s3_uri() + "/" + job_name,
    boto_session=boto_session,
    cpu=4,
    memory=15,  # Why not 16? ECS needs about 1 GB for container services
    gpu=1,
)
# depends_on ties this job to the jackhmmer submission, whose output location
# (target.get_msas_s3_uri()) is this job's msa_s3_uri.
alphafold2_submission = batch_environment.submit_job(
    alphafold2_job, job_queue_name="G4dnJobQueue", depends_on=[jackhmmer_submission]
)

Submit OmegaFold job

In [10]:
# Unique job name: <target>_OmegaFoldJob_<YYYYMMDD><epoch seconds>.
# The epoch part comes from timestamp() instead of strftime("%s"): "%s" is a
# glibc extension that is not among Python's documented format codes and is
# not available on every platform (e.g. Windows). Both parts are derived from
# a single clock reading.
submitted_at = datetime.now()
job_name = f"{target.target_id}_OmegaFoldJob_{submitted_at:%Y%m%d}{int(submitted_at.timestamp())}"
omegafold_job = OmegaFoldJob(
    job_name=job_name,
    target_id=target.target_id,
    fasta_s3_uri=target.get_fasta_s3_uri(),
    # Each run writes under its own job-named prefix of the predictions location.
    output_s3_uri=target.get_predictions_s3_uri() + "/" + job_name,
    boto_session=boto_session,
    cpu=4,
    memory=15,  # Why not 16? ECS needs about 1 GB for container services
    gpu=1,
)
# No msa_s3_uri and no depends_on here: this job is submitted independently of the jackhmmer job.
omegafold_submission = batch_environment.submit_job(
    omegafold_job, job_queue_name="G4dnJobQueue"
)

Submit ESMFold job

In [11]:
# Unique job name: <target>_ESMFoldJob_<YYYYMMDD><epoch seconds>.
# The epoch part comes from timestamp() instead of strftime("%s"): "%s" is a
# glibc extension that is not among Python's documented format codes and is
# not available on every platform (e.g. Windows). Both parts are derived from
# a single clock reading.
submitted_at = datetime.now()
job_name = f"{target.target_id}_ESMFoldJob_{submitted_at:%Y%m%d}{int(submitted_at.timestamp())}"
esmfold_job = ESMFoldJob(
    job_name=job_name,
    target_id=target.target_id,
    fasta_s3_uri=target.get_fasta_s3_uri(),
    # Each run writes under its own job-named prefix of the predictions location.
    output_s3_uri=target.get_predictions_s3_uri() + "/" + job_name,
    boto_session=boto_session,
    cpu=8,
    memory=31,  # Why not 32? ECS needs about 1 GB for container services
    gpu=1,
)
# No msa_s3_uri and no depends_on here: this job is submitted independently of the jackhmmer job.
esmfold_submission = batch_environment.submit_job(
    esmfold_job, job_queue_name="G4dnJobQueue"
)

Check on job statuses

In [12]:
# Report the current status of every submitted job.
# describe_job() is called once per job and its first entry reused, so the
# printed name and status come from the same response (and the number of
# describe calls is halved).
for job in [jackhmmer_job, openfold_job, alphafold2_job, omegafold_job, esmfold_job]:
    job_description = job.describe_job()[0]
    print(
        f"Job {job_description['jobName']} is in status {job_description['status']}"
    )

Job 7FCC_JackhmmerJob_202212211671630902 is in status RUNNABLE
Job 7FCC_OpenFoldJob_202212211671630905 is in status PENDING
Job 7FCC_AlphaFold2Job_202212211671630906 is in status PENDING
Job 7FCC_OmegaFoldJob_202212211671630908 is in status RUNNABLE
Job 7FCC_ESMFoldJob_202212211671630909 is in status RUNNABLE


## 3. Download and Visualize Results

Once the jobs have finished (the status check above reports `SUCCEEDED` for each of them), download and view the results.

### Plot Alignment Data

In [None]:
# Re-create the target handle with the same arguments used in section 1.
target = BatchFoldTarget(target_id=target_id, s3_bucket=S3_BUCKET, boto_session=boto_session)

In [None]:
# Fetch the target's MSA files into ./data, then plot the jackhmmer folder.
target.download_msas(local_path="data")
msa_dir = f"data/{target_id}/msas/jackhmmer"
utils.plot_msa_output_folder(path=msa_dir, id=target_id)

### Plot Predicted Structures

Plot OpenFold prediction

In [None]:
# Look up the most recent OpenFold job name and the local folder its results download into.
last_job_name = target.get_last_job_name(job_type="OpenFold")
prediction_dir = f"data/{target_id}/predictions/{last_job_name}"

print(f"Downloading results for job {last_job_name}")
target.download_predictions(local_path="data", job=last_job_name)

print(f"Displaying predicted structure for job {last_job_name}")
pdb = f"{prediction_dir}/{target_id}_model_1_ptm_relaxed.pdb"
utils.plot_banded_pdb(pdb, width=500, height=300)

# NOTE(review): allow_pickle=True lets np.load unpickle this file, and
# unpickling can execute arbitrary code - only load results you trust.
pkl = f"{prediction_dir}/{target_id}_model_1_ptm_output_dict.pkl"
pae_results = np.load(pkl, allow_pickle=True)["predicted_aligned_error"]
utils.plot_metrics(pdb, pae_results)

Plot AlphaFold2 prediction

In [None]:
# Look up the most recent AlphaFold2 job name and the local folder its results download into.
last_job_name = target.get_last_job_name(job_type="AlphaFold2")
prediction_dir = f"data/{target_id}/predictions/{last_job_name}"

print(f"Downloading results for job {last_job_name}")
target.download_predictions(local_path="data", job=last_job_name)

print("Identifying best model")
best_model_name = utils.get_best_alphafold_model(f"{prediction_dir}/ranking_debug.json")

print(f"Displaying predicted structure for model {best_model_name}")
# The structure plotted is ranked_0.pdb; best_model_name only selects which result pickle is read below.
pdb = f"{prediction_dir}/ranked_0.pdb"
utils.plot_banded_pdb(pdb)

# NOTE(review): allow_pickle=True lets np.load unpickle this file, and
# unpickling can execute arbitrary code - only load results you trust.
pkl = f"{prediction_dir}/result_{best_model_name}.pkl"
pae_results = np.load(pkl, allow_pickle=True)["predicted_aligned_error"]
utils.plot_metrics(pdb, pae_results)

Plot OmegaFold prediction

In [None]:
# Look up the most recent OmegaFold job name and the local folder its results download into.
last_job_name = target.get_last_job_name(job_type="OmegaFold")
prediction_dir = f"data/{target_id}/predictions/{last_job_name}"

print(f"Downloading results for job {last_job_name}")
target.download_predictions(local_path="data", job=last_job_name)

print(f"Displaying predicted structure for job {last_job_name}")
pdb = f"{prediction_dir}/{target_id}.pdb"
utils.plot_banded_pdb(pdb)
# Only the PDB is passed here; no predicted-aligned-error data is loaded for this job type.
utils.plot_metrics(pdb)

Plot ESMFold prediction

In [None]:
# Look up the most recent ESMFold job name and the local folder its results download into.
last_job_name = target.get_last_job_name(job_type="ESMFold")
prediction_dir = f"data/{target_id}/predictions/{last_job_name}"

print(f"Downloading results for job {last_job_name}")
target.download_predictions(local_path="data", job=last_job_name)

print(f"Displaying predicted structure for job {last_job_name}")
pdb = f"{prediction_dir}/{target_id}.pdb"
utils.plot_banded_pdb(pdb)
# Only the PDB is passed here; no predicted-aligned-error data is loaded for this job type.
utils.plot_metrics(pdb)