Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
SPDX-License-Identifier: Apache-2.0

# Quick Start: Running Protein Folding on AWS Batch

## Table of Contents
0. [Install Dependencies](#0.-Install-Dependencies)
1. [Create Target](#1.-Create-Target)
2. [Submit Sequence Alignment and Folding Jobs](#2.-Submit-Sequence-Alignment-and-Folding-Jobs) 
3. [Download and Visualize Results](#3.-Download-and-Visualize-Results)

## 0. Install Dependencies

In [None]:
# Install or upgrade (-U) the packages listed in notebook-requirements.txt, with quiet output (-q).
%pip install -U -q -r notebook-requirements.txt

In [None]:
# Import required Python packages

import boto3
from datetime import datetime
from batchfold.batchfold_environment import BatchFoldEnvironment
from batchfold.batchfold_target import BatchFoldTarget
from batchfold.jackhmmer_job import JackhmmerJob
from batchfold.openfold_job import OpenFoldJob
from batchfold.alphafold2_job import AlphaFold2Job
from batchfold.omegafold_job import OmegaFoldJob
from batchfold.esmfold_job import ESMFoldJob
from batchfold.utils import utils
import numpy as np

# Create the AWS session and the BatchFold environment wrapper.
# The same boto3 session object is handed to every BatchFold helper in this notebook.
boto_session = boto3.Session()
batch_environment = BatchFoldEnvironment(boto_session=boto_session)

# Bucket exposed by the environment as `default_bucket`; targets below are created against it.
S3_BUCKET = batch_environment.default_bucket
print(f" S3 bucket name is {S3_BUCKET}")

## 1. Create Target

In [None]:
# Describe the example target: its identifier, amino-acid sequence and FASTA-style description.
target_id = "7FCC"
target_sequence = "KEYDIYVSYARNAEEEEFVLLTLRGVLENEFGYKLCIFDRDSLPGGNTVEAVFDFIQRSRRMIVVLSPDYVTEKSISMLEFKLGVMCQNSIATKLIVVEYRPLEHPHPGILQLKESVSFVSWKGEKSKHSGSKFWKALRLALPLRS"
target_description = "Chain A|Isoform 4 of Interleukin-1 receptor accessory protein|Homo sapiens (9606)"

# Create the target in the default bucket and attach the single sequence to it.
target = BatchFoldTarget(
    target_id=target_id,
    s3_bucket=S3_BUCKET,
    boto_session=boto_session,
)
target.add_sequence(seq_id=target_id, seq=target_sequence, description=target_description)

## 2. Submit Sequence Alignment and Folding Jobs

List available job queues

In [None]:
# Show the job queue names reported by the BatchFold environment. As the last expression in the
# cell, the result is displayed. Presumably these are the values accepted as `job_queue_name`
# by submit_job() in the cells below -- confirm against BatchFoldEnvironment.
batch_environment.list_job_queue_names()

Submit jackHMMER job

In [None]:
# Build a unique job name: <target>_JackhmmerJob_<YYYYMMDD><Unix epoch seconds>.
# The original used strftime("%Y%m%d%s"); "%s" is a platform-specific directive that Python
# does not document and that is not available on every platform. datetime.timestamp() gives
# the epoch seconds portably and keeps the same name layout.
submitted_at = datetime.now()
job_name = (
    f"{target.target_id}_JackhmmerJob_"
    f"{submitted_at:%Y%m%d}{int(submitted_at.timestamp())}"
)

# The alignment job reads the target's FASTA file and writes its MSAs to the target's MSA prefix.
jackhmmer_job = JackhmmerJob(
    job_name=job_name,
    target_id=target.target_id,
    fasta_s3_uri=target.get_fasta_s3_uri(),
    output_s3_uri=target.get_msas_s3_uri(),
    boto_session=boto_session,
    cpu=16,
    memory=31,
)

# Keep the submission handle: the MSA-based folding jobs below pass it in `depends_on`.
jackhmmer_submission = batch_environment.submit_job(
    jackhmmer_job, job_queue_name="CPUOnDemandJobQueue"
)

Submit OpenFold job

In [None]:
# Build a unique job name: <target>_OpenFoldJob_<YYYYMMDD><Unix epoch seconds>.
# The original used strftime("%Y%m%d%s"); "%s" is a platform-specific directive that Python
# does not document and that is not available on every platform. datetime.timestamp() gives
# the epoch seconds portably and keeps the same name layout.
submitted_at = datetime.now()
job_name = (
    f"{target.target_id}_OpenFoldJob_"
    f"{submitted_at:%Y%m%d}{int(submitted_at.timestamp())}"
)

# OpenFold reads the FASTA file plus the MSAs written by the jackhmmer job, and writes its
# predictions to a per-job prefix under the target's predictions location.
openfold_job = OpenFoldJob(
    job_name=job_name,
    target_id=target.target_id,
    fasta_s3_uri=target.get_fasta_s3_uri(),
    msa_s3_uri=target.get_msas_s3_uri(),
    output_s3_uri=target.get_predictions_s3_uri() + "/" + job_name,
    boto_session=boto_session,
    cpu=4,
    memory=15,  # Why not 16? ECS needs about 1 GB for container services
    gpu=1,
)

# Submitted with a dependency on the jackhmmer submission, whose MSAs this job consumes.
openfold_submission = batch_environment.submit_job(
    openfold_job, job_queue_name="G4dnJobQueue", depends_on=[jackhmmer_submission]
)

Submit AlphaFold2 job

In [None]:
# Build a unique job name: <target>_AlphaFold2Job_<YYYYMMDD><Unix epoch seconds>.
# The original used strftime("%Y%m%d%s"); "%s" is a platform-specific directive that Python
# does not document and that is not available on every platform. datetime.timestamp() gives
# the epoch seconds portably and keeps the same name layout.
submitted_at = datetime.now()
job_name = (
    f"{target.target_id}_AlphaFold2Job_"
    f"{submitted_at:%Y%m%d}{int(submitted_at.timestamp())}"
)

# AlphaFold2 reads the FASTA file plus the MSAs written by the jackhmmer job, and writes its
# predictions to a per-job prefix under the target's predictions location.
alphafold2_job = AlphaFold2Job(
    job_name=job_name,
    target_id=target.target_id,
    fasta_s3_uri=target.get_fasta_s3_uri(),
    msa_s3_uri=target.get_msas_s3_uri(),
    output_s3_uri=target.get_predictions_s3_uri() + "/" + job_name,
    boto_session=boto_session,
    cpu=4,
    memory=15,  # Why not 16? ECS needs about 1 GB for container services
    gpu=1,
)

# Submitted with a dependency on the jackhmmer submission, whose MSAs this job consumes.
alphafold2_submission = batch_environment.submit_job(
    alphafold2_job, job_queue_name="G4dnJobQueue", depends_on=[jackhmmer_submission]
)

AlphaFold2 can also be run in CPU-only mode for long sequences.

In [None]:
# Build a unique job name: <target>_AlphaFold2Job_CPU_<YYYYMMDD><Unix epoch seconds>.
# The original used strftime("%Y%m%d%s"); "%s" is a platform-specific directive that Python
# does not document and that is not available on every platform. datetime.timestamp() gives
# the epoch seconds portably and keeps the same name layout.
submitted_at = datetime.now()
job_name = (
    f"{target.target_id}_AlphaFold2Job_CPU_"
    f"{submitted_at:%Y%m%d}{int(submitted_at.timestamp())}"
)

# Same inputs and outputs as the GPU AlphaFold2 job, but requesting no GPU (gpu=0).
alphafold2_cpu_job = AlphaFold2Job(
    job_name=job_name,
    target_id=target.target_id,
    fasta_s3_uri=target.get_fasta_s3_uri(),
    msa_s3_uri=target.get_msas_s3_uri(),
    output_s3_uri=target.get_predictions_s3_uri() + "/" + job_name,
    boto_session=boto_session,
    cpu=4,
    memory=15,  # Why not 16? ECS needs about 1 GB for container services
    gpu=0,
)

# Goes to the CPU queue rather than the GPU queue; still depends on the jackhmmer submission.
alphafold2_cpu_submission = batch_environment.submit_job(
    alphafold2_cpu_job, job_queue_name="CPUOnDemandJobQueue", depends_on=[jackhmmer_submission]
)

Submit OmegaFold job

In [None]:
# Build a unique job name: <target>_OmegaFoldJob_<YYYYMMDD><Unix epoch seconds>.
# The original used strftime("%Y%m%d%s"); "%s" is a platform-specific directive that Python
# does not document and that is not available on every platform. datetime.timestamp() gives
# the epoch seconds portably and keeps the same name layout.
submitted_at = datetime.now()
job_name = (
    f"{target.target_id}_OmegaFoldJob_"
    f"{submitted_at:%Y%m%d}{int(submitted_at.timestamp())}"
)

# OmegaFold is given only the FASTA file (no MSA URI), so it is submitted without a
# dependency on the jackhmmer job.
omegafold_job = OmegaFoldJob(
    job_name=job_name,
    target_id=target.target_id,
    fasta_s3_uri=target.get_fasta_s3_uri(),
    output_s3_uri=target.get_predictions_s3_uri() + "/" + job_name,
    boto_session=boto_session,
    cpu=4,
    memory=15,  # Why not 16? ECS needs about 1 GB for container services
    gpu=1,
)
omegafold_submission = batch_environment.submit_job(
    omegafold_job, job_queue_name="G4dnJobQueue"
)

Submit ESMFold job

In [None]:
# Build a unique job name: <target>_ESMFoldJob_<YYYYMMDD><Unix epoch seconds>.
# The original used strftime("%Y%m%d%s"); "%s" is a platform-specific directive that Python
# does not document and that is not available on every platform. datetime.timestamp() gives
# the epoch seconds portably and keeps the same name layout.
submitted_at = datetime.now()
job_name = (
    f"{target.target_id}_ESMFoldJob_"
    f"{submitted_at:%Y%m%d}{int(submitted_at.timestamp())}"
)

# ESMFold is given only the FASTA file (no MSA URI), so it is submitted without a
# dependency on the jackhmmer job.
esmfold_job = ESMFoldJob(
    job_name=job_name,
    target_id=target.target_id,
    fasta_s3_uri=target.get_fasta_s3_uri(),
    output_s3_uri=target.get_predictions_s3_uri() + "/" + job_name,
    boto_session=boto_session,
    cpu=8,
    memory=31,  # Why not 32? ECS needs about 1 GB for container services
    gpu=1,
)
esmfold_submission = batch_environment.submit_job(
    esmfold_job, job_queue_name="G4dnJobQueue"
)

Check on job statuses

In [None]:
# Print the current name and status of every job created above.
# describe_job() is called once per job and its first entry reused for both fields; the
# original called it twice per job, so the name and status could come from different responses.
for job in [
    jackhmmer_job,
    openfold_job,
    alphafold2_job,
    alphafold2_cpu_job,
    omegafold_job,
    esmfold_job,
]:
    job_description = job.describe_job()[0]
    print(
        f"Job {job_description['jobName']} is in status {job_description['status']}"
    )

## 3. Download and Visualize Results

Once the jobs have finished, download and view the results.

### Plot Alignment Data

In [None]:
# Re-create the target object for the same ID and bucket before downloading its results.
target = BatchFoldTarget(target_id=target_id, boto_session=boto_session, s3_bucket=S3_BUCKET)

In [None]:
# Download the target's MSAs under ./data, then plot the contents of the jackhmmer output folder.
target.download_msas(local_path="data")

msa_folder = f"data/{target_id}/msas/jackhmmer"
utils.plot_msa_output_folder(path=msa_folder, id=target_id)

### Plot Predicted Structures

Plot OpenFold prediction

In [None]:
# Look up the last OpenFold job recorded for this target and download its outputs under ./data.
last_job_name = target.get_last_job_name(job_type="OpenFold")

print(f"Downloading results for job {last_job_name}")
target.download_predictions(local_path="data", job=last_job_name)

# Both the structure and the metrics file are read from the job's local prediction folder.
prediction_dir = f"data/{target_id}/predictions/{last_job_name}"
pdb = f"{prediction_dir}/{target_id}_model_1_ptm_relaxed.pdb"
pkl = f"{prediction_dir}/{target_id}_model_1_ptm_output_dict.pkl"

print(f"Displaying predicted structure for job {last_job_name}")
utils.plot_banded_pdb(pdb, width=500, height=300)

# NOTE: allow_pickle=True unpickles the file; only load results produced by your own jobs.
pae_results = np.load(pkl, allow_pickle=True)["predicted_aligned_error"]
utils.plot_metrics(pdb, pae_results)

Plot AlphaFold2 prediction

In [None]:
# Look up the last AlphaFold2 job recorded for this target and download its outputs under ./data.
last_job_name = target.get_last_job_name(job_type="AlphaFold2")

print(f"Downloading results for job {last_job_name}")
target.download_predictions(local_path="data", job=last_job_name)

# Every file used below is read from the job's local prediction folder.
prediction_dir = f"data/{target_id}/predictions/{last_job_name}"

print("Identifying best model")
ranking_file = f"{prediction_dir}/ranking_debug.json"
best_model_name = utils.get_best_alphafold_model(ranking_file)

# The structure shown is ranked_0.pdb; the metrics come from the best model's result pickle.
print(f"Displaying predicted structure for model {best_model_name}")
pdb = f"{prediction_dir}/ranked_0.pdb"
utils.plot_banded_pdb(pdb)

# NOTE: allow_pickle=True unpickles the file; only load results produced by your own jobs.
pkl = f"{prediction_dir}/result_{best_model_name}.pkl"
pae_results = np.load(pkl, allow_pickle=True)["predicted_aligned_error"]
utils.plot_metrics(pdb, pae_results)

Plot OmegaFold prediction

In [None]:
# Look up the last OmegaFold job recorded for this target and download its outputs under ./data.
last_job_name = target.get_last_job_name(job_type="OmegaFold")

print(f"Downloading results for job {last_job_name}")
target.download_predictions(local_path="data", job=last_job_name)

# The structure file is named after the target; no PAE data is passed to plot_metrics here.
prediction_dir = f"data/{target_id}/predictions/{last_job_name}"
pdb = f"{prediction_dir}/{target_id}.pdb"

print(f"Displaying predicted structure for job {last_job_name}")
utils.plot_banded_pdb(pdb)
utils.plot_metrics(pdb)

Plot ESMFold prediction

In [None]:
# Look up the last ESMFold job recorded for this target and download its outputs under ./data.
last_job_name = target.get_last_job_name(job_type="ESMFold")

print(f"Downloading results for job {last_job_name}")
target.download_predictions(local_path="data", job=last_job_name)

# The structure is read from a fixed file name, prediction.pdb; no PAE data is passed to plot_metrics here.
prediction_dir = f"data/{target_id}/predictions/{last_job_name}"
pdb = f"{prediction_dir}/prediction.pdb"

print(f"Displaying predicted structure for job {last_job_name}")
utils.plot_banded_pdb(pdb)
utils.plot_metrics(pdb)