Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
SPDX-License-Identifier: Apache-2.0

# Quick Start: Running Protein Folding on AWS Batch

## Table of Contents
0. [Install Dependencies](#0.-Install-Dependencies)
1. [Create Target](#1.-Create-Target)
2. [Submit Sequence Alignment and Folding Jobs](#2.-Submit-Sequence-Alignment-and-Folding-Jobs) 
3. [Download and Visualize Results](#3.-Download-and-Visualize-Results)

## 0. Install Dependencies

In [None]:
# Install or upgrade (-U) the packages listed in notebook-requirements.txt,
# with pip's output reduced (-q). Uses the IPython %pip magic.
%pip install -U -q -r notebook-requirements.txt

In [None]:
# Import required Python packages

import boto3
from datetime import datetime
from batchfold.batchfold_environment import BatchFoldEnvironment
from batchfold.batchfold_target import BatchFoldTarget
from batchfold.diffdock_job import DiffDockJob
from batchfold.esmfold_job import ESMFoldJob
from batchfold.utils import utils
import numpy as np
import os
import py3Dmol

# A single boto3 session is created here and handed to every BatchFold
# object constructed in this notebook.
boto_session = boto3.Session()

# Environment object used below to list job queues and submit jobs.
batch_environment = BatchFoldEnvironment(boto_session=boto_session)

# Keep the environment's default bucket name; the target is created with it.
S3_BUCKET = batch_environment.default_bucket
print(f" S3 bucket name is {S3_BUCKET}")

## 1. Create Target

In [None]:
# Identifier shared by the target object and the sequence added to it.
target_id = "6W70"

# Create the target against the bucket and session set up earlier.
target = BatchFoldTarget(
    target_id=target_id,
    s3_bucket=S3_BUCKET,
    boto_session=boto_session,
)

# Attach the amino-acid sequence to the target under the same identifier.
target.add_sequence(
    description="De novo designed ABLE|synthetic construct (32630)",
    seq_id=target_id,
    seq="SVKSEYAEAAAVGQEAVAVFNTMKAAFQNGDKEAVAQYLARLASLYTRHEELLNRILEKARREGNKEAVTLMNEFTATFQTGKSIFNAMVAAFKNGDDDSFESYLQALEKVTAKGETLADQIAKAL",
)

## 2. Submit Sequence Alignment and Folding Jobs

List available job queues

In [3]:
# Show the job queue names known to the environment; one of these names is
# passed as job_queue_name when submitting the jobs below.
batch_environment.list_job_queue_names()

['G4dnJobQueue', 'GravitonOnDemandJobQueue', 'GravitonSpotJobQueue']

Submit ESMFold job

In [4]:
# Build a unique job name: <target>_ESMFoldJob_<YYYYMMDD><Unix epoch seconds>.
# The epoch part previously came from strftime("%s"), a platform-specific
# directive that is not among Python's documented format codes; timestamp()
# produces the same digits portably.
submitted_at = datetime.now()
job_name = (
    f"{target.target_id}_ESMFoldJob_"
    f"{submitted_at.strftime('%Y%m%d')}{int(submitted_at.timestamp())}"
)

# Describe the ESMFold job: it reads the target's FASTA file from S3 and
# writes its output under the target's predictions prefix, in a folder named
# after the job.
esmfold_job = ESMFoldJob(
    job_name=job_name,
    target_id=target.target_id,
    fasta_s3_uri=target.get_fasta_s3_uri(),
    output_s3_uri=target.get_predictions_s3_uri() + "/" + job_name,
    boto_session=boto_session,
    cpu=8,
    memory=31,  # Why not 32? ECS needs about 1 GB for container services
    gpu=1,
)

# Submit to the GPU queue; the returned submission is kept so that later jobs
# can depend on it.
esmfold_submission = batch_environment.submit_job(
    esmfold_job, job_queue_name="G4dnJobQueue"
)

Submit DiffDock job

In [5]:
# Build a unique job name: <target>_DiffDockJob_<YYYYMMDD><Unix epoch seconds>.
# The epoch part previously came from strftime("%s"), a platform-specific
# directive that is not among Python's documented format codes; timestamp()
# produces the same digits portably.
submitted_at = datetime.now()
job_name = (
    f"{target.target_id}_DiffDockJob_"
    f"{submitted_at.strftime('%Y%m%d')}{int(submitted_at.timestamp())}"
)

# Describe the DiffDock job. The protein input is the PDB file that the
# ESMFold job submitted above writes to its output folder.
diffdock_job = DiffDockJob(
    job_name=job_name,
    complex_name=target.target_id,
    # S3 URIs always use "/" as the separator, so join explicitly instead of
    # using os.path.join (which follows the local OS path convention).
    protein_s3_uri="/".join(
        [
            target.get_predictions_s3_uri(),
            esmfold_submission.job_name,
            f"{target.target_id}.pdb",
        ]
    ),
    # Ligand given as a SMILES string.
    ligand_description="COc1ccc(cc1)n2c3c(c(n2)C(=O)N)CCN(C3=O)c4ccc(cc4)N5CCCCC5=O",
    output_s3_uri=target.get_predictions_s3_uri() + "/" + job_name,
    boto_session=boto_session,
    cpu=4,
    memory=15,  # Why not 16? ECS needs about 1 GB for container services
    gpu=1,
)

# Submit with a dependency on the ESMFold submission so docking only starts
# after the folded structure is available.
diffdock_submission = batch_environment.submit_job(
    diffdock_job, job_queue_name="G4dnJobQueue", depends_on=[esmfold_submission]
)

Check on job statuses

In [6]:
# Print the current name and status of each submitted job.
for job in (esmfold_job, diffdock_job):
    # Call describe_job() once per job so the name and the status are read
    # from the same response (previously it was called twice per job).
    job_detail = job.describe_job()[0]
    print(f"Job {job_detail['jobName']} is in status {job_detail['status']}")

Job 6W70_ESMFoldJob_202304071680893021 is in status RUNNABLE
Job 6W70_DiffDockJob_202304071680893022 is in status PENDING


## 3. Download and Visualize Results

Once both jobs have finished (re-run the status cell above to check), download and view the results.

In [8]:
def _announce_latest(job_type):
    """Look up the target's last job name for *job_type*, print it, return it."""
    latest = target.get_last_job_name(job_type=job_type)
    print(f"Downloading results for job {latest}")
    return latest


# ESMFold results first, then DiffDock; both are downloaded under ./data.
last_esmfold_job_name = _announce_latest("ESMFold")
target.download_predictions(local_path="data", job=last_esmfold_job_name)

last_diffdock_job_name = _announce_latest("DiffDock")
target.download_predictions(local_path="data", job=last_diffdock_job_name)

Downloading results for job 6W70_ESMFoldJob_202304071680890559
2 files downloaded from s3.
0 files downloaded from s3.
Downloading results for job 6W70_DiffDockJob_202304071680887263
22 files downloaded from s3.
0 files downloaded from s3.
0 files downloaded from s3.


'/home/ec2-user/SageMaker/batch-protein-folding-code-repo-024bbdbedfbf/notebooks/data'

Plot ESMFold and DiffDock results

In [9]:
view = py3Dmol.view(width=800, height=800)

# Local folder holding the predictions downloaded for this target.
predictions_dir = f"data/{target.target_id}/predictions"

# Model 0: the ESMFold structure (PDB). The file name is derived from the
# target ID rather than hard-coded, and the file is closed after reading.
with open(
    f"{predictions_dir}/{last_esmfold_job_name}/{target.target_id}.pdb"
) as protein_file:
    view.addModel(protein_file.read(), "pdb")
view.setStyle({"model": 0}, {"cartoon": {"color": "spectrum"}})
view.setStyle({"model": 0, "hetflag": True}, {"stick": {"color": "spectrum"}})

# Model 1: the rank-1 DiffDock output (a PDB file, not SDF), loaded as
# animation frames.
with open(
    f"{predictions_dir}/{last_diffdock_job_name}/{target.target_id}/rank1_reverseprocess.pdb"
) as ligand_file:
    view.addModelsAsFrames(ligand_file.read(), "pdb")
view.setStyle({"model": 1}, {"stick": {"color": "#ff0000"}})
# NOTE(review): 3Dmol.js appears to document setViewStyle as taking a single
# style object - confirm the leading selection argument is honoured here.
view.setViewStyle({"model": 1}, {"style": "outline", "color": "black", "width": 0.1})

# Play the frames in a forward loop, fit the scene, and render it.
view.animate({"loop": "forward"})
view.zoomTo()
view.show()

Clean up

In [10]:
# Remove the files downloaded for this target. The path follows target_id
# instead of a hard-coded "6W70", and shutil.rmtree works without a POSIX
# shell; ignore_errors=True keeps the "rm -rf" behaviour of not failing when
# the directory is already gone.
import shutil

shutil.rmtree(os.path.join("data", target.target_id), ignore_errors=True)