Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
SPDX-License-Identifier: Apache-2.0

# Using the AWS Batch Architecture for Protein Folding

This notebook allows you to predict protein structures on AWS Batch. 

**Citing this work**

Any publication that discloses findings arising from using this notebook should [cite](https://github.com/deepmind/alphafold/#citing-this-work) the [AlphaFold paper](https://doi.org/10.1038/s41586-021-03819-2).

**Licenses**

Please refer to the `LICENSE` and `THIRD-PARTY-NOTICES` files for more information about third-party software and licensing.

## Table of Contents
0. [Install Dependencies](#0.-Install-Dependencies)
1. [Run a monomer analysis job](#1.-Run-a-monomer-analysis-job)
2. [Run a multimer analysis job](#2.-Run-a-multimer-analysis-job) 
3. [Analyze multiple proteins](#3.-Analyze-multiple-proteins)

## 0. Install Dependencies

In [1]:
# Import required Python packages

import boto3
from datetime import datetime
# from nbhelpers import nbhelpers
import pandas as pd
import sagemaker

# Show full column contents instead of truncating long strings (e.g. S3 URIs).
# The fully-qualified option name is used because abbreviated names only work
# through pandas' partial-match lookup.
pd.set_option("display.max_colwidth", None)

In [2]:
# Get client information

# Name of a profile from your local AWS config to use for credentials.
# Leave as None to let boto3 resolve credentials through its default chain
# (environment variables such as AWS_PROFILE, shared config, or an IAM role),
# which keeps the notebook runnable for anyone, not just one user's profile.
AWS_PROFILE_NAME = None

boto_session = boto3.session.Session(profile_name=AWS_PROFILE_NAME)
sm_session = sagemaker.session.Session(boto_session)
region = boto_session.region_name

# Create both service clients against the session's region.
s3_client = boto_session.client("s3", region_name=region)
batch_client = boto_session.client("batch", region_name=region)

# Bucket used for the target data uploaded by the cells below.
S3_BUCKET = sm_session.default_bucket()
print(f" S3 bucket name is {S3_BUCKET}")

 S3 bucket name is sagemaker-us-east-2-032243382548


## 1. Run a monomer analysis job

Provide sequences for analysis

In [3]:
from batchfold.batchfold_target import BatchFoldTarget

# Step 1: create the target record for "T1084" in the notebook's S3 bucket.
target = BatchFoldTarget(
    target_id="T1084",
    s3_bucket=S3_BUCKET,
    s3_base_prefix="T1084",
    boto_session=boto_session,
)

# Step 2: attach the amino-acid sequence to analyze.
target = target.add_sequence(
    seq="MAAHKGAEHHHKAAEHHEQAAKHHHAAAEHHEKGEHEQAAHHADTAYAHHKHAEEHAAQAAKHDAEHHAPKPH",
    description="Meio, Meiothermus silvanus, 73 residues|",
)

# Step 3: call upload_fasta(); `target` ends up bound to whatever this final
# call returns, exactly as in a single chained expression.
target = target.upload_fasta()

In [4]:
from batchfold.batchfold_environment import BatchFoldEnvironment
from batchfold.jackhmmer_job import JackhmmerJob
from batchfold.mmseqs2_job import MMseqs2Job
from batchfold.openfold_job import OpenFoldJob
from batchfold.alphafold2_job import AlphaFold2Job


batch_environment = BatchFoldEnvironment(boto_session=boto_session)

# One timestamp shared by every job defined in this cell so that related jobs
# are easy to correlate. Only documented, portable strftime directives are
# used here; "%s" is a platform-specific extension and is not available on
# every operating system.
job_timestamp = datetime.now().strftime("%Y%m%dT%H%M%S")

# --- MSA generation jobs -----------------------------------------------------
# Both MSA jobs read the target's FASTA file and write to the target's MSA
# location (target.get_msas_s3_uri()).
# NOTE(review): `memory` is presumably in GiB - confirm against the job classes.

jackhmmer_job = JackhmmerJob(
    job_name=f"{target.target_id}_JackhmmerJob_{job_timestamp}",
    target_id=target.target_id,
    fasta_s3_uri=target.get_fasta_s3_uri(),
    output_s3_uri=target.get_msas_s3_uri(),
    use_small_bfd=True,
    boto_session=boto_session,
    cpu=4,
    memory=16,
)

mmseqs2_job = MMseqs2Job(
    job_name=f"{target.target_id}_MMSeqs2Job_{job_timestamp}",
    target_id=target.target_id,
    fasta_s3_uri=target.get_fasta_s3_uri(),
    output_s3_uri=target.get_msas_s3_uri(),
    boto_session=boto_session,
    cpu=64,
    memory=500,
)

# --- Structure prediction jobs -----------------------------------------------
# Both prediction jobs are configured with use_precomputed_msas=True and read
# MSAs from the same location the MSA jobs above write to; each requests 1 GPU.

alphafold2_job = AlphaFold2Job(
    job_name=f"{target.target_id}_AlphaFold2Job_{job_timestamp}",
    target_id=target.target_id,
    fasta_s3_uri=target.get_fasta_s3_uri(),
    msa_s3_uri=target.get_msas_s3_uri(),
    output_s3_uri=target.get_predictions_s3_uri(),
    use_precomputed_msas=True,
    model_preset="monomer",
    boto_session=boto_session,
    cpu=4,
    memory=16,
    gpu=1,
)

openfold_job = OpenFoldJob(
    job_name=f"{target.target_id}_OpenFoldJob_{job_timestamp}",
    target_id=target.target_id,
    fasta_s3_uri=target.get_fasta_s3_uri(),
    msa_s3_uri=target.get_msas_s3_uri(),
    output_s3_uri=target.get_predictions_s3_uri(),
    use_precomputed_msas=True,
    config_preset="finetuning_ptm",
    openfold_checkpoint_path="openfold_params/finetuning_ptm_1.pt",
    save_outputs=True,
    boto_session=boto_session,
    cpu=4,
    memory=16,
    gpu=1,
)

In [5]:
# Submit the two MSA-generation jobs. Each submission must pass its own job
# object; submitting jackhmmer_job twice would leave mmseqs2_job unsubmitted.
jackhmmer_submission = batch_environment.submit_job(
    jackhmmer_job, job_queue_name="GravitonSpotJobQueue"
)
# NOTE(review): queue name kept as in the original cell - confirm this queue
# can satisfy the resources requested by mmseqs2_job.
mmseqs2_submission = batch_environment.submit_job(
    mmseqs2_job, job_queue_name="GravitonSpotJobQueue"
)

In [6]:
# Submit both structure-prediction jobs to the same job queue.
# NOTE(review): both jobs were created with use_precomputed_msas=True, but
# nothing in this cell makes them wait for the MSA jobs - presumably the MSA
# jobs should have finished before this cell is run; confirm.
gpu_queue_name = "G4dnJobQueue"

openfold_submission = batch_environment.submit_job(
    openfold_job,
    job_queue_name=gpu_queue_name,
)
alphafold2_submission = batch_environment.submit_job(
    alphafold2_job,
    job_queue_name=gpu_queue_name,
)