Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
SPDX-License-Identifier: Apache-2.0

# Using the AWS Batch Architecture for Protein Folding

This notebook shows how to predict the structures of multiple protein sequences from the CAMEO data set released between 2022-04-08 and 2022-07-02.

## Table of Contents
0. [Install Dependencies](#0.-Install-Dependencies)
1. [Analyze CAMEO targets](#1.-Analyze-Cameo-targets)

## 0. Install Dependencies

In [1]:
# Import required Python packages

import boto3
from datetime import datetime
from batchfold.batchfold_environment import BatchFoldEnvironment
from batchfold.jackhmmer_job import JackhmmerJob
from batchfold.openfold_job import OpenFoldJob
from batchfold.alphafold2_job import AlphaFold2Job
from batchfold.batchfold_target import BatchFoldTarget
import os

# Get client information

# Named AWS credentials profile used to build the boto3 session below.
AWS_PROFILE_NAME = "bloyal+proteinfolding-Admin"

# One session is created here and handed to the BatchFold environment.
boto_session = boto3.session.Session(profile_name=AWS_PROFILE_NAME)
batch_environment = BatchFoldEnvironment(boto_session=boto_session)

# Bucket reported by the environment as its default; printed for reference.
S3_BUCKET = batch_environment.default_bucket
print(f" S3 bucket name is {S3_BUCKET}")

 S3 bucket name is batch-protein-folding-2207169-c-batchfolds3bucket-13oducmieryqz


## 1. Analyze Cameo targets

In [None]:
# For every FASTA file in data_dir: upload it, then submit a Jackhmmer MSA job
# followed by OpenFold and AlphaFold2 prediction jobs that consume its output.
data_dir = "data/fasta/single-chain"

# Maps each target ID to [jackhmmer, openfold, alphafold2] submissions.
submission_info = {}

# Sorted so that targets are submitted in a reproducible order.
for file in sorted(os.listdir(data_dir)):
    fasta_path = os.path.join(data_dir, file)
    target_id = file.split(".")[0]

    # Skip subdirectories and dotfiles (e.g. ".DS_Store"); the latter would
    # otherwise produce an empty target ID.
    if not target_id or not os.path.isfile(fasta_path):
        continue

    print(target_id)
    target = BatchFoldTarget(
        target_id=target_id, s3_bucket=S3_BUCKET, boto_session=boto_session
    )
    target.add_fasta(fasta_path)
    target.upload_fasta()

    # Date plus epoch seconds, shared by the three job names for this target.
    # Built explicitly because the "%s" strftime directive is a platform
    # extension that is not among Python's documented format codes.
    now = datetime.now()
    job_suffix = f"{now:%Y%m%d}{int(now.timestamp())}"

    # MSA generation: reads the uploaded FASTA, writes to the target's MSA URI.
    jackhmmer_job = JackhmmerJob(
        job_name=f"{target.target_id}_JackhmmerJob_{job_suffix}",
        target_id=target.target_id,
        fasta_s3_uri=target.get_fasta_s3_uri(),
        output_s3_uri=target.get_msas_s3_uri(),
        boto_session=boto_session,
        cpu=16,
        memory=32,
    )

    # AlphaFold2 prediction using the MSAs written by the Jackhmmer job.
    alphafold2_job = AlphaFold2Job(
        job_name=f"{target.target_id}_AlphaFold2Job_{job_suffix}",
        target_id=target.target_id,
        fasta_s3_uri=target.get_fasta_s3_uri(),
        msa_s3_uri=target.get_msas_s3_uri(),
        output_s3_uri=target.get_predictions_s3_uri(),
        use_precomputed_msas=True,
        model_preset="monomer",
        boto_session=boto_session,
        benchmark=True,
        cpu=4,
        memory=16,
        gpu=1,
    )

    # OpenFold prediution using the same MSAs and the same predictions URI.
    openfold_job = OpenFoldJob(
        job_name=f"{target.target_id}_OpenFoldJob_{job_suffix}",
        target_id=target.target_id,
        fasta_s3_uri=target.get_fasta_s3_uri(),
        msa_s3_uri=target.get_msas_s3_uri(),
        output_s3_uri=target.get_predictions_s3_uri(),
        use_precomputed_msas=True,
        config_preset="finetuning_ptm",
        openfold_checkpoint_path="openfold_params/finetuning_ptm_1.pt",
        save_outputs=True,
        boto_session=boto_session,
        cpu=4,
        memory=16,
        gpu=1,
    )

    # Both prediction jobs are submitted with a dependency on the MSA job.
    jackhmmer_submission = batch_environment.submit_job(
        jackhmmer_job, job_queue_name="GravitonSpotJobQueue"
    )
    openfold_submission = batch_environment.submit_job(
        openfold_job,
        job_queue_name="G4dnJobQueue",
        depends_on=[jackhmmer_submission],
    )
    alphafold2_submission = batch_environment.submit_job(
        alphafold2_job,
        job_queue_name="G4dnJobQueue",
        depends_on=[jackhmmer_submission],
    )

    # Keyed by target.target_id, the attribute used everywhere else in this
    # cell (the original used target.id here).
    submission_info[target.target_id] = [
        jackhmmer_submission,
        openfold_submission,
        alphafold2_submission,
    ]
