# AWS-RoseTTAFold: Bulk Job Analysis

## I. Introduction

This notebook demonstrates how to analyze multiple proteins simultaneously, in this case a subset of the CASP14 target set.

## II. Environment setup

In [None]:
## Install dependencies
%pip install -q -q -r requirements.txt

In [None]:
## Import helper functions at rfutils/rfutils.py
from rfutils import rfutils

## Load additional dependencies
from Bio import SeqIO
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord
import boto3
import glob
import json
from IPython.display import display
import pandas as pd
import sagemaker

# Do not truncate long cell values when DataFrames are displayed
pd.set_option("display.max_colwidth", None)

# Sessions, execution role and service clients used by the rest of the notebook
session = boto3.Session()
sm_session = sagemaker.Session()
region = session.region_name
role = sagemaker.get_execution_role()
s3 = boto3.client("s3", region_name=region)
account_id = boto3.client("sts").get_caller_identity().get("Account")

# Default SageMaker bucket; used later for uploading FASTA inputs and reading job metrics
bucket = sm_session.default_bucket()

## III. Input Protein Sequence

Download and process CASP14 sequences

In [None]:
!wget "https://predictioncenter.org/download_area/CASP14/sequences/casp14.seq.txt" -O "data/casp14.fa"
!sed '137,138d' "data/casp14.fa" > "data/casp14_dedup.fa" # Remove duplicate entry for T1085

casp14_iterator = SeqIO.parse("data/casp14_dedup.fa", "fasta")
casp14_df = pd.DataFrame(
    (
        (record.id, record.description, len(record), record.seq)
        for record in casp14_iterator
    ),
    columns=["id", "description", "length", "seq"],
).sort_values(by="length")
!rm data/casp14*

Display information about CASP14 proteins

In [None]:
# Temporarily lift the row limit so every target is listed, not a truncated preview
with pd.option_context("display.max_rows", None):
    display(casp14_df[["id", "description"]])

Plot distribution of the protein lengths

In [None]:
import matplotlib.pyplot as plt

# Histogram of residue counts across the CASP14 targets, drawn on an explicit Axes
fig, ax = plt.subplots()
ax.hist(casp14_df["length"], bins=50)
ax.set(
    title="CASP-14 Protein Length Distribution",
    xlabel="Residue count",
    ylabel="Sample count",
)
plt.show()

Get the names of the AWS Batch resources deployed in your account.

In [None]:
# Look up the deployed AWS Batch resources for this region
batch_resources = rfutils.get_rosettafold_batch_resources(region=region)

# Take the first entry listed under each resource type
(
    cpu_queue,
    gpu_queue,
    cpu_data_prep_job_def,
    cpu_predict_job_def,
    gpu_predict_job_def,
) = (
    batch_resources[resource_type][0]
    for resource_type in (
        "CPUJobQueue",
        "GPUJobQueue",
        "CPUDataPrepJobDefinition",
        "CPUPredictJobDefinition",
        "GPUPredictJobDefinition",
    )
)

# Show the full lookup result as the cell output
batch_resources

Submit analysis jobs for a subset of CASP14 proteins

In [None]:
protein_count = 84  # Change this to analyze a smaller number of CASP14 targets
gpu_length_cutoff = 400  # Sequences shorter than this are predicted on the GPU queue
job_name_list = []  # Names of submitted jobs; iterated later when collecting job metrics

# casp14_df is sorted by length, so this slice takes the shortest `protein_count` targets
for row in casp14_df[:protein_count].itertuples(index=False):
    record = SeqRecord(row.seq, id=row.id, description=row.description)
    print(f"Protein sequence for analysis is \n{record.description}")
    sequence_length = len(record.seq)
    print(f"Sequence length is {sequence_length}")

    # CPU counts are the same for every sequence; memory and the prediction
    # target (GPU vs. CPU queue and job definition) depend on sequence length.
    prep_cpu = 8
    predict_cpu = 4
    if sequence_length < gpu_length_cutoff:
        prep_mem = 32
        predict_mem = 16
        predict_gpu = True
        predict_job_definition = gpu_predict_job_def
        predict_queue = gpu_queue
    else:
        prep_mem = 64
        predict_mem = 32
        predict_gpu = False
        predict_job_definition = cpu_predict_job_def
        predict_queue = cpu_queue

    job_name = rfutils.create_job_name(record.id)
    print(f"Automatically-generated job name is: {job_name}")
    input_uri = rfutils.upload_fasta_to_s3(record, bucket, job_name)
    two_step_response = rfutils.submit_2_step_job(
        bucket=bucket,
        job_name=job_name,
        data_prep_input_file="input.fa",
        data_prep_job_definition=cpu_data_prep_job_def,
        data_prep_queue=cpu_queue,
        data_prep_cpu=prep_cpu,
        data_prep_mem=prep_mem,
        predict_job_definition=predict_job_definition,
        predict_queue=predict_queue,
        predict_cpu=predict_cpu,
        predict_mem=predict_mem,
        predict_gpu=predict_gpu,
    )
    # Record the name only once upload and submission have succeeded, so the list
    # never contains a job that was not actually submitted.
    job_name_list.append(job_name)

## IV. Check Status of Data Prep and Prediction Jobs

In [None]:
# Report on jobs in both queues; hrs_in_past=1 presumably limits this to the last hour
rfutils.get_rf_job_info(cpu_queue, gpu_queue, hrs_in_past=1)

In [None]:
# Each entry: (output column name, top-level metrics key, key within that section).
# Keeping the name and its source together avoids two parallel lists drifting apart.
metric_fields = [
    ("dataPrepJobID", "DATA_PREP", "JOB_ID"),
    ("dataPrepCPU", "DATA_PREP", "CPU"),
    ("dataPrepMEM", "DATA_PREP", "MEM"),
    ("sequenceLength", "DATA_PREP", "LENGTH"),
    ("MSACount", "DATA_PREP", "MSA_COUNT"),
    ("templateCount", "DATA_PREP", "TEMPLATE_COUNT"),
    ("MSADuration", "DATA_PREP", "MSA_DURATION"),
    ("SSDuration", "DATA_PREP", "SS_DURATION"),
    ("templateDuration", "DATA_PREP", "TEMPLATE_DURATION"),
    ("dataPrepDuration", "DATA_PREP", "TOTAL_DATA_PREP_DURATION"),
    ("predictJobId", "PREDICT", "JOB_ID"),
    ("predictCPU", "PREDICT", "CPU"),
    ("predictMEM", "PREDICT", "MEM"),
    ("predictDuration", "PREDICT", "TOTAL_PREDICT_DURATION"),
]

# One row per submitted job: the job name followed by its metrics in table order
jobs = []
for job_name in job_name_list:
    metrics = rfutils.get_rf_job_metrics(job_name, bucket, region)
    jobs.append(
        [job_name] + [metrics[section][key] for _, section, key in metric_fields]
    )

metrics_df = pd.DataFrame(
    jobs,
    columns=["jobName"] + [column for column, _, _ in metric_fields],
)

# Display a sorted view; metrics_df itself keeps submission order
metrics_df.sort_values(by=["dataPrepCPU", "dataPrepMEM", "predictCPU", "predictMEM"])

In [None]:
# Write the collected job metrics to results.csv in the working directory
metrics_df.to_csv(path_or_buf="results.csv")