# AWS-RoseTTAFold: Bulk Job Analysis

## I. Introduction

This notebook demonstrates how to analyze multiple proteins simultaneously, in this case a subset of the CASP14 target set.

## II. Environment setup

In [None]:
## Install dependencies
!pip install -q -q -r requirements.txt

## Import helper functions from rfutils/rfutils.py
from rfutils import rfutils

## Load additional dependencies
from Bio import SeqIO
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord
import boto3
import glob
import json
import pandas as pd
import sagemaker

# Do not truncate column contents when DataFrames are displayed. The fully
# qualified option name is used because pandas warns that abbreviated option
# names can break if similarly named options are added in a later release.
pd.set_option("display.max_colwidth", None)

# Get service clients
session = boto3.session.Session()
sm_session = sagemaker.session.Session()
region = session.region_name
role = sagemaker.get_execution_role()
s3 = boto3.client("s3", region_name=region)
account_id = boto3.client("sts").get_caller_identity().get("Account")

# Default bucket of the SageMaker session; later cells pass it to the rfutils
# helpers for uploading inputs and displaying results.
bucket = sm_session.default_bucket()

## III. Input Protein Sequence

In [None]:
# Parse the CASP14 FASTA file and index the records by their record id.
casp14_records = SeqIO.parse("data/casp14.fa", "fasta")
casp14_list = SeqIO.to_dict(casp14_records)

In [None]:
# Number of CASP14 targets to submit in this run.
protein_count = 5

# Keep the first `protein_count` records in the dictionary's iteration order.
# Slicing the item list once avoids rebuilding the full key list for every
# record, which the previous membership test did.
casp14_subset = dict(list(casp14_list.items())[:protein_count])

# List the selected targets so the reader can confirm the selection.
for record in casp14_subset.values():
    print(record.description)

In [None]:
# Settings shared by every submission: job definitions, queues and the
# CPU / memory / GPU values passed for the data-prep and prediction steps.
batch_job_settings = dict(
    data_prep_input_file="input.fa",
    data_prep_job_definition="AWS-RosettaFold-CPU-Job-Definition",
    data_prep_queue="AWS-RF-cpu-job-queue",
    data_prep_cpu=16,
    data_prep_mem=60,
    predict_job_definition="AWS-RosettaFold-GPU-Job-Definition",
    predict_queue="AWS-RF-gpu-job-queue",
    predict_cpu=32,
    predict_mem=90,
    predict_gpu=2,
)

# For each selected record: create a job name from the record id, upload the
# record with rfutils.upload_fasta_to_s3, then submit the two-step job.
# NOTE: `input_uri` and `two_step_response` are overwritten on every
# iteration, so only the values for the last record remain after the loop.
for record in casp14_subset.values():
    print(f"Protein sequence for analysis is \n{record}")
    job_name = rfutils.create_job_name(record.id)
    print(f"Automatically-generated job name is: {job_name}")
    input_uri = rfutils.upload_fasta_to_s3(record, bucket, job_name)
    two_step_response = rfutils.submit_2_step_job(
        bucket=bucket, job_name=job_name, **batch_job_settings
    )

## IV. Check Status of Data Prep and Prediction Jobs

In [None]:
# Query job information for the CPU (data prep) and GPU (prediction) queues,
# passing hrs_in_past=1. Left as the final expression so the cell displays
# whatever the helper returns.
rfutils.get_rf_job_info(
    cpu_queue="AWS-RF-cpu-job-queue",
    gpu_queue="AWS-RF-gpu-job-queue",
    hrs_in_past=1,
)

## V. Display the Results of Historical Runs

In [None]:
# CASP14 target T1024: LmrP, 408 residues (no organism given in the
# original description).
# NOTE(review): the MSA and the structure are read from two different job IDs
# here, whereas the other historical-run cells use a single ID for both.
# Confirm this is intentional.
t1024_msa_job_id = "d1076b6c-e844-4fda-a18a-389583e3bcf1"
t1024_structure_job_id = "9f997ec0-0ff8-4e86-9713-857f12bc2e6c"

rfutils.display_msa(t1024_msa_job_id, bucket)
rfutils.display_structure(t1024_structure_job_id, bucket)

In [None]:
# CASP14 target T1036s1: Monoclonal antibody 93k, Varicella-zoster virus,
# strain pOka, subunit 1, 622 residues.
# The same job ID is used for both the MSA and the structure display.
t1036s1_job_id = "cab38277-78f0-4513-bef1-e45f20aaf314"

rfutils.display_msa(t1036s1_job_id, bucket)
rfutils.display_structure(t1036s1_job_id, bucket)

In [None]:
# CASP14 target T1025: AtmM, Actinomadura melliaura, 268 residues.
# The same job ID is used for both the MSA and the structure display.
t1025_job_id = "d3f7e4e3-ae84-4ef2-a7aa-d57215a8a684"

rfutils.display_msa(t1025_job_id, bucket)
rfutils.display_structure(t1025_job_id, bucket)