Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
SPDX-License-Identifier: Apache-2.0

# Using the AWS Batch Architecture for Protein Folding

This notebook allows you to predict the structures of multiple protein sequences from the CAMEO data set released between 2022-04-08 and 2022-07-02.

## Table of Contents
0. [Install Dependencies](#0.-Install-Dependencies)
1. [Get target list](#1.-Get-target-list)
2. [Run MSA generation and folding jobs](#2.-Run-MSA-generation-and-folding-jobs) 
3. [Download results](#3.-Download-results)
4. [Visualize results](#4.-Visualize-results)
5. [Compare result to experimental structure](#5.-Compare-result-to-experimental-structure)

## 0. Install Dependencies

In [None]:
# Import required Python packages

import boto3
from datetime import datetime
from batchfold.batchfold_environment import BatchFoldEnvironment
from batchfold.jackhmmer_job import JackhmmerJob
from batchfold.openfold_job import OpenFoldJob
from batchfold.alphafold2_job import AlphaFold2Job
from batchfold.batchfold_target import BatchFoldTarget
from batchfold.mmseqs2_job import MMseqs2Job
import matplotlib.pyplot as plt
from nbhelpers import nbhelpers
import os
import json
import pickle

# One boto3 session is created here and shared by every AWS call below.
boto_session = boto3.session.Session()
batch_environment = BatchFoldEnvironment(boto_session=boto_session)

# Bucket reported by the BatchFold environment as its default.
S3_BUCKET = batch_environment.default_bucket
print(f" S3 bucket name is {S3_BUCKET}")

# S3 client used later for direct object downloads.
s3 = boto_session.client("s3")

## 1. Get targets

In [None]:
import pandas as pd

# Table of benchmarking targets; the first row of the CSV holds the column names.
targets = pd.read_csv(
    "cameo_benchmarking_targets.csv",
    header=0,
)

## 2. Run MSA and Folding Jobs

In [None]:
import requests 
from Bio import SeqIO
from io import StringIO

# Seconds to wait for the sequence server before giving up on a target.
FASTA_REQUEST_TIMEOUT_SECONDS = 60


def _timestamped_job_name(target_id, job_label):
    """Return ``<target_id>_<job_label>_<timestamp>`` for a new Batch job.

    NOTE(review): ``%s`` is not one of the strftime directives documented by
    Python; it is passed through to the platform C library. The format is kept
    unchanged so new job names stay comparable with existing ones.
    """
    return target_id + "_" + job_label + "_" + datetime.now().strftime("%Y%m%d%s")


# For every target: register its sequence(s), then submit two MSA jobs
# (MMseqs2 and Jackhmmer) and three folding jobs that wait on those MSAs.
for _, row in targets.iterrows():
    print(row["SEQ_ID"])
    target = BatchFoldTarget(target_id=row["SEQ_ID"], s3_bucket=S3_BUCKET, boto_session=boto_session)

    # Fail loudly on a hung connection or an HTTP error instead of parsing an
    # error page as FASTA and submitting jobs for a target with no sequence.
    fasta_response = requests.get(row["SEQ_URL"], timeout=FASTA_REQUEST_TIMEOUT_SECONDS)
    fasta_response.raise_for_status()
    fasta_string = fasta_response.content.decode("utf-8")

    n_records = 0
    with StringIO(fasta_string) as fasta:
        for record in SeqIO.parse(fasta, "fasta"):
            target.add_sequence(
                seq_id=row["SEQ_ID"],
                seq=str(record.seq),
                description=record.description
            )
            n_records += 1
    if n_records == 0:
        raise ValueError(f"No FASTA records found at {row['SEQ_URL']} for target {row['SEQ_ID']}")

    # --- MSA generation jobs -------------------------------------------------
    job_name = _timestamped_job_name(target.target_id, "MMseqs2Job")
    mmseqs2_job = MMseqs2Job(
        job_name = job_name,
        target_id = target.target_id,
        fasta_s3_uri = target.get_fasta_s3_uri(),
        output_s3_uri = target.get_msas_s3_uri(),
        boto_session = boto_session,
        cpu = 64,
        memory = 500
    )

    job_name = _timestamped_job_name(target.target_id, "JackhmmerJob")
    jackhmmer_job = JackhmmerJob(
        job_name = job_name,
        target_id = target.target_id,
        fasta_s3_uri = target.get_fasta_s3_uri(),
        output_s3_uri = target.get_msas_s3_uri(),
        boto_session = boto_session,
        cpu = 16,
        memory = 32
    )

    # --- Folding jobs (each reads precomputed MSAs) ---------------------------
    job_name = _timestamped_job_name(target.target_id, "Jackhmmer_OpenFoldJob")
    jackhmmer_openfold_job = OpenFoldJob(
        job_name = job_name,
        boto_session = boto_session,
        target_id = target.target_id,
        fasta_s3_uri = target.get_fasta_s3_uri(),
        msa_s3_uri = target.get_msas_s3_uri()+"/jackhmmer/",
        output_s3_uri = target.get_predictions_s3_uri() + "/" + job_name,
        use_precomputed_msas = True,
        config_preset = "finetuning_ptm",
        openfold_checkpoint_path = "openfold_params/finetuning_ptm_2.pt",
        save_outputs = True,
        cpu = 4,
        memory = 15, # Why not 16? ECS needs about 1 GB for container services
        gpu = 1
    )

    job_name = _timestamped_job_name(target.target_id, "Mmseq2_OpenFoldJob")
    mmseqs2_openfold_job = OpenFoldJob(
        job_name = job_name,
        boto_session = boto_session,
        target_id = target.target_id,
        fasta_s3_uri = target.get_fasta_s3_uri(),
        msa_s3_uri = target.get_msas_s3_uri()+"/mmseqs2/",
        output_s3_uri = target.get_predictions_s3_uri() + "/" + job_name,
        use_precomputed_msas = True,
        config_preset = "finetuning_ptm",
        openfold_checkpoint_path = "openfold_params/finetuning_ptm_2.pt",
        save_outputs = True,
        cpu = 4,
        memory = 15, # Why not 16? ECS needs about 1 GB for container services
        gpu = 1
    )

    job_name = _timestamped_job_name(target.target_id, "AlphaFold2Job")
    alphafold2_job = AlphaFold2Job(
        job_name = job_name,
        boto_session = boto_session,
        target_id = target.target_id,
        fasta_s3_uri = target.get_fasta_s3_uri(),
        msa_s3_uri = target.get_msas_s3_uri()+"/jackhmmer",
        output_s3_uri = target.get_predictions_s3_uri() + "/" + job_name,
        use_precomputed_msas = True,
        model_preset = "monomer_ptm",
        benchmark = True,
        cpu = 4,
        memory = 15, # Why not 16? ECS needs about 1 GB for container services
        gpu = 1
    )

    # MSA jobs are submitted first so the folding jobs can depend on them.
    jackhmmer_submission = batch_environment.submit_job(jackhmmer_job, job_queue_name="GravitonOnDemandJobQueue")
    mmseqs2_submission = batch_environment.submit_job(mmseqs2_job, job_queue_name="GravitonOnDemandJobQueue")

    jackhmmer_openfold_submission = batch_environment.submit_job(jackhmmer_openfold_job, job_queue_name="G4dnJobQueue", depends_on=[jackhmmer_submission])
    mmseqs2_openfold_submission = batch_environment.submit_job(mmseqs2_openfold_job, job_queue_name="G4dnJobQueue", depends_on=[mmseqs2_submission])
    alphafold2_submission = batch_environment.submit_job(alphafold2_job, job_queue_name="G4dnJobQueue", depends_on=[jackhmmer_submission])



In [None]:

    


# Submit one extra OpenFold job, with a template cut-off date, for the `target`
# currently in scope.
#
# The statements below used to sit indented under a commented-out `for` loop,
# which made the cell fail with IndentationError; they are now top-level.
# NOTE(review): `target` is whatever an earlier cell last assigned, and the job
# is submitted without `depends_on`, so the Jackhmmer MSAs for that target must
# already exist in S3 - confirm this is the intended target before running.
# The AlphaFold2 variant that was commented out here has been dropped; the
# AlphaFold2 job is submitted by the main submission loop.
openfold_job_name = target.target_id + "_OpenFoldJob_" + datetime.now().strftime("%Y%m%d%s")
openfold_job = OpenFoldJob(
    job_name = openfold_job_name,
    boto_session = boto_session,
    target_id = target.target_id,
    fasta_s3_uri = target.get_fasta_s3_uri(),
    msa_s3_uri = target.get_msas_s3_uri()+"/jackhmmer/",
    output_s3_uri = target.get_predictions_s3_uri() + "/" + openfold_job_name,
    max_template_date = "2022-01-01",
    use_precomputed_msas = True,
    config_preset = "finetuning_ptm",
    openfold_checkpoint_path = "openfold_params/finetuning_ptm_2.pt",
    save_outputs = True,
    cpu = 4,
    memory = 15, # Not 16: ECS needs about 1 GB for container services (matches the other GPU jobs)
    gpu = 1
)

openfold_submission = batch_environment.submit_job(openfold_job, job_queue_name="G4dnJobQueue")


Once the jobs are finished, download the results

## 3. Download results

In [None]:
# Handle for the single target whose results are downloaded next.
target = BatchFoldTarget(
    target_id="7EQB_A",
    s3_bucket=S3_BUCKET,
    boto_session=boto_session,
)

In [None]:
# Download this target's files into the local "data" folder.
target.download_all(
    local_path="data",
)

## 4. Compare result to experimental structure

### Install TMscore

In [None]:
%%bash
# Fetch the TM-score source (skipped when the file is already present) and
# compile it into ./TMscore in the notebook's working directory.
wget --quiet --no-clobber https://zhanggroup.org/TM-score/TMscore.cpp
g++ -O3 -ffast-math -lm -o TMscore TMscore.cpp

### Get Results

In [None]:
import re

# Collect timings and GDT scores for every target into `results`
# (target_id -> dict of metrics), then build `results_df` from it.
results = {}

# Iterating over a DataFrame yields its column labels, not its rows, so the
# target IDs are read from the SEQ_ID column explicitly.
for seq_id in targets["SEQ_ID"]:
    seq_id = str(seq_id)
    # NOTE(review): IDs that already carry a chain suffix (e.g. "7EQB_A") are
    # used as-is; bare IDs get "_A" appended as before. Confirm which form the
    # SEQ_ID column actually holds.
    target_id = seq_id if "_" in seq_id else seq_id + "_A"
    print(target_id)
    try:
        target_results = {}
        target = BatchFoldTarget(target_id=target_id, s3_bucket=S3_BUCKET, boto_session=boto_session)

        # Create the same notebook-relative folder that the downloads below
        # write to (previously a hard-coded absolute path on one machine).
        local_dir = f"data/{target.target_id}"
        os.makedirs(local_dir, exist_ok=True)

        of_pdb = f"{local_dir}/{target.target_id}_finetuning_ptm_relaxed.pdb"
        af_pdb = f"{local_dir}/ranked_0.pdb"

        # OpenFold: predicted structure (.pdb) and timings from the latest job.
        # The job name is looked up once so both reads refer to the same job.
        openfold_job_name = target.get_last_job_name(job_type='OpenFold')
        s3.download_file(
            S3_BUCKET,
            f"{target.target_id}/predictions/{openfold_job_name}/{target.target_id}_finetuning_ptm_relaxed.pdb",
            of_pdb,
        )
        of_timings = nbhelpers.get_openfold_timings_for_job_name(batch_environment, openfold_job_name)
        target_results.update(of_timings)

        # AlphaFold2: top-ranked structure (.pdb) and timings.json from the latest job.
        alphafold2_job_name = target.get_last_job_name(job_type='AlphaFold2')
        s3.download_file(
            S3_BUCKET,
            f"{target.target_id}/predictions/{alphafold2_job_name}/ranked_0.pdb",
            af_pdb,
        )
        response = s3.get_object(
            Bucket=S3_BUCKET,
            Key=f"{target.target_id}/predictions/{alphafold2_job_name}/timings.json",
        )
        af_timings = json.loads(response['Body'].read())
        target_results.update(af_timings)

        # Experimental structure: the ID without its trailing "_<chain>" part.
        base_target_id = target_id.rsplit("_", 1)[0]
        nbhelpers.download_pdb_file(base_target_id, local_dir, file_format="pdb")
        experimental_pdb = f"{local_dir}/{base_target_id}.pdb"

        # Score both predictions against the experimental structure.
        of_score_results = nbhelpers.run_tmscore(of_pdb, experimental_pdb)
        print(f"OpenFold TS_GDT Score: {of_score_results['gdt']}")
        target_results.update({"openfold_gdt": of_score_results['gdt']})

        af_score_results = nbhelpers.run_tmscore(af_pdb, experimental_pdb)
        print(f"AlphaFold2 TS_GDT Score: {af_score_results['gdt']}")
        target_results.update({"alphafold2_gdt": af_score_results['gdt']})

        results[target_id] = target_results
    except Exception as err:
        # Best effort: keep going with the remaining targets, but say which one
        # was skipped and why (a bare `except:` would also hide Ctrl-C).
        print(f"Skipping {target_id}: {err!r}")
        continue

results_df = pd.DataFrame.from_dict(results, orient="index")



In [None]:
# One row per target ID, one column per collected metric.
results_df = pd.DataFrame.from_dict(
    results,
    orient="index",
)


In [None]:
# Persist the collected results next to the notebook.
results_csv_path = "220809_B_initial_results.csv"
results_df.to_csv(results_csv_path)

In [None]:
# Display the per-target results table as the cell output.
results_df

-----

Plot GDT scores vs experimental structures in PDB

In [None]:
import pandas as pd

# Load a previously saved results table for plotting.
# The path is relative to the notebook's working directory rather than an
# absolute path inside one user's home directory, so it works on other machines.
# NOTE(review): `results` is rebound here from the dict built earlier to a
# DataFrame, and this reads the 220808 file, not the CSV written above.
results = pd.read_csv("220808_initial_results.csv")

In [None]:
import matplotlib.pyplot as plt
import numpy as np

# Matplotlib 3.6 renamed the bundled seaborn styles to "seaborn-v0_8*" and later
# releases removed the old names, so use whichever name this install provides.
plt.style.use("seaborn-v0_8" if "seaborn-v0_8" in plt.style.available else "seaborn")

fig, ax = plt.subplots(figsize=(20,10))

# One point per target and method: GDT score against sequence length.
x = results["seq_length"]
y1 = results["openfold_gdt"]
y2 = results["alphafold2_gdt"]
ax.scatter(x, y1, label="OpenFold")
ax.scatter(x, y2, label="AlphaFold2")

# Grey vertical connector between the two scores of each target: each column
# of these 2 x N arrays is one line segment.
X_line_coords = np.array([x, x])
Y_line_coords = np.array([y1, y2])
ax.plot(X_line_coords, Y_line_coords, color="gray")

# Horizontal reference lines at GDT_TS 0.9 and 0.7.
ax.axhline(0.9, color="black", linestyle="--")
ax.axhline(0.7, color="red", linestyle="--")
ax.legend(frameon=True, facecolor="white")
ax.set_xlabel("Sequence Length (Residues)")
ax.set_ylabel("GDT_TS")
plt.title("OpenFold vs AlphaFold2 Accuracy on CAMEO Targets", fontsize=18)
plt.show()

In [None]:
# Per-model AlphaFold2 time = feature processing + predict/compile + relaxation.
for model_id in range(1, 6):
    features_time = results[f"alphafold_time_process_features_model_{model_id}_ptm_pred_0"]
    predict_time = results[f"alphafold_time_predict_and_compile_model_{model_id}_ptm_pred_0"]
    relax_time = results[f"alphafold_time_relax_model_{model_id}_ptm_pred_0"]
    results[f"alphafold_time_{model_id}"] = features_time + predict_time + relax_time

# Totals: all five AlphaFold2 models, and OpenFold inference plus relaxation.
results["alphafold_time"] = sum(results[f"alphafold_time_{model_id}"] for model_id in range(1, 6))
results["openfold_time"] = results["openfold_inference_time"] + results["openfold_relaxation"]

In [None]:
import matplotlib.pyplot as plt
import numpy as np

# Matplotlib 3.6 renamed the bundled seaborn styles to "seaborn-v0_8*" and later
# releases removed the old names, so use whichever name this install provides.
plt.style.use("seaborn-v0_8" if "seaborn-v0_8" in plt.style.available else "seaborn")

fig, ax = plt.subplots(figsize=(20,10))

x = results["seq_length"]
width = 5

# AlphaFold2: one stacked bar per target, one segment per model (1-5),
# drawn just right of the target's sequence length.
stack_bottom = 0
for model_id in range(1, 6):
    model_time = results[f"alphafold_time_{model_id}"]
    ax.bar(x + width/2, model_time, width, bottom=stack_bottom, label=f"AlphaFold2 Model {model_id}")
    stack_bottom = stack_bottom + model_time

# OpenFold: a single bar per target, drawn just left of the sequence length.
ax.bar(x - width/2, results["openfold_time"], width, label="OpenFold")

ax.legend(frameon=True, facecolor="white")
ax.set_xlabel("Sequence Length (Residues)")
ax.set_ylabel("Time (sec)")
plt.title("OpenFold vs AlphaFold2 Run Times on CAMEO Targets (By Model)", fontsize=18)
plt.show()

In [None]:
import matplotlib.pyplot as plt
import numpy as np

# Matplotlib 3.6 renamed the bundled seaborn styles to "seaborn-v0_8*" and later
# releases removed the old names, so use whichever name this install provides.
plt.style.use("seaborn-v0_8" if "seaborn-v0_8" in plt.style.available else "seaborn")

fig, ax = plt.subplots(figsize=(20,10))

# Hourly price used for the estimate; the plot title attributes it to
# g4dn.xlarge on demand. Run times are in seconds, hence the division by 3600.
ec2_hourly_rate = 0.526
cost_per_second = ec2_hourly_rate / 3600

x = results["seq_length"]
width = 5

# AlphaFold2: one stacked bar per target, one segment per model (1-5),
# drawn just right of the target's sequence length.
stack_bottom = 0
for model_id in range(1, 6):
    model_cost = results[f"alphafold_time_{model_id}"] * cost_per_second
    ax.bar(x + width/2, model_cost, width, bottom=stack_bottom, label=f"AlphaFold2 Model {model_id}")
    stack_bottom = stack_bottom + model_cost

# OpenFold: a single bar per target, drawn just left of the sequence length.
ax.bar(x - width/2, results["openfold_time"] * cost_per_second, width, label="OpenFold")

ax.legend(frameon=True, facecolor="white")
ax.set_xlabel("Sequence Length (Residues)")
ax.set_ylabel("Run Cost ($)")
plt.title("OpenFold vs AlphaFold2 Run Costs on CAMEO Targets (g4dn.xlarge on demand)", fontsize=18)
plt.show()

In [None]:
# Score one target's predictions against its experimental structure.
target = BatchFoldTarget(
    target_id="7OA7_A",
    s3_bucket=S3_BUCKET,
    boto_session=boto_session,
)
base_target_id = "7OA7"

# Local files expected under data/<target_id>/.
of_pdb = f"data/{target.target_id}/{target.target_id}_finetuning_ptm_relaxed.pdb"
af_pdb = f"data/{target.target_id}/ranked_0.pdb"
experimental_pdb = f"data/{target.target_id}/{base_target_id}.pdb"
print(f"OpenFold PDB is {of_pdb}")
print(f"AlphaFold PDB is {af_pdb}")
print(f"Experimental PDB is {experimental_pdb}")

# OpenFold prediction vs. experiment.
of_score_results = nbhelpers.run_tmscore(of_pdb, experimental_pdb)
print(f"OpenFold results: {of_score_results}")

# AlphaFold2 prediction vs. experiment.
af_score_results = nbhelpers.run_tmscore(af_pdb, experimental_pdb)
print(f"AlphaFold results: {af_score_results}")

In [None]:
%%bash
# Compare the OpenFold prediction with the experimental structure.
# The binary was compiled into the working directory, so it is invoked as
# ./TMscore, and both the model and the reference structure are passed.
./TMscore -seq "data/7OA7_A/7OA7_A_finetuning_ptm_relaxed.pdb" "data/7OA7_A/7OA7.pdb"