# I. PPI Data Preprocessing

In this notebook, we download data from BioGRID and UniProt and add it to an Amazon Neptune graph database.

This notebook should be uploaded and run on a SageMaker Notebook instance associated with an Amazon Neptune cluster.

---
## 1. Setup

In [None]:
%pip install -U pandas numpy h5py graph-notebook transformers==4.37.2 accelerate bitsandbytes

Clear graph database

In [None]:
%db_reset --generate-token

In [None]:
%db_reset --token <REPLACE WITH TOKEN RETURNED BY GENERATE-TOKEN CALL>

Verify that no nodes exist after reset job has finished

In [None]:
%%gremlin

g.V().count()

Define S3 bucket

In [None]:
S3_URI = "s3://<REPLACE WITH YOUR S3 URI>"
# Remove every trailing slash, not just the last one.
S3_URI = S3_URI.rstrip("/")

---
## 2. Get BioGrid Data (Edge Features)

In [None]:
import requests
import tqdm
import os


def download(url: str, filename: str) -> str:
    """Stream ``url`` into ``filename`` while showing a progress bar.

    Args:
        url: Address to fetch with an HTTP GET request.
        filename: Destination path. Missing parent directories are created.

    Returns:
        The ``filename`` that was written.

    Raises:
        requests.HTTPError: If the server responds with an error status.
    """
    print(f"Downloading {url} to {filename}")
    output_dir = os.path.dirname(filename)
    # dirname is "" for a bare file name, and os.makedirs("") would raise.
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    with requests.get(url, stream=True) as r:
        r.raise_for_status()
        # A missing content-length header yields 0; pass None so tqdm shows
        # a plain byte counter instead of a 0-byte total.
        total = int(r.headers.get("content-length", 0)) or None

        tqdm_params = {
            "desc": url,
            "total": total,
            "miniters": 1,
            "unit": "B",
            "unit_scale": True,
            "unit_divisor": 1024,
        }
        # The target is opened only after the status check, so a failed
        # request does not create or truncate the destination file.
        with open(filename, "wb") as f, tqdm.tqdm(**tqdm_params) as pb:
            for chunk in r.iter_content(chunk_size=8192):
                f.write(chunk)
                pb.update(len(chunk))
    return filename

In [None]:
import pandas as pd
import os
import zipfile

DATA_DIR = "data"
BIOGRID_DATA_URI = "https://downloads.thebiogrid.org/Download/BioGRID/Latest-Release/BIOGRID-MV-Physical-LATEST.tab3.zip"

# Download the archive, unpack it into DATA_DIR and discard the zip file.
biogrid_zip = os.path.join(DATA_DIR, "biogrid_mv.zip")
download(BIOGRID_DATA_URI, biogrid_zip)

with zipfile.ZipFile(biogrid_zip, "r") as zip_ref:
    zip_ref.extractall(DATA_DIR)
os.remove(biogrid_zip)

# Look in DATA_DIR (not a hard-coded "data") and sort the listing so the
# chosen file does not depend on the order returned by os.listdir.
biogrid_candidates = sorted(
    filename for filename in os.listdir(DATA_DIR) if filename.startswith("BIOGRID")
)
if not biogrid_candidates:
    raise FileNotFoundError(f"No extracted BIOGRID file found in {DATA_DIR!r}")
BIOGRID_FILE = os.path.join(DATA_DIR, biogrid_candidates[0])

SWISSPROT_A = "SWISS-PROT Accessions Interactor A"
SWISSPROT_B = "SWISS-PROT Accessions Interactor B"

bg = pd.read_csv(
    BIOGRID_FILE,
    sep="\t",
    usecols=[
        "#BioGRID Interaction ID",
        "BioGRID ID Interactor A",
        "BioGRID ID Interactor B",
        "Official Symbol Interactor A",
        "Organism ID Interactor A",
        "Official Symbol Interactor B",
        "Organism ID Interactor B",
        "Throughput",
        "Experimental System",
        SWISSPROT_A,
        SWISSPROT_B,
    ],
).rename(columns={"#BioGRID Interaction ID": "BioGRID Interaction ID"})
print(f"All Biogrid MV records: {bg.shape}")

# Remove records with missing SWISS-PROT IDs ("-" marks a missing accession).
# .copy() gives an independent frame so the column assignments below do not
# write into a slice of the original.
bg = bg[(bg[SWISSPROT_A] != "-") & (bg[SWISSPROT_B] != "-")].copy()
print(f"Records with two SWISS-PROT IDs: {bg.shape}")

# For cases where there are multiple "|"-separated SWISS-PROT IDs, take the first one
for accession_column in (SWISSPROT_A, SWISSPROT_B):
    bg[accession_column] = bg[accession_column].str.split("|").str[0]

# Remove records where the protein interacts with itself
bg = bg[bg["BioGRID ID Interactor A"] != bg["BioGRID ID Interactor B"]]
print(f"Records with two different proteins: {bg.shape}")

# Remove duplicate entries
bg = bg.drop_duplicates()
print(f"Unique rows: {bg.shape}")


bg = bg.sort_values(
    by=[
        "Official Symbol Interactor A",
        "Official Symbol Interactor B",
        "Throughput",
        "Experimental System",
    ]
)
# The extracted file is no longer needed once it has been read into memory.
os.remove(BIOGRID_FILE)

bg.head()

---
## 3. Get UniProtKB Data (Vertex Features)

### 3.1. Query UniProt

Create list of unique UniProtKB IDs for both interactors

In [None]:
import numpy as np

# Collect the distinct accessions of both interactors. Missing values are
# dropped with dropna(): pandas stores them as NaN, which a `None in ...`
# check does not catch and which would make sorting fail on mixed types.
uniprot_ids = sorted(
    pd.concat(
        [
            bg["SWISS-PROT Accessions Interactor A"],
            bg["SWISS-PROT Accessions Interactor B"],
        ]
    )
    .dropna()
    .unique()
)
print(len(uniprot_ids))
print(uniprot_ids[:100])

Fetch data using UniProt API

In [None]:
import pandas as pd
import uniprot

# Submit all accessions as one ID-mapping job and fetch the mapped entries.
job_id = uniprot.submit_id_mapping(
    from_db="UniProtKB_AC-ID", to_db="UniProtKB", ids=uniprot_ids
)
# Fail loudly when the job is not ready: otherwise `results` stays undefined
# (or keeps a stale value from an earlier run) and the next cell breaks later.
if not uniprot.check_id_mapping_results_ready(job_id):
    raise RuntimeError(f"UniProt ID mapping job {job_id} did not become ready")
link = uniprot.get_id_mapping_results_link(job_id)
results = uniprot.get_id_mapping_results_search(link, sequence_only=False)

### 3.2. Extract amino acid sequences and other UniProt metadata

In [None]:
# Keyword categories to keep, compared in lower case.
# NOTE(review): UniProt appears to spell the category "Molecular function"
# (lower-case "f"), which the previous exact match on "Molecular Function"
# would never select - confirm against a live response.
_KEYWORD_CATEGORIES = {"cellular component", "domain", "molecular function", "ptm"}


def _parse_uniprot_entry(entry: dict) -> dict:
    """Flatten one UniProtKB entry (the ``to`` part of a mapping result).

    Missing optional sections produce ``None`` / empty strings instead of
    raising ``AttributeError`` or ``TypeError``.
    """
    description = (entry.get("proteinDescription") or {}).get("recommendedName") or {}
    organism = entry.get("organism") or {}
    sequence = entry.get("sequence") or {}

    # The first property value of every InterPro cross-reference.
    families = [
        xref["properties"][0]["value"]
        for xref in entry.get("uniProtKBCrossReferences") or []
        if xref.get("database") == "InterPro" and xref.get("properties")
    ]
    keywords = [
        f"{keyword.get('category')}:{keyword.get('name')}"
        for keyword in entry.get("keywords") or []
        if (keyword.get("category") or "").lower() in _KEYWORD_CATEGORIES
    ]

    return {
        "PrimaryAccession": entry.get("primaryAccession"),
        "Description": (description.get("fullName") or {}).get("value"),
        "Organism": organism.get("scientificName"),
        "TaxonId": organism.get("taxonId"),
        "Sequence": sequence.get("value"),
        "Length": sequence.get("length"),
        "MolWeight": sequence.get("molWeight"),
        "Families": ";".join(families),
        "Keywords": ";".join(keywords),
    }


# One row per active UniProt entry; results without a "to" entry are skipped.
seqs = pd.DataFrame(
    [
        _parse_uniprot_entry(mapping["to"])
        for mapping in results.get("results")
        if mapping.get("to") and mapping["to"].get("entryType") != "Inactive"
    ]
).drop_duplicates()
print(f"UniProt records: {seqs.shape}")
seqs.head()

---
## 4. Add Prot-T5 Embeddings

### 4.1. Download SWISS-PROT Prot-T5 embeddings from UniProt
NOTE: This file is around 1.3 GB on disk

In [None]:
download(
    "https://ftp.uniprot.org/pub/databases/uniprot/current_release/knowledgebase/embeddings/uniprot_sprot/per-protein.h5",
    "data/per-protein.h5",
)

### 4.2. Join embeddings to sequence object

In [None]:
import h5py
import numpy as np

with h5py.File("data/per-protein.h5", "r") as f:

    def _create_embedding(id, length=1024, dtype="float16"):
        """Read the embedding stored under ``id`` from the open HDF5 file.

        Returns a dict with the accession and the embedding serialized as a
        comma-separated string, or ``None`` when no usable dataset exists
        for ``id``.
        """
        arr = np.zeros((length,), dtype=dtype)
        try:
            f[id].read_direct(arr)
        except (KeyError, TypeError):
            # KeyError: the accession is not in the file. TypeError: the key
            # is not a string or the stored data does not fit `arr`. Such
            # proteins are skipped; any other error is allowed to surface.
            return None
        return {
            "PrimaryAccession": id,
            "prot_t5_embeddings": ",".join(map(str, arr.tolist())),
        }

    _embedding_records = [
        record
        for record in map(_create_embedding, seqs["PrimaryAccession"])
        if record is not None
    ]

# Explicit columns keep the frame mergeable even when no embedding was found.
prot_t5_embeddings = pd.DataFrame(
    _embedding_records, columns=["PrimaryAccession", "prot_t5_embeddings"]
)
print(f"Prot-T5 embeddings found for {len(prot_t5_embeddings)} of {len(seqs)} proteins")

In [None]:
# Persist the embeddings to CSV so they can be reloaded from disk later.
prot_t5_embeddings.to_csv("data/prot_t5_embeddings.csv", index=False)
display(prot_t5_embeddings.head())

In [None]:
# Uncomment this if you've already calculated the embeddings and just
# want to load them from a file.

# prot_t5_embeddings = pd.read_csv("data/prot_t5_embeddings.csv")

In [None]:
# Keep only the proteins that have a Prot-T5 embedding.
seqs = seqs.merge(prot_t5_embeddings, on="PrimaryAccession", how="inner")


In [None]:
seqs

## 5. Add ESM-2 Embeddings

### 5.1. Calculate Embeddings

In [None]:
from transformers import AutoTokenizer, AutoModel, BitsAndBytesConfig
import torch
import numpy as np


def generate_embeddings(
    text, model_name="facebook/esm2_t36_3B_UR50D", batch_size=24
):
    """Compute one mean-pooled embedding per input sequence.

    The model is loaded with 4-bit quantization and run batch by batch.
    For every sequence the hidden states of its real tokens are averaged;
    the first and last attended positions (<cls> and <eos>) and all padding
    positions are excluded, so the result does not depend on how long the
    other sequences in the same batch are.

    Args:
        text: List of sequences to embed.
        model_name: Hugging Face model identifier.
        batch_size: Number of sequences per forward pass.

    Returns:
        A numpy array with one row per input sequence.

    Raises:
        ValueError: If ``text`` is empty.
    """
    if len(text) == 0:
        raise ValueError("`text` must contain at least one sequence")

    bnb_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_use_double_quant=True,
        bnb_4bit_quant_type="nf4",
        bnb_4bit_compute_dtype=torch.bfloat16,
    )

    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModel.from_pretrained(
        model_name, device_map="auto", quantization_config=bnb_config
    )

    batches = [text[i : i + batch_size] for i in range(0, len(text), batch_size)]
    # Count the batches that are actually run (the last one may be partial).
    total_batches = len(batches)
    pooled = []
    for n, batch in enumerate(batches):
        print(f"Batch {n+1} of {total_batches}")
        inputs = tokenizer(
            batch, return_tensors="pt", truncation=True, padding=True, max_length=1024
        )
        with torch.no_grad():
            predictions = model(**inputs)
        hidden = predictions.last_hidden_state

        # Build a per-token mask from the attention mask: drop padding, then
        # clear the first (<cls>) and last (<eos>) attended position per row.
        keep = inputs["attention_mask"].to(hidden.device).clone()
        lengths = keep.sum(dim=1)
        first = keep.argmax(dim=1)  # index of the first attended token
        last = first + lengths - 1
        rows = torch.arange(keep.size(0), device=keep.device)
        keep[rows, first] = 0
        keep[rows, last] = 0
        keep = keep.unsqueeze(-1)

        # Average in float32; clamp avoids a division by zero for sequences
        # that contain no residue tokens.
        summed = (hidden.float() * keep).sum(dim=1)
        counts = keep.sum(dim=1).clamp(min=1)
        mean = summed / counts

        # Hand back the model's own dtype; bfloat16 is widened because it
        # cannot be converted to numpy.
        out_dtype = torch.float32 if hidden.dtype == torch.bfloat16 else hidden.dtype
        pooled.append(mean.to(out_dtype).cpu().numpy())
    output = np.vstack(pooled)
    print(f"Output shape: {output.shape}")
    return output

In [None]:
# Release cached GPU memory before loading the model.
torch.cuda.empty_cache()

# Embed every protein sequence, in the row order of `seqs`.
sequence_list = list(seqs["Sequence"])
esm_2_embeddings = generate_embeddings(
    sequence_list, model_name="facebook/esm2_t36_3B_UR50D", batch_size=24
)
esm_2_embeddings.shape

In [None]:
# Serialize each embedding row as one comma-separated string.
esm_2_embeddings = [
    ",".join(str(value) for value in row.tolist()) for row in esm_2_embeddings
]

In [None]:
# Pair every accession with its serialized ESM-2 embedding (same row order)
# and save the table to CSV.
esm = seqs[["PrimaryAccession"]].assign(esm2=esm_2_embeddings)
esm.to_csv("data/esm_2_embeddings_3B.csv", index=False)
display(esm.head())

### 5.2. Join embeddings to sequence object

In [None]:
# Uncomment this if you've already calculated the embeddings and just
# want to load them from a file.

# esm = pd.read_csv("data/esm_2_embeddings_3B.csv")

In [None]:
# Attach the ESM-2 embeddings, then give the column its final name.
seqs_with_esm = seqs.merge(esm, on="PrimaryAccession", how="inner")
seqs = seqs_with_esm.rename(columns={"esm2": "esm_2_embeddings"})

In [None]:
seqs

---
## 6. Review data

### 6.1. Node Features

In [None]:
print(seqs.shape)
display(seqs.head())

### 6.2. Edge Features

In [None]:
print(bg.shape)
display(bg.head())

---
## 7.  Create Neptune Bulk Loader input files

https://docs.aws.amazon.com/neptune/latest/userguide/bulk-load.html

In [None]:
# Directory that collects all CSV files written for the Neptune bulk loader.
bulk_load_dir = os.path.join(DATA_DIR, "bulk_loader")
# exist_ok makes the cell safe to re-run without a separate existence check.
os.makedirs(bulk_load_dir, exist_ok=True)

### 7.1. Create Vertex Files

 Protein vertices

In [None]:
import pandas as pd

# Source column in `seqs` -> bulk-loader column header, in output order.
_protein_column_map = {
    "PrimaryAccession": "~id",
    "Description": "description:String",
    "Sequence": "sequence:String",
    "Length": "length:Int",
    "MolWeight": "molWeight:Int",
    "Keywords": "keywords:String[]",
    "prot_t5_embeddings": "protT5:String",
    "esm_2_embeddings": "esm2:String",
}

protein_vertices = seqs[list(_protein_column_map)].rename(columns=_protein_column_map)
# The constant vertex label goes directly after the id column.
protein_vertices.insert(1, "~label", "protein")
protein_vertices = protein_vertices.drop_duplicates()
protein_vertices = protein_vertices.sort_values(by=["~id"]).reset_index(drop=True)

protein_vertices_path = os.path.join(bulk_load_dir, "protein_vertices.csv")
protein_vertices.to_csv(protein_vertices_path, index=False)
display(protein_vertices.head())

Organism vertices

In [None]:
import pandas as pd

# One vertex per distinct (taxon id, organism name) pair.
organism_vertices = seqs[["TaxonId", "Organism"]].rename(
    columns={"TaxonId": "~id", "Organism": "name:String"}
)
# The constant vertex label goes between the id and the name column.
organism_vertices.insert(1, "~label", "organism")
organism_vertices = organism_vertices.drop_duplicates()
organism_vertices = organism_vertices.sort_values(by=["~id"]).reset_index(drop=True)

organism_vertices_path = os.path.join(bulk_load_dir, "organism_vertices.csv")
organism_vertices.to_csv(organism_vertices_path, index=False)

display(organism_vertices.head())

Family vertices

In [None]:
# One row per (protein, family) pair. Built as a chain so nothing is assigned
# on a slice of `seqs`. Proteins without any family produce an empty name
# after the split; those rows are dropped here so they cannot turn into
# edges that point at an empty vertex id.
protein_family = (
    seqs[["PrimaryAccession", "Families"]]
    .assign(Families=lambda frame: frame["Families"].str.split(";"))
    .explode("Families")
    .loc[
        lambda frame: frame["Families"].fillna("") != "",
        ["PrimaryAccession", "Families"],
    ]
    .drop_duplicates()
)

In [None]:
import pandas as pd

# The family name serves both as vertex id and as the name property.
_family_ids = protein_family[["Families"]].rename(columns={"Families": "~id"})
family_vertices = (
    _family_ids.assign(
        **{"~label": "family", "name:String": lambda frame: frame["~id"]}
    )
    .drop_duplicates()
    .sort_values(by=["~id"])
    .reset_index(drop=True)
    # Drop the placeholder row that stands for "no family".
    .loc[lambda frame: frame["~id"] != ""]
)

family_vertices_path = os.path.join(bulk_load_dir, "family_vertices.csv")
family_vertices.to_csv(family_vertices_path, index=False)

display(family_vertices.head())

### 7.2. Create edge files

Protein-organism edges

In [None]:
import pandas as pd

# Edge from each protein to the organism (taxon id) it is found in.
protein_organism_edges = seqs[["PrimaryAccession", "TaxonId"]].rename(
    columns={"PrimaryAccession": "~from", "TaxonId": "~to"}
)
# The edge id is "<accession>-found_in-<taxon id>" and comes first.
protein_organism_edges.insert(
    0,
    "~id",
    protein_organism_edges["~from"]
    + "-found_in-"
    + protein_organism_edges["~to"].astype(str),
)
protein_organism_edges["~label"] = "found_in"
protein_organism_edges = protein_organism_edges.drop_duplicates()
protein_organism_edges = protein_organism_edges.sort_values(by=["~id"]).reset_index(
    drop=True
)

protein_organism_edges_path = os.path.join(bulk_load_dir, "protein_organism_edges.csv")
protein_organism_edges.to_csv(protein_organism_edges_path, index=False)

display(protein_organism_edges)

Protein-Family edges

In [None]:
import pandas as pd

# Edge from each protein to every family it is a member of.
protein_family_edges = protein_family[["PrimaryAccession", "Families"]].rename(
    columns={"PrimaryAccession": "~from", "Families": "~to"}
)
# The edge id is "<accession>-member_of-<family>" and comes first.
protein_family_edges.insert(
    0,
    "~id",
    protein_family_edges["~from"]
    + "-member_of-"
    + protein_family_edges["~to"].astype(str),
)
protein_family_edges["~label"] = "member_of"
protein_family_edges = protein_family_edges.drop_duplicates()
protein_family_edges = protein_family_edges.sort_values(by=["~id"]).reset_index(
    drop=True
)

protein_family_edges_path = os.path.join(bulk_load_dir, "protein_family_edges.csv")
protein_family_edges.to_csv(protein_family_edges_path, index=False)

display(protein_family_edges)

Protein-Protein edges

In [None]:
import pandas as pd

# Interaction edges between the two SWISS-PROT accessions of each BioGRID row.
protein_protein_edges = bg[
    [
        "SWISS-PROT Accessions Interactor A",
        "SWISS-PROT Accessions Interactor B",
        "Experimental System",
        "Throughput",
    ]
].rename(
    columns={
        "SWISS-PROT Accessions Interactor A": "~from",
        "SWISS-PROT Accessions Interactor B": "~to",
        "Experimental System": "experimentalSystem:String",
        "Throughput": "throughput:String",
    }
)
# NOTE(review): the id is built from the two accessions only, while the
# de-duplication below also compares experimental system and throughput, so
# several rows may share one "~id" - confirm how the Neptune bulk loader
# treats repeated edge ids.
protein_protein_edges.insert(
    0,
    "~id",
    protein_protein_edges["~from"] + "-interacts_with-" + protein_protein_edges["~to"],
)
# The label column sits between the endpoints and the edge properties.
protein_protein_edges.insert(3, "~label", "interacts_with")
protein_protein_edges = protein_protein_edges.drop_duplicates()
protein_protein_edges = protein_protein_edges.sort_values(by=["~id"]).reset_index(
    drop=True
)

protein_protein_edges_path = os.path.join(bulk_load_dir, "protein_protein_edges.csv")
protein_protein_edges.to_csv(protein_protein_edges_path, index=False)
display(protein_protein_edges)

### 7.3. Upload data to S3

In [None]:
from sagemaker import s3

# Upload the generated bulk-loader files to the configured S3 location.
uploader = s3.S3Uploader()
# NOTE(review): this literal path duplicates `bulk_load_dir`
# (os.path.join(DATA_DIR, "bulk_loader")) defined earlier; keep the two in sync.
uploader.upload("data/bulk_loader/", S3_URI)

## 8. Load data into Neptune

### 8.1. Verify Neptune Connection

In [None]:
%load_ext graph_notebook.magics

In [None]:
%graph_notebook_version
%graph_notebook_config
%status

### 8.2. Start bulk loading job

Run the following cell and select "Submit".

In [None]:
%load -s $S3_URI

Run the cell below to verify that data loaded successfully into Neptune. Note that the loader may produce errors for a small number of records - this is fine.

The final result should be approximately:

- protein: 23571
- organism: 53
- family: 16447

In [None]:
%%gremlin

g.V().groupCount().by(label).unfold()