In [None]:
# install the Hugging Face Hub python library into the kernel's environment
%pip install huggingface_hub

In [None]:
# import the Hugging Face Hub download helpers
from huggingface_hub import hf_hub_download, snapshot_download

In [None]:
# set Hugging Face Hub model id (the repository whose files are downloaded
# and whose last path component names the S3 prefix)
repository_name = "Rostlab/prot_t5_xl_uniref50"

In [None]:
# download Hugging Face Hub model artifacts into ./repo-files
snapshot_download(
    repo_id=repository_name,
    local_dir="repo-files",
    local_dir_use_symlinks=False,
    # the two weight files listed here are not downloaded
    ignore_patterns=["pytorch_model_600k.bin", "pytorch_model_723k.bin"],
)

In [None]:
# create directory structure
!mkdir repo-files/code
!cp inference.py repo-files/code

In [None]:
# create Sagemaker session
import boto3
import sagemaker

# First session: used here only to look up the default bucket.
sess = sagemaker.Session()

# Leave as None to use the session's default bucket, or set a bucket name here.
sagemaker_session_bucket = None
if sess is not None and sagemaker_session_bucket is None:
    sagemaker_session_bucket = sess.default_bucket()

# Resolve the execution role; if get_execution_role() raises ValueError,
# fall back to looking up the IAM role named 'sagemaker_execution_role'.
try:
    role = sagemaker.get_execution_role()
except ValueError:
    iam = boto3.client('iam')
    role_description = iam.get_role(RoleName='sagemaker_execution_role')
    role = role_description['Role']['Arn']

# Final session, bound to the bucket chosen above.
sess = sagemaker.Session(default_bucket=sagemaker_session_bucket)

In [None]:
# set S3 location storing compressed model for Sagemaker deploy
# model_id is the part of the repository name after the last "/"
model_id = repository_name.rsplit("/", 1)[-1]
s3_location = f"s3://{sess.default_bucket()}/protein-similarity-search/{model_id}/model.tar.gz"

In [None]:
# install pigz package to optimize compression phase
!apt-get update
!apt-get install pigz

In [None]:
# compress model artifacts
%cd repo-files
!tar cf - * | pigz > ../model.tar.gz

In [None]:
# upload model to S3
s3_client = boto3.client('s3')
s3_client.upload_file('../model.tar.gz', sess.default_bucket(), f'protein-similarity-search/{model_id}/model.tar.gz')

In [None]:
# deploy the model

from sagemaker.huggingface.model import HuggingFaceModel

# Model definition: archive uploaded to S3 plus the container versions to use.
huggingface_model = HuggingFaceModel(
    model_data=s3_location,
    role=role,
    transformers_version="4.26",
    pytorch_version="1.13",
    py_version="py39",
)

# Create the endpoint; `predictor` is used afterwards to send requests.
predictor = huggingface_model.deploy(initial_instance_count=1, instance_type="ml.g4dn.8xlarge")

In [None]:
# calculate embeddings for sample protein A0A075B7B9

import re
import numpy as np

# sample sequence of protein A0A075B7B9
inputs = {"inputs": ["X E Y C N S T T F Y A"]}

res = predictor.predict(data=inputs)

# per-residue embeddings, read from the 'features' entry of the response
res_np = np.array(res['features'])
# per-protein embedding: average of the first sequence's entries along axis 0
res_np_protein = np.mean(res_np[0], axis=0)

print(f"res_np={res_np.shape}, res_np_protein={res_np_protein.shape}, res_np_protein.mean(axis=0)={res_np_protein.mean(axis=0)}")

In [None]:
# install the OpenSearch python client (imported as `opensearchpy`)
%pip install opensearch-py

In [None]:
# create OpenSearch cluster connection

from opensearchpy import OpenSearch, RequestsHttpConnection

# Before running, replace the placeholders below with:
#   1. the OpenSearch region
#   2. the master username and password
#   3. the domain endpoint WITHOUT THE INITIAL HTTPS://
# NOTE(review): avoid saving or committing the notebook with real credentials filled in.

region = 'SUBSTITUTE REGION'
aos_host = 'SUBSTITUTE DOMAIN ENDPOINT WITHOUT THE INITIAL HTTPS://'

auth = ("SUBSTITUTE MASTER USERNAME", "SUBSTITUTE MASTER PASSWORD")
index_name = 'protein_semantic_search'

# HTTPS connection on port 443 with certificate verification and basic auth
aos_client = OpenSearch(
    hosts=[dict(host=aos_host, port=443)],
    http_auth=auth,
    use_ssl=True,
    verify_certs=True,
    connection_class=RequestsHttpConnection,
)

In [None]:
# query definition and similarity search on OpenSearch

# k-NN query on the "protein_vector" field, using the per-protein embedding
# computed above as the query vector; asks for 10 neighbours and 10 hits.
query = {
    "size": 10,
    "query": {
        "knn": {
            "protein_vector": {"vector": res_np_protein.tolist(), "k": 10},
        },
    },
}

# NOTE(review): stored_fields presumably requires these fields to be mapped
# with store enabled in the index; confirm against the index mapping.
res = aos_client.search(
    index=index_name,
    body=query,
    stored_fields=["protein_vector", "sequence_id"],
)

In [None]:
# display the list of hits returned by the semantic search
res['hits']['hits']