# Fine tuning BERT for information retrieval using Amazon Sagemaker 

## Install and import dependencies

In [3]:
import subprocess
import sys

def install(package):
    subprocess.check_call([sys.executable, "-q", "-m", "pip", "install", package])
    
install('sentence_transformers')
install('opensearch-py')
install('requests_aws4auth')

import json
from torch.utils.data import DataLoader
from sentence_transformers import SentenceTransformer, LoggingHandler, util, models, evaluation, losses, InputExample
import logging
from datetime import datetime
import gzip
import os
import tarfile
from collections import defaultdict
from torch.utils.data import IterableDataset
import tqdm
from torch.utils.data import Dataset
import random
import pickle
import argparse
import sagemaker
from sagemaker.pytorch import PyTorch
from sagemaker import get_execution_role

## Setup the Sagemaker session, region and the role 

In [109]:
import boto3
sess = sagemaker.Session()
role = get_execution_role()
boto3_session = boto3.session.Session()
my_region = boto3_session.region_name
output_path = "s3://" + sess.default_bucket() + "/nlp-dualencoder"

'us-east-1'

## Sample Training input instance

## Train the Bi-Encoder

![9E3E4C40-6F04-424F-ADEA-90FFBB1FAEC9_4_5005_c.jpeg](attachment:9E3E4C40-6F04-424F-ADEA-90FFBB1FAEC9_4_5005_c.jpeg)![F3457938-14F6-469C-A2FE-0698099F052E.jpeg](attachment:F3457938-14F6-469C-A2FE-0698099F052E.jpeg)

#### References: https://www.sbert.net/examples/training/ms_marco/README.html

In [19]:
local_mode = False

if local_mode:
    instance_type = "local"
else:
    instance_type = "ml.c5.4xlarge"

est = PyTorch(
    entry_point="nlp_loader_test.py",
    source_dir="scripts/code",  # directory of your training script
    role=role,
    framework_version="1.5.0",
    py_version="py3",
    instance_type=instance_type,
    instance_count=1,
    volume_size=250,
    output_path=output_path,
    #hyperparameters={"batch-size": 128, "epochs": 1, "learning-rate": 1e-3, "log-interval": 100},
)

est.fit()

2022-09-22 14:23:57 Starting - Starting the training job...
2022-09-22 14:24:21 Starting - Preparing the instances for trainingProfilerReport-1663856637: InProgress
.........
2022-09-22 14:25:57 Downloading - Downloading input data
2022-09-22 14:25:57 Training - Downloading the training image...
2022-09-22 14:26:23 Training - Training image download completed. Training in progress..[34mbash: cannot set terminal process group (-1): Inappropriate ioctl for device[0m
[34mbash: no job control in this shell[0m
[34m2022-09-22 14:26:26,965 sagemaker-training-toolkit INFO     Imported framework sagemaker_pytorch_container.training[0m
[34m2022-09-22 14:26:26,982 sagemaker-training-toolkit INFO     No GPUs detected (normal if no gpus installed)[0m
[34m2022-09-22 14:26:26,990 sagemaker_pytorch_container.training INFO     Block until all host DNS lookups succeed.[0m
[34m2022-09-22 14:26:26,999 sagemaker_pytorch_container.training INFO     Invoking user training script.[0m
[34m2022-09-

###### Use one (non-gpu) from below for the instance_type

[ml.trn1.32xlarge, ml.p2.xlarge, ml.m5.4xlarge, ml.m4.16xlarge, ml.p4d.24xlarge, ml.g5.2xlarge, ml.c5n.xlarge, ml.p3.16xlarge, ml.m5.large, ml.p2.16xlarge, ml.g5.4xlarge, ml.c4.2xlarge, ml.c5.2xlarge, ml.c4.4xlarge, ml.g5.8xlarge, ml.c5.4xlarge, ml.c5n.18xlarge, ml.g4dn.xlarge, ml.g4dn.12xlarge, ml.c4.8xlarge, ml.g4dn.2xlarge, ml.c5.9xlarge, ml.g4dn.4xlarge, ml.c5.xlarge, ml.g4dn.16xlarge, ml.c4.xlarge, ml.g4dn.8xlarge, ml.g5.xlarge, ml.c5n.2xlarge, ml.g5.12xlarge, ml.g5.24xlarge, ml.c5n.4xlarge, ml.trn1.2xlarge, ml.c5.18xlarge, ml.p3dn.24xlarge, ml.g5.48xlarge, ml.g5.16xlarge, ml.p3.2xlarge, ml.m5.xlarge, ml.m4.10xlarge, ml.c5n.9xlarge, ml.m5.12xlarge, ml.m4.xlarge, ml.m5.24xlarge, ml.m4.2xlarge, ml.p2.8xlarge, ml.m5.2xlarge, ml.p3.8xlarge, ml.m4.4xlarge]

## Get the latest trained model

In [49]:
import boto3
bucket = sess.default_bucket()
#Make sure you provide / in the end
prefix = 'nlp-dualencoder/' 
key_list=[]

s3_client = boto3.client('s3')
result = s3_client.list_objects(Bucket=bucket, Prefix=prefix, Delimiter='/')
for i in result.get('CommonPrefixes'):
    key_list.append(i.get('Prefix'))
    
key_list_sorted = sorted(key_list, reverse=True)
model_artifact_s3uri = 's3://'+bucket+'/'+key_list_sorted[0]+'output/model.tar.gz'
model_artifact_s3key = key_list_sorted[0]+'output/model.tar.gz'
model_artifact_s3key_prefix = key_list_sorted[0].split("/")[0]
model_artifact_s3uri

's3://sagemaker-us-east-1-776075316542/nlp-dualencoder/pytorch-training-2022-09-22-14-23-57-751/output/model.tar.gz'

## Model Inference

In [103]:
from sagemaker.huggingface.model import HuggingFaceModel

# create Hugging Face Model Class
huggingface_model = HuggingFaceModel(
entry_point = 'inference.py',
    source_dir="scripts/code",
   model_data=model_artifact,       # path to your model and script
   role=role,                    # iam role with permissions to create an Endpoint
   transformers_version="4.17.0",  # transformers version used
   pytorch_version="1.10.2",        # pytorch version used
   py_version='py38',            # python version used
)



## Realtime endpoint for queries

In [29]:
# deploy the endpoint endpoint
predictor = huggingface_model.deploy(
    initial_instance_count=1,
    instance_type="ml.c5.xlarge"
    )
query_vector = predictor.predict({
	'inputs': "The answer to the universe is GOD."
})
query_vector['vectors'][0]

----!

[0.4333498775959015,
 -0.6009767651557922,
 -0.39783549308776855,
 -0.08384401351213455,
 0.5463345050811768,
 -0.3839586675167084,
 -0.681443452835083,
 0.051684703677892685,
 0.4470563232898712,
 0.6274235844612122,
 0.5728732943534851,
 -0.22543112933635712,
 0.6113768815994263,
 -0.6293169260025024,
 -0.37824878096580505,
 -0.10665169358253479,
 0.48913902044296265,
 0.35678836703300476,
 0.2005131095647812,
 0.17038586735725403,
 -0.0034015923738479614,
 -0.6248665452003479,
 0.421989768743515,
 -0.2897321283817291,
 0.0529760867357254,
 -0.2038678526878357,
 0.252308189868927,
 0.4208436608314514,
 -0.6659421920776367,
 0.7203873991966248,
 0.3893357813358307,
 0.3651546835899353,
 -0.7063692212104797,
 0.1992882788181305,
 -0.4494023621082306,
 0.5988264679908752,
 0.7761674523353577,
 0.6483849287033081,
 0.20695829391479492,
 0.4886120557785034,
 -0.3299712538719177,
 -0.3667222857475281,
 -0.2046678364276886,
 0.12381715327501297,
 -0.06808968633413315,
 -0.38116082549095154,

## Copy model extracts to local

In [78]:
s3_client.download_file(bucket, model_artifact_s3key, '/home/ec2-user/SageMaker/amazon-sagemaker-bert-finetuning-for-search/model.tar.gz')
!rm -rf ./trained_bert_model_extract
!mkdir ./trained_bert_model_extract
!tar -xvzf /home/ec2-user/SageMaker/amazon-sagemaker-bert-finetuning-for-search/model.tar.gz -C ./trained_bert_model_extract

sentence_bert_config.json
README.md
modules.json
config.json
config_sentence_transformers.json
1_Pooling/
1_Pooling/config.json
tokenizer_config.json
special_tokens_map.json
vocab.txt
pytorch_model.bin
tokenizer.json


## Batch Transform the docs to vectors

In [44]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification, BertModel
import torch
import torch.nn.functional as F
import os

from itertools import islice
import math
import pandas as pd
from numpy.random import randint


def take_(n, iterable):
    "Return first n items of the iterable as a list"
    return dict(islice(iterable, n))


model_data='./trained_bert_model_extract'

dataframe_list=[]
cols = ['docs','bert_encoded_doc_vectors']

tokenizer = AutoTokenizer.from_pretrained(model_data)
model = BertModel.from_pretrained(model_data)

### Now we read the MS Marco dataset
data_folder = 'msmarco-data'

#### Read the corpus files, that contain all the passages. Store them in the corpus dict
corpus = {}         #dict in the format: passage_id -> passage. Stores all existent passages
collection_filepath = os.path.join(data_folder, 'collection.tsv')
if not os.path.exists(collection_filepath):
    tar_filepath = os.path.join(data_folder, 'collection.tar.gz')
    if not os.path.exists(tar_filepath):
        logging.info("Download collection.tar.gz")
        util.http_get('https://msmarco.blob.core.windows.net/msmarcoranking/collection.tar.gz', tar_filepath)

    with tarfile.open(tar_filepath, "r:gz") as tar:
        tar.extractall(path=data_folder)

logging.info("Read corpus: collection.tsv")
with open(collection_filepath, 'r', encoding='utf8') as fIn:
    for line in fIn:
        pid, passage = line.strip().split("\t")
        pid = int(pid)
        corpus[pid] = passage

n_items = take_(2, corpus.items())


for key in n_items.items():

    tmp_list=[]
    encoded_input = tokenizer(n_items[key[0]], padding=True, truncation=True, return_tensors='pt')["input_ids"]
    bert_encoded = model(encoded_input)["pooler_output"].tolist()
    tmp_list.append(n_items[key[0]])
    tmp_list.append(bert_encoded[0])
    dataframe_list.append(tmp_list)
transformed_dataframe = pd.DataFrame(dataframe_list, columns=cols)

transformed_dataframe.to_csv('./transfromed_vectors.csv',index=False)

s3_client.upload_file('./transfromed_vectors.csv',bucket,  model_artifact_s3key_prefix+'/batch_output/transfromed_vectors.csv')



You are using a model of type distilbert to instantiate a model of type bert. This is not supported for all configurations of models and can yield errors.
Some weights of the model checkpoint at ./trained_bert_model_extract were not used when initializing BertModel: ['transformer.layer.2.attention.v_lin.weight', 'transformer.layer.0.output_layer_norm.weight', 'transformer.layer.3.attention.k_lin.weight', 'transformer.layer.2.attention.q_lin.weight', 'transformer.layer.5.ffn.lin2.weight', 'transformer.layer.5.attention.q_lin.bias', 'transformer.layer.4.attention.q_lin.bias', 'transformer.layer.2.ffn.lin2.bias', 'transformer.layer.3.ffn.lin2.weight', 'transformer.layer.1.attention.k_lin.bias', 'transformer.layer.4.sa_layer_norm.bias', 'transformer.layer.3.attention.q_lin.weight', 'transformer.layer.0.ffn.lin1.weight', 'transformer.layer.4.ffn.lin2.bias', 'transformer.layer.4.attention.k_lin.bias', 'transformer.layer.2.output_layer_norm.weight', 'transformer.layer.1.attention.k_lin.weight

  0%|          | 0.00/1.04G [00:00<?, ?B/s]

{0: 'The presence of communication amid scientific minds was equally important to the success of the Manhattan Project as scientific intellect was. The only cloud hanging over the impressive achievement of the atomic researchers and engineers is what their success truly meant; hundreds of thousands of innocent lives obliterated.', 1: 'The Manhattan Project and its atomic bomb helped bring an end to World War II. Its legacy of peaceful uses of atomic energy continues to have an impact on history and science.'}
['The presence of communication amid scientific minds was equally important to the success of the Manhattan Project as scientific intellect was. The only cloud hanging over the impressive achievement of the atomic researchers and engineers is what their success truly meant; hundreds of thousands of innocent lives obliterated.', [0.8233873248100281, -0.5990200042724609, -0.24138645827770233, 0.6600359678268433, -0.4724467098712921, -0.5715699791908264, -0.1324666440486908, 0.139598

## Get the domain endpoint from the cloudformation outputs

In [65]:
cfn_client = boto3.client('cloudformation')

paginator = cfn_client.get_paginator('describe_stacks')

response_iterator = paginator.paginate(
    StackName='static-cfn',
    PaginationConfig={
        'MaxItems': 123,
        'StartingToken': 'string'
    }
)
response_iterator

for i in response_iterator:
    for j in i['Stacks'][0]['Outputs']:
        if(j['OutputKey']=='DomainEndpoint'):
            host = j['OutputValue']
            break

## OpenSearch domain

In [70]:
import sagemaker
import boto3

sts_client = boto3.client('sts')

# Call the assume_role method of the STSConnection object and pass the role
# ARN and a role session name.

assumed_role_object=sts_client.assume_role(
    RoleArn=role,
    RoleSessionName="AssumeRoleSession1"
)

# From the response that contains the assumed role, get the temporary 
# credentials that can be used to make subsequent API calls

credentials=assumed_role_object['Credentials']


session = boto3.Session(
    aws_access_key_id=credentials['AccessKeyId'],
    aws_secret_access_key=credentials['SecretAccessKey'],
    aws_session_token=credentials['SessionToken']
)

credentials = session.get_credentials()

from opensearchpy import OpenSearch, RequestsHttpConnection, AWSV4SignerAuth
import boto3


port = 443

s3 = boto3.client('s3')
import re
import requests
import json

from requests_aws4auth import AWS4Auth
endpoint = 'https://'+host # the proxy endpoint, including https://
region = my_region
service = 'execute-api'

awsauth = AWSV4SignerAuth(credentials, region)

headers = { "Content-Type": "application/json"
}



client = OpenSearch(
    hosts = [{'host': host, 'port': 443}],
    http_auth = awsauth,
    use_ssl = True,
    #verify_certs = True,
    connection_class = RequestsHttpConnection
)

client



<OpenSearch([{'host': 'vpc-semantic-search-l5f6en4zbgbwraivmibtnvaszy.us-east-1.es.amazonaws.com', 'port': 443}])>

## create index and mapping

In [75]:
index_name = 'nlpindex_search_bert'
index_body = {
  'settings': {
    'index': {
      'number_of_shards': 4,
        'knn': True
    }
  },

    'mappings': {
       
            'properties': {
                'passage': {'type': 'text'},
                
               "bert_vector": {
        "type": "knn_vector",
        "dimension": 768
      },
            }}
}


response = client.indices.create(index_name, body=index_body)
print('\nCreating index:')
print(response)




Creating index:
{'acknowledged': True, 'shards_acknowledged': True, 'index': 'nlpindex_search_bert_19sep'}


## Index the document and vectors (Batch Transform)

In [76]:
import pandas as pd
import s3fs
from ast import literal_eval

df = pd.read_csv('s3://'+bucket+'/'+model_artifact_s3key_prefix+'/batch_output/transfromed_vectors.csv')
df

df = df.reset_index()  # make sure indexes pair with number of rows

for index, row in df.iterrows():
    line={}
    line['passage']=row['docs']
    line['bert_vector']=literal_eval(row['bert_encoded_doc_vectors'])

    response = client.index(
    index = index_name,
    body = line)
    print(response)

{'_index': 'nlpindex_search_bert_19sep', '_type': '_doc', '_id': '40wPZoMBsvn7m6QjI-bk', '_version': 1, 'result': 'created', '_shards': {'total': 2, 'successful': 1, 'failed': 0}, '_seq_no': 0, '_primary_term': 1}
{'_index': 'nlpindex_search_bert_19sep', '_type': '_doc', '_id': '5EwPZoMBsvn7m6QjJuYv', '_version': 1, 'result': 'created', '_shards': {'total': 2, 'successful': 1, 'failed': 0}, '_seq_no': 1, '_primary_term': 1}


## Search the domain in realtime using sagemaker endpoint and lambda

In [77]:
query_vector = predictor.predict({
	'inputs': "The answer to the universe is GOD."
})

q = 'miller'
query = {"query":{
  "knn": {
      "bert_vector": {
        "vector": query_vector['vectors'][0],
        "k": 1
      }
    }
}}


response = client.search(
    body = query,
    index = index_name
)
print('\nSearch results:')
print(response)


Search results:
{'took': 196, 'timed_out': False, '_shards': {'total': 4, 'successful': 4, 'skipped': 0, 'failed': 0}, 'hits': {'total': {'value': 2, 'relation': 'eq'}, 'max_score': 0.0031497153, 'hits': [{'_index': 'nlpindex_search_bert_19sep', '_type': '_doc', '_id': '40wPZoMBsvn7m6QjI-bk', '_score': 0.0031497153, '_source': {'passage': 'The presence of communication amid scientific minds was equally important to the success of the Manhattan Project as scientific intellect was. The only cloud hanging over the impressive achievement of the atomic researchers and engineers is what their success truly meant; hundreds of thousands of innocent lives obliterated.', 'bert_vector': [0.8233873248100281, -0.5990200042724609, -0.24138645827770233, 0.6600359678268433, -0.4724467098712921, -0.5715699791908264, -0.1324666440486908, 0.13959871232509613, 0.34819453954696655, 0.06347744911909103, -0.35870498418807983, -0.7085609436035156, -0.7292724251747131, 0.22835156321525574, -0.2151727229356765

In [92]:
runtime= boto3.client('sagemaker')
runtime.list_endpoints()['Endpoints']
endpoints = runtime.list_endpoints()
endpoints
latest_endpoint = sorted(endpoints['Endpoints'],key=lambda x: x['CreationTime'], reverse=True)[0]['EndpointName']
latest_endpoint

'huggingface-pytorch-inference-2022-09-22-14-55-47-362'

## Mini Search Engine

In [101]:
from ipywidgets import interact, widgets
from IPython.display import display
import boto3
import json
from ast import literal_eval
lambda_client = boto3.client('lambda')
runtime= boto3.client('runtime.sagemaker')

text = widgets.Text(
    value='Search here!',
    placeholder='Search here!',
    description='Search:',
    disabled=False
)

text.style._view_name = '100px'

display(text)

def callback(wdgt):
    payload_ = json.dumps({"inputs": wdgt.value})

    #res = lambda_client.invoke(FunctionName='sagemaker-access-endpoints', InvocationType='RequestResponse',Payload=lambda_payload_)
    res_ = runtime.invoke_endpoint(EndpointName=latest_endpoint,Body=payload_,ContentType='application/json')
    #res=literal_eval(res['Payload'].read().decode())
    
    
    res = json.loads(res_['Body'].read().decode())['vectors'][0]
    
    

    q = 'miller'
    query = {"query":{
    "knn": {
      "bert_vector": {
        "vector": res,
        "k": 1
      }
        }
    }}


    response = client.search(
    body = query,
    index = index_name
    )
    result = {}
    result['doc']=response['hits']['hits'][0]['_source']['passage']
    result['score']=response['hits']['hits'][0]['_score']
    display(result)

text.on_submit(callback)

Text(value='Search here!', description='Search:', placeholder='Search here!')

{'doc': 'The Manhattan Project and its atomic bomb helped bring an end to World War II. Its legacy of peaceful uses of atomic energy continues to have an impact on history and science.',
 'score': 0.0031030374}