# Semantic Search with Fine Tuning Model

In [1]:
!pip install --upgrade torch

Collecting torch
  Using cached torch-1.10.2-cp36-cp36m-manylinux1_x86_64.whl (881.9 MB)
Installing collected packages: torch
  Attempting uninstall: torch
    Found existing installation: torch 1.7.1
    Uninstalling torch-1.7.1:
      Successfully uninstalled torch-1.7.1
Successfully installed torch-1.10.2


In [None]:
from IPython.display import display_html
def restartkernel() :
    display_html("<script>Jupyter.notebook.kernel.restart()</script>",raw=True)
restartkernel()

In [1]:
import torch
print(torch.__version__)

1.10.2+cu102


In [2]:
!pip install -U sentence-transformers rank_bm25
!pip install -q opensearch-py
!pip install -q tqdm

Collecting sentence-transformers
  Using cached sentence_transformers-2.2.2-py3-none-any.whl
Collecting rank_bm25
  Using cached rank_bm25-0.2.2-py3-none-any.whl (8.6 kB)
Collecting huggingface-hub>=0.4.0
  Using cached huggingface_hub-0.4.0-py3-none-any.whl (67 kB)
Collecting transformers<5.0.0,>=4.6.0
  Using cached transformers-4.18.0-py3-none-any.whl (4.0 MB)
Collecting sentencepiece
  Using cached sentencepiece-0.1.97-cp36-cp36m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
Collecting filelock
  Downloading filelock-3.4.1-py3-none-any.whl (9.9 kB)
Collecting sacremoses
  Using cached sacremoses-0.0.53-py3-none-any.whl
Collecting tokenizers!=0.11.3,<0.13,>=0.11.1
  Using cached tokenizers-0.12.1-cp36-cp36m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (6.6 MB)
Installing collected packages: filelock, tokenizers, sacremoses, huggingface-hub, transformers, sentencepiece, sentence-transformers, rank-bm25
Successfully installed filelock-3.4.1 huggingface-hub-0.4.0 rank-bm25-

### Note change "cloudformation_stack_name" to the Cloud Formation stack name when you provision your env.

In [3]:
import boto3

cfn = boto3.client('cloudformation')

def get_cfn_outputs(stackname):
    outputs = {}
    for output in cfn.describe_stacks(StackName=stackname)['Stacks'][0]['Outputs']:
        outputs[output['OutputKey']] = output['OutputValue']
    return outputs

## Setup variables to use for the rest of the demo
cloudformation_stack_name = "static-cloudformation-semantic-search"

outputs = get_cfn_outputs(cloudformation_stack_name)

bucket = outputs['s3BucketTraining']
aos_host = outputs['DomainEndpoint']

outputs



{'DomainEndpoint': 'search-opensearchservi-syxjz3qrneyt-qwv7yjocaeblepoky43ienflvu.us-east-1.es.amazonaws.com',
 'S3BucketSecureURL': 'https://static-cloudformation-semantic-se-s3buckethosting-18ofta7sitf9g.s3.amazonaws.com',
 'SageMakerNotebookURL': 'https://console.aws.amazon.com/sagemaker/home?region=us-east-1#/notebook-instances/openNotebook/NotebookInstance-FayBdMH70xG5?view=classic',
 'osArn': 'arn:aws:es:us-east-1:522880334446:domain/opensearchservi-syxjz3qrneyt',
 's3BucketTraining': 'static-cloudformation-semantic-s-s3buckettraining-5wyh4fklf11q',
 'osDomainName': 'opensearchservi-syxjz3qrneyt',
 's3BucketHostingBucketName': 'static-cloudformation-semantic-se-s3buckethosting-18ofta7sitf9g'}

## Step 1: Fine Tune the modal

## Fine Tuning Model

In [4]:
from torch.utils.data import DataLoader
import math
from sentence_transformers import SentenceTransformer,  LoggingHandler, losses, models, util
from sentence_transformers.evaluation import EmbeddingSimilarityEvaluator
from sentence_transformers.readers import InputExample
import logging
from datetime import datetime
import sys
import os
import gzip
import csv

Load data set of Amazon Product Question and Answer data from : https://registry.opendata.aws/amazon-pqa/

In [5]:
!aws s3 ls --no-sign-request s3://amazon-pqa/

2021-05-20 13:11:25 2267692311 amazon-pqa.tar.gz
2021-05-09 11:53:53  442066567 amazon_pqa_accessories.json
2021-05-09 11:53:49  275062405 amazon_pqa_activity_&_fitness_trackers.json
2021-05-09 11:53:49  127094083 amazon_pqa_adapters.json
2021-05-09 11:53:49  143639699 amazon_pqa_amazon_echo_&_alexa_devices.json
2021-05-09 11:53:49  106017252 amazon_pqa_area_rugs.json
2021-05-09 11:53:49  164430689 amazon_pqa_backpacks.json
2021-05-09 11:53:49  679285046 amazon_pqa_basic_cases.json
2021-05-09 11:53:49  390964941 amazon_pqa_batteries.json
2021-05-09 11:53:49  107896488 amazon_pqa_battery_chargers.json
2021-05-09 11:53:49   77113272 amazon_pqa_bed_frames.json
2021-05-09 11:53:49  157944761 amazon_pqa_beds.json
2021-05-09 11:53:49  218133567 amazon_pqa_bullet_cameras.json
2021-05-09 11:53:50  118106256 amazon_pqa_camcorders.json
2021-05-09 11:53:50   71239417 amazon_pqa_car.json
2021-05-09 11:53:50  137487049 amazon_pqa_car_stereo_receivers.json
2021-05-09 11:53:50  153301

In [6]:
!aws s3 cp --no-sign-request s3://amazon-pqa/amazon_pqa_headsets.json ./amazon-pqa/amazon_pqa_headsets.json

download: s3://amazon-pqa/amazon_pqa_headsets.json to amazon-pqa/amazon_pqa_headsets.json


In [7]:
import json
import pandas as pd

def load_pqa(file_name,number_rows=1000):
    qa_list = []
    df = pd.DataFrame(columns=('question', 'answer','label'))
    with open(file_name) as f:
        i=0
        previous_row_data = None
        for line in f:
            data = json.loads(line)
            df.loc[i] = [data['question_text'],data['answers'][0]['answer_text'],1.0]
            i+=1
            if previous_row_data is not None:
                df.loc[i] = [data['question_text'],previous_row_data['answers'][0]['answer_text'],0.0]
            previous_row_data = data
            i+=1
            if(i == number_rows*2):
                break
    return df


qa_list = load_pqa('amazon-pqa/amazon_pqa_headsets.json',number_rows=1000)


In [8]:
qa_list

Unnamed: 0,question,answer,label
0,does this work with cisco ip phone 7942,Use the Plantronics compatibility guide to see...,1.0
2,Is this compatible with the cisco ip phone 797...,Don’t know. Call Plantronics,1.0
3,Is this compatible with the cisco ip phone 797...,Use the Plantronics compatibility guide to see...,0.0
4,"If i have a polycom vvx, what adapter cable wi...","Hi Gabrielle, what is the model of VVX?",1.0
5,"If i have a polycom vvx, what adapter cable wi...",Don’t know. Call Plantronics,0.0
...,...,...,...
1995,How good is the microphone quality?,"I don't wear glasses personally, but the perso...",0.0
1996,is their an attachment that I can use to conne...,it didn't come with one but someone may make a...,1.0
1997,is their an attachment that I can use to conne...,Its actully really good. when i play with ny ...,0.0
1998,Do these have more bass than the game zeros?,Due to the closed back design the Game ZERO wi...,1.0


In [9]:
from sklearn.model_selection import train_test_split
from sentence_transformers.readers import InputExample

train_set,test_set = train_test_split(qa_list,test_size=0.2,shuffle=True)
training_set, validation_set = train_test_split(train_set,test_size=0.2)

def create_input_sample(data_set):
    train_samples = []
    for index,row in data_set.iterrows():
        input_example = InputExample(texts=[row['question'], row['answer']], label=row['label'])
        train_samples.append(input_example)
    return train_samples

training_samples = create_input_sample(training_set)
validation_samples = create_input_sample(validation_set)
test_samples = create_input_sample(test_set)


In [10]:
from torch.utils.data import DataLoader
from sentence_transformers import SentenceTransformer,  LoggingHandler, losses, models, util
from sentence_transformers.evaluation import EmbeddingSimilarityEvaluator

model_name = "sentence-transformers/distilbert-base-nli-stsb-mean-tokens"
train_batch_size = 16
num_epochs = 1
model_save_path = 'output/fine_tuned_'+model_name.replace("/", "-")

# Use Huggingface/transformers model (like BERT, RoBERTa, XLNet, XLM-R) for mapping tokens to embeddings
word_embedding_model = models.Transformer(model_name)

# Apply mean pooling to get one fixed sized sentence vector
pooling_model = models.Pooling(word_embedding_model.get_word_embedding_dimension(),
                               pooling_mode_mean_tokens=True,
                               pooling_mode_cls_token=False,
                               pooling_mode_max_tokens=False)

model = SentenceTransformer(modules=[word_embedding_model, pooling_model])

train_dataloader = DataLoader(training_samples, shuffle=True, batch_size=train_batch_size)
train_loss = losses.CosineSimilarityLoss(model=model)

evaluator = EmbeddingSimilarityEvaluator.from_input_examples(validation_samples, name='pqa-valucation')


warmup_steps = math.ceil(len(train_dataloader) * num_epochs  * 0.1) #10% of train data for warm-up



# Train the model
model.fit(train_objectives=[(train_dataloader, train_loss)],
          evaluator=evaluator,
          epochs=num_epochs,
          evaluation_steps=1000,
          warmup_steps=warmup_steps,
          output_path=model_save_path)


model = SentenceTransformer(model_save_path)
test_evaluator = EmbeddingSimilarityEvaluator.from_input_examples(test_samples, name='pqa-test')
test_evaluator(model, output_path=model_save_path)

Epoch:   0%|          | 0/1 [00:00<?, ?it/s]

Iteration:   0%|          | 0/80 [00:00<?, ?it/s]

0.40803412298270003

In [11]:
!cd output/fine_tuned_sentence-transformers-distilbert-base-nli-stsb-mean-tokens && tar czvf ../model.tar.gz *

1_Pooling/
1_Pooling/config.json
config.json
config_sentence_transformers.json
eval/
eval/similarity_evaluation_pqa-valucation_results.csv
modules.json
pytorch_model.bin
README.md
sentence_bert_config.json
similarity_evaluation_pqa-test_results.csv
special_tokens_map.json
tokenizer_config.json
tokenizer.json
vocab.txt


In [12]:
import sagemaker

sagemaker_session = sagemaker.Session()
role = sagemaker.get_execution_role()

inputs = sagemaker_session.upload_data(path='model.tar.gz', key_prefix='fine-tuned-transformers-model')
inputs

's3://sagemaker-us-east-1-522880334446/fine-tuned-transformers-model/model.tar.gz'

### Deploy the BERT model to SageMaker Endpoint

First we need to create a PyTorchModel object. The deploy() method on the model object creates an endpoint which serves prediction requests in real-time. If the instance_type is set to a SageMaker instance type (e.g. ml.m5.large) then the model will be deployed on SageMaker. If the instance_type parameter is set to local then it will be deployed locally as a Docker container and ready for testing locally.

First we need to create a Predictor class to accept TEXT as input and output JSON. The default behaviour is to accept a numpy array.


In [13]:
from sagemaker.pytorch import PyTorch, PyTorchModel
from sagemaker.predictor import Predictor
from sagemaker import get_execution_role

class StringPredictor(Predictor):
    def __init__(self, endpoint_name, sagemaker_session):
        super(StringPredictor, self).__init__(endpoint_name, sagemaker_session, content_type='text/plain')


Deploy the BERT model to Sagemaker Endpoint

#### Note: This process will take serveral minutes to complete.

In [None]:
import time

pytorch_model = PyTorchModel(model_data = inputs, 
                             role=role, 
                             entry_point ='inference.py',
                             source_dir = './code',
                             py_version = 'py38', 
                             framework_version = '1.10.2',
                             predictor_cls=StringPredictor)

predictor = pytorch_model.deploy(instance_type='ml.m5d.large', 
                                 initial_instance_count=1, 
                                 endpoint_name = f'semantic-search-model-{int(time.time())}')

### Test the SageMaker Endpoint.

Input is text data, output is vector data

In [None]:
import json
original_payload = 'Does this work with xbox?'
features = predictor.predict(original_payload)
vector_data = json.loads(features)

vector_data


## Step 2: Ingest data to OpenSearch Cluster


Use Python API to set up connection with OpenSearch Cluster

In [None]:
from opensearchpy import OpenSearch, RequestsHttpConnection, AWSV4SignerAuth
import boto3

region = 'us-east-1' 

credentials = boto3.Session().get_credentials()
auth = AWSV4SignerAuth(credentials, region)
index_name = 'nlp_pqa'

aos_client = OpenSearch(
    hosts = [{'host': aos_host, 'port': 443}],
    http_auth = auth,
    use_ssl = True,
    verify_certs = True,
    connection_class = RequestsHttpConnection
)

Create a index with 2 fields, the first field is "content" for raw sentece, the second field is "nlp_article_vector" for vector data.

In [None]:
knn_index = {
    "settings": {
        "index.knn": True,
        "index.knn.space_type": "cosinesimil",
        "analysis": {
          "analyzer": {
            "default": {
              "type": "standard",
              "stopwords": "_english_"
            }
          }
        }
    },
    "mappings": {
        "properties": {
            "question_vector": {
                "type": "knn_vector",
                "dimension": 768,
                "store": True
            },
            "question": {
                "type": "text",
                "store": True
            },
            "answer": {
                "type": "text",
                "store": True
            }
        }
    }
}


In [None]:
aos_client.indices.delete(index="nlp_pqa")


In [None]:
aos_client.indices.create(index="nlp_pqa",body=knn_index,ignore=400)


Show the created index information

In [None]:
aos_client.indices.get(index="nlp_pqa")

### We can ingest 1000 rows data for test

In [None]:
import json
from tqdm.contrib.concurrent import process_map
from multiprocessing import cpu_count


def load_pqa_as_json(file_name,number_rows=1000):
    result=[]
    with open(file_name) as f:
        i=0
        for line in f:
            data = json.loads(line)
            result.append(data)
            i+=1
            if(i == number_rows):
                break
    return result


qa_list_json = load_pqa_as_json('amazon-pqa/amazon_pqa_headsets.json',number_rows=1000)


def es_import(question):
    vector = json.loads(predictor.predict(question["question_text"]))
    aos_client.index(index='nlp_pqa',
             body={"question_vector": vector, "question": question["question_text"],"answer":question["answers"][0]["answer_text"]}
            )
        
workers = 4 * cpu_count()
    
process_map(es_import, qa_list_json, max_workers=workers,chunksize=1000)

### Query the documents number in the OpenSearch Cluster

In [None]:
res = aos_client.search(index="nlp_pqa", body={"query": {"match_all": {}}})
print("Got %d Hits:" % res['hits']['total']['value'])

## Step 3: Semantic Search 
### Generate vector data for user input query 

Generate vector data for the question by calling SageMaker model

In [None]:
query_raw_sentences = ['does this work with xbox?']
client = boto3.client('sagemaker-runtime')
ENDPOINT_NAME = predictor.endpoint
response = client.invoke_endpoint(EndpointName=ENDPOINT_NAME,
                                       ContentType='text/plain',
                                       Body=query_raw_sentences[0])

search_vector = json.loads((response['Body'].read()))


### Search vector data with "Semanatic Search" 

OpenSearch KNN


In [None]:

query={
    "size": 50,
    "query": {
        "knn": {
            "question_vector":{
                "vector":search_vector,
                "k":50
            }
        }
    }
}

res = aos_client.search(index="nlp_pqa", 
                       body=query,
                       stored_fields=["question","answer"])
#print("Got %d Hits:" % res['hits']['total']['value'])
query_result=[]
for hit in res['hits']['hits']:
    row=[hit['_id'],hit['_score'],hit['fields']['question'][0],hit['fields']['answer'][0]]
    query_result.append(row)

query_result_df = pd.DataFrame(data=query_result,columns=["_id","_score","question","answer"])
display(query_result_df)

### Search the same query with "Keyword Search"

In [None]:
query={
    "size": 50,
    "query": {
        "match": {
            "question":"does this work with xbox?"
        }
    }
}

res = aos_client.search(index="nlp_pqa", 
                       body=query,
                       stored_fields=["question","answer"])
#print("Got %d Hits:" % res['hits']['total']['value'])
query_result=[]
for hit in res['hits']['hits']:
    row=[hit['_id'],hit['_score'],hit['fields']['question'][0],hit['fields']['answer'][0]]
    query_result.append(row)

query_result_df = pd.DataFrame(data=query_result,columns=["_id","_score","question","answer"])
display(query_result_df)


## Cleanup

Make sure that you stop the notebook instance, delete the Amazon SageMaker endpoint and delete the Elasticsearch domain to prevent any additional charges.

In [None]:
# Delete the endpoint
predictor.delete_endpoint()

# Empty S3 Contents
training_bucket_resource = s3_resource.Bucket(bucket)
training_bucket_resource.objects.all().delete()

hosting_bucket_resource = s3_resource.Bucket(outputs['s3BucketHostingBucketName'])
hosting_bucket_resource.objects.all().delete()