<a href="https://colab.research.google.com/github/bvader/elasticsearch-test-simple-vector/blob/main/elasticsearch-test-simple-vector.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Initial Setup

In [None]:
!pip install elasticsearch




In [None]:
# Read in connection and auth info
import getpass, os

# Ask for each connection setting without echoing what is typed, and keep the
# answers in the process environment under the given keys.
for env_name, prompt_text in (
    ('es_url', 'Enter Elasticsearch Endpoint:  '),
    ('es_user', 'Enter User:  '),
    ('es_pwd', 'Enter Password:  '),
):
    os.environ[env_name] = getpass.getpass(prompt_text)

Enter Elasticsearch Endpoint:  ··········
Enter User:  ··········
Enter Password:  ··········


In [None]:
# Connect and test connection
from elasticsearch import Elasticsearch


# Read the connection settings back from the environment; a missing key raises
# KeyError, which means the credentials cell has not been run yet.
es_url, es_user, es_pwd = (
    os.environ[key] for key in ('es_url', 'es_user', 'es_pwd')
)

# Initialize the Elasticsearch client (basic auth, 30 second request timeout)
es = Elasticsearch(
    hosts=[es_url],
    basic_auth=(es_user, es_pwd),
    request_timeout=30,
)

# Ask the cluster for its info document so a bad endpoint or bad credentials
# show up here; the dict is displayed as the cell output.
es.info().body

{'name': 'instance-0000000001',
 'cluster_name': '44feffcc909849f295ae3ed4a9be10c1',
 'cluster_uuid': 'Fm7mp0U6S0y1ASR9Gjvh3w',
 'version': {'number': '8.9.0',
  'build_flavor': 'default',
  'build_type': 'docker',
  'build_hash': '8aa461beb06aa0417a231c345a1b8c38fb498a0d',
  'build_date': '2023-07-19T14:43:58.555259655Z',
  'build_snapshot': False,
  'lucene_version': '9.7.0',
  'minimum_wire_compatibility_version': '7.17.0',
  'minimum_index_compatibility_version': '7.0.0'},
 'tagline': 'You Know, for Search'}

In [None]:
# Upload the file first
# Then preview the start of the uploaded file (one JSON document per line) to
# confirm it is at the path the loading cell reads from.
!head /content/sample_data/amazon_pqa_headset.json

{"question_id": "Tx39GCUOS5AYAFK", "question_text": "does this work with cisco ip phone 7942", "asin": "B000LSZ2D6", "bullet_point1": "Noise-Canceling microphone filters out background sound", "bullet_point2": "HW251N P/N 75100-06", "bullet_point3": "Uses Plantronics QD Quick Disconnect Connector. Must be used with Plantronics Amp or with proper phone or USB adapter cable", "bullet_point4": "Connectivity Technology: Wired, Earpiece Design: Over-the-head, Earpiece Type: Monaural, Host Interface: Proprietary, Microphone Design: Boom, Microphone Technology: Noise Canceling, Product Model: HW251N, Product Series: SupraPlus, Standard Warranty: 2 Year", "bullet_point5": "Easy Lightweight Wear -Leaving One Ear Uncovered For Person-to-Person Conversations", "product_description": "", "brand_name": "Plantronics", "item_name": "Plantronics HW251N SupraPlus Wideband Headset (64338-31)", "question_type": "yes-no", "answer_aggregated": "neutral", "answers": [{"answer_text": "Use the Plantronics com

# Data and Model Setup

In [None]:
# Load the data into the dataframe. 1000 rows for test
import sys
import datetime
import json
import os
import time

import pandas as pd
import numpy as np

from elasticsearch import Elasticsearch
from elasticsearch.helpers import bulk
from datetime import datetime

def _read_qa_pairs(path, limit=1000):
    """Read up to ``limit`` question/answer pairs from a JSON-lines file.

    Each non-blank line is parsed as one JSON document. The question comes
    from ``question_text`` and the answer from the first entry of ``answers``.
    Records without any answer are skipped rather than raising IndexError.

    Args:
        path: Location of the JSON-lines file.
        limit: Maximum number of rows to collect.

    Returns:
        A DataFrame with columns ``question`` and ``answer`` and a 0..n-1 index.
    """
    rows = []
    with open(path, encoding='utf-8') as f:
        for line in f:
            if len(rows) >= limit:
                break
            line = line.strip()
            if not line:
                continue
            data = json.loads(line)
            answers = data.get('answers') or []
            if not answers:
                continue
            rows.append((data['question_text'], answers[0]['answer_text']))
    # Build the frame once; assigning df.loc[i] per row re-allocates each time.
    return pd.DataFrame(rows, columns=['question', 'answer'])


df = _read_qa_pairs('/content/sample_data/amazon_pqa_headset.json')


In [None]:
# Install eland
!pip install -q eland elasticsearch elasticsearch_dsl transformers sentence_transformers

from elasticsearch_dsl import Search
from eland.ml.pytorch import PyTorchModel
from eland.ml.pytorch.transformers import TransformerModel
from getpass import getpass
import logging
import tempfile
from pprint import pformat
import secrets

In [None]:
# Load the eland model

# Base URL of the Hugging Face hub (not referenced by the cells visible here).
MODEL_HUB_URL = "https://huggingface.co"

# Timestamp + level prefix for log lines emitted through the root handler.
logging.basicConfig(format='%(asctime)s %(levelname)s : %(message)s')

# Notebook-level logger; INFO so the model import progress messages are shown.
logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)

def load_model(model_id, task_type):
    """Export a Hugging Face model with eland and import it into the cluster.

    The model is saved into a temporary directory. If the cluster does not
    already hold a trained model under the derived Elasticsearch model id, the
    saved files are imported and the deployment is started. When the model is
    already present nothing is imported and no deployment is started here.

    Args:
        model_id: Hugging Face model identifier passed to eland's TransformerModel.
        task_type: Task type passed to eland's TransformerModel.

    Returns:
        None.
    """
    with tempfile.TemporaryDirectory() as export_dir:
        logger.info(f"Loading HuggingFace transformer tokenizer and model [{model_id}] for task [{task_type}]" )

        transformer = TransformerModel(model_id=model_id, task_type=task_type)
        model_path, config, vocab_path = transformer.save(export_dir)

        es_model = PyTorchModel(es, transformer.elasticsearch_model_id())

        # ignore_status=404 turns "no such model" into a normal response, so the
        # HTTP status alone says whether the model is already in the cluster.
        lookup = es.options(ignore_status=404).ml.get_trained_models(model_id=es_model.model_id)
        if lookup.meta.status == 200:
            logger.info("Model has already been imported")
            return

        logger.info("Importing model")
        es_model.import_model(model_path=model_path, config_path=None, vocab_path=vocab_path, config=config)
        logger.info("Starting model deployment")
        es_model.start()
        logger.info(f"Model successfully imported with id '{es_model.model_id}'")

In [None]:
# Import the sentence-embedding model into the cluster (no-op import if it is
# already there).
load_model(
    model_id="sentence-transformers/all-MiniLM-L12-v2",
    task_type="text_embedding",
)

# Fetch the stored model configuration so the cell output shows how it loaded.
es.ml.get_trained_models(
    model_id="sentence-transformers__all-minilm-l12-v2"
).body

INFO:__main__:Loading HuggingFace transformer tokenizer and model [sentence-transformers/all-MiniLM-L12-v2] for task [text_embedding]
INFO:__main__:Importing model


  0%|          | 0/32 [00:00<?, ? parts/s]

INFO:__main__:Starting model deployment
INFO:__main__:Model successfully imported with id 'sentence-transformers__all-minilm-l12-v2'


{'count': 1,
 'trained_model_configs': [{'model_id': 'sentence-transformers__all-minilm-l12-v2',
   'model_type': 'pytorch',
   'created_by': 'api_user',
   'version': '8.9.0',
   'create_time': 1692240611181,
   'model_size_bytes': 0,
   'estimated_operations': 0,
   'license_level': 'platinum',
   'description': "Model sentence-transformers/all-MiniLM-L12-v2 for task type 'text_embedding'",
   'tags': [],
   'input': {'field_names': ['text_field']},
   'inference_config': {'text_embedding': {'vocabulary': {'index': '.ml-inference-native-000001'},
     'tokenization': {'bert': {'do_lower_case': True,
       'with_special_tokens': True,
       'max_sequence_length': 512,
       'truncate': 'first',
       'span': -1}}}},
   'location': {'index': {'name': '.ml-inference-native-000001'}}}]}

# Create Index, Pipeline and Load Index

In [None]:
# Create the plain-text index. HTTP 400 responses are ignored so that re-running
# the cell when the index already exists does not raise.
# NOTE(review): the name is spelled "nlp_qpa_1000" while the embeddings index is
# "nlp_pqa_1000_embeddings", and no later visible cell uses this index -- confirm
# whether it is still needed.
es.options(ignore_status=400).indices.create(
    index="nlp_qpa_1000",
    settings={"number_of_shards": 1},
    mappings={
        "properties": {
            field_name: {"type": "text"}
            for field_name in ("question", "answer")
        }
    },
)

ObjectApiResponse({'acknowledged': True, 'shards_acknowledged': True, 'index': 'nlp_qpa_1000'})

In [None]:
# Create Pipeline
# Inference step: feed the document's `question_text` to the model as
# `text_field` and write the model output under `question_vector`.
inference_processor = {
    "inference": {
        "model_id": "sentence-transformers__all-minilm-l12-v2",
        "field_map": {"question_text": "text_field"},
        "target_field": "question_vector",
    }
}

# Failure handling: reroute the document to a "failed-<index>" index and record
# the failure message on the document.
failure_handlers = [
    {
        "set": {
            "description": "Index document to 'failed-<index>'",
            "field": "_index",
            "value": "failed-{{{_index}}}",
        }
    },
    {
        "set": {
            "description": "Set error message",
            "field": "ingest.failure",
            "value": "{{_ingest.on_failure_message}}",
        }
    },
]

es.ingest.put_pipeline(
    id="sentence-text-embedding",
    description="Text embedding pipeline",
    processors=[inference_processor],
    on_failure=failure_handlers,
)

ObjectApiResponse({'acknowledged': True})

In [None]:
# Create the index that stores the documents together with their embeddings.
# HTTP 400 responses are ignored so re-running the cell on an existing index
# does not raise.
#
# The text field is mapped as `question_text` because that is the field name the
# bulk-load cell sends and the query cells read back from `_source`; a mapping
# for `question` would never be populated.
es.options(ignore_status=400).indices.create(
    index="nlp_pqa_1000_embeddings",
    settings={"number_of_shards": 1},
    mappings={
        "properties": {
            "question_text": {"type": "text"},
            "answer": {"type": "text"},
            # Where the inference processor's vector ends up
            # (target_field "question_vector" + "predicted_value").
            "question_vector.predicted_value": {
                "type": "dense_vector",
                # Must equal the embedding size of the deployed model
                # (presumably 384 for all-MiniLM-L12-v2 -- confirm if the model changes).
                "dims": 384,
                "index": True,
                "similarity": "cosine",
            },
        }
    },
)

ObjectApiResponse({'acknowledged': True, 'shards_acknowledged': True, 'index': 'nlp_pqa_1000_embeddings'})

In [None]:
# Load data through the model using a pipeline
def generator():
    """Yield one bulk index action per row of ``df``.

    Every action targets the ``nlp_pqa_1000_embeddings`` index and is routed
    through the ``sentence-text-embedding`` ingest pipeline. The question is
    sent as ``question_text``, the field the pipeline's field_map reads.
    """
    for question, answer in zip(df["question"], df["answer"]):
        yield {
            "_index": "nlp_pqa_1000_embeddings",
            "pipeline": "sentence-text-embedding",
            "question_text": question,
            "answer": answer,
        }


# raise_on_error=False makes the helper return per-document failures in the
# result instead of raising, so they can be reported explicitly. Connection-level
# errors still propagate, so a failed load is not mistaken for a successful one.
res = bulk(es, generator(), raise_on_error=False)
print("Response: ", res)

indexed_count, failed_items = res
if failed_items:
    print(f"{len(failed_items)} document(s) failed to index; first failure: {failed_items[0]}")


Response:  (1000, [])


In [None]:
# Start the model deployment only when the model is not deployed yet.
# Starting a model that is already started errors, so look at the model stats
# first: a `deployment_stats` entry is expected only for a deployed model.
deployment_model_id = "sentence-transformers__all-minilm-l12-v2"
model_stats = es.ml.get_trained_models_stats(model_id=deployment_model_id)
already_deployed = any(
    "deployment_stats" in entry for entry in model_stats["trained_model_stats"]
)

if already_deployed:
    print(f"Model '{deployment_model_id}' is already deployed; not starting it again.")
    deployment_response = model_stats
else:
    deployment_response = es.ml.start_trained_model_deployment(model_id=deployment_model_id)

# Show either the start response or the stats of the existing deployment.
deployment_response

ObjectApiResponse({'assignment': {'task_parameters': {'model_id': 'sentence-transformers__all-minilm-l12-v2', 'deployment_id': 'sentence-transformers__all-minilm-l12-v2', 'model_bytes': 132922385, 'threads_per_allocation': 1, 'number_of_allocations': 1, 'queue_capacity': 1024, 'cache_size': '132922385b', 'priority': 'normal'}, 'routing_table': {'hlv_c1RmSVqcTN2rzu-IDg': {'current_allocations': 1, 'target_allocations': 1, 'routing_state': 'started', 'reason': ''}}, 'assignment_state': 'started', 'start_time': '2023-08-17T02:15:11.365753978Z', 'max_assigned_allocations': 1}})

# Queries

In [None]:
# Simple KNN
query = input ("Enter a question :")
print('\n')

# The query text is embedded server-side by the deployed model
# (query_vector_builder), then the 10 nearest stored question vectors are
# returned out of 100 candidates considered.
knn = {
    "field": "question_vector.predicted_value",
    "k": 10,
    "num_candidates": 100,
    "query_vector_builder": {
        "text_embedding": {
            "model_id": "sentence-transformers__all-minilm-l12-v2",
            "model_text": query,
        }
    },
}

# Only the two text fields are printed, so do not pull the stored embedding
# vectors back in every hit's _source.
resp = es.search(
    index="nlp_pqa_1000_embeddings",
    knn=knn,
    source_includes=["question_text", "answer"],
)

for hit in resp['hits']['hits']:
    score = hit['_score']
    question = hit['_source']['question_text']
    answer = hit['_source']['answer']
    print(f"Score: {score}\nQuestion: {question}\nAnswer: {answer}\n")

Enter a question :cisco 7492


Score: 0.844122
Question: does this work with cisco ip phone 7942
Answer: Use the Plantronics compatibility guide to see what is compatible with your phone. http://www.plantronics.com/us/compatibility-guide/

Score: 0.84125596
Question: Is this compatible with the cisco ip 7941 phone?
Answer: This headset is compatible with the Cisco 7941 phone with the additional QD Modular plug cord.

Score: 0.82627714
Question: Is this compatible with the cisco ip phone 7970 / 7961 models?
Answer: Don’t know. Call Plantronics

Score: 0.8072114
Question: will this work with cisco 7942 phone? what cable i need to buy??
Answer: I would call Headsets.com in San Francisco.   They are will know.  I buy my other supplies from them  800-432-3738

Score: 0.7867383
Question: Do I need some sort of adapter to use this with a Cisco IP 7945 phone?
Answer: Hi Carter, yes you will need the U10 adapter, part number 26716-01. Give us a call at 1-800-683-5715 for any further questions.


In [None]:
# KNN with Filter

query = input ("Enter a question :")
print('\n')

# Same vector search as the simple KNN cell, restricted by a filter to
# documents whose answer does NOT match "plantronics".
knn = {
    "field": "question_vector.predicted_value",
    "k": 10,
    "num_candidates": 100,
    "query_vector_builder": {
        "text_embedding": {
            "model_id": "sentence-transformers__all-minilm-l12-v2",
            "model_text": query,
        }
    },
    "filter": {
        "bool": {
            "must_not": [
                {"match": {"answer": "plantronics"}}
            ]
        }
    },
}

# Only the two text fields are printed, so do not pull the stored embedding
# vectors back in every hit's _source.
resp = es.search(
    index="nlp_pqa_1000_embeddings",
    knn=knn,
    source_includes=["question_text", "answer"],
)

for hit in resp['hits']['hits']:
    score = hit['_score']
    question = hit['_source']['question_text']
    answer = hit['_source']['answer']
    print(f"Score: {score}\nQuestion: {question}\nAnswer: {answer}\n")

Enter a question :cisco 7942


Score: 0.86436224
Question: Is this compatible with the cisco ip 7941 phone?
Answer: This headset is compatible with the Cisco 7941 phone with the additional QD Modular plug cord.

Score: 0.82503355
Question: will this work with cisco 7942 phone? what cable i need to buy??
Answer: I would call Headsets.com in San Francisco.   They are will know.  I buy my other supplies from them  800-432-3738

Score: 0.7967855
Question: Do I need some sort of adapter to use this with a Cisco IP 7945 phone?
Answer: Hi Carter, yes you will need the U10 adapter, part number 26716-01. Give us a call at 1-800-683-5715 for any further questions.

Score: 0.7394031
Question: Will this work with the Cisco CP-8811 model & i would need to buy the adapter also, correct?
Answer: Yes, it works but you need the Quick Disconnect cord, Part Number: 26716-01

Score: 0.71503574
Question: I would like to use this headset with a cisco CP-6921 phone. which adapter do i need?
Answer: Hi there,

In [None]:
# Hybrid search rrf
query = input ("Enter a question :")

print('\n')

# Lexical leg: answers that match "polycom".
lexical_query = {
    "bool": {
        "must": [
            {"match": {"answer": "polycom"}}
        ]
    }
}

# Vector leg: nearest stored question vectors to the embedded query text.
knn = {
    "field": "question_vector.predicted_value",
    "k": 10,
    "num_candidates": 100,
    "query_vector_builder": {
        "text_embedding": {
            "model_id": "sentence-transformers__all-minilm-l12-v2",
            "model_text": query,
        }
    },
}

# Reciprocal rank fusion combines the two result lists into one ranking.
rank = {
    "rrf": {
        "window_size": 50,
        "rank_constant": 20,
    }
}

# Pass the request parts as keyword arguments: the `body=` form triggers a
# DeprecationWarning in the 8.x client. Only the printed text fields are
# requested from _source.
resp = es.search(
    index="nlp_pqa_1000_embeddings",
    query=lexical_query,
    knn=knn,
    rank=rank,
    source_includes=["question_text", "answer"],
)

for hit in resp['hits']['hits']:
    rank_position = hit['_rank']
    question = hit['_source']['question_text']
    answer = hit['_source']['answer']
    print(f"\nRank: {rank_position}\nQuestion: {question}\nAnswer: {answer}\n")

Enter a question :cisco 7492



Rank: 1
Question: does this work with cisco ip phone 7942
Answer: Use the Plantronics compatibility guide to see what is compatible with your phone. http://www.plantronics.com/us/compatibility-guide/


Rank: 2
Question: Will it work with a polycom vvx 250?
Answer: Yes,it will  work with a polycom vvx 250.


Rank: 3
Question: Is this compatible with the cisco ip 7941 phone?
Answer: This headset is compatible with the Cisco 7941 phone with the additional QD Modular plug cord.


Rank: 4
Question: Will this work with polycom vvx 400 and 410?
Answer: Yes,it work with polycom vvx 400 and 410.


Rank: 5
Question: Is this compatible with the cisco ip phone 7970 / 7961 models?
Answer: Don’t know. Call Plantronics


Rank: 6
Question: Will this work for a polycom vvx phone? also, is it necessary to purchase that extra adapter if i will use only with ploycom?
Answer: Yes, but you need a cable in between the headset and the Polycom. Plantronics Part Number: 27190-01


  resp = es.search(index="nlp_pqa_1000_embeddings", body=body)
