<a href="https://colab.research.google.com/github/bvader/elasticsearch-test-simple-vector/blob/main/elasticsearch-test-simple-vector.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Initial Setup

In [None]:
!pip install elasticsearch


In [None]:
# Prompt for the connection and auth details and keep them in environment variables.
# Note: the Elasticsearch endpoint MUST include the port!
import getpass
import os

# (environment variable, prompt shown to the user) -- asked in this order
for _env_name, _prompt in (
    ('es_url', 'Enter Elasticsearch Endpoint:  '),
    ('es_user', 'Enter User:  '),
    ('es_pwd', 'Enter Password:  '),
):
    # getpass hides what is typed, so none of the values are echoed into the output
    os.environ[_env_name] = getpass.getpass(_prompt)

In [48]:
# Build the client from the stored connection details and check that the cluster answers
from elasticsearch import Elasticsearch

# Pull the three settings captured by the previous cell back out of the environment
es_url, es_user, es_pwd = (
    os.environ[setting] for setting in ('es_url', 'es_user', 'es_pwd')
)

# Initialize the Elasticsearch client (basic auth, 30 second request timeout)
es = Elasticsearch(hosts=[es_url], basic_auth=(es_user, es_pwd), request_timeout=30)

# Last expression of the cell: the cluster info body is rendered as the cell output
es.info().body

{'name': 'instance-0000000001',
 'cluster_name': '44feffcc909849f295ae3ed4a9be10c1',
 'cluster_uuid': 'Fm7mp0U6S0y1ASR9Gjvh3w',
 'version': {'number': '8.9.0',
  'build_flavor': 'default',
  'build_type': 'docker',
  'build_hash': '8aa461beb06aa0417a231c345a1b8c38fb498a0d',
  'build_date': '2023-07-19T14:43:58.555259655Z',
  'build_snapshot': False,
  'lucene_version': '9.7.0',
  'minimum_wire_compatibility_version': '7.17.0',
  'minimum_index_compatibility_version': '7.0.0'},
 'tagline': 'You Know, for Search'}

In [None]:
# Upload the file first: it must already be present at the path below.
# `head` then prints the first lines of the file as a quick sanity check.
!head /content/sample_data/amazon_pqa_headset.json

# Data and Model Setup

In [None]:
# Load the data into the dataframe. 1000 rows for test
import sys
import datetime
import json
import os
import time

import pandas as pd
import numpy as np

from elasticsearch import Elasticsearch
from elasticsearch.helpers import bulk
from datetime import datetime

# Collect the rows in a plain list and build the DataFrame once at the end:
# enlarging a DataFrame with `.loc` one row at a time copies data on every insert.
MAX_ROWS = 1000  # size of the test sample
qa_records = []

with open('/content/sample_data/amazon_pqa_headset.json', encoding='utf-8') as f:
    for line in f:
        if not line.strip():
            continue  # tolerate blank lines in the JSON-lines file
        data = json.loads(line)
        answers = data.get('answers')
        if not answers:
            continue  # a question without any answer has no answers[0] to read
        # Keep the question text and the text of its first answer only
        qa_records.append((data['question_text'], answers[0]['answer_text']))
        if len(qa_records) == MAX_ROWS:
            break

df = pd.DataFrame(qa_records, columns=['question', 'answer'])


In [None]:
# Install eland
!pip install -q eland elasticsearch elasticsearch_dsl transformers sentence_transformers

from elasticsearch_dsl import Search
from eland.ml.pytorch import PyTorchModel
from eland.ml.pytorch.transformers import TransformerModel
from getpass import getpass
import logging
import tempfile
from pprint import pformat
import secrets

In [None]:
# Logging and constants used while loading the eland model

# Base URL of the Hugging Face model hub
MODEL_HUB_URL = "https://huggingface.co"

# Every log line carries a timestamp, the level and the message
logging.basicConfig(format="%(asctime)s %(levelname)s : %(message)s")

# Logger for this notebook; records below INFO are dropped
logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)

def load_model(model_id, task_type):
  """Import a transformer model into Elasticsearch unless it is already there.

  A ``TransformerModel`` is built for ``model_id`` / ``task_type`` and saved
  into a temporary directory. If the cluster already reports a trained model
  with the resulting Elasticsearch model id (the lookup answers with status
  200), nothing else happens. Otherwise the saved model is imported and its
  deployment is started.

  Args:
    model_id: model identifier passed to ``TransformerModel``.
    task_type: task type passed to ``TransformerModel``.

  Returns:
    None. Progress is reported through ``logger``.

  Note:
    Uses the notebook-level ``es`` client. The deployment is only started
    for a model imported by this call, not for one that already existed.
  """
  with tempfile.TemporaryDirectory() as workdir:
    logger.info(f"Loading HuggingFace transformer tokenizer and model [{model_id}] for task [{task_type}]" )

    transformer = TransformerModel(model_id=model_id, task_type=task_type)
    model_path, config, vocab_path = transformer.save(workdir)

    es_model = PyTorchModel(es, transformer.elasticsearch_model_id())

    # A 404 is tolerated here: it just means the model has not been imported yet
    lookup = es.options(ignore_status=404).ml.get_trained_models(model_id=es_model.model_id)
    if lookup.meta.status == 200:
      logger.info("Model has already been imported")
      return

    logger.info("Importing model")
    es_model.import_model(model_path=model_path, config_path=None, vocab_path=vocab_path, config=config)
    logger.info("Starting model deployment")
    es_model.start()
    logger.info(f"Model successfully imported with id '{es_model.model_id}'")

In [None]:
# Import the MiniLM sentence-transformers model as a text embedding model
load_model(
    model_id="sentence-transformers/all-MiniLM-L12-v2",
    task_type="text_embedding",
)

# Fetch the trained model back from the cluster so we can see how it loaded
trained_model_info = es.ml.get_trained_models(
    model_id="sentence-transformers__all-minilm-l12-v2"
)
trained_model_info.body

# Create Index, Pipeline and Load Index

In [47]:
# Create the plain-text index. A 400 response (such as "index already exists")
# is returned as the cell output instead of being raised.
pqa_text_mappings = {
    "properties": {
        "question": {"type": "text"},
        "answer": {"type": "text"},
    }
}

es.options(ignore_status=400).indices.create(
    index="nlp_pqa_1000",
    settings={"number_of_shards": 1},
    mappings=pqa_text_mappings,
)

ObjectApiResponse({'error': {'root_cause': [{'type': 'resource_already_exists_exception', 'reason': 'index [nlp_pqa_1000/Bac8Vx1WTsKaGGR4-klrTg] already exists', 'index_uuid': 'Bac8Vx1WTsKaGGR4-klrTg', 'index': 'nlp_pqa_1000'}], 'type': 'resource_already_exists_exception', 'reason': 'index [nlp_pqa_1000/Bac8Vx1WTsKaGGR4-klrTg] already exists', 'index_uuid': 'Bac8Vx1WTsKaGGR4-klrTg', 'index': 'nlp_pqa_1000'}, 'status': 400})

In [None]:
# Create Pipeline

# Inference step: feed `question_text` to the model as `text_field` and
# write the result under `question_vector`.
embedding_processor = {
    "inference": {
        "model_id": "sentence-transformers__all-minilm-l12-v2",
        "field_map": {"question_text": "text_field"},
        "target_field": "question_vector",
    }
}

# Failure handling: reroute the document to a `failed-<index>` index and
# record the failure message on it.
failure_handlers = [
    {
        "set": {
            "description": "Index document to 'failed-<index>'",
            "field": "_index",
            "value": "failed-{{{_index}}}",
        }
    },
    {
        "set": {
            "description": "Set error message",
            "field": "ingest.failure",
            "value": "{{_ingest.on_failure_message}}",
        }
    },
]

es.ingest.put_pipeline(
    id="sentence-text-embedding",
    description="Text embedding pipeline",
    processors=[embedding_processor],
    on_failure=failure_handlers,
)

In [49]:
# Create the index that receives the documents enriched by the embedding pipeline.
# A 400 response (such as "index already exists") is returned as the cell output
# instead of being raised.
es.options(ignore_status=400).indices.create(
    index="nlp_pqa_1000_embeddings",
    settings={"number_of_shards": 1},
    mappings={
        "properties": {
            # The bulk loader, the pipeline's field_map and the queries all use
            # `question_text`, so that is the field mapped here (the previous
            # `question` entry was never populated in this index).
            "question_text": {"type": "text"},
            "answer": {"type": "text"},
            # Vector written by the inference processor; indexed for kNN search
            "question_vector.predicted_value": {
                "type": "dense_vector",
                "dims": 384,
                "index": True,  # boolean, not the string "true"
                "similarity": "cosine",
            },
        }
    },
)

ObjectApiResponse({'acknowledged': True, 'shards_acknowledged': True, 'index': 'nlp_pqa_1000_embeddings'})

In [50]:
# Load data through the model using a pipeline
def generator():
    """Yield one bulk action per row of ``df``.

    Each action targets the ``nlp_pqa_1000_embeddings`` index, names the
    ``sentence-text-embedding`` ingest pipeline, and carries the row's
    question (as ``question_text``) and answer.
    """
    target_index = "nlp_pqa_1000_embeddings"
    ingest_pipeline = "sentence-text-embedding"
    for _, qa_row in df.iterrows():
        action = {"_index": target_index, "pipeline": ingest_pipeline}
        action["question_text"] = qa_row["question"]
        action["answer"] = qa_row["answer"]
        yield action

# Send every action in bulk; a failure is printed rather than raised so the
# notebook keeps running.
try:
    res = bulk(es, generator())
except Exception as bulk_error:
    print(bulk_error)
else:
    # `res` is whatever bulk() returned
    print("Response: ", res)


Response:  (1000, [])


In [None]:
# Start the model deployment unless the model is already deployed, so this cell
# can be re-run without Elasticsearch rejecting a second start request.
deployment_model_id = "sentence-transformers__all-minilm-l12-v2"
stats_resp = es.ml.get_trained_models_stats(model_id=deployment_model_id)

# `deployment_stats` only appears in the stats of a model that is deployed
already_deployed = any(
    "deployment_stats" in model_stats
    for model_stats in stats_resp["trained_model_stats"]
)

if already_deployed:
    print(f"Model '{deployment_model_id}' is already deployed")
else:
    print(es.ml.start_trained_model_deployment(model_id=deployment_model_id))

# Queries

In [None]:
# Simple KNN: embed the typed question with the deployed model and return the
# 10 nearest stored questions.
query = input ("Enter a question :")
print('\n')

knn = {
    "field": "question_vector.predicted_value",
    "k": 10,
    "num_candidates": 100,
    # Let Elasticsearch build the query vector from the text with the same model
    "query_vector_builder": {
        "text_embedding": {
            "model_id": "sentence-transformers__all-minilm-l12-v2",
            "model_text": query
        }
    }
}

# Ask only for the fields that are printed, so the embedding the ingest
# pipeline writes under `question_vector` is not sent back with every hit.
resp = es.search(
    index="nlp_pqa_1000_embeddings",
    knn=knn,
    source_includes=["question_text", "answer"],
)

for hit in resp['hits']['hits']:
    score = hit['_score']
    question = hit['_source']['question_text']
    answer = hit['_source']['answer']
    print(f"Score: {score}\nQuestion: {question}\nAnswer: {answer}\n")

In [None]:
# KNN with Filter: same nearest-neighbour search, but documents whose answer
# matches "plantronics" are excluded.

query = input ("Enter a question :")
print('\n')

knn = {
    "field": "question_vector.predicted_value",
    "k": 10,
    "num_candidates": 100,
    # Let Elasticsearch build the query vector from the text with the same model
    "query_vector_builder": {
        "text_embedding": {
            "model_id": "sentence-transformers__all-minilm-l12-v2",
            "model_text": query
        }
    },
    "filter": {
        "bool": {
            "must_not": [
                {
                    "match": {
                        "answer": "plantronics"
                    }
                }
            ]
        }
    }
}

# Ask only for the fields that are printed, so the embedding the ingest
# pipeline writes under `question_vector` is not sent back with every hit.
resp = es.search(
    index="nlp_pqa_1000_embeddings",
    knn=knn,
    source_includes=["question_text", "answer"],
)

for hit in resp['hits']['hits']:
    score = hit['_score']
    question = hit['_source']['question_text']
    answer = hit['_source']['answer']
    print(f"Score: {score}\nQuestion: {question}\nAnswer: {answer}\n")

In [None]:
# Hybrid search rrf: a text match on the answer and a kNN search on the
# question embedding, combined with reciprocal rank fusion.
query = input ("Enter a question :")

print('\n')
body = {
    # Ask only for the fields that are printed, so the embedding the ingest
    # pipeline writes under `question_vector` is not sent back with every hit.
    "_source": ["question_text", "answer"],
    "query": {
        "bool": {
            "must": [
                {
                    "match": {
                        "answer": "polycom"
                    }
                }
            ]
        }
    },
    "knn": {
        "field": "question_vector.predicted_value",
        "k": 10,
        "num_candidates": 100,
        # Let Elasticsearch build the query vector from the text with the same model
        "query_vector_builder": {
            "text_embedding": {
                "model_id": "sentence-transformers__all-minilm-l12-v2",
                "model_text": query
            }
        }
    },
    "rank": {
        "rrf": {
            "window_size": 50,
            "rank_constant": 20
        }
    }
}
resp = es.search(index="nlp_pqa_1000_embeddings", body=body)

for hit in resp['hits']['hits']:
    # With rrf ranking each hit carries its position in `_rank`
    rank = hit['_rank']
    question = hit['_source']['question_text']
    answer = hit['_source']['answer']
    print(f"\nRank: {rank}\nQuestion: {question}\nAnswer: {answer}\n")