<a href="https://colab.research.google.com/github/bvader/elasticsearch-test-elser/blob/main/elasticsearch_test_elser.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Initial Setup

In [None]:
!pip install elasticsearch

In [None]:
# Ask for the connection and auth details and keep them in environment variables.
# Note the port is REQUIRED for the elasticsearch endpoint!
# All three values are read with getpass, so nothing typed is echoed back.
import getpass, os

for env_key, prompt in (
    ('es_url', 'Enter Elasticsearch Endpoint:  '),
    ('es_user', 'Enter User:  '),
    ('es_pwd', 'Enter Password:  '),
):
    os.environ[env_key] = getpass.getpass(prompt)

In [None]:
# Build the client from the stored settings and check that the cluster answers
from elasticsearch import Elasticsearch

# Pull the three values captured by the previous cell back out of the environment
es_url, es_user, es_pwd = (
    os.environ[key] for key in ('es_url', 'es_user', 'es_pwd')
)

# Shared client used by every later cell: basic auth, 30 second request timeout
es = Elasticsearch(
    hosts=[es_url],
    basic_auth=(es_user, es_pwd),
    request_timeout=30,
)

# Last expression of the cell: shows the cluster info returned by the server
es.info().body

# Data and Model Setup

In [None]:
# Dataset references (Amazon PQA):
# See https://registry.opendata.aws/amazon-pqa/
# See https://amazon-pqa.s3.amazonaws.com/readme.txt
# List the bucket contents with:
# aws s3 ls --no-sign-request s3://amazon-pqa/
# Archive download (presumably the source of the file used below - TODO confirm):
# https://amazon-pqa.s3.amazonaws.com/amazon-pqa.tar.gz

# Upload the file first: amazon_pqa_headset.json must already be in /content/sample_data/.
# The command below prints the first lines of the file as a quick check that it is there.
!head /content/sample_data/amazon_pqa_headset.json

In [None]:
# Load Data file (Just load 1000 for now)

import sys
import datetime
import json
import os
import time

import pandas as pd
import numpy as np

from ast import literal_eval
from tqdm import tqdm

from elasticsearch import Elasticsearch
from elasticsearch.helpers import bulk
from datetime import datetime


# Read up to MAX_RECORDS question/answer pairs from the data file.
# Every non-blank line is parsed as one JSON object; its question text and the text of
# its first answer become one row of `df` (columns: question, answer).
DATA_FILE = '/content/sample_data/amazon_pqa_headset.json'
MAX_RECORDS = 1000  # Just load 1000 for now

records = []
with open(DATA_FILE, encoding='utf-8') as f:
    for line in f:
        if not line.strip():
            continue  # a blank line would make json.loads raise
        data = json.loads(line)
        answers = data.get('answers') or []
        if not answers:
            continue  # a question without any answer has nothing to index
        records.append((data['question_text'], answers[0]['answer_text']))
        if len(records) == MAX_RECORDS:
            break

# Build the frame in one go; inserting rows one by one through df.loc grows the
# frame on every iteration and gets slower as it fills up.
df = pd.DataFrame(records, columns=['question', 'answer'])

print(df.columns)
print(df.shape)
if df.empty:
    print(f"No question/answer pairs were loaded from {DATA_FILE}")
else:
    # Last loaded pair (this is row 999 when the full 1000 rows were read)
    print(df.iloc[-1]['question'])
    print(df.iloc[-1]['answer'])

# Create the Indices and Pipeline, and Load the Data

In [None]:
# Create the index that holds the raw question/answer text.
# A 400 response is ignored (presumably so the cell can be re-run once the index exists).
raw_index_settings = {"number_of_shards": 1}
raw_index_mappings = {
    "properties": {
        field: {"type": "text"} for field in ("question", "answer")
    }
}

es.options(ignore_status=400).indices.create(
    index="nlp_pqa_1000",
    settings=raw_index_settings,
    mappings=raw_index_mappings,
)

In [None]:
# Generate Action
from elasticsearch.helpers import BulkIndexError


def generator():
    """Yield one bulk index action per row of ``df`` for the ``nlp_pqa_1000`` index.

    The row label is sent as the document ``_id`` so that running this cell again
    overwrites the same documents instead of adding a second copy of each one.
    """
    for index, row in df.iterrows():
        yield {
            "_index": "nlp_pqa_1000",
            "_id": str(index),
            "question": row["question"],
            "answer": row["answer"]
        }


# Bulk indexing nlp_pqa_1000
try:
    res = bulk(es, generator())
    print("Response: ", res)
except BulkIndexError as e:
    # The message only carries the number of failed documents (8.x client);
    # the reason for each failure is in e.errors, so show the first few as well.
    print(e)
    for failure in e.errors[:5]:
        print(failure)
except Exception as e:
    # Any other problem (connection, auth, ...) is reported without stopping the notebook
    print(e)


In [None]:
# Download / Load ELSER
import time

ELSER_MODEL_ID = ".elser_model_2_linux-x86_64"
ELSER_DOWNLOAD_TIMEOUT_S = 600  # give up waiting for the download after 10 minutes
ELSER_POLL_INTERVAL_S = 5

# Register the model; `field_names` is documented as a list of field names.
put_model_resp = es.ml.put_trained_model(
    model_id=ELSER_MODEL_ID,
    input={"field_names": ["text_field"]},
)
print(put_model_resp)

# The call above returns before the model definition has been fully fetched.
# Poll until the cluster reports the model as fully defined, so that starting the
# deployment in the next cell does not race the download when running all cells.
deadline = time.monotonic() + ELSER_DOWNLOAD_TIMEOUT_S
while True:
    status = es.ml.get_trained_models(
        model_id=ELSER_MODEL_ID, include="definition_status"
    )
    if status["trained_model_configs"][0]["fully_defined"]:
        print("ELSER model is downloaded and ready to be deployed.")
        break
    if time.monotonic() > deadline:
        raise TimeoutError(
            f"Model {ELSER_MODEL_ID} was not fully defined after "
            f"{ELSER_DOWNLOAD_TIMEOUT_S} seconds"
        )
    time.sleep(ELSER_POLL_INTERVAL_S)

In [None]:
# Start the ELSER model deployment
elser_deployment_args = {"model_id": ".elser_model_2_linux-x86_64"}
es.ml.start_trained_model_deployment(**elser_deployment_args)

In [None]:
# Create Elser Pipeline

# Inference processor: runs the ELSER model on "answer" and writes the result
# to "content_embedding".
elser_inference = {
    "inference": {
        "model_id": ".elser_model_2_linux-x86_64",
        # This is the new 8.11+ syntax
        "input_output": [
            {"input_field": "answer", "output_field": "content_embedding"}
        ],
        "inference_config": {"text_expansion": {}},
    }
}

# Pipeline-level failure handlers: first change the target index to
# failed-<original index>, then store the failure message on the document.
reroute_to_failed_index = {
    "set": {
        "description": "Index document to 'failed-<index>'",
        "field": "_index",
        "value": "failed-{{{_index}}}",
    }
}
record_failure_message = {
    "set": {
        "description": "Set error message",
        "field": "ingest.failure",
        "value": "{{_ingest.on_failure_message}}",
    }
}

es.ingest.put_pipeline(
    id="elser-expansion",
    description="Elser pipeline",
    processors=[elser_inference],
    on_failure=[reroute_to_failed_index, record_failure_message],
)

In [None]:
# Create the index that stores the ELSER expansion next to the original text fields.
# A 400 response is ignored (presumably so the cell can be re-run once the index exists).
elser_index_definition = dict(
    settings={"number_of_shards": 1, "index.mapping.total_fields.limit": 2000},
    mappings={
        "properties": {
            "content_embedding": {"type": "sparse_vector"},
            "question": {"type": "text"},
            "answer": {"type": "text"},
        }
    },
)

es.options(ignore_status=400).indices.create(
    index="nlp_pqa_1000_elser_embeddings",
    **elser_index_definition,
)

In [None]:
# Load data with term expansion
from elasticsearch.helpers import BulkIndexError


def generator():
    """Yield one bulk index action per row of ``df`` for the ELSER index.

    Each action names the ``elser-expansion`` ingest pipeline, so the document is
    passed through that pipeline while it is indexed. The row label is sent as the
    document ``_id`` so that running this cell again overwrites the same documents
    instead of adding a second copy of each one.
    """
    for index, row in df.iterrows():
        yield {
            "_index": "nlp_pqa_1000_elser_embeddings",
            "_id": str(index),
            "pipeline": "elser-expansion",
            "question": row["question"],
            "answer": row["answer"]
        }


try:
    res = bulk(es, generator(), chunk_size=100) # batch size 100
    print("Response: ", res)
except BulkIndexError as e:
    # The message only carries the number of failed documents (8.x client);
    # the reason for each failure is in e.errors, so show the first few as well.
    print(e)
    for failure in e.errors[:5]:
        print(failure)
except Exception as e:
    # Any other problem (connection, timeout, ...) is reported without stopping the notebook
    print(e)


# Queries

In [None]:
# Simple Text Expansion

query_text = input("Enter a question :")
print('\n')

# text_expansion query on the content_embedding field, using the ELSER model
# with the text typed above
expansion_params = {
    "model_id": ".elser_model_2_linux-x86_64",
    "model_text": query_text,
}
query = {"text_expansion": {"content_embedding": expansion_params}}

resp = es.search(index="nlp_pqa_1000_elser_embeddings", query=query)

# Print the question/answer pair of every returned hit
for hit in resp['hits']['hits']:
    doc_id, score = hit['_id'], hit['_score']
    source = hit['_source']
    question, answer = source['question'], source['answer']
    print(f"Question: {question}\nAnswer: {answer}\n")

In [None]:
# Text expansion with filter, exclude results
# search for "Does this work with xbox"

query_text = input("Enter a question :")
print('\n')

# Required clause: text_expansion on content_embedding with the text typed above
expansion_clause = {
    "text_expansion": {
        "content_embedding": {
            "model_id": ".elser_model_2_linux-x86_64",
            "model_text": query_text,
        }
    }
}

# Excluded clauses: drop hits whose question contains one of these phrases
excluded_phrases = ("xbox one", "xbox 1")
exclusion_clauses = [
    {"match_phrase": {"question": phrase}} for phrase in excluded_phrases
]

query = {"bool": {"must": [expansion_clause], "must_not": exclusion_clauses}}

resp = es.search(index="nlp_pqa_1000_elser_embeddings", query=query)

# Print the question/answer pair of every returned hit
for hit in resp['hits']['hits']:
    doc_id, score = hit['_id'], hit['_score']
    source = hit['_source']
    question, answer = source['question'], source['answer']
    print(f"Question: {question}\nAnswer: {answer}\n")

In [None]:
# Text Expansion with filter... only include certain results
# search for "Does this work with xbox"
query_text = input("Enter a question :")
print('\n')

# The two conditions must live under different keys of the bool query: a Python dict
# literal with two "must" keys keeps only the last one, which would silently drop the
# text_expansion clause and ignore the question typed above.
query = {
    "bool": {
        # Scoring clause: text_expansion on content_embedding with the typed question
        "must": [
            {
                "text_expansion": {
                    "content_embedding": {
                        "model_id": ".elser_model_2_linux-x86_64",
                        "model_text": query_text
                    }
                }
            }
        ],
        # Restricting clause: only keep hits whose question matches "xbox 360"
        "filter": [
            {
                "match": {
                    "question": "xbox 360"
                }
            }
        ]
    }
}

resp = es.search(index="nlp_pqa_1000_elser_embeddings", query=query)

# Raw response, printed before the formatted hits
print(f"{resp}\n")

# Print the question/answer pair of every returned hit
for hit in resp['hits']['hits']:
    question = hit['_source']['question']
    answer = hit['_source']['answer']
    print(f"Question: {question}\nAnswer: {answer}\n")



In [None]:
# Hybrid search with text expansion and rrf
# NOTE : This works with a slightly lower level of the API
# The higher level / abstracted  API is still under development
query_text = input("Enter a question :")

print('\n')

# First sub search: match query on the answer text (fixed term)
lexical_search = {
    "query": {"bool": {"must": [{"match": {"answer": "polycom"}}]}}
}

# Second sub search: text_expansion on content_embedding with the typed question
expansion_search = {
    "query": {
        "text_expansion": {
            "content_embedding": {
                "model_id": ".elser_model_2_linux-x86_64",
                "model_text": query_text,
            }
        }
    }
}

# Both sub searches are sent together with an rrf rank section
body = {
    "sub_searches": [lexical_search, expansion_search],
    "rank": {"rrf": {"window_size": 50, "rank_constant": 20}},
}

index = "nlp_pqa_1000_elser_embeddings"
json_headers = {"content-type": "application/json", "accept": "application/json"}
resp = es.perform_request(
    "POST",
    f"/{index}/_search",
    headers=json_headers,
    body=body,
)

print(f"\n Resp:{resp} \n")

# Print rank, question and answer of every returned hit
for hit in resp['hits']['hits']:
    doc_id, rank = hit['_id'], hit['_rank']
    source = hit['_source']
    question, answer = source['question'], source['answer']
    print(f"\nRank: {rank}\nQuestion: {question}\nAnswer: {answer}\n")