In [97]:
# Name of the model handed to SentenceTransformer to embed the text chunks.
MODEL_NAME="all-MiniLM-L6-v2"

########################
# Connection settings for the OpenSearch domain this notebook writes to.
# `region` is passed to both request signers.
region = 'us-east-1' 
# Index that is created, filled and queried by the cells of this notebook.
index_name = 'ncert'
# Service name passed to AWS4Auth when signing requests.
service = 'es'
# Domain endpoint host name (no scheme); the client connects to it on port 443.
# NOTE(review): environment-specific endpoint is hardcoded — consider reading it
# from an environment variable or config file.
aos_host = "search-biology-j6cjabt44maa5ju4lbxfca3jmq.us-east-1.es.amazonaws.com"

In [98]:
# Install the OpenSearch Python client into the kernel's environment.
# NOTE(review): the version is not pinned, and the other third-party packages
# imported by this notebook (requests_aws4auth, sentence_transformers, PyPDF2)
# are not installed here — presumably they are already present; confirm.
%pip install opensearch-py

Defaulting to user installation because normal site-packages is not writeable
Note: you may need to restart the kernel to use updated packages.


In [99]:
from opensearchpy import OpenSearch, RequestsHttpConnection, AWSV4SignerAuth
import boto3
from requests_aws4auth import AWS4Auth
from sentence_transformers import SentenceTransformer
import pandas as pd
from azure_openai_helper import generate_answer_from_context

In [100]:
# Resolve AWS credentials from the default boto3 session and build the SigV4
# signer that the OpenSearch client uses as its `http_auth`.
credentials = boto3.Session().get_credentials()
if credentials is None:
    # Without this check the failure only shows up later as an opaque
    # AttributeError when the credentials' attributes are first accessed.
    raise RuntimeError(
        "No AWS credentials found. Configure them (environment variables, "
        "shared credentials file, or an IAM role) before running this notebook."
    )
auth = AWSV4SignerAuth(credentials, region)

In [101]:
# Alternative request signer built from the same credentials, region and service.
# NOTE(review): the OpenSearch client constructed in this notebook is given
# `auth`, not `awsauth`; this object appears unused in the cells shown — confirm
# whether it is still needed.
awsauth = AWS4Auth(credentials.access_key, credentials.secret_key, region, service, session_token=credentials.token)

In [102]:
# OpenSearch client for the domain at `aos_host`: HTTPS on port 443 with
# certificate verification, requests signed through `auth`.
aos_client = OpenSearch(
    hosts=[dict(host=aos_host, port=443)],
    http_auth=auth,
    use_ssl=True,
    verify_certs=True,
    connection_class=RequestsHttpConnection,
)

In [103]:
# Index definition sent to `indices.create`.
#   settings: enables k-NN on the index with the "cosinesimil" space type and
#             makes the default analyzer a standard analyzer with English
#             stop words.
#   mappings: `text_vector` is a stored 384-dimension knn_vector field
#             (NOTE(review): presumably the embedding size of MODEL_NAME —
#             confirm), and `text` is a stored text field with the raw chunk.
knn_index = dict(
    settings={
        "index.knn": True,
        "index.knn.space_type": "cosinesimil",
        "analysis": dict(
            analyzer=dict(
                default=dict(type="standard", stopwords="_english_"),
            ),
        ),
    },
    mappings=dict(
        properties=dict(
            text_vector=dict(type="knn_vector", dimension=384, store=True),
            text=dict(type="text", store=True),
        ),
    ),
)

In [104]:
# Create the index from the `knn_index` definition; the response is the cell output.
# NOTE(review): `ignore=400` presumably keeps a re-run from raising when the
# index already exists, but it would also hide any other HTTP 400 response
# (e.g. an invalid mapping) — confirm that this is intended.
aos_client.indices.create(index=index_name,body=knn_index,ignore=400)

{'acknowledged': True, 'shards_acknowledged': True, 'index': 'ncert'}

In [105]:
# Fetch the index back to inspect the settings and mappings actually applied.
aos_client.indices.get(index=index_name)

{'ncert': {'aliases': {},
  'mappings': {'properties': {'text': {'type': 'text', 'store': True},
    'text_vector': {'type': 'knn_vector', 'store': True, 'dimension': 384}}},
  'settings': {'index': {'number_of_shards': '5',
    'provided_name': 'ncert',
    'knn.space_type': 'cosinesimil',
    'knn': 'true',
    'creation_date': '1691834226591',
    'analysis': {'analyzer': {'default': {'type': 'standard',
       'stopwords': '_english_'}}},
    'number_of_replicas': '1',
    'uuid': 'gRGGI3wpTX2Me2vb6goCEQ',
    'version': {'created': '135217827'}}}}}

In [106]:
# PDF whose text is extracted, chunked and indexed by the cells below.
# NOTE(review): absolute, machine-specific path — the notebook will not run
# elsewhere without editing it; consider a path relative to a configurable
# data directory.
file_path = "C:/Ambarish/NCERT/Chap04AnimalKingdom.pdf"

In [107]:
from PyPDF2 import PdfReader

In [108]:
def get_pdf_data(file_path, num_pages = 1):
    """Return the text of every page of a PDF, concatenated into one string.

    Args:
        file_path: Path (or file object) handed to ``PdfReader``.
        num_pages: Accepted for backward compatibility only. The value has
            never had any effect: every page of the document is always read.

    Returns:
        The extracted text of all pages, in page order, with no separator.
        If extraction fails part-way through, the text collected so far is
        returned and a message describing the error is printed (best effort).

    Raises:
        Whatever ``PdfReader(file_path)`` raises when the file cannot be opened.
    """
    reader = PdfReader(file_path)
    page_texts = []

    try:
        for current_page in reader.pages:
            # Guard against a falsy result (e.g. a page with no extractable
            # text) so a single page cannot abort the whole document.
            page_texts.append(current_page.extract_text() or "")
    except Exception as exc:
        # Only ordinary errors are handled here; KeyboardInterrupt and
        # SystemExit propagate instead of being silently swallowed.
        print("Error reading file:", exc)

    return "".join(page_texts)

In [109]:
def get_chunks(fulltext: str, chunk_length=500) -> list:
    """Split text into consecutive chunks of at most ``chunk_length`` characters.

    Each chunk is cut at the last period found within the next
    ``chunk_length`` characters; that period is consumed and does not appear
    in either chunk. When the window contains no period, the text is cut at
    exactly ``chunk_length`` characters and no character is dropped.

    Args:
        fulltext: The text to split.
        chunk_length: Maximum number of characters per chunk; must be positive.

    Returns:
        A list of chunk strings. The remainder (possibly an empty string) is
        always appended as the final element, so the list is never empty.

    Raises:
        ValueError: If ``chunk_length`` is not positive.
    """
    if chunk_length <= 0:
        raise ValueError("chunk_length must be a positive integer")

    chunks = []
    text = fulltext
    while len(text) > chunk_length:
        last_period_index = text[:chunk_length].rfind('.')
        if last_period_index == -1:
            # No sentence boundary in the window: hard split. Resume exactly
            # at chunk_length so the character at that position is kept.
            chunks.append(text[:chunk_length])
            text = text[chunk_length:]
        else:
            # Cut at the period and skip over it.
            chunks.append(text[:last_period_index])
            text = text[last_period_index + 1:]
    chunks.append(text)
    return chunks

In [110]:
# Extract the text of the PDF at `file_path` as a single string.
fullText = get_pdf_data(file_path)

In [111]:
# Split the document into chunks with a chunk length of 500; each chunk is
# later embedded and indexed as one document.
Lines =get_chunks(fullText,500)

In [112]:
# Number of chunks produced (displayed as the cell output).
len(Lines)

73

In [113]:
# Load the sentence-transformers model named by MODEL_NAME.
model = SentenceTransformer(MODEL_NAME)

In [114]:
# Embed every chunk in one call. The result is requested as a tensor, so each
# row is converted with .tolist() before it is sent to OpenSearch.
embeddings_all = model.encode(Lines, convert_to_tensor=True)

In [115]:
# Number of embeddings; used as the loop bound when indexing.
# Presumably equal to len(Lines), one embedding per chunk — confirm.
embeddings_length=len(embeddings_all)

In [116]:
# Index one document per chunk: the raw chunk text together with its embedding
# converted to a plain Python list. One request is sent per chunk and no
# document id is supplied.
for i in range(embeddings_length):
    text = Lines[i]
    text_vector = embeddings_all[i].tolist()
    document = {"text_vector": text_vector, "text": text}
    aos_client.index(index=index_name, body=document)

In [117]:
# Sanity check: run a match_all query and report the total hit count.
match_all_query = {"query": {"match_all": {}}}
res = aos_client.search(index=index_name, body=match_all_query)
print("Records found: %d." % res['hits']['total']['value'])

Records found: 73.
