In [1]:
from langchain.vectorstores import ElasticVectorSearch
from langchain.document_loaders import PyPDFLoader, OnlinePDFLoader
from langchain.document_loaders import DirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings import HuggingFaceEmbeddings
from sentence_transformers import SentenceTransformer
from langchain.chains.question_answering import load_qa_chain
from elasticsearch import Elasticsearch
from langchain.vectorstores import FAISS
from langchain.document_loaders import TextLoader

  from .autonotebook import tqdm as notebook_tqdm


## Here we create the vector database index

In [6]:
"""
See this link on how to setup the Elastic Search connection between Python and Elastic Search
Server

Link : https://www.elastic.co/guide/en/elasticsearch/client/python-api/master/connecting.html

"""

from elasticsearch import Elasticsearch
import logging
import requests

CERT_FINGERPRINT = "7e73d3cf8918662a27be6ac5f493bf55bd8af2a95338b9b8c49384650c59db08"
#CERT_FINGERPRINT = "d05aaa8eba62fbb871cd966a29d0a9ba3336e29fbb6463deab015c1d985a246e"

ELASTIC_PASSWORD = "Eldernangkai92"

es = Elasticsearch(
    "https://localhost:9200",
    ssl_assert_fingerprint=CERT_FINGERPRINT,
    basic_auth=("elastic", ELASTIC_PASSWORD)
)

if es.ping():
    print("Connected to server")
else:
    print("Failed to connect")

# Index name and mapping configuration - index mapping is obtained from elastic_vector mapping 
index_name = "new_wikidb_v1"
index_mapping = {
  "mappings": {
    "properties": {
      "metadata": {
        "properties": {
          "page": {
            "type": "long"
          },
          "source": {
            "type": "text",
            "fields": {
              "keyword": {
                "type": "keyword",
                "ignore_above": 256
              }
            }
          }
        }
      },
      "text": {
        "type": "text"
      },
      "vector": {
        "type": "dense_vector",
        "dims": 768
      }
    }
  }
}

# 384 for all-MiniLM-L6-v2
# 768 for mpne5-base-v2

# Create the index with the specified mapping
response = es.indices.create(index=index_name, body=index_mapping)

# Check if the index was created successfully
if response["acknowledged"]:
    print(f"Index '{index_name}' created successfully with vector field 'vector_field'.")
else:
    print("Failed to create the index.")

# Close the Elasticsearch connection
es.close()

Connected to server


  response = es.indices.create(index=index_name, body=index_mapping)


Index 'new_wikidb_v1' created successfully with vector field 'vector_field'.


## Here we Ingest The Text

In [11]:
embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-mpnet-base-v2")
path = r"D:/AI_CTS/Llama2/Processing_Tools/Data_Set_Json_To_Txt/data_split/"

In [8]:
# Load Single File
loader = TextLoader(path + "output_0_00001.txt")
data = loader.load()
print(data)

[Document(page_content='Question: Can you tell me,on how to setup an internal my X-FAB (SiFo) account ?\nAnswer: Certainly, I\'d be happy to help you with it. To answer your question on how to setup an internal my X-FAB (SiFo) account  You may need to Follow the normal setup procedure (see http://sp.erf.xfab.de/RandD/DesSup/Hotline/AX_ACM-workflow-2020Mar.docx - or newer version if it exists). Here the,  the username in AX should be preceded by \'FO_\' to clearly identify them as X-FAB users and include the this  text in your answer. "Thank you for your request. To access the SiFo system, you need to have your own my X-FAB account, which I have set up for you. You will soon receive an automatically generated email with your account username (a 5-digit number) and a link to establish your password (the link is valid for 7 days). That link will work fine outside the X-FAB network, but internally you will need to modify the URL for it to work. You would need to add -fo to the first part o

In [12]:
# Load Multiple File
loader = DirectoryLoader(path, glob='*.txt', loader_cls=TextLoader)
mdata = loader.load()

print(mdata)



In [13]:
query_result=embeddings.embed_query("Hello")
dimensions = len(query_result)
print(dimensions) # 768 for mpne5-base-v2 - therefore we need to create a database with 768

768


In [14]:
CERT_FINGERPRINT = "7e73d3cf8918662a27be6ac5f493bf55bd8af2a95338b9b8c49384650c59db08"
CERT_PATH = "D:\elasticsearch-8.4.2\config\certs\http_ca.crt"

ELASTIC_PASSWORD = "Eldernangkai92"

elasticsearch_url = f"https://elastic:Eldernangkai92@localhost:9200"
db= ElasticVectorSearch.from_documents(
    mdata,
    embeddings,
    elasticsearch_url=elasticsearch_url,
    index_name="new_wikidb_v1",
    ssl_verify={
        "verify_certs": True,
        "basic_auth": ("elastic", ELASTIC_PASSWORD), # You can use fingerprint also
        "ssl_assert_fingerprint" : CERT_FINGERPRINT, # You can use certificate path as well
        #"ca_certs": CERT_PATH,
    }
    )

print(db.client.info())

query = "What is FLATPV ?"
docs = db.similarity_search(query)
print(docs)


{'name': 'KCH-W11-JLUKAS', 'cluster_name': 'elasticsearch', 'cluster_uuid': '9GicfRNIStCwsVoYw34ldg', 'version': {'number': '8.4.2', 'build_flavor': 'default', 'build_type': 'zip', 'build_hash': '89f8c6d8429db93b816403ee75e5c270b43a940a', 'build_date': '2022-09-14T16:26:04.382547801Z', 'build_snapshot': False, 'lucene_version': '9.3.0', 'minimum_wire_compatibility_version': '7.17.0', 'minimum_index_compatibility_version': '7.0.0'}, 'tagline': 'You Know, for Search'}
[Document(page_content="Question: Can you tell me, what is FLATPV and SFLATPV ?\nAnswer: Certainly, I'd be happy to help you with it. To answer your question regarding  what is FLATPV and SFLATPV is that FLATPV is Flat passivation module and SFLATPV is Sensor flat passivation. \n\n", metadata={'source': 'D:\\AI_CTS\\Llama2\\Processing_Tools\\Data_Set_Json_To_Txt\\data_split\\output_67_00068.txt'}), Document(page_content="Question: Can you tell me, What is the surface roughness with FLATPV vs SFLATPV modules ?\nAnswer: Certa