In [21]:
from langchain.document_loaders import DirectoryLoader

# Folder (relative path) that holds the source documents to be loaded and indexed.
directory = "text_input_data"

def load_docs(directory):
    """Load the documents that ``DirectoryLoader`` finds under ``directory``.

    Args:
        directory: Path of the folder to read documents from.

    Returns:
        The result of ``DirectoryLoader(directory).load()``.
    """
    return DirectoryLoader(directory).load()

documents = load_docs(directory)

# Fail fast: if nothing was loaded from `directory`, the splitting, embedding and
# indexing cells further down would otherwise run on an empty list.
assert documents, f"No documents were loaded from {directory!r}"

len(documents)

13

In [22]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

def split_docs(documents, chunk_size=2000, chunk_overlap=20):
    """Split ``documents`` into chunks with ``RecursiveCharacterTextSplitter``.

    Args:
        documents: Documents to split.
        chunk_size: Forwarded to the splitter as ``chunk_size``.
        chunk_overlap: Forwarded to the splitter as ``chunk_overlap``.

    Returns:
        The result of the splitter's ``split_documents(documents)`` call.
    """
    # Keep these as keywords: the splitter is configured by name, not position.
    splitter_options = {"chunk_size": chunk_size, "chunk_overlap": chunk_overlap}
    return RecursiveCharacterTextSplitter(**splitter_options).split_documents(documents)

docs = split_docs(documents)

# Bare last expression instead of print(): the chunk count is shown as the cell's
# output, the same way the document count is displayed after loading.
len(docs)

2560


In [23]:
from langchain.embeddings import SentenceTransformerEmbeddings
# Name of the model handed to SentenceTransformerEmbeddings; the resulting
# `embeddings` object is what the vector stores below use to embed text.
EMBEDDING_MODEL_NAME = "all-MiniLM-L6-v2"
embeddings = SentenceTransformerEmbeddings(model_name=EMBEDDING_MODEL_NAME)

In [24]:
from langchain.vectorstores import Chroma
# Embed all chunks and index them in a Chroma store (no persist_directory is passed here).
db = Chroma.from_documents(documents=docs, embedding=embeddings)

In [30]:
query = "How to high availability in Azure services?"

# Ask the store for the two chunks it ranks as most similar to the query.
matching_docs = db.similarity_search(query, k=2)

# Display the text of the first returned chunk.
matching_docs[0].page_content

"perspective, you need to do whatever you can to make sure that your services have optimal uptime and performance. A popular and effective method for enhancing availability and performance is load balancing. Load balancing is a method of distributing network traffic across servers that are part of a service. For example, if you have front-end web servers as part of your service, you can use load balancing to distribute the traffic across your multiple frontend web servers. This distribution of traffic increases availability because if one of the web servers becomes unavailable, the load balancer stops sending traffic to that server and redirects it to the servers that are still online. Load balancing also helps performance, because the processor, network, and memory overhead for serving requests is distributed across all the load-balanced servers. We recommend that you employ load balancing whenever you can, and as appropriate for your services. Following are scenarios at both the Azur

In [10]:
from pathlib import Path

persist_directory = "ChromaDB"

# Chroma.from_documents() adds to whatever collection already lives in
# persist_directory, so re-running this cell used to re-embed every chunk and
# store it a second (third, ...) time, which shows up as duplicate search hits.
# Build the store only when nothing has been persisted yet; otherwise reopen it.
# To rebuild from scratch (e.g. after the source documents change), delete the
# persist_directory folder and run this cell again.
_store_path = Path(persist_directory)
if _store_path.is_dir() and any(_store_path.iterdir()):
    vectordb = Chroma(
        persist_directory=persist_directory, embedding_function=embeddings
    )
else:
    vectordb = Chroma.from_documents(
        documents=docs, embedding=embeddings, persist_directory=persist_directory
    )
    # Flush the freshly built collection to disk.
    vectordb.persist()

In [13]:
# Reopen the store saved under "ChromaDB", passing the same `embeddings` object
# that was used when the store was built.
new_db = Chroma(
    persist_directory="ChromaDB",
    embedding_function=embeddings,
)

In [20]:
query = "list all the resilient services in Aws"

# Each result here is a (Document, score) tuple rather than a bare Document.
# NOTE(review): the score is presumably a distance (lower = closer) - confirm
# against the Chroma configuration in use.
matching_docs = new_db.similarity_search_with_score(query, k=4)

# Display the first (document, score) pair.
matching_docs[0]

(Document(page_content='Resources\n\nRelated best practices:\n\nREL03-BP01 Choose how to segment your workload (p. 272)\n\nREL10-BP01 Deploy the workload to multiple locations (p. 328)\n\nREL11-BP01 Monitor all components of the workload to detect failures (p. 339)\n\nREL11-BP03 Automate healing on all layers (p. 342)\n\nREL12-BP05 Test resiliency using chaos engineering (p. 353)\n\nREL13-BP01 Define recovery objectives for downtime and data loss (p. 361)\n\nUnderstanding workload health\n\nRelated documents:\n\nAvailability with redundancy\n\nReliability pillar - Availability\n\nMeasuring availability\n\nAWS Fault Isolation Boundaries\n\nShared Responsibility Model for Resiliency\n\nStatic stability using Availability Zones\n\nAWS Service Level Agreements (SLAs)\n\nGuidance for Cell-based Architecture on AWS\n\nAWS infrastructure\n\nAdvanced Multi-AZ Resiliance Patterns whitepaper\n\nRelated services:\n\nAmazon CloudWatch\n\nAWS Config\n\nAWS Trusted Advisor\n\n349\n\nAWS Well-Archite