In [10]:
from pathlib import Path
import os
from haystack import Pipeline
from haystack.components.embedders import SentenceTransformersDocumentEmbedder,SentenceTransformersTextEmbedder
from haystack.components.converters import PyPDFToDocument, TextFileToDocument
from haystack.components.preprocessors import DocumentCleaner, DocumentSplitter
from haystack.components.routers import FileTypeRouter
from haystack.components.joiners import DocumentJoiner
from haystack.components.writers import DocumentWriter
from haystack_integrations.document_stores.chroma import ChromaDocumentStore
from haystack_integrations.components.retrievers.chroma import ChromaQueryTextRetriever

# Everything is resolved relative to the current working directory.
HERE = Path.cwd()
print(HERE)

# Input documents for the indexing pipeline. Only regular files are kept
# (os.listdir-style listings also return sub-directories), and the paths are
# sorted so the order does not depend on the OS's directory-listing order.
data_path = HERE / "data"
file_paths = sorted(str(entry) for entry in data_path.iterdir() if entry.is_file())

# Document store that the Writer component fills; the query cells reuse it.
chroma_store = ChromaDocumentStore()

# Indexing pipeline: route by MIME type -> convert -> join -> clean -> split
# -> embed -> write.
pipeline = Pipeline()

# (name, instance) pairs, registered in this order.
indexing_components = [
    ("FileTypeRouter", FileTypeRouter(mime_types=["text/plain", "application/pdf"])),
    ("TextFileConverter", TextFileToDocument()),
    ("PdfFileConverter", PyPDFToDocument()),
    ("Joiner", DocumentJoiner()),
    ("Cleaner", DocumentCleaner()),
    ("Splitter", DocumentSplitter(split_by="sentence", split_length=250, split_overlap=30)),
    ("Embedder", SentenceTransformersDocumentEmbedder(model="sentence-transformers/all-MiniLM-L6-v2")),
    ("Writer", DocumentWriter(document_store=chroma_store)),
]
for component_name, component in indexing_components:
    pipeline.add_component(component_name, component)

# (sender, receiver) pairs; both converters feed the same Joiner input.
indexing_connections = [
    ("FileTypeRouter.text/plain", "TextFileConverter.sources"),
    ("FileTypeRouter.application/pdf", "PdfFileConverter.sources"),
    ("TextFileConverter.documents", "Joiner.documents"),
    ("PdfFileConverter.documents", "Joiner.documents"),
    ("Joiner.documents", "Cleaner.documents"),
    ("Cleaner.documents", "Splitter.documents"),
    ("Splitter.documents", "Embedder.documents"),
    ("Embedder.documents", "Writer.documents"),
]
for sender, receiver in indexing_connections:
    pipeline.connect(sender, receiver)

# Index every collected file.
pipeline.run({"FileTypeRouter": {"sources": file_paths}})

# Retrieval check: ask the Chroma store for the top 3 matches for a name.
querying = Pipeline()
querying.add_component("retriever", ChromaQueryTextRetriever(document_store=chroma_store))
retriever_inputs = {"query": "Vishwam", "top_k": 3}
results = querying.run({"retriever": retriever_inputs})
print(results)


/unity/f2/asugandhi/Downloads/LLM_Playground


Batches: 100%|██████████| 1/1 [00:00<00:00, 11.66it/s]
Add of existing embedding ID: 11f7061bb8c56ae79965f1ba0d1a0283188dc031309394e1a03470d5d72207a9
Insert of existing embedding ID: 11f7061bb8c56ae79965f1ba0d1a0283188dc031309394e1a03470d5d72207a9
Add of existing embedding ID: ce02ebe3fa97972f0c76b2c175f658873b2d0e19987e9cbc38dcacb25b8ebdba
Insert of existing embedding ID: ce02ebe3fa97972f0c76b2c175f658873b2d0e19987e9cbc38dcacb25b8ebdba
Add of existing embedding ID: b9679ae3e33c58d9299d929f03d3b6f868d81dcd0fb7197d59e38c1962a4f92d
Insert of existing embedding ID: b9679ae3e33c58d9299d929f03d3b6f868d81dcd0fb7197d59e38c1962a4f92d
Add of existing embedding ID: 7e1267428582f9f3323ba16dc7fe4db1771b409537ae46b94bdeb4005888b9a5
Insert of existing embedding ID: 7e1267428582f9f3323ba16dc7fe4db1771b409537ae46b94bdeb4005888b9a5
Add of existing embedding ID: 2492025f08f3623107a8b4c3503bff07c8dde09d7bdbc74f6af12edc28d59cdb
Insert of existing embedding ID: 2492025f08f3623107a8b4c3503bff07c8dde09d7bdbc

{'retriever': {'documents': [Document(id=b9679ae3e33c58d9299d929f03d3b6f868d81dcd0fb7197d59e38c1962a4f92d, content: 'Vishwam Shah is a highly motivated and skilled Computer Science professional currently pursuing a Ma...', meta: {'file_path': '/unity/f2/asugandhi/Downloads/LLM_Playground/data/mf.txt', 'source_id': '6d425f2fa8ce25e5d4b7890423744220600079b727b22e39b514f70d4660eab5'}, score: 1.0066450834274292, embedding: vector of size 384), Document(id=2ab2eea350a81830ea14589c92e831aaaed042fc64d0361100d48c38400ad51d, content: 'VISHWAM SHAH
Tallahassee, FL |shahvishwam7@gmail.com |+1 (850) 666 - 0095 |https://www.linkedin.com/...', meta: {'file_path': '/unity/f2/asugandhi/Downloads/LLM_Playground/data/Resume_Vishwam_Shah_Back_end.pdf', 'source_id': 'cdc6b0ecf29b6b22099b92f3b240d9ccb8d50e564aeb4c7ffd07e02f35d9b2e4'}, score: 1.562894344329834, embedding: vector of size 384), Document(id=7e1aef98a5cb652d441afa9411a0181a2f6a9ffe054156a322990d9dcab01840, content: ' The
“ECMWF Parameter ID” co

# Information Retriever

In [4]:
# Retrieval-only pipeline over the existing Chroma store: top 3 matches for "Aditya".
querying = Pipeline()
text_retriever = ChromaQueryTextRetriever(document_store=chroma_store)
querying.add_component("retriever", text_retriever)
results = querying.run({"retriever": dict(query="Aditya", top_k=3)})
print(results)


{'retriever': {'documents': [Document(id=ce02ebe3fa97972f0c76b2c175f658873b2d0e19987e9cbc38dcacb25b8ebdba, content: 'Aditya Sugandhi's journey as a Software Engineer is characterized by a deep commitment to excellence...', meta: {'file_path': '/unity/f2/asugandhi/Downloads/LLM_Playground/data/Aditya_train.txt', 'source_id': '228fb178549cb032d67e0b2da301131f48d7c88c814b6d6920c92727b1c8f5fd'}, score: 1.1221085786819458, embedding: vector of size 384), Document(id=11f7061bb8c56ae79965f1ba0d1a0283188dc031309394e1a03470d5d72207a9, content: 'Aditya Sugandhi is a seasoned Software Engineer with a rich background and diverse skill set, encomp...', meta: {'file_path': '/unity/f2/asugandhi/Downloads/LLM_Playground/data/Aditya_test.txt', 'source_id': 'c85a2287836cae980897693decb5e9d07e80f60b7c96b4e542ef3057e11fc228'}, score: 1.2236461639404297, embedding: vector of size 384), Document(id=a6ad41c3febd74d1f6825aac59c2d6dd7589ae8088bb3b449ea239c97d6f1b1c, content: ' . . . . . . . . . . . . . . . . .

In [11]:
from haystack import Pipeline
from haystack.utils import Secret
from haystack_integrations.components.retrievers.chroma import ChromaQueryTextRetriever
from haystack.components.readers import ExtractiveReader
from haystack.components.generators import GPTGenerator
from haystack.components.builders.prompt_builder import PromptBuilder
from haystack.components.generators import OpenAIGenerator

# Prompt for the generator: every retrieved document is rendered into the
# context section, followed by the user's question.
template = """
Answer all the questions in the following format and based on Aditya.
and if the answer is not present in the context, then generate the answer using the LLM model.
Context:
{% for doc in documents %}
  {{ doc.content }}
{% endfor %}
Question: {{question}}
Answer:
"""

prompt_builder = PromptBuilder(template=template)
retriever = ChromaQueryTextRetriever(document_store=chroma_store)

# SECURITY: never embed credentials in the notebook source. The key is read
# from the OPENAI_API_KEY environment variable instead; any key that was ever
# committed in this cell should be revoked.
api_key = Secret.from_env_var("OPENAI_API_KEY")
llm = OpenAIGenerator(model="gpt-3.5-turbo-0125", api_key=api_key)

# ExtractiveReader to extract answers from the relevant context.
# NOTE(review): this reader is constructed but never added to the pipeline
# in this cell - confirm whether it is still needed.
reader = ExtractiveReader(model="deepset/roberta-base-squad2-distilled")

# Question-answering pipeline: retriever -> prompt builder -> LLM.
extractive_qa_pipeline = Pipeline()

for component_name, component in (
    ("retriever", retriever),
    ("prompt_builder", prompt_builder),
    ("llm", llm),
):
    extractive_qa_pipeline.add_component(name=component_name, instance=component)

# Retrieved documents fill the prompt's context; the rendered prompt goes to the LLM.
for sender, receiver in (
    ("retriever.documents", "prompt_builder.documents"),
    ("prompt_builder", "llm"),
):
    extractive_qa_pipeline.connect(sender, receiver)


query = "who is Aditya Dating?"
print(query)

# The same question is sent to the retriever (top_k=1) and to the prompt
# builder, which places it after the retrieved context.
input_data = dict(
    retriever=dict(query=query, top_k=1),
    prompt_builder=dict(question=query),
)

# Execute the pipeline and show the raw result dictionary.
results = extractive_qa_pipeline.run(input_data)
print(results)

2024-02-27 12:26:57.653361: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2024-02-27 12:27:10.699656: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /usr/local/cuda-11.8/lib64:/usr/local/cuda-11.8/lib64
2024-02-27 12:27:10.699819: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /usr/local/cuda-11.8/lib64:/usr/local/cuda-11.8/lib64


who is Aditya Dating?


AuthenticationError: Error code: 401 - {'error': {'message': 'Incorrect API key provided: sk-ypXXD***************************************F7de. You can find your API key at https://platform.openai.com/account/api-keys.', 'type': 'invalid_request_error', 'param': None, 'code': 'invalid_api_key'}}