In [51]:
import warnings
from helper import load_env

# Suppress every warning for the rest of the session so the cell outputs
# below stay readable.
warnings.filterwarnings(action="ignore")

# `load_env` is defined in the local `helper` module; presumably it loads the
# API keys used by the Cohere and OpenAI components -- TODO confirm.
load_env()

Import all the required dependencies

In [52]:
from haystack import Pipeline
from haystack.document_stores.in_memory import InMemoryDocumentStore
from haystack.components.converters import PDFMinerToDocument
from haystack.components.preprocessors import DocumentCleaner
from haystack.components.preprocessors import DocumentSplitter
from haystack.components.writers import DocumentWriter
from haystack_integrations.components.embedders.cohere.document_embedder import CohereDocumentEmbedder
from haystack_integrations.components.embedders.cohere.text_embedder import CohereTextEmbedder
from haystack.components.retrievers.in_memory import InMemoryEmbeddingRetriever
from haystack.components.builders import PromptBuilder
from haystack.components.generators import OpenAIGenerator
import os

Set up the document ingestion pipeline

In [47]:
# In-memory store that compares embeddings with cosine similarity.
document_store = InMemoryDocumentStore(embedding_similarity_function="cosine")

# Ingestion stages, listed in the order documents flow through them:
# PDF -> cleaned text -> 5-sentence chunks -> Cohere embeddings -> store.
ingestion_stages = [
    ("converter", PDFMinerToDocument()),
    ("cleaner", DocumentCleaner()),
    ("splitter", DocumentSplitter(split_by="sentence", split_length=5)),
    ("embedder", CohereDocumentEmbedder(model="embed-english-v3.0")),
    ("writer", DocumentWriter(document_store=document_store)),
]

pipeline = Pipeline()
for stage_name, stage in ingestion_stages:
    pipeline.add_component(stage_name, stage)

# Wire each stage to the one that follows it, forming a linear chain.
for (upstream, _), (downstream, _) in zip(ingestion_stages, ingestion_stages[1:]):
    pipeline.connect(upstream, downstream)

# Left as the last expression so the notebook displays the run summary.
pipeline.run({"converter": {"sources": ["sample-business-plan-2015.pdf"]}})

Calculating embeddings: 100%|██████████| 2/2 [00:01<00:00,  1.53it/s]


 'writer': {'documents_written': 53}}

In [48]:
# Jinja template: every retrieved document's content is listed as context,
# followed by the user's question.
prompt = """
Answer the question based on the provided context.
Context:
{% for doc in documents %}
   {{ doc.content }} 
{% endfor %}
Question: {{ query }}
"""

# The query embedder uses the same model name as the document embedder in the
# ingestion pipeline ("embed-english-v3.0").
query_embedder = CohereTextEmbedder(model="embed-english-v3.0")
# NOTE(review): `retriever` is not registered in `query_pipeline`; a separate
# instance is created in `add_component` below. It is kept because cells
# outside this view may rely on the name -- confirm, then drop one of the two.
retriever = InMemoryEmbeddingRetriever(document_store=document_store)
prompt_builder = PromptBuilder(template=prompt)
generator = OpenAIGenerator()

# Query flow: question -> embedding -> retrieved documents -> prompt -> answer.
query_pipeline = Pipeline()
query_pipeline.add_component("query_embedder", query_embedder)
query_pipeline.add_component("retriever", InMemoryEmbeddingRetriever(document_store=document_store))
query_pipeline.add_component("prompt", prompt_builder)
query_pipeline.add_component("generator", generator)

query_pipeline.connect("query_embedder.embedding", "retriever.query_embedding")
query_pipeline.connect("retriever.documents", "prompt.documents")
query_pipeline.connect("prompt", "generator")


def run_query(question, top_k=1):
    """Run `query_pipeline` for a single question.

    The question text is sent both to the query embedder (to drive retrieval)
    and to the prompt builder (to be rendered into the prompt).

    Args:
        question: The question to ask, as a string.
        top_k: Value passed to the retriever's `top_k` run parameter.
            Defaults to 1, the value the notebook used for every query.

    Returns:
        Whatever `query_pipeline.run` returns for this input.
    """
    return query_pipeline.run(
        {
            "query_embedder": {"text": question},
            "retriever": {"top_k": top_k},
            "prompt": {"query": question},
        }
    )


# Each question asks the model to answer in a specific JSON shape.
location_question = 'What locations will the company operate in? Provide response in a JSON format like, {"locations": [{"City": ""}]}'
industries_question = 'What industries does this company fall into? Provide response in a JSON format like, {"industries": [""]}'
# supply_chain_question = "What kinds of suppliers are mentioned in the document? Provide response in a JSON format like, {\"suppliers\": [\"\"]}"
# nat_resource_question = "What natural resources are mentioned in the document? Provide response in a JSON format like, {\"resources\": [\"\"]}"

industries_result = run_query(industries_question)
location_result = run_query(location_question)


In [49]:
# Show the first generated reply for each question, one per line.
print(
    industries_result["generator"]["replies"][0],
    location_result["generator"]["replies"][0],
    sep="\n",
)

{"industries": ["Bicycle Industry", "Retail Industry", "Repair and Service Industry"]}
{
    "locations": [
        {"City": "Abbotville"}
    ]
}


In [50]:
industry_analysis_pipeline = Pipeline()