In [1]:
%%bash

pip install haystack-ai
pip install datasets

Collecting datasets
  Obtaining dependency information for datasets from https://files.pythonhosted.org/packages/09/7e/fd4d6441a541dba61d0acb3c1fd5df53214c2e9033854e837a99dd9e0793/datasets-2.14.5-py3-none-any.whl.metadata
  Using cached datasets-2.14.5-py3-none-any.whl.metadata (19 kB)
Collecting pyarrow>=8.0.0 (from datasets)
  Obtaining dependency information for pyarrow>=8.0.0 from https://files.pythonhosted.org/packages/77/0d/3a698f5fee20e6086017ae8a0fe8eac40eebceb7dc66e96993b10503ad58/pyarrow-13.0.0-cp310-cp310-macosx_11_0_arm64.whl.metadata
  Using cached pyarrow-13.0.0-cp310-cp310-macosx_11_0_arm64.whl.metadata (3.0 kB)
Collecting dill<0.3.8,>=0.3.0 (from datasets)
  Obtaining dependency information for dill<0.3.8,>=0.3.0 from https://files.pythonhosted.org/packages/f5/3a/74a29b11cf2cdfcd6ba89c0cecd70b37cd1ba7b77978ce611eb7a146a832/dill-0.3.7-py3-none-any.whl.metadata
  Using cached dill-0.3.7-py3-none-any.whl.metadata (9.9 kB)
Collecting xxhash (from datasets)
  Obtaining depen

In [2]:
import os
from getpass import getpass

# Prefer a key exported in the environment; fall back to an interactive prompt
# only when the variable is unset or empty, so the key is never hard-coded here.
openai_api_key = os.environ.get("OPENAI_API_KEY")
if not openai_api_key:
    openai_api_key = getpass("Enter OpenAI API key:")

In [7]:
from haystack.preview import Document

doc_dir = "news"

# Sort the listing so documents are indexed in the same order on every run
# (os.listdir returns entries in arbitrary order), and skip anything that is
# not a regular, non-hidden file (sub-directories, .DS_Store, ...).
files_to_index = sorted(
    os.path.join(doc_dir, entry)
    for entry in os.listdir(doc_dir)
    if not entry.startswith(".") and os.path.isfile(os.path.join(doc_dir, entry))
)

documents = []
for file_path in files_to_index:
    with open(file_path, encoding="utf-8") as file:
        text = file.read()
    # Name each document after its file without the extension, whatever the
    # extension's length (slicing off a fixed 4 characters only works for ".txt").
    name = os.path.splitext(os.path.basename(file_path))[0]
    documents.append(Document(text=text, metadata={"name": name}))

news/ke-huy-quan-time100-impact-awards-part1.txt
news/sustainability-starbucks-cup-overhaul-part2.txt
news/cycle-syncing-womens-heath-part3.txt
news/white-women-perfection-anti-racism-part2.txt
news/hawaii-wildfire-relief-government-shutdown-part1.txt
news/white-women-perfection-anti-racism-part3.txt
news/cycle-syncing-womens-heath-part2.txt
news/sustainability-starbucks-cup-overhaul-part3.txt
news/the-lesson-for-employers-at-the-center-of-hollywoods-ai-standoff-part1.txt
news/the-lesson-for-employers-at-the-center-of-hollywoods-ai-standoff-part3.txt
news/ke-huy-quan-time100-impact-awards-part2.txt
news/sustainability-starbucks-cup-overhaul-part1.txt
news/white-women-perfection-anti-racism-part1.txt
news/hawaii-wildfire-relief-government-shutdown-part2.txt
news/hawaii-wildfire-relief-government-shutdown-part3.txt
news/cycle-syncing-womens-heath-part1.txt
news/the-lesson-for-employers-at-the-center-of-hollywoods-ai-standoff-part2.txt
news/ke-huy-quan-time100-impact-awards-part3.txt
news

In [9]:
from haystack.preview.document_stores import MemoryDocumentStore

# In-memory store for the loaded news documents, configured to use cosine
# similarity for embedding comparisons.
# NOTE(review): the pipeline below retrieves with BM25, so the similarity
# setting is presumably not exercised in this notebook — TODO confirm.
document_store = MemoryDocumentStore(embedding_similarity_function="cosine")
# Index every document loaded from the news directory.
document_store.write_documents(documents)

  from .autonotebook import tqdm as notebook_tqdm


In [18]:
# Jinja2 template rendered by PromptBuilder with `documents` and `question`.
# A trailing backslash joins a line with the next one, so the instructions reach
# the model as a single paragraph; nothing (not even a space) may follow the
# backslash, otherwise the continuation is lost and a stray backslash is sent.
# Paragraph breaks are real blank lines: Jinja2 leaves a single-brace
# placeholder such as {new_line} untouched, so it would reach the model verbatim.
haystack_prompt = """
You are a helpful expert. \
Your answers are short and to the point. \
You must only use information from the given documents. \
Use an unbiased and journalistic tone. \
Do not repeat text. \
Always use references in the form [Source: NAME OF DOCUMENT] when using information from a document. e.g. [Source: 3], for Document[3]. \
The reference must only refer to the name that comes in square brackets after passage. \
Otherwise, do not use brackets in your answer and reference ONLY the name of the passage without mentioning the word passage. \
If the documents can't answer the question or you are unsure say: 'The answer cannot be found in the text directly, but I would recommend consulting [Source: X] for more information', pointing the user to the most relevant document.

These are the documents:

{% for doc in documents %}
[Source: {{ doc.metadata.name }}]:
{{ doc.text }}
{% endfor %}

Question: {{question}}

Answer:
"""

In [66]:
from haystack.preview import component
from typing import List, Dict, Any

@component
class OpenAICostCalculator:
    """Estimate the cost of OpenAI completions from their usage metadata.

    Each metadata entry is priced from its prompt and completion token counts
    using a hard-coded per-token price table selected by model name.
    """

    # (model-name fragment, price per prompt token, price per completion token).
    # Entries are checked in order and the first fragment contained in the model
    # name wins. Prices are a hard-coded snapshot (presumably USD) — verify
    # against the provider's current price list before relying on the totals.
    _PRICES_PER_TOKEN = (
        ("gpt-4", 0.00003, 0.00006),
        ("gpt-3.5-turbo", 0.0000015, 0.000002),
        ("davinci-003", 0.00002, 0.00002),
    )

    @component.output_types(total_cost=List[float])
    def run(self, metadata: List[Dict[str, Any]]):
        """Compute one cost per metadata entry.

        :param metadata: One dict per reply; reads ``model`` and
            ``usage["prompt_tokens"]`` / ``usage["completion_tokens"]``.
            Missing fields are treated as an unknown model / zero tokens.
        :returns: ``{"total_cost": [...]}`` with one float per entry, in input
            order. Entries whose model matches no known fragment cost ``0.0``.
        """
        total_costs = []

        for data in metadata:
            model = data.get("model") or ""
            usage = data.get("usage") or {}
            prompt_tokens = usage.get("prompt_tokens", 0)
            completion_tokens = usage.get("completion_tokens", 0)

            cost = 0.0
            for fragment, prompt_price, completion_price in self._PRICES_PER_TOKEN:
                if fragment in model:
                    cost = (prompt_tokens * prompt_price) + (completion_tokens * completion_price)
                    break
            total_costs.append(cost)

        return {"total_cost": total_costs}

In [67]:
from haystack.preview import Pipeline
from haystack.preview.components.retrievers import MemoryBM25Retriever
from haystack.preview.components.generators.openai.gpt4 import GPT4Generator
from haystack.preview.components.builders.answer_builder import AnswerBuilder
from haystack.preview.components.builders.prompt_builder import PromptBuilder

pipeline = Pipeline()

# Register the components under the names used by the connections below.
# NOTE(review): PromptInspector is neither imported nor defined in any cell
# visible in this notebook — presumably it came from a cell that was later
# removed; confirm it exists before a fresh Restart & Run All.
for component_name, instance in [
    ("retriever", MemoryBM25Retriever(document_store=document_store)),
    ("prompt_builder", PromptBuilder(template=haystack_prompt)),
    ("prompt_inspector", PromptInspector()),
    ("llm", GPT4Generator(api_key=openai_api_key)),
    ("cost_calculator", OpenAICostCalculator()),
    ("answer_builder", AnswerBuilder()),
]:
    pipeline.add_component(instance=instance, name=component_name)

# Wire the graph: retrieved documents feed the prompt and the answer builder;
# the rendered prompt goes to the LLM and the inspector; the LLM's replies and
# metadata feed the answer builder, and its metadata also feeds the cost calculator.
for sender, receiver in [
    ("retriever", "prompt_builder.documents"),
    ("prompt_builder", "llm"),
    ("prompt_builder", "prompt_inspector"),
    ("llm.replies", "answer_builder.replies"),
    ("llm.metadata", "answer_builder.metadata"),
    ("llm.metadata", "cost_calculator.metadata"),
    ("retriever", "answer_builder.documents"),
]:
    pipeline.connect(sender, receiver)

![](./hackathon-demo.png)

In [68]:
question = "What is the latest climate news?"

# The same question is handed to every component that needs it, each under
# that component's own input name.
results = pipeline.run(
    dict(
        retriever=dict(query=question),
        prompt_builder=dict(question=question),
        answer_builder=dict(query=question),
    )
)

Ranking by BM25...: 100%|██████████| 185/185 [00:00<00:00, 7828.67 docs/s]


In [69]:
# Display the answers produced by the "answer_builder" component for the question above.
results["answer_builder"]["answers"]



In [70]:
# Display the cost calculator's output: a dict whose "total_cost" entry holds
# one estimated cost per LLM metadata entry.
results["cost_calculator"]

{'total_cost': [0.19638]}