https://haystack.deepset.ai/integrations/milvus-document-store
https://haystack.deepset.ai/integrations/ollama

In [None]:
import glob
import os

from haystack import Pipeline
from haystack.components.converters import MarkdownToDocument
from haystack.components.embedders import SentenceTransformersDocumentEmbedder, SentenceTransformersTextEmbedder
from haystack.components.preprocessors import DocumentSplitter
from haystack.components.writers import DocumentWriter

from milvus_haystack import MilvusDocumentStore
from milvus_haystack.milvus_embedding_retriever import MilvusEmbeddingRetriever

In [71]:


document_store = MilvusDocumentStore(
    connection_args={
        "host": "localhost",
        "port": "19530",
        "user": "",
        "password": "",
        "secure": False,
    },
    collection_name="scouting",
    vector_field="embeddings"
)


In [None]:
prompt_template = """You are an scouting assistant in football (soccer). 
                    Answer the following query based on the provided context. If the context does
                     not include an answer, reply with 'I don't know'. \n
                     Query: {{query}}
                     Documents:
                     {% for doc in documents %}
                     Player-ID: {{doc.meta['player_transfermarkt_id']}}, Report-Content: {{ doc.content }} \n###\n 
                     {% endfor %}
                     
                     Give a short answer for every unique player-id from the provided documents and a summary about their reports.
                     Answer: 
                  """

In [None]:
from haystack.utils import Secret
from haystack.components.embedders import SentenceTransformersTextEmbedder
from haystack.components.builders import PromptBuilder
from haystack_integrations.components.generators.ollama import OllamaGenerator
from haystack_integrations.components.embedders.ollama.text_embedder import OllamaTextEmbedder

rag_pipeline = Pipeline()
rag_pipeline.add_component("text_embedder", OllamaTextEmbedder(model="nomic-embed-text"))
rag_pipeline.add_component("retriever", MilvusEmbeddingRetriever(document_store=document_store, top_k=3))
rag_pipeline.add_component("prompt_builder", PromptBuilder(template=prompt_template))
rag_pipeline.add_component("generator", OllamaGenerator(model="mistral", url="http://localhost:11434/api/generate"))
rag_pipeline.connect("text_embedder.embedding", "retriever.query_embedding")
rag_pipeline.connect("retriever.documents", "prompt_builder.documents")
rag_pipeline.connect("prompt_builder", "generator")

question = "I need an attacking winger, showcasing exceptional speed and technique."


results = rag_pipeline.run(
    {
        "text_embedder": {"text": question},
        "prompt_builder": {"query": question},
    }
)
print('RAG answer:', results["generator"]["replies"][0])


In [None]:
# Checking what we get back from queries
from haystack_integrations.components.embedders.ollama.text_embedder import OllamaTextEmbedder
# t=test. Wanted ot make sure i dont accidently use it in the other code
t_query = "Performed as an attacking winger, showcasing exceptional speed and technique."
# used this embedding locally when importing aswell
t_embedder = OllamaTextEmbedder(model="nomic-embed-text")
t_retriever = MilvusEmbeddingRetriever(document_store=document_store, top_k=3)
                                
t_embedding = t_embedder.run(t_query)["embedding"]          
t_retrieved_documents = t_retriever.run(t_embedding)

t_prompt_builder = PromptBuilder(template=prompt_template)
t_prompt = t_prompt_builder.run(template_variables={"documents": t_retrieved_documents['documents'],"query": question})
print(t_prompt)


In [111]:
#Trying to get structured output now
# https://haystack.deepset.ai/tutorials/28_structured_output_with_loop
from pydantic import BaseModel, Field
from typing import List
import json

class PlayerResponse(BaseModel):
    player_id: int = Field(description="ID of the player")
    report_summary: str = Field(name="report_summary",
                                description="Summary of the reports that have the same player id")


# We want to get a list of players
class ListPlayerResponse(BaseModel):
    list: List[PlayerResponse]
    

json_schema = json.dumps(PlayerResponse.model_json_schema())
json_schema


'{"properties": {"player_id": {"description": "ID of the player", "title": "Player Id", "type": "integer"}, "report_summary": {"description": "Summary of the reports that have the same player id", "name": "report_summary", "title": "Report Summary", "type": "string"}}, "required": ["player_id", "report_summary"], "title": "PlayerResponse", "type": "object"}'

In [112]:
import json
import random
import pydantic
from pydantic import ValidationError
from typing import Optional, List
from colorama import Fore
from haystack import component

# Define the component input parameters
@component
class OutputValidator:
    def __init__(self, pydantic_model: pydantic.BaseModel):
        self.pydantic_model = pydantic_model
        self.iteration_counter = 0

    # Define the component output
    @component.output_types(valid_replies=List[str], invalid_replies=Optional[List[str]], error_message=Optional[str])
    def run(self, replies: List[str]):
        print("got to output validation")
        self.iteration_counter += 1

        ## Try to parse the LLM's reply ##
        # If the LLM's reply is a valid object, return `"valid_replies"`
        try:
            output_dict = json.loads(replies[0])
            self.pydantic_model.parse_obj(output_dict)
            print(
                Fore.GREEN
                + f"OutputValidator at Iteration {self.iteration_counter}: Valid JSON from LLM - No need for looping: {replies[0]}"
            )
            return {"valid_replies": replies}

        # If the LLM's reply is corrupted or not valid, return "invalid_replies" and the "error_message" for LLM to try again
        except (ValueError, ValidationError) as e:
            print(
                Fore.RED
                + f"OutputValidator at Iteration {self.iteration_counter}: Invalid JSON from LLM - Let's try again.\n"
                f"Output from LLM:\n {replies[0]} \n"
                f"Error from OutputValidator: {e}"
            )
            return {"invalid_replies": replies, "error_message": str(e)}

Again postfix with _loop as not to infere with any other variables

In [117]:
from haystack.components.builders import PromptBuilder

prompt_template_loop = """
You are a scouting assistant in football (soccer). 
Answer the following query and create a JSON object from the information present in this context: 
Query: 
{{query}}


Context:
{% for doc in documents %}
Player-ID: {{doc.meta['player_transfermarkt_id']}}, Report-Content: {{ doc.content }} \n###\n 
{% endfor %}
                     
Give a short answer for every unique player-id from the provided documents and a summary about their reports..
Only use information that is present in the context. Follow this JSON schema, but only return the actual instances without any additional schema definition:
{{schema}}
Make sure your response is a dict and not a list.
{% if invalid_replies and error_message %}
  You already created the following output in a previous attempt: {{invalid_replies}}
  However, this doesn't comply with the format requirements from above and triggered this Python exception: {{error_message}}
  Correct the output and try again. Just return the corrected output without any extra explanations.
{% endif %}
"""

prompt_builder_loop = PromptBuilder(template=prompt_template_loop)


In [118]:
rag_pipeline_loop = Pipeline(max_loops_allowed=5)
rag_pipeline_loop.add_component("text_embedder", OllamaTextEmbedder(model="nomic-embed-text"))
rag_pipeline_loop.add_component("retriever", MilvusEmbeddingRetriever(document_store=document_store, top_k=3))
rag_pipeline_loop.add_component("prompt_builder", prompt_builder_loop)
rag_pipeline_loop.add_component("llm", OllamaGenerator(model="mistral", url="http://localhost:11434/api/generate"))
output_validator = OutputValidator(pydantic_model=PlayerResponse)
rag_pipeline_loop.add_component(instance=output_validator, name="output_validator")


rag_pipeline_loop.connect("text_embedder.embedding", "retriever.query_embedding")
rag_pipeline_loop.connect("retriever.documents", "prompt_builder.documents")
rag_pipeline_loop.connect("prompt_builder", "llm")
rag_pipeline_loop.connect("llm", "output_validator")

# If a component has more than one output or input, explicitly specify the connections:
rag_pipeline_loop.connect("output_validator.invalid_replies", "prompt_builder.invalid_replies")
rag_pipeline_loop.connect("output_validator.error_message", "prompt_builder.error_message")

<haystack.core.pipeline.pipeline.Pipeline object at 0x7f2e9e8de2c0>
🚅 Components
  - text_embedder: OllamaTextEmbedder
  - retriever: MilvusEmbeddingRetriever
  - prompt_builder: PromptBuilder
  - llm: OllamaGenerator
  - output_validator: OutputValidator
🛤️ Connections
  - text_embedder.embedding -> retriever.query_embedding (List[float])
  - retriever.documents -> prompt_builder.documents (List[Document])
  - prompt_builder.prompt -> llm.prompt (str)
  - llm.replies -> output_validator.replies (List[str])
  - output_validator.invalid_replies -> prompt_builder.invalid_replies (Optional[List[str]])
  - output_validator.error_message -> prompt_builder.error_message (Optional[str])

In [75]:
# save pipeline overview in local directory
rag_pipeline_loop.draw("auto-correct-pipeline.png")

In [119]:
question_loop = "I need an attacking winger, showcasing exceptional speed and technique."

# result = rag_pipeline_loop.run({"prompt_builder": {"context": passage, "schema": json_schema}})
results = rag_pipeline_loop.run(
    {
        "text_embedder": {"text": question_loop},
        "prompt_builder": {"query": question_loop, "schema": json_schema},
    }
)

ReadTimeout: HTTPConnectionPool(host='localhost', port=11434): Read timed out. (read timeout=120)