## Incorporating a custom component into a pipeline

In [1]:
from unstructured_component import UnstructuredParser

from haystack import Pipeline
from haystack.components.embedders import AzureOpenAIDocumentEmbedder
from haystack.components.preprocessors import DocumentCleaner
from haystack.components.preprocessors import DocumentSplitter
from haystack.components.writers import DocumentWriter
from pathlib import Path
from haystack.document_stores.types import DuplicatePolicy
from haystack.utils import Secret
from haystack_integrations.components.converters.unstructured import UnstructuredFileConverter
from haystack.components.fetchers import LinkContentFetcher
from haystack.components.converters import HTMLToDocument
from haystack.document_stores.in_memory import InMemoryDocumentStore 
import os
from dotenv import load_dotenv

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Load the local .env file BEFORE reading any variable from the environment.
# Previously UNSTRUCTURED_API_KEY was read first, so on a fresh kernel (where
# the key is only defined in .env) it came back as None and the cell had to be
# run twice. load_dotenv does not override variables that are already set, so
# values exported in the shell still take precedence.
load_dotenv(".env")

# Key handed to the custom UnstructuredParser component.
unstructured_api_key = os.environ.get("UNSTRUCTURED_API_KEY")

# Other credentials read from the environment; each is None when the
# corresponding variable is not defined.
api_key = os.environ.get("news_api")
open_ai_key = os.environ.get("OPENAI_API_KEY")
unstructured = os.environ.get("UNSTRUCTURED")

# Azure OpenAI settings used to configure the document embedder.
AZURE_OPENAI_KEY = os.getenv('AZURE_OPENAI_API_KEY')
AZURE_OPENAI_ENDPOINT = os.getenv('AZURE_OPENAI_ENDPOINT')
AZURE_OPENAI_SERVICE = os.getenv('AZURE_OPENAI_SERVICE')
AZURE_OPENAI_EMBEDDING_SERVICE = os.getenv('AZURE_OPENAI_EMBEDDING_SERVICE')

In [5]:
# Custom component (imported from unstructured_component): it receives
# `sources` and emits `documents` into the pipeline.
# NOTE(review): the meaning of chunking_strategy / strategy / model is defined
# by UnstructuredParser itself - confirm against that module.
unstructured_parser = UnstructuredParser(
    unstructured_key=unstructured_api_key,
    chunking_strategy="by_page",
    strategy="auto",
    model="yolox",
)

# Alternation of everything the cleaner should strip from document content.
regex_pattern = (
    r'<.*?>'  # HTML tags
    r'|\t'  # Tabs
    r'|\n+'  # Newlines
    r'|&nbsp;'  # Non-breaking spaces
)
document_cleaner = DocumentCleaner(
    remove_empty_lines=True,
    remove_extra_whitespaces=True,
    remove_repeated_substrings=False,
    remove_substrings=None,
    remove_regex=regex_pattern,
)

# Reference the API key by environment-variable name instead of wrapping the
# raw value with Secret.from_token: token secrets cannot be serialized with the
# pipeline, and the env-var form keeps the key value out of the component's
# configuration. The variable is the same one AZURE_OPENAI_KEY is read from.
document_embedder = AzureOpenAIDocumentEmbedder(
    azure_endpoint=AZURE_OPENAI_ENDPOINT,
    api_key=Secret.from_env_var("AZURE_OPENAI_API_KEY"),
    azure_deployment=AZURE_OPENAI_EMBEDDING_SERVICE,
)

# Initialize pipeline
pipeline = Pipeline()
pipeline.add_component("unstructured", unstructured_parser)
pipeline.add_component("cleaner", document_cleaner)
pipeline.add_component("embedder", document_embedder)

# Wire the stages: parser -> cleaner -> embedder.
pipeline.connect("unstructured", "cleaner")
# Kept as the last expression so the notebook displays the pipeline summary.
pipeline.connect("cleaner", "embedder")

<haystack.core.pipeline.pipeline.Pipeline object at 0x7f82b927f2e0>
🚅 Components
  - unstructured: UnstructuredParser
  - cleaner: DocumentCleaner
  - embedder: AzureOpenAIDocumentEmbedder
🛤️ Connections
  - unstructured.documents -> cleaner.documents (List[Document])
  - cleaner.documents -> embedder.documents (List[Document])

In [9]:
# Source document to index: a single SEC EDGAR filing, addressed by URL.
filing_url = "https://www.sec.gov/Archives/edgar/data/1511699/000114554924032499/0001145549-24-032499.txt"

# Run the pipeline; the inputs are keyed by the name of the receiving component.
result = pipeline.run({"unstructured": {"sources": [filing_url]}})

Download succeeded


Embedding Texts:   0%|          | 0/2 [00:00<?, ?it/s]

INFO: HTTP Request: POST https://bytewax-workshop.openai.azure.com//openai/deployments/bytewax-workshop-ada/embeddings?api-version=2023-05-15 "HTTP/1.1 200 OK"


Embedding Texts:  50%|█████     | 1/2 [00:00<00:00,  2.08it/s]

INFO: HTTP Request: POST https://bytewax-workshop.openai.azure.com//openai/deployments/bytewax-workshop-ada/embeddings?api-version=2023-05-15 "HTTP/1.1 200 OK"


Embedding Texts: 100%|██████████| 2/2 [00:00<00:00,  2.90it/s]


In [11]:
# Display the documents returned by the final ("embedder") stage of the run.
embedder_output = result["embedder"]
embedder_output["documents"]


[Document(id=765cc448ab0ab1e45c67f28053e7a070484a0cccb718ce4c9708e12f8d2e35e2, content: '0001145549-24-032499.txt : 20240529 0001145549-24-032499.hdr.sgml : 20240529 20240529101511 ACCESSIO...', meta: {'filename': '0001145549-24-032499.txt', 'languages': ['eng'], 'filetype': 'text/plain', 'source_url': 'https://www.sec.gov/Archives/edgar/data/1511699/000114554924032499/0001145549-24-032499.txt', 'symbol': 'PSQGX'}, embedding: vector of size 1536),
 Document(id=68898c29543d7846ef26e0066d105862535af7a85b3b90bf60edc397d26a7199, content: 'ORGANIZATION NAME: IRS NUMBER: 000000000 FILING VALUES: FORM TYPE: NPORT-P SEC ACT: 1940 Act SEC FIL...', meta: {'filename': '0001145549-24-032499.txt', 'languages': ['eng'], 'filetype': 'text/plain', 'source_url': 'https://www.sec.gov/Archives/edgar/data/1511699/000114554924032499/0001145549-24-032499.txt', 'symbol': 'PSQGX'}, embedding: vector of size 1536),
 Document(id=534501e2c53488982d6ab511dfe67d62fdbaa3f315b553311d6114d9ceda0c41, content: 'S000043