In [9]:
from opensearchpy import OpenSearch

# Initialize the OpenSearch client for a local Docker setup.
# Only host and port are given; no authentication or TLS options are passed.
client = OpenSearch(
    hosts=[{'host': 'localhost', 'port': 9200}],
)

# List all indices. The wildcard is passed by keyword so the call does not
# depend on the positional order of get_alias()'s `index` / `name` parameters.
indices = client.indices.get_alias(index="*")

# Alternatively, if you don't use aliases, you can use the following line to list all indices
# indices = client.indices.get(index="*")

# Print the names of all indices (the response is iterated by its keys)
for index in indices:
    print(index)


.opensearch-observability
.plugins-ml-config
.opensearch-sap-log-types-config


In [12]:
# UnstructuredFileLoader is the base class of CategoryContentFetcher; defaultdict is used for grouping.

from langchain.document_loaders import UnstructuredFileLoader
from collections import defaultdict

class CategoryContentFetcher(UnstructuredFileLoader):
    """
    Loads a file through UnstructuredFileLoader and exposes its documents by category.

    The file is loaded once, in the constructor. The resulting documents are kept in
    ``self.documents`` and a per-category tally is kept in ``self.category_counts``.
    Categories are read from each document's ``metadata['category']``.
    """

    def __init__(self, file_path, mode="elements", strategy="hi_res"):
        """
        Initializes the CategoryContentFetcher with file path, mode, and strategy.

        :param file_path: Path to the file to be processed.
        :param mode: Loading mode, passed to UnstructuredFileLoader.
        :param strategy: Strategy for loading, passed to UnstructuredFileLoader.
        """
        super().__init__(file_path, mode=mode, strategy=strategy)
        # Loading happens eagerly so that every other method works on the
        # same, already-parsed list of documents.
        self.documents = self.load()
        self.category_counts = self._calculate_category_counts()

    def _calculate_category_counts(self):
        """
        Calculates the counts of each category found in the documents.

        Documents whose metadata has no (or an empty) 'category' are not counted.

        :return: A dictionary with categories as keys and their counts as values.
        """
        category_counts = {}
        for doc in self.documents:
            category = doc.metadata.get('category')
            if category:
                category_counts[category] = category_counts.get(category, 0) + 1
        return category_counts

    def fetch_content_by_category(self, category="Table"):
        """
        Fetches the documents that have a specified category in their metadata.

        :param category: Category to filter the documents by. Default is "Table".
        :return: List of (page_content, metadata) tuples, one per matching document,
                 in the order the documents were loaded.
        """
        return [
            (doc.page_content, doc.metadata)
            for doc in self.documents
            if doc.metadata.get('category') == category
        ]

    def get_category_counts(self):
        """
        Returns the calculated counts of each category.

        :return: A dictionary with categories as keys and their counts as values.
        """
        return self.category_counts

    @staticmethod
    def _page_sort_key(item):
        """
        Sort key for a (content, metadata) tuple: order by metadata['page_number'].

        Entries without a page number (missing key or None) sort after all
        numbered entries instead of raising; their relative order is preserved
        because list.sort() is stable.
        """
        page_number = item[1].get('page_number')
        if page_number is None:
            return (1, 0)
        return (0, page_number)

    def enhance_table_content(self, documents):
        """
        Groups documents by parent_id, orders each group by page_number, and merges
        the content of every group into a single string.

        :param documents: Iterable of (page_content, metadata) tuples, e.g. the result
                          of fetch_content_by_category().
        :return: A dictionary mapping each parent_id to the space-joined content of its
                 documents. Documents without a parent_id are all merged under the key
                 'no_parent'.
        """
        # Group the documents by their parent_id
        organized_docs = defaultdict(list)
        for content, metadata in documents:
            parent_id = metadata.get('parent_id', 'no_parent')  # Handle documents without a parent_id
            organized_docs[parent_id].append((content, metadata))

        # Sort each group by page_number, then merge its contents
        merged_contents = {}
        for parent_id, docs in organized_docs.items():
            docs.sort(key=self._page_sort_key)
            # Simple string concatenation to merge document content
            merged_contents[parent_id] = " ".join(content for content, _ in docs)

        # Return the merged contents organized by parent_id
        return merged_contents


In [13]:
# Build the fetcher used by the following cells; the constructor loads stuff.pdf.
fetcher = CategoryContentFetcher("stuff.pdf")

The PDF <_io.BufferedReader name='stuff.pdf'> contains a metadata field indicating that it should not allow text extraction. Ignoring this field and proceeding. Use the check_extractable if you want to raise an error in this case
The PDF <_io.BufferedReader name='stuff.pdf'> contains a metadata field indicating that it should not allow text extraction. Ignoring this field and proceeding. Use the check_extractable if you want to raise an error in this case


In [20]:
# Show how many of the loaded documents fall into each metadata category.
fetcher.get_category_counts()

{'Image': 4,
 'Header': 124,
 'Title': 366,
 'UncategorizedText': 146,
 'Table': 49,
 'NarrativeText': 840,
 'ListItem': 113,
 'Footer': 39}

In [18]:
# Inspect the (page_content, metadata) pairs of the documents categorized as "Image".
fetcher.fetch_content_by_category(category="Image")

[('Enablon, a subsidiary of Wolters Kluwer  Enablon Hosted Solution System ',
  {'source': 'stuff.pdf',
   'coordinates': {'points': ((188.83333333333334, 687.1666666666666),
     (188.83333333333334, 917.1666666666666),
     (1550.0, 917.1666666666666),
     (1550.0, 687.1666666666666)),
    'system': 'PixelSpace',
    'layout_width': 1700,
    'layout_height': 2200},
   'filename': 'stuff.pdf',
   'last_modified': '2024-02-24T16:11:51',
   'filetype': 'application/pdf',
   'page_number': 1,
   'category': 'Image'}),
 ('System and Organization Controls Report  Report on Controls Placed in Operation and Tests of Operating Effectiveness Relevant to Security  For the Period  May 1, 2022, to April 30, 2023 ',
  {'source': 'stuff.pdf',
   'coordinates': {'points': ((177.83333333333331, 1184.6666666666667),
     (177.83333333333331, 1622.9999999999998),
     (1538.8333333333333, 1622.9999999999998),
     (1538.8333333333333, 1184.6666666666667)),
    'system': 'PixelSpace',
    'layout_widt

In [None]:
# Example usage of CategoryContentFetcher (the class must already be defined in the kernel)

# Initialize CategoryContentFetcher for the PDF; the constructor loads the
# document and computes the per-category counts
fetcher = CategoryContentFetcher("stuff.pdf")

# Filter the loaded documents by a specific category with
# fetch_content_by_category; each result is a (page_content, metadata) tuple
category = "Table"  # Example category
filtered_documents = fetcher.fetch_content_by_category(category)

# Display filtered documents
print(f"Documents filtered by category '{category}':")
for content, metadata in filtered_documents:
    print(f"Content: {content}\nMetadata: {metadata}\n")

# To get and display the count of documents in each category,
# use the get_category_counts method
category_counts = fetcher.get_category_counts()
print("Counts of each category:")
for cat, count in category_counts.items():
    print(f"{cat}: {count}")


In [10]:
import hashlib
import os
import json
from langchain.document_loaders.csv_loader import CSVLoader
from langchain.document_loaders import (
    UnstructuredExcelLoader,
    PyPDFLoader,
    UnstructuredWordDocumentLoader,
    UnstructuredPowerPointLoader,
    UnstructuredFileLoader,
)
from langchain.text_splitter import RecursiveCharacterTextSplitter
import pathlib
from fastapi import HTTPException
from typing import Optional, List

def calculate_md5_hash(filename, chunk_size: int = 8192) -> str:
    """
    Compute the MD5 digest of a file, reading it in chunks.

    :param filename: Path of the file to hash; it is opened in binary mode.
    :param chunk_size: Number of bytes requested per read. Must not be 0: a
        zero-byte read returns no data, which would end the loop immediately
        and yield the digest of empty input regardless of the file's content.
    :return: The hexadecimal MD5 digest of the file's bytes.
    :raises ValueError: If chunk_size is 0.
    :raises OSError: If the file cannot be opened or read (propagated unchanged).
    """
    if chunk_size == 0:
        raise ValueError("chunk_size must not be 0")

    md5_hash = hashlib.md5()
    with open(filename, "rb") as f:
        # read() returns b"" at end of file, which stops the loop
        while chunk := f.read(chunk_size):
            md5_hash.update(chunk)

    return md5_hash.hexdigest()