Skip to content

Commit

Permalink
feat(rag): add image/video readers (#21)
Browse files Browse the repository at this point in the history
This PR introduces a dedicated readers module and adds new readers for the
mp3, mp4, xlsx, tiff, and ods file formats.

Closes #20
  • Loading branch information
mawandm committed Apr 16, 2024
1 parent 3840d9f commit 4406116
Show file tree
Hide file tree
Showing 29 changed files with 76,647 additions and 2,844 deletions.
6 changes: 3 additions & 3 deletions .github/workflows/package.yml
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@ jobs:
password: ${{ secrets.DOCKER_HUB_PASSWORD }}

- name: Build and push Docker image
uses: docker/build-push-action@v3
uses: docker/build-push-action@v5
with:
context: .
push: true
Expand All @@ -49,7 +49,7 @@ jobs:
password: ${{ secrets.DOCKER_HUB_PASSWORD }}

- name: Build and push frontend Docker image
uses: docker/build-push-action@v3
uses: docker/build-push-action@v5
with:
context: .
push: true
Expand Down Expand Up @@ -77,7 +77,7 @@ jobs:
password: ${{ secrets.DOCKER_HUB_PASSWORD }}

- name: Build and push RAG docker image
uses: docker/build-push-action@v3
uses: docker/build-push-action@v5
with:
context: .
push: true
Expand Down
1 change: 1 addition & 0 deletions .github/workflows/test_rag.yml
Original file line number Diff line number Diff line change
Expand Up @@ -59,6 +59,7 @@ jobs:
- name: Install dependencies
run: |
python -m pip install --upgrade pip
sudo apt install ffmpeg tesseract-ocr -y
pip install -r nesis/rag/requirements.txt -r nesis/rag/requirements-test.txt -r nesis/rag/requirements-huggingface.txt
- name: Run unit tests
env:
Expand Down
2 changes: 1 addition & 1 deletion .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -143,4 +143,4 @@ dist
*.pyc
videos
screenshots
*.mp4
#*.mp4
4 changes: 1 addition & 3 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -14,9 +14,7 @@
---
# 👋 What is Nesis❓

Nesis is an open-source enterprise knowledge discovery solution that connects to multitudes of datasources, collecting


Nesis is an open-source enterprise knowledge discovery solution that connects to multitudes of datasources, collecting
information and making it available in a conversational manner. Nesis leverages generative AI to aggregate document chunks
collected from different documents in multiple formats such as pdf, docx and xlsx, and turn them into meaningful, human-readable compositions. Allowing you to:

Expand Down
4 changes: 2 additions & 2 deletions nesis/api/requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -5,10 +5,10 @@ psycopg2-binary==2.9.9
python-dateutil==2.8.2
python-memcached==1.59
pytz==2020.1
PyYAML==6.0.0
PyYAML==6.0.1
requests==2.31.0
simplejson==3.17.6
SQLAlchemy==2.0.23
SQLAlchemy==2.0.25
StringGenerator==0.4.4
urllib3~=2.2.0
alembic==1.12.1
Expand Down
8 changes: 6 additions & 2 deletions nesis/rag/Dockerfile
Original file line number Diff line number Diff line change
@@ -1,18 +1,22 @@
FROM python:3.11-buster as build
COPY nesis/rag/requirements.txt /app/nesis/rag/requirements.txt
COPY nesis/rag/requirements-huggingface.txt /app/nesis/rag/requirements-huggingface.txt
COPY nesis/rag/requirements-torch-cpu-x86.txt /app/nesis/rag/requirements-torch-cpu-x86.txt

RUN apt-get update \
&& python -m venv /app/.venv \
&& /app/.venv/bin/pip install -r /app/nesis/rag/requirements.txt -r /app/nesis/rag/requirements-huggingface.txt --default-timeout=1200
&& /app/.venv/bin/pip install -r /app/nesis/rag/requirements.txt \
-r /app/nesis/rag/requirements-torch-cpu-x86.txt -r /app/nesis/rag/requirements-huggingface.txt \
--default-timeout=1200



ARG NESIS_VERSION
FROM python:3.11.6-slim-bookworm
RUN apt-get update \
&& apt-get clean \
&& adduser --system --home /app --shell /bin/bash nesis
&& adduser --system --home /app --shell /bin/bash nesis \
&& apt install ffmpeg tesseract-ocr -y

WORKDIR /app

Expand Down
13 changes: 11 additions & 2 deletions nesis/rag/core/components/ingest/ingest_helper.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,6 @@
DocxReader,
EpubReader,
HWPReader,
ImageReader,
IPYNBReader,
MarkdownReader,
MboxReader,
Expand All @@ -20,6 +19,13 @@
VideoAudioReader,
) # pants: no-infer-dep

from nesis.rag.core.components.ingest.readers import (
ExcelReader,
TiffReader,
OdsReader,
ImageReader,
)

logger = logging.getLogger(__name__)


Expand All @@ -42,7 +48,10 @@
".mbox": MboxReader,
".ipynb": IPYNBReader,
".json": JSONReader,
".xlsx": JSONReader,
".xls": ExcelReader,
".xlsx": ExcelReader,
".ods": OdsReader,
".tiff": TiffReader,
}


Expand Down
161 changes: 161 additions & 0 deletions nesis/rag/core/components/ingest/readers.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,161 @@
import copy
import pathlib
import tempfile
from pathlib import Path
from typing import List, Optional, Dict, Any

import pandas as pd
from PIL import Image, ImageSequence
from fsspec import AbstractFileSystem
from llama_index.core import Document
from llama_index.core.readers.base import BaseReader
from unstructured.partition.image import partition_image
from unstructured.partition.xlsx import partition_xlsx


def _clean_metadata(metadata: Dict, exclusion_list: List[str] = None) -> Dict:
"""
We want to remove any unwanted metadata fields. This is particularly useful when readers introduce metadata from
intermediate steps, but we would rather not have that metadata in the vector store.
:param metadata: the metadata to clean
:param exclusion_list: the exclusion field list
:return: the cleaned metadata
"""
metadata_copy = copy.deepcopy(metadata or {})
for exclusion_item in exclusion_list or []:
metadata_copy.pop(exclusion_item, None)
return metadata_copy


class ExcelReader(BaseReader):
    """
    A simple MS Excel file reader backed by unstructured.io's xlsx partitioner
    (pandas under the hood).

    Each partitioned element becomes one llama-index ``Document`` whose
    metadata is the caller-supplied ``extra_info`` merged with the element's
    own metadata, minus the configured exclusion fields.
    """

    def __init__(self, config: Optional[Dict[str, Any]] = None) -> None:
        self._config = config or {}
        # Fields we never want persisted into the vector store.
        self._metadata_exclusion_list = self._config.get(
            "metadata_exclusion_list"
        ) or [
            "file_directory",
            "filename",
        ]

    def load_data(
        self,
        file: Path,
        extra_info: Optional[Dict] = None,
        fs: Optional[AbstractFileSystem] = None,
    ) -> List[Document]:
        base_metadata = extra_info or {}
        element_dicts = (
            element.to_dict() for element in partition_xlsx(file.absolute())
        )
        return [
            Document(
                text=element_dict["text"],
                metadata={
                    **base_metadata,
                    **_clean_metadata(
                        element_dict["metadata"],
                        exclusion_list=self._metadata_exclusion_list,
                    ),
                },
            )
            for element_dict in element_dicts
        ]


class OdsReader(BaseReader):
    """
    A simple OpenDocument spreadsheet (.ods) reader.

    Delegates parsing to pandas' ``odf`` engine and emits the whole sheet as a
    single llama-index ``Document``.
    """

    def load_data(
        self,
        file: Path,
        extra_info: Optional[Dict] = None,
        fs: Optional[AbstractFileSystem] = None,
    ) -> List[Document]:
        sheet = pd.read_excel(file.absolute(), engine="odf")
        document = Document(text=sheet.to_string(), metadata=extra_info or {})
        return [document]


class ImageReader(BaseReader):
    """
    Image reader built on unstructured.io's OCR-based image partitioner.

    The llama-index ``ImageReader`` doesn't return any text, so we use
    unstructured.io's ``partition_image`` instead.
    """

    def __init__(self, config: Optional[Dict[str, Any]] = None) -> None:
        self._config = config or {}
        # Fields we never want persisted into the vector store.
        self._metadata_exclusion_list = self._config.get(
            "metadata_exclusion_list"
        ) or [
            "file_directory",
            "filename",
        ]

    def _to_document(self, element_dict: Dict, base_metadata: Dict) -> Document:
        # Merge caller metadata with the element metadata, dropping excluded fields.
        cleaned = _clean_metadata(
            element_dict["metadata"], exclusion_list=self._metadata_exclusion_list
        )
        return Document(
            text=element_dict["text"],
            metadata={**base_metadata, **cleaned},
        )

    def load_data(
        self,
        file: Path,
        extra_info: Optional[Dict] = None,
        fs: Optional[AbstractFileSystem] = None,
    ) -> List[Document]:
        base_metadata = extra_info or {}
        return [
            self._to_document(element.to_dict(), base_metadata)
            for element in partition_image(file.absolute())
        ]


class TiffReader(BaseReader):
    """
    A simple tiff file reader.

    A tiff file may contain multiple pages/frames. Each frame is exported to a
    temporary png image which is then run through an ``ImageReader`` to produce
    llama-index documents.
    """

    def __init__(self, config: Optional[Dict[str, Any]] = None) -> None:
        # Normalise to a dict so this reader matches the other readers' config
        # handling and downstream code never sees ``None``.
        self._config = config or {}
        self._image_reader = ImageReader(config=self._config)

    def _load_page_data(
        self,
        file: Path,
        extra_info: Optional[Dict] = None,
        fs: Optional[AbstractFileSystem] = None,
    ) -> List[Document]:
        # Delegate a single exported page to the image reader.
        return self._image_reader.load_data(
            file.absolute(), extra_info=extra_info, fs=fs
        )

    def load_data(
        self,
        file: Path,
        extra_info: Optional[Dict] = None,
        fs: Optional[AbstractFileSystem] = None,
    ) -> List[Document]:
        """
        Convert every frame of *file* into documents.

        :param file: path to the tiff file
        :param extra_info: extra metadata attached to each resulting document
        :param fs: optional filesystem abstraction, passed through to the image reader
        :return: the documents extracted from all pages of the tiff
        """
        documents: List[Document] = []
        with Image.open(file.absolute()) as image:
            for idx, page in enumerate(ImageSequence.Iterator(image)):
                # Create the temporary file with the ".png" suffix directly so
                # the exported page lives at the temp file's own path. The
                # previous implementation derived a second ".png" path from the
                # temp file's name and never deleted it, leaking one file per
                # page. ``delete=False`` + explicit unlink keeps the save/read
                # cycle portable (the handle is closed before Pillow writes).
                with tempfile.NamedTemporaryFile(
                    dir=tempfile.gettempdir(),
                    prefix=f"{file.name.split('.')[0]}-{idx}-",
                    suffix=".png",
                    delete=False,
                ) as page_file:
                    path = pathlib.Path(page_file.name)
                try:
                    page.save(path)
                    page_documents: List[Document] = self._load_page_data(
                        file=path, extra_info=extra_info, fs=fs
                    )
                    documents += page_documents
                finally:
                    # Always remove the exported page, even if OCR fails.
                    path.unlink(missing_ok=True)

        return documents
2 changes: 1 addition & 1 deletion nesis/rag/requirements-huggingface.txt
Original file line number Diff line number Diff line change
@@ -1,3 +1,3 @@
# Use pytorch@cpu
torch @ https://download.pytorch.org/whl/cpu/torch-2.2.2%2Bcpu-cp311-cp311-linux_x86_64.whl
llama-index-embeddings-huggingface==0.1.3

3 changes: 2 additions & 1 deletion nesis/rag/requirements-test.txt
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
pylint==2.13.8
pytest==8.1.1
coverage==7.4.4
coverage==7.4.4

3 changes: 3 additions & 0 deletions nesis/rag/requirements-torch-cpu-x86.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
# Use pytorch@cpu
torch @ https://download.pytorch.org/whl/cpu/torch-2.2.2%2Bcpu-cp311-cp311-linux_x86_64.whl
torchvision @ https://download.pytorch.org/whl/cpu/torchvision-0.17.2%2Bcpu-cp311-cp311-linux_x86_64.whl
23 changes: 21 additions & 2 deletions nesis/rag/requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,6 @@ PyYAML>=6.0.1
simplejson==3.17.6
StringGenerator==0.4.4
more-itertools==10.1.0
onnxruntime==1.17.1
gevent==23.9.1
PyPDF2==3.0.1

Expand All @@ -25,10 +24,30 @@ llama-index-vector-stores-postgres==0.1.4.post1
llama-index-vector-stores-chroma==0.1.6
llama-index-vector-stores-qdrant==0.1.4
llama-index-readers-file==0.1.12
llama-index-embeddings-fastembed==0.1.4
llama-index-llms-openai==0.1.12

boto3==1.34.75

# Dependencies for llamaindex readers. Consider replacing them with unstructured.io converters
python-multipart==0.0.9
python_pptx==0.6.23
docx2txt==0.8
openpyxl==3.1.2
pydub==0.25.1
odfpy==1.4.1
EbookLib==0.18
html2text==2024.2.26

# Dependency for video/audio encoders
openai-whisper @ git+https://github.com/openai/whisper.git
ffprobe==0.5

# unstructured converters
unstructured==0.13.2
unstructured-client==0.18.0
unstructured.pytesseract==0.3.12
unstructured-inference==0.7.25
pillow_heif==0.16.0

# This causes conflicts from onnxruntime so we attempt to install it last. Do not pin to a version so pip resolves it
llama-index-embeddings-fastembed
Empty file.

0 comments on commit 4406116

Please sign in to comment.