Skip to content

Commit

Permalink
feat(rag): add image/video readers (#21)
Browse files Browse the repository at this point in the history
This PR introduces a dedicated readers module and adds new readers for the
mp3, mp4, xlsx, tiff, and ods file formats.

Closes #20
  • Loading branch information
mawandm committed Apr 16, 2024
1 parent 3840d9f commit 4406116
Show file tree
Hide file tree
Showing 29 changed files with 76,647 additions and 2,844 deletions.
6 changes: 3 additions & 3 deletions .github/workflows/package.yml
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@ jobs:
password: ${{ secrets.DOCKER_HUB_PASSWORD }}

- name: Build and push Docker image
uses: docker/build-push-action@v3
uses: docker/build-push-action@v5
with:
context: .
push: true
Expand All @@ -49,7 +49,7 @@ jobs:
password: ${{ secrets.DOCKER_HUB_PASSWORD }}

- name: Build and push frontend Docker image
uses: docker/build-push-action@v3
uses: docker/build-push-action@v5
with:
context: .
push: true
Expand Down Expand Up @@ -77,7 +77,7 @@ jobs:
password: ${{ secrets.DOCKER_HUB_PASSWORD }}

- name: Build and push RAG docker image
uses: docker/build-push-action@v3
uses: docker/build-push-action@v5
with:
context: .
push: true
Expand Down
1 change: 1 addition & 0 deletions .github/workflows/test_rag.yml
Original file line number Diff line number Diff line change
Expand Up @@ -59,6 +59,7 @@ jobs:
- name: Install dependencies
run: |
python -m pip install --upgrade pip
sudo apt install ffmpeg tesseract-ocr -y
pip install -r nesis/rag/requirements.txt -r nesis/rag/requirements-test.txt -r nesis/rag/requirements-huggingface.txt
- name: Run unit tests
env:
Expand Down
2 changes: 1 addition & 1 deletion .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -143,4 +143,4 @@ dist
*.pyc
videos
screenshots
*.mp4
#*.mp4
4 changes: 1 addition & 3 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -14,9 +14,7 @@
---
# 👋 What is Nesis❓

Nesis is an open-source enterprise knowledge discovery solution that connects to multitudes of datasources, collecting


Nesis is an open-source enterprise knowledge discovery solution that connects to multitudes of datasources, collecting
information and making it available in a conversational manner. Nesis leverages generative AI to aggregate document chunks
collected from different documents in multiple formats such as pdf, docx and xlsx, and turn them into meaningful, human-readable compositions. Allowing you to:

Expand Down
4 changes: 2 additions & 2 deletions nesis/api/requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -5,10 +5,10 @@ psycopg2-binary==2.9.9
python-dateutil==2.8.2
python-memcached==1.59
pytz==2020.1
PyYAML==6.0.0
PyYAML==6.0.1
requests==2.31.0
simplejson==3.17.6
SQLAlchemy==2.0.23
SQLAlchemy==2.0.25
StringGenerator==0.4.4
urllib3~=2.2.0
alembic==1.12.1
Expand Down
8 changes: 6 additions & 2 deletions nesis/rag/Dockerfile
Original file line number Diff line number Diff line change
@@ -1,18 +1,22 @@
FROM python:3.11-buster as build
COPY nesis/rag/requirements.txt /app/nesis/rag/requirements.txt
COPY nesis/rag/requirements-huggingface.txt /app/nesis/rag/requirements-huggingface.txt
COPY nesis/rag/requirements-torch-cpu-x86.txt /app/nesis/rag/requirements-torch-cpu-x86.txt

RUN apt-get update \
&& python -m venv /app/.venv \
&& /app/.venv/bin/pip install -r /app/nesis/rag/requirements.txt -r /app/nesis/rag/requirements-huggingface.txt --default-timeout=1200
&& /app/.venv/bin/pip install -r /app/nesis/rag/requirements.txt \
-r /app/nesis/rag/requirements-torch-cpu-x86.txt -r /app/nesis/rag/requirements-huggingface.txt \
--default-timeout=1200



ARG NESIS_VERSION
FROM python:3.11.6-slim-bookworm
RUN apt-get update \
&& apt-get clean \
&& adduser --system --home /app --shell /bin/bash nesis
&& adduser --system --home /app --shell /bin/bash nesis \
&& apt install ffmpeg tesseract-ocr -y

WORKDIR /app

Expand Down
13 changes: 11 additions & 2 deletions nesis/rag/core/components/ingest/ingest_helper.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,6 @@
DocxReader,
EpubReader,
HWPReader,
ImageReader,
IPYNBReader,
MarkdownReader,
MboxReader,
Expand All @@ -20,6 +19,13 @@
VideoAudioReader,
) # pants: no-infer-dep

from nesis.rag.core.components.ingest.readers import (
ExcelReader,
TiffReader,
OdsReader,
ImageReader,
)

logger = logging.getLogger(__name__)


Expand All @@ -42,7 +48,10 @@
".mbox": MboxReader,
".ipynb": IPYNBReader,
".json": JSONReader,
".xlsx": JSONReader,
".xls": ExcelReader,
".xlsx": ExcelReader,
".ods": OdsReader,
".tiff": TiffReader,
}


Expand Down
161 changes: 161 additions & 0 deletions nesis/rag/core/components/ingest/readers.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,161 @@
import copy
import pathlib
import tempfile
from pathlib import Path
from typing import List, Optional, Dict, Any

import pandas as pd
from PIL import Image, ImageSequence
from fsspec import AbstractFileSystem
from llama_index.core import Document
from llama_index.core.readers.base import BaseReader
from unstructured.partition.image import partition_image
from unstructured.partition.xlsx import partition_xlsx


def _clean_metadata(metadata: Dict, exclusion_list: List[str] = None) -> Dict:
"""
We want to remove any unwanted metadata fields. This is particularly useful when readers introduce metadata from
intermediate steps, but we would rather not have that metadata in the vector store.
:param metadata: the metadata to clean
:param exclusion_list: the exclusion field list
:return: the cleaned metadata
"""
metadata_copy = copy.deepcopy(metadata or {})
for exclusion_item in exclusion_list or []:
metadata_copy.pop(exclusion_item, None)
return metadata_copy


class ExcelReader(BaseReader):
    """
    A simple MS Excel file reader backed by unstructured.io's xlsx partitioner
    (pandas under the hood).

    Each partitioned element becomes one llama-index ``Document`` whose
    metadata is the caller-supplied ``extra_info`` merged with the element's
    own metadata, minus the configured exclusion fields.
    """

    def __init__(self, config: Optional[Dict[str, Any]] = None) -> None:
        self._config = config or {}
        # Fields we never want persisted into the vector store.
        self._metadata_exclusion_list = self._config.get(
            "metadata_exclusion_list"
        ) or [
            "file_directory",
            "filename",
        ]

    def load_data(
        self,
        file: Path,
        extra_info: Optional[Dict] = None,
        fs: Optional[AbstractFileSystem] = None,
    ) -> List[Document]:
        base_metadata = extra_info or {}
        element_dicts = (
            element.to_dict() for element in partition_xlsx(file.absolute())
        )
        return [
            Document(
                text=element_dict["text"],
                metadata={
                    **base_metadata,
                    **_clean_metadata(
                        element_dict["metadata"],
                        exclusion_list=self._metadata_exclusion_list,
                    ),
                },
            )
            for element_dict in element_dicts
        ]


class OdsReader(BaseReader):
    """
    A simple OpenDocument spreadsheet (.ods) reader.

    Delegates parsing to pandas' ``odf`` engine and emits the whole sheet as a
    single llama-index ``Document``.
    """

    def load_data(
        self,
        file: Path,
        extra_info: Optional[Dict] = None,
        fs: Optional[AbstractFileSystem] = None,
    ) -> List[Document]:
        sheet = pd.read_excel(file.absolute(), engine="odf")
        document = Document(text=sheet.to_string(), metadata=extra_info or {})
        return [document]


class ImageReader(BaseReader):
    """
    Image reader built on unstructured.io's OCR-based image partitioner.

    The llama-index ``ImageReader`` doesn't return any text, so we use
    unstructured.io's ``partition_image`` instead.
    """

    def __init__(self, config: Optional[Dict[str, Any]] = None) -> None:
        self._config = config or {}
        # Fields we never want persisted into the vector store.
        self._metadata_exclusion_list = self._config.get(
            "metadata_exclusion_list"
        ) or [
            "file_directory",
            "filename",
        ]

    def _to_document(self, element_dict: Dict, base_metadata: Dict) -> Document:
        # Merge caller metadata with the element metadata, dropping excluded fields.
        cleaned = _clean_metadata(
            element_dict["metadata"], exclusion_list=self._metadata_exclusion_list
        )
        return Document(
            text=element_dict["text"],
            metadata={**base_metadata, **cleaned},
        )

    def load_data(
        self,
        file: Path,
        extra_info: Optional[Dict] = None,
        fs: Optional[AbstractFileSystem] = None,
    ) -> List[Document]:
        base_metadata = extra_info or {}
        return [
            self._to_document(element.to_dict(), base_metadata)
            for element in partition_image(file.absolute())
        ]


class TiffReader(BaseReader):
    """
    A simple tiff file reader.

    A tiff file may contain multiple pages/frames. Each frame is exported to a
    temporary png image which is then run through an ``ImageReader`` to produce
    llama-index documents.
    """

    def __init__(self, config: Optional[Dict[str, Any]] = None) -> None:
        # Normalise to a dict so this reader matches the other readers' config
        # handling and downstream code never sees ``None``.
        self._config = config or {}
        self._image_reader = ImageReader(config=self._config)

    def _load_page_data(
        self,
        file: Path,
        extra_info: Optional[Dict] = None,
        fs: Optional[AbstractFileSystem] = None,
    ) -> List[Document]:
        # Delegate a single exported page to the image reader.
        return self._image_reader.load_data(
            file.absolute(), extra_info=extra_info, fs=fs
        )

    def load_data(
        self,
        file: Path,
        extra_info: Optional[Dict] = None,
        fs: Optional[AbstractFileSystem] = None,
    ) -> List[Document]:
        """
        Convert every frame of *file* into documents.

        :param file: path to the tiff file
        :param extra_info: extra metadata attached to each resulting document
        :param fs: optional filesystem abstraction, passed through to the image reader
        :return: the documents extracted from all pages of the tiff
        """
        documents: List[Document] = []
        with Image.open(file.absolute()) as image:
            for idx, page in enumerate(ImageSequence.Iterator(image)):
                # Create the temporary file with the ".png" suffix directly so
                # the exported page lives at the temp file's own path. The
                # previous implementation derived a second ".png" path from the
                # temp file's name and never deleted it, leaking one file per
                # page. ``delete=False`` + explicit unlink keeps the save/read
                # cycle portable (the handle is closed before Pillow writes).
                with tempfile.NamedTemporaryFile(
                    dir=tempfile.gettempdir(),
                    prefix=f"{file.name.split('.')[0]}-{idx}-",
                    suffix=".png",
                    delete=False,
                ) as page_file:
                    path = pathlib.Path(page_file.name)
                try:
                    page.save(path)
                    page_documents: List[Document] = self._load_page_data(
                        file=path, extra_info=extra_info, fs=fs
                    )
                    documents += page_documents
                finally:
                    # Always remove the exported page, even if OCR fails.
                    path.unlink(missing_ok=True)

        return documents
2 changes: 1 addition & 1 deletion nesis/rag/requirements-huggingface.txt
Original file line number Diff line number Diff line change
@@ -1,3 +1,3 @@
# Use pytorch@cpu
torch @ https://download.pytorch.org/whl/cpu/torch-2.2.2%2Bcpu-cp311-cp311-linux_x86_64.whl
llama-index-embeddings-huggingface==0.1.3

3 changes: 2 additions & 1 deletion nesis/rag/requirements-test.txt
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
pylint==2.13.8
pytest==8.1.1
coverage==7.4.4
coverage==7.4.4

3 changes: 3 additions & 0 deletions nesis/rag/requirements-torch-cpu-x86.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
# Use pytorch@cpu
torch @ https://download.pytorch.org/whl/cpu/torch-2.2.2%2Bcpu-cp311-cp311-linux_x86_64.whl
torchvision @ https://download.pytorch.org/whl/cpu/torchvision-0.17.2%2Bcpu-cp311-cp311-linux_x86_64.whl
23 changes: 21 additions & 2 deletions nesis/rag/requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,6 @@ PyYAML>=6.0.1
simplejson==3.17.6
StringGenerator==0.4.4
more-itertools==10.1.0
onnxruntime==1.17.1
gevent==23.9.1
PyPDF2==3.0.1

Expand All @@ -25,10 +24,30 @@ llama-index-vector-stores-postgres==0.1.4.post1
llama-index-vector-stores-chroma==0.1.6
llama-index-vector-stores-qdrant==0.1.4
llama-index-readers-file==0.1.12
llama-index-embeddings-fastembed==0.1.4
llama-index-llms-openai==0.1.12

boto3==1.34.75

# Dependencies for llamaindex readers. Consider replacing them with unstructured.io converters
python-multipart==0.0.9
python_pptx==0.6.23
docx2txt==0.8
openpyxl==3.1.2
pydub==0.25.1
odfpy==1.4.1
EbookLib==0.18
html2text==2024.2.26

# Dependency for video/audio encoders
openai-whisper @ git+https://github.com/openai/whisper.git
ffprobe==0.5

# unstructured converters
unstructured==0.13.2
unstructured-client==0.18.0
unstructured.pytesseract==0.3.12
unstructured-inference==0.7.25
pillow_heif==0.16.0

# This causes conflicts from onnxruntime so we attempt to install it last. Do not pin to a version so pip resolves it
llama-index-embeddings-fastembed
Empty file.

0 comments on commit 4406116

Please sign in to comment.