-
Notifications
You must be signed in to change notification settings - Fork 21
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
feat(rag): add image/video readers (#21)
This PR introduces readers and adds new ones for the mp3, mp4, xlsx, tiff, and ods file formats. Closes #20
- Loading branch information
Showing
29 changed files
with
76,647 additions
and
2,844 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -143,4 +143,4 @@ dist | |
*.pyc | ||
videos | ||
screenshots | ||
*.mp4 | ||
#*.mp4 |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,161 @@ | ||
import copy | ||
import pathlib | ||
import tempfile | ||
from pathlib import Path | ||
from typing import List, Optional, Dict, Any | ||
|
||
import pandas as pd | ||
from PIL import Image, ImageSequence | ||
from fsspec import AbstractFileSystem | ||
from llama_index.core import Document | ||
from llama_index.core.readers.base import BaseReader | ||
from unstructured.partition.image import partition_image | ||
from unstructured.partition.xlsx import partition_xlsx | ||
|
||
|
||
def _clean_metadata(metadata: Dict, exclusion_list: List[str] = None) -> Dict: | ||
""" | ||
We want to remove any unwanted metadata fields. This is particularly useful when readers introduce metadata from | ||
intermediate steps, but we would rather not have that metadata in the vector store. | ||
:param metadata: the metadata to clean | ||
:param exclusion_list: the exclusion field list | ||
:return: the cleaned metadata | ||
""" | ||
metadata_copy = copy.deepcopy(metadata or {}) | ||
for exclusion_item in exclusion_list or []: | ||
metadata_copy.pop(exclusion_item, None) | ||
return metadata_copy | ||
|
||
|
||
class ExcelReader(BaseReader):
    """
    A simple MS Excel file reader. Partitions the workbook with unstructured.io's ``partition_xlsx`` and turns every
    returned element into one llama-index document.
    """

    def __init__(self, config: Optional[Dict[str, Any]] = None) -> None:
        """
        :param config: optional reader settings; the only key read here is ``metadata_exclusion_list``
        """
        self._config = config or {}
        configured_exclusions = self._config.get("metadata_exclusion_list")
        # A missing or empty setting falls back to the default exclusions below.
        self._metadata_exclusion_list = configured_exclusions or [
            "file_directory",
            "filename",
        ]

    def load_data(
        self,
        file: Path,
        extra_info: Optional[Dict] = None,
        fs: Optional[AbstractFileSystem] = None,
    ) -> List[Document]:
        """
        Read an Excel file and return one document per partitioned element.
        :param file: path of the Excel file to read
        :param extra_info: extra metadata merged into every document; element metadata wins on key clashes
        :param fs: accepted for interface compatibility; not used by this reader
        :return: the documents built from the partitioned elements
        """
        base_metadata = extra_info or {}
        element_dicts = [element.to_dict() for element in partition_xlsx(file.absolute())]
        return [
            Document(
                text=element_dict["text"],
                metadata={
                    **base_metadata,
                    **_clean_metadata(
                        element_dict["metadata"],
                        exclusion_list=self._metadata_exclusion_list,
                    ),
                },
            )
            for element_dict in element_dicts
        ]
|
||
|
||
class OdsReader(BaseReader):
    """
    A simple open document spreadsheet reader. Loads the file with pandas (``odf`` engine) and returns its string
    rendering as a single document.
    """

    def load_data(
        self,
        file: Path,
        extra_info: Optional[Dict] = None,
        fs: Optional[AbstractFileSystem] = None,
    ) -> List[Document]:
        """
        Read an .ods file and return it as one document.
        :param file: path of the spreadsheet to read
        :param extra_info: metadata attached to the document; an empty dict is used when nothing is given
        :param fs: accepted for interface compatibility; not used by this reader
        :return: a single-element list holding the document
        """
        # NOTE(review): read_excel is called without sheet_name, so presumably only the default sheet is
        # read - confirm this is intended for multi-sheet files.
        frame = pd.read_excel(file.absolute(), engine="odf")
        rendered = frame.to_string()
        document_metadata = extra_info if extra_info else {}
        return [Document(text=rendered, metadata=document_metadata)]
|
||
|
||
class ImageReader(BaseReader):
    """
    Image reader backed by unstructured.io. It is used instead of the llamaindex ImageReader because that reader
    doesn't return any text.
    """

    def __init__(self, config: Optional[Dict[str, Any]] = None) -> None:
        """
        :param config: optional reader settings; the only key read here is ``metadata_exclusion_list``
        """
        self._config = config or {}
        exclusions = self._config.get("metadata_exclusion_list")
        if not exclusions:
            # Nothing (or an empty list) configured: use the default exclusions.
            exclusions = [
                "file_directory",
                "filename",
            ]
        self._metadata_exclusion_list = exclusions

    def _to_document(self, element_dict: Dict, extra_info: Optional[Dict]) -> Document:
        """
        Build a document from one element dictionary.
        :param element_dict: the ``to_dict()`` form of a partitioned element
        :param extra_info: extra metadata to merge in; element metadata wins on key clashes
        :return: the resulting document
        """
        text = element_dict["text"]
        cleaned = _clean_metadata(
            element_dict["metadata"],
            exclusion_list=self._metadata_exclusion_list,
        )
        return Document(text=text, metadata={**(extra_info or {}), **cleaned})

    def load_data(
        self,
        file: Path,
        extra_info: Optional[Dict] = None,
        fs: Optional[AbstractFileSystem] = None,
    ) -> List[Document]:
        """
        Partition an image and return one document per element.
        :param file: path of the image to read
        :param extra_info: extra metadata merged into every document
        :param fs: accepted for interface compatibility; not used by this reader
        :return: the documents built from the partitioned elements
        """
        results: List[Document] = []
        for part in partition_image(file.absolute()):
            results.append(self._to_document(part.to_dict(), extra_info))
        return results
|
||
|
||
class TiffReader(BaseReader):
    """
    A simple tiff file reader. Converts each page into a png and then uses an image reader to convert the pages
    into llama-index documents
    """

    def __init__(self, config: Optional[Dict[str, Any]] = None) -> None:
        """
        :param config: optional reader settings, passed on to the underlying image reader
        """
        # Normalised to a dict, as the other readers in this module do.
        self._config = config or {}
        self._image_reader = ImageReader(config=self._config)

    def _load_page_data(
        self,
        file: Path,
        extra_info: Optional[Dict] = None,
        fs: Optional[AbstractFileSystem] = None,
    ) -> List[Document]:
        """
        Load the documents of a single page image through the image reader.
        :param file: path of the page image
        :param extra_info: extra metadata forwarded to the image reader
        :param fs: forwarded to the image reader
        :return: the documents extracted from the page
        """
        return self._image_reader.load_data(
            file.absolute(), extra_info=extra_info, fs=fs
        )

    def load_data(
        self,
        file: Path,
        extra_info: Optional[Dict] = None,
        fs: Optional[AbstractFileSystem] = None,
    ) -> List[Document]:
        """
        Read a (possibly multi-page) tiff file and return the documents of all its pages, in page order.
        :param file: path of the tiff file to read
        :param extra_info: extra metadata forwarded for every page
        :param fs: forwarded to the page reader
        :return: the concatenated documents of all pages
        """
        documents: List[Document] = []
        stem = file.name.split(".")[0]

        # The page images live in a private temporary directory that is removed, together with everything
        # in it, when the block exits. Saving next to a NamedTemporaryFile under a different (".png") name
        # would leave the png behind, because only the originally named temp file gets deleted.
        with tempfile.TemporaryDirectory(prefix=f"{stem}-") as temp_dir:
            with Image.open(file.absolute()) as image:
                for idx, page in enumerate(ImageSequence.Iterator(image)):
                    path = Path(temp_dir) / f"{stem}-{idx}.png"
                    page.save(path)
                    documents += self._load_page_data(
                        file=path, extra_info=extra_info, fs=fs
                    )

        return documents
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,3 +1,3 @@ | ||
# Use pytorch@cpu | ||
torch @ https://download.pytorch.org/whl/cpu/torch-2.2.2%2Bcpu-cp311-cp311-linux_x86_64.whl | ||
llama-index-embeddings-huggingface==0.1.3 | ||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,3 +1,4 @@ | ||
pylint==2.13.8 | ||
pytest==8.1.1 | ||
coverage==7.4.4 | ||
coverage==7.4.4 | ||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,3 @@ | ||
# Use pytorch@cpu | ||
torch @ https://download.pytorch.org/whl/cpu/torch-2.2.2%2Bcpu-cp311-cp311-linux_x86_64.whl | ||
torchvision @ https://download.pytorch.org/whl/cpu/torchvision-0.17.2%2Bcpu-cp311-cp311-linux_x86_64.whl |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Empty file.
Oops, something went wrong.