In [None]:
#| default_exp loaders

# Loaders

> A module for importing data from the most common file formats and converting it into a form that can be processed downstream.

In [None]:
#| export
import PyPDF2
from pathlib import Path
from PyPDF2 import PdfReader
import random
from typing import List
import uuid
from PIL import Image
from io import BytesIO

In [None]:
#| hide
from nbdev_rag.store import *
from nbdev_rag.base import *
from nbdev_rag.context import *
from nbdev_rag.llm import *
from transformers import AutoModelForCausalLM, AutoTokenizer
from sentence_transformers import SentenceTransformer
import torch

#|hide
We have a set of PDF files that we previously downloaded, which we use to build and test a PDF loader. We will try to create a loader that extracts both images and text in a structured way.

In [None]:
#| export
# For simplicity, let's start by accepting a single path: either a directory or one file.
class PDFLoader:
    """Accepts a dir or single path and converts its contents into documents that can be later used for storage and retrieval.

    Parameters
    ----------
    path_dir:
        Either a directory (every regular file directly inside it becomes a
        candidate) or the path of a single file.
    delete_invalid:
        When True (the default, matching the historical behaviour), a candidate
        that cannot be parsed as a PDF is deleted from disk by
        `load_random_pdf`. Pass False to only drop it from `self.paths`.

    Attributes
    ----------
    paths: candidate files still considered loadable.
    path: the file most recently loaded, or None if nothing was loaded yet.
    """
    def __init__(self, path_dir: str, delete_invalid: bool = True):
        self.path_dir = Path(path_dir)
        if self.path_dir.is_dir():
            # Sub-directories can never be parsed as a PDF, so keep regular files only.
            self.paths = [path for path in self.path_dir.iterdir() if path.is_file()]
        else:
            self.paths = [self.path_dir]
        self.delete_invalid = delete_invalid
        self.path = None

    def _try_read(self, path):
        """Return a reader for `path`, or None when it cannot be opened/parsed as a PDF."""
        try:
            return PdfReader(path)
        except Exception:
            # Any failure means "not a usable PDF" for our purposes.
            return None

    def _get_reader(self, path = None):
        """Return a reader for `path`, or for a random candidate when `path` is None."""
        if path is None:
            return self.load_random_pdf()
        return self.load_pdf(path)

    def pdf_validator(self, path):
        """Tries to read the pdf and returns a Bool value with the result"""
        return self._try_read(path) is not None

    def load_random_pdf(self):
        """Load a random pdf from the dataset. It loads pdfs until a valid one is found.

        Candidates that fail to parse are removed from `self.paths` (and deleted
        from disk when `delete_invalid` is True). Returns the reader of the first
        valid PDF and stores its location in `self.path`, or returns None when no
        candidate is left.
        """
        while self.paths:
            pdf_path = random.choice(self.paths)
            reader = self._try_read(pdf_path)  # parse once, reuse the reader
            if reader is not None:
                self.path = pdf_path
                return reader
            self.paths.remove(pdf_path)  # Remove the invalid path from the list
            if self.delete_invalid:
                try:
                    pdf_path.unlink()
                except FileNotFoundError:
                    # Already gone (or never existed); nothing left to clean up.
                    pass
        return None

    def load_pdf(self, path):
        """Open the PDF at `path`, remember it in `self.path` and return its reader.

        Unlike `load_random_pdf`, parsing errors are propagated to the caller.
        """
        reader = PdfReader(path)
        self.path = path
        return reader

    def get_documents(self, path = None):
        """Get a List of Text Documents from a pdf Path.

        One `Document` is created per page. Its metadata holds the 1-based page
        number plus the PDF's own metadata; its name is the PDF title, falling
        back to the first line of the first page when the PDF has no title.
        With `path=None` a random valid PDF is used; if none is available an
        empty list is returned.
        """
        # NOTE(review): `Document` is not imported by the exported import cell;
        # confirm the generated module brings it into scope.
        reader = self._get_reader(path)
        if reader is None:
            return []
        # `metadata` can be None for PDFs without an info dictionary.
        metadata = dict(reader.metadata or {})
        # PDF info keys carry a leading slash ('/Title'); the bare key is kept for compatibility.
        title = metadata.get('/Title') or metadata.get('title') or None
        documents = []
        for i, page in enumerate(reader.pages):
            text = page.extract_text()
            if i == 0 and title is None:
                title = (text or '').split('\n')[0]
            params = {"metadata": {"page": i + 1, **metadata}, "text": text}
            if title is not None:
                params["name"] = title
            documents.append(Document(**params))
        return documents

    def get_images(self, path = None):
        """Return every image embedded in the PDF as a list of PIL images.

        With `path=None` a random valid PDF is used; if none is available an
        empty list is returned.
        """
        #Can add some metadata like what page and location was found on.
        #Create Image Node with that kind of info.
        reader = self._get_reader(path)
        if reader is None:
            return []
        return [
            Image.open(BytesIO(image_file_object.data))
            for page in reader.pages
            for image_file_object in page.images
        ]

In [None]:
#| export
class DocumentBridge:
    """Class for connecting a list of documents into its corresponding Nodes and relationships.

    Parameters
    ----------
    documents: a list of objects exposing `create_nodes_from_doc(context, chunk_size=...)`.
    context: the model context forwarded to every `create_nodes_from_doc` call.

    Raises
    ------
    TypeError: if `documents` is not a list.
    """
    # Annotations are quoted so the class can be defined even when the names are
    # not in scope. NOTE(review): the exported import cell does not import
    # ModelContext / TextNode / Document; confirm the generated module does.
    def __init__(self, documents: List, context: "ModelContext"):
        if not isinstance(documents, list):
            # Raising a bare string is itself a TypeError with an unrelated message.
            raise TypeError("You have to include a List of documents")
        self.documents = documents
        self.context = context

    def nodes(self, chunk_size = 1024) -> "List[TextNode]":
        """Bridge a series of Documents into nodes linked by the end and start of the prev and next document.

        Great for linking together complex docs with structure such as pages or
        other info extracted first on a Document basis. The last node of each
        document gets `next_node` set to the id of the first node of the
        following document, and that first node gets `prev_node` set back.
        Documents that produce no nodes are skipped when linking, so their
        neighbours are linked to each other. Returns all nodes in document order.
        """
        doc_nodes_list = [doc.create_nodes_from_doc(self.context, chunk_size = chunk_size) for doc in self.documents]
        # Only node lists with at least one node have a boundary node to link.
        linkable = [node_list for node_list in doc_nodes_list if len(node_list) > 0]
        for previous, following in zip(linkable, linkable[1:]):
            previous[-1].next_node = following[0].id
            following[0].prev_node = previous[-1].id
        return [node for node_list in doc_nodes_list for node in node_list]

    def join(self) -> "Document":
        """Bridges a series of Documents into a single document. Great for storing sub-documents into a single one. Keeps some metadata of the documents into one.

        Not implemented yet: currently returns None.
        """
        #Store metadata about length, pages etc. For the later processing to be better. Maybe metadata about where each page started and ended in terms of characters could be good.
        #see tradeoffs between this and diff docs pointing to a single reference.
        #In reality in the conversion to nodes all the info is kept. We can post-process there.
        pass

In [None]:
#| hide
#|eval: false
# The tokenizer and the causal LM must come from the same checkpoint, so the
# identifier and the target device are named once and reused.
MODEL_ID = "mistralai/Mistral-7B-Instruct-v0.2"
DEVICE = "cuda"

tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, device_map=DEVICE)
model = AutoModelForCausalLM.from_pretrained(MODEL_ID, device_map=DEVICE, torch_dtype=torch.bfloat16)
# Sentence embedding model handed to the context alongside the LLM.
embedding_model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')
llm = LLM(model=model, tokenizer=tokenizer)
context = ModelContext(llm=llm, embedding=embedding_model, tokenizer=tokenizer)

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

In [None]:
#|hide
#|eval: false
# End-to-end demo: load one PDF, extract its pages and images, and bridge the pages into linked nodes.
loader = PDFLoader('datasets/papers_pdf')
# No path given, so a random PDF from the directory is used and remembered in `loader.path`.
# Caution: candidates that fail to parse are deleted from disk while searching for a valid one.
documents = loader.get_documents()
# Reuse `loader.path` so the images come from the same PDF as the documents.
images = loader.get_images(path = loader.path)
bridge = DocumentBridge(documents, context = context)
nodes = bridge.nodes(chunk_size = 2500)
nodes
# TODO: for images, save the surrounding text as context, plus a GPT/BLIP interpretation of the image.

[TextNode(id = b724179a-84d8-409c-ae48-7af22b0529ab,text = Are you talking to [‘xem’] or [‘x’, ‘em’]? On
 Tokenization and Addressing Misgendering in LLMs
 with Pronoun Tokenization Parity
 Anaelia Ovalle∗‡Ninareh Mehrabi†Palash Goyal†
 Jwala Dhamala†‡Kai-Wei Chang‡†Richard Zemel†
 Aram Galstyan†Rahul Gupta†
 ‡University of California, Los Angeles†Amazon Alexa
 Abstract
 A large body of NLP research has documented the ways gender biases manifest
 and amplify within large language models (LLMs), though this research has pre-
 dominantly operated within a gender binary-centric context. A growing body
 of work has identified the harmful limitations of this gender-exclusive framing;
 many LLMs cannot correctly and consistently refer to persons outside the gender
 binary, especially if they use neopronouns. While data scarcity has been identified
 as a possible culprit, the precise mechanisms through which it influences LLM
 misgendering remain underexplored. Our work addresses this gap by 

In [None]:
#|hide
#|eval: false
# Put the per-page documents into a DocumentStore and display what it holds.
store = DocumentStore(documents)
ids = store.ids()  # presumably the ids of the stored documents; not used further in this cell
store.get()

[Document(id = 8c5b989b-40e8-4a35-b80c-22a3912ea683, name = Symbolic Numeric Planning with Patterns, metadata = {'page': 1, '/CreationDate': 'D:20231218020935Z', '/Creator': 'TeX', '/ModDate': 'D:20231218020935Z', '/PTEX.Fullbanner': 'This is pdfTeX, Version 3.141592653-2.6-1.40.25 (TeX Live 2023) kpathsea version 6.3.5', '/Producer': 'pdfTeX-1.40.25', '/TemplateVersion': '2024.1', '/Trapped': '/False'}, n_nodes = 0),
 Document(id = c639d307-43f2-419c-8a8d-82393c942d28, name = Symbolic Numeric Planning with Patterns, metadata = {'page': 2, '/CreationDate': 'D:20231218020935Z', '/Creator': 'TeX', '/ModDate': 'D:20231218020935Z', '/PTEX.Fullbanner': 'This is pdfTeX, Version 3.141592653-2.6-1.40.25 (TeX Live 2023) kpathsea version 6.3.5', '/Producer': 'pdfTeX-1.40.25', '/TemplateVersion': '2024.1', '/Trapped': '/False'}, n_nodes = 0),
 Document(id = ce64e44a-5ee6-4da1-8325-4c93d80c958d, name = Symbolic Numeric Planning with Patterns, metadata = {'page': 3, '/CreationDate': 'D:202312180209

In [None]:
#|hide
#|eval: false
# Idea: the bridge could detect whether it is given Nodes or Documents and work for both.
# Round trip: read the documents back from the store and bridge them with the default chunk size.
documents = store.get()
bridge = DocumentBridge(documents, context = context)
nodes = bridge.nodes()


In [None]:
#|hide
# Export the cells marked for export into the library module.
import nbdev

nbdev.nbdev_export()