In [5]:
from unstructured.partition.pdf import partition_pdf
from pathlib import Path

import weaviate
from weaviate.embedded import EmbeddedOptions
import os

First, we initialize Weaviate:

In [6]:
client = weaviate.Client(
    embedded_options=EmbeddedOptions(
        additional_env_vars={"X-HuggingFace-Api-Key": "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"}
    )
)
assert client.is_ready()

embedded weaviate is already listening on port 8079


            Consider upgrading to the new and improved v4 client instead!
            See here for usage: https://weaviate.io/developers/weaviate/client-libraries/python
            


In [35]:
client.schema.delete_all()
# Create a new class with a vectorizer
schema = {
    "class": "Test",    
    "vectorizer": "text2vec-huggingface",
    "properties": [
        {
            "name": "source", 
            "dataType": ["text"],
            "description": "The source of the PDF"
        },
        {
            "name": "title",
            "dataType": "string",
            "description": "Titles in the PDF",
            "moduleConfig": {
                "text2vec-huggingface": {"skip": False, "vectorizePropertyName": False}
            },

        },

        {
            "name": "content",  #What we want to vectorize
            "dataType": "string",
            "description": "Content of PDF",
            "moduleConfig": {
                "text2vec-huggingface": {"skip": False, "vectorizePropertyName": False}
            },
        },
        {
            "name": "path",
            "dataType": ["text"],
            "description": "The path to the PDF"
        },
    ],
    "moduleConfig": {
    "text2vec-huggingface": {
      "model": "sentence-transformers/all-MiniLM-L6-v2",  # Can be any public or private Hugging Face model.
      "options": {
        "waitForModel": True,  # Try this if you get a "model not ready" error
      }
}
}
}

client.schema.create_class(schema)

UnexpectedStatusCodeError: Add properties to classes! Unexpected status code: 422, with response body: {'error': [{'message': "property 'title': invalid dataType: reference property to nonexistent class"}]}.

In [None]:
import logging

logging.basicConfig(level=logging.INFO)


class TitleNarrativeExtractor:
    def __init__(self):
        self.titles = []  # Keep track of the extracted titles
        self.narrative_texts = []  # Keep track of the extracted narrative texts

    def process(self, element):
        if element.category == "Title":
            self.extract_title(element.text)

        if element.category == "NarrativeText":
            self.extract_narrative_text(element.text)

    def extract_title(self, text):
        logging.info(f"Title extracted: {text}")
        self.titles.append(text)

    def extract_narrative_text(self, text):
        logging.info(f"Narrative text extracted: {text}")
        self.narrative_texts.append(text)

    def extract_elements(self, elements):
        for element in elements:
            self.process(element)

    def get_titles(self):
        return "\n".join(self.titles)

    def get_narrative_texts(self):
        return "\n".join(self.narrative_texts)


In [40]:
import json
from weaviate.util import generate_uuid5

# Define the class for storing PDFs
class PDFIngestor:
    def __init__(self, directory, client):
        self.directory = Path(directory)
        self.client = client
        self.extractor = TitleNarrativeExtractor()
        self.data_objects = []
        self.result=None
        self.data_object=None   

    def ingest_pdfs(self):
 
    
        # Iterate over all PDF files in the directory
        for path in self.directory.glob('*.pdf'):
            print(f"Processing {path.name}...")

            # Convert the PDF to text (this is a list of elements with text that will be vectorized)
            elements = partition_pdf(filename=path)

            # Extract data using TitleNarrativeExtractor
            self.extractor.extract_elements(elements)


            self.data_object = {
                                    "source": path.name,
                                    "Title": [' '.join(str(title) for title in self.extractor.get_titles())],  # Concatenate titles into a single string  # Ensure titles are strings
                                    "content": [' '.join(str(text) for text in self.extractor.get_narrative_texts())],  # Ensure narrative texts are strings
                                    "path": str(path)

                                    
                    }
            
            self.result = self.client.data_object.validate(
                class_name="Test",
                    data_object = {
                                    "source": path.name,
                                    "Title": ' '.join(str(title) for title in self.extractor.get_titles()),  # Concatenate titles into a single string  # Ensure titles are strings
                                    "content": ' '.join(str(text) for text in self.extractor.get_narrative_texts()),  # Ensure narrative texts are strings
                                    "path": str(path)

                                    
                    },
                )
            print(json.dumps(self.result, indent=2))

            # Add the data object to the batch request
            self.data_objects.append(self.data_object)

            with client.batch as batch:
                for self.data_object in self.data_objects:
                    batch.add_data_object(self.data_object, "Test", uuid=generate_uuid5(self.data_object))

                
# Use the class
ingestor = PDFIngestor('/Users/ceciliaacosta/ift6759/how-to-ingest-pdfs-with-unstructured/data', client)
ingestor.ingest_pdfs()



Processing paper02.pdf...


INFO:root:Title extracted: A Comparison of House Price Classification with Structured and Unstructured Text Data
INFO:root:Title extracted: Erika Cardenas Florida Atlantic University ecardenas2015@fau.edu
INFO:root:Title extracted: Connor Shorten Florida Atlantic University cshorten2015@fau.edu
INFO:root:Title extracted: Taghi M. Khoshgoftaar Florida Atlantic University khoshgof@fau.edu
INFO:root:Title extracted: Borivoje Furht Florida Atlantic University bfurht@fau.edu
INFO:root:Title extracted: Abstract
INFO:root:Narrative text extracted: Purchasing a home is one of the largest investments most people make. House price prediction allows indi- viduals to be informed about their asset wealth. Trans- parent pricing on homes allows for a more efficient mar- ket and economy. We report the performance of ma- chine learning models trained with structured tabular representations and unstructured text descriptions. We collected a dataset of 200 descriptions of houses which include meta-inform

{
  "error": [
    {
      "message": "invalid object: invalid UUID length: 0"
    }
  ],
  "valid": false
}


TypeError: Object of type Title is not JSON serializable