##### The Aryn Partitioner in this notebook is configured to use the Aryn Partitioning Service, which provides fast, GPU-powered performance. Go to [aryn.ai/sign-up](https://aryn.ai/sign-up) to get a free API key for the service. You can also run the Aryn Partitioner locally by setting `use_partitioning_service` to `False`. Although the Aryn Partitioner can run on a CPU, an NVIDIA GPU is recommended for good performance.


##### In this example, we will load the output that Sycamore produces from a PDF into a LangChain vector store.


In [None]:
from langchain.text_splitter import CharacterTextSplitter
from langchain_openai import OpenAIEmbeddings
from langchain_community.vectorstores import FAISS
from langchain.chains.question_answering import load_qa_chain
from langchain.llms.openai import OpenAI
from langchain.callbacks import get_openai_callback

import os

import sycamore 
from sycamore.data import Document
from sycamore.transforms.partition import ArynPartitioner


Set `aryn_api_key` to your own Aryn API key.

In [1]:
# Prefer the ARYN_API_KEY environment variable so a real key never has to be
# saved inside the notebook; otherwise fall back to the placeholder below,
# which must be replaced with a valid key before running the partitioner.
aryn_api_key = os.environ.get("ARYN_API_KEY", 'aryn-api-key')

In [None]:
# Additional packages required by this notebook (install them before running):
#   faiss-cpu==1.7.4
#   langchain-community


In [3]:
# Start a Sycamore context and read the input PDF as a binary docset.
context = sycamore.init()
work_dirs = ['./data/2306.07303.pdf']
pdf_docset = context.read.binary(work_dirs, binary_format="pdf")

# Partition the PDF with the Aryn Partitioner, authenticating with
# `aryn_api_key` and requesting table structure extraction.
partitioned_docset = pdf_docset.partition(
    partitioner=ArynPartitioner(
        aryn_api_key=aryn_api_key,
        threshold=0.35,
        use_ocr=False,
        extract_table_structure=True,
        batch_at_a_time=True,
    ),
    num_gpus=0.1,
)

(Map(BinaryScan._to_document)->MapBatches(_wrap) pid=23614) The `max_size` parameter is deprecated and will be removed in v4.26. Please specify in `size['longest_edge'] instead`.


In [4]:
# Flatten every partitioned element into one string for the text splitter:
# table elements contribute their table rendered as CSV, all other elements
# contribute their text representation when they have one.
text_parts = []
for doc in partitioned_docset.take_all():
    for element in doc.elements:
        # Compare the label case-insensitively: the partitioner output captured
        # in this notebook uses capitalized labels ('Title', 'Text', ...), so an
        # exact match against "table" could silently skip every table.
        is_table = (element.type or "").lower() == "table"
        # NOTE(review): assumes elements expose mapping-style `.get`, consistent
        # with the original `element['table']` access -- confirm against Sycamore.
        table = element.get('table') if is_table else None
        if table is not None:
            text_parts.append(table.to_csv())
        elif element.text_representation:
            # Also the fallback for a table element that carries no table object.
            text_parts.append(element.text_representation)

# Join once at the end instead of growing a string with += inside the loop.
text = "".join(text_parts)


2024-07-11 20:38:02,238	INFO set_read_parallelism.py:115 -- Using autodetected parallelism=8 for stage ReadBinary to satisfy parallelism at least twice the available number of CPUs (4).
2024-07-11 20:38:02,239	INFO set_read_parallelism.py:122 -- To satisfy the requested parallelism of 8, each read task output is split into 8 smaller blocks.
2024-07-11 20:38:02,239	INFO streaming_executor.py:112 -- Executing DAG InputDataBuffer[Input] -> TaskPoolMapOperator[ReadBinary] -> TaskPoolMapOperator[Map(BinaryScan._to_document)->MapBatches(_wrap)]
2024-07-11 20:38:02,240	INFO streaming_executor.py:113 -- Execution config: ExecutionOptions(resource_limits=ExecutionResources(cpu=None, gpu=None, object_store_memory=None), exclude_resources=ExecutionResources(cpu=0, gpu=0, object_store_memory=0), locality_with_output=False, preserve_order=False, actor_locality_enabled=True, verbose_progress=False)
2024-07-11 20:38:02,241	INFO streaming_executor.py:115 -- Tip: For detailed progress reporting, run `r

Running 0:   0%|          | 0/1 [00:00<?, ?it/s]

{'type': 'Page-header', 'bbox': [0.09452253453871783, 0.016898599104447798, 0.703795166015625, 0.04355328646573153], 'properties': {'score': 0.6613875031471252, 'page_number': 1}, 'text_representation': 'This work has been submitted to the Expert Systems With Applications journal (Elsevier) for\npossible publication\n'}
{'type': 'Title', 'bbox': [0.19320326861213236, 0.0794008359042081, 0.8112643612132353, 0.12149649880149148], 'properties': {'score': 0.6311418414115906, 'page_number': 1}, 'text_representation': 'A COMPREHENSIVE SURVEY ON APPLICATIONS OF\nTRANSFORMERS FOR DEEP LEARNING TASKS\n'}
{'type': 'Text', 'bbox': [0.10384827557732078, 0.139707294810902, 0.972286376953125, 0.15242800625887784], 'properties': {'score': 0.3755064010620117, 'page_number': 1}}
{'type': 'Text', 'bbox': [0.10481727151309743, 0.15526880437677557, 0.7572548540900735, 0.16686823064630682], 'properties': {'score': 0.415859580039978, 'page_number': 1}}
{'type': 'Text', 'bbox': [0.10525580911075368, 0.169614

In [5]:

# Split the collected text on newlines into character-counted chunks
# (chunk size 1000, overlap 200, measured with len).
text_splitter = CharacterTextSplitter(
    separator="\n",
    chunk_size=1000,
    chunk_overlap=200,
    length_function=len,
)
chunks = text_splitter.split_text(text)

# Build the FAISS vector store from the chunks using OpenAI embeddings.
embedding = OpenAIEmbeddings()
faiss_index = FAISS.from_texts(chunks, embedding)


ImportError: Could not import faiss python package. Please install it with `pip install faiss-gpu` (for CUDA supported GPU) or `pip install faiss-cpu` (depending on Python version).

In [None]:
# Interactive question-answering loop over the FAISS index.
# The LLM and the QA chain do not depend on the question, so they are built
# once here instead of being re-created on every iteration.
llm = OpenAI()
chain = load_qa_chain(llm, chain_type= "stuff")

while True:
    try:
        user_question = input()
    except (EOFError, KeyboardInterrupt):
        # Input stream closed or the kernel was interrupted: stop asking.
        break

    # A blank line or "exit"/"quit" ends the session; without this check the
    # loop could only be stopped by interrupting the kernel.
    if user_question.strip().lower() in {"", "exit", "quit"}:
        break

    # Retrieve the 5 chunks most similar to the question.
    docs = faiss_index.similarity_search(user_question, k=5)

    # Run the chain inside the OpenAI callback context, then print what the
    # callback recorded followed by the answer.
    with get_openai_callback() as cb:
        response = chain.run(input_documents=docs, question=user_question)
        print(cb)
        print(response)
