##### In this example, we will partition a PDF with Sycamore and write the output to a `FAISS` vector database using LangChain.


##### The Aryn Partitioner in this job is configured to use the Aryn Partitioning Service to provide fast, GPU-powered performance. Go to [aryn.ai/sign-up](https://aryn.ai/sign-up) to get a free API key for the service. This is the recommended configuration.

##### You can also run the Aryn Partitioner locally by setting `use_partitioning_service` to `False`. Although the Aryn Partitioner can run on a CPU, an NVIDIA GPU is recommended for good performance.



In [None]:
from langchain.text_splitter import CharacterTextSplitter
from langchain_openai import OpenAIEmbeddings
from langchain_community.vectorstores import FAISS
from langchain.chains.question_answering import load_qa_chain
from langchain.llms.openai import OpenAI
from langchain.callbacks import get_openai_callback

from pathlib import Path
import os
os.environ['KMP_DUPLICATE_LIB_OK']='True'

import sycamore 
from sycamore.data import Document
from sycamore.transforms.partition import ArynPartitioner


In [None]:
# Additional requirements to install for this notebook:
#  faiss-cpu==1.7.4
#  langchain-community


In [None]:
# Check that an Aryn API key can be found before continuing. The assertion
# message reports the default config path that was searched.
from sycamore.utils.aryn_config import ArynConfig, _DEFAULT_PATH

assert (
    ArynConfig.get_aryn_api_key() != ""
), f"Unable to find aryn API key.  Looked in {_DEFAULT_PATH}"

If the above assertion fails, you can either set the environment variable `ARYN_API_KEY` and restart Jupyter,
or create a YAML file at the path shown in the assertion error that looks like:

```
aryn_token: "YOUR-ARYN-API-KEY"
```

It is unsafe, but if neither of those options works, you can put the key in this notebook with
```
import os
os.environ["ARYN_API_KEY"] = "UNSAFE-ARYN-API-KEY-LOCATION" 
```

but beware that it is easy to accidentally commit the notebook file and have it include your key.

In [None]:
# Pick the directory used for downloaded data. The presence of "/.dockerenv"
# is treated as the sign that we are running in Docker; otherwise the data
# lands under notebooks/data/.
work_dir = "/app/work/docker_volume" if os.path.exists("/.dockerenv") else "./data"
    

In [None]:

# Download the arXiv paper(s) used in this example into work_dir, skipping
# any file that is already present.
import subprocess  # used below; not imported anywhere else in the notebook

os.makedirs(work_dir, exist_ok=True)
metadata = {}
for f in ["2306.07303"]:
    path = os.path.join(work_dir, f + ".pdf")
    # Build the URL with an explicit "/"; os.path.join would insert the
    # platform path separator (a backslash on Windows).
    url = "http://arxiv.org/pdf/" + f
    if not Path(path).is_file():
        print("Downloading {} to {}".format(url, path))
        try:
            # --location follows HTTP redirects (e.g. http -> https) so the
            # PDF itself is saved rather than a redirect response.
            # --fail makes curl exit non-zero on an HTTP error, and
            # check=True turns that exit status into CalledProcessError.
            subprocess.run(
                ["curl", "--fail", "--location", "-o", path, url],
                check=True,
            )
        except subprocess.CalledProcessError:
            # Remove any partial file so a rerun does not mistake it for a
            # completed download in the is_file() check above.
            if os.path.exists(path):
                os.remove(path)
            raise


In [None]:
# Read the downloaded PDF into a DocSet and partition it into elements.
context = sycamore.init()

# Build the input path from work_dir so it matches where the download cell
# stored the file (work_dir differs between Docker and non-Docker runs).
work_dirs = [os.path.join(work_dir, "2306.07303.pdf")]
pdf_docset = context.read.binary(work_dirs, binary_format="pdf")

# extract_table_structure=True is requested so that table elements carry a
# table object, which the text-extraction cell renders with to_csv().
partitioned_docset = pdf_docset.partition(
    partitioner=ArynPartitioner(threshold=0.35, extract_table_structure=True)
)


In [None]:
# Flatten every element of every partitioned document into a single string.
# Table elements contribute the CSV rendering of their table; any other
# element contributes its text_representation when that is non-empty.
# Pieces are concatenated with no separator.
pieces = []
for doc in partitioned_docset.take_all():
    for element in doc.elements:
        if element.type == "table":
            pieces.append(element['table'].to_csv())
        elif element.text_representation:
            pieces.append(element.text_representation)
text = "".join(pieces)


In [None]:

# Split the extracted text on newlines into chunks of up to 1000 characters
# (measured with len) that overlap by 200 characters.
text_splitter = CharacterTextSplitter(
    separator="\n",
    chunk_size=1000,
    chunk_overlap=200,
    length_function=len,
)
chunks = text_splitter.split_text(text)

# Embed the chunks with OpenAI embeddings and build the FAISS index that
# questionAnswering() searches.
embedding = OpenAIEmbeddings()
faiss_index = FAISS.from_texts(texts=chunks, embedding=embedding)


In [None]:
def questionAnswering(user_question, k=5):
    """Answer a question from the indexed text and print the result.

    Looks up the ``k`` chunks in the module-level ``faiss_index`` that are
    most similar to the question, passes them to a "stuff" question-answering
    chain backed by an ``OpenAI`` LLM, and prints the callback object
    followed by the chain's response.

    Args:
        user_question: The question to ask, as a string.
        k: Number of chunks to retrieve from the index. Defaults to 5, the
            value that was previously hard-coded.

    Returns:
        None. The callback object and the answer are printed.
    """
    docs = faiss_index.similarity_search(user_question, k=k)
    llm = OpenAI()
    chain = load_qa_chain(llm, chain_type="stuff")
    # Run the chain inside the callback context so `cb` records this call.
    with get_openai_callback() as cb:
        response = chain.run(input_documents=docs, question=user_question)
    print(cb)
    print(response)


In [None]:
# Example query; questionAnswering prints the callback object and the answer.
questionAnswering("What is transformer?")