In [None]:
! pip install -U pypdf transformers langchain\
  sentence_transformers pyarrow # pandas bitsandbytes einops xformers ipywidgets accelerate

In [76]:
import pandas as pd
from langchain.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings import HuggingFaceEmbeddings, HuggingFaceInferenceAPIEmbeddings
from langchain.vectorstores import SKLearnVectorStore

In [91]:
# Load the PDF file. English or French only
# Leave the path empty to skip ingestion and reuse the already-persisted store.
pdf_doc_path=''

# Build the embedding model unconditionally: the cell that re-opens the
# persisted vector store needs `embeddings` even when no new PDF is ingested,
# so defining it only inside the `if` breaks a fresh Restart & Run All.
embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-mpnet-base-v2")

if pdf_doc_path:
  loader = PyPDFLoader(pdf_doc_path)
  pages = loader.load_and_split()

  text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1024,
    chunk_overlap=64,
    # Third separator is a lookbehind for ". " (split after a sentence end).
    # Raw string avoids the invalid "\." escape warning.
    # NOTE(review): newer langchain releases treat separators as literal text
    # unless is_separator_regex=True is passed — confirm against the installed version.
    separators=['\n\n', '\n', r'(?<=\. )', ' ', ''])

  # Split the pages into texts as defined above
  texts = text_splitter.split_documents(pages)

In [92]:
# Set the persisted vector store
vector_db_path = "./document_vector_db.parquet"
if pdf_doc_path:
  # Create/update the vector store from the freshly split chunks
  vector_db = SKLearnVectorStore.from_documents(
      texts,
      embedding=embeddings,
      persist_path=vector_db_path,
      serializer="parquet")
  # persist the store
  vector_db.persist()

  # Drop rows whose chunk text is duplicated (e.g. the same PDF ingested twice)
  # and rewrite the parquet file in place.
  df = pd.read_parquet(vector_db_path)
  df = df.drop_duplicates(subset="texts")
  # DataFrame.to_parquet returns None when given a path, so its result must
  # not be assigned to `vector_db` (that would replace the store with None).
  df.to_parquet(vector_db_path)


In [93]:
from langchain.chains.question_answering import load_qa_chain
from langchain import HuggingFaceHub
import os
from getpass import getpass

# Credentials must never be stored in the notebook. Use the token already
# present in the environment, otherwise prompt for it without echoing.
# NOTE(review): the token that was previously hardcoded here has been exposed
# and should be revoked in the Hugging Face account settings.
if not os.environ.get("HUGGINGFACEHUB_API_TOKEN"):
  os.environ["HUGGINGFACEHUB_API_TOKEN"] = getpass("Hugging Face Hub API token: ")

# Hosted Falcon-7B-Instruct model accessed through the Hugging Face Hub wrapper.
llm=HuggingFaceHub(repo_id="tiiuae/falcon-7b-instruct",
                   model_kwargs={"temperature":0.5 ,
                                 "max_length":512,
                                 "max_new_tokens":200
                                 })



In [94]:
from langchain.chains import RetrievalQA

# Re-open the vector store from the parquet file written during ingestion.
vector_db = SKLearnVectorStore(
    persist_path=vector_db_path,
    serializer="parquet",
    embedding=embeddings,
)

# The retriever is configured with k=2 and handed to a "stuff"-type
# RetrievalQA chain that also returns its source documents.
retriever = vector_db.as_retriever(search_kwargs={"k": 2})
qa = RetrievalQA.from_chain_type(
    llm=llm,
    chain_type="stuff",
    retriever=retriever,
    return_source_documents=True,
    verbose=False,
)


In [95]:
# Question to run through the retrieval QA chain
queary = 'What is PMD?'
qa_inputs = {"query": queary}
results = qa(qa_inputs)

In [96]:
# Display the answer text produced by the chain (bare expression => cell output)
results["result"]

' PMD stands for Pressure Management Device. It is a device used to prevent vapor from entering the propellant tank of a rocket during launch and ascent.'