In [285]:
# ! pip install -U pypdf torch transformers langchain ipywidgets accelerate \
#  sentence_transformers pyarrow pandas bitsandbytes einops xformers

In [286]:
from langchain.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings import HuggingFaceEmbeddings, HuggingFaceInferenceAPIEmbeddings
from langchain.vectorstores import SKLearnVectorStore

In [287]:
#from google.colab import output
#output.enable_custom_widget_manager()

In [288]:
# Load the PDF file (English or French only).
# Leave `pdf_doc_path` empty to skip ingestion; the vector-store cell then
# opens the store at its persist path instead of adding new documents.
pdf_doc_path=''
if pdf_doc_path:
  loader = PyPDFLoader(pdf_doc_path)
  pages = loader.load_and_split()

  # Separators are tried in order: paragraph break, line break, sentence
  # boundary, word boundary, and finally single characters.
  # The sentence separator is a look-behind for ". ", written as a raw string
  # so that "\." is not an invalid Python escape sequence.
  # NOTE(review): newer LangChain releases treat separators as literal text
  # unless `is_separator_regex=True` is passed -- confirm against the
  # installed version.
  text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1024,
    chunk_overlap=64,
    separators=['\n\n', '\n', r'(?<=\. )', ' ', ''])

  # Split the pages into texts as defined above
  texts = text_splitter.split_documents(pages)

# Built unconditionally: the vector store needs the embedding model in both
# the "ingest a new PDF" and the "reuse the persisted store" paths, so it must
# exist even when `pdf_doc_path` is empty.
embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-mpnet-base-v2")



In [289]:
# Location of the persisted vector store (a parquet file).
vector_db_path = "./document_vector_db.parquet"

if not pdf_doc_path:
    # No PDF was supplied: open the store at `vector_db_path` without
    # adding any documents.
    vector_db = SKLearnVectorStore(
        embedding=embeddings,
        persist_path=vector_db_path,
        serializer="parquet",
    )
else:
    # A PDF was loaded: create/update the store from its text chunks ...
    vector_db = SKLearnVectorStore.from_documents(
        texts,
        embedding=embeddings,
        persist_path=vector_db_path,
        serializer="parquet",
    )
    # ... and write the result back to disk.
    vector_db.persist()

In [290]:
import pandas as pd

# Read the persisted store into pandas and drop rows whose "texts" value
# repeats. Only this in-memory frame is de-duplicated; the parquet file on
# disk is left as it is.
df = pd.read_parquet(vector_db_path).drop_duplicates(subset="texts")

In [291]:
import os
from getpass import getpass

from langchain.chains.question_answering import load_qa_chain
from langchain import HuggingFaceHub

# Never commit API tokens to the notebook. The token is taken from the
# HUGGINGFACEHUB_API_TOKEN environment variable; if it is missing, prompt for
# it without echoing the input and keep it only in this process's environment.
# NOTE(review): a token was previously hardcoded in this cell -- treat it as
# leaked and revoke it in the Hugging Face account settings.
if not os.environ.get("HUGGINGFACEHUB_API_TOKEN"):
    os.environ["HUGGINGFACEHUB_API_TOKEN"] = getpass("Hugging Face Hub API token: ")

# Hosted Falcon-7B model accessed through the Hugging Face Hub wrapper.
# `model_kwargs` carries the generation settings handed to that wrapper.
llm=HuggingFaceHub(repo_id="tiiuae/falcon-7b",
                   model_kwargs={"temperature":0.5 ,
                                 "max_length":512,
                                 "max_new_tokens":200
                                 })



In [292]:
from langchain.chains import RetrievalQA

# Build the question-answering chain: the LLM answers using whatever the
# vector store's retriever returns for the query.
qa = RetrievalQA.from_chain_type(
    llm=llm,
    chain_type="stuff",
    # search_kwargs k=1: request a single match from the vector store.
    retriever=vector_db.as_retriever(search_kwargs={"k": 1}),
    # Keep the retrieved documents in the result alongside the answer.
    return_source_documents=True,
    verbose=False,
)


In [297]:
# Ask a question: put the question text in `queary`, then run the chain.
# The chain is called with a dict whose "query" key holds the question.
queary = ''
results = qa({"query": queary})

In [None]:
# Display the value returned by the `qa(...)` call in the previous cell.
results