This is the simplest method to extract text from a PDF and chat with it.

In [None]:
!pip install langchain pinecone-client transformers accelerate bitsandbytes PyPDF2 sentence_transformers openai tiktoken

In [None]:
import os
import pinecone
from langchain.embeddings import OpenAIEmbeddings
from PyPDF2 import PdfReader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import Pinecone, FAISS
from langchain.llms import HuggingFacePipeline
from langchain.memory import ConversationBufferMemory
from langchain.chains import ConversationalRetrievalChain
from langchain.chains.question_answering import load_qa_chain
from langchain.chat_models import ChatOpenAI
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
import torch

# Make CUDA the default device for tensors created by PyTorch.
torch.set_default_device('cuda')

# Hugging Face checkpoint used to embed the text chunks
# (any Hugging Face model name can be substituted here).
model_name = "bert-large-uncased"

# OpenAI credentials -- uncomment and fill in to use the OpenAI alternatives.
# os.environ["OPENAI_API_KEY"] = ""

# Hugging Face Hub token (left blank here; fill in your own).
os.environ["HUGGINGFACEHUB_API_TOKEN"] = ''

# Name of the Pinecone index that stores the chunk vectors.
index_name = 'langchain'

# Connect the Pinecone client (the API key is left blank here; fill in your own).
pinecone.init(api_key='', environment='gcp-starter')

# To start from scratch, delete the existing index first:
# if index_name in pinecone.list_indexes():
#     pinecone.delete_index(index_name)


# Create the index only when it is missing, so re-running the cell is harmless.
# NOTE(review): `dimension` has to equal the embedding width of `model_name`;
# 1024 is presumably chosen for bert-large-uncased -- confirm if the model changes.
index_already_exists = index_name in pinecone.list_indexes()
if not index_already_exists:
    pinecone.create_index(name=index_name, dimension=1024, metric='cosine')

# Open the source PDF (adjust the path to point at your own document).
reader = PdfReader('/content/comp.pdf')

# Pull the text out of every page and concatenate it into one string.
# Pages for which extract_text() yields nothing (empty or None) are skipped.
# The pieces are joined once with str.join rather than grown with repeated
# `+=`, and the page index that was never used is no longer computed.
page_texts = (page.extract_text() for page in reader.pages)
raw_text = ''.join(text for text in page_texts if text)

# Cut the extracted text into chunks of at most 512 characters, with 60
# characters of overlap between neighbouring chunks.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=512,
    chunk_overlap=60,
)
chunks = text_splitter.split_text(raw_text)

# Embed the chunks with the Hugging Face model selected above and build the
# Pinecone-backed vector store from them.
hf_embeddings = HuggingFaceEmbeddings(model_name=model_name)
vector_store = Pinecone.from_texts(
    texts=chunks,
    embedding=hf_embeddings,
    index_name=index_name,
)

# Alternative: OpenAI embeddings stored in a FAISS index.
# openai_embeddings = OpenAIEmbeddings()
# vector_store = FAISS.from_texts(chunks, openai_embeddings)

In [None]:
# Hugging Face model id of the text-generation LLM
# (left blank here; fill in the model you want to use).
model_id = ""

# Settings forwarded to the model through `model_kwargs`.
generation_settings = {"temperature": 0.7, "max_length": 1024}

# Build the LLM as a Hugging Face text-generation pipeline.
hf_llm = HuggingFacePipeline.from_model_id(
    model_id=model_id,
    task="text-generation",
    model_kwargs=generation_settings,
    device=0,  # 0 selects the GPU
)

# Alternative: OpenAI chat model.
# openai_llm = ChatOpenAI(
#     model="gpt-3.5-turbo",
#     max_tokens=1024,
# )

# Buffer memory that keeps the chat history under the "chat_history" key.
memory = ConversationBufferMemory(memory_key="chat_history", return_messages=True)

# Retrieval chain: combines the LLM, a retriever over the vector store,
# and the conversation memory.
retriever = vector_store.as_retriever()
qa_chain = ConversationalRetrievalChain.from_llm(
    llm=hf_llm,
    retriever=retriever,
    memory=memory,
)

In [None]:
# Send one question through the retrieval chain and show the answer it returns.
question = 'what is program'
response = qa_chain(question)
answer = response['answer']
print("Response:", answer)