This notebook converts each PDF page to an image and then extracts the text from it using OCR.

In [None]:
!pip install langchain pinecone-client transformers PyPDF2 sentence_transformers openai tiktoken torch pytesseract pdf2image PyMuPDF Pillow unstructured

In [None]:
# -y answers apt's confirmation prompt so the cell never waits for input;
# apt-get is the front end intended for non-interactive use.
!sudo apt-get install -y tesseract-ocr poppler-utils

In [None]:
import os
import uuid

import fitz
import pinecone
import pytesseract
import torch
from langchain.chains import ConversationalRetrievalChain
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.llms import HuggingFacePipeline, HuggingFaceHub
from langchain.memory import ConversationBufferWindowMemory
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores.pinecone import Pinecone
from PIL import Image


# GPU: make CUDA the default device only when one is actually available;
# otherwise keep the CPU default instead of failing on the first tensor.
if torch.cuda.is_available():
    torch.set_default_device('cuda')

# hf_hub: paste a token between the quotes, or leave it empty to keep a
# HUGGINGFACEHUB_API_TOKEN that is already set in the environment. The
# variable is written only when a token is given, so an existing value is
# never replaced by an empty string.
hf_token = ''
if hf_token:
    os.environ["HUGGINGFACEHUB_API_TOKEN"] = hf_token

# pinecone_api: same convention, falling back to PINECONE_API_KEY.
index_name = 'langchain'
pinecone_api_key = ''
pinecone.init(
    api_key=pinecone_api_key or os.environ.get("PINECONE_API_KEY", ''),
    environment='gcp-starter'
)

# Start from a clean index: drop a previous one of the same name, if any.
# delete_index is called with its default timeout, which waits for the
# deletion to complete, so the index can be recreated right after.
if index_name in pinecone.list_indexes():
    pinecone.delete_index(name=index_name)

# dimension must equal the vector size produced by the embedding model used
# when indexing (bert-large-cased yields 1024-dimensional vectors).
pinecone.create_index(name=index_name, metric="cosine", dimension=1024)


In [None]:
pdf = '' #example.pdf
if not pdf:
    # Fail early with a clear message instead of handing an empty path to fitz.
    raise ValueError("Set `pdf` to the path of the PDF file to process.")

# Render scale relative to PyMuPDF's 72 dpi base (10 -> 720 dpi). Large
# values give Tesseract more pixels but cost memory and time per page;
# lower it if rendering is too heavy.
zoom = 10

# Render every page to an RGB image and OCR it. Per-page results are
# collected in a list and joined once at the end, and the document is
# closed as soon as the loop finishes.
page_texts = []
with fitz.open(pdf) as doc:
    for page in doc:
        pix = page.get_pixmap(matrix=fitz.Matrix(zoom, zoom))
        image = Image.frombytes("RGB", [pix.width, pix.height], pix.samples)
        extracted_text = pytesseract.image_to_string(image)
        # Collapse blank lines (paragraph breaks) into single spaces.
        page_texts.append(extracted_text.replace('\n\n', ' '))

all_texts = ''.join(page_texts)

In [None]:
# Convert the OCR output to chunks of up to 700 characters, with a
# 20-character overlap between neighbouring chunks.
text_splitter = RecursiveCharacterTextSplitter(chunk_size=700, chunk_overlap=20)
chunks = text_splitter.split_text(all_texts)
if not chunks:
    # Without this check an empty OCR result would silently create an empty
    # index and every later query would retrieve nothing.
    raise ValueError("No text was extracted from the PDF; nothing to index.")

# id: one unique, prefixed identifier per chunk
chunk_ids = [f"langchain-{uuid.uuid4()}" for _ in chunks]

# embedding: HuggingFaceEmbeddings comes from langchain.embeddings (imports cell).
hf_embeddings = HuggingFaceEmbeddings(model_name="bert-large-cased")
vector_store = Pinecone.from_texts(
    texts=chunks,
    embedding=hf_embeddings,
    ids=chunk_ids,
    index_name=index_name,
)

In [None]:
# Generation settings forwarded to the hosted model.
config = {
    "max_new_tokens": 1024,
    "repetition_penalty": 1.1,
    "temperature": 0.9,
    "top_k": 10,
    "top_p": 0.9,
}

llm = HuggingFaceHub(
    repo_id="bigscience/bloom",  # model to use from huggingface
    model_kwargs=config)

# The chain below returns two keys ('answer' and 'source_documents') because
# return_source_documents=True. The memory can store only one of them, so
# output_key names which; without it, saving the conversation turn raises
# "One output key expected".
memory = ConversationBufferWindowMemory(
    memory_key="chat_history",
    output_key="answer",
    return_messages=True)

# Retrieval QA over the Pinecone index: fetch the 4 most similar chunks and
# pass them to the LLM in a single prompt ('stuff').
chain = ConversationalRetrievalChain.from_llm(
            llm=llm,
            retriever=vector_store.as_retriever(search_type='similarity', search_kwargs={"k": 4}),
            memory=memory,
            chain_type='stuff',
            return_source_documents=True)

In [None]:
# Put the question to ask about the PDF here.
questions = ''
# chat_history is supplied by the chain's memory; only the question is passed.
response = chain({"question": questions})
# ConversationalRetrievalChain returns its text under 'answer' (there is no
# 'result' key); the retrieved chunks are in response['source_documents'].
response['answer'].replace('\n', ' ')