## PDF Query Using LangChain

In [None]:
!pip install langchain
!pip install openai
!pip install PyPDF2
!pip install faiss-cpu
!pip install tiktoken

In [None]:
from PyPDF2 import PdfReader
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.text_splitter import CharacterTextSplitter
from langchain.vectorstores import FAISS
from langchain.document_loaders import PyPDFLoader, TextLoader
from langchain.embeddings import HuggingFaceBgeEmbeddings,HuggingFaceInstructEmbeddings

In [None]:
import os

# Paste a key between the quotes only for throwaway local runs; never commit
# a real key with the notebook.
api_key_override = ""
if api_key_override:
    # An explicitly pasted key always wins.
    os.environ["OPENAI_API_KEY"] = api_key_override
else:
    # The empty placeholder must not wipe out a key that is already exported
    # in the environment; the variable is only created (empty) when missing.
    os.environ.setdefault("OPENAI_API_KEY", "")

In [None]:
# Open the PDF to be queried. PdfReader is given this single file path;
# change the path to query a different document.
pdfreader = PdfReader('icd10cm-tabular-2022-April-1.pdf')


In [None]:
# NOTE(review): Concatenate is not used in this cell; the import is kept in
# case code outside this view relies on it.
from typing_extensions import Concatenate

# Read the text of every page and concatenate it into one string.
# Pages whose extraction yields nothing (None or "") are skipped, and the
# pieces are joined in a single pass instead of growing a string with +=.
raw_text = "".join(
    filter(None, (page.extract_text() for page in pdfreader.pages))
)

In [None]:
# Echo the extracted text as the cell output to check the extraction.
raw_text

In [None]:
# raw_text = ''
# with open('ICD-10 Training data.txt', 'r') as file:
#     raw_text = file.read().rstrip()


In [None]:
# raw_text

In [None]:
# Split the text on newlines with CharacterTextSplitter so that individual
# chunks stay small and do not increase the token size. chunk_size and
# chunk_overlap are measured with len, i.e. in characters.
text_splitter = CharacterTextSplitter(
    separator="\n",
    chunk_size=1000,
    chunk_overlap=100,
    length_function=len,
)
texts = text_splitter.split_text(raw_text)

In [None]:
# Number of chunks produced by the splitter.
len(texts)

In [None]:
import tiktoken

def num_tokens_from_string(string: str, encoding_name: str) -> int:
    """Return how many tokens ``string`` encodes to.

    Args:
        string: Text to tokenize.
        encoding_name: Name of the tiktoken encoding to look up, e.g.
            ``"cl100k_base"``.

    Returns:
        The length of the token sequence produced by the encoding.
    """
    tokens = tiktoken.get_encoding(encoding_name).encode(string)
    return len(tokens)

# Print the token count of the entire extracted text under the
# "cl100k_base" encoding.
print(num_tokens_from_string(raw_text, "cl100k_base"))

In [None]:
# Create the OpenAI embeddings object; it is passed to FAISS.from_texts
# together with the text chunks.
# NOTE(review): presumably authenticates with the OPENAI_API_KEY environment
# variable set earlier in the notebook — confirm.
embeddings = OpenAIEmbeddings()

In [None]:
# Build the FAISS vector store from the text chunks and the embeddings object.
# NOTE(review): this presumably sends every chunk to the OpenAI embeddings
# API, so expect cost and latency on a large PDF — confirm.
document_search = FAISS.from_texts(texts, embeddings)

In [None]:
# Show the resulting vector store object as the cell output.
document_search


In [None]:
from langchain.chains.question_answering import load_qa_chain
from langchain.llms import OpenAI

In [None]:
# Question-answering chain backed by an OpenAI LLM created with default settings.
# NOTE(review): chain_type="stuff" is understood to place all supplied
# documents into a single prompt — confirm against the LangChain docs.
chain = load_qa_chain(OpenAI(), chain_type="stuff")

In [None]:
# Ask for the code and description that match a diagnosis name.
query = "what is Diagnosis Code and description for Inhalant use, unspecified"
# Retrieve chunks from the vector store for this question (default result count).
docs = document_search.similarity_search(query)
# Run the QA chain over the retrieved chunks; as the last expression of the
# cell, the returned answer is shown as the cell output.
chain.run(input_documents=docs, question=query)

In [None]:
# Ask for the description that belongs to a code ("F19").
query = "what is description of F19"
# Retrieve chunks from the vector store for this question (default result count).
docs = document_search.similarity_search(query)
# Run the QA chain over the retrieved chunks; as the last expression of the
# cell, the returned answer is shown as the cell output.
chain.run(input_documents=docs, question=query)