## PDF Query Using Langchain

In [14]:
# One-time setup: uncomment and run these lines if the packages are not installed yet.
# !pip install langchain
# !pip install openai
# !pip install PyPDF2
# !pip install faiss-cpu
# !pip install tiktoken

In [15]:
from PyPDF2 import PdfReader
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.text_splitter import CharacterTextSplitter
from langchain.vectorstores import FAISS

In [16]:
import os
from getpass import getpass

# Never hardcode credentials in a notebook: they end up in version control and in
# every shared copy. Any key that was ever saved in this cell must be treated as
# leaked and revoked.
# Use the key already exported in the environment; otherwise prompt for it without
# echoing, so the value is never stored in the notebook itself.
if not os.environ.get("OPENAI_API_KEY"):
    os.environ["OPENAI_API_KEY"] = getpass("OpenAI API key: ")
# os.environ["SERPAPI_API_KEY"] = ""

In [17]:
# Open a single PDF file for reading.
pdf_path = 'AlintaEnergy-101862027-21498402.pdf'
pdfreader = PdfReader(pdf_path)

In [18]:
# Read the text of every page of the PDF into a single string.
# Pages for which extract_text() yields nothing are skipped, so only real text is joined.
page_texts = (page.extract_text() for page in pdfreader.pages)
raw_text = ''.join(content for content in page_texts if content)

In [19]:
# Display the full extracted text.
# NOTE(review): the saved output of this cell contains the customer's name, address and
# account numbers from the bill — consider clearing it before sharing the notebook.
raw_text

"Abinash Dash\n4 Eur ah PL\nELTHAM VIC 3095Gas account number: 101862027\nCustomer number: 17413906\nAccount Det ails\nSupply addr ess: 4 EURAH PL ACE EL THAM VIC 3095\nBilling period: 25 Jan 2023 t o 28 Mar 2023 (63 days)\nInvoice number: 21498402\nIssue date: 30 Mar 2023\nNeed t o get in t ouch?\nalintaener gy.com.au/cont actus\n13 37 02 Monday to Friday 8am - 6pm and 8am -\n12pm (AEDT ) on S aturdays\nFaults and emer gencies 1800 898 220 (Austr alian\nGas Networ ks (VIC)) 24 hours, 7 days\nYour ener gy plan\nYou're enjoying our Fair Dea l 43gas plan. W ith competitive\nprices, no lock in contr acts or exit fees and flexible payment\noptions.\nYour usage summar y\nAverage cost per day\n$4.11Average dai ly Usage\n99.28 MJ\nAverage costs include contr acted discounts (if applicable) but exclude other\ndiscounts, r ebates, concessions, or other char ges/cr edits.\nYour total monthly usage\nMonthlyMonthly\n  UsageUsage\n  (MJ)(MJ)AAververageage\n  Day UsageDay Usage\n  (MJ)(MJ)\nTTotal m

In [20]:
# Split the extracted text into overlapping chunks, using newline as the separator.
# Sizes are measured with len(), i.e. in characters rather than tokens.
CHUNK_SIZE = 800
CHUNK_OVERLAP = 200

text_splitter = CharacterTextSplitter(
    separator="\n",
    chunk_size=CHUNK_SIZE,
    chunk_overlap=CHUNK_OVERLAP,
    length_function=len,
)
texts = text_splitter.split_text(raw_text)

In [21]:
# Number of chunks produced by the splitter.
len(texts)

9

In [22]:
# Display every chunk to inspect where the splitter cut the text.
texts

["Abinash Dash\n4 Eur ah PL\nELTHAM VIC 3095Gas account number: 101862027\nCustomer number: 17413906\nAccount Det ails\nSupply addr ess: 4 EURAH PL ACE EL THAM VIC 3095\nBilling period: 25 Jan 2023 t o 28 Mar 2023 (63 days)\nInvoice number: 21498402\nIssue date: 30 Mar 2023\nNeed t o get in t ouch?\nalintaener gy.com.au/cont actus\n13 37 02 Monday to Friday 8am - 6pm and 8am -\n12pm (AEDT ) on S aturdays\nFaults and emer gencies 1800 898 220 (Austr alian\nGas Networ ks (VIC)) 24 hours, 7 days\nYour ener gy plan\nYou're enjoying our Fair Dea l 43gas plan. W ith competitive\nprices, no lock in contr acts or exit fees and flexible payment\noptions.\nYour usage summar y\nAverage cost per day\n$4.11Average dai ly Usage\n99.28 MJ\nAverage costs include contr acted discounts (if applicable) but exclude other",
 'options.\nYour usage summar y\nAverage cost per day\n$4.11Average dai ly Usage\n99.28 MJ\nAverage costs include contr acted discounts (if applicable) but exclude other\ndiscounts, r e

In [23]:
# Display only the first chunk.
texts[0]

"Abinash Dash\n4 Eur ah PL\nELTHAM VIC 3095Gas account number: 101862027\nCustomer number: 17413906\nAccount Det ails\nSupply addr ess: 4 EURAH PL ACE EL THAM VIC 3095\nBilling period: 25 Jan 2023 t o 28 Mar 2023 (63 days)\nInvoice number: 21498402\nIssue date: 30 Mar 2023\nNeed t o get in t ouch?\nalintaener gy.com.au/cont actus\n13 37 02 Monday to Friday 8am - 6pm and 8am -\n12pm (AEDT ) on S aturdays\nFaults and emer gencies 1800 898 220 (Austr alian\nGas Networ ks (VIC)) 24 hours, 7 days\nYour ener gy plan\nYou're enjoying our Fair Dea l 43gas plan. W ith competitive\nprices, no lock in contr acts or exit fees and flexible payment\noptions.\nYour usage summar y\nAverage cost per day\n$4.11Average dai ly Usage\n99.28 MJ\nAverage costs include contr acted discounts (if applicable) but exclude other"

In [24]:
## https://python.langchain.com/docs/integrations/vectorstores/faiss
# Facebook AI Similarity Search (Faiss) is a library for efficient similarity search and clustering 
# of dense vectors.
# It contains algorithms that search in sets of vectors of any size, up to ones that possibly do not
# fit in RAM. It also contains supporting code for evaluation and parameter tuning.

In [25]:
# Create the OpenAI embeddings object that is later handed to the vector store.
# NOTE(review): presumably this reads OPENAI_API_KEY from the environment and does not
# download anything by itself — confirm against the langchain documentation.
embeddings = OpenAIEmbeddings()

In [26]:
# Build a FAISS vector store from the text chunks, using the embeddings object created earlier.
document_search = FAISS.from_texts(texts, embeddings)

In [27]:
from langchain.chains.question_answering import load_qa_chain
from langchain.llms import OpenAI

In [28]:
# Build a question-answering chain around the OpenAI LLM with chain_type "stuff".
# The chain is later called with the retrieved documents and the question.
chain = load_qa_chain(OpenAI(), chain_type="stuff")

In [29]:
# Question to answer from the PDF.
query = "What is the supply address ?"

## Similarity search using FAISS with OpenAI embeddings
# Retrieve the chunks most similar to the question, then let the QA chain answer from them.

docs = document_search.similarity_search(query)
chain.run(input_documents=docs, question=query)

' The supply address is 4 Eurah Place ELTHAM VIC 3095.'

In [31]:
# Show how many chunks the similarity search returned, followed by the chunks themselves.
print(len(docs))
docs

4


[Document(page_content="Abinash Dash\n4 Eur ah PL\nELTHAM VIC 3095Gas account number: 101862027\nCustomer number: 17413906\nAccount Det ails\nSupply addr ess: 4 EURAH PL ACE EL THAM VIC 3095\nBilling period: 25 Jan 2023 t o 28 Mar 2023 (63 days)\nInvoice number: 21498402\nIssue date: 30 Mar 2023\nNeed t o get in t ouch?\nalintaener gy.com.au/cont actus\n13 37 02 Monday to Friday 8am - 6pm and 8am -\n12pm (AEDT ) on S aturdays\nFaults and emer gencies 1800 898 220 (Austr alian\nGas Networ ks (VIC)) 24 hours, 7 days\nYour ener gy plan\nYou're enjoying our Fair Dea l 43gas plan. W ith competitive\nprices, no lock in contr acts or exit fees and flexible payment\noptions.\nYour usage summar y\nAverage cost per day\n$4.11Average dai ly Usage\n99.28 MJ\nAverage costs include contr acted discounts (if applicable) but exclude other"),
 Document(page_content="Supply period: 25 Jan 2023 t o 28 Mar 2023 (63 days)\nTariff: AGN (VIC) Centr al 2\nGas meter: 4856GE / 1 Read date: 28 Mar 2023\nRead typ

In [32]:
# Ask for the bill amount and billing period: retrieve the most similar chunks,
# then let the QA chain answer from them.
query = "How much is the bill and what is the period ?"
docs = document_search.similarity_search(query)
chain.run(input_documents=docs, question=query)

' The bill is $258.62 and the period is from 25 Jan 2023 to 28 Mar 2023 (63 days).'

In [33]:
# Show how many chunks were retrieved for the last query, followed by the chunks themselves.
print(len(docs))
docs

4


[Document(page_content="Supply period: 25 Jan 2023 t o 28 Mar 2023 (63 days)\nTariff: AGN (VIC) Centr al 2\nGas meter: 4856GE / 1 Read date: 28 Mar 2023\nRead type Start read End read Heating\nValuePressure\nFactorUsage MJ\nActual 4633 4791 39.16 1.0109 6254.72\nYour next meter r ead is scheduled between 24 May 2023 and 01 Jun 2023 .Please ensur e easy\naccess to your meter on these days.\nHow we've worked ou t your bi ll\nOpening ba lance and payments r eceiv ed Total\nOpening balance $216.23\n20 Feb 23 Payment $172.89 cr\nPrompt Payment Discount $43.34 cr\nBalance br ought f orward $0.00\nNew charges and cr edits\nUsage and su pply charges Units Price Amount\nPeak - Step 1 191.80 MJ $0.03916 $7.51\nPeak - Step 2 153.30 MJ $0.03344 $5.13\nPeak - Step 3 349.87 MJ $0.02541 $8.89\nDaily Char ge 7days $0.69300 $4.85"),
 Document(page_content='Balance br ought f orward $0.00\nNew charges $330.16\nTotal balance $330.16\nPay on time discount\nif paid b y the due da te$71.54 cr\nDue date 24 A

In [34]:
# Ask three things in one question; the answer depends on all of them appearing
# in the chunks returned by the similarity search.
query = "what is the cost per day and the due date and daily usage ?"
docs = document_search.similarity_search(query)
chain.run(input_documents=docs, question=query)

' The cost per day is $4.11, the due date is 24 Apr 2023, and the average daily usage is 99.28 MJ.'

In [35]:
# Show how many chunks were retrieved for the last query, followed by the chunks themselves.
print(len(docs))
docs

4


[Document(page_content='options.\nYour usage summar y\nAverage cost per day\n$4.11Average dai ly Usage\n99.28 MJ\nAverage costs include contr acted discounts (if applicable) but exclude other\ndiscounts, r ebates, concessions, or other char ges/cr edits.\nYour total monthly usage\nMonthlyMonthly\n  UsageUsage\n  (MJ)(MJ)AAververageage\n  Day UsageDay Usage\n  (MJ)(MJ)\nTTotal monthly gas usageotal monthly gas usage AAververage daily gas usageage daily gas usageMar\n 23Feb\n 23Jan\n 23Dec\n 22Nov\n 22Oct\n 22Sep\n 22Aug\n 22Jul\n 22Jun\n 22May\n 22Apr\n 22Mar\n 2205k10k15k20k\n0150300450600\nCompar ed to same time last year , your usage incr eased 3%\nBalance br ought f orward $0.00\nNew charges $330.16\nTotal balance $330.16\nPay on time discount\nif paid b y the due da te$71.54 cr\nDue date 24 Apr 2023\n$258.62'),
 Document(page_content='Usage and su pply charges Units Price Amount\nPeak - Step 1 191.80 MJ $0.03916 $7.51\nPeak - Step 2 153.30 MJ $0.03344 $5.13\nPeak - Step 3 349.87 MJ

In [None]:
# Conversational message rather than a question about the document.
# NOTE(review): only `docs` and `query` are passed to chain.run; nothing from earlier
# answers is supplied, so the chain presumably has no conversation context — confirm
# whether a conversational chain with memory was intended here.
query = "Fantastic, thanks"
docs = document_search.similarity_search(query)
chain.run(input_documents=docs, question=query)

In [None]:
# Ask for a summary of the bill from the retrieved chunks.
# NOTE(review): "electicity" is misspelled in the query, and the single PDF loaded earlier
# is labelled as a gas account in its extracted text — confirm the intended wording.
query = "sure, please summarize the electicity bill"
docs = document_search.similarity_search(query)
chain.run(input_documents=docs, question=query)

In [None]:
# Follow-up that refers to "the answer" from a previous call.
# NOTE(review): only `docs` and `query` are passed to chain.run; the previous answer is not
# supplied, so the chain presumably cannot "update" it — confirm whether memory was intended.
query = "okay, can you update the answer with total usage, average daily usage, the bill amount, the billing period and the due date ?"
docs = document_search.similarity_search(query)
chain.run(input_documents=docs, question=query)

In [None]:
# Follow-up that refers to "the previous two answers".
# NOTE(review): earlier answers are not passed to chain.run (only `docs` and `query` are),
# so the chain presumably has nothing to append — confirm whether memory was intended.
query = "okay, can you append the previous two answers?"
docs = document_search.similarity_search(query)
chain.run(input_documents=docs, question=query)

In [None]:
## Reading multiple files

# Concatenate the text of every PDF found in the current working directory.
pdf_dir = os.getcwd()

page_texts = []
# sorted(): os.listdir() returns entries in arbitrary order, which would make the
# combined text (and the chunks built from it) differ between runs.
for filename in sorted(os.listdir(pdf_dir)):
    if not filename.endswith('.pdf'):
        continue
    # The context manager closes the file handle once its pages have been read.
    with open(os.path.join(pdf_dir, filename), 'rb') as pdf_file:
        pdf_reader = PdfReader(pdf_file)
        for page in pdf_reader.pages:
            content = page.extract_text()
            # Skip pages without extractable text, as the single-file cell does,
            # instead of failing when nothing is returned.
            if content:
                page_texts.append(content)

raw_text = ''.join(page_texts)

In [None]:
# Display the combined text extracted from the PDF files.
raw_text

In [None]:
# Split the combined text into overlapping chunks, using newline as the separator.
# Sizes are measured with len(), i.e. in characters rather than tokens.
splitter_options = dict(
    separator="\n",
    chunk_size=800,
    chunk_overlap=200,
    length_function=len,
)
text_splitter = CharacterTextSplitter(**splitter_options)
texts = text_splitter.split_text(raw_text)

In [None]:
# Create the OpenAI embeddings object used to build the vector store for the new chunks.
# NOTE(review): presumably this reads OPENAI_API_KEY from the environment and does not
# download anything by itself — confirm against the langchain documentation.
embeddings = OpenAIEmbeddings()

In [None]:
# Rebuild the FAISS vector store from the current `texts`; this replaces the earlier
# `document_search` object.
document_search = FAISS.from_texts(texts, embeddings)

In [None]:
# Recreate the question-answering chain around the OpenAI LLM with chain_type "stuff".
chain = load_qa_chain(OpenAI(), chain_type="stuff")

In [None]:
# Ask for the electricity bill amount: retrieve the most similar chunks from the
# vector store, then let the QA chain answer from them.
query = "how much is the bill for the electricity account ?"
docs = document_search.similarity_search(query)
chain.run(input_documents=docs, question=query)

In [None]:
# Ask for the gas bill amount: retrieve the most similar chunks from the
# vector store, then let the QA chain answer from them.
query = "how much is the bill for the gas account ? "
docs = document_search.similarity_search(query)
chain.run(input_documents=docs, question=query)

In [None]:
# Ask for the new charges on the gas account, answered from the retrieved chunks.
query = "how much are the new charges for the gas account ? "
docs = document_search.similarity_search(query)
chain.run(input_documents=docs, question=query)

In [None]:
# Ask for the new charges on the electricity account, answered from the retrieved chunks.
query = "how much are the new charges for the electricity account ? "
docs = document_search.similarity_search(query)
chain.run(input_documents=docs, question=query)

In [None]:
from langchain.document_loaders import OnlinePDFLoader

In [None]:
# Create a loader for a PDF hosted at a URL (arXiv paper 1706.03762).
loader = OnlinePDFLoader("https://arxiv.org/pdf/1706.03762.pdf")

In [None]:
!pip install unstructured

In [None]:
# Load the documents from the online PDF through the loader.
data = loader.load()

In [None]:
# Display what the loader returned.
data

In [None]:
# Create an OpenAI embeddings object.
# NOTE(review): `embeddings` is not passed to anything in the cells that follow
# (VectorstoreIndexCreator() is called without arguments) — confirm this cell is needed.
embeddings = OpenAIEmbeddings()

In [None]:
!pip install chromadb

In [None]:
from langchain.indexes import VectorstoreIndexCreator
# Build an index directly from the loader, using VectorstoreIndexCreator's default settings.
index = VectorstoreIndexCreator().from_loaders([loader])

In [None]:
# Query the index built from the online PDF; the result is the cell's output.
query = "Explain me about Attention is all you need"
index.query(query)