In [1]:
from langchain.document_loaders import UnstructuredPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter

In [2]:
# Build a loader for the PDF that sits next to this notebook (relative path).
loader = UnstructuredPDFLoader("./pdf.pdf")

In [3]:
# Run the loader; the result is what gets handed to the text splitter below.
data = loader.load()

  from .autonotebook import tqdm as notebook_tqdm


In [5]:
# Split the loaded document into chunks of at most 1000 characters,
# with no characters shared between neighbouring chunks.
textSplitter = RecursiveCharacterTextSplitter(
    chunk_overlap=0,
    chunk_size=1000,
)
texts = textSplitter.split_documents(data)

In [6]:
# Sanity check: how many chunks did the splitter produce?
print(len(texts))

2


In [7]:
# Inspect the raw text of the first chunk.
# NOTE(review): this prints document text verbatim, and the captured output
# contains names, an address and a TIN -- clear outputs before sharing.
print(texts[0].page_content)

CORRECTED PAYER’S name, street address, city or town, state or province, country, ZIP or foreign postal code, and telephone no.

9191

VOID

1a Total ordinary dividends

OMB No. 1545-0110

MORGAN STANLEY 1585 BROADWAY NEW YORK NY 10036 1-800-622-2393

PAYER’S TIN

RECIPIENT’S TIN

36-3145972

RECIPIENT’S name

ANN C PORCARO TOD

Street address (including apt. no.)

LINDA A COSTAGLIOLA 55 JAMES RD

City or town, state or province, country, and ZIP or foreign postal code

MONROE NY 10950-4315

11 FATCA filing requirement

$ 1b Qualified dividends

$ 2a Total capital gain distr. $ 2c Section 1202 gain $ 2e Section 897 ordinary dividends $ 3 Nondividend distributions $ 5 Section 199A dividends $. 0.00 7 Foreign tax paid

$ 0.00 9 Cash liquidation distributions $ 0.00 12 Exempt-interest dividends

Form 1099-DIV

(Rev. January 2022)

Dividends and Distributions

For calendar year

20


In [8]:
from langchain.vectorstores import Pinecone
from langchain.embeddings import GooglePalmEmbeddings

In [9]:
import os

# Alias the client class: a bare `from pinecone import Pinecone` would rebind
# the name `Pinecone`, which this notebook also uses for the LangChain vector
# store (`from langchain.vectorstores import Pinecone`).
from pinecone import Pinecone as PineconeClient

# Prefer a key supplied through the environment so no real credential has to
# be typed into the notebook; the placeholder is only a fallback.
pc = PineconeClient(
    api_key=os.environ.get("PINECONE_API_KEY", "your-pinecone-api-key-here")
)
index = pc.Index(host='host-url-here')

In [10]:
import os

# Embedding model used both to index the chunks and to embed queries.
# Read the key from GOOGLE_API_KEY when it is set so a real credential never
# needs to be saved in the notebook; the placeholder is only a fallback.
embed = GooglePalmEmbeddings(
    google_api_key=os.environ.get("GOOGLE_API_KEY", "your-google-api-key-here")
)

In [11]:
from langchain.vectorstores import Pinecone

In [12]:
import os

# Make sure PINECONE_API_KEY is present for the LangChain Pinecone wrapper.
# Only fall back to the placeholder when the variable is unset or empty, so a
# real key exported in the shell is never overwritten by this cell.
if not os.environ.get('PINECONE_API_KEY'):
    os.environ['PINECONE_API_KEY'] = "your-pinecone-api-key-here"

In [13]:
import hashlib

# Embed every chunk and upsert it into the "langchain2" index.
# IDs are derived from the chunk text so that re-running this cell overwrites
# the same vectors instead of adding duplicates under fresh random IDs.
chunk_texts = [t.page_content for t in texts]
chunk_ids = [hashlib.sha1(c.encode("utf-8")).hexdigest() for c in chunk_texts]
docsearch = Pinecone.from_texts(
    chunk_texts,
    embed,
    ids=chunk_ids,
    index_name="langchain2",
)

In [14]:
# First retrieval smoke test: fetch the chunks most similar to the question.
qr = "What is the client's name?"
docs = docsearch.similarity_search(qr)

In [15]:
# How many chunks did the similarity search return?
print(len(docs))

2


In [16]:
# Bare expression: the notebook renders the repr of the retrieved documents.
# NOTE(review): the rendered output includes the full chunk text.
docs

[Document(page_content='CORRECTED PAYER’S name, street address, city or town, state or province, country, ZIP or foreign postal code, and telephone no.\n\n9191\n\nVOID\n\n1a Total ordinary dividends\n\nOMB No. 1545-0110\n\nMORGAN STANLEY 1585 BROADWAY NEW YORK NY 10036 1-800-622-2393\n\nPAYER’S TIN\n\nRECIPIENT’S TIN\n\n36-3145972\n\nRECIPIENT’S name\n\nANN C PORCARO TOD\n\nStreet address (including apt. no.)\n\nLINDA A COSTAGLIOLA 55 JAMES RD\n\nCity or town, state or province, country, and ZIP or foreign postal code\n\nMONROE NY 10950-4315\n\n11 FATCA filing requirement\n\n$ 1b Qualified dividends\n\n$ 2a Total capital gain distr. $ 2c Section 1202 gain $ 2e Section 897 ordinary dividends $ 3 Nondividend distributions $ 5 Section 199A dividends $. 0.00 7 Foreign tax paid\n\n$ 0.00 9 Cash liquidation distributions $ 0.00 12 Exempt-interest dividends\n\nForm 1099-DIV\n\n(Rev. January 2022)\n\nDividends and Distributions\n\nFor calendar year\n\n20'),
 Document(page_content='2b Unrecap. 

In [18]:
from langchain.llms.google_palm import GooglePalm
from langchain.chains.question_answering import load_qa_chain

import os

# LLM with temperature 0, wrapped in a "stuff" question-answering chain.
# The key is taken from GOOGLE_API_KEY when available so no real credential
# has to be written into the notebook; the placeholder is only a fallback.
llm = GooglePalm(
    google_api_key=os.environ.get("GOOGLE_API_KEY", "your-google-api-key-here"),
    temperature=0,
)
chain = load_qa_chain(llm, chain_type='stuff')

In [21]:
# Retrieve the relevant chunks, then let the QA chain answer from them.
qr = "What is the payer's name in the document 9191?"
docs = docsearch.similarity_search(qr)
chain.invoke({"input_documents": docs, "question": qr})["output_text"]

'MORGAN STANLEY'

In [22]:
# Retrieve the relevant chunks, then let the QA chain answer from them.
qr = "What is the recipient's name in the document 9191?"
docs = docsearch.similarity_search(query=qr)
# `Chain.run` is deprecated; `invoke` returns a dict whose "output_text" entry
# is the same answer string, matching the call style used earlier in this notebook.
chain.invoke({"input_documents": docs, "question": qr})["output_text"]

"The recipient's name on this document is ANN C PORCARO TOD."

In [22]:
# Retrieve the relevant chunks, then let the QA chain answer from them.
qr = "What is the recipient's address including apt. no.?"
docs = docsearch.similarity_search(query=qr)
# `Chain.run` is deprecated; `invoke` returns a dict whose "output_text" entry
# is the same answer string.
chain.invoke({"input_documents": docs, "question": qr})["output_text"]

"The recipient's address is 55 JAMES RD\n\nLINDA A COSTAGLIOLA\n\nMONROE NY 10950-4315"

In [23]:
# Retrieve the relevant chunks, then let the QA chain answer from them.
qr = "What is the recipient's street address including apt. no.?"
docs = docsearch.similarity_search(query=qr)
# `Chain.run` is deprecated; `invoke` returns a dict whose "output_text" entry
# is the same answer string.
chain.invoke({"input_documents": docs, "question": qr})["output_text"]

"The recipient's street address is 55 JAMES RD\n\nLINDA A COSTAGLIOLA"

In [24]:
# Retrieve the relevant chunks, then let the QA chain answer from them.
qr = "What is the recipient's street address?"
docs = docsearch.similarity_search(query=qr)
# `Chain.run` is deprecated; `invoke` returns a dict whose "output_text" entry
# is the same answer string.
chain.invoke({"input_documents": docs, "question": qr})["output_text"]

"The recipient's address is 55 JAMES RD\n\nMONROE NY 10950-4315."

In [25]:
# Retrieve the relevant chunks, then let the QA chain answer from them.
qr = "What is the foreign tax amount in $?"
docs = docsearch.similarity_search(query=qr)
# `Chain.run` is deprecated; `invoke` returns a dict whose "output_text" entry
# is the same answer string.
chain.invoke({"input_documents": docs, "question": qr})["output_text"]

'The foreign tax amount is 0.00.'

In [26]:
# Retrieve the relevant chunks, then let the QA chain answer from them.
qr = "What is the form number of the document?"
docs = docsearch.similarity_search(query=qr)
# `Chain.run` is deprecated; `invoke` returns a dict whose "output_text" entry
# is the same answer string.
chain.invoke({"input_documents": docs, "question": qr})["output_text"]

'The form number is 1099-DIV.'

In [27]:
# Retrieve the relevant chunks, then let the QA chain answer from them.
qr = "What is the document name?"
docs = docsearch.similarity_search(query=qr)
# `Chain.run` is deprecated; `invoke` returns a dict whose "output_text" entry
# is the same answer string.
chain.invoke({"input_documents": docs, "question": qr})["output_text"]

'The document is a 1099-DIV.'

In [28]:
# Retrieve the relevant chunks, then let the QA chain answer from them.
qr = "What is the name of the other document which should be filed with this document?"
docs = docsearch.similarity_search(query=qr)
# `Chain.run` is deprecated; `invoke` returns a dict whose "output_text" entry
# is the same answer string.
chain.invoke({"input_documents": docs, "question": qr})["output_text"]

'Form 1096'

In [None]:
# Retrieve the relevant chunks, then let the QA chain answer from them.
# The question text previously read "he seller"; the typo is corrected so the
# query sent to the retriever and the LLM is well-formed.
qr = "What is the name of the seller?"
docs = docsearch.similarity_search(query=qr)
# `Chain.run` is deprecated; `invoke` returns a dict whose "output_text" entry
# is the same answer string.
chain.invoke({"input_documents": docs, "question": qr})["output_text"]

'Chapman, Kim and Green'

In [None]:
# Reuses `docs` from the most recent similarity_search cell (no new retrieval).
# `Chain.run` is deprecated; `invoke(...)["output_text"]` yields the same string.
chain.invoke({"input_documents": docs, "question": "what is the address of the seller?"})["output_text"]

'64731 James Branch Smithmouth, NC 26872'

In [None]:
# Reuses `docs` from the most recent similarity_search cell (no new retrieval).
# `Chain.run` is deprecated; `invoke(...)["output_text"]` yields the same string.
chain.invoke({"input_documents": docs, "question": "what is the invoice number?"})["output_text"]

'The invoice number is 61356291.'

In [None]:
# Reuses `docs` from the most recent similarity_search cell (no new retrieval).
# `Chain.run` is deprecated; `invoke(...)["output_text"]` yields the same string.
chain.invoke({"input_documents": docs, "question": "when was the invoice generated?"})["output_text"]

'09/06/2012'

In [None]:
# Reuses `docs` from the most recent similarity_search cell (no new retrieval).
# `Chain.run` is deprecated; `invoke(...)["output_text"]` yields the same string.
chain.invoke({"input_documents": docs, "question": "what is the address of the client?"})["output_text"]

'The address of the client is 2280 Angela Plain Hortonshire, MS 93248.'

In [None]:
# Reuses `docs` from the most recent similarity_search cell (no new retrieval).
# `Chain.run` is deprecated; `invoke(...)["output_text"]` yields the same string.
chain.invoke({"input_documents": docs, "question": "how many items are there in the invoice?"})["output_text"]

'4'

In [None]:
# Reuses `docs` from the most recent similarity_search cell (no new retrieval).
# `Chain.run` is deprecated; `invoke(...)["output_text"]` yields the same string.
chain.invoke({"input_documents": docs, "question": "what is the summary of the invoice?"})["output_text"]

'The total cost of this invoice is 212,09.'

In [None]:
# Reuses `docs` from the most recent similarity_search cell (no new retrieval).
# `Chain.run` is deprecated; `invoke(...)["output_text"]` yields the same string.
chain.invoke({"input_documents": docs, "question": "what is the total VAT in %?"})["output_text"]

'10%'

In [None]:
# Reuses `docs` from the most recent similarity_search cell (no new retrieval).
# `Chain.run` is deprecated; `invoke(...)["output_text"]` yields the same string.
chain.invoke({"input_documents": docs, "question": "what is the total VAT in $?"})["output_text"]

'The total VAT is 19,28.'

In [None]:
# Reuses `docs` from the most recent similarity_search cell (no new retrieval).
# `Chain.run` is deprecated; `invoke(...)["output_text"]` yields the same string.
chain.invoke({"input_documents": docs, "question": "what is the quantity of the item Wine Glasses Goblets Pair Clear Glass purchased?"})["output_text"]

'5'

In [None]:
# Reuses `docs` from the most recent similarity_search cell (no new retrieval).
# NOTE(review): the chain is created with load_qa_chain(llm, chain_type='stuff')
# and no memory argument, so earlier questions are presumably not visible to
# the model -- treat whatever it answers here as unreliable; confirm.
# `Chain.run` is deprecated; `invoke(...)["output_text"]` yields the same string.
chain.invoke({"input_documents": docs, "question": "what is the first question i asked?"})["output_text"]

'The first question you asked is: What is the invoice number?'