### Information Retreival - LangChain
- Without Using OpenAI Embeddings
- Without OpenAI LLM

Two Applications:
- Text Documents
- Multiple PDF Files

In [47]:
!pip install langchain
!pip install huggingface_hub
!pip install sentence_transformers



### Get HUGGINGFACEHUB_API_KEY

In [48]:
import os
os.environ["HUGGINGFACEHUB_API_TOKEN"] = "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"

### Download Text File

In [49]:
import requests

url = url = "https://frontiernerds.com/files/state_of_the_union.txt"
res = requests.get(url)
with open("state_of_the_union.txt", "w") as f:
  f.write(res.text)

In [50]:
# Document Loader
from langchain.document_loaders import TextLoader
loader = TextLoader('./state_of_the_union.txt')
documents = loader.load()

In [51]:
documents

[Document(page_content='Madame Speaker, Vice President Biden, members of Congress, distinguished guests, and fellow Americans:\n\nOur Constitution declares that from time to time, the president shall give to Congress information about the state of our union. For 220 years, our leaders have fulfilled this duty. They have done so during periods of prosperity and tranquility. And they have done so in the midst of war and depression; at moments of great strife and great struggle.\n\nIt\'s tempting to look back on these moments and assume that our progress was inevitable, that America was always destined to succeed. But when the Union was turned back at Bull Run and the Allies first landed at Omaha Beach, victory was very much in doubt. When the market crashed on Black Tuesday and civil rights marchers were beaten on Bloody Sunday, the future was anything but certain. These were times that tested the courage of our convictions and the strength of our union. And despite all our divisions and

In [52]:
import textwrap

def wrap_text_preserve_newlines(text, width=110):
    # Split the input text into lines based on newline characters
    lines = text.split('\n')

    # Wrap each line individually
    wrapped_lines = [textwrap.fill(line, width=width) for line in lines]

    # Join the wrapped lines back together using newline characters
    wrapped_text = '\n'.join(wrapped_lines)

    return wrapped_text

In [53]:
print(wrap_text_preserve_newlines(str(documents[0])))

page_content='Madame Speaker, Vice President Biden, members of Congress, distinguished guests, and fellow
Americans:\n\nOur Constitution declares that from time to time, the president shall give to Congress
information about the state of our union. For 220 years, our leaders have fulfilled this duty. They have done
so during periods of prosperity and tranquility. And they have done so in the midst of war and depression; at
moments of great strife and great struggle.\n\nIt\'s tempting to look back on these moments and assume that
our progress was inevitable, that America was always destined to succeed. But when the Union was turned back
at Bull Run and the Allies first landed at Omaha Beach, victory was very much in doubt. When the market
crashed on Black Tuesday and civil rights marchers were beaten on Bloody Sunday, the future was anything but
certain. These were times that tested the courage of our convictions and the strength of our union. And
despite all our divisions and disagreem

In [54]:
# Text Splitter
from langchain.text_splitter import CharacterTextSplitter
text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=0)
docs = text_splitter.split_documents(documents)



In [55]:
len(docs)

55

In [56]:
docs[0]

Document(page_content='Madame Speaker, Vice President Biden, members of Congress, distinguished guests, and fellow Americans:\n\nOur Constitution declares that from time to time, the president shall give to Congress information about the state of our union. For 220 years, our leaders have fulfilled this duty. They have done so during periods of prosperity and tranquility. And they have done so in the midst of war and depression; at moments of great strife and great struggle.', metadata={'source': './state_of_the_union.txt'})

### Embeddings

In [57]:
# Embeddings
from langchain.embeddings import HuggingFaceEmbeddings

embeddings = HuggingFaceEmbeddings()

In [58]:
!pip install faiss-cpu



In [None]:
# Vectorstore: https://python.langchain.com/en/latest/modules/indexes/vectorstores.html
from langchain.vectorstores import FAISS

db = FAISS.from_documents(docs, embeddings)



In [59]:
query = "What did the president say about the Supreme Court"
docs = db.similarity_search(query)

In [60]:
print(wrap_text_preserve_newlines(str(docs[0].page_content)))

Madame Speaker, Vice President Biden, members of Congress, distinguished guests, and fellow Americans:

Our Constitution declares that from time to time, the president shall give to Congress information about the
state of our union. For 220 years, our leaders have fulfilled this duty. They have done so during periods of
prosperity and tranquility. And they have done so in the midst of war and depression; at moments of great
strife and great struggle.


### Create QA Chain

In [61]:
from langchain.chains.question_answering import load_qa_chain
from langchain import HuggingFaceHub


In [62]:
llm=HuggingFaceHub(repo_id="google/flan-t5-xxl", model_kwargs={"temperature":0.7, "max_length":512})



In [63]:
chain = load_qa_chain(llm, chain_type="stuff")

In [64]:
query = "What did the president say about the Supreme Court"
docs = db.similarity_search(query)
chain.run(input_documents=docs, question=query)

'I refuse to pass this problem on to another generation of Americans.'

In [65]:
query = "What did the president say about economy?"
docs = db.similarity_search(query)
chain.run(input_documents=docs, question=query)

'we must lay a new foundation for long-term economic growth'

### Working with PDF Files

# New Section

In [66]:
!pip install unstructured
!pip install chromadb
!pip install Cython
!pip install tiktoken
!pip install unstructured[local-inference]



In [67]:
from langchain.document_loaders import UnstructuredPDFLoader
from langchain.indexes import VectorstoreIndexCreator

In [68]:
# connect your Google Drive
# from google.colab import drive
# drive.mount('/content/gdrive', force_remount=True)

pdf_folder_path = 'pdf_path'
os.listdir(pdf_folder_path)

['Deep_learning_HSI_review.pdf']

In [69]:
loaders = [UnstructuredPDFLoader(os.path.join(pdf_folder_path, fn)) for fn in os.listdir(pdf_folder_path)]
loaders

[<langchain.document_loaders.pdf.UnstructuredPDFLoader at 0x7d7cf9027d00>]

In [70]:
# import os
# os.environ["TOKENIZERS_PARALLELISM"] = "false"
index = VectorstoreIndexCreator(
    embedding=HuggingFaceEmbeddings(),
    text_splitter=CharacterTextSplitter(chunk_size=1000, chunk_overlap=0)).from_loaders(loaders)



In [79]:
llm=HuggingFaceHub(repo_id="google/flan-t5-xxl", model_kwargs={"temperature":0.7, "max_length":512})



In [85]:
from langchain.chains import RetrievalQA
chain = RetrievalQA.from_chain_type(llm=llm,
                                    chain_type="stuff",
                                    retriever=index.vectorstore.as_retriever(),
                                    input_key="question")

In [94]:
chain.run('strong points of DNN models')

"I don't know"

In [91]:
chain.run('the paper title')

'Deep supervised learning for hyperspectral data classification through convolutional neural networks'

In [93]:
chain.run('the limitation and challenges of HSI data?')

'The combination of the aforementioned challenges introduced by HSI data and the limitations of deep models'