## 1. Set the embedding model and the LLM

In [3]:
# Hugging Face model id of the embedding model used to embed document chunks and queries.
Embedding_Model = "hkunlp/instructor-xl"

In [4]:
# Hugging Face model id of the seq2seq LLM that generates the answers.
LLM_Model = "google/flan-t5-large"

## 2. Load the PDF files and split them into chunks

In [5]:
from langchain.document_loaders import DirectoryLoader

In [6]:
# Load every file found in the source folder (path is relative to the notebook's
# working directory) into a list of LangChain documents.
document_directory = "pdf_files"
loader = DirectoryLoader(document_directory)
documents = loader.load()

In [None]:
# Uncomment to inspect the loaded documents (the output can be very long).
# documents

In [8]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

In [9]:
# Cut the loaded documents into chunks of at most 500 characters; consecutive
# chunks share a 50-character overlap so context is not lost at chunk borders.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=500,
    chunk_overlap=50,
)
texts = text_splitter.split_documents(documents)

## 3. Load Embeddings

In [10]:
from langchain.embeddings import HuggingFaceInstructEmbeddings

In [None]:
# Instantiate the embedding model selected in section 1 (`Embedding_Model`).
instructor_embeddings = HuggingFaceInstructEmbeddings(model_name=Embedding_Model)

In [None]:
# Sanity check: embed a single test sentence to confirm the embedding model works.
text = "This is a test document."
query_result = instructor_embeddings.embed_query(text)

In [None]:
# Display the embedding returned for the test sentence.
query_result

## 4. Load LLM

In [13]:
import torch
import transformers
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

In [14]:
# Load the tokenizer and the seq2seq model selected in section 1 (`LLM_Model`).
# The weights are loaded in full precision (float32).
tokenizer = AutoTokenizer.from_pretrained(LLM_Model)
model = AutoModelForSeq2SeqLM.from_pretrained(LLM_Model, torch_dtype=torch.float32)

In [15]:
from transformers import pipeline
from langchain.llms import HuggingFacePipeline

# Wrap the seq2seq model in a Hugging Face text2text-generation pipeline and
# expose it to LangChain as `llm`.
#
# Decoding is greedy and therefore deterministic, requested explicitly with
# do_sample=False. Sampling-only knobs (temperature, top_p) are deliberately
# not passed: they have no effect unless sampling is enabled, and a
# temperature of 0 is not a valid sampling temperature in transformers.
pipe = pipeline(
    "text2text-generation",
    model=model,
    tokenizer=tokenizer,
    max_length=512,            # upper bound on the generated sequence length
    do_sample=False,           # greedy decoding
    repetition_penalty=1.15,   # values > 1 discourage repeated tokens
)

llm = HuggingFacePipeline(pipeline=pipe)

In [16]:
# Smoke-test the bare LLM (no retrieval) with a general-knowledge question.
# NOTE: in the recorded run the model answers "calcutta", which is wrong (the
# capital of India is New Delhi) - do not rely on the model's unaided answers.
print(llm('What is the capital of India?'))

calcutta


## 5. Create index

In [17]:
from langchain.vectorstores import Chroma

In [18]:
# Build the Chroma vector index: every chunk in `texts` is embedded with
# `instructor_embeddings`, and `persist_directory` names the on-disk location
# handed to Chroma.
# NOTE(review): re-running this cell presumably adds the same chunks to the
# persisted collection again - confirm, and clear the directory before a rebuild.
persist_directory = 'db'
db = Chroma.from_documents(
    documents=texts,
    embedding=instructor_embeddings,
    persist_directory=persist_directory,
)

## 6. Create a retriever from the index and chain it with the LLM

In [19]:
# Expose the index as a retriever that returns 3 chunks per query ("k": 3).
# Alternative that keeps the retriever's default settings:
# retriever = db.as_retriever()
retriever = db.as_retriever(search_kwargs={"k": 3})

In [20]:
from langchain.chains import RetrievalQA

In [21]:
# Combine the retriever and the LLM into a question-answering chain.
# return_source_documents=True makes each result also carry the retrieved
# chunks under the "source_documents" key, so answers can be traced to a file.
qa = RetrievalQA.from_chain_type(
    llm=llm,
    chain_type="stuff",
    retriever=retriever,
    return_source_documents=True,
)

## 7. Query the chain

In [23]:
# Ask a question about the indexed documents. Despite its name, `generated_text`
# is the chain's full result: a dict with the keys 'query', 'result' and
# 'source_documents' (see the output below).
question = "solutions to Climate Change?"
generated_text = qa(question)
generated_text

{'query': 'solutions to Climate Change?',
 'result': 'Switching energy systems from fossil fuels to renewables',
 'source_documents': [Document(page_content='10. Switching energy systems from fossil fuels to renewables like solar will reduce the emissions driving climate change. But we have to start right now. While a growing coalition of countries is committing to net zero emissions by 2050, about half of emissions cuts must be in place by 2030 to keep warming below 1.5°C. Fossil fuel production must decline by roughly 6 per cent per year between 2020 and 2030.', metadata={'source': 'pdf_files\\fastfacts-what-is-climate-change.pdf'}),
  Document(page_content='9. Climate change is a huge challenge, but we already know many solutions. These can deliver economic benefits while improving our lives and protecting the environment. We also have global agreements to guide progress, such as the UN Framework Convention on Climate Change and the Paris Agreement. Three broad categories of action 

In [24]:
# Only the generated answer string from the result dict.
generated_text["result"]

'Switching energy systems from fossil fuels to renewables'

In [25]:
# Path of the file that the first retrieved chunk was taken from.
generated_text["source_documents"][0].metadata['source']

'pdf_files\\fastfacts-what-is-climate-change.pdf'

In [26]:
# Show the full result dict again (query, answer and retrieved source chunks).
generated_text

{'query': 'solutions to Climate Change?',
 'result': 'Switching energy systems from fossil fuels to renewables',
 'source_documents': [Document(page_content='10. Switching energy systems from fossil fuels to renewables like solar will reduce the emissions driving climate change. But we have to start right now. While a growing coalition of countries is committing to net zero emissions by 2050, about half of emissions cuts must be in place by 2030 to keep warming below 1.5°C. Fossil fuel production must decline by roughly 6 per cent per year between 2020 and 2030.', metadata={'source': 'pdf_files\\fastfacts-what-is-climate-change.pdf'}),
  Document(page_content='9. Climate change is a huge challenge, but we already know many solutions. These can deliver economic benefits while improving our lives and protecting the environment. We also have global agreements to guide progress, such as the UN Framework Convention on Climate Change and the Paris Agreement. Three broad categories of action 

In [27]:
# Second query, answered from a different source file than the first one
# (see 'source' in the output below). `question` and `generated_text` are
# overwritten, so the previous result is no longer available after this cell.
question = "What is Impedance transformation ?"
generated_text = qa(question)
generated_text

{'query': 'What is Impedance transformation ?',
 'result': 'The impedance is defined as a following ratio of phasors: LLLZVI',
 'source_documents': [Document(page_content=';psspVVIaIa\uf03d\uf020\uf020\uf020\uf020\uf020\uf020\uf020\uf020\uf020\uf03d\n\nTherefore:\n\ncoscoscosoutipsspppnVVIaIPVIaP\uf071\uf071\uf071\uf03d\uf03d\uf03d\uf03d\n\nThe output power of an ideal transformer equals to its input power – to be expected since assumed no loss. Similarly, for reactive and apparent powers:\n\nsinsinoutssppinQVIVIQ\uf071\uf071\uf03d\uf03d\uf03d\n\noutssppinSVIVIS\uf03d\uf03d\uf03d\n\nImpedance transformation\n\nThe impedance is defined as a following ratio of phasors:\n\nLLLZ\uf03dVI\n\nA transformer changes voltages and currents and, therefore, an apparent impedance of the load that is given by\n\nLssZ\uf03dVI', metadata={'source': 'pdf_files\\TRANSFORMERS.pdf'}),
  Document(page_content='Since the input voltage is low, the current flowing through the excitation branch is negligible; t