## 1. Set your embedding model and LLM

In [1]:
# Model id of the embedding model; passed as `model_name` to
# HuggingFaceInstructEmbeddings when the embeddings are loaded.
Embedding_Model = "hkunlp/instructor-xl"

In [2]:
# Model id of the seq2seq LLM; handed to AutoTokenizer.from_pretrained and
# AutoModelForSeq2SeqLM.from_pretrained when the LLM is loaded.
LLM_Model = "google/flan-t5-large"

## 2. Load text files

In [3]:
from langchain.document_loaders import DirectoryLoader

In [4]:
# Folder that holds the raw text files to be indexed.
document_directory = "text_files"

# Build a loader for that folder and read its files into Document objects.
loader = DirectoryLoader(path=document_directory)
documents = loader.load()

In [5]:
# Show the loaded Document objects (page_content plus a 'source' metadata entry).
documents

[Document(page_content="The history of computers dates back to the early 19th century when Charles Babbage conceived the idea of a mechanical computer. However, it wasn't until the mid-20th century that electronic computers became a reality. The ENIAC, built in 1945, is considered one of the earliest general-purpose electronic computers. Over the years, computers have evolved significantly in terms of size, power, and capabilities. The invention of the microprocessor in the 1970s revolutionized the industry and paved the way for personal computers, leading to the computer revolution in the 1980s.", metadata={'source': 'text_files\\History of Computers.txt'}),
 Document(page_content='The solar system comprises the Sun and all the celestial objects that orbit it, including planets, moons, asteroids, and comets. The Sun, a massive ball of hot plasma, accounts for over 99% of the total mass of the solar system. The four inner planets, Mercury, Venus, Earth, and Mars, are rocky and relative

## 3. Load Embeddings

In [6]:
from langchain.embeddings import HuggingFaceInstructEmbeddings

In [7]:
# Create the embedding wrapper for the model named in Embedding_Model.
# The same object is later used to embed a test query and to build the index.
instructor_embeddings = HuggingFaceInstructEmbeddings(model_name=Embedding_Model)

  from tqdm.autonotebook import trange


load INSTRUCTOR_Transformer
'NoneType' object has no attribute 'cadam32bit_grad_fp32'


  warn("The installed version of bitsandbytes was compiled without GPU support. "


max_seq_length  512


In [8]:
# Sanity check: embed one short sentence and display the resulting vector
# (a flat list of floats).
text = "This is a test document."
query_result = instructor_embeddings.embed_query(text)
query_result

[0.023476742208003998,
 -0.0015357566298916936,
 -0.005376717541366816,
 -0.06068088859319687,
 -0.010196278803050518,
 -0.027731968089938164,
 -0.05462538078427315,
 0.019978677853941917,
 -0.008765235543251038,
 -0.056243013590574265,
 0.03438612446188927,
 0.043268151581287384,
 -0.04471273720264435,
 -0.13304342329502106,
 -0.07073519378900528,
 0.01974298432469368,
 -0.001934259431436658,
 -0.10175132006406784,
 0.017565015703439713,
 0.008344719186425209,
 -0.028570154681801796,
 0.016440700739622116,
 -0.018115971237421036,
 -0.04139815270900726,
 -0.03298606351017952,
 -0.10828442126512527,
 0.009210496209561825,
 0.04037890210747719,
 0.030877672135829926,
 -0.027175571769475937,
 0.08999558538198471,
 -0.02717418409883976,
 -2.980713361466769e-05,
 -0.028053907677531242,
 0.004379512742161751,
 0.013079671189188957,
 -0.028671571984887123,
 -0.015137831680476665,
 0.01149486843496561,
 0.000993275549262762,
 -0.025433707982301712,
 0.05826026573777199,
 0.015601390972733498,


## 4. Load LLM

In [9]:
import torch
import transformers
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

In [10]:
# Load the tokenizer and the seq2seq model for the checkpoint named in LLM_Model.
tokenizer = AutoTokenizer.from_pretrained(LLM_Model)
model = AutoModelForSeq2SeqLM.from_pretrained(
    LLM_Model,
    torch_dtype=torch.float32,  # request float32 explicitly
)

In [11]:
from transformers import pipeline
from langchain.llms import HuggingFacePipeline

# Wrap the already-loaded model and tokenizer in a text2text-generation
# pipeline, then expose it to LangChain through HuggingFacePipeline.
#
# Decoding is deterministic (greedy). The pipeline used to receive
# temperature=0 and top_p=0.95, but those are sampling-only settings that
# have no effect unless do_sample=True, and temperature=0 is not a valid
# sampling temperature. Stating do_sample=False expresses the same intent
# explicitly and avoids generation-config complaints about unused
# sampling flags.
pipe = pipeline(
    "text2text-generation",
    model=model,
    tokenizer=tokenizer,
    max_length=512,  # cap on the length of the generated sequence
    do_sample=False,
    repetition_penalty=1.15,  # values above 1 discourage repeated tokens
)

llm = HuggingFacePipeline(pipeline=pipe)

In [12]:
# Smoke-test the LLM on its own, without any retrieved context.
# NOTE: the recorded answer ("calcutta") is wrong -- India's capital is
# New Delhi -- so the bare model should not be relied on for facts.
print(llm('What is the capital of India?'))

calcutta


## 5. Create index

In [13]:
from langchain.vectorstores import Chroma

In [14]:
# Directory handed to Chroma as the on-disk location for the index.
persist_directory = "db"

# Embed the loaded documents and build the Chroma vector store from them.
db = Chroma.from_documents(
    documents=documents,
    embedding=instructor_embeddings,
    persist_directory=persist_directory,
)

## 6. Create retriever from index and chain it with LLM

In [19]:
# Ask the vector store for the 3 best matches per query. Calling
# db.as_retriever() with no arguments would use the library's default
# search settings instead.
retriever = db.as_retriever(search_kwargs={"k": 3})

In [24]:
from langchain.chains import RetrievalQA

In [25]:
# Chain the retriever with the LLM using the "stuff" chain type, and keep
# the retrieved documents in the response so answers can be traced back
# to their source files.
qa = RetrievalQA.from_chain_type(
    llm=llm,
    chain_type="stuff",
    retriever=retriever,
    return_source_documents=True,
)

## 7. Query the chain

In [26]:
# Run the retrieval-QA chain on one question. The returned dict carries the
# 'query', the generated 'result', and the retrieved 'source_documents'.
question = "What is Jupiter and Saturn ?"
generated_text = qa(question)

{'query': 'What is Jupiter and Saturn ?',
 'result': 'gas giants with thick atmospheres',
 'source_documents': [Document(page_content='The solar system comprises the Sun and all the celestial objects that orbit it, including planets, moons, asteroids, and comets. The Sun, a massive ball of hot plasma, accounts for over 99% of the total mass of the solar system. The four inner planets, Mercury, Venus, Earth, and Mars, are rocky and relatively small, while the outer planets, Jupiter, Saturn, Uranus, and Neptune, are gas giants with thick atmospheres. Pluto was considered the ninth planet until its reclassification as a dwarf planet in 2006. The solar system is vast and offers fascinating insights into the workings of the universe.', metadata={'source': 'text_files\\Solar System.txt'}),
  Document(page_content='William Shakespeare was an English playwright, poet, and actor, widely regarded as one of the greatest writers in the English language. He was born in Stratford-upon-Avon in 1564 a

In [28]:
# Show only the generated answer string.
generated_text["result"]

'gas giants with thick atmospheres'

In [29]:
# Show the source file path recorded in the first retrieved document's metadata.
generated_text["source_documents"][0].metadata["source"]

'text_files\\Solar System.txt'

In [30]:
# Show the full response dict: query, result, and source documents.
generated_text

{'query': 'What is Jupiter and Saturn ?',
 'result': 'gas giants with thick atmospheres',
 'source_documents': [Document(page_content='The solar system comprises the Sun and all the celestial objects that orbit it, including planets, moons, asteroids, and comets. The Sun, a massive ball of hot plasma, accounts for over 99% of the total mass of the solar system. The four inner planets, Mercury, Venus, Earth, and Mars, are rocky and relatively small, while the outer planets, Jupiter, Saturn, Uranus, and Neptune, are gas giants with thick atmospheres. Pluto was considered the ninth planet until its reclassification as a dwarf planet in 2006. The solar system is vast and offers fascinating insights into the workings of the universe.', metadata={'source': 'text_files\\Solar System.txt'}),
  Document(page_content='William Shakespeare was an English playwright, poet, and actor, widely regarded as one of the greatest writers in the English language. He was born in Stratford-upon-Avon in 1564 a