## 1. Set your embedding model and LLM

In [1]:
# Hugging Face model identifier for the embedding model; passed as
# `model_name` to HuggingFaceInstructEmbeddings in section 3.
Embedding_Model = "hkunlp/instructor-xl"

In [2]:
# Hugging Face model identifier for the answer-generating model; used in
# section 4 to load both the tokenizer and the seq2seq model.
LLM_Model = "google/flan-t5-large"

## 2. Load web pages from URLs and split them into chunks

In [8]:
from langchain.document_loaders import SeleniumURLLoader

In [31]:
# Pages to ingest. The entries are placeholders: replace them with the real
# URLs before running the loader cell.
urls_list = [
    "http://......................",
    "https://.....................",
]

In [32]:
# Build a Selenium-backed loader over every URL in `urls_list` and load them.
# `documents` is the input to the text splitter in the following cells.
# NOTE(review): presumably requires selenium and a browser driver to be
# installed on this machine - confirm before running.
loader = SeleniumURLLoader(urls=urls_list)
documents = loader.load()

In [60]:
#documents

In [34]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

In [35]:
# Split the loaded documents into overlapping chunks; `texts` is what gets
# embedded and indexed in section 5.
# NOTE(review): chunk_size / chunk_overlap are presumably measured in
# characters (the splitter's default length function) - confirm.
text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=50)
texts = text_splitter.split_documents(documents)

## 3. Load Embeddings

In [3]:
from langchain.embeddings import HuggingFaceInstructEmbeddings

In [None]:
# Instantiate the embedding model named by `Embedding_Model`. The same object
# is used for the smoke test below and for building the Chroma index.
# NOTE(review): presumably downloads the model weights on first use - confirm.
instructor_embeddings = HuggingFaceInstructEmbeddings(model_name=Embedding_Model)

In [None]:
# Smoke test: embed one sample sentence to check that the embedding model
# works. `query_result` is not used by any later cell shown here.
text = "This is a test document."
query_result = instructor_embeddings.embed_query(text)

In [None]:
#query_result

## 4. Load LLM

In [6]:
import torch
import transformers
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

In [7]:
# Load the tokenizer and the seq2seq model for `LLM_Model`. Both are handed
# to the text2text-generation pipeline in the next cell.
tokenizer = AutoTokenizer.from_pretrained(LLM_Model)
# Weights are requested in 32-bit floating point.
model = AutoModelForSeq2SeqLM.from_pretrained(LLM_Model, torch_dtype=torch.float32)

In [14]:
from transformers import pipeline
from langchain.llms import HuggingFacePipeline

# Wrap the seq2seq model and its tokenizer in a Hugging Face
# text2text-generation pipeline, then expose that pipeline to LangChain.
#
# Decoding is deterministic (greedy), now stated explicitly with
# `do_sample=False`. The previous `temperature=0` and `top_p=0.95` arguments
# were removed: they are sampling-only options that have no effect unless
# sampling is enabled, they produce warnings in recent transformers releases,
# and 0 is not a valid sampling temperature (it must be strictly positive).
pipe = pipeline(
    "text2text-generation",
    model=model,
    tokenizer=tokenizer,
    max_length=512,  # upper bound on the generated sequence length
    do_sample=False,
    repetition_penalty=1.15,  # values above 1 discourage repeated tokens
)

# LangChain LLM wrapper around the pipeline; used by the RetrievalQA chain.
llm = HuggingFacePipeline(pipeline=pipe)

In [15]:
# Smoke test: query the bare LLM, without any retrieval, and print its answer.
# NOTE: the recorded output below ("calcutta") is factually wrong - the
# capital of India is New Delhi. Treat answers from the model alone with
# caution.
print(llm('What is the capital of India?'))

calcutta


## 5. Create index

In [36]:
from langchain.vectorstores import Chroma

In [37]:
# Embed every chunk in `texts` with `instructor_embeddings` and store the
# vectors in a Chroma index backed by the `persist_directory` folder.
persist_directory = 'db'
db = Chroma.from_documents(
    documents=texts,
    embedding=instructor_embeddings,
    persist_directory=persist_directory,
)
# Flush the index to disk now. In a notebook the store object stays alive for
# the whole session, so with older Chroma backends nothing may be written
# until it is destroyed. Newer backends persist automatically.
db.persist()

## 6. Create retriever from index and chain it with LLM

In [38]:
# Expose the Chroma index as a retriever for the QA chain.
# Alternative kept for reference: use the retriever's default search settings.
#retriever = db.as_retriever()
# NOTE(review): "k" is presumably the number of chunks returned per query -
# confirm against the installed LangChain version.
retriever = db.as_retriever(search_kwargs={"k": 3})

In [39]:
from langchain.chains import RetrievalQA

In [40]:
# Build the question-answering chain: `retriever` supplies the context
# chunks and `llm` generates the answer, using the "stuff" chain type.
# The chain is also asked to return the source documents it used.
qa = RetrievalQA.from_chain_type(
    llm=llm,
    chain_type="stuff",
    retriever=retriever,
    return_source_documents=True,
)

## 7. Query the chain

In [None]:
# Run the retrieval QA chain on a question. The text is a placeholder:
# replace it with a real question.
question = "What is ... ?"
# `qa` was built with return_source_documents=True, so despite its name
# `generated_text` is presumably a dict holding the answer together with the
# retrieved source documents, not a plain string - confirm against the
# installed LangChain version.
generated_text = qa(question)
# Bare expression as the last line of the cell so the notebook displays it.
generated_text