# RAG

In [2]:
# langchain
! pip install langchain langchain_community
!pip install -q langchain
!pip install -q torch
!pip install -q transformers
!pip install -q sentence-transformers
!pip install -q datasets
!pip install -q faiss-cpu

In [1]:
from langchain.document_loaders import HuggingFaceDatasetLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings import HuggingFaceEmbeddings
# Vector store / index options: Pinecone, LlamaIndex, Qdrant, FAISS
from langchain.vectorstores import FAISS
from transformers import AutoTokenizer, AutoModelForQuestionAnswering
from transformers import AutoTokenizer, pipeline
from langchain import HuggingFacePipeline
# RAG = retrieval + generation
# RetrievalQA: a RAG chain provided by LangChain; answer = LLM(user query + retrieved documents)
from langchain.chains import RetrievalQA

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Hugging Face access token. Credentials must never be hardcoded in a
# notebook, so the token is read from the HF_TOKEN environment variable.
# The token literal that used to be assigned here has been exposed and
# should be revoked. `token` is not passed to the loader below, so it may
# legitimately be None when the variable is unset.
import os

token = os.environ.get("HF_TOKEN")

# Dataset to load, and the column whose text becomes each Document's
# page_content (the remaining columns end up in Document.metadata).
dataset_name = "databricks/databricks-dolly-15k"
page_content_column = "context"

# Create a loader instance
loader = HuggingFaceDatasetLoader(dataset_name, page_content_column)

# Load the data
data = loader.load()

# Display the first 5 entries
data[:5]



[Document(metadata={'instruction': 'When did Virgin Australia start operating?', 'response': 'Virgin Australia commenced services on 31 August 2000 as Virgin Blue, with two aircraft on a single route.', 'category': 'closed_qa'}, page_content='"Virgin Australia, the trading name of Virgin Australia Airlines Pty Ltd, is an Australian-based airline. It is the largest airline by fleet size to use the Virgin brand. It commenced services on 31 August 2000 as Virgin Blue, with two aircraft on a single route. It suddenly found itself as a major airline in Australia\'s domestic market after the collapse of Ansett Australia in September 2001. The airline has since grown to directly serve 32 cities in Australia, from hubs in Brisbane, Melbourne and Sydney."'),
 Document(metadata={'instruction': 'Which is a species of fish? Tope or Rope', 'response': 'Tope', 'category': 'classification'}, page_content='""'),
 Document(metadata={'instruction': 'Why can camels survive for long without water?', 'resp

In [3]:
# Character-based text splitter: target chunk size of 1000 characters with
# 150 characters of overlap between consecutive chunks. The splitter prefers
# to break at natural separators such as whitespace.
# Each document is split on its own: splitting starts fresh for every
# document, independently of the others.
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=150)

# `data` holds the loaded documents; split them into chunk-sized documents.
docs = text_splitter.split_documents(data)

# Preview only a handful of chunks: displaying thousands of Documents bloats
# the notebook file and buries the rest of the narrative.
docs[:5]

[Document(metadata={'instruction': 'When did Virgin Australia start operating?', 'response': 'Virgin Australia commenced services on 31 August 2000 as Virgin Blue, with two aircraft on a single route.', 'category': 'closed_qa'}, page_content='"Virgin Australia, the trading name of Virgin Australia Airlines Pty Ltd, is an Australian-based airline. It is the largest airline by fleet size to use the Virgin brand. It commenced services on 31 August 2000 as Virgin Blue, with two aircraft on a single route. It suddenly found itself as a major airline in Australia\'s domestic market after the collapse of Ansett Australia in September 2001. The airline has since grown to directly serve 32 cities in Australia, from hubs in Brisbane, Melbourne and Sydney."'),
 Document(metadata={'instruction': 'Which is a species of fish? Tope or Rope', 'response': 'Tope', 'category': 'classification'}, page_content='""'),
 Document(metadata={'instruction': 'Why can camels survive for long without water?', 'resp

In [10]:
# An embedding is the numeric (vector) form of a token, word, or sentence.
#   - token-level embedding: one vector per token (n vectors for n tokens)
#   - sentence-level embedding: one vector for the whole input text
# The sentence-transformers checkpoint below is used for sentence-level
# embeddings that capture semantic meaning.
modelPath = "sentence-transformers/all-MiniLM-l6-v2"

# Model options: run the computations on the CPU.
model_kwargs = dict(device='cpu')

# Encoding options: leave the embedding vectors un-normalized.
encode_kwargs = dict(normalize_embeddings=False)

# Embedding function shared by the rest of the notebook.
embeddings = HuggingFaceEmbeddings(
    encode_kwargs=encode_kwargs,
    model_kwargs=model_kwargs,
    model_name=modelPath,
)

# Sanity check: embed a short text and show the vector length
# (the recorded output of this cell is 384).
text = " Eastern spirituality,"
query_result = embeddings.embed_query(text)
len(query_result)

384

In [None]:
# Store the chunks in a FAISS (Facebook AI Similarity Search) vector index.
# Every chunk in `docs` is embedded with `embeddings`; at query time the
# question is embedded the same way and the nearest stored vectors are
# returned. Because matching happens in embedding space, a passage can be
# found on meaning even when it shares no keywords with the question.
db = FAISS.from_documents(docs, embeddings)

# Smoke test: run one similarity search and print the closest chunk.
question = "What is cheesemaking?"
searchDocs = db.similarity_search(question)
best_match = searchDocs[0]
print(best_match.page_content)

In [None]:
# Checkpoint used for the question-answering pipeline.
model_name = "Intel/dynamic_tinybert"

# Tokenizer that belongs to the checkpoint above.
tokenizer = AutoTokenizer.from_pretrained(
    model_name,
    padding=True,
    truncation=True,
    max_length=512,
)

# Hugging Face question-answering pipeline built from the model and tokenizer.
question_answerer = pipeline(
    task="question-answering",
    model=model_name,
    tokenizer=tokenizer,
    return_tensors="pt",
)

# Wrap the pipeline so LangChain chains can use it as an LLM, passing
# temperature and max_length as model arguments.
# NOTE(review): LangChain's HuggingFacePipeline appears to support only
# text-generation-style tasks (text-generation, text2text-generation,
# summarization, translation). A "question-answering" pipeline expects
# question/context inputs rather than a prompt string, so this `llm` may
# fail when a chain calls it -- confirm, and consider a generative
# checkpoint instead.
generation_settings = {"temperature": 0.7, "max_length": 512}
llm = HuggingFacePipeline(
    pipeline=question_answerer,
    model_kwargs=generation_settings,
)

In [None]:
# Expose the vector store as a retriever and fetch the chunks most relevant
# to a sample query.
# The hits get their own name on purpose: rebinding `docs` here would replace
# the split chunks that the FAISS-index cell reads, so re-running that cell
# afterwards would build the index from these few hits only.
retriever = db.as_retriever()
retrieved_docs = retriever.get_relevant_documents("What is Cheesemaking?")
print(retrieved_docs[0].page_content)

Full model

In [None]:
# Create a retriever from `db` that returns up to 4 relevant chunks per query.
retriever = db.as_retriever(search_kwargs={"k": 4})

# Build the RetrievalQA chain: the language model (`llm`) answers using the
# "refine" chain type over the retrieved chunks; source documents are not
# included in the output.
# final output = LLM(retrieved documents + user query)
qa = RetrievalQA.from_chain_type(
    llm=llm,
    chain_type="refine",
    retriever=retriever,
    return_source_documents=False,
)

question = "Who is Thomas Jefferson?"
# `qa.run(...)` returns only the answer string, so indexing its result with
# ["result"] raises TypeError. `invoke` returns the chain's output dict, whose
# "result" key holds the answer.
result = qa.invoke({"query": question})
print(result["result"])

In [1]:
# Stand-alone example: load a Hugging Face tokenizer and sequence
# classification model from the same checkpoint, then run one sentence
# through the model.
from transformers import AutoTokenizer, AutoModelForSequenceClassification

checkpoint = "distilbert-base-uncased-finetuned-sst-2-english"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModelForSequenceClassification.from_pretrained(checkpoint)

# Tokenize into PyTorch tensors and display the raw model output (logits).
text = "I love programming!"
tokens = tokenizer(text, padding=True, truncation=True, return_tensors="pt")
model(**tokens)

  from .autonotebook import tqdm as notebook_tqdm


SequenceClassifierOutput(loss=None, logits=tensor([[-4.3057,  4.6439]], grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)

In [None]:
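# SECURITY: two Hugging Face access tokens were written here in plain text.
# They have been redacted and must be revoked; load tokens from the
# environment (e.g. HF_TOKEN) instead of storing them in the notebook.
# access_token_vscode="<REDACTED>"
# read_vscode_token="<REDACTED>"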

# from transformers import AutoModelForCausalLM, AutoTokenizer
# model = AutoModelForCausalLM.from_pretrained("mistralai/Mistral-7B-v0.1")
# tokenizer = AutoTokenizer.from_pretrained("mistralai/Mistral-7B-v0.1")

# prompt = "My favorite condiment is"
# model_inputs = tokenizer([prompt], return_tensors="pt")
# generated_ids = model.generate(**model_inputs, max_new_tokens=100, do_sample=True)
# generated_text = tokenizer.batch_decode(generated_ids)[0]
# print("Generated text:", generated_text)


In [19]:
# Import every prompt class this cell uses. ChatPromptTemplate and
# FewShotChatMessagePromptTemplate were previously referenced without an
# import, so the cell raised NameError on a fresh kernel.
from langchain.prompts import (
    ChatPromptTemplate,
    FewShotChatMessagePromptTemplate,
    PromptTemplate,
)

# --- Plain prompt template with a single input variable ---
template = """
You are an expert data scientist with an expertise in building deep learning models. 
Explain the concept of {concept} in a couple of lines
"""

prompt = PromptTemplate(
    input_variables=["concept"],
    template=template)
print(prompt.format(concept="asdf"))

# --- Few-shot chat prompt built from worked examples ---
examples = [
    {"input": "2+2", "output": "4"},
    {"input": "2+3", "output": "5"}]

# This is a prompt template used to format each individual example.
example_prompt = ChatPromptTemplate.from_messages(
    [
        ("human", " our input is :{input}"),
        ("ai", " final output is :{output}"),
    ]
)
# Renders every example through `example_prompt`, one human/AI pair each.
few_shot_prompt = FewShotChatMessagePromptTemplate(
    example_prompt=example_prompt,
    examples=examples,
)

print(few_shot_prompt.format())


You are an expert data scientist with an expertise in building deep learning models. 
Explain the concept of asdf in a couple of lines

Human:  our input is :2+2
AI:  final output is :4
Human:  our input is :2+3
AI:  final output is :5
