# RAG

## Requirements

In [1]:
%%capture
!pip install transformers accelerate bitsandbytes langchain langchain-community sentence-transformers faiss-gpu pandas gdown

## Dataset

In [2]:
!gdown --fuzzy https://drive.google.com/file/d/1Lq2zVJlN_B4kUAu4VafQ4jXMIQiAR9vI/view?usp=sharing

Downloading...
From (original): https://drive.google.com/uc?id=1Lq2zVJlN_B4kUAu4VafQ4jXMIQiAR9vI
From (redirected): https://drive.google.com/uc?id=1Lq2zVJlN_B4kUAu4VafQ4jXMIQiAR9vI&confirm=t&uuid=c5356571-97e3-437c-95d8-5facad765e92
To: /content/IMDB_crawled.json
100% 292M/292M [00:02<00:00, 105MB/s]


## Config

In [3]:
class Config:
    EMBEDDING_MODEL_NAME="thenlper/gte-base"
    LLM_MODEL_NAME="HuggingFaceH4/zephyr-7b-beta"
    K = 5 # top K retrieval

## Preprocessing

In [4]:
import pandas as pd

df = pd.read_json('IMDB_crawled.json')

In [6]:
df_top = df.head(5)
print(df_top)

          id                                              title  \
0  tt0071562                              The Godfather Part II   
1  tt0120737  The Lord of the Rings: The Fellowship of the Ring   
2  tt0110912                                       Pulp Fiction   
3  tt0068646                                      The Godfather   
4  tt0111161                           The Shawshank Redemption   

                                  first_page_summary release_year   mpaa  \
0  The early life and career of Vito Corleone in ...         1974      R   
1  A meek Hobbit from the Shire and eight compani...         2001  PG-13   
2  The lives of two mob hitmen, a boxer, a gangst...         1994      R   
3  The aging patriarch of an organized crime dyna...         1972      R   
4  Over the course of several years, two convicts...         1994      R   

                    budget gross_worldwide  rating               directors  \
0  $13,000,000 (estimated)     $47,962,683     9.0  [Francis F

In [7]:
import os

os.makedirs('data', exist_ok=True)

# preprocess your data and only store the needed data as the context window for embedding model is limited

needed_columns = ['title', 'first_page_summary', 'summaries', 'stars', 'genres', 'directors']
df2 = df[needed_columns]


df2.to_csv('data/imdb.csv', index=False)

## Vectorizer

load the CSV file and vectorize the rows using HuggingFaceEmbeddings.
Store the results using FAISS vectorstore.
Save the vectorestore in a pickle file for future usages.

In [10]:
import pickle

from langchain.document_loaders.csv_loader import CSVLoader
from langchain.vectorstores.utils import DistanceStrategy
from langchain.vectorstores.faiss import FAISS

from langchain_community.embeddings import HuggingFaceEmbeddings

# load the csv

# load the embeddings model

# save embed the documents using the model in a vectorstore

# with open("data/vectorstore.pkl", "wb") as f:
#     pickle.dump(vectorstore, f)
csv_file = 'data/imdb.csv'
loader = CSVLoader(file_path=csv_file)

documents = loader.load()

embeddings_model = HuggingFaceEmbeddings(model_name=Config.EMBEDDING_MODEL_NAME)

texts = [doc.page_content for doc in documents]

vectorstore = FAISS.from_texts(texts, embeddings_model, metadatas=[doc.metadata for doc in documents])

with open("data/vectorstore.pkl", "wb") as f:
    pickle.dump(vectorstore, f)


*load* the vectorstore as a retriever.*italicized text*

In [11]:
# with open("data/vectorstore.pkl", "rb") as f:
#     vectorstore = pickle.load(f)

with open("data/vectorstore.pkl", "rb") as f:
    vectorstore = pickle.load(f)

retriever = vectorstore.as_retriever()


## LLM

*load* the quantized LLM.

In [20]:
import torch

from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
from transformers import pipeline

from langchain_community.llms.huggingface_pipeline import HuggingFacePipeline

# load the quantization config
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16
)


model = AutoModelForCausalLM.from_pretrained(Config.LLM_MODEL_NAME, quantization_config=bnb_config, device_map="cuda:0")
tokenizer = AutoTokenizer.from_pretrained(Config.LLM_MODEL_NAME)

# init the pipeline
READER_LLM = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    max_new_tokens=512,
    do_sample=True,
    temperature=0.7,
    top_p=0.95,
    repetition_penalty=1.15
)


llm = HuggingFacePipeline(
    pipeline=READER_LLM,
)

Loading checkpoint shards:   0%|          | 0/8 [00:00<?, ?it/s]

initialize the prompt template for the query chain. query chain is used to get a query from the chat history. you may change the prompt as you like to get better results.

In [21]:
from langchain.prompts import PromptTemplate
from langchain.chains import LLMChain

from langchain_core.output_parsers import StrOutputParser

class LoggerStrOutputParser(StrOutputParser):
    def parse(self, text: str) -> str:
        # process the LLM output
        print(f"QUERY: {text}")
        return text

query_transform_prompt = PromptTemplate(
    input_variables=["messages"],
    template="""<|system|>You are a helpful assistant.
{messages}
<|user|>
give me the search query about the above conversation.
<|assistant|>"""
)

# init the query chain
query_transforming_retriever_chain = LLMChain(
    llm=llm,
    prompt=query_transform_prompt,
    output_parser=LoggerStrOutputParser()
)

initialize the main retrieval chain that gives the resulting documents to LLM and gets the output back.

In [22]:
from langchain.chains.combine_documents import create_stuff_documents_chain

from langchain_core.runnables import RunnablePassthrough

prompt = PromptTemplate(
    input_variables=["context", "messages"],
    template="""<|system|>You are a helpful assistant.

Here are the movies you MUST choose from:

{context}
-----------------
{messages}
<|assistant|>""")

# init the retriver chain
retrieval_chain = (
    {
        "context": lambda x: retriever.get_relevant_documents(x["query"]),
        "messages": RunnablePassthrough()
    }
    | prompt
    | llm
    | StrOutputParser()
)

write the conversation helper class for easier testing.

In [24]:
class Conversation:
    def __init__(self):
        self.messages = []

    def add_assistant_message(self, message):
        self.messages.append(('assistant', message))

    def add_user_message(self, message):
        self.messages.append(('user', message))

    def get_messages(self):
        # concatenate the messages with the roles in the instruction format
        return "\n".join([f"<|{role}|>{message}" for role, message in self.messages])


    def chat(self, message):
      self.add_user_message(message)
      messages = self.get_messages()

      transformed_query = query_transforming_retriever_chain.run(messages)

      response = retrieval_chain.invoke({
          "messages": messages,
          "query": transformed_query
      })

      self.add_assistant_message(response)
      return response

## Test

talk with the RAG to see how good it performs.

In [25]:
c = Conversation()
A = c.chat('give me a cool gangster movie')
print(A)

QUERY: <|system|>You are a helpful assistant.
<|user|>give me a cool gangster movie
<|user|>
give me the search query about the above conversation.
<|assistant|>
"Recommend a stylish and intense gangster movie that will keep me on the edge of my seat with captivating performances and gritty action scenes."
<|system|>You are a helpful assistant.

Here are the movies you MUST choose from:

[Document(page_content="title: Scarface\nfirst_page_summary: \nsummaries: \nstars: \ngenres: ['Comedy', 'Crime', 'Drama']\ndirectors: ", metadata={'source': 'data/imdb.csv', 'row': 4195}), Document(page_content="title: American Gangster\nfirst_page_summary: Follows the lives of American gangsters.\nsummaries: ['Follows the lives of American gangsters.']\nstars: ['Barry Michael Cooper', 'Nelson George', 'Ving Rhames']\ngenres: ['Documentary']\ndirectors: ", metadata={'source': 'data/imdb.csv', 'row': 8951}), Document(page_content='title: Oscar\nfirst_page_summary: A gangster attempts to keep the promise

In [26]:
A = c.chat('give me a newer one')
print(A)

QUERY: <|system|>You are a helpful assistant.
<|user|>give me a cool gangster movie
<|assistant|><|system|>You are a helpful assistant.

Here are the movies you MUST choose from:

[Document(page_content="title: Scarface\nfirst_page_summary: \nsummaries: \nstars: \ngenres: ['Comedy', 'Crime', 'Drama']\ndirectors: ", metadata={'source': 'data/imdb.csv', 'row': 4195}), Document(page_content="title: American Gangster\nfirst_page_summary: Follows the lives of American gangsters.\nsummaries: ['Follows the lives of American gangsters.']\nstars: ['Barry Michael Cooper', 'Nelson George', 'Ving Rhames']\ngenres: ['Documentary']\ndirectors: ", metadata={'source': 'data/imdb.csv', 'row': 8951}), Document(page_content='title: Oscar\nfirst_page_summary: A gangster attempts to keep the promise he made to his dying father: that he would give up his life of crime and "go straight".\nsummaries: [\'A gangster attempts to keep the promise he made to his dying father: that he would give up his life of crim