<a href="https://colab.research.google.com/github/Yanzhiii/LearnRAG/blob/main/LearnRAG.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Install the third-party packages used further down in this notebook:
# rank_bm25 (BM25 baseline retriever), chromadb (vector store),
# bitsandbytes + accelerate (quantized model loading), langchain_community
# (PDF loader) and peft (LoRA adapter). -q keeps pip quiet, -U upgrades.
!pip install -q rank_bm25
!pip install -q chromadb
!pip install -qU bitsandbytes accelerate
!pip install -q langchain_community
!pip install -qU peft

In [None]:
import numpy as np
import string
import json
import os
from rank_bm25 import BM25Okapi
import torch
import gc
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
from google.colab import userdata
from peft import PeftModel

import requests
from bs4 import BeautifulSoup

import textwrap

In [None]:
# Choose the compute device once: CUDA when a GPU is visible, CPU otherwise.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
if device.type == "cuda":
    print("Using GPU:", torch.cuda.get_device_name(0))
else:
    print("Using CPU")

Using GPU: Tesla T4


In [None]:
# Prerequisite: store a Hugging Face access token under the name HF_TOKEN in Colab Secrets.
# The secret is read here and exported as the HF_TOKEN environment variable.
# Presumably required to download the meta-llama checkpoints loaded below — confirm.
hf_token = userdata.get('HF_TOKEN')
os.environ['HF_TOKEN'] = hf_token

In [None]:
# Pipeline switch checked by the guarded cells below:
#   False -> run the baseline cells (web scraping, BM25, plain Llama-3.2 generation)
#   True  -> run the RAFT fine-tuned generator cell instead
improved_rag = False
debug_mode = True    # To skip certain cells. NOTE(review): no cell visible in this notebook reads this flag.

In [None]:
# The user question: passed to both retrievers and interpolated into every prompt below.
query = "What's the meaning of embeddings in NLP?"

## Data Collector

### Baseline Data Collector

In [None]:
if improved_rag:
  print("Skip this cell")
else:
  print("Run this cell")

  def scrape_p(url, timeout=30):
      """Download a web page and return the text of each of its <p> elements.

      Args:
          url: Address of the page to fetch.
          timeout: Seconds to wait for the server before giving up.

      Returns:
          A list of strings, one per <p> tag, in document order.

      Raises:
          requests.RequestException: On connection errors, timeouts, or a
              4xx/5xx HTTP status.
      """
      response = requests.get(url, timeout=timeout)
      # Fail loudly rather than indexing an error page as if it were knowledge.
      response.raise_for_status()
      soup = BeautifulSoup(response.content, "html.parser")
      return [p.get_text() for p in soup.find_all("p")]

  doc_1 = "https://www.ibm.com/think/topics/machine-learning"
  doc_2 = "https://www.ibm.com/think/topics/support-vector-machine"
  # Drop duplicate paragraphs while keeping first-seen order, so `database`
  # (and therefore the BM25 document indices) is the same on every run.
  database = list(dict.fromkeys(scrape_p(doc_1) + scrape_p(doc_2)))

Skip this cell


## Retriever

### Baseline Retriever using BM25
BM25 repo: https://github.com/dorianbrown/rank_bm25

In [None]:
if improved_rag:
  print("Skip this cell")
else:
  print("Run this cell")

  top_n = 6
  punc = string.punctuation

  def _bm25_tokenize(text):
      """Lower-case `text`, split on any whitespace, strip edge punctuation, drop empty tokens.

      BM25 matches tokens exactly, so documents and the query must be
      normalised the same way for "Embeddings" and "embeddings" to match.
      """
      stripped = (word.strip(punc) for word in text.lower().split())
      return [word for word in stripped if word]

  # Whitespace-split view of each document, then the cleaned tokens BM25 indexes
  tokenized_docs = [doc.lower().split() for doc in database]
  clean_tokenized_docs = [_bm25_tokenize(doc) for doc in database]

  bm25 = BM25Okapi(clean_tokenized_docs)
  clean_tokenized_query = _bm25_tokenize(query)
  doc_scores = bm25.get_scores(clean_tokenized_query)   # one BM25 score per document
  # argsort orders indices from lowest to highest score: keep the last top_n, best first
  top_n_indices = np.argsort(doc_scores)[-top_n:][::-1]
  top_n_docs = [database[i] for i in top_n_indices]
  # NOTE(review): the prompt cells below read `retriver_output` (the Chroma results),
  # not `top_n_docs`, so this baseline ranking is currently not fed to the generator.

Skip this cell


### Improved Retriever using Chroma DB

In [None]:
import os
from langchain_text_splitters.character import RecursiveCharacterTextSplitter

# Loads the chromadb collection from text files
def load_collection(collection,docs_directory,chunk_size=100,chunk_overlap=30):
    """Chunk every file directly inside `docs_directory` and add the chunks to `collection`.

    Each file is read as UTF-8 text, split with RecursiveCharacterTextSplitter and
    stored under ids of the form ``<doc id>_<chunk index>``. The doc id is the file
    name with a trailing ``.txt`` removed and every ``_`` replaced by ``/``.

    Args:
        collection: Object exposing ``add(documents=..., ids=...)``, e.g. a chromadb collection.
        docs_directory: Directory holding the text files.
        chunk_size: Maximum number of characters per chunk.
        chunk_overlap: Number of characters shared by consecutive chunks.
    """
    # Take only the files of the top-level directory. The paths opened below are
    # built from docs_directory itself, so names found in sub-directories could
    # not be opened. A missing directory yields no files, as before.
    _, _, filenames = next(os.walk(docs_directory), (docs_directory, [], []))
    raw_docs = sorted(filenames)  # stable processing order across runs

    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        length_function=len,
        add_start_index=True,
    )

    for raw_doc in raw_docs:
        doc_id = raw_doc.removesuffix('.txt').replace('_','/')
        with open(os.path.join(docs_directory, raw_doc),"r",encoding="utf-8") as handle:
            # read() ingests the whole file; readline() would keep only its first line
            text = handle.read()
        chunks = text_splitter.split_text(text)
        if not chunks:
            continue  # empty file: nothing to add
        ids = [f"{doc_id}_{i}" for i in range(len(chunks))]
        collection.add(documents=chunks,ids=ids)

_Latest progress 25-03-2025_

In [None]:
import os
from langchain_text_splitters.character import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import PyPDFLoader

# Loads chromadb collection from pdfs
def load_collection_pdf(collection,docs_directory,chunk_size=300,chunk_overlap=100,max_docs=10):
    """Chunk the PDFs directly inside `docs_directory` and add the chunks to `collection`.

    The pages of each PDF are loaded with PyPDFLoader, joined with single spaces,
    split with RecursiveCharacterTextSplitter and stored under ids of the form
    ``<doc id>_<chunk index>``. The doc id follows the scheme of `load_collection`:
    the file name with a trailing ``.pdf`` removed and every ``_`` replaced by ``/``.

    Args:
        collection: Object exposing ``add(documents=..., ids=...)``, e.g. a chromadb collection.
        docs_directory: Directory holding the PDF files.
        chunk_size: Maximum number of characters per chunk.
        chunk_overlap: Number of characters shared by consecutive chunks.
        max_docs: Maximum number of files to ingest (default 10, the previously
            hard-coded limit). Pass None to ingest every file.
    """
    # Take only the files of the top-level directory. The loader paths below are
    # built from docs_directory itself, so names found in sub-directories could
    # not be opened. A missing directory yields no files, as before.
    _, _, filenames = next(os.walk(docs_directory), (docs_directory, [], []))
    raw_docs = sorted(filenames)  # makes the max_docs selection reproducible

    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        length_function=len,
        add_start_index=True,
    )

    selected_docs = raw_docs if max_docs is None else raw_docs[:max_docs]
    for raw_doc in selected_docs:
        # A per-file prefix keeps chunk ids unique across documents; with an empty
        # prefix every PDF would produce the same ids ("_0", "_1", ...).
        doc_id = raw_doc.removesuffix('.pdf').replace('_','/')
        loader = PyPDFLoader(os.path.join(docs_directory, raw_doc))
        pages = loader.load()
        text = ' '.join(page.page_content for page in pages)
        chunks = text_splitter.split_text(text)
        if not chunks:
            continue  # no extractable text: nothing to add
        ids = [f"{doc_id}_{i}" for i in range(len(chunks))]
        collection.add(documents=chunks,ids=ids)

In [None]:
!unzip '/content/chroma.zip'

In [None]:
import chromadb
# Open the on-disk Chroma database stored at /content/chroma.
# Presumably this is the directory produced by unzipping chroma.zip above — confirm.
chroma_client = chromadb.PersistentClient(path='/content/chroma')

In [None]:
# Sanity check: show which collections the opened database contains.
print(chroma_client.list_collections())

[Collection(name=all_collection)]


In [None]:
# Handle to the pre-built "all_collection" collection that the retriever cell queries.
# NOTE(review): the variable name `ml_sample` does not match the collection name.
ml_sample = chroma_client.get_collection(name="all_collection")

In [None]:
# Ask Chroma for the 5 chunks closest to the query and flatten the newlines inside
# each chunk. ['documents'][0] selects the results for the first (only) query text.
retriver_output = [
    chunk.replace('\n', ' ')
    for chunk in ml_sample.query(query_texts=[query], n_results=5)['documents'][0]
]

In [None]:
# Inspect the cleaned chunks returned by the retriever.
retriver_output

['Embeddings are long vectors of real numbers and provide a way to represent  the knowledge associated with the tokens. During training, a model implicitly  deﬁnes a representation space that determines the meaning of embeddings. Usually,  embeddings are assigned to tokens, i.e. parts of words, but may also be determined  for paragraphs and complete documents. If two embeddings have a small vector  distance, the meaning of the underlying tokens is similar. Foundation Models  generate increasingly reﬁned embeddings in their layers by taking into account the  context of the tokens. The word “bank” close to the word “money” has a different  embedding than a “bank” close to the word “river”, making the embeddings  contextual. These effects also apply to tokens of different media types.  Embeddings are calculated by self-attention computing correlations between  linear projections of input embeddings. This is done in parallel by multiple linear',
 'Internet to train the models. In the secon

## Prompts

In [None]:
# Join the retrieved chunks into one newline-separated knowledge-base string.
retrieved_docs = "\n".join(retriver_output)
# RAG prompt: instructions, then the knowledge base (first {}) and the question (second {}).
# A trailing backslash continues the string literal without inserting a newline;
# only the explicit \n sequences produce line breaks in the final prompt.
rag_prompts = "You are a learning assistance chatbot. Based only on the provided knowledge base, \
answer the user's question. If the knowledge base does not contain relevant information \
to answer the question, clearly state that you do not know and cannot answer \
based on the available information. When you can answer, explain clearly, \
providing relevant details and background information found within the text. \
Do not use any prior knowledge. \n\
Knowledge base: \n{}\n\
Question: {}\n\
Indicate what you refer to in the knowledge database briefly, \
Answer the question concisely with nice structure and fluent logic in one paragraph:\n".format(retrieved_docs, query)

# Baseline prompt without retrieved context: only the question is interpolated.
# NOTE(review): "with in one paragraph" reads like a typo; left unchanged here
# because editing the literal changes the prompt sent to the model.
plain_prompts = "You are a learning assistance chatbot answering the given question.\n\
Question: {}\n\
Following the instruction at beginning, answer the question in your words concisely with in one paragraph:\n\n".format(query)

In [None]:
# For fine-tuned RAFT model (Structured Messages Format)
# Format documents with tags to match training format: one <DOCUMENT> line per chunk
formatted_docs = "".join(f"<DOCUMENT>{doc}</DOCUMENT>\n" for doc in retriver_output)

# Adjacent string literals are concatenated as written. The previous backslash
# continuations combined a trailing space with the next line's leading space,
# which put double spaces into the prompt ("cannot  answer", "background  information").
system_prompt = (
    "You are a learning assistance chatbot. Based only on the provided knowledge base, answer the user's question."
    " If the knowledge base does not contain relevant information to answer the question, clearly state that you do not know and cannot"
    " answer based on the available information. When you can answer, explain clearly, providing relevant details and background"
    " information found within the text. Do not use any prior knowledge."
)
# User turn: the tagged documents, a blank line, then the question
user_prompt = f"{formatted_docs}\nQuestion: {query}"

messages = [
    {"role": "system", "content": system_prompt},
    {"role": "user", "content": user_prompt}
]

In [None]:
# Show the fully rendered RAG prompt (instructions + knowledge base + question).
print(rag_prompts)

You are a learning assistance chatbot. Based only on the provided knowledge base, answer the user's question. If the knowledge base does not contain relevant information to answer the question, clearly state that you do not know and cannot answer based on the available information. When you can answer, explain clearly, providing relevant details and background information found within the text. Do not use any prior knowledge. 
Knowledge base: 
Embeddings are long vectors of real numbers and provide a way to represent  the knowledge associated with the tokens. During training, a model implicitly  deﬁnes a representation space that determines the meaning of embeddings. Usually,  embeddings are assigned to tokens, i.e. parts of words, but may also be determined  for paragraphs and complete documents. If two embeddings have a small vector  distance, the meaning of the underlying tokens is similar. Foundation Models  generate increasingly reﬁned embeddings in their layers by taking into acc

In [None]:
# Show the rendered prompt that contains no retrieved context.
print(plain_prompts)

You are a learning assistance chatbot answering the given question.
Question: What's the meaning of embeddings in NLP?
Following the instruction at beginning, answer the question in your words concisely with in one paragraph:




In [None]:
# Inspect the system/user messages built for the RAFT fine-tuned model.
messages

[{'role': 'system',
  'content': "You are a learning assistance chatbot. Based only on the provided knowledge base, answer the user's question. If the knowledge base does not contain relevant information to answer the question, clearly state that you do not know and cannot answer based on the available information. When you can answer, explain clearly, providing relevant details and background information found within the text. Do not use any prior knowledge."},
 {'role': 'user',
  'content': "<DOCUMENT>Embeddings are long vectors of real numbers and provide a way to represent  the knowledge associated with the tokens. During training, a model implicitly  deﬁnes a representation space that determines the meaning of embeddings. Usually,  embeddings are assigned to tokens, i.e. parts of words, but may also be determined  for paragraphs and complete documents. If two embeddings have a small vector  distance, the meaning of the underlying tokens is similar. Foundation Models  generate incr

## Generator
Language model: https://huggingface.co/TinyLlama/TinyLlama-1.1B-Chat-v1.0

In [None]:
# Disabled TinyLlama baseline: the code is wrapped in a string literal, so this
# cell loads no model and only echoes the string as its output.
'''
if improved_rag:
  print("Skip this cell")
else:
  print("Run this cell")

  model_name = 'TinyLlama/TinyLlama-1.1B-Chat-v1.0'
  tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
  model = AutoModelForCausalLM.from_pretrained(model_name, trust_remote_code=True).to(device)

  inputs = tokenizer.encode(rag_prompts, return_tensors='pt').to(device)    # rag_prompts / plain_prompts
  outputs = model.generate(inputs, max_length=1500, num_return_sequences=1, repetition_penalty=1.2)
  answer = tokenizer.decode(outputs[0], skip_special_tokens=True)

  print(textwrap.fill(answer[len(rag_prompts):], width=100))
'''

'\nif improved_rag:\n  print("Skip this cell")\nelse:\n  print("Run this cell")\n\n  model_name = \'TinyLlama/TinyLlama-1.1B-Chat-v1.0\'\n  tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)\n  model = AutoModelForCausalLM.from_pretrained(model_name, trust_remote_code=True).to(device)\n\n  inputs = tokenizer.encode(rag_prompts, return_tensors=\'pt\').to(device)    # rag_prompts / plain_prompts\n  outputs = model.generate(inputs, max_length=1500, num_return_sequences=1, repetition_penalty=1.2)\n  answer = tokenizer.decode(outputs[0], skip_special_tokens=True)\n\n  print(textwrap.fill(answer[len(rag_prompts):], width=100))\n'

Language model: https://huggingface.co/meta-llama/Llama-3.2-3B-Instruct

In [None]:
# For original approach - without the fine-tuned model
if improved_rag:
  print("Skip this cell")
else:
  print("Run this cell")

  # Configuring 4-bit Quantization Parameters
  quantization_config = BitsAndBytesConfig(
      load_in_4bit=True,
      bnb_4bit_quant_type='nf4',
      bnb_4bit_compute_dtype=torch.float16
  )

  # Loading the model and its tokenizer
  model_id_1 = "meta-llama/Llama-3.2-3B-Instruct"
  tokenizer_1 = AutoTokenizer.from_pretrained(model_id_1)
  model_1 = AutoModelForCausalLM.from_pretrained(
      model_id_1,
      quantization_config=quantization_config,
      device_map="auto"
  )

  # Inference function
  def generate_response(prompt, model, tokenizer, max_length=8000):
      """Generate a completion for `prompt` and return only the newly generated text.

      Args:
          prompt: Plain-text prompt to tokenize and feed to the model.
          model: Causal language model exposing `generate` and `device`.
          tokenizer: Tokenizer matching `model`.
          max_length: Upper bound on the number of NEW tokens to generate
              (forwarded as `max_new_tokens`; the prompt is not counted).

      Returns:
          The decoded continuation with special tokens removed and outer
          whitespace stripped.
      """
      # Send the inputs to wherever the model was placed (device_map="auto")
      # instead of assuming a device literally named "cuda".
      inputs = tokenizer(prompt, return_tensors="pt").to(model.device)

      # Remember the prompt length so it can be sliced off the output
      prompt_length = inputs.input_ids.shape[1]
      outputs = model.generate(
          **inputs,
          max_new_tokens=max_length,
          pad_token_id=tokenizer.eos_token_id
      )

      # Only decode the newly generated tokens
      response = tokenizer.decode(outputs[0][prompt_length:], skip_special_tokens=True)
      return response.strip()

  # Generate an answer
  answer = generate_response(rag_prompts, model_1, tokenizer_1)
  print(textwrap.fill(answer, width=100))

  # Release VRAM so the next model can be loaded
  del model_1
  del tokenizer_1
  gc.collect()
  torch.cuda.empty_cache()

Run this cell


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Embeddings in NLP refer to long vectors of real numbers that represent the knowledge associated with
tokens, such as words or phrases. These vectors are learned during training and implicitly define a
representation space that determines the meaning of the tokens. The meaning of two tokens with
similar embeddings is similar, indicating that the embeddings capture semantic relationships between
tokens. In the context of NLP, embeddings are calculated by self-attention computing correlations
between linear projections of input embeddings, allowing models to capture subtle semantic
properties of language. (Source: Knowledge Base)


LoRA adapter: https://huggingface.co/Yanzhii/llama-3.2-3b-raft-adapter

In [None]:
if improved_rag:
  print("Run this cell")

  # Configuration for model loading
  quantization_config = BitsAndBytesConfig(
      load_in_4bit=True,
      bnb_4bit_quant_type='nf4',
      bnb_4bit_compute_dtype=torch.float16
  )

  # Variables setup
  base_model_id = "meta-llama/Llama-3.2-3B-Instruct"
  hf_username = "Yanzhii"  # Hugging Face username
  adapter_name = "llama-3.2-3b-raft-adapter"  # Adapter name
  adapter_repo_id = f"{hf_username}/{adapter_name}"

  print(f"Loading base model: {base_model_id}")

  # Load tokenizer and base model
  tokenizer_ft = AutoTokenizer.from_pretrained(base_model_id)
  base_model = AutoModelForCausalLM.from_pretrained(
      base_model_id,
      quantization_config=quantization_config,
      device_map="auto"
  )

  print(f"Loading adapter: {adapter_repo_id}")

  # Wrap the base model with the LoRA adapter (the adapter weights are attached, not merged)
  model_ft = PeftModel.from_pretrained(base_model, adapter_repo_id)
  model_ft.eval()  # Set to evaluation mode

  def _split_cot_and_answer(response_text):
      """Split raw model output into (chain of thought, answer) at the first <ANSWER> tag."""
      cot_part, tag, answer_part = response_text.partition("<ANSWER>")
      if not tag:
          # No tag emitted: the whole response is the answer and there is no
          # separate reasoning part (previously the text was returned twice).
          cot_part, answer_part = "", response_text
      cot_part = cot_part.strip()
      answer_part = answer_part.strip()

      # Remove leading colon and spaces from answer if present
      if answer_part.startswith(":"):
          answer_part = answer_part[1:].strip()

      return cot_part, answer_part

  # Generate response and separate CoT and answer parts
  def generate_raft_response(messages, max_length=2000):
      """Run the fine-tuned model on chat `messages` and split its output.

      Uses the notebook-level `tokenizer_ft` and `model_ft`.

      Args:
          messages: List of {"role": ..., "content": ...} dicts in chat format.
          max_length: Upper bound on the number of NEW tokens to generate
              (forwarded as `max_new_tokens`).

      Returns:
          Tuple `(cot_part, answer_part)`: the text before the first <ANSWER>
          tag and the text after it. When the tag is absent, `cot_part` is
          empty and `answer_part` is the whole response.
      """
      # return_dict=True yields input_ids together with the matching attention
      # mask, so no mask has to be built by hand; the tensors follow the model's
      # device instead of a hard-coded "cuda".
      inputs = tokenizer_ft.apply_chat_template(
          messages,
          add_generation_prompt=True,
          return_tensors="pt",
          return_dict=True
      ).to(model_ft.device)
      prompt_length = inputs["input_ids"].shape[-1]

      # Generate response; temperature/top_p only apply when sampling is on,
      # so request it explicitly rather than relying on the model's defaults
      outputs = model_ft.generate(
          **inputs,
          max_new_tokens=max_length,
          pad_token_id=tokenizer_ft.eos_token_id,
          do_sample=True,
          temperature=0.7,
          top_p=0.9
      )

      # Decode only the generated part
      response_ids = outputs[0][prompt_length:]
      response_text = tokenizer_ft.decode(response_ids, skip_special_tokens=True)

      return _split_cot_and_answer(response_text)

  # Generate answer with fine-tuned model
  print("Generating response using RAFT fine-tuned model...\n")
  raft_cot, raft_answer = generate_raft_response(messages)

  print("Chain of Thought:")
  print(textwrap.fill(raft_cot, width=100))

  print("\nFinal Answer:")
  print(textwrap.fill(raft_answer, width=100))

  # Release VRAM
  del model_ft
  del base_model
  del tokenizer_ft
  gc.collect()
  torch.cuda.empty_cache()

else:
  print("Skip this cell")

Run this cell
Loading base model: meta-llama/Llama-3.2-3B-Instruct


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Loading adapter: Yanzhii/llama-3.2-3b-raft-adapter
Generating response using RAFT fine-tuned model...

Chain of Thought:
To answer the question about the meaning of embeddings in NLP, we need to carefully examine the
context provided.  1. **Contextual Clues**: The context discusses the use of embeddings in Natural
Language Processing (NLP). It mentions that embeddings are long vectors of real numbers and provide
a way to represent the knowledge associated with the tokens.  2. **Key Information**: The context
highlights the role of embeddings in determining the meaning of embeddings. It states that during
training, a model implicitly defines a representation space that determines the meaning of
embeddings. This indicates that embeddings are not just numerical representations but also carry
semantic meaning.  3. **Semantic Interpretation**: The context suggests that embeddings are
contextual, meaning they are influenced by the context in which they appear. For example, embeddings
from th