# Semantic Search & RAG with LlamaIndex
## ABB #6 - Session 3

Code authored by: Shaw Talebi

### imports

In [1]:
from IPython.display import display, Markdown
from bs4 import BeautifulSoup

from llama_index.core import VectorStoreIndex, get_response_synthesizer, Settings
from llama_index.core.schema import TextNode
from llama_index.core.retrievers import VectorIndexRetriever
from llama_index.core.query_engine import RetrieverQueryEngine
from llama_index.core.postprocessor import SimilarityPostprocessor

In [2]:
from dotenv import load_dotenv
import os

# Load variables from the local .env file into the process environment,
# then read the OpenAI secret key from it (None when the variable is unset)
load_dotenv()
my_sk = os.environ.get("OPENAI_API_KEY")

### 1) chunk articles

In [3]:
# Directory that holds the article HTML files
ARTICLE_DIR = "articles"
# Text elements shorter than this many characters are skipped
MIN_CHUNK_CHARS = 30

# List every file in the articles directory.
# os.listdir() returns names in arbitrary order, so sort them: the position of a
# chunk in chunk_list becomes its node id later on, and sorting keeps those ids
# stable across runs and machines.
filename_list = sorted(os.path.join(ARTICLE_DIR, f) for f in os.listdir(ARTICLE_DIR))

# One dict per chunk: {'article_title', 'section', 'text'}
chunk_list = []
for filename in filename_list:
    # only process .html files
    if not filename.lower().endswith('.html'):
        continue

    # read html file
    with open(filename, 'r', encoding='utf-8') as file:
        html_content = file.read()

    # Parse HTML
    soup = BeautifulSoup(html_content, 'html.parser')

    # Get article title (look the tag up once; fall back when it is missing)
    title_tag = soup.find('title')
    article_title = title_tag.get_text().strip() if title_tag is not None else "Untitled"

    # Initialize variables
    article_content = []
    current_section = "Main"  # Default section if no headers found

    # Find all headers and text content, in document order
    # NOTE(review): a <p> nested inside a <ul>/<ol> is matched on its own and again
    # as part of the list's text — confirm whether that duplication matters here.
    content_elements = soup.find_all(['h1', 'h2', 'h3', 'p', 'ul', 'ol'])

    # iterate through elements and extract text with metadata
    for element in content_elements:
        if element.name in ['h1', 'h2', 'h3']:
            # a header starts a new section; following text is attributed to it
            current_section = element.get_text().strip()
        elif element.name in ['p', 'ul', 'ol']:
            text = element.get_text().strip()
            # Only keep content that is at least MIN_CHUNK_CHARS characters long
            if len(text) >= MIN_CHUNK_CHARS:
                article_content.append({
                    'article_title': article_title,
                    'section': current_section,
                    'text': text
                })

    # add article content to list
    chunk_list.extend(article_content)

In [4]:
# Wrap every chunk in a LlamaIndex TextNode, carrying article and section as metadata.
# The node id is the chunk's position in chunk_list, converted to a string.
node_list = [
    TextNode(
        id_=str(position),
        text=record["text"],
        metadata={
            "article": record["article_title"],
            "section": record["section"],
        },
    )
    for position, record in enumerate(chunk_list)
]

print(len(node_list))

778


### 2) create index

In [5]:
# Build a vector index over all nodes
index = VectorStoreIndex(node_list)

# Report which embedding model was used and what ended up in the vector store.
# embedding_dict is keyed by node id; "0" is the id given to the first chunk.
# Single quotes inside the f-string keep this cell valid on Python < 3.12.
embedding_dict = index.vector_store.data.embedding_dict
print(f"Embedding Model: {index._embed_model.model_name}")
print(f"Index Size: {len(embedding_dict)}")
print(f"Embedding Size: {len(embedding_dict['0'])}")

Embedding Model: text-embedding-ada-002
Index Size: 778
Embedding Size: 1536


In [6]:
from llama_index.embeddings.huggingface import HuggingFaceEmbedding

# Replace the global embedding model with a Hugging Face model;
# the index is rebuilt in the next cell so that it uses this model
hf_embed_model = HuggingFaceEmbedding(model_name="BAAI/bge-small-en-v1.5")
Settings.embed_model = hf_embed_model

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/52.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/743 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/133M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/366 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [7]:
# Rebuild the vector index so the nodes are embedded with the current embedding model
index = VectorStoreIndex(node_list)

# Report which embedding model was used and what ended up in the vector store.
# embedding_dict is keyed by node id; "0" is the id given to the first chunk.
# Single quotes inside the f-string keep this cell valid on Python < 3.12.
embedding_dict = index.vector_store.data.embedding_dict
print(f"Embedding Model: {index._embed_model.model_name}")
print(f"Index Size: {len(embedding_dict)}")
print(f"Embedding Size: {len(embedding_dict['0'])}")

Embedding Model: BAAI/bge-small-en-v1.5
Index Size: 778
Embedding Size: 384


### 3) semantic search

In [8]:
# Retriever over the index that returns the 10 most similar nodes per query
retriever = VectorIndexRetriever(index=index, similarity_top_k=10)

In [9]:
# Run a semantic search for an example question
search_query = "When do I perform fine-tuning?"
results = retriever.retrieve(search_query)

In [10]:
# Inspect the first retrieved hit: a NodeWithScore holding the node (text + metadata) and its score
results[0]

NodeWithScore(node=TextNode(id_='155', embedding=None, metadata={'article': 'LLM Fine-tuning\u200a—\u200aFAQs', 'section': 'When do I Fine-tune?'}, excluded_embed_metadata_keys=[], excluded_llm_metadata_keys=[], relationships={}, metadata_template='{key}: {value}', metadata_separator='\n', text='This is not to say that fine-tuning is useless. A central benefit of fine-tuning an AI assistant is lowering inference costs [3].', mimetype='text/plain', start_char_idx=None, end_char_idx=None, metadata_seperator='\n', text_template='{metadata_str}\n\n{content}'), score=0.8114657060166933)

In [11]:
# Format the search results as a numbered markdown list.
# Two trailing spaces before "\n" force a markdown line break inside a list item.
# Single quotes inside the f-strings keep this cell valid on Python < 3.12.
result_blocks = []
for i, result in enumerate(results, start=1):
    result_blocks.append(
        f"{i}. **Article title:** {result.metadata['article']}  \n"
        f"   **Section:** {result.metadata['section']}  \n"
        f"   **Snippet:** {result.text} \n\n"
        f"   **Score:** {result.score} \n\n"
    )

# join once instead of growing the string with repeated +=
results_markdown = "".join(result_blocks)

In [12]:
# Render the markdown-formatted search results as rich output
display(Markdown(results_markdown))

1. **Article title:** LLM Fine-tuning — FAQs  
   **Section:** When do I Fine-tune?  
   **Snippet:** This is not to say that fine-tuning is useless. A central benefit of fine-tuning an AI assistant is lowering inference costs [3]. 

   **Score:** 0.8114657060166933 

2. **Article title:** LLM Fine-tuning — FAQs  
   **Section:** When NOT to Fine-tune  
   **Snippet:** The effectiveness of any approach will depend on the details of the use case. For example, fine-tuning is less effective than retrieval augmented generation (RAG) to provide LLMs with specialized knowledge [1]. 

   **Score:** 0.800293870277152 

3. **Article title:** LLM Fine-tuning — FAQs  
   **Section:** How to Prepare Data for Fine-tuning?  
   **Snippet:** For example, if I wanted to fine-tune an LLM to respond to viewer questions on YouTube, I would need to gather a set of comments with questions and my associated responses. For a concrete example of this, check out the code walk-through on YouTube. 

   **Score:** 0.7996616635141707 

4. **Article title:** LLM Fine-tuning — FAQs  
   **Section:** When do I Fine-tune?  
   **Snippet:** Fine-tuning, on the other hand, can compress prompt sizes by directly training the model on examples. Shorter prompts mean fewer tokens at inference, leading to lower compute costs and faster model responses [3]. For instance, after fine-tuning, the above prompt could be compressed to the following. 

   **Score:** 0.7995040458001792 

5. **Article title:** LLM Fine-tuning — FAQs  
   **Section:** RAG vs Fine-tuning?  
   **Snippet:** We’ve already mentioned situations where RAG and fine-tuning perform well. However, since this is such a common question, it’s worth reemphasizing when each approach works best. 

   **Score:** 0.7930144142584221 

6. **Article title:** Fine-Tuning Large Language Models (LLMs)  
   **Section:** 3 Ways to Fine-tune  
   **Snippet:** The next, and perhaps most popular, way to fine-tune a model is via supervised learning. This involves training a model on input-output pairs for a particular task. An example is instruction tuning, which aims to improve model performance in answering questions or responding to user prompts [1,3]. 

   **Score:** 0.7919754233525915 

7. **Article title:** How to Improve LLMs with RAG  
   **Section:** Why we care  
   **Snippet:** Previous articles in this series discussed fine-tuning, which adapts an existing model for a particular use case. While this is an alternative way to endow an LLM with specialized knowledge, empirically, fine-tuning seems to be less effective than RAG at doing this [1]. 

   **Score:** 0.7899395386438688 

8. **Article title:** Fine-Tuning Large Language Models (LLMs)  
   **Section:** What is Fine-tuning?  
   **Snippet:** Fine-tuning is taking a pre-trained model and training at least one internal model parameter (i.e. weights). In the context of LLMs, what this typically accomplishes is transforming a general-purpose base model (e.g. GPT-3) into a specialized model for a particular use case (e.g. ChatGPT) [1]. 

   **Score:** 0.7895567465793972 

9. **Article title:** LLM Fine-tuning — FAQs  
   **Section:** What’s Next?  
   **Snippet:** Here, I summarized the most common fine-tuning questions I’ve received over the past 12 months. While fine-tuning is not a panacea for all LLM use cases, it has key benefits. 

   **Score:** 0.7862102243041814 

10. **Article title:** LLM Fine-tuning — FAQs  
   **Section:** What is Fine-tuning?  
   **Snippet:** I like to define fine-tuning as taking an existing (pre-trained) model and training at least 1 model parameter to adapt it to a particular use case. 

   **Score:** 0.7854351862786609 



### 4) RAG

In [13]:
# Configure the response synthesizer; no arguments are passed, so library defaults apply
response_synthesizer = get_response_synthesizer()

In [14]:
# Post-retrieval filter configured with a similarity cutoff of 0.7
similarity_filter = SimilarityPostprocessor(similarity_cutoff=0.7)

# Query engine = retriever + post-processing filter + response synthesizer
query_engine = RetrieverQueryEngine(
    retriever=retriever,
    response_synthesizer=response_synthesizer,
    node_postprocessors=[similarity_filter],
)

In [15]:
# Ask the example question through the full RAG pipeline and print the answer
question = "When do I perform fine-tuning?"
response = query_engine.query(question)
print(response)

Perform fine-tuning when you want to lower inference costs by compressing prompt sizes through direct training on examples, resulting in fewer tokens at inference, lower compute costs, and faster model responses.


In [16]:
# Show which LLM is currently set in the global Settings
print(f"LLM: {Settings.llm.model}")

LLM: gpt-3.5-turbo


In [17]:
from llama_index.llms.openai import OpenAI

# Point the global Settings at a different OpenAI model
Settings.llm = OpenAI(model="gpt-4o")

In [18]:
# Shortcut: build a query engine straight from the index.
# No retriever, postprocessor or synthesizer is passed here, so the ones configured above are not reused.
query_engine = index.as_query_engine()

question = "When do I perform fine-tuning?"
response = query_engine.query(question)
print(response)

Fine-tuning is performed when you want to lower inference costs for an AI assistant.


In [19]:
# Show which LLM is currently set in the global Settings
print(f"LLM: {Settings.llm.model}")

LLM: gpt-4o
