# Semantic Highlighting in RAG


## Setting up the dependencies

In [1]:
!pip install transformers



In [10]:
import nltk
# Fetch NLTK's "punkt_tab" tokenizer data (a one-time download into the local
# nltk_data directory). Presumably the highlighting model's remote code uses it
# to split the context into sentences — TODO confirm.
nltk.download("punkt_tab")

[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


True

## Importing the Model

In [17]:
from transformers import AutoModel

# Load the semantic-highlighting model by its Hugging Face Hub identifier.
# NOTE(review): trust_remote_code=True allows Python code shipped in the model
# repository to run locally; confirm the repository is trusted and consider
# pinning a specific `revision` for reproducibility.
model = AutoModel.from_pretrained(
    "zilliz/semantic-highlight-bilingual-v1",
    trust_remote_code=True
)

## Running Semantic Highlighting and Inspecting the Results


In [20]:
# Demo inputs: one question and a short context that mixes relevant sentences
# (symptoms) with topically related but irrelevant ones (general water facts).
question = "What are the symptoms of dehydration?"
context = """
Dehydration occurs when your body loses more fluid than you take in.
Common signs include feeling thirsty and having a dry mouth.
The human body is composed of about 60% water.
Dark yellow urine and infrequent urination are warning signs.
Water is essential for many bodily functions.
Dizziness, fatigue, and headaches can indicate severe dehydration.
Drinking 8 glasses of water daily is often recommended.
"""

# Score the context against the question.
# NOTE(review): `threshold` is presumably the per-sentence probability cut-off
# for highlighting — the recorded output is consistent with that; confirm
# against the model card.
result = model.process(
    question=question,
    context=context,
    threshold=0.5,
    return_sentence_metrics=True,
)

highlighted = result["highlighted_sentences"]

print(f"Highlighted {len(highlighted)} sentences:")
for i, sent in enumerate(highlighted, 1):
    print(f"  {i}. {sent}")

# Count sentences with NLTK's sentence tokenizer rather than by splitting on
# '.', which miscounts decimals, abbreviations, '?'/'!' endings and text that
# lacks a final period.
total_sentences = len(nltk.sent_tokenize(context.strip()))
print(f"\nTotal sentences in context: {total_sentences}")

# Per-sentence scores are only printed when the model returned them.
if "sentence_probabilities" in result:
    print("\nSentence probabilities:")
    for i, p in enumerate(result["sentence_probabilities"], 1):
        print(f"  Sentence {i}: {p:.3f}")


Prepare contexts: 100%|██████████| 1/1 [00:00<00:00, 141.15it/s]


[OpenProvenceModel] Model inference time: 3.28s (1 blocks)
Highlighted 3 sentences:
  1. Common signs include feeling thirsty and having a dry mouth.
  2. Dark yellow urine and infrequent urination are warning signs.
  3. Dizziness, fatigue, and headaches can indicate severe dehydration.


Total sentences in context: 7

Sentence probabilities:
  Sentence 1: 0.017
  Sentence 2: 0.990
  Sentence 3: 0.002
  Sentence 4: 0.947
  Sentence 5: 0.001
  Sentence 6: 0.972
  Sentence 7: 0.001


## Traditional Keyword-Based Highlighting Using Regex


In [21]:
import re

def traditional_highlight(question, context):
    """Return the sentences of ``context`` that share a word with ``question``.

    This is a deliberately naive keyword baseline: every word of the question
    (stop words included) counts as a keyword, and a sentence is selected as
    soon as it contains any one of them as a whole word, case-insensitively.

    Args:
        question: The query text. Words are extracted as runs of word
            characters, so attached punctuation (e.g. the "?" in
            "dehydration?") is ignored.
        context: The passage to scan. It is split into sentences on ".".

    Returns:
        A list of matching sentences in their original order and casing,
        stripped of surrounding whitespace and without the trailing period.
        Empty if the question contains no words or nothing matches.
    """
    # Tokenise on word characters so punctuation stuck to a word is dropped
    # instead of leaking into the regex (where "?" would make the preceding
    # letter optional and "(" or "*" could raise re.error).
    keywords = set(re.findall(r"\w+", question.lower()))
    if not keywords:
        # An empty alternation would match at every word boundary.
        return []

    # One alternation compiled once; re.escape keeps every keyword literal.
    keyword_pattern = re.compile(
        r"\b(?:" + "|".join(re.escape(k) for k in sorted(keywords)) + r")\b"
    )

    sentences = [s.strip() for s in context.split('.') if s.strip()]
    return [sent for sent in sentences if keyword_pattern.search(sent.lower())]


# Run the keyword baseline on the same question/context used for the model.
traditional_results = traditional_highlight(question, context)

# Report how many sentences matched, then list them numbered from 1.
print(f"Traditional Highlighted {len(traditional_results)} sentences:")
for position, sentence in enumerate(traditional_results, start=1):
    print(f"  {position}. {sentence}")

Traditional Highlighted 5 sentences:
  1. Dehydration occurs when your body loses more fluid than you take in
  2. The human body is composed of about 60% water
  3. Dark yellow urine and infrequent urination are warning signs
  4. Dizziness, fatigue, and headaches can indicate severe dehydration
  5. Drinking 8 glasses of water daily is often recommended
