<a href="https://colab.research.google.com/github/aravindakrishnanl/thirukkural/blob/main/multiple_rag_tk_.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install datasets rank_bm25 sentence-transformers faiss-cpu --quiet

from datasets import load_dataset
from rank_bm25 import BM25Okapi
from sentence_transformers import SentenceTransformer
import faiss
import numpy as np

# -------------------------
# 1) Load the dataset
# -------------------------
# Fetch the Thirukural dataset by its Hugging Face Hub identifier.
dataset = load_dataset("Selvakumarduraipandian/Thirukural")
# One entry per row of the "train" split, taken from its "Kural" field.
# This list is both the BM25 corpus and the input to the embedding model below.
kural_texts = [item["Kural"] for item in dataset["train"]]  # other fields (e.g. English) could be used instead

# -------------------------
# 2) Tokenization for BM25
# -------------------------
def simple_tokenize(text):
    """Split *text* into lowercase word tokens for BM25 indexing and querying.

    The Kural strings used in this notebook embed an HTML line break
    (``<br />``) between the two lines of a couplet and end with a full stop.
    Plain whitespace splitting glues that markup and punctuation onto the
    neighbouring words (e.g. ``"ஆதி<br"``), so those words can never match a
    query term. Line-break tags are therefore replaced by spaces and
    punctuation is stripped from the edges of every token.

    Args:
        text: Raw document or query string.

    Returns:
        A list of lowercase tokens; empty when *text* contains no words.
    """
    # Local import: this cell does not import ``re`` at the top level.
    import re

    without_breaks = re.sub(r"<br\s*/?>", " ", text, flags=re.IGNORECASE)
    # Strip punctuation from token edges only. A ``\w+`` regex is avoided on
    # purpose: it would split Tamil words at their combining vowel signs.
    stripped = (
        token.strip(".,;:!?\"'()[]{}")
        for token in without_breaks.lower().split()
    )
    return [token for token in stripped if token]

# Tokenize every kural and drop the ones that yield no tokens. The same filter
# is applied to kural_texts, because bm25_search maps BM25 row numbers back
# into kural_texts: filtering only the token lists would shift every later row
# and make BM25 return the wrong kural. The embeddings computed from
# kural_texts further down stay aligned as well.
_tokenized_kurals = [(text, simple_tokenize(text)) for text in kural_texts]
kural_texts = [text for text, tokens in _tokenized_kurals if tokens]
corpus_tokens = [tokens for _, tokens in _tokenized_kurals if tokens]

# -------------------------
# 3) BM25 Index
# -------------------------
# Lexical index over the tokenized corpus. bm25_search translates BM25 row
# numbers into kural_texts positions, so corpus_tokens has to be in the same
# order (and of the same length) as kural_texts.
bm25 = BM25Okapi(corpus_tokens)

# -------------------------
# 4) Embedding model & FAISS index
# -------------------------
# NOTE(review): all-MiniLM-L6-v2 is presumably trained mainly on English text;
# confirm it produces useful vectors for the Tamil kurals encoded here.
embed_model = SentenceTransformer("all-MiniLM-L6-v2")
# One embedding row per entry of kural_texts, returned as a NumPy array.
corpus_embeddings = embed_model.encode(kural_texts, convert_to_numpy=True, show_progress_bar=True)

# Create a flat L2 FAISS index sized to the embedding width.
# Searches against it report distances where smaller means closer.
d = corpus_embeddings.shape[1]
faiss_index = faiss.IndexFlatL2(d)
faiss_index.add(corpus_embeddings)

# -------------------------
# 5) Retrieval functions
# -------------------------
def bm25_search(query, top_k=5):
    """Rank the corpus against *query* using the module-level BM25 index.

    Args:
        query: Free-text query; tokenized with ``simple_tokenize``.
        top_k: Maximum number of hits to return.

    Returns:
        A list of ``(kural_text, bm25_score)`` tuples, highest score first.
    """
    scores = bm25.get_scores(simple_tokenize(query))
    # argsort is ascending, so reverse it before truncating to the best hits.
    best_first = np.argsort(scores)[::-1]
    hits = []
    for doc_id in best_first[:top_k]:
        hits.append((kural_texts[doc_id], scores[doc_id]))
    return hits

def embedding_search(query, top_k=5):
    """Find the kurals whose embeddings are nearest to the query embedding.

    Args:
        query: Free-text query; encoded with the module-level ``embed_model``.
        top_k: Maximum number of hits to return. Non-positive values yield
            an empty list.

    Returns:
        A list of ``(kural_text, distance)`` tuples in the order FAISS
        returns them (nearest first). The distance comes from the flat L2
        index, so smaller means more similar.
    """
    if top_k <= 0:
        return []

    query_embedding = embed_model.encode([query], convert_to_numpy=True)
    distances, indices = faiss_index.search(query_embedding, top_k)
    # FAISS pads the result with index -1 when the index holds fewer than
    # top_k vectors. Without this filter, kural_texts[-1] would silently
    # return the last kural for every padded slot.
    return [
        (kural_texts[doc_id], float(distance))
        for doc_id, distance in zip(indices[0], distances[0])
        if doc_id >= 0
    ]

def hybrid_search(query, top_k=5):
    """Fuse BM25 and embedding retrieval into a single ranking.

    Each retriever contributes ``2 * top_k`` candidates. BM25 scores are
    scaled by the best BM25 score among the candidates so they contribute at
    most 1. Embedding distances are mapped to ``1 / (1 + distance)``, which is
    also at most 1 for the non-negative distances of the L2 index. The two
    parts are summed per kural text. Without the scaling, raw BM25 scores
    (which are unbounded) would swamp the embedding contribution.

    Args:
        query: Free-text query.
        top_k: Maximum number of fused hits to return.

    Returns:
        A list of ``(kural_text, combined_score)`` tuples, best first.
    """
    bm25_results = bm25_search(query, top_k=top_k * 2)
    emb_results = embedding_search(query, top_k=top_k * 2)

    # Scale BM25 so its best candidate contributes exactly 1. When no
    # candidate has a positive score, leave the scores unscaled rather than
    # dividing by zero.
    max_bm25 = max((float(score) for _, score in bm25_results), default=0.0)
    bm25_scale = max_bm25 if max_bm25 > 0 else 1.0

    combined_scores = {}
    for text, score in bm25_results:
        combined_scores[text] = combined_scores.get(text, 0.0) + float(score) / bm25_scale
    for text, distance in emb_results:
        # Smaller distance means a higher contribution.
        combined_scores[text] = combined_scores.get(text, 0.0) + 1.0 / (1.0 + distance)

    ranked = sorted(combined_scores.items(), key=lambda pair: pair[1], reverse=True)
    return ranked[:top_k]

# -------------------------
# 6) Chatbot function with RAG
# -------------------------
def rag_chatbot(query, mode="hybrid", top_k=3):
    """Retrieve kurals for *query* and print them as a numbered list.

    Args:
        query: Free-text query.
        mode: ``"bm25"`` or ``"embedding"`` selects that retriever; any other
            value uses the hybrid retriever.
        top_k: Number of hits requested from the retriever.

    Returns:
        None. The hits are printed, not returned.
    """
    named_searchers = {"bm25": bm25_search, "embedding": embedding_search}
    search = named_searchers.get(mode, hybrid_search)
    results = search(query, top_k)

    # No generation step yet: the "answer" is simply the retrieved kurals.
    print(f"\n[ Retrieval Mode: {mode.upper()} ]")
    rank = 0
    for text, score in results:
        rank += 1
        print(f"{rank}. {text}  (score: {score:.4f})")

# -------------------------
# 7) Example usage
# -------------------------
# Tamil demo query (roughly: "kurals about virtue").
query = "அறம் பற்றிய குறள்"
# Run the same query through every retrieval mode so the rankings can be compared.
for demo_mode in ("bm25", "embedding", "hybrid"):
    rag_chatbot(query, mode=demo_mode)

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m31.3/31.3 MB[0m [31m48.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m363.4/363.4 MB[0m [31m3.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.8/13.8 MB[0m [31m86.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.6/24.6 MB[0m [31m71.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m883.7/883.7 kB[0m [31m52.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m664.8/664.8 MB[0m [31m2.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m211.5/211.5 MB[0m [31m6.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m56.3/56.3 MB[0m [31m12.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md: 0.00B [00:00, ?B/s]

data/train-00000-of-00001.parquet:   0%|          | 0.00/1.52M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/1330 [00:00<?, ? examples/s]

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Batches:   0%|          | 0/42 [00:00<?, ?it/s]


[ Retrieval Mode: BM25 ]
1. அகர முதல எழுத்தெல்லாம் ஆதி<br />பகவன் முதற்றே உலகு.  (score: 0.0000)
2. ஊடுதல் காமத்திற்கு இன்பம் அதற்கின்பம்<br />கூடி முயங்கப் பெறின்.  (score: 0.0000)
3. நெடுங்கடலும் தன்நீர்மை குன்றும் தடிந்தெழிலி<br />தான்நல்கா தாகி விடின்.  (score: 0.0000)

[ Retrieval Mode: EMBEDDING ]
1. எல்லாப் பொருளும் உடைத்தாய் இடத்துதவும்<br />நல்லாள் உடையது அரண்.  (score: 0.8681)
2. உள்ளம்போன்று உள்வழிச் செல்கிற்பின் வெள்ளநீர்<br />நீந்தல மன்னோஎன் கண்.  (score: 0.8928)
3. தொடியொடு தோள்நெகிழ நோவல் அவரைக்<br />கொடியர் எனக்கூறல் நொந்து.  (score: 0.8969)

[ Retrieval Mode: HYBRID ]
1. எல்லாப் பொருளும் உடைத்தாய் இடத்துதவும்<br />நல்லாள் உடையது அரண்.  (score: 0.5353)
2. உள்ளம்போன்று உள்வழிச் செல்கிற்பின் வெள்ளநீர்<br />நீந்தல மன்னோஎன் கண்.  (score: 0.5283)
3. தொடியொடு தோள்நெகிழ நோவல் அவரைக்<br />கொடியர் எனக்கூறல் நொந்து.  (score: 0.5272)


In [None]:
!pip install datasets sentence-transformers scikit-learn ipywidgets --quiet

import json
import re
import numpy as np
from scipy.sparse import csr_matrix
from collections import Counter
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sentence_transformers import SentenceTransformer
from datasets import load_dataset
import ipywidgets as widgets
from IPython.display import display, clear_output

# Fetch the Thirukural dataset by its Hugging Face Hub identifier and keep its
# "train" split. The retrievers below iterate over it and index into it by
# integer row position.
dataset = load_dataset("Selvakumarduraipandian/Thirukural")
kurals = dataset['train']

class BaseRetriever:
    """Shared state and result formatting for the concrete retrievers.

    Attributes are plain and public:
      * ``kurals`` - the collection of kural records to search.
      * ``field``  - name of the record field that is searched.
    """

    def __init__(self, kurals, field='Couplet'):
        """Remember the records and which of their fields is searched."""
        self.kurals, self.field = kurals, field

    def retrieve(self, query, top_k=5):
        """Return the best records for *query*; subclasses must override."""
        raise NotImplementedError

    def format_result(self, res):
        """Render one record as a two-line, newline-terminated summary."""
        headline = f"Kural {res['ID']}: {res['Kural']} (English: {res['Couplet']})"
        explanation = f"Explanation: {res['Vilakam']}"
        return f"{headline}\n{explanation}\n"

class KeywordRetriever(BaseRetriever):
    """Rank kurals by how many distinct query words occur in the searched field."""

    def retrieve(self, query, top_k=5):
        """Return up to *top_k* records containing at least one query word.

        Matching is case-insensitive and substring-based: a query word counts
        when it appears anywhere in the field text, so ``"art"`` also matches
        ``"heart"``.

        Args:
            query: Free-text query; split into words with ``\\w+``.
            top_k: Maximum number of records to return. Non-positive values
                yield an empty list.

        Returns:
            Matching records, highest match count first. Records with no
            matching word are never returned.
        """
        # Guard needed because scores[-0:] would select *every* record.
        if top_k <= 0:
            return []

        query_words = set(re.findall(r'\w+', query.lower()))
        if not query_words:
            return []

        scores = []
        for kural in self.kurals:
            text = kural[self.field].lower()
            scores.append(sum(1 for word in query_words if word in text))

        # argsort is ascending: take the last top_k positions, best first.
        top_indices = np.argsort(scores)[-top_k:][::-1]

        return [self.kurals[int(i)] for i in top_indices if scores[i] > 0]

class TFIDFRetriever(BaseRetriever):
    """Rank kurals by TF-IDF cosine similarity to the query."""

    def __init__(self, kurals, field='Couplet'):
        """Fit a TF-IDF vectorizer on the searched field of every record."""
        super().__init__(kurals, field)
        self.vectorizer = TfidfVectorizer()
        self.tfidf_matrix = self.vectorizer.fit_transform([kural[self.field] for kural in kurals])

    def retrieve(self, query, top_k=5):
        """Return up to *top_k* records most similar to *query*.

        Args:
            query: Free-text query; vectorized with the fitted vocabulary.
            top_k: Maximum number of records to return. Non-positive values
                yield an empty list.

        Returns:
            Records ordered by descending cosine similarity. Records with
            zero similarity (no shared vocabulary with the query) are left
            out, consistent with ``KeywordRetriever``, so callers can tell
            "nothing relevant" apart from a real hit.
        """
        # Guard needed because similarities[-0:] would select *every* record.
        if top_k <= 0:
            return []

        query_vec = self.vectorizer.transform([query])
        similarities = cosine_similarity(query_vec, self.tfidf_matrix).flatten()
        # argsort is ascending: take the last top_k positions, best first.
        top_indices = np.argsort(similarities)[-top_k:][::-1]

        return [self.kurals[int(i)] for i in top_indices if similarities[i] > 0]

class EmbeddingRetriever(BaseRetriever):
    """Rank kurals by the dot product of sentence embeddings."""

    def __init__(self, kurals, field='Couplet'):
        """Load the sentence-embedding model and encode every record's field."""
        super().__init__(kurals, field)
        self.model = SentenceTransformer('all-MiniLM-L6-v2')
        self.embeddings = self.model.encode([kural[self.field] for kural in kurals])

    def retrieve(self, query, top_k=5):
        """Return up to *top_k* records whose embeddings best match *query*.

        Args:
            query: Free-text query; encoded with the same model as the corpus.
            top_k: Maximum number of records to return. Non-positive values
                yield an empty list.

        Returns:
            Records ordered by descending dot-product score.
        """
        # Guard needed because similarities[-0:] would select *every* record.
        if top_k <= 0:
            return []

        query_emb = self.model.encode([query])
        # NOTE(review): a raw dot product equals cosine similarity only if
        # the model emits unit-length vectors - confirm for this model.
        similarities = np.dot(self.embeddings, query_emb.T).flatten()
        # argsort is ascending: take the last top_k positions, best first.
        top_indices = np.argsort(similarities)[-top_k:][::-1]

        return [self.kurals[int(i)] for i in top_indices]

# Registry of retriever instances keyed by the label shown in the dropdown.
# All three are built eagerly here, so the TF-IDF fit and the corpus embedding
# happen once when this cell runs rather than on every query.
retrievers = {
    'Keyword': KeywordRetriever(kurals),
    'TF-IDF': TFIDFRetriever(kurals),
    'Embedding': EmbeddingRetriever(kurals),
}

# Dropdown offering the registry keys; on_submit looks the selection up in `retrievers`.
retriever_dropdown = widgets.Dropdown(
    options=list(retrievers.keys()),
    value='Keyword',
    description='Retriever:',
    style={'description_width': 'initial'}
)

# Free-text query box; its value is stripped of surrounding whitespace before use.
query_input = widgets.Text(
    value='',
    placeholder='Enter query (e.g., knowledge and learning)',
    description='Query:',
    style={'description_width': 'initial'}
)

# Output area that on_submit clears and prints into.
output = widgets.Output()

def on_submit(change):
    """Click handler: run the selected retriever and print its results.

    Everything is printed into the shared ``output`` widget, which is cleared
    first. A blank query prints a prompt instead of searching; an exception
    raised while retrieving or formatting is reported as a message rather
    than propagated.

    Args:
        change: Argument supplied by the widget callback; not used.
    """
    with output:
        clear_output()
        selected_retriever = retrievers[retriever_dropdown.value]
        query = query_input.value.strip()

        # Guard clause: nothing to search for.
        if not query:
            print("Please enter a query.")
            return

        try:
            results = selected_retriever.retrieve(query, top_k=5)
            if not results:
                print("No relevant kurals found for the query.")
            else:
                for hit in results:
                    print(selected_retriever.format_result(hit))
        except Exception as e:
            print(f"Error during retrieval: {str(e)}")

# Button that triggers a search; on_submit is registered as its click callback.
submit_button = widgets.Button(description="Retrieve")
submit_button.on_click(on_submit)

# Display UI: controls first, then the output area that on_submit prints into.
display(retriever_dropdown, query_input, submit_button, output)

Dropdown(description='Retriever:', options=('Keyword', 'TF-IDF', 'Embedding'), style=DescriptionStyle(descript…

Text(value='', description='Query:', placeholder='Enter query (e.g., knowledge and learning)', style=Descripti…

Button(description='Retrieve', style=ButtonStyle())

Output()