In [1]:
# run in notebook cell
!pip install -q \
  sentence-transformers langchain faiss-cpu chromadb rank_bm25 transformers openai \
  googletrans==4.0.0-rc1 nltk stanza indic-transliteration sacremoses \
  bertopic umap-learn hdbscan sklearn pandas tqdm flair torch torchvision torchaudio \
  datasets accelerate evaluate bert-labelling
# optional: for OCR
!pip install -q easyocr


  [1;31merror[0m: [1msubprocess-exited-with-error[0m
  
  [31m×[0m [32mGetting requirements to build wheel[0m did not run successfully.
  [31m│[0m exit code: [1;36m1[0m
  [31m╰─>[0m [31m[15 lines of output][0m
  [31m   [0m The 'sklearn' PyPI package is deprecated, use 'scikit-learn'
  [31m   [0m rather than 'sklearn' for pip commands.
  [31m   [0m 
  [31m   [0m Here is how to fix this error in the main use cases:
  [31m   [0m - use 'pip install scikit-learn' rather than 'pip install sklearn'
  [31m   [0m - replace 'sklearn' by 'scikit-learn' in your pip requirements files
  [31m   [0m   (requirements.txt, setup.py, setup.cfg, Pipfile, etc ...)
  [31m   [0m - if the 'sklearn' package is used by one of your dependencies,
  [31m   [0m   it would be great if you take some time to track which package uses
  [31m   [0m   'sklearn' instead of 'scikit-learn' and report it to their issue tracker
  [31m   [0m - as a last resort, set the environment variable


In [4]:
import os, re, json
from pathlib import Path
import pandas as pd
import numpy as np
from tqdm.auto import tqdm
import nltk; nltk.download('punkt')

# NLP toolkits
import stanza                       # POS/Dependency for many languages
from indic_transliteration import sanscript
from indic_transliteration.sanscript import SchemeMap, SCHEMES, transliterate
from sentence_transformers import SentenceTransformer, util
from rank_bm25 import BM25Okapi
from sklearn.preprocessing import normalize
import faiss
import chromadb
from chromadb.config import Settings

# Clustering / topic modeling
from bertopic import BERTopic
import umap
import hdbscan

# LLM / RAG
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import Chroma
from langchain.llms import OpenAI
from langchain.chains import ConversationalRetrievalChain
from langchain.memory import ConversationBufferMemory
from langchain.prompts import PromptTemplate

# evaluation
import evaluate
from nltk.translate.bleu_score import sentence_bleu

# optional OCR
import easyocr

# Paths and model selection shared by the cells below.
DATA_PATH = "spiritual_verses.csv"
EMB_MODEL_NAME = "sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2"
ARTIFACT_DIR = Path("artifacts")
# exist_ok=True keeps this cell re-runnable when the directory already exists.
ARTIFACT_DIR.mkdir(exist_ok=True)


ValueError: numpy.dtype size changed, may indicate binary incompatibility. Expected 96 from C header, got 88 from PyObject

In [None]:
# Following the sample row layout: the file is first read as header-less (header=None).
df_raw = pd.read_csv(DATA_PATH, header=None, quoting=1, engine='python', dtype=str)
if df_raw.shape[1] < 8:
    # Fewer than eight columns: treat the CSV as carrying its own header row.
    df = pd.read_csv(DATA_PATH)
else:
    # Keep the first eight columns, name them, then discard the leading index column.
    df = df_raw.iloc[:, :8].copy()
    df.columns = [
        "idx",
        "verse_number",
        "verse_in_sanskrit",
        "sanskrit_verse_transliteration",
        "translation_in_english",
        "meaning_in_english",
        "translation_in_hindi",
        "meaning_in_hindi",
    ]
    df = df.drop(columns=["idx"]).reset_index(drop=True)
print("Rows:", len(df))
df.head(1)


In [None]:
def clean_text(s):
    """Return ``s`` as a single-line, whitespace-normalised string.

    Missing values (as judged by ``pd.isna``) become the empty string. Zero-width
    joiner / non-joiner characters are removed, and every run of whitespace
    (including line breaks) collapses to one space with the ends trimmed.
    """
    if pd.isna(s):
        return ""
    # One translate pass: drop ZWJ/ZWNJ, turn CR/LF into plain spaces.
    char_map = str.maketrans({'\u200d': None, '\u200c': None, '\r': ' ', '\n': ' '})
    flattened = str(s).translate(char_map)
    collapsed = re.sub(r'\s+', ' ', flattened)
    return collapsed.strip()

# Run clean_text over every cell, one column at a time.
for column_name in df.columns:
    cleaned_column = df[column_name].apply(clean_text)
    df[column_name] = cleaned_column
# Preview the first cleaned row as a plain dict.
df.iloc[0].to_dict()


In [None]:
# Rough Devanagari detector
def is_devanagari(s):
    """Return True if ``str(s)`` contains at least one character in U+0900..U+097F."""
    for ch in str(s):
        if 0x0900 <= ord(ch) <= 0x097F:
            return True
    return False

# Transliteration helpers using indic_transliteration
def sa_to_iast(text):
    """Transliterate Devanagari text to IAST, falling back to the input on failure.

    Args:
        text: Devanagari string to convert.

    Returns:
        The IAST transliteration, or ``text`` unchanged if ``transliterate`` raises.
    """
    try:
        return transliterate(text, sanscript.DEVANAGARI, sanscript.IAST)
    except Exception:
        # Best-effort helper: keep the original text instead of failing the caller.
        # Catching Exception (not a bare except) lets KeyboardInterrupt/SystemExit through.
        return text

def iast_to_deva(text):
    """Transliterate IAST text to Devanagari, falling back to the input on failure.

    Args:
        text: IAST (romanised) string to convert.

    Returns:
        The Devanagari transliteration, or ``text`` unchanged if ``transliterate`` raises.
    """
    try:
        return transliterate(text, sanscript.IAST, sanscript.DEVANAGARI)
    except Exception:
        # Best-effort helper: keep the original text instead of failing the caller.
        # Catching Exception (not a bare except) lets KeyboardInterrupt/SystemExit through.
        return text

# Example: run both helpers on the first Sanskrit verse
first_verse = df['verse_in_sanskrit'].iloc[0]
print("Devanagari?", is_devanagari(first_verse))
# Only the first 120 characters of the transliteration are shown.
print("IAST:", sa_to_iast(first_verse)[:120])


In [None]:
# English tokenizer: nltk
# Indic tokenization: use Stanza models for Hindi/Sanskrit
stanza.download('hi')   # Hindi model
nlp_hi = stanza.Pipeline('hi', processors='tokenize,pos,lemma', use_gpu=False)
try:
    # Sanskrit support is optional: a failed model download is handled the same way
    # as a failed pipeline build, so the download lives inside the guard too.
    stanza.download('sa')   # Sanskrit model (if available; stanza has 'sa' pipeline limited)
    nlp_sa = stanza.Pipeline('sa', processors='tokenize,pos,lemma', use_gpu=False)
except Exception:
    # nlp_sa = None makes tokenize_lang() fall back to nltk and pos_lemma() return [] for 'sa'.
    nlp_sa = None

def tokenize_lang(text, lang='en'):
    """Tokenise ``text`` with the Stanza pipeline for ``lang``, else with NLTK.

    'hi' uses ``nlp_hi`` and 'sa' uses ``nlp_sa`` when that pipeline is truthy;
    every other case goes through ``nltk.word_tokenize``.
    """
    stanza_pipeline = None
    if lang == 'hi':
        stanza_pipeline = nlp_hi
    elif lang == 'sa':
        stanza_pipeline = nlp_sa
    if not stanza_pipeline:
        # fallback: nltk
        return nltk.word_tokenize(text)
    tokens = []
    for sentence in stanza_pipeline(text).sentences:
        tokens.extend(word.text for word in sentence.words)
    return tokens


In [None]:
# For Sanskrit sandhi/splitting, use python packages if available (pysanskrit not always maintained).
# Here we give a simple wrapper for 'sanskrit_parser' if installed (optional).
# pip install sanskrit_parser   # optional heavy
try:
    # NOTE(review): confirm these two names are exported by the installed sanskrit_parser
    # version; if they are not, this flag is always False.
    from sanskrit_parser.sanskrit_base import transliterate_slp, transliterate_devanagari
    # ... advanced usage possible
    SANSK_PARSER_AVAILABLE = True
except Exception:
    # Optional dependency: any import-time failure just disables the feature.
    # Catching Exception (not a bare except) lets KeyboardInterrupt/SystemExit through.
    SANSK_PARSER_AVAILABLE = False

print("Sanskrit parser available:", SANSK_PARSER_AVAILABLE)


In [None]:
# Lemmas and POS tags come from the Stanza pipelines
def pos_lemma(text, lang='hi'):
    """Return ``(text, lemma, xpos)`` triples for every word Stanza finds in ``text``.

    Only 'hi' and 'sa' are handled; an unsupported language, or a pipeline that
    is not loaded, yields an empty list.
    """
    if lang == 'hi':
        nlp = nlp_hi
    elif lang == 'sa':
        nlp = nlp_sa
    else:
        nlp = None
    if not nlp:
        return []
    triples = []
    for sentence in nlp(text).sentences:
        for word in sentence.words:
            triples.append((word.text, word.lemma, word.xpos))
    return triples
# Smoke test: first ten (text, lemma, xpos) triples of the first Hindi translation
sample_hindi = df['translation_in_hindi'].iloc[0]
print(pos_lemma(sample_hindi, 'hi')[:10])


In [None]:
# Flair has multilingual NER models (but may be heavy)
try:
    from flair.data import Sentence as FlairSentence
    from flair.models import SequenceTagger
    tagger = SequenceTagger.load('ner-multi')  # heavy model
    FLAIR_AVAILABLE = True
except Exception:
    # Optional feature: an import or model-load failure only disables NER.
    # Catching Exception (not a bare except) lets KeyboardInterrupt/SystemExit through,
    # so a long model download can still be cancelled.
    FLAIR_AVAILABLE = False

def ner_flair(text):
    """Return ``(text, tag, score)`` for each 'ner' span Flair predicts in ``text``.

    Yields an empty list when ``FLAIR_AVAILABLE`` is false.
    """
    if FLAIR_AVAILABLE:
        sentence = FlairSentence(text)
        tagger.predict(sentence)
        entities = []
        for span in sentence.get_spans('ner'):
            entities.append((span.text, span.tag, span.score))
        return entities
    return []

# Report whether the optional Flair tagger loaded; when it did not, ner_flair() always returns [].
print("Flair NER available:", FLAIR_AVAILABLE)


In [None]:
# English view: translation, meaning and transliteration joined by single spaces.
df['combined_en'] = (
    df['translation_in_english'].fillna('')
    .str.cat(
        [df['meaning_in_english'].fillna(''), df['sanskrit_verse_transliteration'].fillna('')],
        sep=" ",
    )
    .str.strip()
)
# Hindi view: translation and meaning joined by a single space.
df['combined_hi'] = (
    df['translation_in_hindi'].fillna('')
    .str.cat(df['meaning_in_hindi'].fillna(''), sep=" ")
    .str.strip()
)
# Sanskrit view: the verse text on its own.
sanskrit_text = df['verse_in_sanskrit'].fillna('')
df['combined_sa'] = sanskrit_text.str.strip()


In [None]:
# Encode the English view of every verse with the sentence-transformer named in EMB_MODEL_NAME.
embed_model = SentenceTransformer(EMB_MODEL_NAME)
texts = df['combined_en'].tolist()
embs = embed_model.encode(
    texts,
    batch_size=64,
    convert_to_numpy=True,
    show_progress_bar=True,
)
# Normalise the rows so that an inner product can stand in for cosine similarity.
embs = normalize(embs)
np.save(ARTIFACT_DIR / 'embeddings.npy', embs)


In [None]:
# FAISS: flat inner-product index over the embeddings, written to the artifact directory
d = embs.shape[1]
index = faiss.IndexFlatIP(d)
index.add(embs.astype('float32'))
faiss.write_index(index, str(ARTIFACT_DIR / 'faiss.index'))

# Chroma
chroma_dir = str(ARTIFACT_DIR/"chroma")
if hasattr(chromadb, "PersistentClient"):
    # chromadb >= 0.4 rejects the legacy `chroma_db_impl` setting; PersistentClient
    # is the on-disk client there and needs no explicit persist() call.
    client = chromadb.PersistentClient(path=chroma_dir)
else:
    # Older chromadb releases: keep the original duckdb+parquet configuration.
    client = chromadb.Client(Settings(chroma_db_impl="duckdb+parquet", persist_directory=chroma_dir))
collection = client.get_or_create_collection(name="verses")
# collection.get() returns a dict-like result, so len() of it counts its keys rather
# than the stored rows and is never 0; count() is the number of stored records.
if collection.count() == 0:
    docs = df['combined_en'].tolist()
    metas = df[['verse_number','translation_in_english']].to_dict(orient='records')
    ids = [str(i) for i in range(len(docs))]
    # NOTE(review): no embeddings are passed, so Chroma embeds `docs` with its own default
    # embedding function - confirm this matches the model used by the retriever that later
    # opens this collection.
    collection.add(documents=docs, metadatas=metas, ids=ids)
    if hasattr(client, "persist"):
        # Legacy clients flush to disk only on an explicit client-level persist().
        client.persist()

# BM25 over lower-cased NLTK tokens of the English view
tokenized_corpus = [
    nltk.word_tokenize(doc_text.lower())
    for doc_text in df['combined_en'].astype(str).tolist()
]
bm25 = BM25Okapi(tokenized_corpus)


In [None]:
# Cross-encoder (HF) is best for reranking — optional heavy model
# from sentence_transformers import CrossEncoder
# cross_encoder = CrossEncoder('cross-encoder/ms-marco-MiniLM-L-6-v2')

def hybrid_retrieve(query, k_bm25=20, k_faiss=20, top_k=5, rerank=False):
    """Retrieve verses by merging BM25 and FAISS candidates, ranked by embedding similarity.

    Args:
        query: Query string; it is lower-cased and NLTK-tokenised for BM25 and
            embedded with ``embed_model`` for the vector search.
        k_bm25: Number of candidates taken from BM25.
        k_faiss: Number of candidates requested from the FAISS index.
        top_k: Number of results returned after re-scoring.
        rerank: Currently unused; reserved for an optional cross-encoder rerank.

    Returns:
        A list of dicts with keys ``idx`` (row position), ``score`` (dot product
        between the row embedding and the normalised query embedding), ``verse``
        and ``text``, best first. Empty when there are no candidates.
    """
    # Lexical candidates: BM25 ranks row positions directly.
    q_tok = nltk.word_tokenize(query.lower())
    bm25_top = bm25.get_top_n(q_tok, list(range(len(df))), n=k_bm25)

    # Vector candidates.
    q_emb = embed_model.encode(query, convert_to_numpy=True)
    q_norm = np.linalg.norm(q_emb)
    if q_norm > 0:
        # Guard against dividing by zero for an all-zero embedding.
        q_emb = q_emb / q_norm
    _, neighbour_ids = index.search(np.array([q_emb.astype('float32')]), k_faiss)
    # FAISS pads with -1 when it holds fewer than k_faiss vectors; those are not rows.
    faiss_ids = [int(i) for i in neighbour_ids[0].tolist() if i >= 0]

    # Merge, keeping first-seen order and dropping duplicates.
    candidates = list(dict.fromkeys([int(i) for i in bm25_top] + faiss_ids))
    if not candidates:
        return []

    # Re-score every candidate by dot product against the query embedding.
    sims = (embs[candidates] @ q_emb).reshape(-1)
    ranked = sorted(zip(candidates, sims), key=lambda pair: pair[1], reverse=True)[:top_k]

    # Candidates are row positions (they index `embs`), so look rows up positionally.
    verse_col = df['verse_number']
    text_col = df['combined_en']
    results = [
        {"idx": i, "score": float(s), "verse": verse_col.iloc[i], "text": text_col.iloc[i]}
        for i, s in ranked
    ]
    # optionally cross-encoder rerank here
    return results

# Smoke test: top three hits for a sample question
sample_hits = hybrid_retrieve("What is karma?", top_k=3)
print(sample_hits)


In [None]:
# Use multilingual embeddings + BERTopic
from bertopic import BERTopic
# random_state pins UMAP's stochastic projection so topic assignments are reproducible across runs.
umap_model = umap.UMAP(n_neighbors=15, n_components=5, metric='cosine', random_state=42)
cluster_model = hdbscan.HDBSCAN(min_cluster_size=5, metric='euclidean', cluster_selection_method='eom', prediction_data=True)

topic_model = BERTopic(umap_model=umap_model, hdbscan_model=cluster_model, embedding_model=embed_model, calculate_probabilities=False)
# fit_transform returns one topic id per input document; the ids are stored on the frame.
topics, probs = topic_model.fit_transform(df['combined_en'].tolist())
df['topic'] = topics
topic_model.get_topic_info().head()


In [None]:
# Optional summariser built with the HuggingFace transformers pipeline API
from transformers import pipeline

# Swap in a larger checkpoint here if needed.
sum_model = pipeline(
    "summarization",
    model="sshleifer/distilbart-cnn-12-6",
)
def summarize_text(t, max_len=120):
    """Summarise ``t`` with ``sum_model``, falling back to a plain truncation.

    Args:
        t: Text to summarise.
        max_len: Value passed to the summariser as ``max_length``.

    Returns:
        The model's ``summary_text``, or the first 250 characters of ``t`` when
        the input is blank or the summariser raises.
    """
    if not t.strip():
        # Nothing to summarise; skip the model call.
        return t[:250]
    try:
        # Keep min_length <= max_length even when the caller asks for a short summary.
        res = sum_model(t, max_length=max_len, min_length=min(30, max_len), do_sample=False)
        return res[0]['summary_text']
    except Exception:
        # Best-effort: a truncated copy of the input stands in for the summary.
        return t[:250]


In [None]:
# Fall back to a placeholder key so the cell still runs, but say so explicitly:
# with the placeholder, LLM calls will presumably be rejected by the API.
if "OPENAI_API_KEY" not in os.environ:
    print("WARNING: OPENAI_API_KEY is not set; using the placeholder 'YOUR_KEY'.")
os.environ["OPENAI_API_KEY"] = os.getenv("OPENAI_API_KEY","YOUR_KEY")
llm = OpenAI(temperature=0.2, max_tokens=300)
hf_emb = HuggingFaceEmbeddings(model_name=EMB_MODEL_NAME)
# LangChain's Chroma constructor takes the embedder as `embedding_function`
# (`embedding` is the keyword of the from_texts/from_documents factories).
chroma_store = Chroma(collection_name="verses", persist_directory=str(ARTIFACT_DIR/"chroma"), embedding_function=hf_emb)
retriever = chroma_store.as_retriever(search_kwargs={"k":4})

# return_messages=True hands the chain a list of messages; with a plain string history
# ConversationalRetrievalChain cannot parse the buffer once it is non-empty.
memory = ConversationBufferMemory(memory_key="chat_history", return_messages=True)
prompt_template = """
You are Divyavaani AI — a careful, scripture-based assistant.
Use only the CONTEXT and CHAT HISTORY to answer. If insufficient, say "I don't know".

CHAT HISTORY:
{chat_history}

CONTEXT:
{context}

QUESTION:
{question}
"""
PROMPT = PromptTemplate(input_variables=["chat_history","context","question"], template=prompt_template)
# The custom prompt is applied to the answer-generation (combine-docs) step of the chain.
rag_chain = ConversationalRetrievalChain.from_llm(llm, retriever, memory=memory, combine_docs_chain_kwargs={"prompt":PROMPT})


In [None]:
# Short-term: conversation buffer via LangChain memory
# Long-term: persist JSON + add to vector DB as user notes (learned knowledge)

LONGTERM_PATH = ARTIFACT_DIR/"longterm_memory.json"
if LONGTERM_PATH.exists():
    # Context manager closes the file handle deterministically.
    with open(LONGTERM_PATH, encoding="utf-8") as fh:
        longterm = json.load(fh)
else:
    longterm = {}

def persist_memory(user, q, a):
    """Record one question/answer pair for ``user`` on disk and in the vector store.

    Args:
        user: Key under which the exchange is stored in ``longterm``.
        q: The question text.
        a: The answer text.

    Side effects:
        Appends to the module-level ``longterm`` dict, rewrites ``LONGTERM_PATH``
        as JSON, and adds a "UserNote" text to ``chroma_store``.
    """
    longterm.setdefault(user,[]).append({"q":q,"a":a})
    # The with-block flushes and closes the file even if serialisation fails midway.
    with open(LONGTERM_PATH, "w", encoding="utf-8") as fh:
        json.dump(longterm, fh, indent=2)
    # optionally add to Chroma as user_note
    user_doc = f"UserNote: Q:{q} A:{a}"
    new_id = f"user_{user}_{len(longterm[user])}"
    # add_texts accepts raw strings; add_documents expects Document objects.
    chroma_store.add_texts([user_doc], metadatas=[{"source":"user_note","user":user}], ids=[new_id])


In [None]:
# Language detection and machine translation used by the chat helpers in this cell
from langdetect import detect
from googletrans import Translator
# One shared Translator instance, used by to_en() and to_hi()
translator = Translator()

def to_en(text):
    """Translate ``text`` to English, returning it unchanged if translation fails."""
    try:
        return translator.translate(text, dest='en').text
    except Exception:
        # Best-effort: fall back to the untranslated text.
        # Catching Exception (not a bare except) lets KeyboardInterrupt/SystemExit through.
        return text

def to_hi(text):
    """Translate ``text`` to Hindi, returning it unchanged if translation fails."""
    try:
        return translator.translate(text, dest='hi').text
    except Exception:
        # Best-effort: fall back to the untranslated text.
        # Catching Exception (not a bare except) lets KeyboardInterrupt/SystemExit through.
        return text

def ask(user, text):
    """Answer ``text`` for ``user`` through the RAG chain and record the exchange.

    Steps: detect the language (defaulting to 'en' when detection fails),
    translate non-English questions to English, collect source verses with
    ``hybrid_retrieve``, query ``rag_chain``, translate the answer back for
    Hindi questions, and persist the exchange via ``persist_memory``.

    Args:
        user: Identifier passed through to ``persist_memory``.
        text: The user's question in its original language.

    Returns:
        ``{"answer": <answer text>, "sources": [<verse>, ...]}``.
    """
    # detect language
    try:
        lang = detect(text)
    except Exception:
        # Detection is best-effort; Exception (not a bare except) keeps Ctrl-C working.
        lang = 'en'
    q_en = text if lang == 'en' else to_en(text)
    # retrieve contexts
    # NOTE(review): these hits are only used for the returned `sources`; rag_chain
    # runs its own retriever, so the cited verses may differ from the context the
    # answer was generated from.
    contexts = hybrid_retrieve(q_en, top_k=6)
    # use RAG chain with memory
    out = rag_chain({"question": q_en})
    answer = out.get('answer') if isinstance(out, dict) else out
    # optionally translate back
    answer_out = to_hi(answer) if lang == 'hi' else answer
    # learning: persist
    persist_memory(user, text, answer_out)
    # return sources
    srcs = [c['verse'] for c in contexts]
    return {"answer":answer_out, "sources":srcs}


In [None]:
# Retrieval recall@k (needs a small hand-labelled evaluation set)
def recall_at_k(query, true_idx_list, k=5):
    """Fraction of ``true_idx_list`` found among the top-``k`` hybrid_retrieve hits.

    An empty ``true_idx_list`` gives 0.0 (the denominator is floored at 1).
    """
    retrieved_ids = []
    for hit in hybrid_retrieve(query, top_k=k):
        retrieved_ids.append(hit['idx'])
    found = len([true_idx for true_idx in true_idx_list if true_idx in retrieved_ids])
    denominator = len(true_idx_list) or 1
    return found / denominator

# Generation metrics: BLEU and ROUGE (embedding similarity is not computed by gen_metrics)
# Load the ROUGE metric through the `evaluate` library; gen_metrics() calls rouge.compute()
rouge = evaluate.load("rouge")
def gen_metrics(pred, ref):
    """Score a generated answer against one reference.

    Returns a dict holding ``"bleu"`` (sentence-level BLEU on whitespace tokens)
    followed by every key that ``rouge.compute`` reports.
    """
    reference_tokens = ref.split()
    candidate_tokens = pred.split()
    metrics = {"bleu": sentence_bleu([reference_tokens], candidate_tokens)}
    metrics.update(rouge.compute(predictions=[pred], references=[ref]))
    return metrics


In [None]:
# paraphrase via sentence-transformers backtranslation or small models
from transformers import pipeline
# example using a paraphrase model (optional heavy)
# para = pipeline("text2text-generation", model="Vamsi/T5_Paraphrase_Paws")
def paraphrase(text):
    """Placeholder paraphraser: hands ``text`` back unchanged.

    Swap the body for a paraphrase model or an online API when one is wired in.
    """
    return text


In [None]:
# Shared EasyOCR reader for English and Hindi, used by ocr_image()
reader = easyocr.Reader(['en','hi'])  # supports Hindi Devanagari partially
def ocr_image(path):
    """Run OCR on the image at ``path`` and return the recognised text as one string.

    The fragments returned by ``reader.readtext(..., detail=0)`` are joined with
    single spaces.
    """
    fragments = reader.readtext(path, detail=0)
    joined = " ".join(fragments)
    return joined


In [None]:
# Write the cleaned frame, the embeddings and the FAISS index to the artifact directory
verses_path = ARTIFACT_DIR / 'verses.parquet'
embeddings_path = ARTIFACT_DIR / 'embeddings.npy'
faiss_path = ARTIFACT_DIR / 'faiss.index'
df.to_parquet(verses_path, index=False)
np.save(embeddings_path, embs)
# faiss.write_index is given the path as a plain string
faiss.write_index(index, str(faiss_path))
print("Saved artifacts:", ARTIFACT_DIR)

# FastAPI minimal snippet (save as api.py later)
# The snippet is kept as a string and only printed here; nothing in this cell starts a server.
# Its /query handler is a stub that returns a fixed demo payload instead of calling ask().
fastapi_snippet = '''
from fastapi import FastAPI
from pydantic import BaseModel
app = FastAPI()
class QA(BaseModel):
    user:str; question:str
@app.post("/query")
def query(q:QA):
    # load models & call ask(user, question)
    return {"answer":"(demo)","sources":[]}
'''
# Show at most the first 400 characters of the snippet
print(fastapi_snippet[:400])


1) Use managed vector DB for scale: Pinecone / Weaviate / Chroma Cloud.
2) Use instruction-tuned LLMs (e.g. OpenAI GPT-4o / GPT-4o Realtime, or Anthropic models) with system prompts to reduce hallucination.
3) Cache LLM responses & rate-limit; persist long-term memory in secure DB.
4) For Sanskrit-specific tasks, create domain lexicons and manually-curated mapping for sensitive verses.
5) Logging & human-in-loop review for high-sensitivity outputs (religious interpretation).
