<a href="https://colab.research.google.com/github/abc148284-kjspk/gita/blob/main/Copy_of_ML.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# !pip install --upgrade sentence-transformers transformers

# import numpy as np
import pandas as pd
# import matplotlib.pyplot as plt
# import seaborn as sns
import warnings
import nltk
import re

from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

from sentence_transformers import SentenceTransformer
from sklearn.feature_extraction.text import TfidfVectorizer

warnings.filterwarnings("ignore")

In [None]:
# Download the NLTK resources the later cells read. nltk.download() skips
# packages that are already present and up to date, so re-running this cell
# is cheap and the notebook keeps working after a runtime restart.
#   - punkt / punkt_tab : tokenizer data loaded by word_tokenize()
#   - stopwords         : corpus read by stopwords.words('english')
#   - wordnet / omw-1.4 : data WordNetLemmatizer needs once lemmatize() is called
for _nltk_resource in ("punkt", "stopwords", "punkt_tab", "wordnet", "omw-1.4"):
    nltk.download(_nltk_resource, quiet=True)

In [None]:
# ==============================
# 3. Load Dataset (from Google Drive)
# ==============================

# Mount Google Drive into the Colab runtime so the CSV below is reachable.
from google.colab import drive
drive.mount('/content/drive')

# Location of the verse dataset on the mounted drive.
DATA_PATH = "/content/drive/MyDrive/manuscripts/geeta_dataset.csv"

data = pd.read_csv(DATA_PATH)

# Print (rows, columns) as a quick check that the load worked.
print(f"Dataset shape: {data.shape}")

Mounted at /content/drive
Dataset shape: (701, 6)


In [None]:
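# TODO: notes for Hindi text support (nothing below is wired in yet).
# Library to evaluate: indic-nlp-library
#   !pip install indic-nlp-library
# Steps to cover for Hindi:
#   - tokenization
#   - normalization
#   - stemming
#   - stopwords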

In [None]:
# ==============================
# 4. Text Preprocessing
# ==============================

# WordNet lemmatizer instance.
lemmatizer = WordNetLemmatizer()

# English stop words from the NLTK corpus, held in a set.
stop_words = {*stopwords.words('english')}

def preprocess(text):
    """Lowercase ``text`` and re-join its NLTK word tokens with single spaces.

    Args:
        text: The string to normalise, or a missing value.

    Returns:
        str: The space-joined tokens of the lowercased text, or ``""`` when
        ``pd.isna(text)`` is true.

    Punctuation stripping, stop-word removal and lemmatization are not
    applied by this function.
    """
    if not pd.isna(text):
        lowered = text.lower()
        return " ".join(word_tokenize(lowered))
    return ""


# Lowercased English text with missing values replaced by "" (this is the
# column that gets embedded further down).
data['clean_english'] = data['english'].fillna("").str.lower()

# Lowercased English text with missing values left as NaN (this is the
# column the exact-match lookup filters on).
data['english_lower'] = data['english'].str.lower()

print("Null values:", data['clean_english'].isnull().sum())

# Keep the preview as the LAST expression of the cell: a bare expression in
# the middle of a cell is evaluated and thrown away, so it never renders.
data[['english', 'clean_english']].head()


Null values: 0


In [None]:
# ==============================
# 5. Sentence Embeddings
# ==============================

# Sentence-embedding model, loaded by its model name.
model = SentenceTransformer('all-MiniLM-L6-v2')

# Encode every entry of data['clean_english'] in a single call.
embeddings = model.encode(data['clean_english'].tolist(), show_progress_bar=True)

print(f"Embedding shape: {embeddings.shape}")


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]



sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

Loading weights:   0%|          | 0/103 [00:00<?, ?it/s]

[1mBertModel LOAD REPORT[0m from: sentence-transformers/all-MiniLM-L6-v2
Key                     | Status     |  | 
------------------------+------------+--+-
embeddings.position_ids | UNEXPECTED |  | 

[3mNotes:
- UNEXPECTED[3m	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.[0m


tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Batches:   0%|          | 0/22 [00:00<?, ?it/s]

Embedding shape: (701, 384)


In [None]:
# ==============================
# 6. Semantic Search Function
# ==============================
from sklearn.metrics.pairwise import cosine_similarity

def philosophical_query(query):
    """Wrap ``query`` in a fixed Bhagavad Gita instruction template.

    Args:
        query: The user's question; it is formatted into the template as-is.

    Returns:
        str: A multi-line string that starts with a newline, carries the two
        instruction lines and the query (each indented by four spaces), and
        ends with a newline followed by four spaces.
    """
    pad = "    "
    template_lines = [
        "",
        pad + "Explain the philosophical meaning of this question",
        pad + "according to the Bhagavad Gita:",
        pad + f"{query}",
        pad,
    ]
    return "\n".join(template_lines)

def search(query, top_k=5):
    """Return the verses whose embeddings are most similar to ``query``.

    The query is wrapped by ``philosophical_query``, passed through
    ``preprocess``, encoded with ``model`` and scored against the
    precomputed ``embeddings`` with cosine similarity.

    Args:
        query: Free-text question from the user.
        top_k: Maximum number of verses to return. Values ``<= 0`` return an
            empty list.

    Returns:
        list: Values of ``data['english']`` for the best-scoring rows, most
        similar first. Contains at most ``top_k`` items.
    """
    # Guard first: with the old slice ``argsort()[-top_k:]`` a top_k of 0
    # became ``[0:]`` and returned every verse instead of none.
    if top_k <= 0:
        return []

    tuned_query = philosophical_query(query)
    query_emb = model.encode([preprocess(tuned_query)])

    # cosine_similarity returns one row per query; there is a single query.
    scores = cosine_similarity(query_emb, embeddings)[0]

    # Reverse the ascending argsort to get best-first, then truncate.
    top_indices = scores.argsort()[::-1][:top_k]

    return [data.iloc[idx]['english'] for idx in top_indices]


In [None]:
def detect_intent(query):
    """Classify a user query as ``"concept"``, ``"shloka"`` or ``"guidance"``.

    Rules, applied in order on the lowercased, stripped query:
      1. ``"concept"``  - it contains one of the cues "what is", "who is",
         "define", "meaning" or "explain".
      2. ``"shloka"``   - it has more than two whitespace-separated words and
         contains none of "what", "who", "how", "why"; such input is treated
         as a quoted verse fragment.
      3. ``"guidance"`` - everything else.

    All cue checks are plain substring tests.

    Args:
        query: Raw text typed by the user.

    Returns:
        str: One of ``"concept"``, ``"shloka"``, ``"guidance"``.
    """
    q = query.lower().strip()

    # Concept cues are checked before the shloka heuristic. In the opposite
    # order, any query longer than two words that used "define", "meaning" or
    # "explain" (e.g. "explain karma yoga") was classified as a shloka and
    # could never reach this branch.
    if any(cue in q for cue in ("what is", "who is", "define", "meaning", "explain")):
        return "concept"

    # Verse-like input: several words and no question word anywhere in it.
    has_question_word = any(word in q for word in ("what", "who", "how", "why"))
    if len(q.split()) > 2 and not has_question_word:
        return "shloka"

    return "guidance"



In [None]:
def exact_shloka_search(query):
    """Return the first verse whose lowercased text contains ``query``.

    The query is lowercased and stripped, then looked up as a literal
    substring of ``data['english_lower']``.

    Args:
        query: Text fragment typed by the user.

    Returns:
        The ``english`` value of the first matching row, or ``None`` when the
        query is blank or nothing matches.
    """
    q = query.lower().strip()

    # A blank needle is a substring of every string, so it would "match" the
    # first row; treat it as no match instead.
    if not q:
        return None

    # regex=False: str.contains() interprets the pattern as a regular
    # expression by default, so user text such as "a.c" would match "abc"
    # and an unbalanced "[" or "(" would raise re.error.
    # na=False: rows with a missing value count as non-matching.
    is_match = data['english_lower'].str.contains(q, na=False, regex=False)
    matches = data[is_match]

    if len(matches) > 0:
        return matches.iloc[0]['english']

    return None


In [None]:
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer

# flan-t5 is an encoder-decoder (T5ForConditionalGeneration) model. The
# "text-generation" pipeline only supports causal language models: loading
# T5 through it logs "not supported for text-generation" and the returned
# 'generated_text' is just the prompt echoed back. The model is therefore
# loaded and driven directly through its seq2seq class.
_seq2seq_tokenizer = AutoTokenizer.from_pretrained("google/flan-t5-base")
_seq2seq_model = AutoModelForSeq2SeqLM.from_pretrained("google/flan-t5-base")


def generator(prompt, max_new_tokens=250, do_sample=False, **generate_kwargs):
    """Generate text for ``prompt`` with flan-t5-base.

    Keeps the call/return shape the rest of the notebook uses:
    ``generator(prompt, do_sample=False)[0]['generated_text']``.

    Args:
        prompt: Input text for the model.
        max_new_tokens: Upper bound on the number of generated tokens.
        do_sample: ``False`` for greedy decoding, ``True`` to sample.
        **generate_kwargs: Extra keyword arguments forwarded to
            ``model.generate``.

    Returns:
        list: A one-element list holding ``{'generated_text': <str>}``.
    """
    inputs = _seq2seq_tokenizer(prompt, return_tensors="pt")
    output_ids = _seq2seq_model.generate(
        **inputs,
        max_new_tokens=max_new_tokens,
        do_sample=do_sample,
        **generate_kwargs,
    )
    # For an encoder-decoder model the output ids hold only the generated
    # answer, not the prompt.
    text = _seq2seq_tokenizer.decode(output_ids[0], skip_special_tokens=True)
    return [{"generated_text": text}]


config.json: 0.00B [00:00, ?B/s]

model.safetensors:   0%|          | 0.00/990M [00:00<?, ?B/s]

Loading weights:   0%|          | 0/282 [00:00<?, ?it/s]



generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

tokenizer_config.json: 0.00B [00:00, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json: 0.00B [00:00, ?B/s]

The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
Passing `generation_config` together with generation-related arguments=({'max_new_tokens', 'do_sample'}) is deprecated and will be removed in future versions. Please pass either a `generation_config` object OR all generation parameters explicitly, but not both.
The model 'T5ForConditionalGeneration' is not supported for text-generation. Supported models are ['PeftModelForCausalLM', 'AfmoeForCausalLM', 'ApertusForCausalLM', 'ArceeForCausalLM', 'AriaTextForCausalLM', 'BambaForCausalLM', 'BartForCausalLM', 'BertLMHeadModel', 'BertGenerationDecoder', 'BigBirdForCausalLM', 'BigBirdPegasusForCausalLM', 'BioGptForCausalLM', 'BitNetForCausalLM', 'BlenderbotForCausalLM', 'BlenderbotSmallForCausalLM', 'BloomForCausalLM', 'BltForCausalLM', 'CamembertForCausalLM', 'LlamaForCausalLM', 'CodeGenForCausalLM', 'CohereForCausalLM', 'Cohere2ForCausalLM', 'CpmAntForCausalLM

In [None]:
# results = search("yada yada hi dharmasya", top_k=3)

# for r in results:
#     print("\nScore:", r['score'])
#     print("Verse:", r['verse'])


In [None]:
# ==============================
# 7. Interactive Gita Chatbot
# ==============================

def chat():
    """Run the interactive question/answer loop until the user types 'exit'.

    For each line read from ``input()``:
      * ``exit`` (any case) prints a farewell and ends the loop;
      * blank input is ignored;
      * a query classified as ``"shloka"`` is looked up literally with
        ``exact_shloka_search``; a hit is printed and the turn ends;
      * otherwise (including a shloka lookup that found nothing) the five
        closest verses from ``search`` are placed in a prompt, ``generator``
        produces an explanation, and the explanation plus the first three
        verses are printed.

    Returns:
        None
    """
    print("📖 Gita AI (Explanation + Shloka Reference)")
    print("Type 'exit' to stop\n")

    while True:
        query = input("You: ").strip()

        if query.lower() == "exit":
            print("\n🙏 Namaste")
            break

        # Nothing was typed: do not spend a retrieval + generation pass on it.
        if not query:
            continue

        intent = detect_intent(query)

        # 🔥 Exact shloka handling
        if intent == "shloka":
            verse = exact_shloka_search(query)
            if verse:
                print("\n📜 Shloka:\n")
                print(verse)
                print("-" * 60)
                continue
            # detect_intent labels any 3+ word input without a question word
            # as a shloka, so ordinary statements land here as well. Instead
            # of ending the turn with nothing, fall through to semantic
            # search and answer with the guidance prompt below.
            print("\n⚠️ Shloka not found exactly.")
            print("Showing the closest verses instead.")

        verses = search(query, top_k=5)
        verses_text = "\n\n".join(verses)

        if intent == "concept":
            prompt = f"""
            Explain the concept "{query}" according to the Bhagavad Gita
            in simple and clear words.

            Then list the supporting shlokas separately.

            Shlokas:
            {verses_text}
            """
        else:
            prompt = f"""
            A person asks: "{query}"

            Using Bhagavad Gita philosophy,
            explain how one should deal with this situation.

            Then list the relevant shlokas.

            Shlokas:
            {verses_text}
            """

        explanation = generator(prompt, do_sample=False)[0]['generated_text']

        print("\n🪔 Explanation:\n")
        print(explanation.strip())

        # Only the three best matches are shown, although five went into the prompt.
        print("\n📜 Supporting Shlokas:\n")
        for v in verses[:3]:
            print(v, "\n")

        print("-" * 60)

# Start the chatbot. This call blocks on input() until the user types 'exit'.
chat()

📖 Gita AI (Explanation + Shloka Reference)
Type 'exit' to stop

You: what is liberation


Both `max_new_tokens` (=250) and `max_length`(=20) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)



🪔 Explanation:

Explain the concept "what is liberation" according to the Bhagavad Gita
            in simple and clear words.

            Then list the supporting shlokas separately.

            Shlokas:
            Thus, the soul who is devoted to Me, renouncing the fruits of all actions, is freed from bondage to work and attains liberation.

The man who hears the holy Gita with reverence and in an uncarping spirit,—liberated from sin, he too shall reach the happy worlds of the virtuous.

Arjuna, the Vedas thus deal with evolutes of the three Gunas; viz., worldly enjoyments and the means of attaining such enjoyments; be thou indifferent to these enjoyments and their means, rising above pairs of opposites like pleasure and pain etc., established in the Eternal Existence, absolutely unconcerned about the supply of wants and the preservation of what has been already attained, and self-controlled.

Whosoever studies this sacred dialogue of ours in the form of the Gita, by him too shal