In [3]:
import pandas as pd
import numpy as np
from transformers import DistilBertTokenizer, DistilBertForQuestionAnswering, Trainer, TrainingArguments
from datasets import Dataset
import torch
import os

# Step 1: Load and Preprocess the Dataset
def load_dataset(file_path):
    if not os.path.exists(file_path):
        raise FileNotFoundError(f"Dataset file not found at: {file_path}")
    df = pd.read_csv(file_path)
    # Assume columns: chapter, verse, text, translation (adjust based on your dataset)
    expected_columns = ['chapter', 'verse', 'text', 'translation']
    if not all(col in df.columns for col in expected_columns):
        print(f"Warning: Expected columns {expected_columns}, but found {df.columns}")
        # Combine available text columns for context
        df['context'] = df.apply(lambda row: ' '.join([str(row[col]) for col in df.columns if col in ['text', 'translation', 'commentary']]), axis=1)
    else:
        df['context'] = df['text'] + " " + df['translation']
    return df

# Step 2: Format Data for Question-Answering
def create_qa_pairs(df):
    qa_data = []
    for idx, row in df.iterrows():
        context = row['context']
        chapter = row['chapter']
        verse = row['verse']
        # Generate synthetic QA pairs
        questions = [
            f"What does verse {verse} of chapter {chapter} say?",
            f"What is the teaching in chapter {chapter} verse {verse}?",
            "What is the meaning of this verse?"
        ]
        for question in questions:
            qa_data.append({
                'question': question,
                'context': context,
                'answer': row.get('translation', context),  # Use translation or context as answer
                'start_position': 0,  # Simplified; adjust for precise answer span
                'end_position': len(row.get('translation', context))
            })
    return qa_data

# Step 3: Prepare Dataset for Hugging Face
def prepare_hf_dataset(qa_data):
    dataset = Dataset.from_list(qa_data)
    tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')

    def tokenize_function(examples):
        encodings = tokenizer(
            examples['question'],
            examples['context'],
            truncation=True,
            padding='max_length',
            max_length=512,
            return_offsets_mapping=True
        )
        encodings['start_positions'] = examples['start_position']
        encodings['end_positions'] = examples['end_position']
        return encodings

    tokenized_dataset = dataset.map(tokenize_function, batched=True)
    return tokenized_dataset, tokenizer

# Step 4: Fine-Tune the Model
def train_model(tokenized_dataset):
    model = DistilBertForQuestionAnswering.from_pretrained('distilbert-base-uncased')
    
    training_args = TrainingArguments(
        output_dir='./gita_qa_model',
        num_train_epochs=3,
        per_device_train_batch_size=8,
        per_device_eval_batch_size=8,
        warmup_steps=500,
        weight_decay=0.01,
        logging_dir='./logs',
        logging_steps=10,
        save_strategy="epoch",
    )
    
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=tokenized_dataset,
    )
    
    trainer.train()
    model.save_pretrained('./gita_qa_model')
    tokenizer.save_pretrained('./gita_qa_model')
    return model

# Step 5: Inference Function
def answer_question(question, context, model, tokenizer):
    inputs = tokenizer(question, context, return_tensors='pt', truncation=True, max_length=512)
    with torch.no_grad():
        outputs = model(**inputs)
    start_scores = outputs.start_logits
    end_scores = outputs.end_logits
    start_idx = torch.argmax(start_scores)
    end_idx = torch.argmax(end_scores) + 1
    answer_tokens = inputs['input_ids'][0][start_idx:end_idx]
    answer = tokenizer.decode(answer_tokens)
    return answer

# Step 6: Main Function to Run the System
def main():
    # Path to your dataset
    file_path = '/Users/bodapati/Downloads/Bhagwad_Gita.csv'
    try:
        df = load_dataset(file_path)
    except Exception as e:
        print(f"Error loading dataset: {e}")
        return
    
    print("Dataset loaded successfully. Sample:")
    print(df.head())
    
    qa_data = create_qa_pairs(df)
    tokenized_dataset, tokenizer = prepare_hf_dataset(qa_data)
    model = train_model(tokenized_dataset)
    
    # Interactive QA loop
    print("\nBhagavad Gita QA System. Type 'exit' to quit.")
    context = " ".join(df['context'].tolist())  # Combine all verses for general context
    while True:
        question = input("Ask a question about the Bhagavad Gita: ")
        if question.lower() == 'exit':
            break
        answer = answer_question(question, context, model, tokenizer)
        print(f"Answer: {answer}\n")

if __name__ == "__main__":
    main()

       'EngMeaning', 'WordMeaning'],
      dtype='object')
Dataset loaded successfully. Sample:
      ID  Chapter  Verse                                             Shloka  \
0  BG1.1        1      1  धृतराष्ट्र उवाच |\nधर्मक्षेत्रे कुरुक्षेत्रे स...   
1  BG1.2        1      2  सञ्जय उवाच |\nदृष्ट्वा तु पाण्डवानीकं व्यूढं द...   
2  BG1.3        1      3  पश्यैतां पाण्डुपुत्राणामाचार्य महतीं चमूम् |\n...   
3  BG1.4        1      4  अत्र शूरा महेष्वासा भीमार्जुनसमा युधि |\nयुयुध...   
4  BG1.5        1      5  धृष्टकेतुश्चेकितानः काशिराजश्च वीर्यवान् |\nपु...   

                                     Transliteration  \
0  dhṛtarāṣṭra uvāca .\ndharmakṣetre kurukṣetre s...   
1  sañjaya uvāca .\ndṛṣṭvā tu pāṇḍavānīkaṃ vyūḍha...   
2  paśyaitāṃ pāṇḍuputrāṇāmācārya mahatīṃ camūm .\...   
3  atra śūrā maheṣvāsā bhīmārjunasamā yudhi .\nyu...   
4  dhṛṣṭaketuścekitānaḥ kāśirājaśca vīryavān .\np...   

                                          HinMeaning  \
0  ।।1.1।।धृतराष्ट्र ने कहा -- हे सं

KeyError: 'chapter'

In [6]:
import pandas as pd
import numpy as np
from transformers import DistilBertTokenizer, DistilBertForQuestionAnswering, Trainer, TrainingArguments
from datasets import Dataset
import torch
import os

# Step 1: Load and Preprocess the Dataset
def load_dataset(file_path):
    if not os.path.exists(file_path):
        raise FileNotFoundError(f"Dataset file not found at: {file_path}")
    df = pd.read_csv(file_path)
    
    # Print column names for debugging
    print("Dataset columns:", df.columns.tolist())
    
    # Define expected columns based on provided dataset
    expected_columns = ['Chapter', 'Verse', 'Shloka', 'EngMeaning']
    
    # Check if expected columns exist
    if not all(col in df.columns for col in expected_columns):
        print(f"Warning: Expected columns {expected_columns}, but found {df.columns}")
        # Combine available text columns for context
        text_columns = [col for col in df.columns if col in ['Shloka', 'EngMeaning', 'HinMeaning', 'WordMeaning']]
        if not text_columns:
            raise ValueError("No suitable text columns found in dataset for context creation.")
        df['context'] = df.apply(lambda row: ' '.join([str(row[col]) for col in text_columns]), axis=1)
    else:
        # Combine Shloka and EngMeaning for context
        df['context'] = df['Shloka'].astype(str) + " " + df['EngMeaning'].astype(str)
    
    return df

# Step 2: Format Data for Question-Answering
def create_qa_pairs(df):
    qa_data = []
    for idx, row in df.iterrows():
        context = row['context']
        
        # Use Chapter and Verse for question generation
        chapter = row['Chapter']
        verse = row['Verse']
        questions = [
            f"What does verse {verse} of chapter {chapter} say?",
            f"What is the teaching in chapter {chapter} verse {verse}?",
            "What is the meaning of this verse?"
        ]
        
        for question in questions:
            qa_data.append({
                'question': question,
                'context': context,
                'answer': row.get('EngMeaning', context),  # Use EngMeaning as answer, fallback to context
                'start_position': 0,  # Simplified; adjust for precise answer span
                'end_position': len(row.get('EngMeaning', context))
            })
    return qa_data

# Step 3: Prepare Dataset for Hugging Face
def prepare_hf_dataset(qa_data):
    dataset = Dataset.from_list(qa_data)
    tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')

    def tokenize_function(examples):
        encodings = tokenizer(
            examples['question'],
            examples['context'],
            truncation=True,
            padding='max_length',
            max_length=512,
            return_offsets_mapping=True
        )
        encodings['start_positions'] = examples['start_position']
        encodings['end_positions'] = examples['end_position']
        return encodings

    tokenized_dataset = dataset.map(tokenize_function, batched=True)
    return tokenized_dataset, tokenizer

# Step 4: Fine-Tune the Model
def train_model(tokenized_dataset):
    model = DistilBertForQuestionAnswering.from_pretrained('distilbert-base-uncased')
    
    training_args = TrainingArguments(
        output_dir='./gita_qa_model',
        num_train_epochs=3,
        per_device_train_batch_size=8,
        per_device_eval_batch_size=8,
        warmup_steps=500,
        weight_decay=0.01,
        logging_dir='./logs',
        logging_steps=10,
        save_strategy="epoch",
    )
    
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=tokenized_dataset,
    )
    
    trainer.train()
    model.save_pretrained('./gita_qa_model')
    tokenizer.save_pretrained('./gita_qa_model')
    return model, tokenizer

# Step 5: Inference Function
def answer_question(question, context, model, tokenizer):
    inputs = tokenizer(question, context, return_tensors='pt', truncation=True, max_length=512)
    with torch.no_grad():
        outputs = model(**inputs)
    start_scores = outputs.start_logits
    end_scores = outputs.end_logits
    start_idx = torch.argmax(start_scores)
    end_idx = torch.argmax(end_scores) + 1
    answer_tokens = inputs['input_ids'][0][start_idx:end_idx]
    answer = tokenizer.decode(answer_tokens)
    return answer

# Step 6: Main Function to Run the System
def main():
    # Path to your dataset
    file_path = '/Users/bodapati/Downloads/Bhagwad_Gita.csv'
    try:
        df = load_dataset(file_path)
    except Exception as e:
        print(f"Error loading dataset: {e}")
        return
    
    print("Dataset loaded successfully. Sample:")
    print(df.head())
    
    qa_data = create_qa_pairs(df)
    tokenized_dataset, tokenizer = prepare_hf_dataset(qa_data)
    model, tokenizer = train_model(tokenized_dataset)
    
    # Interactive QA loop
    print("\nBhagavad Gita QA System. Type 'exit' to quit.")
    context = " ".join(df['context'].tolist())  # Combine all verses for general context
    while True:
        question = input("Ask a question about the Bhagavad Gita: ")
        if question.lower() == 'exit':
            break
        answer = answer_question(question, context, model, tokenizer)
        print(f"Answer: {answer}\n")

if __name__ == "__main__":
    main()

Dataset columns: ['ID', 'Chapter', 'Verse', 'Shloka', 'Transliteration', 'HinMeaning', 'EngMeaning', 'WordMeaning']
Dataset loaded successfully. Sample:
      ID  Chapter  Verse                                             Shloka  \
0  BG1.1        1      1  धृतराष्ट्र उवाच |\nधर्मक्षेत्रे कुरुक्षेत्रे स...   
1  BG1.2        1      2  सञ्जय उवाच |\nदृष्ट्वा तु पाण्डवानीकं व्यूढं द...   
2  BG1.3        1      3  पश्यैतां पाण्डुपुत्राणामाचार्य महतीं चमूम् |\n...   
3  BG1.4        1      4  अत्र शूरा महेष्वासा भीमार्जुनसमा युधि |\nयुयुध...   
4  BG1.5        1      5  धृष्टकेतुश्चेकितानः काशिराजश्च वीर्यवान् |\nपु...   

                                     Transliteration  \
0  dhṛtarāṣṭra uvāca .\ndharmakṣetre kurukṣetre s...   
1  sañjaya uvāca .\ndṛṣṭvā tu pāṇḍavānīkaṃ vyūḍha...   
2  paśyaitāṃ pāṇḍuputrāṇāmācārya mahatīṃ camūm .\...   
3  atra śūrā maheṣvāsā bhīmārjunasamā yudhi .\nyu...   
4  dhṛṣṭaketuścekitānaḥ kāśirājaśca vīryavān .\np...   

                                   

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

Map:   0%|          | 0/2103 [00:00<?, ? examples/s]

NotImplementedError: return_offset_mapping is not available when using Python tokenizers. To use this feature, change your tokenizer to one deriving from transformers.PreTrainedTokenizerFast.