## Inspecting the Data

In [1]:
import csv
import pandas as pd

In [2]:
# NOTE(review): absolute, machine-specific path; consider a path relative to a
# configurable data directory so the notebook runs on other machines.
file_path = r"A:\Projects\Sanskrit Text Conversion\Sanskrit-Text-Conversion\Gita-data.csv"

# Print every CSV row as a dict (keys come from the header row) for a quick look
# at the raw data.
# encoding="utf-8": the file holds Devanagari text, and the platform default
# encoding (e.g. cp1252 on Windows) is not guaranteed to decode it.
# newline="": what the csv module asks for, so that newlines embedded inside
# quoted fields are parsed correctly.
with open(file_path, "r", encoding="utf-8", newline="") as f:
    csvFile = csv.DictReader(f)
    for row in csvFile:
        print(row)

{'id': 'c:1v1', 'input': 'धृतराष्ट्र उवाच |धर्मक्षेत्रे कुरुक्षेत्रे समवेता युयुत्सवः |मामकाः पाण्डवाश्चैव किमकुर्वत सञ्जय ||', 'output': 'Dhritarashtra said: O Sanjay, after gathering on the holy field of Kurukshetra, and desiring to fight, what did my sons and the sons of Pandu do?'}
{'id': 'c:1v2', 'input': 'सञ्जय उवाच ।दृष्ट्वा तु पाण्डवानीकं व्यूढं दुर्योधनस्तदा ।आचार्यमुपसङ्गम्य राजा वचनमब्रवीत् ।। ।।', 'output': 'Sanjay said: On observing the Pandava army standing in military formation, King Duryodhan approached his teacher Dronacharya, and said the following words.'}
{'id': 'c:1v3', 'input': 'पश्यैतां पाण्डुपुत्राणामाचार्य महतीं चमूम् ।व्यूढां द्रुपदपुत्रेण तव शिष्येण धीमता ।। ।।', 'output': 'Duryodhan said: Respected teacher! Behold the mighty army of the sons of Pandu, so expertly arrayed for battle by your own gifted disciple, the son of Drupad.'}
{'id': 'c:1v4', 'input': 'अत्र शूरा महेष्वासा भीमार्जुनसमा युधि ', 'output': 'Behold in their ranks are many powerful warriors, l

## Putting the data into the desired input/output form for use in transformer models

In [3]:
# Re-read the CSV as a DataFrame and discard the "id" column in one chained
# expression, then show the first five rows.
csvFile = pd.read_csv(file_path).drop(columns=["id"])
print(csvFile.head())

                                               input  \
0  धृतराष्ट्र उवाच |धर्मक्षेत्रे कुरुक्षेत्रे समव...   
1  सञ्जय उवाच ।दृष्ट्वा तु पाण्डवानीकं व्यूढं दुर...   
2  पश्यैतां पाण्डुपुत्राणामाचार्य महतीं चमूम् ।व्...   
3             अत्र शूरा महेष्वासा भीमार्जुनसमा युधि    
4             अत्र शूरा महेष्वासा भीमार्जुनसमा युधि    

                                              output  
0  Dhritarashtra said: O Sanjay, after gathering ...  
1  Sanjay said: On observing the Pandava army sta...  
2  Duryodhan said: Respected teacher! Behold the ...  
3  Behold in their ranks are many powerful warrio...  
4  Behold in their ranks are many powerful warrio...  


In [4]:
# Read the train parquet file of the VinitT/Sanskrit-Llama dataset via its hf:// URL.
df = pd.read_parquet("hf://datasets/VinitT/Sanskrit-Llama/data/train-00000-of-00001.parquet")

# Remove the "instruction" column, then stack the CSV rows (csvFile) underneath;
# ignore_index=True renumbers the combined rows from 0.
desired_data = pd.concat(
    [df.drop(columns=["instruction"]), csvFile],
    ignore_index=True,
)
print(desired_data.head())

  from .autonotebook import tqdm as notebook_tqdm


                                               input  \
0  ये त्रि॑ष॒प्ताः प॑रि॒यन्ति॒ विश्वा॑ रू॒पाणि॒ ब...   
1  पुन॒रेहि॑ वाचस्पते दे॒वेन॒ मन॑सा स॒ह । वसो॑ष्प...   
2  इ॒हैवाभि वि त॑नू॒भे आर्त्नी॑ इव॒ ज्यया॑ । वा॒च...   
3  उप॑हूतो वा॒चस्पति॒रुपा॒स्मान्वा॒चस्पति॑र्ह्वयत...   
4  वि॒द्मा श॒रस्य॑ पि॒तरं॑ प॒र्जन्यं॒ भूरि॑धायसम्...   

                                              output  
0  The three qualities of Rajogun, Tamogun and Sa...  
1  O Swami of speech Brahma Dev! Come to me with ...  
2  O Swami of speech Brahman! Just as by offering...  
3  We invoke Brahma, the swami of speech. May the...  
4  The root consciousness is the father of the cl...  


In [5]:
# Number of rows in the combined frame (despite the name, this is a single
# integer, not a (rows, columns) tuple).
shape = len(desired_data)
print(shape)

20927


In [6]:
import os
import sys
import transformers
import tensorflow as tf
from datasets import load_dataset
from transformers import AutoTokenizer
from transformers import TFAutoModelForSeq2SeqLM, DataCollatorForSeq2Seq
from transformers import AdamWeightDecay

In [7]:
# Identifier of the pretrained checkpoint; it is passed to
# AutoTokenizer.from_pretrained() below.
# NOTE(review): looks like a Hugging Face Hub repo id ("owner/name") — confirm.
model_checkpoint = "Swamitucats/M2M100_Sanskrit_English"

## Preprocessing the data

In [8]:
# Instantiate the tokenizer associated with the checkpoint named in model_checkpoint.
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

In [None]:
# Length limits for source and target sequences.
# NOTE(review): presumably token counts applied when tokenizing in pre_process — confirm.
max_input_length = max_target_length = 128

# Names of the source-text and target-text fields; they match the "input" and
# "output" columns of the DataFrames built above.
source_lang, target_lang = "input", "output"

def pre_process(example):
    inputs = [ex[source_lang] for ex in example[""]]