### Train, Val and Test split (70:15:15) 

In [1]:
import random

with open('filtered_sanskritdoc.txt', 'r', encoding='utf-8') as f:
    sentences = [line.strip() for line in f]

random.shuffle(sentences)

train_size = int(len(sentences) * 0.7)
val_size = int(len(sentences) * 0.15)

train_set = sentences[:train_size]
val_set = sentences[train_size:train_size + val_size]
test_set = sentences[train_size + val_size:]

print(f"Training set size: {len(train_set)}")
print(f"Validation set size: {len(val_set)}")
print(f"Test set size: {len(test_set)}")

Training set size: 23004
Validation set size: 4929
Test set size: 4931


In [4]:
train_set

['य ऋ॒ज्रा मह्यं॑ माम॒हे स॒ह त्व॒चा हि॑र॒ण्यया॑ ।',
 'अ॒ग्नेर्व॒यं प्र॑थ॒मस्या॒मृता॑नां॒ मना॑महे॒ चारु॑दे॒वस्य॒ नाम॑ ।',
 'स॒तो नू॒नं क॑वयः॒ सं शि॑शीत॒ वाशी॑भि॒र्याभि॑र॒मृता॑य॒ तक्ष॑थ ।',
 'अ॒र्चत्र॑यो॒ धुन॑यो॒ न वी॒रा भ्राज॑ज्जन्मानो म॒रुतो॒ अधृ॑ष्टाः ॥ ६.०६६.१०',
 'इन्द्रः॑ पु॒रस्ता॑दु॒त म॑ध्य॒तो नः॒ सखा॒ सखि॑भ्यो॒ वरि॑वः कृणोतु ॥ १०.०४४.११',
 'नभ् । भ्वा० सेट् आ० । णभँ॒ऽ हिं॒साया॑म् [अभा॒वेऽपि॑] १.८५५ ॥',
 'कण्वा॑ उ॒क्थेन॑ वावृधुः ॥ ८.००६.४३',
 'यजा॑महै य॒ज्ञिया॒न्हन्त॑ दे॒वाँ ईळा॑महा॒ ईड्या॒ँ आज्ये॑न ॥ १०.०५३.०२',
 'प्र॒ब्रूते य॒दस्य भ॒वति ।',
 'हृष् । भ्वा० सेट् प० । हृषुँ अली॑के १.८०६ ॥',
 'श्रु॒ष्टी॒वेव॒ प्रेषि॑तो वामबोधि॒ प्रति॒ स्तोमै॒र्जर॑माणो॒ वसि॑ष्ठः ॥ ७.०७३.०३',
 'शु॒भ्रस्त्वमि॑न्द्र वावृधा॒नो अ॒स्मे दासी॒र्विशः॒ सूर्ये॑ण सह्याः ॥ २.०११.०४',
 'अधि॑पत्नी॒ नामा॑सि बृह॒ती दिक्तस्या᳚स्ते॒ बृह॒स्पति॒रधि॑पतिः श्वि॒त्रो र॑क्षि॒ता',
 'त आत्मा॒ सर्वान्तरो॒ । अतोऽन्य॒दा॒र्तम् । त॒तो होषस्त॒श्चा॒क्रायण उ॒परराम ।',
 'सूश्च॑ मे प्र॒सूश्च॑ मे॒ सीरं॑ च मे ल॒यश्च॑',
 'विश्वा॒ यद्रू॒पा प॑

In [5]:
VEDIC_ACCENTS = ['\u0951', '\u0952', '\u0953', '\u0954', '\u1CDA']

def create_pitched_unpitched(sentences, vedic_accents):
    pitched_sentences = []
    unpitched_sentences = []
    for sentence in sentences:
        unpitched_sentence = sentence
        for accent in vedic_accents:
            unpitched_sentence = unpitched_sentence.replace(accent, '')
        pitched_sentences.append(sentence)
        unpitched_sentences.append(unpitched_sentence)
    return pitched_sentences, unpitched_sentences

train_pitched, train_unpitched = create_pitched_unpitched(train_set, VEDIC_ACCENTS)
val_pitched, val_unpitched = create_pitched_unpitched(val_set, VEDIC_ACCENTS)
test_pitched, test_unpitched = create_pitched_unpitched(test_set, VEDIC_ACCENTS)

print("Train Pitched:", train_pitched[0])
print("Train Unpitched:", train_unpitched[0])
print("Validation Pitched:", val_pitched[0])
print("Validation Unpitched:", val_unpitched[0])
print("Test Pitched:", test_pitched[0])
print("Test Unpitched:", test_unpitched[0])

Train Pitched: य ऋ॒ज्रा मह्यं॑ माम॒हे स॒ह त्व॒चा हि॑र॒ण्यया॑ ।
Train Unpitched: य ऋज्रा मह्यं मामहे सह त्वचा हिरण्यया ।
Validation Pitched: यथा॑ चिद्वृ॒द्धम॑त॒समग्ने॑ सं॒जूर्व॑सि॒ क्षमि॑ ।
Validation Unpitched: यथा चिद्वृद्धमतसमग्ने संजूर्वसि क्षमि ।
Test Pitched: अग्ने॑ वी॒हि पु॑रो॒ळाश॒माहु॑तं ति॒रोअ॑ह्न्यम् ।
Test Unpitched: अग्ने वीहि पुरोळाशमाहुतं तिरोअह्न्यम् ।


In [6]:
print("Plain Text", "\nPitched Text\n")
for i in range(5):
    print(train_unpitched[i])
    print(train_pitched[i])
    print("-"*20)

Plain Text 
Pitched Text

य ऋज्रा मह्यं मामहे सह त्वचा हिरण्यया ।
य ऋ॒ज्रा मह्यं॑ माम॒हे स॒ह त्व॒चा हि॑र॒ण्यया॑ ।
--------------------
अग्नेर्वयं प्रथमस्यामृतानां मनामहे चारुदेवस्य नाम ।
अ॒ग्नेर्व॒यं प्र॑थ॒मस्या॒मृता॑नां॒ मना॑महे॒ चारु॑दे॒वस्य॒ नाम॑ ।
--------------------
सतो नूनं कवयः सं शिशीत वाशीभिर्याभिरमृताय तक्षथ ।
स॒तो नू॒नं क॑वयः॒ सं शि॑शीत॒ वाशी॑भि॒र्याभि॑र॒मृता॑य॒ तक्ष॑थ ।
--------------------
अर्चत्रयो धुनयो न वीरा भ्राजज्जन्मानो मरुतो अधृष्टाः ॥ ६.०६६.१०
अ॒र्चत्र॑यो॒ धुन॑यो॒ न वी॒रा भ्राज॑ज्जन्मानो म॒रुतो॒ अधृ॑ष्टाः ॥ ६.०६६.१०
--------------------
इन्द्रः पुरस्तादुत मध्यतो नः सखा सखिभ्यो वरिवः कृणोतु ॥ १०.०४४.११
इन्द्रः॑ पु॒रस्ता॑दु॒त म॑ध्य॒तो नः॒ सखा॒ सखि॑भ्यो॒ वरि॑वः कृणोतु ॥ १०.०४४.११
--------------------


In [7]:
print(sentences[30965])

प॒दा प॒णीँर॑रा॒धसो॒ नि बा॑धस्व म॒हाँ अ॑सि ।


In [8]:
sample_case=[sentences[30965]]
x,y=create_pitched_unpitched(sample_case, VEDIC_ACCENTS)
print(y, len(y[0]))
print(x, len(x[0]))

['पदा पणीँरराधसो नि बाधस्व महाँ असि ।'] 35
['प॒दा प॒णीँर॑रा॒धसो॒ नि बा॑धस्व म॒हाँ अ॑सि ।'] 43


In [9]:
print(f"Above line have {len(x[0]) -len(y[0])} vedic matras")

Above line have 8 vedic matras


## Create train-val-test dataset files

In [10]:
import random

# Set a fixed random seed for reproducibility
random.seed(42)

# Load sentences
with open('filtered_sanskritdoc.txt', 'r', encoding='utf-8') as f:
    sentences = [line.strip() for line in f if line.strip()]

# Shuffle once (stable since we fixed the seed)
random.shuffle(sentences)

# Define split sizes
train_size = int(len(sentences) * 0.7)
val_size = int(len(sentences) * 0.15)

# Create splits
train_set = sentences[:train_size]
val_set = sentences[train_size:train_size + val_size]
test_set = sentences[train_size + val_size:]

# Save splits to files
with open('train.txt', 'w', encoding='utf-8') as f:
    f.write("\n".join(train_set))

with open('val.txt', 'w', encoding='utf-8') as f:
    f.write("\n".join(val_set))

with open('test.txt', 'w', encoding='utf-8') as f:
    f.write("\n".join(test_set))

# Print stats
print(f"Training set size: {len(train_set)}")
print(f"Validation set size: {len(val_set)}")
print(f"Test set size: {len(test_set)}")


Training set size: 23004
Validation set size: 4929
Test set size: 4931
