### Train, validation, and test split (80:10:10)

In [1]:
import random

# Fixed seed so the shuffle -- and therefore split membership and any later
# positional lookup into `sentences` -- is the same on every fresh run.
SPLIT_SEED = 42
TRAIN_FRACTION = 0.8
VAL_FRACTION = 0.1  # whatever is left after train + val becomes the test split

# One sentence per line; surrounding whitespace (including the newline) is dropped.
# NOTE(review): a blank line in the file would be kept as an empty sentence --
# confirm the input file has already been filtered for that.
with open('filtered_sanskritdoc.txt', 'r', encoding='utf-8') as f:
    sentences = [line.strip() for line in f]

# Seed immediately before shuffling so re-running this cell alone is also reproducible.
random.seed(SPLIT_SEED)
random.shuffle(sentences)  # in place: `sentences` itself is now in shuffled order

train_size = int(len(sentences) * TRAIN_FRACTION)
val_size = int(len(sentences) * VAL_FRACTION)

# Contiguous slices of the shuffled list; the test split absorbs rounding leftovers.
train_set = sentences[:train_size]
val_set = sentences[train_size:train_size + val_size]
test_set = sentences[train_size + val_size:]

# The three splits must partition the data: nothing dropped, nothing duplicated.
assert len(train_set) + len(val_set) + len(test_set) == len(sentences)

print(f"Training set size: {len(train_set)}")
print(f"Validation set size: {len(val_set)}")
print(f"Test set size: {len(test_set)}")

Training set size: 26291
Validation set size: 3286
Test set size: 3287


In [2]:
# Accent marks removed from a sentence to obtain its "unpitched" form,
# listed by Unicode code point.
VEDIC_ACCENTS = [
    chr(code_point)
    for code_point in (
        0x0951,  # DEVANAGARI STRESS SIGN UDATTA
        0x0952,  # DEVANAGARI STRESS SIGN ANUDATTA
        0x0953,  # DEVANAGARI GRAVE ACCENT
        0x0954,  # DEVANAGARI ACUTE ACCENT
        0x1CDA,  # VEDIC TONE DOUBLE SVARITA
    )
]

def create_pitched_unpitched(sentences, vedic_accents):
    """Pair each sentence with a copy that has the given accent marks removed.

    Args:
        sentences: Iterable of strings to process.
        vedic_accents: Iterable of substrings to delete from every sentence.

    Returns:
        A tuple ``(pitched_sentences, unpitched_sentences)`` of two new lists
        with the same length and order: the first holds the sentences
        unchanged, the second holds them with every occurrence of each accent
        replaced by the empty string.
    """
    def _without_accents(text):
        # Remove the accents one after another, in the order they were given.
        for mark in vedic_accents:
            text = text.replace(mark, '')
        return text

    pitched_sentences = list(sentences)
    unpitched_sentences = [_without_accents(text) for text in pitched_sentences]
    return pitched_sentences, unpitched_sentences

# For every split, build two parallel lists: the sentences as they are and the
# same sentences with the accents in VEDIC_ACCENTS removed.
train_pitched, train_unpitched = create_pitched_unpitched(train_set, VEDIC_ACCENTS)
val_pitched, val_unpitched = create_pitched_unpitched(val_set, VEDIC_ACCENTS)
test_pitched, test_unpitched = create_pitched_unpitched(test_set, VEDIC_ACCENTS)

# Show the first sentence of each list as a quick visual check.
first_examples = [
    ("Train Pitched:", train_pitched),
    ("Train Unpitched:", train_unpitched),
    ("Validation Pitched:", val_pitched),
    ("Validation Unpitched:", val_unpitched),
    ("Test Pitched:", test_pitched),
    ("Test Unpitched:", test_unpitched),
]
for label, split_sentences in first_examples:
    print(label, split_sentences[0])

Train Pitched: नू ष्टु॒त इ॑न्द्र॒ नू गृ॑णा॒न इषं॑ जरि॒त्रे न॒द्यो॒३॒॑ न पी॑पेः ।
Train Unpitched: नू ष्टुत इन्द्र नू गृणान इषं जरित्रे नद्यो३ न पीपेः ।
Validation Pitched: ए॒भिः सोमे॑भिः सोम॒सुद्भिः॑ सोमपा दा॒नाय॑ शुक्रपूतपाः ॥ ८.०४६.२६
Validation Unpitched: एभिः सोमेभिः सोमसुद्भिः सोमपा दानाय शुक्रपूतपाः ॥ ८.०४६.२६
Test Pitched: श्व॒घ्नीव॒ यो जि॑गी॒वाँल्ल॒क्षमाद॑द॒र्यः पु॒ष्टानि॒ स ज॑नास॒ इन्द्रः॑ ॥ २.०१२.०४
Test Unpitched: श्वघ्नीव यो जिगीवाँल्लक्षमाददर्यः पुष्टानि स जनास इन्द्रः ॥ २.०१२.०४


In [3]:
# Preview the first five training pairs: accent-free line first, accented line
# second, followed by a divider.
print("Plain Text", "\nPitched Text\n")
divider = "-" * 20
for idx in range(5):
    print(train_unpitched[idx], train_pitched[idx], divider, sep="\n")

Plain Text 
Pitched Text

नू ष्टुत इन्द्र नू गृणान इषं जरित्रे नद्यो३ न पीपेः ।
नू ष्टु॒त इ॑न्द्र॒ नू गृ॑णा॒न इषं॑ जरि॒त्रे न॒द्यो॒३॒॑ न पी॑पेः ।
--------------------
ॐ उपास्मै गायता नरः पवमानायेन्दवे ।
ॐ उपा᳚स्मै गायता नरः॒ पव॑माना॒येन्द॑वे ।
--------------------
तुरीयमिद्रोहितस्य पाकस्थामानं भोजं दातारमब्रवम् ॥ ८.००३.२४
तु॒रीय॒मिद्रोहि॑तस्य॒ पाक॑स्थामानं भो॒जं दा॒तार॑मब्रवम् ॥ ८.००३.२४
--------------------
पुनर्वसू हविषा वर्धयन्ती। प्रियम् देवानामप्येतु पाथः ॥ ५॥
पुन॑र्वसू ह॒विषा॑ व॒र्धय॑न्ती। प्रि॒यम् दे॒वाना॒मप्ये॑तु॒ पाथः॑ ॥ ५॥
--------------------
१.५९१ चुच्यँ इत्येके ।
१.५९१ चुच्यँ इत्येके॑ ।
--------------------


In [4]:
# Position of one sentence picked for a closer look. `sentences` has been
# shuffled, so which line sits at this index depends on the shuffle order.
SAMPLE_INDEX = 30965
print(sentences[SAMPLE_INDEX])

म्रक्ष् । भ्वा० सेट् प० । म्रक्षँ [सङ्घा॒ते] इत्येके॑ १.७५५ ॥


In [5]:
# Run the accent stripper on a one-sentence list.
# x holds the sentence unchanged, y holds it with the accents removed.
sample_case = [sentences[30965]]
x, y = create_pitched_unpitched(sample_case, VEDIC_ACCENTS)

# Accent-free version first, then the original, each with the character count
# of its only sentence.
for variant in (y, x):
    print(variant, len(variant[0]))

['म्रक्ष् । भ्वा० सेट् प० । म्रक्षँ [सङ्घाते] इत्येके १.७५५ ॥'] 59
['म्रक्ष् । भ्वा० सेट् प० । म्रक्षँ [सङ्घा॒ते] इत्येके॑ १.७५५ ॥'] 61


In [6]:
# Difference in length (code points) between the original sentence in x and
# its accent-stripped copy in y.
removed_count = len(x[0]) - len(y[0])
print(f"Above line have {removed_count} vedic matras")

Above line have 2 vedic matras
