In [None]:
import os
import unicodedata
import numpy as np
import pandas as pd
import torch
import random
from tqdm.auto import tqdm
from simpletransformers.seq2seq import Seq2SeqModel, Seq2SeqArgs
from transformers import AutoTokenizer
import Levenshtein
import sacrebleu

  from .autonotebook import tqdm as notebook_tqdm


## Dataset processing

In [3]:
data_file = "Datasets/filtered_sanskritdoc.txt"
train_file = "Datasets/train.txt"
val_file = "Datasets/val.txt"
test_file = "Datasets/test.txt"

In [4]:
RANDOM_SEED = 42
MODEL_NAME = "facebook/mbart-large-50-many-to-many-mmt"
OUTPUT_DIR = "outputs_pitch_restore"
MBART_LANG = "hi_IN"

In [5]:
VEDIC_ACCENTS_EXPLICIT = ['\u0951', '\u0952', '\u0953', '\u0954', '\u1CDA']
COMBINING_MARK_RANGES = [
    (0x0951, 0x0954),
    (0x1CD0, 0x1CFF),
    # (0xA8E0, 0xA8FF),  # enable if needed
]

In [6]:
random.seed(RANDOM_SEED)
np.random.seed(RANDOM_SEED)
torch.manual_seed(RANDOM_SEED)
if torch.cuda.is_available():
    torch.cuda.manual_seed_all(RANDOM_SEED)


In [7]:
def load_sentences(path):
    with open(path, "r", encoding="utf-8") as f:
        return [line.strip() for line in f if line.strip()]
    
train_sentences = load_sentences(train_file)
val_sentences = load_sentences(val_file)
test_sentences = load_sentences(test_file)

In [8]:
print(len(train_sentences))
print(len(val_sentences))
print(len(test_sentences))

23004
4929
4931


In [9]:


def strip_combining_ranges(text, ranges):
    text = unicodedata.normalize("NFC", text)
    res = []
    for ch in text:
        code = ord(ch)
        if any(lo <= code <= hi for lo, hi in ranges):
            continue
        res.append(ch)
    out = "".join(res)
    for mark in VEDIC_ACCENTS_EXPLICIT:
        out = out.replace(mark, "")
    return " ".join(out.split())

def make_dataframe(sentences):
    return pd.DataFrame({
        "input_text": [strip_combining_ranges(s, COMBINING_MARK_RANGES) for s in sentences],
        "target_text": [unicodedata.normalize("NFC", s) for s in sentences]
    })


In [10]:
df_train = make_dataframe(train_sentences)
df_val = make_dataframe(val_sentences)
df_test = make_dataframe(test_sentences)

In [11]:
print(df_train.shape)
print(df_val.shape)
print(df_test.shape)

(23004, 2)
(4929, 2)
(4931, 2)


In [12]:
df_test.head()

Unnamed: 0,input_text,target_text
0,कीलालपे सोमपृष्ठाय वेधसे हृदा मतिं जनये चारुमग...,की॒ला॒ल॒पे सोम॑पृष्ठाय वे॒धसे॑ हृ॒दा म॒तिं ज॑न...
1,नमो ह्रदय्याय च नमः । निवेष्प्याय च नमः ।,नमो᳚ ह्रद॒य्या॑य च॒ नमः॑ । नि॒वे॒ष्प्या॑य च॒ न...
2,इन्द्रं मित्रं वरुणमग्निमाहुरथो दिव्यः स सुपर्...,इन्द्रं॑ मि॒त्रं वरु॑णम॒ग्निमा॑हु॒रथो॑ दि॒व्यः...
3,क्रतुर्भवत्युक्थ्यः ॥ १.०१७.०५,क्रतु॑र्भवत्यु॒क्थ्यः॑ ॥ १.०१७.०५
4,ये ते त्रिरहन्सवितः सवासो दिवेदिवे सौभगमासुवन्...,ये ते॒ त्रिरह॑न्सवितः स॒वासो॑ दि॒वेदि॑वे॒ सौभ॑...


In [13]:
n_identical = (df_val["input_text"] == df_val["target_text"]).sum()
print(f"\nIdentical lines (plain==pitched): {n_identical}")


Identical lines (plain==pitched): 0


## Model

### Tokenizer

In [14]:
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
tokenizer.add_tokens(VEDIC_ACCENTS_EXPLICIT, special_tokens=False)              # one token added
print("Added accent tokens to tokenizer. New vocab size:", len(tokenizer))



Added accent tokens to tokenizer. New vocab size: 250055


In [15]:
if hasattr(tokenizer, "lang_code_to_id") and MBART_LANG in tokenizer.lang_code_to_id:
    tokenizer.src_lang = MBART_LANG
    tokenizer.tgt_lang = MBART_LANG
else:
    # Fallback: still proceed, but warn once.
    print(f"[WARN] MBART language code '{MBART_LANG}' not found in tokenizer.lang_code_to_id. "
          f"Proceeding without forced language BOS; decoding quality may degrade.")

### Model arguments

In [16]:
model_args = Seq2SeqArgs()
model_args.num_train_epochs = 1
model_args.train_batch_size = 8
model_args.eval_batch_size = 8
model_args.max_sequence_length = 256
model_args.max_length = 256
model_args.evaluate_generated_text = True
model_args.evaluate_during_training = True
model_args.use_multiprocessing = False
model_args.overwrite_output_dir = True
model_args.output_dir = OUTPUT_DIR
model_args.best_model_dir = os.path.join(OUTPUT_DIR, "best_model")
model_args.fp16 = torch.cuda.is_available()
model_args.save_eval_checkpoints = True
model_args.save_model_every_epoch = False
model_args.evaluate_during_training_steps = 1000
model_args.logging_steps = 200
model_args.save_steps = 1000
model_args.learning_rate = 5e-5
model_args.gradient_accumulation_steps = 4
model_args.use_multiprocessing_for_evaluation = False
model_args.num_beams = 5
model_args.length_penalty = 1.0
model_args.early_stopping = True                 
model_args.early_stopping_metric = "eval_loss"  
model_args.early_stopping_metric_minimize = True
model_args.early_stopping_patience = 3
model_args.max_grad_norm = 1.0                   
model_args.reprocess_input_data = True          
model_args.save_best_model = True

In [17]:
use_cuda = torch.cuda.is_available()

model = Seq2SeqModel(
    encoder_decoder_type = "mbart",
    encoder_decoder_name = MODEL_NAME,
    tokenizer = tokenizer,
    args = model_args,
    use_cuda = use_cuda
)

The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'MBart50Tokenizer'. 
The class this function is called from is 'MBartTokenizer'.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'MBart50Tokenizer'. 
The class this function is called from is 'MBartTokenizerFast'.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [18]:
model.model.resize_token_embeddings(len(tokenizer))

Embedding(250055, 1024)

In [19]:
if hasattr(tokenizer, "lang_code_to_id") and MBART_LANG in tokenizer.lang_code_to_id:
    forced_id = tokenizer.lang_code_to_id[MBART_LANG]
    model.model.config.forced_bos_token_id = forced_id

### Training

In [20]:
print(f"\nStarting training with {model_args.num_train_epochs} epochs")
model.train_model(
    train_data=df_train,
    eval_data=df_val
)


Starting training with 1 epochs


`prepare_seq2seq_batch` is deprecated and will be removed in version 5 of HuggingFace Transformers. Use the regular
`__call__` method to prepare your inputs and targets.

Here is a short example:

model_inputs = tokenizer(src_texts, text_target=tgt_texts, ...)

If you either need to use different keyword arguments for the source and target texts, you should do two calls like
this:

model_inputs = tokenizer(src_texts, ...)
labels = tokenizer(text_target=tgt_texts, ...)
model_inputs["labels"] = labels["input_ids"]

See the documentation of your specific tokenizer for more details on the specific arguments to the tokenizer of choice.
For a more complete example, see the implementation of `prepare_seq2seq_batch`.

100%|██████████| 23004/23004 [00:10<00:00, 2117.34it/s]
  scaler = amp.GradScaler()
  with amp.autocast():
Epochs 1/1. Running Loss:    0.2560: 100%|██████████| 2876/2876 [08:08<00:00,  5.89it/s]
100%|██████████| 4929/4929 [00:02<00:00, 2176.47it/s]
  with amp.autocast():
Generat

(719,
 {'global_step': [719],
  'eval_loss': [0.31622945199716035],
  'train_loss': [0.2559961676597595]})

#### For 1 epoch training 

    (719,

        {'global_step': [719],

        'eval_loss': [0.31651537043059086],
    
        'train_loss': [0.36946552991867065]})

    Simple eval results {'eval_loss': 0.30853030095224054}

In [21]:
raw_results = model.eval_model(df_test, verbose=True)
print("Simple eval results", raw_results)

  0%|          | 0/4931 [00:00<?, ?it/s]

100%|██████████| 4931/4931 [00:02<00:00, 2098.33it/s]
Running Evaluation: 100%|██████████| 617/617 [00:29<00:00, 21.02it/s]
Generating outputs: 100%|██████████| 617/617 [07:11<00:00,  1.43it/s]

Simple eval results {'eval_loss': 0.30947082855616437}





## Metrics

In [None]:
# ------------------------
# 9) Custom metrics
# ------------------------
# def char_accuracy(pred, ref):
#     L = max(len(pred), len(ref))
#     pred_p, ref_p = pred.ljust(L), ref.ljust(L)
#     return sum(1 for a, b in zip(pred_p, ref_p) if a == b) / L

# def word_accuracy(pred, ref):
#     p_tokens, r_tokens = pred.split(), ref.split()
#     minlen = min(len(p_tokens), len(r_tokens))
#     matches = sum(1 for i in range(minlen) if p_tokens[i] == r_tokens[i])
#     return matches / max(len(r_tokens), 1)

def exact_match(pred, ref):
    return int(pred.strip() == ref.strip())

def pitch_positions(s, pitch_tokens=None):
    if pitch_tokens is None:
        # Define as any char in the configured ranges or explicit list
        pts = set(VEDIC_ACCENTS_EXPLICIT)
        def is_pitch_char(ch):
            c = ord(ch)
            return (ch in pts) or any(lo <= c <= hi for lo, hi in COMBINING_MARK_RANGES)
    else:
        pts = set(pitch_tokens)
        def is_pitch_char(ch):
            return ch in pts
    return {(i, c) for i, c in enumerate(s) if is_pitch_char(c)}

# def pitch_accuracy(pred, ref, pitch_tokens=None):
#     pred_pos = pitch_positions(pred, pitch_tokens)
#     ref_pos  = pitch_positions(ref, pitch_tokens)
#     matches = len(pred_pos & ref_pos)
#     return matches / max(len(ref_pos), 1)

def pitch_f1(pred, ref, pitch_tokens=None):
    pred_set = pitch_positions(pred, pitch_tokens)
    ref_set  = pitch_positions(ref, pitch_tokens)
    tp = len(pred_set & ref_set)
    fp = len(pred_set - ref_set)
    fn = len(ref_set - pred_set)
    precision = tp / (tp + fp) if tp + fp > 0 else 0
    recall    = tp / (tp + fn) if tp + fn > 0 else 0
    f1        = 2 * precision * recall / (precision + recall) if precision + recall > 0 else 0
    return f1

def extract_pitch_string(s):
    """Return a string containing only the pitch marks from s."""
    return "".join(
        ch for i, ch in enumerate(s)
        if (ch in VEDIC_ACCENTS_EXPLICIT) or any(lo <= ord(ch) <= hi for lo, hi in COMBINING_MARK_RANGES)
    )

def pitch_edit_distance(pred, ref):
    """Compute raw and normalized edit distance for pitch accents only."""
    pred_pitch = extract_pitch_string(pred)
    ref_pitch  = extract_pitch_string(ref)
    dist = Levenshtein.distance(pred_pitch, ref_pitch)
    norm = dist / max(len(ref_pitch), 1)
    return dist, norm


In [None]:
# ------------------------
# 10) Predictions & Metrics
# ------------------------



batch_inputs = df_test["input_text"].tolist()

# Keep decoding params explicit
model.args.max_length = 256
model.args.num_beams = 5

preds = model.predict(batch_inputs)

exacts = []
pitch_f1s = []
pitch_edit_raw, pitch_edit_norm = [], []

refs = df_test["target_text"].tolist()
hyps = preds

for pred, ref in zip(hyps, refs):
    exacts.append(exact_match(pred, ref))
    pitch_f1s.append(pitch_f1(pred, ref))    
    d_raw, d_norm = pitch_edit_distance(pred, ref)
    pitch_edit_raw.append(d_raw)
    pitch_edit_norm.append(d_norm)


bleu_score = sacrebleu.corpus_bleu(hyps, [refs]).score if hyps else 0.0

print("\nEvaluation Results")
print("=" * 50)
print(f"Exact Match Rate:             {float(np.mean(exacts)) * 100:.2f}%")
print(f"Pitch F1 Score (mean):        {float(np.mean(pitch_f1s)) * 100:.2f}%")
print(f"Pitch Edit Distance (norm):   {float(np.mean(pitch_edit_norm)) * 100:.2f}%")
print(f"Corpus BLEU:                  {float(bleu_score):.2f}")

Detailed metrics: {'exact_match_rate': 0.08720340701683228, 'pitch_f1_mean': 0.33154141694368316, 'avg_pitch_edit_distance_norm': 0.34143245442304815, 'corpus_BLEU': 49.95357356445782}



Evaluation Results
Exact Match Rate:             8.72%
Pitch F1 Score (mean):        33.15%
Pitch Edit Distance (norm):   34.14%
Corpus BLEU:                  49.95


In [38]:
exact_match_rate = float(np.mean(exacts)) * 100
pitch_f1_mean = float(np.mean(pitch_f1s)) * 100
pitch_edit_norm = float(np.mean(pitch_edit_norm)) * 100
bleu = float(bleu_score)

print("\nEvaluation Results")
print("=" * 50)
print(f"The exact matching sentences are {exact_match_rate:.2f}% "
      f"(model output fully correct without any error).")

print(f"The average Pitch F1 score is {pitch_f1_mean:.2f}%, "
      f"which balances how many pitch accents were predicted correctly "
      f"and how many were missed/over-predicted.")

print(f"The average normalized pitch edit distance is {pitch_edit_norm:.2f}%, "
      f"meaning on average this percentage of pitch accents would need correction.")

print(f"The corpus BLEU score is {bleu:.2f}, "
      f"showing overall sequence similarity with the reference text.")



Evaluation Results
The exact matching sentences are 8.72% (model output fully correct without any error).
The average Pitch F1 score is 33.15%, which balances how many pitch accents were predicted correctly and how many were missed/over-predicted.
The average normalized pitch edit distance is 34.14%, meaning on average this percentage of pitch accents would need correction.
The corpus BLEU score is 49.95, showing overall sequence similarity with the reference text.


1 epoch Detailed metrics: 

'char_accuracy_mean': 0.4939109582960301, 
                    
'word_accuracy_mean': 0.5942787050558115, 
                    
'exact_match_rate': 0.08720340701683228, 
                    
'avg_levenshtein': 5.867572500506997, 
                    
'pitch_accuracy_mean': 0.3327242539902244, 
                    
'pitch_f1_mean': 0.33154141694368316, 
                    
'corpus_BLEU': 49.95357356445782

In [None]:
print("\nQualitative examples:")
for i in range(min(10, len(batch_inputs))):
    print("---")
    print("INPUT :", batch_inputs[i])
    print("TARGET:", refs[i])
    print("PRED  :", hyps[i])
    print()



Qualitative examples:
---
INPUT : कीलालपे सोमपृष्ठाय वेधसे हृदा मतिं जनये चारुमग्नये ॥ १०.०९१.१४
TARGET: की॒ला॒ल॒पे सोम॑पृष्ठाय वे॒धसे॑ हृ॒दा म॒तिं ज॑नये॒ चारु॑म॒ग्नये॑ ॥ १०.०९१.१४
PRED  : की॒ला॒ल॒पे सोम॑पृष्ठाय वे॒धसे॑ हृ॒दा म॒तिं जन॑ये॒ चारु॒मग्न॑ये ॥ १०.०९१.१४

---
INPUT : नमो ह्रदय्याय च नमः । निवेष्प्याय च नमः ।
TARGET: नमो᳚ ह्रद॒य्या॑य च॒ नमः॑ । नि॒वे॒ष्प्या॑य च॒ नमः॑ ।
PRED  : नमो॑ ह्रदय्या॒य च॒ नमः॑ । निवे॒ष्प्या॑य च॒ नमः॑ ।

---
INPUT : इन्द्रं मित्रं वरुणमग्निमाहुरथो दिव्यः स सुपर्णो गरुत्मान् ।
TARGET: इन्द्रं॑ मि॒त्रं वरु॑णम॒ग्निमा॑हु॒रथो॑ दि॒व्यः स सु॑प॒र्णो ग॒रुत्मा॑न् ।
PRED  : इन्द्रं॑ मि॒त्रं वरु॑णम॒ग्निमा॑हुरथो दि॒व्यः स सु॒पर्णो॑ गरु॒त्मान् ।

---
INPUT : क्रतुर्भवत्युक्थ्यः ॥ १.०१७.०५
TARGET: क्रतु॑र्भवत्यु॒क्थ्यः॑ ॥ १.०१७.०५
PRED  : क्रतु॒र्भव॒त्युक्थ्यः॑ ॥ १.०१७.०५

---
INPUT : ये ते त्रिरहन्सवितः सवासो दिवेदिवे सौभगमासुवन्ति ।
TARGET: ये ते॒ त्रिरह॑न्सवितः स॒वासो॑ दि॒वेदि॑वे॒ सौभ॑गमासु॒वन्ति॑ ।
PRED  : ये ते॑ त्रि॒रहन्स॒वितः॑ सवा॒सो दि॒वेदि॑वे सौभग॒मासु॑वन्ति ।



1. pitch_f1_mean
This comes from precision and recall applied to pitch accent symbols.
Precision = of all the accents the model predicted, how many were correct?
Recall = of all the accents in the reference, how many did the model recover?
F1 score = harmonic mean of precision and recall (balances the two).
👉 Formula: 2*P*R/(P+R)


2. avg_pitch_edit_distance_norm
This uses edit distance but only on pitch marks.
Edit distance = minimum number of edits (insert, delete, substitute) needed to transform prediction → reference.
👉 Then we normalize it by the number of reference accents:

char_accuracy → character-level correctness
word_accuracy → word-level correctness
exact_match → whole-sentence correctness
pitch_accuracy → correctness of accent placement (recall-oriented)
pitch_f1 → precision/recall balance for accents
