In [None]:
!git clone https://github.com/huggingface/transformers.git
!pip install -e /content/transformers
!pip install wandb
!pip install jsonlines
!pip install -r "/content/transformers/examples/pytorch/translation/requirements.txt"
!pip install sacrebleu==1.5.1

In [None]:
# Mount Google Drive at /content/gdrive inside the Colab runtime.
from google.colab import drive
drive.mount('/content/gdrive')

In [None]:
# Authenticate with Weights & Biases through its command-line client.
# NOTE(review): presumably so the training run further down can log to W&B — confirm.
!wandb login

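Restart the runtime at this point so that the packages installed above (in particular the editable `transformers` install) are picked up, then continue from the next cell.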

In [None]:
import gc
import logging
import torch
import jsonlines
import numpy as np
import pandas as pd

## Data collection and separation

In [None]:
!git clone https://github.com/rahular/itihasa

In [None]:
!mkdir data
!cp "/content/itihasa/data/train.en" "/content/data/train.source"
!cp "/content/itihasa/data/train.sn" "/content/data/train.target"

!cp "/content/itihasa/data/dev.en" "/content/data/val.source"
!cp "/content/itihasa/data/dev.sn" "/content/data/val.target"

!cp "/content/itihasa/data/test.en" "/content/data/test.source"
!cp "/content/itihasa/data/test.sn" "/content/data/test.target"

In [None]:
def _read_lines(path):
    """Return every line of the UTF-8 text file at ``path`` as a list of strings.

    Line terminators are kept, exactly as ``file.readlines()`` returns them.
    The file is opened in a ``with`` block so the handle is closed as soon as
    the contents have been read, and the encoding is stated explicitly so the
    Devanagari text does not depend on the platform's default encoding.
    """
    with open(path, encoding="utf-8") as handle:
        return handle.readlines()


# Parallel corpora: element i of a *_src list is paired with element i of the
# matching *_tgt list when the JSON files are built.
tr_src = _read_lines("/content/data/train.source")
tr_tgt = _read_lines("/content/data/train.target")

eval_src = _read_lines("/content/data/val.source")
eval_tgt = _read_lines("/content/data/val.target")

test_src = _read_lines("/content/data/test.source")
test_tgt = _read_lines("/content/data/test.target")

In [None]:
# Sanity check: both sides of the training corpus should have the same number
# of lines, because they are paired positionally when the JSON files are built.
len(tr_src), len(tr_tgt)

In [None]:
def write_translation_jsonl(src_lines, tgt_lines, out_path):
    """Write aligned sentence pairs to ``out_path``, one JSON object per line.

    Every record has the form ``{"translation": {"en": <source>, "sa": <target>}}``.

    Args:
        src_lines: English sentences, one per element.
        tgt_lines: Sanskrit sentences, aligned index-by-index with ``src_lines``.
        out_path: Destination file; it is opened in write mode, so existing
            content is replaced.

    Raises:
        ValueError: if the two sides differ in length. ``zip`` would otherwise
            drop the surplus lines silently and hide a misaligned corpus.
    """
    if len(src_lines) != len(tgt_lines):
        raise ValueError(
            f"source/target length mismatch for {out_path}: "
            f"{len(src_lines)} vs {len(tgt_lines)}"
        )

    # Drop the line terminator kept by readlines() so that it does not become
    # part of the sentence text stored in the JSON record.
    records = [
        {"translation": {"en": src.rstrip("\n"), "sa": tgt.rstrip("\n")}}
        for src, tgt in zip(src_lines, tgt_lines)
    ]

    with jsonlines.open(out_path, 'w') as writer:
        writer.write_all(records)


# One file per split, at the paths passed to run_translation.py further down.
write_translation_jsonl(tr_src, tr_tgt, '/content/data/train_json.json')
write_translation_jsonl(eval_src, eval_tgt, '/content/data/eval_json.json')
write_translation_jsonl(test_src, test_tgt, '/content/data/test_json.json')



The data needs to be in the following files:

- train.source
- train.target
- val.source
- val.target
- test.source
- test.target

The initial checkpoint is a pretrained, hosted model (`Helsinki-NLP/opus-mt-en-hi`).

## English to Sanskrit:

The idea is to **leverage the similarity** between Hindi and Sanskrit in the decoder and to **fine-tune the model** end-to-end using Sanskrit shlokas (verses).

We can do this **without training a new tokenizer**, because the Hindi tokenizer can already tokenize the Devanagari script (lipi) in which Sanskrit is written.

In [None]:
# Run a full garbage-collection pass in the notebook kernel before the
# evaluation/training commands below; the cell displays collect()'s return value.
gc.collect()

In [None]:
#evaluation
!python "/content/transformers/examples/pytorch/translation/run_translation.py" \
    --model_name_or_path Helsinki-NLP/opus-mt-en-hi \
    --num_train_epochs 5 \
    --source_lang en \
    --target_lang sa \
    --max_source_length 128 \
    --max_target_length 128 \
    --train_file "/content/data/train_json.json" \
    --validation_file "/content/data/eval_json.json" \
    --test_file "/content/data/test_json.json" \
    --output_dir "./chkpt" \
    --per_device_train_batch_size=64 \
    --per_device_eval_batch_size=64 \
    --overwrite_output_dir \
    --predict_with_generate \
    --do_eval \
    --evaluation_strategy epoch \
    --seed 108 \
    --metric_for_best_model bleu \

In [None]:
#Training
!python "/content/transformers/examples/pytorch/translation/run_translation.py" \
    --model_name_or_path Helsinki-NLP/opus-mt-en-hi \
    --do_train \
    --num_train_epochs 5 \
    --source_lang en \
    --target_lang sa \
    --max_source_length 128 \
    --max_target_length 128 \
    --train_file "/content/data/train_json.json" \
    --validation_file "/content/data/eval_json.json" \
    --test_file "/content/data/test_json.json" \
    --output_dir "./chkpt" \
    --per_device_train_batch_size=32 \
    --per_device_eval_batch_size=32 \
    --overwrite_output_dir \
    --predict_with_generate \
    --do_eval \
    --evaluation_strategy epoch \
    --seed 108 \
    --metric_for_best_model bleu \

In [None]:
# Load the fine-tuned model and its tokenizer back from disk.
from transformers import MarianMTModel, MarianTokenizer
from typing import List

src = 'en'  # source language
trg = 'sa'  # target language
# Directory the training run writes to through its --output_dir option.
# A plain string literal is enough: there is nothing to interpolate.
model_name = '/content/chkpt'

# Both objects are restored from the same directory.
model = MarianMTModel.from_pretrained(model_name)
tokenizer = MarianTokenizer.from_pretrained(model_name)

In [None]:
# Peek at the first five test sentences: sources first, then their targets.
test_src[:5], test_tgt[:5]

In [None]:
# Smoke test: translate one English sentence with the model loaded from the checkpoint.
sample_text = "A beautiful day it is."

# Encode as a batch of one and request PyTorch tensors ("pt").
batch = tokenizer([sample_text], return_tensors="pt")

# Generate output token ids, then turn them back into text without special tokens.
gen = model.generate(**batch)
decoded = tokenizer.batch_decode(gen, skip_special_tokens=True)

# Bare final expression so the notebook displays the decoded list.
decoded