## Load the Dataset

In [1]:
from datasets import load_dataset
dataset = load_dataset('csv', data_files='/home/j/Documents/MLotsawa/transliteration/data/pairs.csv', column_names=['bo', 'phon'])

In [2]:
dataset = dataset['train'].train_test_split(.1)

## Load the Checkpoint Tokenizer, Model, and Data Collator

In [3]:
from transformers import AutoTokenizer, DataCollatorForSeq2Seq, AutoModelForSeq2SeqLM

checkpoint = "google-t5/t5-small"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModelForSeq2SeqLM.from_pretrained(checkpoint, device_map="auto")
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=checkpoint)

2024-09-08 20:55:41.756715: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-09-08 20:55:41.857676: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:485] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-09-08 20:55:41.900066: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:8454] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-09-08 20:55:41.911502: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1452] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-09-08 20:55:41.985407: I tensorflow/core/platform/cpu_feature_guar

## Add Tibetan to Tokenizer

In [4]:
# Tibetan characters to add
tibetan_chars = [
    # Consonants
    "ཀ", "ཁ", "ག", "ང", "ཅ", "ཆ", "ཇ", "ཉ", "ཏ", "ཐ", "ད", "ན", "པ", "པ", "ཕ", "བ", "མ",
    "ཙ", "ཚ", "ཛ", "ཝ", "ཞ", "ཟ", "འ", "ཡ", "ར", "ལ", "ཤ", "ཥ", "ས", "ཧ", "ཨ",

    # Subjoined Consonants
    "ྐ", "ྑ", "ྒ", "ྒྷ", "ྔ", "ྕ", "ྖ", "ྗ", "྘", "ྙ", "ྚ", "ྛ", "ྜ", "ྜྷ", "ྞ", "ྟ",
    "ྠ", "ྡ", "ྡྷ", "ྣ", "ྤ", "ྥ", "ྦ", "ྦྷ", "ྨ", "ྩ", "ྪ", "ྫ", "ྫྷ", "ྭ", "ྮ", "ྯ",
    "ྰ", "ྱ", "ྲ", "ླ", "ྴ", "ྵ", "ྶ", "ྷ", "ྸ", "ྐྵ", "ྺ", "ྻ", "ྼ", "྽", "྾", "྿",

    # Vowels
    "ི", "ཱི", "ུ", "ཱུ", "ྲྀ", "ཷ", "ླྀ", "ཹ", "ེ", "ཻ", "ོ", "ཽ", "ཾ", "ཿ",

    # Other Marks and Symbols
    "འ", "ཡ", "ར", "ལ", "ཤ", "ཥ", "ས", "ཧ", "ཨ",

    # Additional Tibetan Characters
    "ཀྵ", "ཁྵ", "གྵ", "ངྵ", "ཅྵ", "ཆྵ", "ཇྵ", "ཉྵ", "ཏྵ", "ཐྵ", "དྵ", "ནྵ", "པྵ", 
    "པྵ", "ཕྵ", "བྵ", "མྵ", "ཙྵ", "ཚྵ", "ཛྵ", "ཝྵ", "ཞྵ", "ཟྵ", "འྵ", "ཡྵ", "རྵ", 
    "ལྵ", "ཤྵ", "ཥྵ", "སྵ", "ཧྵ", "ཨྵ", "པྪ", "པྫ", "པྫྷ", "པྭ", "པྮ", "པྯ", "པྰ", 
    "པྱ", "པྲ", "པླ", "པྴ", "པྵ", "པྶ", "པྷ", "པྸ", "པྐྵ", "པྺ", "པྻ", "པྼ", "པ྽", 
    "པ྾", "པ྿"
]


#'ཀཁགངཅཆཇཉཏཐདནཔཕབམཙཚཛཝཞཟའཡརལཤཥསཧཨ'

# Add the Tibetan characters to the tokenizer's vocabulary
new_tokens = [char for char in tibetan_chars if char not in tokenizer.get_vocab()]

# Add new tokens to the tokenizer
tokenizer.add_tokens(new_tokens)

# Resize model embeddings to accommodate the new vocabulary size
model.resize_token_embeddings(len(tokenizer))

Embedding(32245, 512)

## Preprocess Data

In [5]:
source_lang = 'bo'
target_lang = 'phon'

def preprocess_function(examples):

    inputs = [example for example in examples[source_lang]]
    targets = [example for example in examples[target_lang]]
    
    model_inputs = tokenizer(inputs, text_target=targets, max_length=256, truncation=True, padding="max_length")

    return model_inputs


In [6]:
tokenized_dataset = dataset.map(preprocess_function, batched=True)

Map:   0%|          | 0/88737 [00:00<?, ? examples/s]

Map:   0%|          | 0/9860 [00:00<?, ? examples/s]

## Train the Model

In [7]:
from accelerate import Accelerator
from transformers import Seq2SeqTrainingArguments, Seq2SeqTrainer, Adafactor

optimizer = Adafactor(
    model.parameters(), 
    scale_parameter=True, 
    relative_step=False, 
    warmup_init=False, 
    lr=3e-4
)

accelerator = Accelerator()
model, optimizer = accelerator.prepare(model, optimizer)

training_args = Seq2SeqTrainingArguments(
    output_dir=f"../../models/tib-tokenized",
    auto_find_batch_size=True,
    predict_with_generate=True,
    fp16=False,
    push_to_hub=False,
    eval_strategy='epoch',
    save_strategy='epoch',
    load_best_model_at_end=True,
    num_train_epochs=5
)

trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset['train'],
    eval_dataset=tokenized_dataset['test'],
    tokenizer=tokenizer,
    optimizers=(optimizer, None),
    data_collator=data_collator
)

trainer.train()

Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33mbillingsmoore[0m. Use [1m`wandb login --relogin`[0m to force relogin


VBox(children=(Label(value='Waiting for wandb.init()...\r'), FloatProgress(value=0.011112109133333329, max=1.0…

  0%|          | 0/55465 [00:00<?, ?it/s]

{'loss': 0.4358, 'grad_norm': 0.4028961658477783, 'learning_rate': 0.0002972955918146579, 'epoch': 0.05}
{'loss': 0.2264, 'grad_norm': 0.1836295872926712, 'learning_rate': 0.00029459118362931577, 'epoch': 0.09}
{'loss': 0.2168, 'grad_norm': 0.21932683885097504, 'learning_rate': 0.0002918867754439736, 'epoch': 0.14}
{'loss': 0.1795, 'grad_norm': 0.3370167315006256, 'learning_rate': 0.00028918236725863157, 'epoch': 0.18}
{'loss': 0.1289, 'grad_norm': 0.6756218671798706, 'learning_rate': 0.0002864779590732894, 'epoch': 0.23}
{'loss': 0.0865, 'grad_norm': 0.4279094636440277, 'learning_rate': 0.00028377355088794737, 'epoch': 0.27}
{'loss': 0.0738, 'grad_norm': 0.26090720295906067, 'learning_rate': 0.0002810691427026052, 'epoch': 0.32}
{'loss': 0.0627, 'grad_norm': 0.3491724133491516, 'learning_rate': 0.0002783647345172631, 'epoch': 0.36}
{'loss': 0.0565, 'grad_norm': 0.2388933300971985, 'learning_rate': 0.000275660326331921, 'epoch': 0.41}
{'loss': 0.0462, 'grad_norm': 0.22108301520347595, 

  0%|          | 0/1233 [00:00<?, ?it/s]

{'eval_loss': 0.01764485239982605, 'eval_runtime': 115.2813, 'eval_samples_per_second': 85.53, 'eval_steps_per_second': 10.696, 'epoch': 1.0}
{'loss': 0.0244, 'grad_norm': 0.07438911497592926, 'learning_rate': 0.0002377986117371315, 'epoch': 1.04}
{'loss': 0.026, 'grad_norm': 0.10485607385635376, 'learning_rate': 0.0002350942035517894, 'epoch': 1.08}
{'loss': 0.0225, 'grad_norm': 0.14731422066688538, 'learning_rate': 0.0002323897953664473, 'epoch': 1.13}
{'loss': 0.0214, 'grad_norm': 0.29146984219551086, 'learning_rate': 0.00022968538718110516, 'epoch': 1.17}
{'loss': 0.0221, 'grad_norm': 0.2635507881641388, 'learning_rate': 0.0002269809789957631, 'epoch': 1.22}
{'loss': 0.0212, 'grad_norm': 0.27857252955436707, 'learning_rate': 0.00022427657081042096, 'epoch': 1.26}
{'loss': 0.0214, 'grad_norm': 0.08655886352062225, 'learning_rate': 0.00022157216262507886, 'epoch': 1.31}
{'loss': 0.0215, 'grad_norm': 0.0826745331287384, 'learning_rate': 0.00021886775443973676, 'epoch': 1.35}
{'loss': 

  0%|          | 0/1233 [00:00<?, ?it/s]

{'eval_loss': 0.012265005148947239, 'eval_runtime': 115.2732, 'eval_samples_per_second': 85.536, 'eval_steps_per_second': 10.696, 'epoch': 2.0}
{'loss': 0.016, 'grad_norm': 0.1283872425556183, 'learning_rate': 0.00017830163165960511, 'epoch': 2.03}
{'loss': 0.0152, 'grad_norm': 0.12045174837112427, 'learning_rate': 0.00017559722347426304, 'epoch': 2.07}
{'loss': 0.0153, 'grad_norm': 0.06561446934938431, 'learning_rate': 0.0001728928152889209, 'epoch': 2.12}
{'loss': 0.0156, 'grad_norm': 0.11308375000953674, 'learning_rate': 0.00017018840710357884, 'epoch': 2.16}
{'loss': 0.0136, 'grad_norm': 0.12685871124267578, 'learning_rate': 0.0001674839989182367, 'epoch': 2.21}
{'loss': 0.0153, 'grad_norm': 0.10953421890735626, 'learning_rate': 0.0001647795907328946, 'epoch': 2.25}
{'loss': 0.0144, 'grad_norm': 0.06467598676681519, 'learning_rate': 0.00016207518254755248, 'epoch': 2.3}
{'loss': 0.0145, 'grad_norm': 0.07096302509307861, 'learning_rate': 0.00015937077436221038, 'epoch': 2.34}
{'loss

  0%|          | 0/1233 [00:00<?, ?it/s]

{'eval_loss': 0.01027483120560646, 'eval_runtime': 115.4463, 'eval_samples_per_second': 85.408, 'eval_steps_per_second': 10.68, 'epoch': 3.0}
{'loss': 0.0142, 'grad_norm': 0.06376458704471588, 'learning_rate': 0.00011880465158207878, 'epoch': 3.02}
{'loss': 0.0131, 'grad_norm': 0.10283981263637543, 'learning_rate': 0.00011610024339673667, 'epoch': 3.06}
{'loss': 0.013, 'grad_norm': 0.08032650500535965, 'learning_rate': 0.00011339583521139457, 'epoch': 3.11}
{'loss': 0.0113, 'grad_norm': 0.09435302764177322, 'learning_rate': 0.00011069142702605245, 'epoch': 3.16}
{'loss': 0.012, 'grad_norm': 0.0506606251001358, 'learning_rate': 0.00010798701884071034, 'epoch': 3.2}
{'loss': 0.0127, 'grad_norm': 0.08407637476921082, 'learning_rate': 0.00010528261065536824, 'epoch': 3.25}
{'loss': 0.0122, 'grad_norm': 0.09842335432767868, 'learning_rate': 0.00010257820247002613, 'epoch': 3.29}
{'loss': 0.0132, 'grad_norm': 0.12308220565319061, 'learning_rate': 9.987379428468403e-05, 'epoch': 3.34}
{'loss'

  0%|          | 0/1233 [00:00<?, ?it/s]

{'eval_loss': 0.00918691698461771, 'eval_runtime': 115.2962, 'eval_samples_per_second': 85.519, 'eval_steps_per_second': 10.694, 'epoch': 4.0}
{'loss': 0.011, 'grad_norm': 0.06079893186688423, 'learning_rate': 5.9307671504552416e-05, 'epoch': 4.01}
{'loss': 0.0125, 'grad_norm': 0.07640495151281357, 'learning_rate': 5.66032633192103e-05, 'epoch': 4.06}
{'loss': 0.0123, 'grad_norm': 0.09477221220731735, 'learning_rate': 5.38988551338682e-05, 'epoch': 4.1}
{'loss': 0.0101, 'grad_norm': 0.31089723110198975, 'learning_rate': 5.119444694852609e-05, 'epoch': 4.15}
{'loss': 0.0101, 'grad_norm': 0.07270392030477524, 'learning_rate': 4.8490038763183986e-05, 'epoch': 4.19}
{'loss': 0.0108, 'grad_norm': 0.09101097285747528, 'learning_rate': 4.578563057784188e-05, 'epoch': 4.24}
{'loss': 0.0119, 'grad_norm': 0.13944658637046814, 'learning_rate': 4.308122239249977e-05, 'epoch': 4.28}
{'loss': 0.012, 'grad_norm': 0.068307064473629, 'learning_rate': 4.037681420715766e-05, 'epoch': 4.33}
{'loss': 0.012

  0%|          | 0/1233 [00:00<?, ?it/s]

{'eval_loss': 0.008867141790688038, 'eval_runtime': 115.2943, 'eval_samples_per_second': 85.52, 'eval_steps_per_second': 10.694, 'epoch': 5.0}


There were missing keys in the checkpoint model loaded: ['encoder.embed_tokens.weight', 'decoder.embed_tokens.weight', 'lm_head.weight'].


{'train_runtime': 18058.2619, 'train_samples_per_second': 24.57, 'train_steps_per_second': 3.071, 'train_loss': 0.028914398460556687, 'epoch': 5.0}


TrainOutput(global_step=55465, training_loss=0.028914398460556687, metrics={'train_runtime': 18058.2619, 'train_samples_per_second': 24.57, 'train_steps_per_second': 3.071, 'total_flos': 3.002456359305216e+16, 'train_loss': 0.028914398460556687, 'epoch': 5.0})