## Loading the Data

DO NOT FORGET TO SEPARATE A TEST SET!!!

In [1]:
from datasets import load_dataset
dataset = load_dataset('csv', data_files='/home/j/Documents/Projects/MLotsawa/notebooks/data collection and cleaning/train-pairs.csv')

In [2]:
dataset = dataset['train'].train_test_split(.15)

In [3]:
dataset['train'][0]

{'Unnamed: 0': 55216,
 'tibetan': 'གཉེན་བཤེས་འཁྲུལ་པའི་གྲོགས་ལ་རྟག་པར་འཛིན༔',
 ' phonetic': 'nyen shé trulpé drok la takpar dzin',
 ' english': 'How utterly mistaken are the minds of ignorant beings!'}

## Load Unfinetuned Tokenizer, Model, and Data Collator

In [4]:
from transformers import AutoTokenizer, DataCollatorForSeq2Seq, AutoModelForSeq2SeqLM

checkpoint = "billingsmoore/phonetic-tibetan-to-english-translation"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModelForSeq2SeqLM.from_pretrained(checkpoint, device_map="cuda:0")
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=checkpoint)

2024-09-25 20:45:47.721544: E tensorflow/compiler/xla/stream_executor/cuda/cuda_dnn.cc:9342] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-09-25 20:45:47.721657: E tensorflow/compiler/xla/stream_executor/cuda/cuda_fft.cc:609] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-09-25 20:45:47.726912: E tensorflow/compiler/xla/stream_executor/cuda/cuda_blas.cc:1518] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-09-25 20:45:48.945916: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


## Add Tibetan to Tokenizer

The T5 tokenizer does not notably support the Tibetan script. So, we need to add it manually. Once the characters have been added to the tokenizer, the model needs to have its token embeddings resized to accomodate the added tokens. This is all pretty straightforward, as seen in the code below.

In [5]:
# Tibetan characters to add
tibetan_chars = [
    # Consonants
    "ཀ", "ཁ", "ག", "ང", "ཅ", "ཆ", "ཇ", "ཉ", "ཏ", "ཐ", "ད", "ན", "པ", "པ", "ཕ", "བ", "མ",
    "ཙ", "ཚ", "ཛ", "ཝ", "ཞ", "ཟ", "འ", "ཡ", "ར", "ལ", "ཤ", "ཥ", "ས", "ཧ", "ཨ",

    # Subjoined Consonants
    "ྐ", "ྑ", "ྒ", "ྒྷ", "ྔ", "ྕ", "ྖ", "ྗ", "྘", "ྙ", "ྚ", "ྛ", "ྜ", "ྜྷ", "ྞ", "ྟ",
    "ྠ", "ྡ", "ྡྷ", "ྣ", "ྤ", "ྥ", "ྦ", "ྦྷ", "ྨ", "ྩ", "ྪ", "ྫ", "ྫྷ", "ྭ", "ྮ", "ྯ",
    "ྰ", "ྱ", "ྲ", "ླ", "ྴ", "ྵ", "ྶ", "ྷ", "ྸ", "ྐྵ", "ྺ", "ྻ", "ྼ", "྽", "྾", "྿",

    # Vowels
    "ི", "ཱི", "ུ", "ཱུ", "ྲྀ", "ཷ", "ླྀ", "ཹ", "ེ", "ཻ", "ོ", "ཽ", "ཾ", "ཿ",

    # Other Marks and Symbols
    "འ", "ཡ", "ར", "ལ", "ཤ", "ཥ", "ས", "ཧ", "ཨ",

    # Additional Tibetan Characters
    "ཀྵ", "ཁྵ", "གྵ", "ངྵ", "ཅྵ", "ཆྵ", "ཇྵ", "ཉྵ", "ཏྵ", "ཐྵ", "དྵ", "ནྵ", "པྵ", 
    "པྵ", "ཕྵ", "བྵ", "མྵ", "ཙྵ", "ཚྵ", "ཛྵ", "ཝྵ", "ཞྵ", "ཟྵ", "འྵ", "ཡྵ", "རྵ", 
    "ལྵ", "ཤྵ", "ཥྵ", "སྵ", "ཧྵ", "ཨྵ", "པྪ", "པྫ", "པྫྷ", "པྭ", "པྮ", "པྯ", "པྰ", 
    "པྱ", "པྲ", "པླ", "པྴ", "པྵ", "པྶ", "པྷ", "པྸ", "པྐྵ", "པྺ", "པྻ", "པྼ", "པ྽", 
    "པ྾", "པ྿"
]


#'ཀཁགངཅཆཇཉཏཐདནཔཕབམཙཚཛཝཞཟའཡརལཤཥསཧཨ'

# Add the Tibetan characters to the tokenizer's vocabulary
new_tokens = [char for char in tibetan_chars if char not in tokenizer.get_vocab()]

# Add new tokens to the tokenizer
tokenizer.add_tokens(new_tokens)

# Resize model embeddings to accommodate the new vocabulary size
model.resize_token_embeddings(len(tokenizer))

Embedding(32245, 1024)

## Preprocess Data

The dataset can now be tokenized for training.

In [6]:
source_lang = 'tibetan'
target_lang = ' english'

def preprocess_function(examples):

    inputs = [example for example in examples[source_lang]]
    targets = [example for example in examples[target_lang]]
    
    model_inputs = tokenizer(inputs, text_target=targets, max_length=256, truncation=True, padding="max_length")

    return model_inputs

In [7]:
tokenized_dataset = dataset.map(preprocess_function, batched=True)

Map:   0%|          | 0/69042 [00:00<?, ? examples/s]

Map:   0%|          | 0/12184 [00:00<?, ? examples/s]

## Train the Model

Finally, we can train the model. Note that the optimizer used is Adafactor. This is the optimizer that is preferred for translation tasks and for the T5 model in general. The transformers api includes a built in version of Adafactor, but I define it separately here so that we can optimize it with the 'accelerate' library.

In [8]:
from transformers import Seq2SeqTrainingArguments, Seq2SeqTrainer, Adafactor

optimizer = Adafactor(
    model.parameters(), 
    scale_parameter=True, 
    relative_step=False, 
    warmup_init=False, 
    lr=2e-5
)

In [9]:
from accelerate import Accelerator

accelerator = Accelerator()

optimizer = Adafactor(
    model.parameters(), 
    scale_parameter=True, 
    relative_step=False, 
    warmup_init=False, 
    lr=3e-4
)

model, optimizer = accelerator.prepare(model, optimizer)

In [10]:
training_args = Seq2SeqTrainingArguments(
    output_dir=f"base-t5-large",
    auto_find_batch_size=True,
    predict_with_generate=True,
    fp16=False, #check this
    push_to_hub=False,
    eval_strategy='epoch',
    save_strategy='epoch',
    load_best_model_at_end=True,
    num_train_epochs=3
)

trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset['train'],
    eval_dataset=tokenized_dataset['train'],
    tokenizer=tokenizer,
    optimizers=(optimizer, None),
    data_collator=data_collator
)

trainer.train()

Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33mbillingsmoore[0m. Use [1m`wandb login --relogin`[0m to force relogin


  0%|          | 0/43155 [00:00<?, ?it/s]

  0%|          | 0/86305 [00:00<?, ?it/s]

{'loss': 0.4436, 'grad_norm': 0.3283935785293579, 'learning_rate': 0.00029826197786918485, 'epoch': 0.03}
{'loss': 0.2099, 'grad_norm': 0.20505794882774353, 'learning_rate': 0.0002965239557383697, 'epoch': 0.06}
{'loss': 0.2003, 'grad_norm': 0.38089054822921753, 'learning_rate': 0.0002947859336075546, 'epoch': 0.09}
{'loss': 0.1975, 'grad_norm': 0.11922363191843033, 'learning_rate': 0.00029304791147673946, 'epoch': 0.12}
{'loss': 0.1885, 'grad_norm': 1.3892505168914795, 'learning_rate': 0.0002913098893459243, 'epoch': 0.14}
{'loss': 0.1848, 'grad_norm': 0.15316489338874817, 'learning_rate': 0.0002895718672151092, 'epoch': 0.17}
{'loss': 0.1746, 'grad_norm': 0.2239162027835846, 'learning_rate': 0.00028783384508429403, 'epoch': 0.2}
{'loss': 0.1702, 'grad_norm': 0.3501626253128052, 'learning_rate': 0.0002860958229534789, 'epoch': 0.23}
{'loss': 0.169, 'grad_norm': 0.21480047702789307, 'learning_rate': 0.00028435780082266377, 'epoch': 0.26}
{'loss': 0.168, 'grad_norm': 0.13402053713798523

  0%|          | 0/8631 [00:00<?, ?it/s]

wandb: ERROR Error while calling W&B API: context deadline exceeded (<Response [500]>)


{'eval_loss': 0.11411003768444061, 'eval_runtime': 2491.9697, 'eval_samples_per_second': 27.706, 'eval_steps_per_second': 3.464, 'epoch': 1.0}
{'loss': 0.1288, 'grad_norm': 0.2582014501094818, 'learning_rate': 0.00023916922542147032, 'epoch': 1.01}
{'loss': 0.12, 'grad_norm': 0.21864376962184906, 'learning_rate': 0.00023743120329065522, 'epoch': 1.04}
{'loss': 0.1226, 'grad_norm': 0.1880509853363037, 'learning_rate': 0.0002356931811598401, 'epoch': 1.07}
{'loss': 0.1257, 'grad_norm': 0.3232046961784363, 'learning_rate': 0.00023395515902902494, 'epoch': 1.1}
{'loss': 0.1205, 'grad_norm': 0.1799364686012268, 'learning_rate': 0.00023221713689820984, 'epoch': 1.13}
{'loss': 0.1202, 'grad_norm': 0.20776763558387756, 'learning_rate': 0.00023047911476739468, 'epoch': 1.16}
{'loss': 0.1185, 'grad_norm': 0.2594086527824402, 'learning_rate': 0.00022874109263657956, 'epoch': 1.19}
{'loss': 0.1223, 'grad_norm': 0.3204634189605713, 'learning_rate': 0.0002270030705057644, 'epoch': 1.22}
{'loss': 0.1

  0%|          | 0/8631 [00:00<?, ?it/s]

{'eval_loss': 0.08213633298873901, 'eval_runtime': 2495.8243, 'eval_samples_per_second': 27.663, 'eval_steps_per_second': 3.458, 'epoch': 2.0}
{'loss': 0.094, 'grad_norm': 0.33351725339889526, 'learning_rate': 0.00017833845084294072, 'epoch': 2.03}
{'loss': 0.0954, 'grad_norm': 0.20615004003047943, 'learning_rate': 0.0001766004287121256, 'epoch': 2.06}
{'loss': 0.0938, 'grad_norm': 0.40948981046676636, 'learning_rate': 0.00017486240658131044, 'epoch': 2.09}
{'loss': 0.0934, 'grad_norm': 0.3150709867477417, 'learning_rate': 0.00017312438445049534, 'epoch': 2.11}
{'loss': 0.0924, 'grad_norm': 0.17849668860435486, 'learning_rate': 0.0001713863623196802, 'epoch': 2.14}
{'loss': 0.0947, 'grad_norm': 0.23893845081329346, 'learning_rate': 0.00016964834018886506, 'epoch': 2.17}
{'loss': 0.0952, 'grad_norm': 0.25892844796180725, 'learning_rate': 0.0001679103180580499, 'epoch': 2.2}
{'loss': 0.094, 'grad_norm': 0.26203739643096924, 'learning_rate': 0.0001661722959272348, 'epoch': 2.23}
{'loss': 

  0%|          | 0/8631 [00:00<?, ?it/s]

{'eval_loss': 0.06309106945991516, 'eval_runtime': 2493.2051, 'eval_samples_per_second': 27.692, 'eval_steps_per_second': 3.462, 'epoch': 3.0}
{'loss': 0.0832, 'grad_norm': 0.2174602746963501, 'learning_rate': 0.00011924569839522623, 'epoch': 3.01}
{'loss': 0.0774, 'grad_norm': 0.2557228207588196, 'learning_rate': 0.00011750767626441109, 'epoch': 3.04}
{'loss': 0.0774, 'grad_norm': 0.30044952034950256, 'learning_rate': 0.00011576965413359596, 'epoch': 3.07}
{'loss': 0.0817, 'grad_norm': 0.32031986117362976, 'learning_rate': 0.00011403163200278082, 'epoch': 3.1}
{'loss': 0.0788, 'grad_norm': 0.3648945391178131, 'learning_rate': 0.00011229360987196569, 'epoch': 3.13}
{'loss': 0.0791, 'grad_norm': 0.2257809340953827, 'learning_rate': 0.00011055558774115055, 'epoch': 3.16}
{'loss': 0.0789, 'grad_norm': 0.18788762390613556, 'learning_rate': 0.00010881756561033542, 'epoch': 3.19}


KeyboardInterrupt: 