# Training llama models from scratch

Import all needed libraries:

In [None]:
import torch
import pandas as pd
from random import sample
from pathlib import Path
from tqdm import tqdm 

from tokenizers import (
    decoders,
    models,
    normalizers,
    pre_tokenizers,
    processors,
    trainers,
    Tokenizer)

from tokenizers.normalizers import Lowercase, Strip, StripAccents, NFD

from transformers import (
    AutoTokenizer, 
    PreTrainedTokenizerFast, 
    set_seed, 
    Trainer, 
    TrainingArguments, 
    DataCollatorForLanguageModeling, 
    LlamaForCausalLM, 
    LlamaConfig)

from datasets import load_dataset

### Paths

Set paths to training data, eval data, and model directory:

In [None]:
# Plain-text files used to train both the tokenizer and the language model.
training_files = [
    '/path/to/training/file1',
    '/path/to/training/file2',
    '/path/to/training/file3',
    '/path/to/training/etc',
]

# Plain-text files used as the validation split during training.
eval_files = [
    '/path/to/eval/file1',
    '/path/to/eval/file2',
    '/path/to/eval/file3',
    '/path/to/eval/etc',
]

In [None]:
# Base directory for everything this notebook writes (tokenizer, checkpoints,
# logs, final model). Keep the trailing slash: later cells build sub-paths by
# plain string concatenation, e.g. model_path + 'tokenizer/'.
model_path = '/path/to/model/dir/'

### Tokenizer

Initialize with BPE:

In [None]:
# Start from an empty BPE model; it is trained on the training files further down.
tokenizer = Tokenizer(models.BPE())

Normalizer that applies Unicode NFD decomposition, lowercases the text, strips surrounding whitespace, and removes accents:

(explanations here: https://huggingface.co/docs/tokenizers/components)

In [None]:
# The steps are applied in list order; NFD is placed before StripAccents,
# as the tokenizers documentation recommends.
normalizer = normalizers.Sequence([NFD(), Lowercase(), Strip(), StripAccents()])

In [None]:
# Quick check of the normalizer on a sample string containing accents.
normalizer.normalize_str("Héllò hôw are ü?")

In [None]:
# Register the normalizer on the tokenizer.
tokenizer.normalizer = normalizer

Pre-tokenization (division of text into tokens on which BPE can be performed):

In [None]:
# Byte-level pre-tokenization; add_prefix_space=False means no space is
# prepended to the start of the text.
tokenizer.pre_tokenizer = pre_tokenizers.ByteLevel(add_prefix_space=False)

In [None]:
# Inspect how the pre-tokenizer splits a sample sentence.
tokenizer.pre_tokenizer.pre_tokenize_str("Let's test pre-tokenization!")

Set vocab size, add special tokens:

In [None]:
# Register the special tokens at training time so that "<|endoftext|>" and
# "<pad>", which later cells use as BOS/EOS/PAD, are part of the vocabulary.
trainer = trainers.BpeTrainer(
    vocab_size=16000,
    special_tokens=["<|endoftext|>", "<pad>"],
)

In [None]:
# Learn the BPE vocabulary from the training files using the trainer settings.
tokenizer.train(files = training_files, trainer=trainer)

In [None]:
# Sanity check: show the tokens the trained tokenizer produces for a sample sentence.
encoding = tokenizer.encode("Let's test this tokenizer.")
print(encoding.tokens)

By default, the ByteLevel BPE might include whitespaces in the produced tokens. If you don’t want the offsets to include these whitespaces, then this PostProcessor must be used:

(https://huggingface.co/docs/tokenizers/api/post-processors)

In [None]:
# trim_offsets=True so that reported offsets do not include the whitespace
# that byte-level tokens may carry.
tokenizer.post_processor = processors.ByteLevel(trim_offsets=True)

In [None]:
# Check the offsets: take the character span of the token at index 4 and
# slice the original sentence with it to see which text that token covers.
sentence = "Let's test this tokenizer."
encoding = tokenizer.encode(sentence)
start, end = encoding.offsets[4]
sentence[start:end]

In [None]:
# Display the tokenizer object to inspect its current configuration.
tokenizer

In [None]:
# Byte-level decoder, matching the ByteLevel pre-tokenizer set earlier.
tokenizer.decoder = decoders.ByteLevel()

In [None]:
# Round-trip check: decode the ids of the most recent encoding back to text.
tokenizer.decode(encoding.ids)

Save it:

In [None]:
# Wrap the trained tokenizer in a transformers fast tokenizer and declare
# which tokens serve as BOS, EOS (both "<|endoftext|>") and padding ("<pad>").
wrapped_tokenizer = PreTrainedTokenizerFast(
    tokenizer_object=tokenizer,
    bos_token="<|endoftext|>",
    eos_token="<|endoftext|>",
    pad_token="<pad>",
)    

In [None]:
# Save the wrapped tokenizer next to the model outputs.
# NOTE: the target is built by string concatenation, so model_path must end
# with a path separator for 'tokenizer/' to become a sub-directory.
wrapped_tokenizer.save_pretrained(model_path+'tokenizer/')

### Training 

Load tokenizer:

In [None]:
# Reload the saved tokenizer; the path expression must match the one used when saving.
tokenizer = AutoTokenizer.from_pretrained(model_path+'tokenizer/')
# Use the end-of-text token for padding as well (replaces the pad token stored with the tokenizer).
tokenizer.pad_token = tokenizer.eos_token

Load data (now for training):

In [None]:
# Use the 'text' loader with separate file lists for the 'train' and 'validation' splits.
raw_datasets = load_dataset('text', data_files={'train': training_files, 
                                           'validation': eval_files})

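Tokenize the text and split it into chunks of at most `context_length` tokens (padding and truncation are explained here: https://huggingface.co/docs/transformers/pad_truncation):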

In [None]:
# Maximum number of tokens per example; passed as max_length to the tokenizer in `tokenize`.
context_length = 64

In [None]:
def tokenize(element):
    """Tokenize a batch of text rows into chunks of at most ``context_length`` tokens.

    Overflowing tokens are requested from the tokenizer as well, so a single
    input row may produce several output rows.

    Args:
        element: Batch handed over by ``Dataset.map(batched=True)``; only its
            ``"text"`` entry is read.

    Returns:
        dict with the single key ``"input_ids"`` mapping to a list that holds
        one sequence of token ids per produced chunk.
    """
    encoded = tokenizer(
        element["text"],
        max_length=context_length,
        truncation=True,
        padding=True,
        return_overflowing_tokens=True,
        return_length=True,
    )

    # Walk lengths and ids in lockstep; the length itself is not used.
    chunk_ids = [ids for _, ids in zip(encoded["length"], encoded["input_ids"])]
    return {"input_ids": chunk_ids}


# Tokenize every split in batches. The original columns are removed because
# `tokenize` can return a different number of rows than it received.
tokenized_datasets = raw_datasets.map(tokenize, 
                                      batched=True, 
                                      remove_columns=raw_datasets["train"].column_names)
# Display the resulting dataset summary.
tokenized_datasets

Initialize a new Llama configuration with the desired settings:

In [None]:
# Model architecture. The vocabulary size and the special-token ids are read
# from the loaded tokenizer so the embedding table always covers every id the
# tokenizer can emit and the model agrees with the tokenizer on BOS/EOS/PAD.
config = LlamaConfig(
    vocab_size=len(tokenizer),  # includes special tokens added on top of the trained vocabulary
    hidden_size=512,
    num_hidden_layers=8,
    intermediate_size=512,
    num_attention_heads=8,
    bos_token_id=tokenizer.bos_token_id,
    eos_token_id=tokenizer.eos_token_id,
    pad_token_id=tokenizer.pad_token_id,
    max_position_embeddings=512,
)

Set seed for weight initialization:

In [None]:
# Fix the random seed before the model is created so the initial weights are reproducible.
set_seed(42)

New model object:

In [None]:
# Build the model from the config alone, i.e. with freshly initialised (untrained) weights.
model = LlamaForCausalLM(config)

In [None]:
# mlm=False selects causal (next-token) language modelling instead of masked LM.
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

Check the number of model parameters:

In [None]:
# Report the model's parameter count.
print(f'model num parameters = {model.num_parameters()}')

In [None]:
# Plain-dict form of the configuration (not used again in the cells shown here).
config_dict = config.to_dict()

Set training parameters:

In [None]:
# Training hyper-parameters; checkpoints are written to `model_path`.
training_args = TrainingArguments(
    output_dir=model_path,
    overwrite_output_dir=True,
    save_strategy = "epoch", # saves a checkpoint after every epoch
    #save_strategy = "steps", 
    #save_steps = 0.1, # a float below one is read as a fraction of the total training steps
    #save_total_limit=100,  # set to zero to avoid saving
    eval_strategy = "epoch",
    #eval_steps = 0.1,
    num_train_epochs= 5,
    #max_steps = 1,
    gradient_accumulation_steps=8,  # with the batch size below: 8 * 16 = 128 sequences per optimizer step per device
    per_device_train_batch_size=16,
    warmup_steps=200, 
    lr_scheduler_type="cosine",
    learning_rate=3e-4, # normal: 5e-4
    logging_steps=10,
    #fp16=True, ## only on CUDA
    #load_best_model_at_end=True,
    metric_for_best_model="eval_loss",  # NOTE(review): presumably only relevant once load_best_model_at_end is enabled - confirm
    #use_mps_device=True, ## only on apple silicon
    #use_cpu = True
)

Initialize trainer object:

In [None]:
# Bundle model, tokenizer, data and arguments into a Trainer. This rebinds the
# name `trainer`, which earlier in the notebook held the BPE tokenizer trainer.
# NOTE(review): recent transformers releases deprecate the `tokenizer` argument
# in favour of `processing_class` - confirm against the installed version.
trainer = Trainer(
    model=model,
    tokenizer=tokenizer,
    args=training_args,
    data_collator=data_collator,
    train_dataset=tokenized_datasets['train'],#[:15000]['input_ids'],
    eval_dataset=tokenized_datasets['validation']#[:1200]['input_ids'],
)

Train:

In [None]:
# Run the training loop with the arguments configured above.
trainer.train()

Save logs of losses:

In [None]:
# Collect the records the trainer logged during training into a table.
df = pd.DataFrame(trainer.state.log_history)
# No cell shown here creates the logs directory, so create it before writing;
# Path joining also works whether or not model_path ends with a separator.
logs_dir = Path(model_path) / 'logs'
logs_dir.mkdir(parents=True, exist_ok=True)
df.to_csv(logs_dir / 'losses.csv')

Save the final model:

In [None]:
# Store the final model in the `final` sub-directory of model_path.
trainer.save_model(str(Path(model_path) / 'final'))

### Test trained model on text generation

With hf pipelines:

In [None]:
from transformers import pipeline

# Load the model from the `final` sub-directory of model_path rather than from
# a machine-specific absolute path. `pipeline` expects the location as a string.
pipe = pipeline("text-generation", model=str(Path(model_path) / 'final'))

In [None]:
# Sample 20 continuations of the prompt, each capped at max_length=128.
# The commented-out arguments are optional sampling controls.
pipe("I am the", do_sample = True, 
     num_return_sequences = 20, 
     max_length=128
     #top_k=50,
     #top_p=0.8,
     #temperature=1.0,
    )