In [1]:
from transformers import AutoTokenizer, AutoModelForCausalLM, AutoConfig, AutoModel
import torch
import torch.nn.functional as F
torch.manual_seed(1234)

<torch._C.Generator at 0x7fe75c9d24b0>

# Pretrain an LLM using HuggingFace

## Load a Language Model from HuggingFace
Here we use the huggingface wrap of the SimpleLLM model. It can be found on 

https://huggingface.co/alex2020/simplellm

To load a model without pretrained weights, we need to first get the model configurations and then create the model using this configurations.

Model configuration typically include model type, embeded dimension, vocab size, etc.



In [31]:


model_name = "alex2020/simplellm"
config = AutoConfig.from_pretrained(model_name, trust_remote_code=True)
model = AutoModelForCausalLM.from_config(config, trust_remote_code=True)
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)


In [3]:
config

SimpleLLMConfig {
  "architectures": [
    "SimpleLLMForCausalLM"
  ],
  "attention_bias": false,
  "attention_dropout": 0.0,
  "auto_map": {
    "AutoConfig": "alex2020/simplellm--configuration_simplellm.SimpleLLMConfig",
    "AutoModel": "alex2020/simplellm--modeling_simplellm.SimpleLLMModel",
    "AutoModelForCausalLM": "alex2020/simplellm--modeling_simplellm.SimpleLLMForCausalLM"
  },
  "hidden_act": "relu",
  "hidden_size": 8,
  "initializer_range": 0.02,
  "intermediate_size": 32,
  "model_type": "simplellm",
  "num_hidden_layers": 2,
  "torch_dtype": "bfloat16",
  "transformers_version": "4.51.3",
  "vocab_size": 9
}

The architecture of the model can be seen from printing the model details.

In [4]:
model

SimpleLLMForCausalLM(
  (model): SimpleLLMModel(
    (embed_tokens): Embedding(9, 8)
    (layers): ModuleList(
      (0-1): 2 x SimpleDecoderLayer(
        (self_attn): SimpleAttention(
          (q_proj): Linear(in_features=8, out_features=8, bias=False)
          (k_proj): Linear(in_features=8, out_features=8, bias=False)
          (v_proj): Linear(in_features=8, out_features=8, bias=False)
        )
        (mlp): SimpleMLP(
          (fc): Linear(in_features=8, out_features=32, bias=False)
          (fo): Linear(in_features=32, out_features=8, bias=False)
          (act_fn): ReLU()
        )
      )
    )
  )
  (lm_head): Linear(in_features=8, out_features=9, bias=False)
)

Here we use a simple model with two layers. We also tie the LM head's weight with the embedding layer. To check this, use

In [5]:
model.model.embed_tokens.weight == model.lm_head.weight

tensor([[True, True, True, True, True, True, True, True],
        [True, True, True, True, True, True, True, True],
        [True, True, True, True, True, True, True, True],
        [True, True, True, True, True, True, True, True],
        [True, True, True, True, True, True, True, True],
        [True, True, True, True, True, True, True, True],
        [True, True, True, True, True, True, True, True],
        [True, True, True, True, True, True, True, True],
        [True, True, True, True, True, True, True, True]])

## Inference 

### using Transformers' Pipeline

An inference contains the following steps:
- tokenize the input text and convert them to sequence of integers 
- process the sequence of integers with the LLM model and get output probabilities
- sampling from the predicted probabilities 

Huggingface Transformers have implemented a `pipeline` for the inference.

In [6]:

from transformers import pipeline, GenerationConfig
config = GenerationConfig(max_new_tokens=10, temperature=1.2)
generator = pipeline('text-generation', model = model, tokenizer=tokenizer, prefix="", generation_config=config)
print(generator("You do not like"))


Device set to use cpu


[{'generated_text': 'You do not like like like like like like like like like like like'}]




### Implementing a inference pipeline
Let's redo the above inference by hand. 

- First, tokenize the text and convert it to sequence of integers.

In [7]:
input_ids = tokenizer("You do not like", return_tensors="pt").to(model.device)
print(input_ids)

{'input_ids': tensor([[1, 3, 4, 2]]), 'attention_mask': tensor([[1, 1, 1, 1]])}


 - then pass this to the LLM for get the predicted probabilities

In [8]:
output = model(**input_ids)
print(output)

MoeCausalLMOutputWithPast(loss=None, aux_loss=None, logits=tensor([[[-6.4468e-04,  2.2278e-03, -8.5831e-04, -4.8828e-04, -2.4033e-04,
          -1.1206e-04, -1.2741e-03, -4.5586e-04,  1.6556e-03],
         [ 9.8419e-04, -4.8065e-04, -9.5367e-05,  1.2512e-03,  7.0190e-04,
          -2.9564e-04,  5.7983e-04, -7.8583e-04, -1.0910e-03],
         [ 2.6703e-04, -2.3651e-04,  3.2234e-04,  6.9809e-04,  2.0905e-03,
           9.9182e-04,  3.4831e-07, -4.3488e-04,  7.0572e-04],
         [-1.4877e-03, -8.2397e-04,  2.6855e-03, -6.8665e-05,  3.7003e-04,
           4.7874e-04, -2.5940e-04,  2.5482e-03,  1.7071e-04]]],
       dtype=torch.bfloat16, grad_fn=<UnsafeViewBackward0>), past_key_values=None, hidden_states=None, attentions=None, router_logits=None)


 - Finally, we do sample from the predicted probabilities

In [9]:
predict_prob = F.softmax(output.logits,-1)
print("predicted probabilities:",predict_prob[0][-1])
next_token = torch.multinomial(predict_prob[0][-1], num_samples=1)
print("next token", next_token)
tokenizer.decode(next_token)

predicted probabilities: tensor([0.1108, 0.1108, 0.1113, 0.1108, 0.1113, 0.1113, 0.1108, 0.1113, 0.1113],
       dtype=torch.bfloat16, grad_fn=<SelectBackward0>)
next token tensor([5])


'coffee'

- make it iterable

In [10]:
def inference_pipeline(model, tokenizer, input_text, max_new_tokens):
    input_ids = tokenizer(input_text, return_tensors="pt").to(model.device)
    for _ in range(max_new_tokens):
        output = model(**input_ids)
        predict_prob = F.softmax(output.logits,-1)
        next_token = torch.multinomial(predict_prob[0][-1], num_samples=1)
        input_ids["input_ids"] = torch.cat((input_ids["input_ids"], next_token.unsqueeze(1)), dim=1)
        input_ids["attention_mask"] = torch.ones_like(input_ids["input_ids"])
        input_text += " " +  tokenizer.decode(next_token)
    return input_text
        


In [11]:
inference_pipeline(model, tokenizer, "You do not like", max_new_tokens=5)

'You do not like <|endoftext|> . coffee coffee .'

## Pretrain a model

We train the model with the following data:

- I like coffee.
- I like tea.
- You like tea.
- You do not like coffee.

There are two ways to train the model. 
- using PyTorch since it is a PyTorch model
- using the `Trainer` API provided by the `transformers` package

The second approach provides an easy way to train LLMs but may face issues when training on mutiple GPUs on multiple nodes.

### Prepare a dataset

It is more convenient to use a Pytorch like dataset to use the `Trainer`. 
- HF provides a `datasets` package
- we can build a dataset from dictionary `{"text": sentences}`
- the labels and inputs are the same, `transformers` package will automatically compute the loss for next token prediction

We can create a dataset and push it to huggingface datasets for future usage.

In [12]:
# Uncomment to upload to your huggingface repo
# sentences = [
#     "I like tea.",
#     "I like coffee.",
#     "You like tea.",
#     "You do not like coffee."
# ]
# data = {"text":sentences}

# from datasets import Dataset

# dataset = Dataset.from_dict(data)

# dataset.push_to_hub(YOU_HF_REPO)


In [13]:
from datasets import load_dataset

In [14]:
train_data = load_dataset("alex2020/SimpleDataset")

Found cached dataset parquet (/home/user/.cache/huggingface/datasets/alex2020___parquet/alex2020--SimpleDataset-a7ad216bf9fb3928/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec)


  0%|          | 0/1 [00:00<?, ?it/s]

In [15]:
train_data

DatasetDict({
    train: Dataset({
        features: ['text'],
        num_rows: 4
    })
})

In [16]:
train_data['train']['text'][0]

'I like tea .'

We need to tokenize the dataset before passing to the trainer.

In [17]:
tokenizer.pad_token = tokenizer.all_special_tokens[0]

In [18]:
tokenizer

SimpleLLMTokenizer(name_or_path='alex2020/simplellm', vocab_size=8, model_max_length=1000000000000000019884624838656, is_fast=False, padding_side='right', truncation_side='right', special_tokens={'pad_token': '<|endoftext|>', 'additional_special_tokens': ['<|endoftext|>']}, clean_up_tokenization_spaces=False, added_tokens_decoder={
	8: AddedToken("<|endoftext|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
}
)

In [19]:
# Tokenization function with labels
def tokenize(example):
    tokens = tokenizer(
        example["text"],
        padding=True,
        truncation=False,
        max_length=8,
    )
    tokens["labels"] = tokens["input_ids"].copy()
    return tokens

# Tokenize and add labels
train_dataset = train_data.map(tokenize, batched=True)



Loading cached processed dataset at /home/user/.cache/huggingface/datasets/alex2020___parquet/alex2020--SimpleDataset-a7ad216bf9fb3928/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec/cache-0281328f50ca738d.arrow


In [20]:
train_dataset['train'][2]['input_ids']

[1, 2, 6, 7, 8, 8]

In [21]:
train_dataset['train'][1]['labels']

[0, 2, 5, 7, 8, 8]

In [22]:
import wandb
wandb.login(key='a1f71d1f4765648afaa0bdcb52c2dd99caca6bc9')


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /home/user/.netrc
[34m[1mwandb[0m: Currently logged in as: [33malexhuo2020[0m ([33misuai[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


True

In [34]:

# 4. Training arguments
from transformers import Trainer, TrainingArguments
training_args = TrainingArguments(
    output_dir="./pretrained",
    num_train_epochs=3000,
    logging_dir="./logs",
    logging_steps=500,
    save_steps=3000,
    save_total_limit=1,
    max_grad_norm=None,
    # bf16=True,
    padding_side="right",
    learning_rate=1e-3
)

# 5. Trainer setup
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset['train'],
)

# 6. Train
trainer.train()


TypeError: TrainingArguments.__init__() got an unexpected keyword argument 'padding_side'

In [24]:
train_dataloader = trainer.get_train_dataloader()
for batch in train_dataloader:
    print(batch)
    break


{'input_ids': tensor([[1, 2, 6, 7, 8, 8],
        [1, 3, 4, 2, 5, 7],
        [0, 2, 6, 7, 8, 8],
        [0, 2, 5, 7, 8, 8]]), 'attention_mask': tensor([[1, 1, 1, 1, 0, 0],
        [1, 1, 1, 1, 1, 1],
        [1, 1, 1, 1, 0, 0],
        [1, 1, 1, 1, 0, 0]]), 'labels': tensor([[1, 2, 6, 7, 8, 8],
        [1, 3, 4, 2, 5, 7],
        [0, 2, 6, 7, 8, 8],
        [0, 2, 5, 7, 8, 8]])}


In [25]:
tokenizer.decode(batch['input_ids'][3])

'I like coffee . <|endoftext|> <|endoftext|>'

In [26]:
from transformers import pipeline, GenerationConfig
config = GenerationConfig(max_new_tokens=8)
for _ in range(10):
    generator = pipeline('text-generation', model = model, tokenizer=tokenizer, prefix="", generation_config=config)
    print(generator("I like"))


Device set to use cpu
Device set to use cpu
Device set to use cpu
Device set to use cpu
Device set to use cpu
Device set to use cpu
Device set to use cpu
Device set to use cpu
Device set to use cpu
Device set to use cpu


[{'generated_text': 'I like coffee. You not like coffee. You'}]
[{'generated_text': 'I like coffee. You not like coffee. You'}]
[{'generated_text': 'I like coffee. You not like coffee. You'}]
[{'generated_text': 'I like coffee. You not like coffee. You'}]
[{'generated_text': 'I like coffee. You not like coffee. You'}]
[{'generated_text': 'I like coffee. You not like coffee. You'}]
[{'generated_text': 'I like coffee. You not like coffee. You'}]
[{'generated_text': 'I like coffee. You not like coffee. You'}]
[{'generated_text': 'I like coffee. You not like coffee. You'}]
[{'generated_text': 'I like coffee. You not like coffee. You'}]


In [27]:
tokenizer("I do .")

{'input_ids': [0, 3, 7], 'attention_mask': [1, 1, 1]}

In [28]:
tokenizer(train_data['train'][0]['text'])

{'input_ids': [0, 2, 6, 7], 'attention_mask': [1, 1, 1, 1]}