<a href="https://colab.research.google.com/github/bforoura/Transformers/blob/main/nlp_transformers_ch3_ex1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### **KantaiBERT**: How to train a new language model from scratch using Transformers and Tokenizers


In [1]:
#@title Step 1: Loading the dataset

#1.Load kant.txt using the Colab file manager
#2.Downloading the file from GitHub

!curl -L https://raw.githubusercontent.com/PacktPublishing/Transformers-for-Natural-Language-Processing/master/Chapter03/kant.txt --output "kant.txt"




  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100 10.7M  100 10.7M    0     0  11.3M      0 --:--:-- --:--:-- --:--:-- 11.2M


In [2]:
#@title Step 2: Installing Hugging Face transformers

# We won't need TensorFlow here
!pip uninstall -y tensorflow

# Install `transformers` from master
!pip install git+https://github.com/huggingface/transformers

!pip list | grep -E 'transformers|tokenizers'
# transformers version at notebook update --- 2.9.1
# tokenizers version at notebook update --- 0.7.0



Found existing installation: tensorflow 2.9.2
Uninstalling tensorflow-2.9.2:
  Successfully uninstalled tensorflow-2.9.2
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting git+https://github.com/huggingface/transformers
  Cloning https://github.com/huggingface/transformers to /tmp/pip-req-build-wwn1wk7u
  Running command git clone -q https://github.com/huggingface/transformers /tmp/pip-req-build-wwn1wk7u
  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
    Preparing wheel metadata ... [?25l[?25hdone
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1
  Downloading tokenizers-0.13.1-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.6 MB)
[K     |████████████████████████████████| 7.6 MB 24.5 MB/s 
Collecting huggingface-hub<1.0,>=0.10.0
  Downloading huggingface_hub-0.10.1-py3-none-any.whl (163 kB)
[K     |████████████████████████████████| 163 kB 72.1 MB/s 
Bu

In [3]:
#@title Step 3: Training a Tokenizer

# print the CPU and wall times for the entire code
%%time

from pathlib import Path
from tokenizers import ByteLevelBPETokenizer

paths = [str(x) for x in Path(".").glob("**/*.txt")]

# Initialize a tokenizer
tokenizer = ByteLevelBPETokenizer()

# Customize training
tokenizer.train(files=paths, vocab_size=52_000, min_frequency=2, 
special_tokens=[
    "<s>",
    "<pad>",
    "</s>",
    "<unk>",
    "<mask>",
])


CPU times: user 4.97 s, sys: 188 ms, total: 5.16 s
Wall time: 2.9 s


In [4]:
#@title Step 4: Saving the files to disk

import os

# Create and save into the SAME directory. Using one relative path for both
# keeps this cell working when the working directory is not /content, and it
# matches the "./KantaiBERT" path the later cells read from.
token_dir = 'KantaiBERT'
os.makedirs(token_dir, exist_ok=True)  # no error if the directory already exists
tokenizer.save_model(token_dir)


['KantaiBERT/vocab.json', 'KantaiBERT/merges.txt']

In [5]:
#@title Step 5 Loading the Trained Tokenizer Files

from tokenizers.implementations import ByteLevelBPETokenizer
from tokenizers.processors import BertProcessing

# Rebuild the tokenizer from the vocabulary and merges files written in Step 4.
vocab_file = "./KantaiBERT/vocab.json"
merges_file = "./KantaiBERT/merges.txt"
tokenizer = ByteLevelBPETokenizer(vocab_file, merges_file)



In [6]:
#@title The tokenizer can now encode a sequence
# Display the token strings produced for a sample sentence.
sample_sentence = "The Critique of Pure Reason."
tokenizer.encode(sample_sentence).tokens


['The', 'ĠCritique', 'Ġof', 'ĠPure', 'ĠReason', '.']

In [7]:
#@title The tokenizer can print the number of tokens in a sequence
# Display the Encoding object itself; its repr includes the token count.
sample_sentence = "The Critique of Pure Reason."
tokenizer.encode(sample_sentence)


Encoding(num_tokens=6, attributes=[ids, type_ids, tokens, offsets, attention_mask, special_tokens_mask, overflowing])

In [8]:
# Attach a BERT-style post-processor built from the "</s>" (SEP) and "<s>" (CLS)
# tokens and their ids, then cap encodings at 512 tokens.
sep_token = "</s>"
cls_token = "<s>"
sep_pair = (sep_token, tokenizer.token_to_id(sep_token))
cls_pair = (cls_token, tokenizer.token_to_id(cls_token))
tokenizer._tokenizer.post_processor = BertProcessing(sep_pair, cls_pair)

tokenizer.enable_truncation(max_length=512)


In [9]:
# Encode the same sample sentence again, now that the post-processor is set.
sample_sentence = "The Critique of Pure Reason."
tokenizer.encode(sample_sentence).tokens


['<s>', 'The', 'ĠCritique', 'Ġof', 'ĠPure', 'ĠReason', '.', '</s>']

In [10]:
#@title Step 6: Checking Resource Constraints: GPU and NVIDIA

# Run the nvidia-smi shell command and show its report in the cell output.
!nvidia-smi


Mon Oct 17 14:12:30 2022       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 460.32.03    Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla T4            Off  | 00000000:00:04.0 Off |                    0 |
| N/A   62C    P8    13W /  70W |      0MiB / 15109MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [11]:
#@title Checking that PyTorch Sees CUDA
import torch
# Displays the boolean PyTorch reports for CUDA availability.
torch.cuda.is_available()


True

In [12]:
#@title Step 7: Defining the configuration of the Model

from transformers import RobertaConfig

# Settings handed to RobertaConfig; everything not listed keeps its default.
roberta_settings = {
    "vocab_size": 52_000,  # same value given to tokenizer.train in Step 3
    "max_position_embeddings": 514,
    "num_attention_heads": 12,
    "num_hidden_layers": 6,
    "type_vocab_size": 1,
}
config = RobertaConfig(**roberta_settings)


In [13]:
#@title Step 8: Re-creating the Tokenizer in Transformers

from transformers import RobertaTokenizer

# `model_max_length` is the keyword the tokenizer documents for its maximum
# input length. A bare `max_length` passed to from_pretrained is not that
# setting, so the intended 512-token limit was never applied to the tokenizer.
tokenizer = RobertaTokenizer.from_pretrained("./KantaiBERT", model_max_length=512)


In [14]:
#@title Step 9: Initializing a Model From Scratch

from transformers import RobertaForMaskedLM, set_seed

# The model is built from the config alone, so its weights start out random.
# Seeding first makes that initialisation repeatable across kernel restarts.
set_seed(42)

model = RobertaForMaskedLM(config=config)

# Print the module tree of the freshly initialised model.
print(model)


RobertaForMaskedLM(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(52000, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0): RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNor

In [18]:
# Print the value returned by the model's num_parameters() method.
print(model.num_parameters())



83504416


In [16]:
#@title Exploring the Parameters

# LP: the model's parameters collected into a list.
# lp: the length of that list. Both names are read by the cell that follows.
LP=list(model.parameters())
lp=len(LP)
print(lp)


106


In [None]:
# Print every entry of LP. Iterating the list directly avoids depending on the
# separately stored length `lp`, which can go stale if cells run out of order.
# Note: this prints one tensor per entry and produces a very long output.
for param in LP:
  print(param)
  

In [23]:
#@title Step 10: Building the Dataset

%%time
from transformers import LineByLineTextDataset

dataset = LineByLineTextDataset(
    tokenizer=tokenizer,
    file_path="./kant.txt",
    block_size=128,
)




CPU times: user 28.6 s, sys: 289 ms, total: 28.9 s
Wall time: 29.3 s


In [24]:
#@title Step 11: Defining a Data Collator

from transformers import DataCollatorForLanguageModeling

# Collator configured for masked language modeling (mlm=True) with an
# mlm_probability of 0.15.
mask_probability = 0.15
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=True,
    mlm_probability=mask_probability,
)


In [25]:
#@title Step 12: Initializing the Trainer

from transformers import Trainer, TrainingArguments

# Training run settings: one epoch, batches of 64 per device, a checkpoint
# every 10,000 steps, keeping at most two checkpoints in the output directory.
checkpoint_dir = "./KantaiBERT"
training_args = TrainingArguments(
    output_dir=checkpoint_dir,
    overwrite_output_dir=True,
    num_train_epochs=1,
    per_device_train_batch_size=64,
    save_steps=10_000,
    save_total_limit=2,
)

# Wire together the model (Step 9), dataset (Step 10) and collator (Step 11).
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset,
    data_collator=data_collator,
)



In [26]:
#@title Step 13: Pre-training the Model

# Time the whole cell while running the Trainer built in Step 12.
%%time
trainer.train()


***** Running training *****
  Num examples = 170964
  Num Epochs = 1
  Instantaneous batch size per device = 64
  Total train batch size (w. parallel, distributed & accumulation) = 64
  Gradient Accumulation steps = 1
  Total optimization steps = 2672


Step,Training Loss
500,6.6132
1000,5.737
1500,5.2579
2000,5.0034
2500,4.8573




Training completed. Do not forget to share your model on huggingface.co/models =)




CPU times: user 10min, sys: 2.72 s, total: 10min 3s
Wall time: 10min 6s


TrainOutput(global_step=2672, training_loss=5.449753515734644, metrics={'train_runtime': 606.0304, 'train_samples_per_second': 282.105, 'train_steps_per_second': 4.409, 'total_flos': 873620128952064.0, 'train_loss': 5.449753515734644, 'epoch': 1.0})

In [27]:
#@title Step 14: Saving the Final Model(+tokenizer + config) to disk

# The Trainer in Step 12 was not given the tokenizer, and the recorded output
# of trainer.save_model() lists only the config and the weights. Save the
# tokenizer explicitly so the directory really holds everything the title
# promises.
trainer.save_model("./KantaiBERT")
tokenizer.save_pretrained("./KantaiBERT");  # trailing ';' hides the returned file list


Saving model checkpoint to ./KantaiBERT
Configuration saved in ./KantaiBERT/config.json
Model weights saved in ./KantaiBERT/pytorch_model.bin


In [32]:
#@title Step 15: Language Modeling with the FillMaskPipeline

from transformers import pipeline

# Load both the model and the tokenizer from the directory saved in Step 14.
model_dir = "./KantaiBERT"
fill_mask = pipeline("fill-mask", model=model_dir, tokenizer=model_dir)

# Ask the pipeline for candidates at the <mask> position.
masked_sentence = "Human thinking involves human <mask>."
fill_mask(masked_sentence)



loading configuration file ./KantaiBERT/config.json
Model config RobertaConfig {
  "_name_or_path": "./KantaiBERT",
  "architectures": [
    "RobertaForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "classifier_dropout": null,
  "eos_token_id": 2,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 514,
  "model_type": "roberta",
  "num_attention_heads": 12,
  "num_hidden_layers": 6,
  "pad_token_id": 1,
  "position_embedding_type": "absolute",
  "torch_dtype": "float32",
  "transformers_version": "4.24.0.dev0",
  "type_vocab_size": 1,
  "use_cache": true,
  "vocab_size": 52000
}

loading configuration file ./KantaiBERT/config.json
Model config RobertaConfig {
  "_name_or_path": "./KantaiBERT",
  "architectures": [
    "RobertaForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "classifier_dro

[{'score': 0.01715191639959812,
  'token': 393,
  'token_str': ' reason',
  'sequence': 'Human thinking involves human reason.'},
 {'score': 0.014271341264247894,
  'token': 531,
  'token_str': ' experience',
  'sequence': 'Human thinking involves human experience.'},
 {'score': 0.010998868383467197,
  'token': 418,
  'token_str': ' conception',
  'sequence': 'Human thinking involves human conception.'},
 {'score': 0.009074173867702484,
  'token': 605,
  'token_str': ' conceptions',
  'sequence': 'Human thinking involves human conceptions.'},
 {'score': 0.00873797107487917,
  'token': 604,
  'token_str': ' existence',
  'sequence': 'Human thinking involves human existence.'}]