In [1]:
!curl -L https://raw.githubusercontent.com/Denis2054/Transformers-for-NLP-2nd-Edition/master/Chapter04/kant.txt --output "kant.txt"

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100 10.7M  100 10.7M    0     0  12.6M      0 --:--:-- --:--:-- --:--:-- 12.6M


In [2]:
!pip install -y tensorflow
!pip install git+https://github.com/huggingface/transformers
!pip install git+https://github.com/huggingface/accelerate
!pip list | grep -E 'transformers | tokenizers'


Usage:   
  pip3 install [options] <requirement specifier> [package-index-options] ...
  pip3 install [options] -r <requirements file> [package-index-options] ...
  pip3 install [options] [-e] <vcs project url> ...
  pip3 install [options] [-e] <local project path> ...
  pip3 install [options] <archive url/path> ...

no such option: -y
Collecting git+https://github.com/huggingface/transformers
  Cloning https://github.com/huggingface/transformers to /tmp/pip-req-build-ur1x00y2
  Running command git clone --filter=blob:none --quiet https://github.com/huggingface/transformers /tmp/pip-req-build-ur1x00y2
  Resolved https://github.com/huggingface/transformers to commit a564d10afe1a78c31934f0492422700f61a0ffc0
  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Building wheels for collected packages: transformers
  Building wheel for transformers (pyproject.toml) ... [?25

**Training a tokenizer**

In [3]:
%%time
from pathlib import Path
from tokenizers import ByteLevelBPETokenizer
paths = [str(x) for x in Path('.').glob('**/*.txt')]

tokenizer = ByteLevelBPETokenizer()

tokenizer.train(files=paths, vocab_size=52_000, min_frequency=2, special_tokens=[
    "<s>",
    "<pad>",
    "</s>",
    "<unk>",
    "<mask>"
])

CPU times: user 7.42 s, sys: 162 ms, total: 7.58 s
Wall time: 4 s


In [4]:
import os
token_dir = '/content/KantaiBERT'
if not os.path.exists(token_dir):
  os.makedirs(token_dir)
tokenizer.save_model('KantaiBERT')

['KantaiBERT/vocab.json', 'KantaiBERT/merges.txt']

**Loading the trained tokenizer files**

In [5]:
from tokenizers.implementations import ByteLevelBPETokenizer
from tokenizers.processors import BertProcessing

tokenizer = ByteLevelBPETokenizer(
    "./KantaiBERT/vocab.json",
    "./KantaiBERT/merges.txt",
)

In [6]:
tokenizer.encode("The Critique of Pure Reason.").tokens

['The', 'ĠCritique', 'Ġof', 'ĠPure', 'ĠReason', '.']

In [7]:
tokenizer._tokenizer.post_processor = BertProcessing(
    ("</s>", tokenizer.token_to_id("</s>")),
    ("<s>", tokenizer.token_to_id("<s>")),
)

tokenizer.enable_truncation(max_length=512)

In [8]:
tokenizer.encode("The Critique of Pure Reason.")

Encoding(num_tokens=8, attributes=[ids, type_ids, tokens, offsets, attention_mask, special_tokens_mask, overflowing])

In [9]:
tokenizer.encode("The Critique of Pure Reason.").tokens

['<s>', 'The', 'ĠCritique', 'Ġof', 'ĠPure', 'ĠReason', '.', '</s>']

In [10]:
!nvidia-smi

Wed May 29 09:27:38 2024       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 535.104.05             Driver Version: 535.104.05   CUDA Version: 12.2     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|   0  Tesla T4                       Off | 00000000:00:04.0 Off |                    0 |
| N/A   42C    P8               9W /  70W |      0MiB / 15360MiB |      0%      Default |
|                                         |                      |                  N/A |
+-----------------------------------------+----------------------+----------------------+
                                                                    

In [11]:
import torch
torch.cuda.is_available()

True

**Defining the configuration of the model**

In [12]:
from transformers import RobertaConfig
config = RobertaConfig(
    vocab_size = 52_000,
    max_position_embedding = 514,
    num_attention_heads = 12,
    num_hidden_layers = 6,
    type_vocab_size =1
)

**Reloading the tokenizer in transformers**

In [13]:
from transformers import RobertaTokenizer
tokenizer = RobertaTokenizer.from_pretrained('./KantaiBERT', max_length=512)

**Initializing a model from scratch**

In [14]:
from transformers import RobertaForMaskedLM

In [15]:
model = RobertaForMaskedLM(config = config)

In [16]:
print(model)

RobertaForMaskedLM(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(52000, 768, padding_idx=1)
      (position_embeddings): Embedding(512, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0-5): 6 x RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): La

In [17]:
print(model.num_parameters())

83502880


In [18]:
LP = list(model.parameters())
lp = len(LP)
print(lp)

106


In [19]:
for p in range(0, lp):
  print(LP[p])

Parameter containing:
tensor([[-0.0060, -0.0135, -0.0379,  ...,  0.0035, -0.0146, -0.0003],
        [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
        [-0.0109, -0.0027,  0.0059,  ..., -0.0383, -0.0010,  0.0007],
        ...,
        [-0.0360, -0.0219, -0.0124,  ...,  0.0124, -0.0024, -0.0260],
        [-0.0175,  0.0358, -0.0277,  ...,  0.0252,  0.0090, -0.0121],
        [ 0.0093, -0.0044,  0.0024,  ...,  0.0035,  0.0092,  0.0050]],
       requires_grad=True)
Parameter containing:
tensor([[ 0.0131,  0.0071,  0.0358,  ..., -0.0115,  0.0181,  0.0195],
        [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
        [ 0.0292,  0.0019, -0.0158,  ...,  0.0295,  0.0064,  0.0114],
        ...,
        [ 0.0125, -0.0164, -0.0064,  ...,  0.0039,  0.0013, -0.0079],
        [-0.0132, -0.0107,  0.0194,  ...,  0.0027, -0.0278, -0.0163],
        [ 0.0123, -0.0069,  0.0104,  ..., -0.0144,  0.0172,  0.0090]],
       requires_grad=True)
Parameter containing:
tensor([[-1.

**Building the dataset**

In [20]:
%%time
from transformers import LineByLineTextDataset
dataset = LineByLineTextDataset(
    tokenizer = tokenizer,
    file_path = './kant.txt',
    block_size = 128
)



CPU times: user 29.1 s, sys: 929 ms, total: 30.1 s
Wall time: 35.7 s


In [21]:
from transformers import DataCollatorForLanguageModeling
data_collator = DataCollatorForLanguageModeling(
    tokenizer = tokenizer,
    mlm = True,
    mlm_probability=0.15
)

**Initializing the trainer**

In [22]:
!pip install accelerate -U

from transformers import Trainer, TrainingArguments
training_args = TrainingArguments(
    output_dir = "./KantaiBERT",
    overwrite_output_dir = True,
    num_train_epochs = 1,
    per_device_train_batch_size = 64,
    save_steps = 10_000,
    save_total_limit = 2
)

trainer = Trainer(
    model = model,
    args = training_args,
    data_collator = data_collator,
    train_dataset = dataset,
)



**Pretraining the model**

In [23]:
%%time
trainer.train()

Step,Training Loss
500,6.5984
1000,5.7415
1500,5.27
2000,5.0318
2500,4.9204


CPU times: user 9min 9s, sys: 20 s, total: 9min 29s
Wall time: 9min 50s


TrainOutput(global_step=2672, training_loss=5.470487880135725, metrics={'train_runtime': 590.1006, 'train_samples_per_second': 289.72, 'train_steps_per_second': 4.528, 'total_flos': 873691623267840.0, 'train_loss': 5.470487880135725, 'epoch': 1.0})

**Save the model**

In [24]:
trainer.save_model("./KantaiBERT")

**Language modeling with FillMaskPipeline**

In [25]:
from transformers import pipeline
fill_mask = pipeline(
    "fill-mask",
    model = "./KantaiBERT",
    tokenizer = "./KantaiBERT"
)

In [26]:
fill_mask("Human thinking involves human <mask>.")

[{'score': 0.02423124760389328,
  'token': 393,
  'token_str': ' reason',
  'sequence': 'Human thinking involves human reason.'},
 {'score': 0.014824768528342247,
  'token': 605,
  'token_str': ' conceptions',
  'sequence': 'Human thinking involves human conceptions.'},
 {'score': 0.01316076796501875,
  'token': 586,
  'token_str': ' nature',
  'sequence': 'Human thinking involves human nature.'},
 {'score': 0.01169105339795351,
  'token': 600,
  'token_str': ' understanding',
  'sequence': 'Human thinking involves human understanding.'},
 {'score': 0.009601338766515255,
  'token': 670,
  'token_str': ' principles',
  'sequence': 'Human thinking involves human principles.'}]