# Tokenizer

In [None]:
import os

In [None]:
# Ensure the checkpoint directory exists.
# os.makedirs(..., exist_ok=True) creates any missing parent directories and
# does nothing when the directory is already present, so this cell can be
# re-run safely and has no check-then-create race.

folder1 = "/kaggle/working/checkpoints"

os.makedirs(folder1, exist_ok=True)


In [None]:
## Run once: trains the tokenizer and writes vocab.json / merges.txt

from tokenizers import ByteLevelBPETokenizer
from tokenizers.processors import BertProcessing

# NOTE(review): the dataset cell reads the word list from
# /kaggle/input/words-data/ while this cell reads it from /kaggle/working/ --
# confirm both paths refer to the same file.
train_text = ["/kaggle/working/words_250000_train.txt"]

# Initialize a tokenizer
tokenizer_model = ByteLevelBPETokenizer()

# Customize training.
# The special tokens are listed first, so they receive ids 0..4 in this order.
# vocab_size=261 is presumably 5 special tokens + the 256 byte-level symbols,
# which would leave no room for learned merges (i.e. one token per character)
# -- TODO confirm this is intended.
tokenizer_model.train(
    files=train_text,
    vocab_size=261,
    min_frequency=3,
    special_tokens=["<s>", "<pad>", "</s>", "<unk>", "<mask>"],
)
tokenizer_model.save_model("/kaggle/working/checkpoints")






['/kaggle/working/checkpoints/vocab.json',
 '/kaggle/working/checkpoints/merges.txt']

In [None]:
# Rebuild the tokenizer from the saved vocabulary and merge files.
tokenizer_model = ByteLevelBPETokenizer(
    "/kaggle/working/checkpoints/vocab.json",
    "/kaggle/working/checkpoints/merges.txt",
)

# Surround every encoding with <s> ... </s>. BertProcessing takes the closing
# (token, id) pair first and the opening pair second.
end_marker = ("</s>", tokenizer_model.token_to_id("</s>"))
start_marker = ("<s>", tokenizer_model.token_to_id("<s>"))
tokenizer_model._tokenizer.post_processor = BertProcessing(end_marker, start_marker)

# Cap encoded sequences at 128 tokens.
tokenizer_model.enable_truncation(max_length=128)

In [None]:
# Spot-check: encode a sample word and display the resulting token strings.
tokenizer_model.encode('hammer').tokens

['<s>', 'h', 'a', 'm', 'm', 'e', 'r', '</s>']

In [None]:
from transformers import RobertaConfig

# RoBERTa numbers positions starting at padding_idx + 1 (pad_token_id defaults
# to 1), so the position-embedding table needs two more rows than the longest
# sequence. Sequences are capped at 128 tokens, hence 128 + 2; a table of
# exactly 128 rows would be indexed out of range by a full-length input.
config = RobertaConfig(
    vocab_size=261,                   # must match the tokenizer's vocabulary size
    max_position_embeddings=128 + 2,  # max sequence length + 2 (see note above)
    num_attention_heads=16,
    num_hidden_layers=10,
    type_vocab_size=1,
)

In [None]:
from transformers import RobertaTokenizerFast

# Load the trained vocab/merges as a Transformers tokenizer.
# `model_max_length` is the documented keyword for the maximum sequence length;
# `max_len` is only a legacy alias for it.
tokenizer = RobertaTokenizerFast.from_pretrained(
    "/kaggle/working/checkpoints",
    model_max_length=128,
)

In [None]:
# Write the Transformers tokenizer files into the checkpoint directory;
# the call returns the paths of the files it saved.
tokenizer.save_pretrained("/kaggle/working/checkpoints")

('/kaggle/working/checkpoints/tokenizer_config.json',
 '/kaggle/working/checkpoints/special_tokens_map.json',
 '/kaggle/working/checkpoints/vocab.json',
 '/kaggle/working/checkpoints/merges.txt',
 '/kaggle/working/checkpoints/added_tokens.json',
 '/kaggle/working/checkpoints/tokenizer.json')

# Dataset

In [None]:
from transformers import LineByLineTextDataset

# Build the training set from the word-list file using `tokenizer`,
# with block_size set to 128.
dataset_for_tokenize = LineByLineTextDataset(
    file_path="/kaggle/input/words-data/words_250000_train.txt",
    block_size=128,
    tokenizer=tokenizer,
)

2024-06-20 22:09:33.575391: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-06-20 22:09:33.575529: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-06-20 22:09:33.696291: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


In [None]:
# Peek at two encoded examples: index 0 is "aaa", index 42 is "abacterial".
for sample_idx in (0, 42):
    print(dataset_for_tokenize[sample_idx])

{'input_ids': tensor([ 0, 69, 69, 69,  2])}
{'input_ids': tensor([ 0, 69, 70, 69, 71, 88, 73, 86, 77, 69, 80,  2])}


In [None]:
import torch
from transformers import DataCollatorForLanguageModeling
from transformers import RobertaForMaskedLM
from transformers import Trainer, TrainingArguments

# Model Training

In [None]:
# Use the GPU when PyTorch can see one; otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
device

device(type='cuda')

In [None]:
# Fraction of tokens to mask. Passed to the data collator as mlm_probability
# and also embedded in the checkpoint directory names (mlm_{masking}).
masking = 0.6

In [None]:
# Instantiate the masked-LM model from `config` (not from a saved checkpoint).
model = RobertaForMaskedLM(config=config)

In [None]:
# Data collator for masked-language-model training; `masking` sets the
# probability with which tokens are selected for masking.
collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=True, mlm_probability=masking)

In [None]:
# Ensure the output directory for this masking ratio exists.
# os.makedirs(..., exist_ok=True) also creates missing parents and is a no-op
# when the directory is already there, so the cell is safe to re-run.

folder1 = f"/kaggle/working/checkpoints/mlm_{masking}"

os.makedirs(folder1, exist_ok=True)


In [None]:
# Ensure the directory for the final model exists.
# os.makedirs(..., exist_ok=True) creates the intermediate mlm_{masking}
# directory as well if needed, so this cell does not depend on any earlier
# directory-creation cell having run.

folder1 = f"/kaggle/working/checkpoints/mlm_{masking}/final_checkpoint"

os.makedirs(folder1, exist_ok=True)


In [None]:
# Training configuration. Intermediate checkpointing is switched off
# (save_strategy='no').
training_args = TrainingArguments(
    output_dir=f"/kaggle/working/checkpoints/mlm_{masking}",
    overwrite_output_dir=True,
    per_device_train_batch_size=128,
    num_train_epochs=16,
    prediction_loss_only=True,
    save_strategy='no',
    save_total_limit=2,
)

# The Trainer ties together the model, the collator and the training set.
trainer = Trainer(
    args=training_args,
    model=model,
    train_dataset=dataset_for_tokenize,
    data_collator=collator,
)

In [None]:
# Train, release cached GPU memory, then persist the final model.
trainer.train()
torch.cuda.empty_cache()
# Save to the absolute path (not one relative to the current working
# directory) so the model lands in the same final_checkpoint directory that
# the fill-mask pipeline loads from.
trainer.save_model(f"/kaggle/working/checkpoints/mlm_{masking}/final_checkpoint")

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
[34m[1mwandb[0m: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

  ········································


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


Step,Training Loss
500,2.5049
1000,2.2119
1500,2.1364
2000,2.1011
2500,2.0678
3000,2.0471
3500,2.0305
4000,2.013
4500,1.9985
5000,1.9905


# Sanity Check

In [None]:
from transformers import pipeline

# Fill-mask pipeline that loads the model from the final_checkpoint directory
# and the tokenizer from the checkpoint root.
hangman_guess = pipeline(
    "fill-mask",
    tokenizer="/kaggle/working/checkpoints",
    model=f"/kaggle/working/checkpoints/mlm_{masking}/final_checkpoint",
)

In [None]:
# Query the pipeline with a single masked letter and display the candidates.
hangman_guess('hamm<mask>r')

[{'score': 0.8156739473342896,
  'token': 73,
  'token_str': 'e',
  'sequence': 'hammer'},
 {'score': 0.12053459882736206,
  'token': 69,
  'token_str': 'a',
  'sequence': 'hammar'},
 {'score': 0.0362255796790123,
  'token': 83,
  'token_str': 'o',
  'sequence': 'hammor'},
 {'score': 0.014343079179525375,
  'token': 77,
  'token_str': 'i',
  'sequence': 'hammir'},
 {'score': 0.010892124846577644,
  'token': 89,
  'token_str': 'u',
  'sequence': 'hammur'}]

In [None]:
def fill_letter(word, rank, idx=0):
    """Return the sequence produced by one fill-mask candidate for ``word``.

    Args:
        word: Text containing one or more ``<mask>`` placeholders.
        rank: Position of the candidate to use in the candidate list
            (0 is the first entry).
        idx: Which mask's candidate list to read when ``word`` contains
            several masks (default 0, the first list).

    Returns:
        The candidate's ``'sequence'`` string, or ``' '`` when the candidate's
        token id is 1.
    """
    print(word, rank)
    predictions = hangman_guess(word)
    # The pipeline yields a flat list of candidate dicts for a single mask and
    # a list of such lists for several masks. Normalise to the nested form so
    # that [idx][rank] works in both cases instead of failing on one mask.
    if predictions and isinstance(predictions[0], dict):
        predictions = [predictions]
    guess = predictions[idx][rank]
    print(guess)
    # Token id 1 is treated as "no usable letter"; it corresponds to <pad>
    # given the special-token order used when the tokenizer was trained.
    if guess.get('token') == 1:
        return ' '
    return guess.get('sequence')

# Example call with two masks: take the rank-0 candidate from the first
# candidate list (idx defaults to 0).
fill_letter('ha<mask>m<mask>r',0)

ha<mask>m<mask>r 0
{'score': 0.42544108629226685, 'token': 81, 'token_str': 'm', 'sequence': '<s>hamm<mask>r</s>'}


'<s>hamm<mask>r</s>'

# Download directory

In [None]:
# List the contents of the checkpoint directory.
os.listdir("/kaggle/working/checkpoints")

['vocab.json',
 'tokenizer_config.json',
 'model.safetensors',
 'special_tokens_map.json',
 'merges.txt',
 'mlm_0.6',
 'tokenizer.json']

In [None]:
# Recursively archive the checkpoint directory into file.zip in the current
# working directory.
!zip -r file.zip /kaggle/working/checkpoints

  pid, fd = os.forkpty()
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


updating: kaggle/working/checkpoints/ (stored 0%)
updating: kaggle/working/checkpoints/vocab.json (deflated 57%)
updating: kaggle/working/checkpoints/merges.txt (stored 0%)
updating: kaggle/working/checkpoints/mlm_0.6/ (stored 0%)
updating: kaggle/working/checkpoints/mlm_0.6/runs/ (stored 0%)
updating: kaggle/working/checkpoints/mlm_0.6/runs/Jun20_22-10-35_9e71dcb8f86e/ (stored 0%)
updating: kaggle/working/checkpoints/mlm_0.6/runs/Jun20_22-10-35_9e71dcb8f86e/events.out.tfevents.1718921441.9e71dcb8f86e.34.0 (deflated 65%)
updating: kaggle/working/checkpoints/mlm_0.6/final_checkpoint/ (stored 0%)
updating: kaggle/working/checkpoints/mlm_0.6/final_checkpoint/model.safetensors (deflated 7%)
updating: kaggle/working/checkpoints/mlm_0.6/final_checkpoint/config.json (deflated 49%)
updating: kaggle/working/checkpoints/mlm_0.6/final_checkpoint/training_args.bin (deflated 52%)
updating: kaggle/working/checkpoints/model.safetensors (deflated 7%)
  adding: kaggle/working/checkpoints/tokenizer_conf

In [None]:
# List the current working directory (file.zip should appear here).
!ls

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


checkpoints  file.zip  wandb


In [None]:
from IPython.display import FileLink
# Display a link to the archive so it can be downloaded from the notebook.
FileLink(r'file.zip')