In [1]:
!pip install transformers

Collecting transformers
  Downloading transformers-4.37.0-py3-none-any.whl.metadata (129 kB)
     ---------------------------------------- 0.0/129.4 kB ? eta -:--:--
     ------------------ -------------------- 61.4/129.4 kB 1.1 MB/s eta 0:00:01
     -------------------------------------- 129.4/129.4 kB 1.5 MB/s eta 0:00:00
Collecting tokenizers<0.19,>=0.14 (from transformers)
  Downloading tokenizers-0.15.0-cp311-none-win_amd64.whl.metadata (6.8 kB)
Collecting safetensors>=0.3.1 (from transformers)
  Downloading safetensors-0.4.1-cp311-none-win_amd64.whl.metadata (3.8 kB)
Downloading transformers-4.37.0-py3-none-any.whl (8.4 MB)
   ---------------------------------------- 0.0/8.4 MB ? eta -:--:--
   - -------------------------------------- 0.3/8.4 MB 7.9 MB/s eta 0:00:02
   -- ------------------------------------- 0.6/8.4 MB 7.9 MB/s eta 0:00:01
   ----- ---------------------------------- 1.2/8.4 MB 8.4 MB/s eta 0:00:01
   -------- ------------------------------- 1.9/8.4 MB 9.9 MB/s e

In [None]:
from transformers import RobertaConfig, RobertaForMaskedLM, RobertaTokenizer
from transformers import LineByLineTextDataset, DataCollatorForLanguageModeling
from transformers import Trainer, TrainerCallback, TrainingArguments
from tqdm.auto import tqdm

In [2]:
# Sequence-length settings shared by the tokenizer, the dataset and the model config.
MAX_SEQ_LEN = 512  # longest input the tokenizer is allowed to produce
BLOCK_SIZE = 128   # truncation length applied to each line of the training corpus

# Load your custom tokenizer (Make sure it's a ByteLevelBPETokenizer).
# `model_max_length` is the current keyword; `max_len` is only a legacy alias.
tokenizer = RobertaTokenizer.from_pretrained('./tokenizer', model_max_length=MAX_SEQ_LEN)

# Prepare the dataset: one training example per line of the text file.
dataset = LineByLineTextDataset(
    tokenizer=tokenizer,
    file_path="data/kag.txt",
    block_size=BLOCK_SIZE,
)

# Data collator: dynamically masks 15% of the tokens for the MLM objective.
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer, mlm=True, mlm_probability=0.15
)

# Config for RoBERTa
config = RobertaConfig(
    # len(tokenizer) also counts tokens added on top of the base vocabulary, so
    # the embedding matrix is guaranteed to cover every id the tokenizer can emit.
    vocab_size=len(tokenizer),
    # RoBERTa position ids start at padding_idx + 1, so the position table needs
    # two more rows than the longest sequence (512 -> 514); the default of 512
    # would index out of range on full-length inputs.
    max_position_embeddings=MAX_SEQ_LEN + 2,
    hidden_size=768,
    num_hidden_layers=12,
    num_attention_heads=12,
    intermediate_size=3072,
)

# Initialize the model with random weights (training from scratch).
model = RobertaForMaskedLM(config=config)

# Training arguments
training_args = TrainingArguments(
    output_dir="./RoBERTa_Albanian",
    overwrite_output_dir=True,
    num_train_epochs=5,
    per_device_train_batch_size=16,
    save_steps=10_000,    # write a checkpoint every 10k optimisation steps
    save_total_limit=2,   # keep only the two most recent checkpoints on disk
)

# Initialize Trainer with progress bar callback
class ProgressBarCallback(TrainerCallback):
    """Trainer callback that displays a tqdm bar of training-step progress.

    A new bar is opened at the start of every epoch. Its total is
    ``state.max_steps`` and it starts at ``state.global_step``, so each bar
    shows overall progress through the run rather than progress within the
    epoch. The bar is only drawn on the main process, and every hook tolerates
    the bar being absent, so the callback never raises because of hook ordering.
    """

    def __init__(self):
        # No bar exists until the first epoch begins.
        self.epoch_progress_bar = None

    def _close_bar(self):
        """Close the current bar, if any, and forget it."""
        if self.epoch_progress_bar is not None:
            self.epoch_progress_bar.close()
            self.epoch_progress_bar = None

    def on_epoch_begin(self, args, state, control, **kwargs):
        """Open a fresh bar positioned at the current global step."""
        # Avoid one bar per worker in multi-process training.
        if not state.is_world_process_zero:
            return
        # A bar may be left over from an interrupted run that reused this instance.
        self._close_bar()
        self.epoch_progress_bar = tqdm(
            total=state.max_steps,
            initial=state.global_step,  # resume the counter instead of poking tqdm internals
            desc=f"Epoch {state.epoch}",
        )

    def on_step_end(self, args, state, control, **kwargs):
        """Advance the bar by one training step."""
        if self.epoch_progress_bar is not None:
            self.epoch_progress_bar.update(1)

    def on_epoch_end(self, args, state, control, **kwargs):
        """Close the bar belonging to the epoch that just finished."""
        self._close_bar()

    def on_train_end(self, args, state, control, **kwargs):
        """Safety net: make sure no bar is left open once training finishes."""
        self._close_bar()

# Assemble the Trainer: the model and its hyper-parameters first, then the data
# pipeline, and finally the custom progress-bar hook.
trainer = Trainer(
    args=training_args,
    model=model,
    train_dataset=dataset,
    data_collator=data_collator,
    callbacks=[ProgressBarCallback()],
)

  from .autonotebook import tqdm as notebook_tqdm
None of PyTorch, TensorFlow >= 2.0, or Flax have been found. Models won't be available and only tokenizers, configuration and file/data utilities can be used.


OSError: Incorrect path_or_model_id: './tokenizer'. Please provide either the path to a local folder or the repo_id of a model on the Hub.

In [None]:
# Run masked-language-model pre-training with the Trainer configured above.
# NOTE(review): no explicit save of the final model is visible after this call;
# confirm the last weights are persisted (periodic checkpoints only land every
# `save_steps` steps).
trainer.train()