In [None]:
# https://huggingface.co/blog/how-to-train

In [1]:
gpu_info = !nvidia-smi
gpu_info = '\n'.join(gpu_info)
if gpu_info.find('failed') >= 0:
  print('Not connected to a GPU')
else:
  print(gpu_info)

from psutil import virtual_memory
ram_gb = virtual_memory().total / 1e9
print('Your runtime has {:.1f} gigabytes of available RAM\n'.format(ram_gb))

if ram_gb < 20:
  print('Not using a high-RAM runtime')
else:
  print('You are using a high-RAM runtime!')

Thu Oct  3 11:49:41 2024       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 535.104.05             Driver Version: 535.104.05   CUDA Version: 12.2     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|   0  NVIDIA A100-SXM4-40GB          Off | 00000000:00:04.0 Off |                    0 |
| N/A   30C    P0              45W / 400W |      2MiB / 40960MiB |      0%      Default |
|                                         |                      |             Disabled |
+-----------------------------------------+----------------------+----------------------+
                                                                    

In [2]:
  try:
    import google.colab
    runs_in_colab = True
  except ImportError:
    runs_in_colab = False

  if runs_in_colab:
    !pip install datasets
    !pip install nltk

    import transformers
    from transformers import RobertaConfig
    from transformers import RobertaForMaskedLM
    from transformers import Trainer, TrainingArguments
    from tokenizers import ByteLevelBPETokenizer
    from transformers import DataCollatorForLanguageModeling
    from transformers import RobertaTokenizerFast
    import pandas as pd
    import torch
    from torch.utils.data import Dataset, DataLoader
    from torch.utils.data import SequentialSampler
    import os
    from sklearn.model_selection import train_test_split
    import nltk
    nltk.download('punkt')

    # Import the drive library
    from google.colab import drive
    drive.mount('/content/drive/')

    import sys
    sys.path.append('drive/MyDrive/Colab Notebooks/baby-lm/')
    from utils import *

Collecting datasets
  Downloading datasets-3.0.1-py3-none-any.whl.metadata (20 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess (from datasets)
  Downloading multiprocess-0.70.17-py310-none-any.whl.metadata (7.2 kB)
INFO: pip is looking at multiple versions of multiprocess to determine which version is compatible with other requirements. This could take a while.
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Downloading datasets-3.0.1-py3-none-any.whl (471 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m471.6/471.6 kB[0m [31m8.5 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m7.7 MB/s[0m eta [36m0:00:0

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


Mounted at /content/drive/


In [4]:
# Load data and save to csv file
data_to_csv("/content/drive/MyDrive/Colab Notebooks/baby-lm/data/ES_sentences.csv",dataset_name = "piuba-bigdata/articles_and_comments", dataset_version="", dataset_split="train[:12000]", remove_chars=['\n\n'], lang='es')

Loading dataset: piuba-bigdata/articles_and_comments, version: , split: train[:12000]
Dataset loaded
Articles separated into sentences
Removed specified characters from sentences: ['\n\n']
Total number of tokens: 7060648
Data saved to /content/drive/MyDrive/Colab Notebooks/baby-lm/data/ES_sentences.csv


In [None]:
## run the code from this cell to load the cleaned data file

In [3]:
df = pd.read_csv("/content/drive/MyDrive/Colab Notebooks/baby-lm/data/ES_sentences.csv")

In [4]:
sequences = df['sentences'].tolist()
sequences = [seq for seq in sequences if not isinstance(seq, float)] #removing floats from data

In [5]:
len(sequences)

256736

In [6]:
#split data into train and dev

df_train, df_dev = train_test_split(sequences, test_size=0.15, random_state=42)

In [7]:
len(df_train)

218225

In [8]:
# Check data types in df_train and df_dev - confirming only str in data
print("Data types in df_train:", {type(x) for x in df_train})
print("Data types in df_dev:", {type(x) for x in df_dev})

Data types in df_train: {<class 'str'>}
Data types in df_dev: {<class 'str'>}


In [9]:
tokenizer_folder = '/content/drive/MyDrive/Colab Notebooks/baby-lm/es_tokenizer_folder'

if not os.path.exists(tokenizer_folder):
    os.mkdir(tokenizer_folder)

In [10]:

# Initialize tokenizer
tokenizer = ByteLevelBPETokenizer()

tokenizer.train_from_iterator(df_train, vocab_size=52_000, min_frequency=2, show_progress=True, special_tokens=[
    "<s>",
    "<pad>",
    "</s>",
    "<unk>",
    "<mask>",
])

# Save tokenizer
tokenizer.save_model(tokenizer_folder)

['/content/drive/MyDrive/Colab Notebooks/baby-lm/es_tokenizer_folder/vocab.json',
 '/content/drive/MyDrive/Colab Notebooks/baby-lm/es_tokenizer_folder/merges.txt']

In [11]:

# Configuration for RoBERTa model
config = RobertaConfig(
    vocab_size=52_000,
    max_position_embeddings=514,
    num_attention_heads=12,
    num_hidden_layers=6,
    type_vocab_size=1,
)
# Initialize model
model = RobertaForMaskedLM(config=config)
print('Num parameters: ',model.num_parameters())

Num parameters:  83504416


In [12]:
# Load tokenizer
max_length = 512
tokenizer = RobertaTokenizerFast.from_pretrained(tokenizer_folder, max_len=max_length)



In [13]:
# create CustomDataset class

class CustomDataset(Dataset):
    def __init__(self, data, tokenizer):
        self.examples = []
        self.mask = []

        for example in data:
            x=tokenizer.encode_plus(example, max_length = max_length, truncation=True, padding=True)
            self.examples += [x.input_ids]
            self.mask += [x.attention_mask]

    def __len__(self):
        return len(self.examples)

    def __getitem__(self, i):
        return torch.tensor(self.examples[i])

# create train and evaluation datasets
train_dataset = CustomDataset(df_train, tokenizer)
eval_dataset = CustomDataset(df_dev, tokenizer)

In [14]:

# Define data collator
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer, mlm=True, mlm_probability=0.15
)

In [15]:
#adapted from https://github.com/MiuLab/FastMTL/blob/only3/custom_trainer.py

# adapt get_train_dataloader function to supply DataLoader using SequentialSampler and shuffle=False to enforce curriculum learning

from transformers.trainer import *
def get_train_dataloader(self) -> DataLoader:
    """
    Returns the training :class:`~torch.utils.data.DataLoader`.

    Will use no sampler if :obj:`self.train_dataset` does not implement :obj:`__len__`, a random sampler (adapted
    to distributed training if necessary) otherwise.

    Subclass and override this method if you want to inject some custom behavior.
    """
    if self.train_dataset is None:
        raise ValueError("Trainer: training requires a train_dataset.")

    return DataLoader(
        self.train_dataset,
        batch_size=self.args.train_batch_size,
        sampler=SequentialSampler(self.train_dataset),
        collate_fn=self.data_collator,
        drop_last=self.args.dataloader_drop_last,
        num_workers=self.args.dataloader_num_workers,
        shuffle=False
    )
Trainer.get_train_dataloader = get_train_dataloader

In [16]:
batch_size = 16

# Define training arguments
training_args = TrainingArguments(
    output_dir='/content/drive/MyDrive/Colab Notebooks/baby-lm/es_model_folder',
    overwrite_output_dir=True,
    evaluation_strategy = 'epoch',
    num_train_epochs=5,
    learning_rate=1e-4,
    weight_decay=0.01,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    save_steps=8192,
    save_total_limit=1,
    #seed=10098,
    max_steps=int(1269227 / batch_size)
)
# Create trainer
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    tokenizer=tokenizer ## as seen at https://discuss.huggingface.co/t/impossible-to-guess-which-tokenizer-to-use-while-loading-fine-tuned-model-on-pipeline/64939
)
# Train the model
trainer.train()
trainer.save_model('/content/drive/MyDrive/Colab Notebooks/baby-lm/es_model_folder')

max_steps is given, it will override any value given in num_train_epochs
We strongly recommend passing in an `attention_mask` since your input_ids may be padded. See https://huggingface.co/docs/transformers/troubleshooting#incorrect-output-when-padding-tokens-arent-masked.


Epoch,Training Loss,Validation Loss
1,5.1842,5.107653
2,4.3956,4.358965
3,4.0442,3.987439
4,3.7865,3.774743
5,3.5589,3.548731
