In [None]:
# https://huggingface.co/blog/how-to-train

In [None]:
  try:
    import google.colab
    runs_in_colab = True
  except ImportError:
    runs_in_colab = False

  if runs_in_colab:
    !pip install datasets
    !pip install nltk

    import transformers
    from transformers import RobertaConfig
    from transformers import RobertaForMaskedLM
    from transformers import Trainer, TrainingArguments
    from tokenizers import ByteLevelBPETokenizer
    from transformers import DataCollatorForLanguageModeling
    from transformers import RobertaTokenizerFast
    import pandas as pd
    import torch
    from torch.utils.data import Dataset, DataLoader
    from torch.utils.data import SequentialSampler
    import os
    from sklearn.model_selection import train_test_split
    import nltk
    nltk.download('punkt')

    # Import the drive library
    from google.colab import drive
    drive.mount('/content/drive/')

    import sys
    sys.path.append('drive/MyDrive/Colab Notebooks/baby-lm/')
    from utils import *

In [4]:
# Load data and save to csv file
data_to_csv("EN_sentences.csv",remove_chars=['"'])

Loading dataset: abisee/cnn_dailymail, version: 1.0.0, split: train[:10000]


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md:   0%|          | 0.00/15.6k [00:00<?, ?B/s]

train-00000-of-00003.parquet:   0%|          | 0.00/256M [00:00<?, ?B/s]

train-00001-of-00003.parquet:   0%|          | 0.00/257M [00:00<?, ?B/s]

train-00002-of-00003.parquet:   0%|          | 0.00/259M [00:00<?, ?B/s]

validation-00000-of-00001.parquet:   0%|          | 0.00/34.7M [00:00<?, ?B/s]

test-00000-of-00001.parquet:   0%|          | 0.00/30.0M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/287113 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/13368 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/11490 [00:00<?, ? examples/s]

Dataset loaded
Articles separated into sentences
Removed specified characters from sentences: ['"']
Total number of tokens: 7061584
Data saved to EN_sentences.csv


In [None]:
## run the code from this cell to load the cleaned data file

In [5]:
df = pd.read_csv("/content/data/EN_sentences.csv")

In [7]:
sequences = df['sentences'].tolist()
sequences = [seq for seq in sequences if not isinstance(seq, float)] #removing floats from data

In [8]:
len(sequences)

329866

In [9]:
#split data into train and dev

df_train, df_dev = train_test_split(sequences, test_size=0.15, random_state=42)

In [10]:
len(df_train)

280386

In [12]:
# Check data types in df_train and df_dev - confirming only str in data
print("Data types in df_train:", {type(x) for x in df_train})
print("Data types in df_dev:", {type(x) for x in df_dev})

Data types in df_train: {<class 'str'>}
Data types in df_dev: {<class 'str'>}


In [13]:
tokenizer_folder = 'en_tokenizer_folder'

if not os.path.exists(tokenizer_folder):
    os.mkdir(tokenizer_folder)

In [14]:

# Initialize tokenizer
tokenizer = ByteLevelBPETokenizer()

tokenizer.train_from_iterator(df_train, vocab_size=52_000, min_frequency=2, show_progress=True, special_tokens=[
    "<s>",
    "<pad>",
    "</s>",
    "<unk>",
    "<mask>",
])

# Save tokenizer
tokenizer.save_model(tokenizer_folder)

['en_tokenizer_folder/vocab.json', 'en_tokenizer_folder/merges.txt']

In [15]:
## as seen at https://colab.research.google.com/github/huggingface/blog/blob/master/notebooks/01_how_to_train.ipynb

# Configuration for RoBERTa model
config = RobertaConfig(
    vocab_size=52_000,
    max_position_embeddings=514,
    num_attention_heads=12,
    num_hidden_layers=6,
    type_vocab_size=1,
)
# Initialize model
model = RobertaForMaskedLM(config=config)
print('Num parameters: ',model.num_parameters())

Num parameters:  83504416


In [16]:
# Load tokenizer
max_length = 512
tokenizer = RobertaTokenizerFast.from_pretrained(tokenizer_folder, max_len=max_length)



In [17]:
# create CustomDataset class

class CustomDataset(Dataset):
    def __init__(self, data, tokenizer):
        self.examples = []
        self.mask = []

        for example in data:
            x=tokenizer.encode_plus(example, max_length = max_length, truncation=True, padding=True)
            self.examples += [x.input_ids]
            self.mask += [x.attention_mask]

    def __len__(self):
        return len(self.examples)

    def __getitem__(self, i):
        return torch.tensor(self.examples[i])

# create train and evaluation datasets
train_dataset = CustomDataset(df_train, tokenizer)
eval_dataset = CustomDataset(df_dev, tokenizer)

In [18]:

# Define data collator
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer, mlm=True, mlm_probability=0.15
)

In [19]:
#adapted from https://github.com/MiuLab/FastMTL/blob/only3/custom_trainer.py

# adapt get_train_dataloader function to supply DataLoader using SequentialSampler and shuffle=False to enforce curriculum learning

from transformers.trainer import *
def get_train_dataloader(self) -> DataLoader:
    """
    Returns the training :class:`~torch.utils.data.DataLoader`.

    Will use no sampler if :obj:`self.train_dataset` does not implement :obj:`__len__`, a random sampler (adapted
    to distributed training if necessary) otherwise.

    Subclass and override this method if you want to inject some custom behavior.
    """
    if self.train_dataset is None:
        raise ValueError("Trainer: training requires a train_dataset.")

    return DataLoader(
        self.train_dataset,
        batch_size=self.args.train_batch_size,
        sampler=SequentialSampler(self.train_dataset),
        collate_fn=self.data_collator,
        drop_last=self.args.dataloader_drop_last,
        num_workers=self.args.dataloader_num_workers,
        shuffle=False
    )
Trainer.get_train_dataloader = get_train_dataloader

In [None]:
batch_size = 16

# Define training arguments
training_args = TrainingArguments(
    output_dir='model_folder',
    overwrite_output_dir=True,
    evaluation_strategy = 'epoch',
    num_train_epochs=3,
    learning_rate=1e-4,
    weight_decay=0.01,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    save_steps=8192,
    save_total_limit=1,
    #seed=10098,
    max_steps=int(1269227 / batch_size)
)
# Create trainer
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
)
# Train the model
trainer.train()

max_steps is given, it will override any value given in num_train_epochs
We strongly recommend passing in an `attention_mask` since your input_ids may be padded. See https://huggingface.co/docs/transformers/troubleshooting#incorrect-output-when-padding-tokens-arent-masked.


Epoch,Training Loss,Validation Loss
1,5.0821,4.987725
2,4.5211,4.424754


In [None]:
trainer.save_model("model_folder")