In [None]:
# https://huggingface.co/blog/how-to-train

In [1]:
!pip install datasets
!pip install nltk



In [2]:
import transformers
from transformers import RobertaConfig
from transformers import RobertaForMaskedLM
from transformers import Trainer, TrainingArguments
from tokenizers import ByteLevelBPETokenizer
from transformers import DataCollatorForLanguageModeling
from transformers import RobertaTokenizerFast
import pandas as pd
import torch
from torch.utils.data import Dataset, DataLoader
from torch.utils.data import SequentialSampler
from datasets import load_dataset
import os
from sklearn.model_selection import train_test_split
import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/arianabritez/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [None]:
from google.colab import drive
drive.mount('/content/drive')

In [3]:
#ds = load_dataset("marianbasti/boletin-oficial-argentina")
ds = load_dataset("piuba-bigdata/articles_and_comments")

Found cached dataset parquet (/Users/arianabritez/.cache/huggingface/datasets/piuba-bigdata___parquet/piuba-bigdata--articles_and_comments-94c404d9208411a3/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec)


  0%|          | 0/1 [00:00<?, ?it/s]

In [4]:
ds

DatasetDict({
    train: Dataset({
        features: ['tweet_id', 'text', 'title', 'url', 'user', 'body', 'created_at', 'comments'],
        num_rows: 537201
    })
})

In [5]:
ds["train"][0]

{'tweet_id': '1376940813968609288',
 'text': 'Segunda ola de coronavirus: preocupan las reuniones sociales y el Gobierno analiza medidas para después de Semana Santa https://t.co/KG2XU0uRKz',
 'title': 'Segunda ola de coronavirus: preocupan las reuniones sociales y el Gobierno analiza medidas para después de Semana Santa',
 'url': 'https://www.clarin.com/politica/segunda-ola-coronavirus-preocupan-reuniones-sociales-gobierno-analiza-medidas-despues-semana-santa_0_NnjHl1iUe.html?utm_medium=Social&utm_source=Twitter#Echobox=1617123246',
 'user': 'clarincom',
 'body': 'La decisión de Axel Kicillof de suspender actividades nocturnas y limitar las reuniones sociales no sorprendió a la Casa Rosada. Más allá de que había avisado que tomaría medidas restrictivas, el gobernador bonaerense se adelantó a la primera herramienta a la que apelaría la administración de Alberto Fernández después de Semana Santa si es que la curva de contagios sigue en ascenso.\n\nLa confirmación de que en Argentina ya 

In [6]:
#the news article is in the body key
ds["train"][4]["body"]

'La noche del domingo 28 de marzo efectivos que responden a la fuerza comandada por el ministro Sergio Berni esposaron a una trabajadora de la salud que se aprestaba a socorrer a una persona que llegó con dolor en el pecho a la UPA N° 5 de Longchamps, en el distrito de Almirante Brown, al sur del conurbano bonaerense.\n\nEl motivo esgrimido por los uniformados fue que ella, al cumplir con su tarea, no estaba haciendo lo que ellos le ordenaron, que era extraer sangre a una persona que llevaron al lugar. Con total prepotencia esgrimieron que estaba "resistiendo a la autoridad". Cabe aclarar que en ese lugar no se realizan extracciones, según explicó la misma enfermera, que luego del incidente -durante el cual sus compañeros reclamaban que fuera dejada en libertad- debió ser atendida presa del malestar nervioso que le provocó toda la situación.\n\nEn una entrevista con Canal 13 este martes la profesional explicó que "Fue de terror lo que me pasó, somos pocas enfermeras en las guardias por

In [7]:
feature_to_keep = "body"

#remove all columns except the body of text
updated_ds = ds.map(\
    lambda x: x,  # the data stays the same
    remove_columns=[col for col in ds['train'].column_names if col != feature_to_keep]
)

Loading cached processed dataset at /Users/arianabritez/.cache/huggingface/datasets/piuba-bigdata___parquet/piuba-bigdata--articles_and_comments-94c404d9208411a3/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec/cache-d182953cb8ddcf51.arrow


In [8]:
updated_ds

DatasetDict({
    train: Dataset({
        features: ['body'],
        num_rows: 537201
    })
})

In [9]:
updated_ds["train"]

Dataset({
    features: ['body'],
    num_rows: 537201
})

In [10]:
df = pd.DataFrame(updated_ds["train"])

In [11]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 537201 entries, 0 to 537200
Data columns (total 1 columns):
 #   Column  Non-Null Count   Dtype 
---  ------  --------------   ----- 
 0   body    466534 non-null  object
dtypes: object(1)
memory usage: 4.1+ MB


In [12]:
df.isnull().sum()

body    70667
dtype: int64

In [13]:
# Dropping the empty rows
df = df.dropna().reset_index(drop=True)

In [20]:
#Reducing the size of the dataset for BabyLM
df = df[:12000]

In [21]:
def preprocess(text, punctuation_marks):
    """Remove the punctuation specified by the user, and return the same string without the unwanted characters.
    
    Parameters:
    - 'text': string to process and remove characters from.
    - 'punctuation marks': characters to be removed, given in list format. It can be any number of characters."""
    
    for character in punctuation_marks: #iterating over the characters given in the parameter list 'punctuation_marks'
        text = text.replace(character, ' ') #the characters given will be removed from the string by replacing them with white space because it's a new sentence.
    preprocessesed_text = text #assigning the variable text to a new variable so that it returns the text with all the instances of the punctuation marks removed.
    return preprocessesed_text

In [22]:
punctuation_marks = ['\n\n']

df_sent = df

# Removing the double new line that stayed even after sentence splitting
df_sent['cleaned_text'] = df['body'].apply(lambda x: preprocess(x, punctuation_marks))


In [23]:
# Separating sentences
df_sent = df
df_sent['sentences'] = [nltk.sent_tokenize(text) for text in df_sent['cleaned_text']]

In [24]:
# Making a single list of all the sentences in each article
sentences = sum(df_sent['sentences'].tolist(), [])

In [25]:
# Tokenizing each sentence in sentences
tokenized_sentences = [nltk.word_tokenize(sent, language='spanish') for sent in sentences] 

# Calculate the total number of tokens
total_tokens = sum(len(tokens) for tokens in tokenized_sentences)

print(f"Total number of tokens: {total_tokens}")

Total number of tokens: 8220990


In [26]:
sequences = sentences

In [None]:
##Celonie's code starts here

#train_filepath = 'data/ranked_sequences.csv'


#ranked = pd.read_csv(train_filepath)
#sequences = ranked['sequence'].tolist()

#with open('combined.dev') as infile:
#  devset = infile.read().split("\n")[:10000]


In [27]:
len(sequences)

297794

In [28]:
#split data into train and dev

df_train, df_dev = train_test_split(sequences, test_size=0.15, random_state=None)

In [29]:
len(df_train)

253124

In [None]:
## got until here 20/Sep

In [None]:
tokenizer_folder = 'tokenizer_folder'

if not os.path.exists(tokenizer_folder):
    os.mkdir(tokenizer_folder)

In [None]:

# Initialize tokenizer
tokenizer = ByteLevelBPETokenizer()

tokenizer.train_from_iterator(df_train, vocab_size=52_000, min_frequency=2, show_progress=True, special_tokens=[
    "<s>",
    "<pad>",
    "</s>",
    "<unk>",
    "<mask>",
])

# Save tokenizer
tokenizer.save_model(tokenizer_folder)

In [None]:

# Configuration for RoBERTa model
config = RobertaConfig(
    vocab_size=52_000,
    max_position_embeddings=514,
    num_attention_heads=12,
    num_hidden_layers=6,
    type_vocab_size=1,
)
# Initialize model
model = RobertaForMaskedLM(config=config)
print('Num parameters: ',model.num_parameters())

In [None]:
# tokenizer_folder = 'tokenizer_folder'

# Load tokenizer
max_length = 512
tokenizer = RobertaTokenizerFast.from_pretrained(tokenizer_folder, max_len=max_length)

In [None]:
# create CustomDataset class

class CustomDataset(Dataset):
    def __init__(self, data, tokenizer):
        self.examples = []
        self.mask = []

        for example in data:
            x=tokenizer.encode_plus(example, max_length = max_length, truncation=True, padding=True)
            self.examples += [x.input_ids]
            self.mask += [x.attention_mask]

    def __len__(self):
        return len(self.examples)

    def __getitem__(self, i):
        return torch.tensor(self.examples[i])

# create train and evaluation datasets
train_dataset = CustomDataset(df_train, tokenizer)
eval_dataset = CustomDataset(df_dev, tokenizer)

In [None]:

# Define data collator
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer, mlm=True, mlm_probability=0.15
)

In [None]:
#adapted from https://github.com/MiuLab/FastMTL/blob/only3/custom_trainer.py

# adapt get_train_dataloader function to supply DataLoader using SequentialSampler and shuffle=False to enforce curriculum learning

from transformers.trainer import *
def get_train_dataloader(self) -> DataLoader:
    """
    Returns the training :class:`~torch.utils.data.DataLoader`.

    Will use no sampler if :obj:`self.train_dataset` does not implement :obj:`__len__`, a random sampler (adapted
    to distributed training if necessary) otherwise.

    Subclass and override this method if you want to inject some custom behavior.
    """
    if self.train_dataset is None:
        raise ValueError("Trainer: training requires a train_dataset.")

    return DataLoader(
        self.train_dataset,
        batch_size=self.args.train_batch_size,
        sampler=SequentialSampler(self.train_dataset),
        collate_fn=self.data_collator,
        drop_last=self.args.dataloader_drop_last,
        num_workers=self.args.dataloader_num_workers,
        shuffle=False
    )
Trainer.get_train_dataloader = get_train_dataloader

In [None]:
batch_size = 16

# Define training arguments
training_args = TrainingArguments(
    output_dir='model_folder',
    overwrite_output_dir=True,
    evaluation_strategy = 'epoch',
    num_train_epochs=5,
    learning_rate=1e-4,
    weight_decay=0.01,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    save_steps=8192,
    save_total_limit=1,
    #seed=10098,
    max_steps=int(1269227 / batch_size)
)
# Create trainer
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
)
# Train the model
trainer.train()