This notebook was executed in Google Colab using A100-GPU

### Start of execution

In [1]:
import time

In [2]:
start = time.time()

# 1. Setting the environment

In [3]:
!pip install -q datasets==2.20.0

# 2. Import libraries

In [4]:
import warnings
warnings.filterwarnings("ignore")

In [5]:
import pandas as pd
from datasets import Dataset
from datasets import load_dataset
from transformers import AutoTokenizer
from transformers import DataCollatorWithPadding
from sklearn.model_selection import train_test_split
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer

# 3. Preparation

In [6]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [7]:
path_general = 'drive/MyDrive/Profesional_Academico/Github_Personal/ML_AI_Contents/09.Deep_Learning/52.BETO_HF_Trainer'

# 3. Load Dataset

In [8]:
df = pd.read_csv(f'{path_general}/data/sentiment_analysis_dataset.csv')
df = df[df['sentiment'].isin(['sad', 'joyful'])]
df = df[['text', 'sentiment']]
df = df.rename(columns = {'sentiment': 'label'})
df = df.replace({'label': {'joyful': 1, 'sad': 0}})

In [9]:
df_train, df_test = train_test_split(df, test_size = 0.30, random_state = 42)

In [10]:
df_train.to_csv(f'{path_general}/data/df_train.csv')

In [11]:
df_test.to_csv(f'{path_general}/data/df_test.csv')

In [12]:
train_data = Dataset.from_pandas(df_train)

In [13]:
test_data = Dataset.from_pandas(df_test)

# 4. Load model

In [14]:
model = AutoModelForSequenceClassification.from_pretrained("dccuchile/bert-base-spanish-wwm-cased", num_labels = 2)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at dccuchile/bert-base-spanish-wwm-cased and are newly initialized: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [15]:
tokenizer = AutoTokenizer.from_pretrained("dccuchile/bert-base-spanish-wwm-cased")

# 5. Creation of datasets

In [16]:
def preprocess_function(examples):
    return tokenizer(examples["text"], truncation = True)

In [17]:
tokenized_train = train_data.map(preprocess_function, batched = True)

Map:   0%|          | 0/497 [00:00<?, ? examples/s]

In [18]:
tokenized_test = test_data.map(preprocess_function, batched = True)

Map:   0%|          | 0/213 [00:00<?, ? examples/s]

In [19]:
data_collator = DataCollatorWithPadding(tokenizer = tokenizer)

# 6. Training

In [20]:
num_epochs = 3

In [21]:
training_args = TrainingArguments(
    output_dir = "./results",
    learning_rate = 2e-5,
    per_device_train_batch_size = 16,
    per_device_eval_batch_size = 16,
    num_train_epochs = num_epochs,
    weight_decay = 0.01,
    evaluation_strategy = "epoch",
    save_steps = 0,
    logging_steps = 25,
)

In [22]:
trainer = Trainer(
    model = model,
    args = training_args,
    train_dataset = tokenized_train,
    eval_dataset = tokenized_test,
    tokenizer = tokenizer,
    data_collator = data_collator
    )

In [23]:
trainer.train()

Epoch,Training Loss,Validation Loss
1,0.6633,0.752832
2,0.5659,0.666398
3,0.4422,0.628821


TrainOutput(global_step=96, training_loss=0.5169469068447748, metrics={'train_runtime': 16.6994, 'train_samples_per_second': 89.285, 'train_steps_per_second': 5.749, 'total_flos': 55932682592760.0, 'train_loss': 0.5169469068447748, 'epoch': 3.0})

# 7. Save the model

In [24]:
trainer.save_model(f'{path_general}/model')

### End of execution

In [25]:
end = time.time()

delta = (end - start)

hours = int(delta/3_600)
mins = int((delta - hours*3_600)/60)
secs = int(delta - hours*3_600 - mins*60)

print(f'Hours: {hours}, Minutes: {mins}, Seconds: {secs}')

Hours: 0, Minutes: 0, Seconds: 36
