In [2]:
import pandas as pd
from datasets import load_dataset, Dataset
import os
import re
import torch
from sklearn.model_selection import train_test_split
import requests
import random


### PRE-PROCESSING

In [3]:
# Folder holding train_set.csv / val_set.csv. The original machine-specific
# location stays the default, but it can be overridden through the
# THESIS_DATA_DIR environment variable so the notebook also runs elsewhere.
data_dir = os.environ.get(
    "THESIS_DATA_DIR",
    "C:/Users/bergo/OneDrive - University of Pisa/Tesi Magistrale/data_set",
)

# Load the training and validation splits as pandas DataFrames.
train_df = pd.read_csv(os.path.join(data_dir, "train_set.csv"))
val_df = pd.read_csv(os.path.join(data_dir, "val_set.csv"))

In [4]:
# Report how many rows each split has and which columns it carries.
train_rows, train_columns = len(train_df), train_df.columns.to_list()
print(f'Training set di dimensioni: {train_rows} - Variabili: {train_columns}')

val_rows, val_columns = len(val_df), val_df.columns.to_list()
print(f'Validation set di dimensioni: {val_rows} - Variabili: {val_columns}')

Training set di dimensioni: 988721 - Variabili: ['id', 'text', 'readability', 'gulpease']
Validation set di dimensioni: 49762 - Variabili: ['id', 'text', 'readability', 'gulpease']


In [5]:
# Work on a small prefix of each split (first 10000 training rows, first 500
# validation rows) and wrap the resulting frames as Hugging Face Datasets.
train_head_df = train_df.iloc[:10000]
val_head_df = val_df.iloc[:500]

tr_set = Dataset.from_pandas(train_head_df)
val_set = Dataset.from_pandas(val_head_df)

In [6]:
# Display the training Dataset summary (feature names and row count).
tr_set

Dataset({
    features: ['id', 'text', 'readability', 'gulpease'],
    num_rows: 10000
})

### TOKENIZATION

In questa sezione si importa il tokenizzatore con il quale si tokenizza ciascuna frase nel formato richiesto da BERT; alla fine si ottiene un dataset nel formato corretto, con tutte le feature necessarie per il training.

In [7]:
import tokenizers
import transformers
from transformers import BertTokenizer

In [8]:
# Load the already-configured tokenizer (here: bert-base-italian-cased).
tokenizer_checkpoint = "dbmdz/bert-base-italian-cased"
tokenizer = BertTokenizer.from_pretrained(tokenizer_checkpoint)


In [9]:
# Encode the whole dataset by tokenising its sentences batch by batch.
def encode(sample):
    """Tokenise the ``"text"`` field of a batch with the notebook-level tokenizer.

    Args:
        sample: a batch as passed by ``Dataset.map(..., batched=True)``; only
            its ``"text"`` entry is read.

    Returns:
        The tokenizer output for the batch: sequences are truncated to at most
        128 tokens, padded, and a special-tokens mask is requested as well.
    """
    tokenizer_options = {
        "padding": True,
        "truncation": True,
        "max_length": 128,
        "return_special_tokens_mask": True,
    }
    return tokenizer(sample["text"], **tokenizer_options)

# Columns handed to the model as PyTorch tensors.
model_input_columns = ["input_ids", "attention_mask", "token_type_ids"]

# Tokenise the training subset and expose the model inputs in torch format.
train_set = tr_set.map(encode, batched=True)
train_set.set_format('torch', columns=list(model_input_columns))

# Same treatment for the validation subset (named `test_set` from here on).
test_set = val_set.map(encode, batched=True)
test_set.set_format('torch', columns=list(model_input_columns))


Map:   0%|          | 0/10000 [00:00<?, ? examples/s]

Map:   0%|          | 0/500 [00:00<?, ? examples/s]

In [10]:
# Display the tokenised training Dataset (original columns plus tokenizer outputs).
train_set

Dataset({
    features: ['id', 'text', 'readability', 'gulpease', 'input_ids', 'token_type_ids', 'special_tokens_mask', 'attention_mask'],
    num_rows: 10000
})

In [11]:
# Display the tokenised validation Dataset (original columns plus tokenizer outputs).
test_set

Dataset({
    features: ['id', 'text', 'readability', 'gulpease', 'input_ids', 'token_type_ids', 'special_tokens_mask', 'attention_mask'],
    num_rows: 500
})

### TRAINING DI BERT

Si procede al training di BERT. Il modello deve partire da uno stato iniziale con pesi casuali: per questo non si importa il modello già addestrato, ma se ne configura soltanto l'architettura, per poi addestrarlo da zero. Si definiscono quindi la strategia di training e i suoi argomenti, e si addestra il modello sul task di Masked Language Modeling.

In [12]:
from transformers import Trainer, TrainingArguments, TrainerCallback, BertConfig, BertForMaskedLM, DataCollatorForLanguageModeling, set_seed


In [13]:
# Hub identifier whose *configuration* is reused as the architecture blueprint.
model_name = "prajjwal1/bert-medium"
# Only the BertConfig is fetched here; no model weights are loaded in this cell.
model_config = BertConfig.from_pretrained(model_name)

# Print the configuration. Its vocab_size (30522 in the output below) differs
# from the Italian tokenizer's vocabulary, which is handled when the model is built.
print(model_config)



BertConfig {
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 512,
  "initializer_range": 0.02,
  "intermediate_size": 2048,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 8,
  "num_hidden_layers": 8,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "transformers_version": "4.41.2",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 30522
}



In [14]:
# Seed the RNGs *before* building the model: BertForMaskedLM(config) creates
# randomly initialised weights, and the notebook's other set_seed call sits in
# a later cell, i.e. it runs after this initialisation has already happened.
# 42 is the same value passed as `seed` to TrainingArguments.
set_seed(42)

# Instantiate the architecture from the configuration only (training from scratch,
# no pretrained checkpoint is loaded).
model = BertForMaskedLM(model_config)

# Align the embedding matrix with the tokenizer's vocabulary size; kept as the
# last expression so the resized Embedding is shown as the cell output.
model.resize_token_embeddings(len(tokenizer))


Embedding(31102, 512)

In [15]:
# Display the model's configuration after the embedding resize (check vocab_size).
model.config

BertConfig {
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 512,
  "initializer_range": 0.02,
  "intermediate_size": 2048,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 8,
  "num_hidden_layers": 8,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "transformers_version": "4.41.2",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 31102
}

In [16]:
# The data collator assembles the training batches and applies the masking
# required by the masked-language-modelling objective.
datacollator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=True,
    mlm_probability=0.2,
    return_tensors="pt",
)

In [17]:
# Display the collator's repr to double-check its tokenizer and MLM settings.
datacollator

DataCollatorForLanguageModeling(tokenizer=BertTokenizer(name_or_path='dbmdz/bert-base-italian-cased', vocab_size=31102, model_max_length=512, is_fast=False, padding_side='right', truncation_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'}, clean_up_tokenization_spaces=True),  added_tokens_decoder={
	0: AddedToken("[PAD]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	101: AddedToken("[UNK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	102: AddedToken("[CLS]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	103: AddedToken("[SEP]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	104: AddedToken("[MASK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
}, mlm=True, mlm_probability=0.2, pad_to_multiple_of=None, tf_experimental_com

In [18]:
#definisco alcune funzioni di callback
from transformers import TrainerCallback
from transformers.trainer_callback import TrainerControl, TrainerState
from transformers.training_args import TrainingArguments
import json


# Callback that widens the interval between saves/logs as training progresses.
class IncrementSaveSteps(TrainerCallback):
    """Switch the save/logging interval when training reaches given steps.

    Args:
        increase_steps: collection of global-step values. When the current
            global step is one of them, that step value becomes the new
            ``state.save_steps`` and ``state.logging_steps``.
    """

    def __init__(self, increase_steps):
        self.increase_steps = increase_steps

    def on_step_end(self, args, state, control, logs=None, **kwargs):
        """After each training step, update the intervals if a milestone was hit."""
        step = state.global_step
        if step not in self.increase_steps:
            return

        # The milestone itself becomes the new interval for saving and logging.
        state.save_steps = step
        state.logging_steps = state.save_steps
        print(f'Changed checkpoint and logging save steps to {state.save_steps}')

# Callback that writes a custom checkpoint whenever the Trainer fires a save event.
class SaveCheckpoint(TrainerCallback):
    """Store model, configuration and trainer state in a per-step folder.

    Args:
        checkpoint_path: parent directory; each checkpoint goes into
            ``<checkpoint_path>/checkpoint-step<global_step>``.
    """

    def __init__(self, checkpoint_path):
        self.checkpoint_path = checkpoint_path

    def on_save(self, args, state, control, logs=None, **kwargs):
        """Write the checkpoint for the current global step.

        Expects the model under ``kwargs['model']`` (raises ``KeyError`` if it
        is missing, as before).
        """
        current_step = state.global_step
        path_name = f'{self.checkpoint_path}/checkpoint-step{current_step}'
        # exist_ok avoids the check-then-create race of exists() + makedirs().
        os.makedirs(path_name, exist_ok=True)

        model = kwargs['model']
        model.save_pretrained(path_name)
        state.save_to_json(os.path.join(path_name, "trainer_state.json"))
        model.config.save_pretrained(path_name)

        # The attribute can exist but be None (models that cannot generate);
        # hasattr() alone would then crash on `.save_pretrained`.
        generation_config = getattr(model, 'generation_config', None)
        if generation_config is not None:
            generation_config.save_pretrained(path_name)

        # NOTE(review): re-arming should_save inside on_save is kept from the
        # original; it looks unnecessary and may trigger an extra save — confirm.
        control.should_save = True
        print(f'Checkpoint has been saved for step number {current_step}. Current step_size = {state.save_steps}')

# Callback that stores the metrics of every evaluation.
class HistoryLogger(TrainerCallback):
    """Append each evaluation's metrics as one JSON line to ``history_log.json``.

    Args:
        dir_path: directory in which ``history_log.json`` is written.
    """

    def __init__(self, dir_path):
        # Despite the attribute name, this holds the path of the log *file*.
        self.dir_path = os.path.join(dir_path, "history_log.json")

    def on_evaluate(self, args, state, control, metrics, **kwargs):
        """Serialise ``metrics`` to JSON and append it to the history file."""
        with open(self.dir_path, 'a') as history_file:
            print(json.dumps(metrics), file=history_file)
            print(f"History log file has been updated with step's {state.global_step} metrics.")

            

In [19]:
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

def compute_metrics(eval_pred):
    """Token-level accuracy / precision / recall / F1 for masked-LM evaluation.

    Only the positions that were actually masked are scored: the MLM data
    collator marks every other position with the label ``-100`` (the value the
    loss ignores), so those positions are dropped before calling scikit-learn.
    Without this filter the 2-D label matrix is passed to scikit-learn as is,
    and the ignored positions would count as a class of their own.

    Args:
        eval_pred: pair ``(predictions, labels)`` of NumPy arrays as supplied
            by the Trainer. ``predictions`` may be the raw logits (one more
            axis than ``labels``; the arg-max over the last axis is taken) or
            already-decoded token ids with the same shape as ``labels``.
            If ``predictions`` is a tuple, its first element is used.

    Returns:
        dict with the keys ``'accuracy'``, ``'precision'``, ``'recall'`` and
        ``'f1'`` (precision/recall/F1 are weighted averages over the classes).
    """
    predictions, labels = eval_pred
    if isinstance(predictions, tuple):
        predictions = predictions[0]

    # Raw logits carry an extra vocabulary axis; ids already match the labels' shape.
    preds = predictions.argmax(-1) if predictions.ndim > labels.ndim else predictions

    # Keep only the masked tokens; boolean indexing also flattens to 1-D,
    # which is the layout scikit-learn expects for multiclass targets.
    scored = labels != -100
    y_true = labels[scored]
    y_pred = preds[scored]

    if y_true.size == 0:
        # Nothing was masked in this evaluation set: report zeros instead of failing.
        return {'accuracy': 0.0, 'precision': 0.0, 'recall': 0.0, 'f1': 0.0}

    # zero_division=0 yields the same values as the default but without a
    # warning for every token class that is never predicted.
    precision, recall, f1, _ = precision_recall_fscore_support(
        y_true, y_pred, average='weighted', zero_division=0
    )
    acc = accuracy_score(y_true, y_pred)
    return {
        'accuracy': acc,
        'precision': precision,
        'recall': recall,
        'f1': f1,
    }


In [20]:
# Root folder for everything the training produces. The original
# machine-specific location stays the default, but it can be overridden
# through the THESIS_RESULTS_DIR environment variable.
results_path = os.environ.get(
    "THESIS_RESULTS_DIR",
    "C:/Users/bergo/OneDrive - University of Pisa/Tesi Magistrale/training/prova",
)
# Directory handed to TrainingArguments as `output_dir`.
output_dir = os.path.join(results_path, "my_pretrained_model")

In [22]:
# Provisional arguments (quick smoke-run values), to be tuned for the real training.
training_args = TrainingArguments(
    output_dir=output_dir,
    overwrite_output_dir=True,
    # Only the current keyword is passed: `evaluation_strategy` is the
    # deprecated alias of `eval_strategy`, and passing it (even as None)
    # breaks once the alias is removed from the library.
    eval_strategy="steps",
    eval_steps=4,  # smoke-run value; the original note mentions 2000
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    num_train_epochs=1,
    logging_steps=2,
    # NOTE(review): with save_strategy="no" the Trainer presumably never
    # triggers a save, so `save_steps` and any on_save callback would have no
    # effect — confirm this is intended for the provisional run.
    save_strategy="no",
    save_steps=2,
    load_best_model_at_end=False,
    seed=42,
)

In [23]:
# Fix the global random seed with the same value given to TrainingArguments.
# Anything random that ran in earlier cells is not affected by this call.
set_seed(training_args.seed)

In [24]:
# Folder for the custom checkpoints and the global steps at which the
# save/logging interval is changed.
checkpoint_path = os.path.join(results_path, "checkpoints")
steps_increments = [8, 16, 256, 2048]

# Custom callbacks: interval schedule, checkpoint writer, evaluation history log.
training_callbacks = [
    IncrementSaveSteps(steps_increments),
    SaveCheckpoint(checkpoint_path=checkpoint_path),
    HistoryLogger(dir_path=results_path),
]

# Wire model, data, collator, metrics and callbacks into the Trainer.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_set,
    eval_dataset=test_set,
    data_collator=datacollator,
    compute_metrics=compute_metrics,
    callbacks=training_callbacks,
)


In [25]:
# Launch training. With 10000 samples, batch size 32 and one epoch the
# progress bar below counts 313 optimisation steps.
trainer.train()

  0%|          | 0/313 [00:00<?, ?it/s]

{'loss': 10.4263, 'grad_norm': 3.8298041820526123, 'learning_rate': 4.968051118210863e-05, 'epoch': 0.01}
{'loss': 10.2199, 'grad_norm': 3.412374973297119, 'learning_rate': 4.936102236421725e-05, 'epoch': 0.01}


  0%|          | 0/16 [00:00<?, ?it/s]