In [81]:
import pandas as pd
from datasets import load_dataset, Dataset
import os
import re
import torch
from sklearn.model_selection import train_test_split
import requests
import random


### PRE-PROCESSING

In [82]:
train_df = pd.read_csv("C:/Users/bergo/OneDrive - University of Pisa/Tesi Magistrale/data_set/train_set.csv")
val_df = pd.read_csv("C:/Users/bergo/OneDrive - University of Pisa/Tesi Magistrale/data_set/val_set.csv")

In [83]:
print(f'Training set di dimensioni: {len(train_df)} - Variabili: {train_df.columns.to_list()}')
print(f'Validation set di dimensioni: {len(val_df)} - Variabili: {val_df.columns.to_list()}')

Training set di dimensioni: 988721 - Variabili: ['id', 'text', 'readability', 'gulpease']
Validation set di dimensioni: 49762 - Variabili: ['id', 'text', 'readability', 'gulpease']


In [103]:
sorted_df = train_df.sort_values(by='gulpease', ascending=False)

In [104]:
#Trasformiamo in formato Dataset i dati precedentemente importati
tr_set = Dataset.from_pandas(sorted_df)
val_set = Dataset.from_pandas(val_df)

In [105]:
tr_set = tr_set.shuffle(seed=5)

In [106]:
tr_set[0:5]

{'id': [154795, 502844, 400596, 807731, 529714],
 'text': ["Dopo il viaggio in Colorado Nick torna al lavoro e sembra essere tornato quello di prima, anche se, all'inizio, i colleghi sembrano dubitare che lui riesca a portare a termine la giornata lavorativa.",
  "Semachide () era un demo dell'Attica situato secondo Filocoro nell'Epacria, zona montuosa a nord dell'Attica:",
  'I dodici tagli, che andavano dal mezzo centesimo ai cinque dollari, erano stati litografati in Giappone.',
  'Le filatrici (La favola di Aracne) è un dipinto a olio su tela (167x252 cm) realizzato nel 1657 circa dal pittore Diego Velázquez.',
  'Il gruppo tuttavia ebbe breve durata, chiudendo la propria attività già prima che la linea di giocattoli fosse dismessa dalla produzione.'],
 'readability': [85.5467266926749,
  59.0515240836381,
  42.711362248312,
  37.9144866141346,
  54.8599824441437],
 'gulpease': [44, 51, 50, 53, 42],
 '__index_level_0__': [154794, 502843, 400595, 807730, 529713]}

### TOKENIZATION

In questa sezione si importa il tokenizzatore col quale si tokenizza ciascuna frase nel formato necessario per Bert, alla fine si otterrà un dataset nel formato corretto con tutte le features necessarie per il training

In [71]:
import tokenizers
import transformers
from transformers import BertTokenizer


In [7]:
#si importa il tokenizzatore già configurato (in questo caso: bert-base-italian-cased)
tokenizer = BertTokenizer.from_pretrained("dbmdz/bert-base-italian-cased")


In [8]:
#facciamo l'encoding di tutto il dataset tokenizzando frase per frase
def encode(sample):
    return tokenizer(sample["text"], padding=True, truncation=True, max_length=128, return_special_tokens_mask=True) 


train_set = tr_set.map(encode, batched=True)
test_set = val_set.map(encode, batched=True)
train_set.set_format('torch', columns=["input_ids", "attention_mask", "token_type_ids"])
test_set.set_format('torch', columns=["input_ids", "attention_mask", "token_type_ids"])


Map:   0%|          | 0/10000 [00:00<?, ? examples/s]

Map:   0%|          | 0/500 [00:00<?, ? examples/s]

In [72]:
train_set

Dataset({
    features: ['id', 'text', 'readability', 'gulpease', 'input_ids', 'token_type_ids', 'special_tokens_mask', 'attention_mask'],
    num_rows: 10000
})

In [10]:
test_set

Dataset({
    features: ['id', 'text', 'readability', 'gulpease', 'input_ids', 'token_type_ids', 'special_tokens_mask', 'attention_mask'],
    num_rows: 500
})

### TRAINING DI BERT

Si procede al training di Bert. Il modello dovrà partire da uno stato iniziale con pesi random, per questo non si importa il modello già addestrato, ma si configura semplicemente l'architettura la sua architettura per poi addestrarlo da zero. Si definisce poi una strategia di training e i suoi argomenti per poi addestrare il modello sul trask di Language Modeling. 

In [11]:
from transformers import Trainer, TrainingArguments, TrainerCallback, BertConfig, BertForMaskedLM, DataCollatorForLanguageModeling, set_seed


In [67]:
model_name = "prajjwal1/bert-medium"
model_config = BertConfig.from_pretrained(model_name)
model = BertForMaskedLM(model_config)

print(model_config)



BertConfig {
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 512,
  "initializer_range": 0.02,
  "intermediate_size": 2048,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 8,
  "num_hidden_layers": 8,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "transformers_version": "4.41.2",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 30522
}



In [68]:
model_path = "C:/Users/bergo/OneDrive - University of Pisa/Tesi Magistrale"
model.load_state_dict(torch.load(f'{model_path}/initial_model.bin'))
model.resize_token_embeddings(len(tokenizer))

Embedding(31102, 512)

In [69]:
model.config

BertConfig {
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 512,
  "initializer_range": 0.02,
  "intermediate_size": 2048,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 8,
  "num_hidden_layers": 8,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "transformers_version": "4.41.2",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 31102
}

In [15]:
#usiamo il datacollator per fare le batch per il training
datacollator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=True, mlm_probability=0.2, return_tensors="pt")

In [16]:
datacollator

DataCollatorForLanguageModeling(tokenizer=BertTokenizer(name_or_path='dbmdz/bert-base-italian-cased', vocab_size=31102, model_max_length=512, is_fast=False, padding_side='right', truncation_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'}, clean_up_tokenization_spaces=True),  added_tokens_decoder={
	0: AddedToken("[PAD]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	101: AddedToken("[UNK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	102: AddedToken("[CLS]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	103: AddedToken("[SEP]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	104: AddedToken("[MASK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
}, mlm=True, mlm_probability=0.2, pad_to_multiple_of=None, tf_experimental_com

In [60]:
#definisco alcune funzione di callback
from transformers import TrainerCallback
from transformers.trainer_callback import TrainerControl, TrainerState
from transformers.training_args import TrainingArguments
import json


#callback per incrementare il numero di step tra ogni salvataggio e salvataggio customizzato
class IncrementSaveSteps(TrainerCallback):
    def __init__(self, increase_steps, save_steps, checkpoint_path):
        self.increase_steps = increase_steps
        self.save_steps = save_steps
        self.checkpoint_path = checkpoint_path
  
    def on_step_end(self, args, state, control, logs=None, **kwargs):
        current_step = state.global_step
        if current_step % self.save_steps == 0:
            self.save_checkpoint(state, **kwargs)
        if current_step in self.increase_steps:
            self.save_steps = current_step
            state.logging_steps = self.save_steps
            print(f'Changed checkpoint and logging steps to {self.save_steps}')

    def save_checkpoint(self, state, **kwargs):
        current_step = state.global_step
        path_name = f'{self.checkpoint_path}/checkpoint-step{current_step}'
        if not os.path.exists(path_name):
            os.makedirs(path_name)
        kwargs['model'].save_pretrained(path_name)
        state.save_to_json(os.path.join(path_name, "trainer_state.json"))
        kwargs['model'].config.save_pretrained(path_name)
        if hasattr(kwargs['model'], 'generation_config'):
            kwargs['model'].generation_config.save_pretrained(path_name)
        print(f'Checkpoint has been saved for step number {current_step}. Current step_size = {self.save_steps}')


#callback per salvare le metriche a ogni evaluation
class HistoryLogger(TrainerCallback):
    def __init__(self, dir_path):
        self.dir_path = os.path.join(dir_path, "history_log.json")

    def on_evaluate(self, args, state, control, metrics, **kwargs):
        with open(self.dir_path, 'a') as f:
                f.write(json.dumps(metrics) + '\n')
                print(f"History log file has been updated with step's {state.global_step} metrics.")

            

In [61]:
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
import numpy as np


#da modificare per masked language modeling
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    mask = labels != -100
    labels = labels[mask]
    predictions = predictions[mask]
    accuracy = accuracy_score(labels, predictions)
    cross_entropy = torch.nn.CrossEntropyLoss()(torch.tensor(logits).permute(0, 2, 1), torch.tensor(labels))
    perplexity = torch.exp(cross_entropy)
    return {
        'accuracy': accuracy,
        'perplexity': perplexity.item()
    }


In [62]:
results_path = "C:/Users/bergo/OneDrive - University of Pisa/Tesi Magistrale/training/prova"
output_dir = os.path.join(results_path, "my_pretrained_model")

In [63]:
#argomenti provvisori, da definire meglio
training_args = TrainingArguments(
    output_dir = output_dir,
    eval_strategy="steps", 
    overwrite_output_dir=True,
    per_device_train_batch_size=16, #64, 
    per_device_eval_batch_size=16, #64, 
    num_train_epochs=1,
    logging_steps=2,
    save_strategy="no",
    eval_steps=2000,
    load_best_model_at_end=False,
    save_steps=2,
    seed=42, 
    )

In [74]:
set_seed(training_args.seed)

In [65]:
checkpoint_path = os.path.join(results_path, "checkpoints")
steps_increments = [4, 16, 256, 2048]

trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=datacollator,
    train_dataset=train_set,
    eval_dataset=test_set,
    compute_metrics=compute_metrics,
    callbacks=[IncrementSaveSteps(steps_increments, 2, checkpoint_path), HistoryLogger(dir_path=results_path)], 
)


In [66]:
trainer.train()

  0%|          | 0/625 [00:00<?, ?it/s]

Checkpoint has been saved for step number 2. Current step_size = 2
{'loss': 8.7247, 'grad_norm': 4.089417457580566, 'learning_rate': 4.9840000000000004e-05, 'epoch': 0.0}
Checkpoint has been saved for step number 4. Current step_size = 2
Changed checkpoint and logging save steps to 4
{'loss': 9.1178, 'grad_norm': 4.427165508270264, 'learning_rate': 4.9680000000000005e-05, 'epoch': 0.01}
Checkpoint has been saved for step number 8. Current step_size = 4
{'loss': 9.2513, 'grad_norm': 4.514981746673584, 'learning_rate': 4.936e-05, 'epoch': 0.01}
Checkpoint has been saved for step number 12. Current step_size = 4
{'loss': 9.4405, 'grad_norm': 4.555415630340576, 'learning_rate': 4.9040000000000005e-05, 'epoch': 0.02}


KeyboardInterrupt: 