In [1]:
import pandas as pd
from datasets import load_dataset, Dataset
import os
import re
import torch
from sklearn.model_selection import train_test_split
import requests
import random


### PRE-PROCESSING

In [2]:
#si crea un dataframe con una riga per frase, attributi: id, testo e indice di gulpease
train_df = pd.read_csv("C:/Users/bergo/OneDrive - University of Pisa/Tesi Magistrale/data_set/train_set.csv")
val_df = pd.read_csv("C:/Users/bergo/OneDrive - University of Pisa/Tesi Magistrale/data_set/val_set.csv")

In [3]:
print(f'Training set di dimensioni: {len(train_df)} - Variabili: {train_df.columns.to_list()}')
print(f'Validation set di dimensioni: {len(val_df)} - Variabili: {val_df.columns.to_list()}')

Training set di dimensioni: 988721 - Variabili: ['id', 'text', 'readability', 'gulpease']
Validation set di dimensioni: 49762 - Variabili: ['id', 'text', 'readability', 'gulpease']


In [4]:
#Divido in training e test set
tr_set = Dataset.from_pandas(train_df[:100000])
val_set = Dataset.from_pandas(val_df[:5000])

In [5]:
tr_set

Dataset({
    features: ['id', 'text', 'readability', 'gulpease'],
    num_rows: 100000
})

### TOKENIZATION

In questa sezione si importa il tokenizzatore col quale si tokenizza ciascuna frase nel formato necessario per Bert, alla fine si otterrà un dataset nel formato corretto con tutte le features necessarie per il training

In [6]:
import tokenizers
import transformers
from transformers import BertTokenizer

In [7]:
#si importa il tokenizzatore già configurato (in questo caso: bert-base-italian-cased)
tokenizer = BertTokenizer.from_pretrained("dbmdz/bert-base-italian-cased")


In [8]:
#facciamo l'encoding di tutto il dataset tokenizzando frase per frase
def encode(sample):
    return tokenizer(sample["text"], padding=True, truncation=True, max_length=512, return_special_tokens_mask=True) 

train_set = tr_set.map(encode, batched=True)
test_set = val_set.map(encode, batched=True)
train_set.set_format('torch', columns=["input_ids", "attention_mask", "token_type_ids"])
test_set.set_format('torch', columns=["input_ids", "attention_mask", "token_type_ids"])


Map:   0%|          | 0/100000 [00:00<?, ? examples/s]

Map:   0%|          | 0/5000 [00:00<?, ? examples/s]

In [9]:
train_set

Dataset({
    features: ['id', 'text', 'readability', 'gulpease', 'input_ids', 'token_type_ids', 'special_tokens_mask', 'attention_mask'],
    num_rows: 100000
})

In [10]:
test_set

Dataset({
    features: ['id', 'text', 'readability', 'gulpease', 'input_ids', 'token_type_ids', 'special_tokens_mask', 'attention_mask'],
    num_rows: 5000
})

### TRAINING DI BERT

Si procede al training di Bert. Il modello dovrà partire da uno stato iniziale con pesi random, per questo non si importa il modello già addestrato, ma si configura semplicemente l'architettura la sua architettura per poi addestrarlo da zero. Si definisce poi una strategia di training e i suoi argomenti per poi addestrare il modello sul trask di Language Modeling. 

In [11]:
from transformers import Trainer, TrainingArguments, TrainerCallback, BertConfig, BertForMaskedLM, DataCollatorForLanguageModeling, set_seed


In [12]:
model_name = "prajjwal1/bert-medium"
model_config = BertConfig.from_pretrained(model_name)

print(model_config)



BertConfig {
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 512,
  "initializer_range": 0.02,
  "intermediate_size": 2048,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 8,
  "num_hidden_layers": 8,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "transformers_version": "4.41.2",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 30522
}



In [13]:
model = BertForMaskedLM(model_config)
model.resize_token_embeddings(len(tokenizer))


Embedding(31102, 512)

In [14]:
model.config

BertConfig {
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 512,
  "initializer_range": 0.02,
  "intermediate_size": 2048,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 8,
  "num_hidden_layers": 8,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "transformers_version": "4.41.2",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 31102
}

In [15]:
#usiamo il datacollator per fare le batch per il training
datacollator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=True, mlm_probability=0.2, return_tensors="pt")

In [16]:
datacollator

DataCollatorForLanguageModeling(tokenizer=BertTokenizer(name_or_path='dbmdz/bert-base-italian-cased', vocab_size=31102, model_max_length=512, is_fast=False, padding_side='right', truncation_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'}, clean_up_tokenization_spaces=True),  added_tokens_decoder={
	0: AddedToken("[PAD]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	101: AddedToken("[UNK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	102: AddedToken("[CLS]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	103: AddedToken("[SEP]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	104: AddedToken("[MASK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
}, mlm=True, mlm_probability=0.2, pad_to_multiple_of=None, tf_experimental_com

In [17]:
print(f"Lunghezza del dataset: {len(train_set)}")

Lunghezza del dataset: 100000


In [28]:
#definisco alcune funzione di callback
from transformers import TrainerCallback
from transformers.trainer_callback import TrainerControl, TrainerState
from transformers.training_args import TrainingArguments



class check_weights(TrainerCallback):
    def on_train_begin(self, model, **kwargs):
        init_weights = model.state_dict()
        print(init_weights)
        for key, value in init_weights.items():
            print(f"\n{key}:\n")
            print(value)

#callback per incrementare il numero di step tra ogni salvataggio
class n_checkpoint_increment(TrainerCallback):
    def on_step_begin(self, args: TrainingArguments, state: TrainerState):
        current_step = state.global_step
        if current_step == 16:
            args.save_steps = 16
            print(f'Save step size changed to: {args.save_steps}')
        elif current_step == 256:
            args.save_steps = 256
            print(f'Save step size changed to: {args.save_steps}')
        elif current_step == 2048:
            args.save_steps = 2048
            print(f'Save step size changed to: {args.save_steps}')

class save_checkpoint(TrainerCallback):
    def __init__(self, checkpoint_path):
        self.checkpoint_path = checkpoint_path

    def on_save(self, state:TrainerState, control, **kwargs):
        current_step = state.global_step
        path_name = f'{self.checkpoint_path}-{current_step}'
        if not os.path.exists(path_name):
            os.makedirs(path_name)
        kwargs['model'].save_pretrained(path_name)
        state.save_to_json(os.path.join(path_name, "trainer_state.json"))
        kwargs['model'].config.save_pretrained(path_name)
        if hasattr(kwargs['model'], 'generation_config'):
            kwargs['model'].generation_config.save_pretrained(path_name)
        control.should_save = True
        print(f'Checkpoint has been saved for step number {current_step}')

    

            

In [29]:
#argomenti provvisori, da definire meglio
training_args = TrainingArguments(
    output_dir = "my_pretrained_model",
    evaluation_strategy="steps",
    overwrite_output_dir=True,
    per_device_train_batch_size=64,
    per_device_eval_batch_size=2048,
    num_train_epochs=3,
    logging_steps=10,
    save_steps=2,
    eval_steps=2,
    load_best_model_at_end=True,
    seed=42, 
    )



In [30]:
set_seed(training_args.seed)

In [31]:
open('train_check.txt', 'w').close()

In [32]:
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=datacollator,
    train_dataset=train_set,
    eval_dataset=test_set,
    callbacks=[check_weights()], 
    )


In [33]:
trainer.train()

TypeError: check_weights.on_train_begin() got multiple values for argument 'model'