In [95]:
import pandas as pd
from datasets import load_dataset, Dataset
import os
import re
import torch
from sklearn.model_selection import train_test_split

In [96]:
#setto il device da usare
torch.device("cuda")

device(type='cuda')

### PRE-PROCESSING

Per ottenere un dataset che possa essere utilizzato come training del nostro language model partiamo dai file conllu che contengono i testi annotati di wikipedia italiana. Da questi file vogliamo ottenere una struttura dati che per ogni frase riporti id, testo e indice di Gulpease (per ora).

In [97]:
#si ottengono i path di ogni file per il pretraining e si salvano in una lista

ds_directory = "data"
ds_files = []
for file_name in os.listdir(ds_directory):
    file_path = os.path.join(ds_directory, file_name)
    ds_files.append(file_path)
    
print(ds_files)

['data\\prova.conllu', 'data\\prova2.conllu', 'data\\text_all.txt']


In [98]:

def get_id(file_path, n_file):
    id_list = []
    current_id = ""
    for line in open(file_path, 'r',  encoding='utf-8'):
        if line.startswith("# sent_id"):
            current_id = re.sub(r'\D', '', line)
            id_list.append(current_id + "_" + str(n_file))
    return id_list


In [99]:


#PROBLEMA: l'id non è univoco se abbiamo un insieme di file che sono stati "parsati" singolarmente. 
#Possibile soluzione: aggiungere un numero identificativo del file. ?! 

In [100]:
#si ottiene il testo delle frasi dal conllu
def get_text(file_path):
    text_list = []
    current_sent = ""
    for line in open(file_path, 'r',  encoding='utf-8'): 
        if line.startswith("# text"):
            current_sent = line[9:].rstrip('\n')
            text_list.append(current_sent)
    return text_list


In [101]:
#funzione che calcola gulpease
def comp_gulpease(ns, nw, nl):
    g_value = 89 + ((300*ns - 10*nl)/nw) #è corretta questa formula?
    return g_value

#funzione che estrae gulpease
def get_gulpease(file_path):
    gulp_list = []
    for line in open(file_path, 'r', encoding = "utf-8"):  #questa riga la ripeto tre volte meglio ottimizzare in un'unica f
        if line.startswith("# text"):
            current_sent = line[9:].rstrip('\n')
            words = current_sent.split()
            gulp = comp_gulpease(1, len(words), sum(len(word) for word in words))
            gulp_list.append(gulp)
    return gulp_list

#DOMANDA: l'indice di gulpease serve a determinare la difficoltà di un intero testo. Come fare se abbiamo una singola frase?


In [102]:
#si estraggono id e testo tramite le funzioni sopra definite

id_list = []
gulpease_list = []
text_list = []
n_file = 1
for item in ds_files:
    id_list = id_list + get_id(item, n_file)
    n_file += 1
    text_list = text_list + get_text(item)
    gulpease_list = gulpease_list + get_gulpease(item)

In [103]:
#si crea un dataframe con una riga per frase, attributi: id, testo e indice di gulpease

ds_df = pd.DataFrame(columns=["id", "text", "gulp_index"])


In [104]:
ds_df["id"] = id_list
ds_df["text"] = text_list
ds_df["gulp_index"]  = gulpease_list

In [105]:
ds_df.head()

Unnamed: 0,id,text,gulp_index
0,1_1,L'allunaggio è la discesa di un veicolo sulla ...,74.555556
1,2_1,"Si distingue tra allunaggio duro, cioè un impa...",42.703704
2,3_1,"Il programma Luna, partito nel 1959 con la son...",56.142857
3,4_1,"Luna 9, il 3 febbraio 1966, eseguì il primo at...",63.615385
4,5_1,"Il primo allunaggio di un essere umano, il 20 ...",45.0


In [106]:
 # TO  DO 
#ottimizzare il codice: il for line in open... farlo in una funzione unica

In [124]:
#Divido in training e test set
dataset = Dataset.from_pandas(ds_df)
split_set = dataset.train_test_split(test_size=0.1)

train_ds = split_set["train"]
test_ds = split_set["test"]



### TOKENIZATION

In questa sezione si importa il tokenizzatore col quale si tokenizza ciascuna frase nel formato necessario per Bert.

In [108]:
import tokenizers
import transformers
from transformers import BertTokenizer

In [109]:
tokenizer = BertTokenizer.from_pretrained("dbmdz/bert-base-italian-cased")


loading file https://huggingface.co/dbmdz/bert-base-italian-cased/resolve/main/vocab.txt from cache at C:\Users\bergo/.cache\huggingface\transformers\e386d7030c11abe3c82da83b0aa728f3c09ab3a6728e325fe78bb5a0c67d7c71.83ca512ab51c5bc2809e83002a054b84ab85a200b98d5c0eb036d7611ee4362e
loading file https://huggingface.co/dbmdz/bert-base-italian-cased/resolve/main/added_tokens.json from cache at None
loading file https://huggingface.co/dbmdz/bert-base-italian-cased/resolve/main/special_tokens_map.json from cache at None
loading file https://huggingface.co/dbmdz/bert-base-italian-cased/resolve/main/tokenizer_config.json from cache at C:\Users\bergo/.cache\huggingface\transformers\534fa05777338ca7e2b068a37beb83688543de270a20252296be3eadd10caca1.6391beef2ceed2cdba47401eb12680200856c97d2f2b56143e515d7c0f36a66a
loading configuration file https://huggingface.co/dbmdz/bert-base-italian-cased/resolve/main/config.json from cache at C:\Users\bergo/.cache\huggingface\transformers\4641bcb7c4ac61788587ad50

In [110]:
#facciamo l'encoding di tutto il dataset tokenizzando frase per frase
def encode(sample):
    return tokenizer(sample["text"], padding=True, truncation=True, max_length=512, return_special_tokens_mask=True)

train_set = train_ds.map(encode, batched=True)
test_set = test_ds.map(encode, batched=True)
train_set.set_format('torch', columns=["input_ids", "attention_mask", "token_type_ids"])
test_set.set_format('torch', columns=["input_ids", "attention_mask", "token_type_ids"])


Map:   0%|          | 0/194 [00:00<?, ? examples/s]

Map:   0%|          | 0/22 [00:00<?, ? examples/s]

In [126]:
train_set

Dataset({
    features: ['id', 'text', 'gulp_index', 'input_ids', 'token_type_ids', 'special_tokens_mask', 'attention_mask'],
    num_rows: 194
})

### TRAINING DI BERT

Si procede al training di Bert. Il modello dovrà partire da uno stato iniziale con pesi random, per questo non si importa il modello già addestrato, ma si configura semplicemente l'architettura la sua architettura per poi addestrarlo da zero. Si definisce poi una strategia di training e i suoi argomenti per poi addestrare il modello sul trask di Language Modeling. 

In [111]:
from transformers import Trainer, TrainingArguments, BertConfig, BertForMaskedLM, DataCollatorForLanguageModeling

In [112]:
model_name = "prajjwal1/bert-mini"
model_config = BertConfig.from_pretrained(model_name)

print(model_config)

loading configuration file https://huggingface.co/prajjwal1/bert-mini/resolve/main/config.json from cache at C:\Users\bergo/.cache\huggingface\transformers\a32529b12a03c02e99c269bf68c0c7b8349093f626e860ab9b012e3d9539c539.e6c2a1d71adb3143ecd42222c4604e92ff255a7663c04bb5c4fad770c78e096c
Model config BertConfig {
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 256,
  "initializer_range": 0.02,
  "intermediate_size": 1024,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 4,
  "num_hidden_layers": 4,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "transformers_version": "4.20.1",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 30522
}



BertConfig {
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 256,
  "initializer_range": 0.02,
  "intermediate_size": 1024,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 4,
  "num_hidden_layers": 4,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "transformers_version": "4.20.1",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 30522
}



In [113]:
model = BertForMaskedLM(model_config)
model.resize_token_embeddings(len(tokenizer))


Embedding(31102, 256)

In [114]:
model.config

BertConfig {
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 256,
  "initializer_range": 0.02,
  "intermediate_size": 1024,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 4,
  "num_hidden_layers": 4,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "transformers_version": "4.20.1",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 31102
}

In [115]:
#usiamo il datacollator per fare le batch per il training
datacollator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=True, mlm_probability=0.2, return_tensors="pt")

In [116]:
datacollator

DataCollatorForLanguageModeling(tokenizer=PreTrainedTokenizer(name_or_path='dbmdz/bert-base-italian-cased', vocab_size=31102, model_max_len=512, is_fast=False, padding_side='right', truncation_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'}), mlm=True, mlm_probability=0.2, pad_to_multiple_of=None, tf_experimental_compile=False, return_tensors='pt')

In [117]:
len(train_set)

194

In [118]:
#argomenti provvisori, da definire meglio
training_args = TrainingArguments(
    output_dir = "my_pretrained_model",
    evaluation_strategy="steps",
    overwrite_output_dir=True,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=64,
    num_train_epochs=2,
    logging_steps=10,
    save_steps=10,
    save_total_limit=2,
    load_best_model_at_end=True,
    )

using `logging_steps` to initialize `eval_steps` to 10
PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).


In [119]:
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=datacollator,
    train_dataset=train_set,
    eval_dataset=test_set,)

In [120]:
trainer.train()

The following columns in the training set don't have a corresponding argument in `BertForMaskedLM.forward` and have been ignored: id, special_tokens_mask, gulp_index, text. If id, special_tokens_mask, gulp_index, text are not expected by `BertForMaskedLM.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 194
  Num Epochs = 2
  Instantaneous batch size per device = 32
  Total train batch size (w. parallel, distributed & accumulation) = 32
  Gradient Accumulation steps = 1
  Total optimization steps = 14


  0%|          | 0/14 [00:00<?, ?it/s]

The following columns in the evaluation set don't have a corresponding argument in `BertForMaskedLM.forward` and have been ignored: id, special_tokens_mask, gulp_index, text. If id, special_tokens_mask, gulp_index, text are not expected by `BertForMaskedLM.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 22
  Batch size = 64


{'loss': 10.3254, 'learning_rate': 1.4285714285714285e-05, 'epoch': 1.43}


  0%|          | 0/1 [00:00<?, ?it/s]

Saving model checkpoint to my_pretrained_model\checkpoint-10
Configuration saved in my_pretrained_model\checkpoint-10\config.json
Model weights saved in my_pretrained_model\checkpoint-10\pytorch_model.bin


{'eval_loss': 10.216655731201172, 'eval_runtime': 0.6344, 'eval_samples_per_second': 34.678, 'eval_steps_per_second': 1.576, 'epoch': 1.43}


Deleting older checkpoint [my_pretrained_model\checkpoint-40] due to args.save_total_limit


Training completed. Do not forget to share your model on huggingface.co/models =)


Loading best model from my_pretrained_model\checkpoint-10 (score: 10.216655731201172).


{'train_runtime': 41.6787, 'train_samples_per_second': 9.309, 'train_steps_per_second': 0.336, 'train_loss': 10.289214542933873, 'epoch': 2.0}


TrainOutput(global_step=14, training_loss=10.289214542933873, metrics={'train_runtime': 41.6787, 'train_samples_per_second': 9.309, 'train_steps_per_second': 0.336, 'train_loss': 10.289214542933873, 'epoch': 2.0})