In [84]:
import pandas as pd
from datasets import load_dataset, Dataset
import os
import re
import torch
from sklearn.model_selection import train_test_split
import requests
import random


### PRE-PROCESSING

Per ottenere un dataset utilizzabile per il training del nostro language model, partiamo dai file CoNLL-U che contengono i testi annotati di Wikipedia italiana. Da questi file vogliamo ottenere una struttura dati che, per ogni frase, riporti id, testo e un indice di leggibilità (inizialmente l'indice di Gulpease; per ora un valore casuale usato come segnaposto, in attesa dello score READ-IT).

In [85]:
# Collect the path of every CoNLL-U file used for pre-training.
ds_directory = "data/conllu"
# os.listdir() returns entries in arbitrary order: sort them so that the
# per-file number later appended to each sentence id is the same on every
# run and on every machine. Entries that are not .conllu files (for example
# hidden checkpoint folders) are skipped instead of being parsed as data.
ds_files = [
    os.path.join(ds_directory, file_name)
    for file_name in sorted(os.listdir(ds_directory))
    if file_name.lower().endswith(".conllu")
]
print(ds_files)

['data/conllu\\chat.conllu', 'data/conllu\\text_all.conllu']


In [86]:
# Function that computes the Gulpease index. It is disabled (kept inside a
# string literal) because the plan is to use READ-IT scores instead.
# NOTE(review): the expression 89 + (300*sentences - 10*letters)/words matches
# the usual statement of the Gulpease index - confirm against a reference
# before re-enabling it.
'''
def comp_gulpease(ns, nw, nl):
    g_value = 89 + ((300*ns - 10*nl)/nw) #è corretta questa formula?
    return g_value
'''

'\ndef comp_gulpease(ns, nw, nl):\n    g_value = 89 + ((300*ns - 10*nl)/nw) #è corretta questa formula?\n    return g_value\n'

In [87]:
# Code used to obtain the readability index from the remote service.

# Base URL of the server that load_document / get_sent_readability talk to.
SERVER_PATH = "http://api.italianlp.it"

# A POST uploads the document to the server database and triggers the readability computation.
def load_document(text, timeout=30):
    """Upload ``text`` to the server and return the id assigned to it.

    The POST stores the document in the server database; during the upload
    the server runs the linguistic analysis needed for readability, and the
    ``extra_tasks`` entry asks it to compute the readability of the document
    as well.

    Args:
        text: The text to analyse.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout a stalled connection would block the notebook forever.

    Returns:
        The document id in the server database, needed to request the
        analysis results afterwards.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        requests.Timeout: If the server does not answer within ``timeout``.
    """
    r = requests.post(
        SERVER_PATH + '/documents/',
        data={
            'text': text,
            'lang': 'IT',
            'extra_tasks': ["readability"],
        },
        timeout=timeout,
    )
    # Fail loudly on 4xx/5xx instead of hitting an opaque KeyError or JSON
    # decoding error on an error page.
    r.raise_for_status()
    return r.json()['id']

# A GET retrieves the analysis results; here only the global readability is of interest.
def get_sent_readability(doc_id, timeout=30):
    """Fetch the global readability score of an uploaded document.

    Only the first entry of ``result['sentences']['data']`` is read, and its
    ``readability_score_all`` field is returned.

    Args:
        doc_id: Id returned by ``load_document``.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The value stored under ``readability_score_all`` for the first
        sentence of the document.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        requests.Timeout: If the server does not answer within ``timeout``.
    """
    r = requests.get(SERVER_PATH + '/documents/details/%s' % doc_id, timeout=timeout)
    # Surface HTTP errors explicitly rather than failing later while
    # indexing into an unexpected payload.
    r.raise_for_status()
    result = r.json()
    first_sentence = result['sentences']['data'][0]
    return first_sentence["readability_score_all"]

def get_random_score():
    """Return a pseudo-random integer between 0 and 100, both ends included.

    The notebook uses it as a stand-in readability score while the calls to
    the remote service are commented out.
    """
    return random.randint(0, 100)

# Function that iterates over every sentence of a file and asks the server
# for its readability score. It is disabled (kept inside a string literal).
'''def get_readit_scores(file_path):
    readit_list = []
    for line in open(file_path, 'r', encoding = "utf-8"): 
        print(line)
        if line.startswith("# text"):
            current_sent = line[9:].rstrip('\n')
            sent_id = load_document(current_sent)
            r_score = get_sent_readability(sent_id)
            readit_list.append(r_score)
    return readit_list
'''

'def get_readit_scores(file_path):\n    readit_list = []\n    for line in open(file_path, \'r\', encoding = "utf-8"): \n        print(line)\n        if line.startswith("# text"):\n            current_sent = line[9:].rstrip(\'\n\')\n            sent_id = load_document(current_sent)\n            r_score = get_sent_readability(sent_id)\n            readit_list.append(r_score)\n    return readit_list\n'

In [88]:
# Read a CoNLL-U file line by line, extracting id, text and complexity score of each sentence.
def extract(file_path, n_file, verbose=True):
    """Extract sentence ids, texts and scores from one CoNLL-U file.

    Args:
        file_path: Path of the CoNLL-U file (read as UTF-8).
        n_file: Number of the file; it is appended to every sentence id so
            that ids stay unique across files.
        verbose: When true (the default, matching the previous behaviour)
            every extracted sentence is printed. Pass ``False`` on large
            corpora to avoid flooding the cell output.

    Returns:
        A tuple ``(id_list, text_list, readit_list)`` of three lists:
        ids of the form ``"<digits>_<n_file>"``, sentence texts, and one
        score per sentence text.

    Note:
        Ids and texts are collected independently, so the lists line up only
        if every sentence carries exactly one ``# sent_id`` and one
        ``# text = `` comment.
    """
    # Match the full comment key: a bare "# text" prefix would also catch
    # other comments such as "# text_en = ..." and slice them at the wrong
    # offset.
    text_prefix = "# text = "
    id_list = []
    text_list = []
    readit_list = []
    # The context manager guarantees the file handle is closed, even if an
    # error occurs while parsing.
    with open(file_path, 'r', encoding='utf-8') as conllu_file:
        for line in conllu_file:
            if line.startswith(text_prefix):
                current_sent = line[len(text_prefix):].rstrip('\n')
                if verbose:
                    print(current_sent)
                # Placeholder score: the real pipeline is meant to call
                # load_document() + get_sent_readability() here instead.
                r_score = get_random_score()
                readit_list.append(r_score)
                text_list.append(current_sent)
            elif line.startswith("# sent_id"):
                # Keep only the digits found on the line, then append the
                # file number to make the id unique across files.
                current_id = re.sub(r'\D', '', line)
                id_list.append(f'{current_id}_{str(n_file)}')
    return id_list, text_list, readit_list
            
            


In [89]:
# Run extract() over every CoNLL-U file and pool the per-file results.
id_list, text_list, readit_list = [], [], []
n_file = 1  # 1-based file counter that extract() appends to each sentence id
for conllu_path in ds_files:
    file_ids, file_texts, file_scores = extract(conllu_path, n_file)
    id_list.extend(file_ids)
    text_list.extend(file_texts)
    readit_list.extend(file_scores)
    n_file += 1

La Prima Guerra Mondiale, conosciuta anche come la Grande Guerra, fu un conflitto globale che ebbe luogo principalmente in Europa dal 28 luglio 1914 al 11 novembre 1918.
Coinvolse le principali potenze mondiali dell'epoca, divise in due alleanze contrapposte:
gli Alleati, guidati da Francia, Regno Unito, Russia (successivamente sostituita dall'Impero britannico e dagli Stati Uniti), e l'Intesa Centrale, composta da Germania, Austria-Ungheria, Impero Ottomano e Bulgaria.
Le cause della guerra possono essere attribuite a una serie di fattori, tra cui tensioni politiche, rivalità coloniali, nazionalismo, militarismo e sistemi di alleanze che resero il conflitto inevitabile dopo l'assassinio dell'arciduca Francesco Ferdinando d'Austria, erede al trono austro-ungarico, a Sarajevo nel giugno 1914.
La guerra fu caratterizzata da una serie di nuove tattiche e tecnologie militari, tra cui trincee, mitragliatrici, gas tossici e bombardamenti aerei, che portarono a un conflitto di logoramento su 

In [90]:
# Build a DataFrame with one row per sentence; columns: id, text and
# readability score (readit_index).
ds_df = pd.DataFrame(
    {
        "id": id_list,
        "text": text_list,
        "readit_index": readit_list,
    },
    columns=["id", "text", "readit_index"],
)

ds_df.head()


Unnamed: 0,id,text,readit_index
0,1_1,"La Prima Guerra Mondiale, conosciuta anche com...",81
1,2_1,Coinvolse le principali potenze mondiali dell'...,14
2,3_1,"gli Alleati, guidati da Francia, Regno Unito, ...",3
3,4_1,Le cause della guerra possono essere attribuit...,94
4,5_1,La guerra fu caratterizzata da una serie di nu...,35


In [91]:
#setto il device da usare
#torch.device("cpu")

In [92]:
# Split into training and test sets (10% of the rows go to the test set).
dataset = Dataset.from_pandas(ds_df)
# Fix the shuffle seed so that the same sentences end up in train/test on
# every run; 42 is the same value given as `seed` to TrainingArguments.
split_set = dataset.train_test_split(test_size=0.1, seed=42)

train_ds = split_set["train"]
test_ds = split_set["test"]

### TOKENIZATION

In questa sezione si importa il tokenizzatore col quale si tokenizza ciascuna frase nel formato necessario per Bert, alla fine si otterrà un dataset nel formato corretto con tutte le features necessarie per il training

In [93]:
import tokenizers
import transformers
from transformers import BertTokenizer

In [94]:
# Load the already-configured tokenizer (here: bert-base-italian-cased).
tokenizer = BertTokenizer.from_pretrained("dbmdz/bert-base-italian-cased")


loading file https://huggingface.co/dbmdz/bert-base-italian-cased/resolve/main/vocab.txt from cache at C:\Users\bergo/.cache\huggingface\transformers\e386d7030c11abe3c82da83b0aa728f3c09ab3a6728e325fe78bb5a0c67d7c71.83ca512ab51c5bc2809e83002a054b84ab85a200b98d5c0eb036d7611ee4362e
loading file https://huggingface.co/dbmdz/bert-base-italian-cased/resolve/main/added_tokens.json from cache at None
loading file https://huggingface.co/dbmdz/bert-base-italian-cased/resolve/main/special_tokens_map.json from cache at None
loading file https://huggingface.co/dbmdz/bert-base-italian-cased/resolve/main/tokenizer_config.json from cache at C:\Users\bergo/.cache\huggingface\transformers\534fa05777338ca7e2b068a37beb83688543de270a20252296be3eadd10caca1.6391beef2ceed2cdba47401eb12680200856c97d2f2b56143e515d7c0f36a66a
loading configuration file https://huggingface.co/dbmdz/bert-base-italian-cased/resolve/main/config.json from cache at C:\Users\bergo/.cache\huggingface\transformers\4641bcb7c4ac61788587ad50

In [95]:
# Encode the whole dataset by tokenizing the text of every sentence.
def encode(sample):
    """Tokenize the ``"text"`` field of ``sample`` with the notebook tokenizer.

    Sequences are padded, truncated to at most 512 tokens, and the
    special-tokens mask is returned alongside the usual tokenizer outputs.
    """
    return tokenizer(
        sample["text"],
        padding=True,
        truncation=True,
        max_length=512,
        return_special_tokens_mask=True,
    )

train_set = train_ds.map(encode, batched=True)
test_set = test_ds.map(encode, batched=True)

# Expose only these columns, as torch tensors, when the datasets are indexed.
model_input_columns = ["input_ids", "attention_mask", "token_type_ids"]
train_set.set_format('torch', columns=list(model_input_columns))
test_set.set_format('torch', columns=list(model_input_columns))


Map:   0%|          | 0/75 [00:00<?, ? examples/s]

Map:   0%|          | 0/9 [00:00<?, ? examples/s]

In [96]:
train_set

Dataset({
    features: ['id', 'text', 'readit_index', 'input_ids', 'token_type_ids', 'special_tokens_mask', 'attention_mask'],
    num_rows: 75
})

In [97]:
test_set

Dataset({
    features: ['id', 'text', 'readit_index', 'input_ids', 'token_type_ids', 'special_tokens_mask', 'attention_mask'],
    num_rows: 9
})

### TRAINING DI BERT

Si procede al training di Bert. Il modello dovrà partire da uno stato iniziale con pesi random: per questo non si importa il modello già addestrato, ma se ne configura semplicemente l'architettura, per poi addestrarlo da zero. Si definiscono quindi una strategia di training e i suoi argomenti, e infine si addestra il modello sul task di Masked Language Modeling.

In [98]:
from transformers import Trainer, TrainingArguments, TrainerCallback, BertConfig, BertForMaskedLM, DataCollatorForLanguageModeling, set_seed


In [99]:
# Load only the configuration of bert-mini (the architecture settings printed
# below); no model is instantiated in this cell.
model_name = "prajjwal1/bert-mini"
model_config = BertConfig.from_pretrained(model_name)

print(model_config)

loading configuration file https://huggingface.co/prajjwal1/bert-mini/resolve/main/config.json from cache at C:\Users\bergo/.cache\huggingface\transformers\a32529b12a03c02e99c269bf68c0c7b8349093f626e860ab9b012e3d9539c539.e6c2a1d71adb3143ecd42222c4604e92ff255a7663c04bb5c4fad770c78e096c
Model config BertConfig {
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 256,
  "initializer_range": 0.02,
  "intermediate_size": 1024,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 4,
  "num_hidden_layers": 4,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "transformers_version": "4.20.1",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 30522
}



BertConfig {
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 256,
  "initializer_range": 0.02,
  "intermediate_size": 1024,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 4,
  "num_hidden_layers": 4,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "transformers_version": "4.20.1",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 30522
}



In [100]:
# Build the masked-LM model from the configuration alone (not via
# from_pretrained), in line with the goal of training from scratch.
model = BertForMaskedLM(model_config)
# Resize the token embeddings to the tokenizer's vocabulary size: the config
# printed above reports vocab_size 30522, the output of this call 31102.
model.resize_token_embeddings(len(tokenizer))


Embedding(31102, 256)

In [101]:
model.config

BertConfig {
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 256,
  "initializer_range": 0.02,
  "intermediate_size": 1024,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 4,
  "num_hidden_layers": 4,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "transformers_version": "4.20.1",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 31102
}

In [102]:
# Use the data collator to build the training batches for masked language
# modeling (mlm=True) with mlm_probability=0.2, returning PyTorch tensors.
datacollator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=True, mlm_probability=0.2, return_tensors="pt")

In [103]:
datacollator

DataCollatorForLanguageModeling(tokenizer=PreTrainedTokenizer(name_or_path='dbmdz/bert-base-italian-cased', vocab_size=31102, model_max_len=512, is_fast=False, padding_side='right', truncation_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'}), mlm=True, mlm_probability=0.2, pad_to_multiple_of=None, tf_experimental_compile=False, return_tensors='pt')

In [104]:
print(f"Lunghezza del dataset: {len(train_set)}")

Lunghezza del dataset: 75


In [202]:
#definisco una funzione di callback per verificare l'ordinamento dei dati per ogni epoca
from transformers.trainer_callback import TrainerControl, TrainerState
from transformers.training_args import TrainingArguments


class check_ds_order(TrainerCallback):
    """Callback that logs the first training rows to train_check.txt at every epoch start."""

    def on_epoch_begin(self, args, state, control, train_dataloader, **kwargs):
        """Append the epoch header and the first five ``input_ids`` rows to the log file.

        NOTE(review): the rows are read from ``train_dataloader.dataset``, i.e.
        the dataset object behind the loader. Presumably the per-epoch
        shuffling is done by the loader's sampler, in which case these rows
        would be identical at every epoch - confirm that this check observes
        what it is meant to.
        """
        # The context manager closes (and therefore flushes) the file after
        # each epoch; previously a new handle was opened per epoch and never
        # closed.
        with open("train_check.txt", "a") as f:
            f.write(f"\n------------------------ ORDINE DEI DATI ALL'INIZI DELL'EPOCA {int(state.epoch+1)} ------------------------")
            f.write(str(train_dataloader.dataset["input_ids"][:5]))
            f.write("-----------------------------------------------------------------------------------")

class check_weights(TrainerCallback):
    """Callback that prints every entry of the model's state dict when training starts."""

    def on_train_begin(self, args, state, control, model, **kwargs):
        """Print each state-dict key followed by its value."""
        for param_name, param_value in model.state_dict().items():
            print(f"\n{param_name}:\n")
            print(param_value)


In [203]:
# Provisional arguments, still to be tuned.
# Evaluation runs every N steps ("steps" strategy); eval_steps is not set
# here, and the log printed by this cell reports it being initialised from
# logging_steps (10). Checkpoints are written every 10 steps, keeping at most
# 2, and the best model is reloaded at the end of training.
training_args = TrainingArguments(
    output_dir = "my_pretrained_model",
    evaluation_strategy="steps",
    overwrite_output_dir=True,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    logging_steps=10,
    save_steps=10,
    save_total_limit=2,
    load_best_model_at_end=True,
    seed=42, 
    )

using `logging_steps` to initialize `eval_steps` to 10
PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).


In [204]:
# Seed the random number generators with the seed from the training arguments.
# NOTE(review): the model is instantiated in an earlier cell, before this
# call, so its initial random weights are presumably not covered by this
# seed - confirm, and consider seeding before building the model.
set_seed(training_args.seed)

In [205]:
# Create or empty train_check.txt, so the check_ds_order callback (which
# opens it in append mode) starts from a blank file on each training run.
open('train_check.txt', 'w').close()

In [206]:
# Assemble the Trainer: model, arguments, MLM data collator, the tokenized
# train/test sets and the two diagnostic callbacks defined above.
# The callbacks are passed as classes, not as instances.
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=datacollator,
    train_dataset=train_set,
    eval_dataset=test_set,
    callbacks=[check_ds_order, check_weights], 
    )

In [207]:
# Run the training loop; the returned TrainOutput is displayed as the cell result.
trainer.train()

The following columns in the training set don't have a corresponding argument in `BertForMaskedLM.forward` and have been ignored: id, readit_index, text, special_tokens_mask. If id, readit_index, text, special_tokens_mask are not expected by `BertForMaskedLM.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 75
  Num Epochs = 3
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 30



bert.embeddings.position_ids:

tensor([[  0,   1,   2,   3,   4,   5,   6,   7,   8,   9,  10,  11,  12,  13,
          14,  15,  16,  17,  18,  19,  20,  21,  22,  23,  24,  25,  26,  27,
          28,  29,  30,  31,  32,  33,  34,  35,  36,  37,  38,  39,  40,  41,
          42,  43,  44,  45,  46,  47,  48,  49,  50,  51,  52,  53,  54,  55,
          56,  57,  58,  59,  60,  61,  62,  63,  64,  65,  66,  67,  68,  69,
          70,  71,  72,  73,  74,  75,  76,  77,  78,  79,  80,  81,  82,  83,
          84,  85,  86,  87,  88,  89,  90,  91,  92,  93,  94,  95,  96,  97,
          98,  99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111,
         112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125,
         126, 127, 128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139,
         140, 141, 142, 143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 153,
         154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164, 165, 166, 167,
         168, 169, 1

  0%|          | 0/30 [00:00<?, ?it/s]

The following columns in the evaluation set don't have a corresponding argument in `BertForMaskedLM.forward` and have been ignored: id, readit_index, text, special_tokens_mask. If id, readit_index, text, special_tokens_mask are not expected by `BertForMaskedLM.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 9
  Batch size = 16


{'loss': 8.3176, 'learning_rate': 3.3333333333333335e-05, 'epoch': 1.0}


  0%|          | 0/1 [00:00<?, ?it/s]

Saving model checkpoint to my_pretrained_model\checkpoint-10
Configuration saved in my_pretrained_model\checkpoint-10\config.json


{'eval_loss': 9.202439308166504, 'eval_runtime': 0.2689, 'eval_samples_per_second': 33.472, 'eval_steps_per_second': 3.719, 'epoch': 1.0}


Model weights saved in my_pretrained_model\checkpoint-10\pytorch_model.bin
Deleting older checkpoint [my_pretrained_model\checkpoint-20] due to args.save_total_limit
The following columns in the evaluation set don't have a corresponding argument in `BertForMaskedLM.forward` and have been ignored: id, readit_index, text, special_tokens_mask. If id, readit_index, text, special_tokens_mask are not expected by `BertForMaskedLM.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 9
  Batch size = 16


{'loss': 8.4692, 'learning_rate': 1.6666666666666667e-05, 'epoch': 2.0}


  0%|          | 0/1 [00:00<?, ?it/s]

Saving model checkpoint to my_pretrained_model\checkpoint-20
Configuration saved in my_pretrained_model\checkpoint-20\config.json


{'eval_loss': 9.14184284210205, 'eval_runtime': 0.3297, 'eval_samples_per_second': 27.298, 'eval_steps_per_second': 3.033, 'epoch': 2.0}


Model weights saved in my_pretrained_model\checkpoint-20\pytorch_model.bin
Deleting older checkpoint [my_pretrained_model\checkpoint-30] due to args.save_total_limit
The following columns in the evaluation set don't have a corresponding argument in `BertForMaskedLM.forward` and have been ignored: id, readit_index, text, special_tokens_mask. If id, readit_index, text, special_tokens_mask are not expected by `BertForMaskedLM.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 9
  Batch size = 16


{'loss': 8.6857, 'learning_rate': 0.0, 'epoch': 3.0}


  0%|          | 0/1 [00:00<?, ?it/s]

Saving model checkpoint to my_pretrained_model\checkpoint-30
Configuration saved in my_pretrained_model\checkpoint-30\config.json


{'eval_loss': 8.799484252929688, 'eval_runtime': 0.2979, 'eval_samples_per_second': 30.213, 'eval_steps_per_second': 3.357, 'epoch': 3.0}


Model weights saved in my_pretrained_model\checkpoint-30\pytorch_model.bin
Deleting older checkpoint [my_pretrained_model\checkpoint-10] due to args.save_total_limit


Training completed. Do not forget to share your model on huggingface.co/models =)


Loading best model from my_pretrained_model\checkpoint-30 (score: 8.799484252929688).


{'train_runtime': 36.0433, 'train_samples_per_second': 6.242, 'train_steps_per_second': 0.832, 'train_loss': 8.490803527832032, 'epoch': 3.0}


TrainOutput(global_step=30, training_loss=8.490803527832032, metrics={'train_runtime': 36.0433, 'train_samples_per_second': 6.242, 'train_steps_per_second': 0.832, 'train_loss': 8.490803527832032, 'epoch': 3.0})