In [None]:
pip install transformers[torch]

In [None]:
from transformers import AutoModelForTokenClassification

In [None]:
from transformers import AutoTokenizer

In [None]:
pip install evaluate

In [None]:
pip install seqeval

Drive

In [None]:
# Mount Google Drive at /content/drive (Colab only); PATH below points inside
# this mount.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Project root on the mounted Drive; the dataset path and the saved-model
# path used later in the notebook are built by appending to it.
PATH = "/content/drive/MyDrive/tesi/"

Lettura dei dati ed estrazione dei token e dei tag (array di stringhe, array di id)

In [None]:
from pathlib import Path
import re

def read_data(file_path):
    """Read a token/tag file and group it into per-document lists.

    The file holds one ``token<TAB>tag`` pair per line, where ``tag`` is an
    integer id. Documents are separated by an empty line or by a line that
    contains only a tab.

    Args:
        file_path: path (``str`` or ``Path``) of the file to read.

    Returns:
        A ``(token_docs, tag_docs)`` tuple of parallel lists: ``token_docs[i]``
        is the list of token strings of document ``i`` and ``tag_docs[i]`` the
        list of integer tags for those tokens. Both lists are empty when the
        file has no content.

    Raises:
        ValueError: if a non-blank line does not contain exactly one tab, or
            if its tag is not an integer.
    """
    # Decode explicitly as UTF-8 so the result does not depend on the
    # platform's default encoding.
    raw_text = Path(file_path).read_text(encoding="utf-8").strip()
    if not raw_text:
        # An empty file has no documents (splitting '' would yield one bogus
        # empty document and fail on unpacking).
        return [], []

    token_docs = []
    tag_docs = []
    for doc in re.split(r'\n\t?\n', raw_text):
        tokens = []
        tags = []
        for line in doc.split('\n'):
            if not line.strip():
                # Leftover blank line, e.g. when documents are separated by
                # more than one empty line.
                continue
            token, tag = line.split('\t')
            tokens.append(token)
            tags.append(int(tag))
        if tokens:
            token_docs.append(tokens)
            tag_docs.append(tags)

    return token_docs, tag_docs

# texts: one list of token strings per document; tags: the parallel lists of
# integer tag ids returned by read_data.
texts, tags = read_data(PATH + "dataset/recipe_dataset.csv")

Dichiarazione del dataset (encodings = testo tokenizzato con tokenizer, labels = tag numerici per gli encodings)

In [None]:
import torch

class RecipeDataset(torch.utils.data.Dataset):
    """Dataset view over a dict-like object of per-example encodings.

    ``encodings`` maps each feature name to a sequence holding one entry per
    example. It must contain a ``"labels"`` entry: its length is used as the
    dataset size.
    """

    def __init__(self, encodings):
        self.encodings = encodings

    def __len__(self):
        # The number of examples is taken from the "labels" feature.
        return len(self.encodings["labels"])

    def __getitem__(self, idx):
        """Return example ``idx`` as a dict of feature name -> tensor."""
        sample = {}
        for name, column in self.encodings.items():
            sample[name] = torch.tensor(column[idx])
        return sample

In [None]:

# Tag vocabulary in BIO format, indexed by label id. Each B- tag sits on an
# even id and the matching I- tag on the following odd id; "O" is last.
id2label = dict(enumerate([
    "B-ING",
    "I-ING",
    "B-QUANTITY",
    "I-QUANTITY",
    "B-UNIT",
    "I-UNIT",
    "B-STATE",
    "I-STATE",
    "B-PART",
    "I-PART",
    "B-EQUIPMENT",
    "I-EQUIPMENT",
    "B-ALT",
    "I-ALT",
    "O",
]))
# Inverse lookup (tag name -> label id), derived so the two maps cannot drift.
label2id = {name: idx for idx, name in id2label.items()}



Definizione funzione allineamento

In [None]:
def align_labels_with_tokens(labels, word_ids, label_names=None):
    """Expand word-level label ids to one label per sub-token.

    Args:
        labels: label ids, one per word of the sentence.
        word_ids: for each sub-token, the index of the word it comes from, or
            ``None`` for tokens that belong to no word.
        label_names: optional mapping from label id to tag name (``"B-X"``,
            ``"I-X"``, ``"O"``). Defaults to the module-level ``id2label``.

    Returns:
        A list as long as ``word_ids`` containing -100 for tokens with no
        word, the word's own label for its first sub-token, and for every
        following sub-token of the same word the same label with a ``B-X``
        tag replaced by the matching ``I-X`` tag.
    """
    if label_names is None:
        label_names = id2label
    name_to_id = {name: idx for idx, name in label_names.items()}

    def continuation_label(label):
        # Look the I- tag up by name rather than assuming that B- tags are
        # the even ids and I- tags the next odd ones.
        name = label_names.get(label, "")
        if name.startswith("B-"):
            return name_to_id.get("I-" + name[2:], label)
        return label

    new_labels = []
    current_word = None
    for word_id in word_ids:
        if word_id is None:
            # Token with no source word: reset the tracker so that the next
            # real word is treated as a new one.
            current_word = None
            new_labels.append(-100)
        elif word_id != current_word:
            # First sub-token of a new word keeps the word's label.
            current_word = word_id
            new_labels.append(labels[word_id])
        else:
            # Further sub-token of the same word.
            new_labels.append(continuation_label(labels[word_id]))

    return new_labels

In [None]:
def tokenize_and_align_labels(texts, tags, tokenizer):
    """Tokenize pre-split sentences and attach labels aligned to sub-tokens.

    Args:
        texts: list of sentences, each given as a list of word strings.
        tags: list of per-word label-id lists, parallel to ``texts``.
        tokenizer: callable invoked with ``is_split_into_words=True``; its
            result must provide ``word_ids(i)`` and accept item assignment.

    Returns:
        The tokenizer output with an added ``"labels"`` entry holding, for
        each sentence, the result of ``align_labels_with_tokens``.
    """
    encoded = tokenizer(
        texts,
        is_split_into_words=True,
        truncation=True,
        padding=True,
    )
    encoded["labels"] = [
        align_labels_with_tokens(word_labels, encoded.word_ids(row))
        for row, word_labels in enumerate(tags)
    ]
    return encoded

Allineamento e creazione del dataset

In [None]:
import torch
from transformers import AutoTokenizer

# Tokenizer matching the checkpoint that is fine-tuned below.
tokenizer = AutoTokenizer.from_pretrained("bert-base-multilingual-uncased")
# Shown as the cell output.
# NOTE(review): tokenize_and_align_labels calls word_ids(), which presumably
# requires a fast tokenizer - confirm this displays True.
tokenizer.is_fast


In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the documents for evaluation. The seed is fixed so that the
# split (and therefore the reported metrics) is reproducible across runs.
train_texts, test_texts, train_tags, test_tags = train_test_split(
    texts, tags, test_size=.2, random_state=42
)
# Tokenize each split and align the word-level tags to sub-tokens.
train_encs = tokenize_and_align_labels(train_texts, train_tags, tokenizer)
test_encs = tokenize_and_align_labels(test_texts, test_tags, tokenizer)
train_dataset = RecipeDataset(train_encs)
eval_dataset = RecipeDataset(test_encs)

In [None]:
# Definizione funzioni di valutazione (evaluation function definitions)

In [None]:
import evaluate

# Metric object used by compute_metrics (its compute() is called there).
seqeval = evaluate.load("seqeval")

In [None]:
import numpy as np
# Tag names indexed by label id: label2id is declared in ascending id order,
# so position i of this list holds the name of label i.
label_list = [*label2id]

def compute_metrics(p):
    """Compute sequence-labelling metrics for one evaluation pass.

    Args:
        p: a ``(predictions, labels)`` pair; ``predictions`` holds per-token
            scores whose argmax over axis 2 is the predicted label id, and
            ``labels`` holds the reference label ids, with -100 marking
            positions to skip.

    Returns:
        A dict with the ``precision``, ``recall``, ``f1`` and ``accuracy``
        taken from the ``overall_*`` entries of the seqeval result.
    """
    scores, gold_ids = p
    predicted_ids = np.argmax(scores, axis=2)

    true_predictions = []
    true_labels = []
    for predicted_row, gold_row in zip(predicted_ids, gold_ids):
        kept_predictions = []
        kept_labels = []
        for predicted_id, gold_id in zip(predicted_row, gold_row):
            # Positions labelled -100 are excluded from both sequences.
            if gold_id != -100:
                kept_predictions.append(label_list[predicted_id])
                kept_labels.append(label_list[gold_id])
        true_predictions.append(kept_predictions)
        true_labels.append(kept_labels)

    results = seqeval.compute(predictions=true_predictions, references=true_labels)
    return {
        metric: results["overall_" + metric]
        for metric in ("precision", "recall", "f1", "accuracy")
    }

Training

In [None]:
# Load the pretrained multilingual BERT checkpoint for token classification,
# passing the tag maps defined earlier and the number of tags they contain.
model = AutoModelForTokenClassification.from_pretrained(
    "bert-base-multilingual-uncased",
    num_labels=len(id2label),
    id2label=id2label,
    label2id=label2id,
)

In [None]:
from transformers import DataCollatorForTokenClassification

# Batch collator handed to the Trainer below; built from the same tokenizer
# that produced the encodings.
data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)

In [None]:
from transformers import Trainer, TrainingArguments

# Hyper-parameters and output locations for the fine-tuning run.
training_args = TrainingArguments(
    output_dir='./results',          # output directory
    num_train_epochs=3,              # total number of training epochs
    per_device_train_batch_size=1,  # batch size per device during training
    per_device_eval_batch_size=8,   # batch size for evaluation
    warmup_steps=500,                # number of warmup steps for learning rate scheduler
    weight_decay=0.01,               # strength of weight decay
    logging_dir='./logs',            # directory for storing logs
    logging_steps=10,                # log every 10 steps
    # Evaluate on eval_dataset once per epoch.
    # NOTE(review): newer transformers releases rename this argument to
    # `eval_strategy` - confirm against the installed version.
    evaluation_strategy="epoch"
)

# NOTE(review): `device` is not referenced again in the visible cells and is
# not passed to the Trainer - confirm whether it is still needed.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

trainer = Trainer(
    model=model,                         # the instantiated Transformers model to be trained
    args=training_args,                  # training arguments, defined above
    train_dataset=train_dataset,         # training dataset
    eval_dataset=eval_dataset,             # evaluation dataset
    data_collator=data_collator,         # batch collator built from the tokenizer
    compute_metrics=compute_metrics      # seqeval-based metrics function defined above
)

# Run the fine-tuning loop.
trainer.train()

# Save the fine-tuned model and its tokenizer side by side under PATH.
model.save_pretrained(PATH + "model_multi/")
tokenizer.save_pretrained(PATH + "model_multi/")