In [2]:
!pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.27.3-py3-none-any.whl (6.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.8/6.8 MB[0m [31m73.7 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.11.0
  Downloading huggingface_hub-0.13.3-py3-none-any.whl (199 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m199.8/199.8 KB[0m [31m21.0 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1
  Downloading tokenizers-0.13.2-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.6/7.6 MB[0m [31m71.8 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: tokenizers, huggingface-hub, transformers
Successfully installed huggingface-hub-0.13.3 tokenizers-0.13.2 transformers-4.27.3


In [4]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [69]:
import pandas as pd
import torch
import transformers
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler
from sklearn.metrics import f1_score, precision_score as precision, recall_score as recall
from transformers import AutoTokenizer, AutoModelForSequenceClassification, AdamW, get_linear_schedule_with_warmup
from copy import deepcopy

train_df = pd.read_csv('/content/drive/MyDrive/hackathon/train_set.csv')
test_df = pd.read_csv('/content/drive/MyDrive/hackathon/test_set.csv')

# Definizione del tokenizer 
tokenizer = AutoTokenizer.from_pretrained("dbmdz/bert-base-italian-cased")

# Lunghezza massima della sequenza di input 
max_seq_length = 128 #228  

# Definita la dimensione del batch di addestramento 
batch_size = 16

# numero delle epoche
epochs = 10


device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')


label_map = {'Java Developer': 0, 'Web Developer': 1, 'Programmer': 2, 'System Analyst': 3, 'Software Engineer': 4} #dizionario meno errori 

##conversione delle etichette in numeri interi 
train_labels = [label_map[label] for label in train_df['Label'].tolist()]

# conversione delle etichette in tensor torch
train_labels = torch.tensor(train_labels)

# Definizione del modello BERT pre-addestrato
model = AutoModelForSequenceClassification.from_pretrained('dbmdz/bert-base-italian-cased', num_labels=10)
model.to(device)

# Definizione dell'ottimizzatore e del scheduler del tasso di apprendimento
optimizer = AdamW(model.parameters(), lr=5e-5)
total_steps = len(train_df) * epochs
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=total_steps)

# training and test data loaders
train_texts = train_df['Job_offer'].tolist()

train_encodings = tokenizer(train_texts, truncation=True, padding=True, max_length=max_seq_length)
train_dataset = torch.utils.data.TensorDataset(torch.tensor(train_encodings['input_ids']),
                                               torch.tensor(train_encodings['attention_mask']),
                                               train_labels)
train_sampler = RandomSampler(train_dataset)  #campionamento casuale non ripetuto dei dati per ogni epoca di addestramento. ogni volta che vi si accede si sceglei un batch causale dei dati 
train_dataloader = DataLoader(train_dataset, sampler=train_sampler, batch_size=batch_size) #dataloader caricherà 16 campioni di dati alla volta per ogni epoca di addestramento.

val_texts = test_df['Job_offer'].tolist()
val_labels = [label_map[label] for label in test_df['Label']]  

val_labels = torch.tensor(val_labels)



val_encodings = tokenizer(val_texts, truncation=True, padding=True, max_length=max_seq_length)

val_dataset = torch.utils.data.TensorDataset(torch.tensor(val_encodings['input_ids']),
                                             torch.tensor(val_encodings['attention_mask']),
                                             torch.tensor(val_labels))
val_sampler = SequentialSampler(val_dataset)
val_dataloader = DataLoader(val_dataset, sampler=val_sampler, batch_size=batch_size)

# training loop 
for epoch in range(epochs):
    model.train() #modalità train consente quindi di aggiornare i pesi durante l'addestramento 
    total_loss = 0  #utilizzato per calcolare la media della perdita per ogni batch nell'addestramento 

    for step, batch in enumerate(train_dataloader):  #step rappresenta il numero del batch , iterazione in ogni batch 
        input_ids = batch[0].to(device)
        attention_mask = batch[1].to(device)
        labels = batch[2].to(device)
# quelli sopra sono i tensori di input del batch e sono convertiti e spostati sulla GPU (se disponibile) utilizzando ".to(device)"
        optimizer.zero_grad() #reimposta i gradienti dei pesi del modello per evitare l'accumulo di gradienti.
        outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels) #calcolo output del modello 
        loss = outputs.loss  #calcolo perdita associata all output 
        total_loss += loss.item()
        loss.backward()  #calcola il gradiente della perdita rispetto ai pesi del modello.
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)  #evita che il gradiente diventi troppo grande
        optimizer.step()  #aggiorna i pesi del modello utilizzando il gradiente calcolato.
        scheduler.step() #aggiorno rate apprendimento del modello 

    avg_train_loss = total_loss / len(train_dataloader)  #viene calcolata la media della perdita totale e il modello viene impostato in modalità di valutazione

    #modalità di valutazione sul test set 
    model.eval()
    val_preds = []
    val_labels = []
    for batch in val_dataloader:
        input_ids = batch[0].to(device)
        attention_mask = batch[1].to(device)
        labels = batch[2].to('cpu')

        with torch.no_grad():
            outputs = model(input_ids=input_ids, attention_mask=attention_mask)

        logits = outputs.logits #funzione di attivazione softmax converte i punteggi numerici generati dal modello in probabilità, x ogni batch 
        preds = torch.argmax(logits, dim=1).tolist() #seleziona l'indice con la probabilità più alta per ogni esempio nel batch cosi ottengo preds 
        val_preds += preds
        val_labels += labels.tolist()


    # Calcolo del F1 score
    f1_test = f1_score(val_labels, val_preds, average='weighted')
    prec = precision(val_labels, val_preds, average='weighted')
    rec = recall(val_labels, val_preds, average='weighted')

    #f1_train = f1_score(train_labels, train_preds, average='weighted')
    
    print(f"Epoch {epoch+1}/{epochs}, Train Loss: {avg_train_loss:.3f}, Val F1_test score: {f1_test:.3f}")
    print(f"Precision {prec} ::: Recall {rec}")
    torch.save(deepcopy(model.state_dict()), f"weights_{f1_test:.3f}.pt")




Some weights of the model checkpoint at dbmdz/bert-base-italian-cased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model c

Epoch 1/10, Train Loss: 1.702, Val F1_test score: 0.355
Precision 0.3553530751708428 ::: Recall 0.3553530751708428
Epoch 2/10, Train Loss: 1.254, Val F1_test score: 0.672
Precision 0.6719817767653758 ::: Recall 0.6719817767653758
Epoch 3/10, Train Loss: 0.728, Val F1_test score: 0.715
Precision 0.715261958997722 ::: Recall 0.715261958997722
Epoch 4/10, Train Loss: 0.407, Val F1_test score: 0.774
Precision 0.7744874715261959 ::: Recall 0.7744874715261959
Epoch 5/10, Train Loss: 0.268, Val F1_test score: 0.777
Precision 0.7767653758542141 ::: Recall 0.7767653758542141
Epoch 6/10, Train Loss: 0.185, Val F1_test score: 0.790
Precision 0.7904328018223234 ::: Recall 0.7904328018223234
Epoch 7/10, Train Loss: 0.139, Val F1_test score: 0.795
Precision 0.7949886104783599 ::: Recall 0.7949886104783599
Epoch 8/10, Train Loss: 0.114, Val F1_test score: 0.786
Precision 0.785876993166287 ::: Recall 0.785876993166287
Epoch 9/10, Train Loss: 0.101, Val F1_test score: 0.795
Precision 0.7949886104783599

In [73]:
model = AutoModelForSequenceClassification.from_pretrained('dbmdz/bert-base-italian-cased', num_labels=10)
model.to(device)

Some weights of the model checkpoint at dbmdz/bert-base-italian-cased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model c

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(31102, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, element

In [77]:
model.load_state_dict(torch.load("weights_0.818.pt"))

model.eval()

val_preds = []
val_labels = []
for batch in val_dataloader:
    input_ids = batch[0].to(device)
    attention_mask = batch[1].to(device)
    labels = batch[2].to('cpu')

    with torch.no_grad():
        outputs = model(input_ids=input_ids, attention_mask=attention_mask)

    logits = outputs.logits #funzione di attivazione softmax converte i punteggi numerici generati dal modello in probabilità, x ogni batch 
    preds = torch.argmax(logits, dim=1).tolist() #seleziona l'indice con la probabilità più alta per ogni esempio nel batch cosi ottengo preds 
    val_preds += preds
    val_labels += labels.tolist()

f1_test = f1_score(val_labels, val_preds, average='weighted')
prec = precision(val_labels, val_preds, average='weighted')
rec = recall(val_labels, val_preds, average='weighted')

#f1_train = f1_score(train_labels, train_preds, average='weighted')

print(f"Val F1_test score: {f1_test:.3f}")
print(f"Precision {prec} ::: Recall {rec}")


label_map_invert = {v: k for k, v in label_map.items()}

with open("val_predictions.csv", "w+") as f:
  for job_text, label_true, label_pred in zip(val_texts, val_labels, val_preds):
    f.write(f"{job_text}; {label_map_invert[label_true]}; {label_map_invert[label_pred]}\n")

Val F1_test score: 0.819
Precision 0.8242613665475703 ::: Recall 0.8177676537585421
