In [1]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification

In [2]:
import pandas as pd
import numpy as np
import torch
import pickle
from torch import cuda

In [None]:
device = 'cuda' if cuda.is_available() else 'cpu'

In [3]:
mlb_path = '../models/mlb.pkl'
mlb = pickle.load(open(mlb_path, 'rb'))

In [4]:
train_df = pd.read_csv('../data_processed/model_data/train_data.csv')
train_df['label'] = train_df['label'].apply(lambda x: [x])

val_df = pd.read_csv('../data_processed/model_data/val_data.csv')
val_df['label'] = val_df['label'].apply(lambda x: [x])

In [5]:
model_name = 'distilbert-base-uncased'

In [6]:
tokenizer = AutoTokenizer.from_pretrained(model_name)

In [7]:
id2label = {index: class_name for (index, class_name) in enumerate(mlb.classes_)}
label2id = {class_name: index for (index, class_name) in enumerate(mlb.classes_)}

In [8]:
encoded = []
for _, row in train_df.iterrows():
    add_enc = {}
    encoding = tokenizer(row['init_text'], padding="max_length", truncation=True, max_length=512) 
    ids = encoding['input_ids']
    mask = encoding['attention_mask']

    label_mlb = mlb.transform([row['label']])[0]
    add_enc = {
        'input_ids': torch.tensor(ids, dtype=torch.long),
        'attention_mask': torch.tensor(mask, dtype=torch.long),
        'labels': torch.tensor(label_mlb, dtype=torch.float)
    }
    
    encoded.append(add_enc)

In [9]:
encoded_val = []
for _, row in val_df.iterrows():
    add_enc = {}
    encoding = tokenizer(row['init_text'], padding="max_length", truncation=True, max_length=512) 
    ids = encoding['input_ids']
    mask = encoding['attention_mask']

    label_mlb = mlb.transform([row['label']])[0]
    add_enc = {
        'input_ids': torch.tensor(ids, dtype=torch.long),
        'attention_mask': torch.tensor(mask, dtype=torch.long),
        'labels': torch.tensor(label_mlb, dtype=torch.float)
    }
    
    encoded_val.append(add_enc)

In [11]:
from transformers import AutoModelForSequenceClassification

model = AutoModelForSequenceClassification.from_pretrained(model_name,
                                                           problem_type="multi_label_classification",
                                                           num_labels=len(list(mlb.classes_)),
                                                           id2label=id2label,
                                                           label2id=label2id)

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_projector.bias', 'vocab_transform.bias', 'vocab_layer_norm.weight', 'vocab_projector.weight', 'vocab_transform.weight', 'vocab_layer_norm.bias']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier

In [12]:
batch_size = 8
metric_name = "f1"

In [13]:
from transformers import TrainingArguments, Trainer

args = TrainingArguments(
    f"distilbert_model",
    evaluation_strategy = "epoch",
    save_strategy = "epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=5,
    weight_decay=0.01,
    load_best_model_at_end=True,
    metric_for_best_model=metric_name,
    #push_to_hub=True,
)

In [14]:
from sklearn.metrics import f1_score, roc_auc_score, accuracy_score
from transformers import EvalPrediction
import torch

def multi_label_metrics(predictions, labels, threshold=0.5):
    sigmoid = torch.nn.Sigmoid()
    probs = sigmoid(torch.Tensor(predictions))
    y_pred = np.zeros(probs.shape)
    y_pred[np.where(probs >= threshold)] = 1

    y_true = labels
    f1_micro_average = f1_score(y_true=y_true, y_pred=y_pred, average='micro')
    roc_auc = roc_auc_score(y_true, y_pred, average = 'micro')
    accuracy = accuracy_score(y_true, y_pred)
    
    metrics = {'f1': f1_micro_average,
               'roc_auc': roc_auc,
               'accuracy': accuracy}
    return metrics

def compute_metrics(p: EvalPrediction):
    preds = p.predictions[0] if isinstance(p.predictions,
            tuple) else p.predictions
    result = multi_label_metrics(
        predictions=preds,
        labels=p.label_ids)
    return result

In [15]:
trainer = Trainer(
    model,
    args,
    train_dataset=encoded,
    eval_dataset=encoded_val,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

In [16]:
trainer.train()

***** Running training *****
  Num examples = 31838
  Num Epochs = 5
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 19900


Epoch,Training Loss,Validation Loss,F1,Roc Auc,Accuracy
1,0.0822,0.071271,0.741995,0.832062,0.670737
2,0.0643,0.065189,0.763177,0.84831,0.701377
3,0.0538,0.062899,0.772069,0.854159,0.713588
4,0.0465,0.063918,0.775012,0.859026,0.722247
5,0.0386,0.064722,0.779467,0.865827,0.732682


***** Running Evaluation *****
  Num examples = 4504
  Batch size = 8
Saving model checkpoint to bert-finetuned-sem_eval-english/checkpoint-3980
Configuration saved in bert-finetuned-sem_eval-english/checkpoint-3980/config.json
Model weights saved in bert-finetuned-sem_eval-english/checkpoint-3980/pytorch_model.bin
tokenizer config file saved in bert-finetuned-sem_eval-english/checkpoint-3980/tokenizer_config.json
Special tokens file saved in bert-finetuned-sem_eval-english/checkpoint-3980/special_tokens_map.json
***** Running Evaluation *****
  Num examples = 4504
  Batch size = 8
Saving model checkpoint to bert-finetuned-sem_eval-english/checkpoint-7960
Configuration saved in bert-finetuned-sem_eval-english/checkpoint-7960/config.json
Model weights saved in bert-finetuned-sem_eval-english/checkpoint-7960/pytorch_model.bin
tokenizer config file saved in bert-finetuned-sem_eval-english/checkpoint-7960/tokenizer_config.json
Special tokens file saved in bert-finetuned-sem_eval-english/ch

TrainOutput(global_step=19900, training_loss=0.06460535936020127, metrics={'train_runtime': 3022.6264, 'train_samples_per_second': 52.666, 'train_steps_per_second': 6.584, 'total_flos': 2.10942543685632e+16, 'train_loss': 0.06460535936020127, 'epoch': 5.0})