<a href="https://colab.research.google.com/github/blue-create/langlens/blob/main/models/distilbert.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Purpose
This file shows the steps taken to load the pre-trained model distilbert and train it with a sample data set

https://github.com/NielsRogge/Transformers-Tutorials/blob/master/BERT/Fine_tuning_BERT_(and_friends)_for_multi_label_text_classification.ipynb

### Imports

In [83]:
%%capture
!pip install transformers==4.20.0

In [84]:
%%capture
!python -m spacy download de_core_news_lg

In [85]:
%%capture
!pip install datasets

In [86]:

from tqdm import tqdm
import numpy as np
import pandas as pd
import os
import json
import spacy
# prepare
import datasets
from ast import literal_eval
from datasets import Dataset
# encode
from transformers import AutoTokenizer
import tensorflow as tf
import torch
# modelling
from transformers import AutoModelForSequenceClassification
from transformers import TrainingArguments, Trainer
#evaluation
from sklearn.metrics import f1_score, roc_auc_score, accuracy_score
from transformers import EvalPrediction

### Constants

In [87]:
# connect with google drive
from google.colab import drive
drive.mount('/content/drive')


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [88]:
# change cwd
%cd drive/MyDrive/Work/Frontline/data

[Errno 2] No such file or directory: 'drive/MyDrive/Work/Frontline/data'
/content/drive/.shortcut-targets-by-id/1WfnZsqpG1r110J63sMbfS5TpsDOkveiV/data


### Prepare data

In [89]:
from scripts import annotations

In [90]:
data=pd.read_csv("annotated/230621_all_annotationsV1.csv", index_col=0, converters={"annotations":literal_eval})
data=data.reset_index(drop=True)

In [91]:
data.loc[:,"annotations"]=data.apply(annotations.ground_truth_filter,min_coannotation=1,min_similarity=-1, similarity="dice",axis=1)

One-hot-encoding of the target labels

In [92]:
data.annotations=data.annotations.apply(list)
data.annotations=["*".join(i) for i in data.annotations]

In [93]:
df=pd.merge(data[["text","artikel_id","name"]],data.annotations.str.get_dummies(sep="*"), left_index=True, right_index=True)
df=df.drop_duplicates()

In [94]:
# create a list of all labels
labels=df.columns.to_list()[-4:]

### Preprocessing Data

In [95]:
# needs to be imported after setting path
from scripts import preprocessing

In [96]:
# Load model
spacy_mod = spacy.load("de_core_news_lg", disable=['ner', 'parser', 'tagger'])

In [97]:
# read custom stopwords

# open list of custom stopwords
custom_stop_words= open("custom_stopwords.txt", encoding="utf-8").read().split()

# add custom stopwords to model
for word in custom_stop_words:
  spacy_mod.Defaults.stop_words.add(word)

In [98]:
# convert corpus to language object
spacy_lang = []
for i, doc in tqdm(df.iterrows()):
  spacy_lang.append(spacy_mod("".join(doc['text'])))

1248it [00:20, 59.61it/s]


In [99]:
# preprocess: remove stopwords
spacy_cleaned = []
for doc in tqdm(spacy_lang):
    spacy_cleaned.append(preprocessing.preprocess(doc, remove_ent=True))

100%|██████████| 1248/1248 [00:00<00:00, 4629.76it/s]


In [100]:
# replace the text column with the cleaned texts
df["text"]=spacy_cleaned
# convert list of words to string
df["text"]= [" ".join(ls) for ls in df["text"]]

In [101]:
df.text.apply(lambda x: len(x.split())).quantile(0.9)

39.0

### Split data into train and test

In [102]:
#split train, dev , test sets
df_train, df_val, df_test = np.split(df.sample(frac=1, random_state=42),
                            [int(.8 * len(df)), int(.9 * len(df))])

#### Convert DataFrame to  Dataset

In [103]:
ds=datasets.DatasetDict({"train":Dataset.from_dict(df_train),"val":Dataset.from_dict(df_val),"test":Dataset.from_dict(df_test)})

### Tokenizing & Encoding

In [104]:
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-multilingual-cased")

In [105]:
def preprocess_data(df):
  text = df["text"]
  # encode them
  encoding = tokenizer(text, padding="max_length", truncation=True, max_length=128)
  # add labels

  labels_batch = {k: df[k] for k in df.keys() if k in labels}
  # create numpy array of shape (batch_size, num_labels)
  labels_matrix = np.zeros((len(text), len(labels)))
  # fill numpy array
  for idx, label in enumerate(labels):
    labels_matrix[:, idx] = labels_batch[label]
  encoding["labels"] = labels_matrix.tolist()
  return encoding

In [106]:
ds_encoded=ds.map(preprocess_data, batched=True, remove_columns=ds['train'].column_names)

Map:   0%|          | 0/998 [00:00<?, ? examples/s]

Map:   0%|          | 0/125 [00:00<?, ? examples/s]

Map:   0%|          | 0/125 [00:00<?, ? examples/s]

### Modelling

In [107]:
# some model expect column name to be "label" not "labels"
ds_encoded["train"]=ds_encoded["train"].rename_column("labels","label")
ds_encoded["test"]=ds_encoded["test"].rename_column("labels","label")
ds_encoded["val"]=ds_encoded["val"].rename_column("labels","label")

In [108]:
id2label = {idx:label for idx, label in enumerate(labels)}
label2id = {label:idx for idx, label in enumerate(labels)}

In [109]:
model = AutoModelForSequenceClassification.from_pretrained("distilbert-base-multilingual-cased",
                                                           problem_type="multi_label_classification",
                                                           num_labels=len(labels),
                                                           id2label=id2label,
                                                           label2id=label2id)

Downloading:   0%|          | 0.00/517M [00:00<?, ?B/s]

Some weights of the model checkpoint at distilbert-base-multilingual-cased were not used when initializing DistilBertForSequenceClassification: ['vocab_layer_norm.bias', 'vocab_transform.weight', 'vocab_projector.weight', 'vocab_projector.bias', 'vocab_transform.bias', 'vocab_layer_norm.weight']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-multilingual-cased and are newly initialized: ['classifier.weight', 'pre_classif

In [110]:
# source: https://jesusleal.io/2021/04/21/Longformer-multilabel-classification/
def multi_label_metrics(predictions, labels, threshold=0.5):
    # first, apply sigmoid on predictions which are of shape (batch_size, num_labels)
    sigmoid = torch.nn.Sigmoid()
    probs = sigmoid(torch.Tensor(predictions))
    # next, use threshold to turn them into integer predictions
    y_pred = np.zeros(probs.shape)
    y_pred[np.where(probs >= threshold)] = 1
    # finally, compute metrics
    y_true = labels
    f1_micro_average = f1_score(y_true=y_true, y_pred=y_pred, average='micro')
    roc_auc = roc_auc_score(y_true, y_pred, average = 'micro')
    accuracy = accuracy_score(y_true, y_pred)
    # return as dictionary
    metrics = {'f1': f1_micro_average,
               'roc_auc': roc_auc,
               'accuracy': accuracy}
    return metrics

def compute_metrics(p: EvalPrediction):
    preds = p.predictions[0] if isinstance(p.predictions,
            tuple) else p.predictions
    result = multi_label_metrics(
        predictions=preds,
        labels=p.label_ids)
    return result

In [111]:
args = TrainingArguments(
    f"distilbert-finetuned-classification",
    evaluation_strategy = "epoch",
    save_strategy = "epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=10,
    per_device_eval_batch_size=10,
    num_train_epochs=10,
    weight_decay=0.01,
    load_best_model_at_end=True,
    metric_for_best_model="f1",
    #push_to_hub=True,
)

#### Training

In [112]:
trainer = Trainer(
    model,
    args,
    train_dataset=ds_encoded["train"],
    eval_dataset=ds_encoded["val"],
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

In [113]:
trainer.train()

***** Running training *****
  Num examples = 998
  Num Epochs = 10
  Instantaneous batch size per device = 10
  Total train batch size (w. parallel, distributed & accumulation) = 10
  Gradient Accumulation steps = 1
  Total optimization steps = 1000


Epoch,Training Loss,Validation Loss,F1,Roc Auc,Accuracy
1,No log,0.19829,0.896825,0.928796,0.904
2,No log,0.200332,0.896825,0.928796,0.904
3,No log,0.176531,0.89243,0.924859,0.896
4,No log,0.141572,0.891667,0.913217,0.856
5,0.167500,0.193508,0.891566,0.922263,0.888
6,0.167500,0.211574,0.89243,0.924859,0.896
7,0.167500,0.190879,0.890688,0.919666,0.88
8,0.167500,0.196419,0.890688,0.919666,0.88
9,0.167500,0.19882,0.890688,0.919666,0.88
10,0.077600,0.203731,0.890688,0.919666,0.88


***** Running Evaluation *****
  Num examples = 125
  Batch size = 10
Saving model checkpoint to distilbert-finetuned-classification/checkpoint-100
Configuration saved in distilbert-finetuned-classification/checkpoint-100/config.json
Model weights saved in distilbert-finetuned-classification/checkpoint-100/pytorch_model.bin
tokenizer config file saved in distilbert-finetuned-classification/checkpoint-100/tokenizer_config.json
Special tokens file saved in distilbert-finetuned-classification/checkpoint-100/special_tokens_map.json
***** Running Evaluation *****
  Num examples = 125
  Batch size = 10
Saving model checkpoint to distilbert-finetuned-classification/checkpoint-200
Configuration saved in distilbert-finetuned-classification/checkpoint-200/config.json
Model weights saved in distilbert-finetuned-classification/checkpoint-200/pytorch_model.bin
tokenizer config file saved in distilbert-finetuned-classification/checkpoint-200/tokenizer_config.json
Special tokens file saved in distilb

TrainOutput(global_step=1000, training_loss=0.12253852462768555, metrics={'train_runtime': 278.7441, 'train_samples_per_second': 35.803, 'train_steps_per_second': 3.588, 'total_flos': 330517947863040.0, 'train_loss': 0.12253852462768555, 'epoch': 10.0})

In [114]:
trainer.evaluate()

***** Running Evaluation *****
  Num examples = 125
  Batch size = 10


{'eval_loss': 0.19828976690769196,
 'eval_f1': 0.8968253968253967,
 'eval_roc_auc': 0.928796098879061,
 'eval_accuracy': 0.904,
 'eval_runtime': 0.6237,
 'eval_samples_per_second': 200.42,
 'eval_steps_per_second': 20.844,
 'epoch': 10.0}

## Predicting Test Set Labels

In [115]:
sigmoid = torch.nn.Sigmoid()

In [116]:
test_pred=pd.DataFrame(ds["test"])

In [117]:
def predict_text(text):
  encoding = tokenizer(text, return_tensors="pt")
  encoding = {k: v.to(trainer.model.device) for k,v in encoding.items()}
  outputs = trainer.model(**encoding)
  logits = outputs.logits
  probs = sigmoid(logits.squeeze().cpu())
  predictions = np.zeros(probs.shape)
  predictions[np.where(probs >= 0.5)] = 1
  # turn predicted id's into actual label names
  predicted_labels = [id2label[idx] for idx, label in enumerate(predictions) if label == 1.0]
  return predicted_labels


In [118]:
pred=[]
for t in ds["test"]["text"]:
  pred.append("*".join(predict_text(t)))
test_pred["prediction"]=pred

In [119]:
test_pred.head()

Unnamed: 0,text,artikel_id,name,Domestic Violence,Graphic,Sensationalist,Statement of responsibility,prediction
0,hälfte fälle lebten opfer täter dach täter tät...,174301500,WELT ONLINE,1,0,0,0,Domestic Violence
1,zahlen geben fast irgendwo mann versucht partn...,197211893,WELT ONLINE,1,0,0,0,Domestic Violence
2,meist beginnt gewalt harmlos beleidigungen dif...,19CEE7D6EFF3CFA75E0FADDE75C098EE-ANDERNACHMAYEN,Rhein-Zeitung,1,0,0,0,Domestic Violence
3,fast frau erlebt strafrechtlich relevante form...,202305701603,Westfalen-Blatt,1,0,0,0,Domestic Violence
4,staatsministerin außenministerium internationa...,105269769,Rheinische Post,1,0,0,0,Domestic Violence


### Calculate Performance
Calculate Hamming score as indication for predictions performance.

In [120]:
def hamming_score(y_true, y_pred):
    return (
        (y_true & y_pred).sum(axis=1) / (y_true | y_pred).sum(axis=1)
    ).mean()


In [121]:
predicted_labels=pd.DataFrame(pred)[0].str.get_dummies()
actual_labels=test_pred.iloc[:,1:4]

In [122]:
hamming_score(actual_labels, predicted_labels)

0.904

Performance for Articles not labelled "Domestic Violence"

In [123]:
predicted_labels=predicted_labels[test_pred["Domestic Violence"]!=1]
actual_labels=actual_labels[test_pred["Domestic Violence"]!=1]

In [124]:
hamming_score(actual_labels, predicted_labels)

0.0