# Purpose
This file shows the steps taken to load the pre-trained model distilbert and train it with a sample data set

https://github.com/NielsRogge/Transformers-Tutorials/blob/master/BERT/Fine_tuning_BERT_(and_friends)_for_multi_label_text_classification.ipynb

### Imports

In [1]:
#%%capture
#!pip install transformers

In [2]:
%%capture
!python -m spacy download de_core_news_lg

In [3]:
%%capture
!pip install datasets

In [4]:
import warnings
warnings.filterwarnings('ignore')
from tqdm import tqdm
import numpy as np
import pandas as pd
import os
import json
import spacy
# prepare
import datasets
from datasets import Dataset
# encode
from transformers import AutoTokenizer
import tensorflow as tf
import torch
# modelling
from transformers import AutoModelForSequenceClassification
from transformers import TrainingArguments, Trainer
#evaluation 
from sklearn.metrics import f1_score, roc_auc_score, accuracy_score
from transformers import EvalPrediction

### Constants

In [5]:
# paths
FILTERED_PATH="filtered_4_26"
ANNOTATED_PATH="annotated"

### Data

In [6]:
# # connect with google drive
# from google.colab import drive
# drive.mount('/content/drive')


In [7]:
# change cwd
%cd drive/MyDrive/Work/Frontline/data

/content/drive/.shortcut-targets-by-id/1WfnZsqpG1r110J63sMbfS5TpsDOkveiV/data


In [8]:
#read json data
json_data=json.load(open(ANNOTATED_PATH+"/annotations_05_08.json"))

### Prepare data

In [9]:
#convert to dataframe
data=pd.DataFrame(json_data["documents"])

In [10]:
#for now: filter out paragraphs that have not been annotated 
data=data[data["annotations"].apply(len)>0]
#reset index
data=data.reset_index(drop=True)
data.head()

Unnamed: 0,id,text,annotations,attributes_flat
0,4572dea4-6a08-4f1e-b312-5821112bb5f5,Ein Mann (25) ist jetzt vom Schöffengericht am...,"[{'str_start': None, 'str_stop': None, 'annota...",{'artikel_id': 'F3A4578D33A8603DF0573D3DE3CB26...
1,0bcada32-8dc5-41cf-b83b-67d2e742bada,Als Zeugin trat die Ex-Lebensgefährtin des Syr...,"[{'str_start': None, 'str_stop': None, 'annota...",{'artikel_id': 'F3A4578D33A8603DF0573D3DE3CB26...
2,a30791b9-522e-45c1-8b33-79d4165282af,"Zunächst leugnete der Angeklagte, dass es über...","[{'str_start': None, 'str_stop': None, 'annota...",{'artikel_id': 'F3A4578D33A8603DF0573D3DE3CB26...
3,043e3909-bcdd-4c6b-a54f-f947d46ad18e,Das Schöffengericht hatte es in diesem Fall of...,"[{'str_start': None, 'str_stop': None, 'annota...",{'artikel_id': 'F3A4578D33A8603DF0573D3DE3CB26...
4,08cef91c-6d73-472c-8349-07a5b72009d1,"""Gewalt in der Familie ist weder Privatsache n...","[{'str_start': None, 'str_stop': None, 'annota...","{'artikel_id': 'IRA-82182598', 'name': 'SÜDWES..."


In [11]:
data["tags"]=[""]*data.shape[0]

In [12]:
# extract document labels
# ASSUMPTION: all annotations are document labels, ie. do not have start and end as those are ignored
 
for idx, entry in data.iterrows():
  # in case there are more than 1 label
  for label in range(len(entry["annotations"])):
    data.loc[idx, "tags"]+=entry["annotations"][label]["concept"]["preferred_label"]["name"]+"*"

In [13]:
df=data[["tags", "text"]]

In [14]:
# converting the annotation column with one hot encoding
df=pd.merge(df,df["tags"].str.get_dummies(sep="*"), left_index=True, right_index=True)
# dropping the tags column
df=df.drop("tags",1)
df.head()

Unnamed: 0,text,Graphic,NA,Sensationalist,Victim blaming
0,Ein Mann (25) ist jetzt vom Schöffengericht am...,0,1,0,0
1,Als Zeugin trat die Ex-Lebensgefährtin des Syr...,0,0,0,1
2,"Zunächst leugnete der Angeklagte, dass es über...",0,0,0,1
3,Das Schöffengericht hatte es in diesem Fall of...,0,0,1,1
4,"""Gewalt in der Familie ist weder Privatsache n...",0,1,0,0


In [15]:
# create a list of all labels
labels=df.columns.to_list()[1:]

### Preprocessing Data

In [16]:
# needs to be imported after setting path
import preprocessing

In [17]:
# Load model
spacy_mod = spacy.load("de_core_news_lg", disable=['ner', 'parser', 'tagger'])

In [18]:
# read custom stopwords

# open list of custom stopwords
custom_stop_words= open("custom_stopwords.txt").read().split()

# add custom stopwords to model
for word in custom_stop_words:
  spacy_mod.Defaults.stop_words.add(word)

In [19]:
# convert corpus to language object
spacy_lang = []
for i, doc in tqdm(df.iterrows()): 
  spacy_lang.append(spacy_mod("".join(doc['text'])))

124it [00:02, 57.52it/s]


In [20]:
# preprocess: remove stopwords
spacy_cleaned = []
for doc in tqdm(spacy_lang): 
    spacy_cleaned.append(preprocessing.preprocess(doc, remove_ent=True))

100%|██████████| 124/124 [00:00<00:00, 3215.62it/s]


In [21]:
# replace the text column with the cleaned texts
df["text"]=spacy_cleaned
# convert list of words to string
df["text"]= [" ".join(ls) for ls in df["text"]]

### Split data into train and test

In [22]:
#split train, dev , test sets
df_train, df_val, df_test = np.split(df.sample(frac=1, random_state=42),
                            [int(.8 * len(df)), int(.9 * len(df))])

#### Convert DataFrame to  Dataset

In [23]:
ds=datasets.DatasetDict({"train":Dataset.from_dict(df_train),"val":Dataset.from_dict(df_val),"test":Dataset.from_dict(df_test)})

### Tokenizing & Encoding

In [24]:
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-multilingual-cased")

In [25]:
def preprocess_data(df):
  text = df["text"]
  # encode them
  encoding = tokenizer(text, padding="max_length", truncation=True, max_length=128)
  # add labels
  
  labels_batch = {k: df[k] for k in df.keys() if k in labels}
  # create numpy array of shape (batch_size, num_labels)
  labels_matrix = np.zeros((len(text), len(labels)))
  # fill numpy array
  for idx, label in enumerate(labels):
    labels_matrix[:, idx] = labels_batch[label]
  encoding["labels"] = labels_matrix.tolist()
  return encoding

In [26]:
ds_encoded=ds.map(preprocess_data, batched=True, remove_columns=ds['train'].column_names)

Map:   0%|          | 0/99 [00:00<?, ? examples/s]

Map:   0%|          | 0/12 [00:00<?, ? examples/s]

Map:   0%|          | 0/13 [00:00<?, ? examples/s]

### Modelling

In [27]:
id2label = {idx:label for idx, label in enumerate(labels)}
label2id = {label:idx for idx, label in enumerate(labels)}

In [28]:
model = AutoModelForSequenceClassification.from_pretrained("distilbert-base-multilingual-cased", 
                                                           problem_type="multi_label_classification", 
                                                           num_labels=len(labels),
                                                           id2label=id2label,
                                                           label2id=label2id)

Some weights of the model checkpoint at distilbert-base-multilingual-cased were not used when initializing DistilBertForSequenceClassification: ['vocab_transform.bias', 'vocab_projector.weight', 'vocab_layer_norm.weight', 'vocab_projector.bias', 'vocab_layer_norm.bias', 'vocab_transform.weight']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'pre_classifie

In [29]:
ds["train"]

Dataset({
    features: ['text', 'Graphic', 'NA', 'Sensationalist', 'Victim blaming'],
    num_rows: 99
})

In [32]:
args = TrainingArguments(
    f"bert-finetuned-tagging-german",
    evaluation_strategy = "epoch",
    save_strategy = "epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=6,
    per_device_eval_batch_size=6,
    num_train_epochs=5,
    weight_decay=0.05,
    load_best_model_at_end=True,
    metric_for_best_model="f1",
    #push_to_hub=True,
)

In [33]:
# source: https://jesusleal.io/2021/04/21/Longformer-multilabel-classification/
def multi_label_metrics(predictions, labels, threshold=0.5):
    # first, apply sigmoid on predictions which are of shape (batch_size, num_labels)
    sigmoid = torch.nn.Sigmoid()
    probs = sigmoid(torch.Tensor(predictions))
    # next, use threshold to turn them into integer predictions
    y_pred = np.zeros(probs.shape)
    y_pred[np.where(probs >= threshold)] = 1
    # finally, compute metrics
    y_true = labels
    f1_micro_average = f1_score(y_true=y_true, y_pred=y_pred, average='micro')
    roc_auc = roc_auc_score(y_true, y_pred, average = 'micro')
    accuracy = accuracy_score(y_true, y_pred)
    # return as dictionary
    metrics = {'f1': f1_micro_average,
               'roc_auc': roc_auc,
               'accuracy': accuracy}
    return metrics

def compute_metrics(p: EvalPrediction):
    preds = p.predictions[0] if isinstance(p.predictions, 
            tuple) else p.predictions
    result = multi_label_metrics(
        predictions=preds, 
        labels=p.label_ids)
    return result

#### Training

In [34]:
trainer = Trainer(
    model,
    args,
    train_dataset=ds_encoded["train"],
    eval_dataset=ds_encoded["val"],
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

In [35]:
trainer.train()

You're using a DistilBertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss,F1,Roc Auc,Accuracy
1,No log,0.571549,0.461538,0.62605,0.5
2,No log,0.587796,0.461538,0.62605,0.5
3,No log,0.598916,0.461538,0.62605,0.5
4,No log,0.590964,0.461538,0.62605,0.5
5,No log,0.59326,0.461538,0.62605,0.5


TrainOutput(global_step=85, training_loss=0.4023049747242647, metrics={'train_runtime': 722.4228, 'train_samples_per_second': 0.685, 'train_steps_per_second': 0.118, 'total_flos': 16393425269760.0, 'train_loss': 0.4023049747242647, 'epoch': 5.0})

In [42]:
trainer.evaluate()

{'eval_loss': 0.5715487003326416,
 'eval_f1': 0.4615384615384615,
 'eval_roc_auc': 0.6260504201680672,
 'eval_accuracy': 0.5,
 'eval_runtime': 2.4182,
 'eval_samples_per_second': 4.962,
 'eval_steps_per_second': 0.827,
 'epoch': 5.0}

## Predicting Test Set Labels

In [79]:
sigmoid = torch.nn.Sigmoid()

In [None]:
test_pred=pd.DataFrame(ds["test"])

In [69]:
def predict_text(text):
  encoding = tokenizer(text, return_tensors="pt")
  encoding = {k: v.to(trainer.model.device) for k,v in encoding.items()}

  outputs = trainer.model(**encoding)
  logits = outputs.logits
  probs = sigmoid(logits.squeeze().cpu())
  predictions = np.zeros(probs.shape)
  predictions[np.where(probs >= 0.5)] = 1
  # turn predicted id's into actual label names
  predicted_labels = [id2label[idx] for idx, label in enumerate(predictions) if label == 1.0]
  return predicted_labels
  

In [105]:
pred=[]
for t in ds["test"]["text"]:
  pred.append(predict_text(t))
test_pred["prediction"]=pred

In [109]:
test_pred.head()

Unnamed: 0,text,Graphic,NA,Sensationalist,Victim blaming,prediction
0,frau trennen geschlagen schal gewürgt bewussts...,0,0,0,1,[NA]
1,schnell klar mann frau messer getötet versucht...,0,1,0,0,[NA]
2,straße samstag frau taxifahrer erschossen tatv...,0,1,0,0,[NA]
3,tatsächlich frau gefecht setzen söhnchen verbr...,0,0,1,1,[NA]
4,autofahrerin gewaltsamen auseinandersetzung me...,0,1,0,0,[NA]
