# Purpose
This file shows the steps taken to load the pre-trained model distilbert and train it with a sample data set

https://github.com/NielsRogge/Transformers-Tutorials/blob/master/BERT/Fine_tuning_BERT_(and_friends)_for_multi_label_text_classification.ipynb

### Imports

In [1]:
%%capture
!pip install transformers==4.20.0

In [2]:
%%capture
!python -m spacy download de_core_news_lg

In [3]:
%%capture
!pip install datasets

In [4]:

from tqdm import tqdm
import numpy as np
import pandas as pd
import os
import json
import spacy
# prepare
import datasets
from datasets import Dataset
# encode
from transformers import AutoTokenizer
import tensorflow as tf
import torch
# modelling
from transformers import AutoModelForSequenceClassification
from transformers import TrainingArguments, Trainer
#evaluation 
from sklearn.metrics import f1_score, roc_auc_score, accuracy_score
from transformers import EvalPrediction

### Constants

In [5]:
# paths
FILTERED_PATH="filtered_4_26"
ANNOTATED_PATH="annotated"

In [6]:
# connect with google drive
from google.colab import drive
drive.mount('/content/drive')


Mounted at /content/drive


In [7]:
# change cwd
%cd drive/MyDrive/Work/Frontline/data

/content/drive/.shortcut-targets-by-id/1WfnZsqpG1r110J63sMbfS5TpsDOkveiV/data


### Prepare data

In [30]:
dfs=[]
for doc in os.listdir("annotated"):
  if doc.startswith("annotations"):
    #read json data
    json_data=json.load(open("annotated/"+doc, encoding="utf-8"))
    #convert to dataframe
    data=pd.DataFrame(json_data["documents"])
    #for now: filter out paragraphs that have not been annotated 
    data=data[data["annotations"].apply(len)>0]    
    dfs.append(data)


In [31]:
# merge jsons
data=pd.concat(dfs)
data=data.reset_index(drop=True)

In [32]:
data["tags"]=[""]*data.shape[0]

In [33]:
# extract document labels
# ASSUMPTION: all annotations are document labels, ie. do not have start and end as those are ignored
 
for idx, entry in data.iterrows():
  # in case there are more than 1 label
  for label in range(len(entry["annotations"])):
    data.loc[idx, "tags"]+=entry["annotations"][label]["concept"]["preferred_label"]["name"]+"*"

Change old label names to new label names

In [34]:
data.tags=data.tags.str.replace("NA","Domestic Violence")
data.tags=data.tags.str.replace("Victim blaming","Statement of responsibility")

In [35]:
df=data[["tags", "text"]]

In [36]:
# converting the annotation column with one hot encoding
df=pd.merge(df,df["tags"].str.get_dummies(sep="*"), left_index=True, right_index=True)
# dropping the tags column
df=df.drop("tags",1)
df.head()

  df=df.drop("tags",1)


Unnamed: 0,text,Domestic Violence,Graphic,Sensationalist,Statement of responsibility
0,Ein Mann (25) ist jetzt vom Schöffengericht am...,1,0,0,0
1,Als Zeugin trat die Ex-Lebensgefährtin des Syr...,0,0,0,1
2,"Zunächst leugnete der Angeklagte, dass es über...",0,0,0,1
3,Das Schöffengericht hatte es in diesem Fall of...,0,0,1,1
4,"""Gewalt in der Familie ist weder Privatsache n...",1,0,0,0


In [37]:
# create a list of all labels
labels=df.columns.to_list()[1:]

### Preprocessing Data

In [38]:
# needs to be imported after setting path
import preprocessing

In [39]:
# Load model
spacy_mod = spacy.load("de_core_news_lg", disable=['ner', 'parser', 'tagger'])

In [40]:
# read custom stopwords

# open list of custom stopwords
custom_stop_words= open("custom_stopwords.txt", encoding="utf-8").read().split()

# add custom stopwords to model
for word in custom_stop_words:
  spacy_mod.Defaults.stop_words.add(word)

In [41]:
# convert corpus to language object
spacy_lang = []
for i, doc in tqdm(df.iterrows()): 
  spacy_lang.append(spacy_mod("".join(doc['text'])))

542it [00:04, 119.42it/s]


In [42]:
# preprocess: remove stopwords
spacy_cleaned = []
for doc in tqdm(spacy_lang): 
    spacy_cleaned.append(preprocessing.preprocess(doc, remove_ent=True))

100%|██████████| 542/542 [00:00<00:00, 9979.16it/s]


In [43]:
# replace the text column with the cleaned texts
df["text"]=spacy_cleaned
# convert list of words to string
df["text"]= [" ".join(ls) for ls in df["text"]]

### Split data into train and test

In [44]:
#split train, dev , test sets
df_train, df_val, df_test = np.split(df.sample(frac=1, random_state=42),
                            [int(.8 * len(df)), int(.9 * len(df))])

#### Convert DataFrame to  Dataset

In [45]:
ds=datasets.DatasetDict({"train":Dataset.from_dict(df_train),"val":Dataset.from_dict(df_val),"test":Dataset.from_dict(df_test)})

### Tokenizing & Encoding

In [26]:
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-multilingual-cased")

Downloading:   0%|          | 0.00/29.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/466 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/972k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.87M [00:00<?, ?B/s]

In [46]:
def preprocess_data(df):
  text = df["text"]
  # encode them
  encoding = tokenizer(text, padding="max_length", truncation=True, max_length=128)
  # add labels
  
  labels_batch = {k: df[k] for k in df.keys() if k in labels}
  # create numpy array of shape (batch_size, num_labels)
  labels_matrix = np.zeros((len(text), len(labels)))
  # fill numpy array
  for idx, label in enumerate(labels):
    labels_matrix[:, idx] = labels_batch[label]
  encoding["labels"] = labels_matrix.tolist()
  return encoding

In [47]:
ds_encoded=ds.map(preprocess_data, batched=True, remove_columns=ds['train'].column_names)

Map:   0%|          | 0/433 [00:00<?, ? examples/s]

Map:   0%|          | 0/54 [00:00<?, ? examples/s]

Map:   0%|          | 0/55 [00:00<?, ? examples/s]

### Modelling

In [48]:
# some model expect column name to be "label" not "labels"
ds_encoded["train"]=ds_encoded["train"].rename_column("labels","label")
ds_encoded["test"]=ds_encoded["test"].rename_column("labels","label")
ds_encoded["val"]=ds_encoded["val"].rename_column("labels","label")

In [49]:
id2label = {idx:label for idx, label in enumerate(labels)}
label2id = {label:idx for idx, label in enumerate(labels)}

In [50]:
model = AutoModelForSequenceClassification.from_pretrained("distilbert-base-multilingual-cased", 
                                                           problem_type="multi_label_classification", 
                                                           num_labels=len(labels),
                                                           id2label=id2label,
                                                           label2id=label2id)

Downloading:   0%|          | 0.00/517M [00:00<?, ?B/s]

Some weights of the model checkpoint at distilbert-base-multilingual-cased were not used when initializing DistilBertForSequenceClassification: ['vocab_transform.bias', 'vocab_transform.weight', 'vocab_projector.weight', 'vocab_layer_norm.weight', 'vocab_projector.bias', 'vocab_layer_norm.bias']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-multilingual-cased and are newly initialized: ['pre_classifier.weight', 'classif

In [51]:
# source: https://jesusleal.io/2021/04/21/Longformer-multilabel-classification/
def multi_label_metrics(predictions, labels, threshold=0.5):
    # first, apply sigmoid on predictions which are of shape (batch_size, num_labels)
    sigmoid = torch.nn.Sigmoid()
    probs = sigmoid(torch.Tensor(predictions))
    # next, use threshold to turn them into integer predictions
    y_pred = np.zeros(probs.shape)
    y_pred[np.where(probs >= threshold)] = 1
    # finally, compute metrics
    y_true = labels
    f1_micro_average = f1_score(y_true=y_true, y_pred=y_pred, average='micro')
    roc_auc = roc_auc_score(y_true, y_pred, average = 'micro')
    accuracy = accuracy_score(y_true, y_pred)
    # return as dictionary
    metrics = {'f1': f1_micro_average,
               'roc_auc': roc_auc,
               'accuracy': accuracy}
    return metrics

def compute_metrics(p: EvalPrediction):
    preds = p.predictions[0] if isinstance(p.predictions, 
            tuple) else p.predictions
    result = multi_label_metrics(
        predictions=preds, 
        labels=p.label_ids)
    return result

In [52]:
args = TrainingArguments(
    f"bert-finetuned-tagging-german",
    evaluation_strategy = "epoch",
    save_strategy = "epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=6,
    per_device_eval_batch_size=6,
    num_train_epochs=10,
    weight_decay=0.01,
    load_best_model_at_end=True,
    metric_for_best_model="f1",
    #push_to_hub=True,
)

#### Training

In [53]:
trainer = Trainer(
    model,
    args,
    train_dataset=ds_encoded["train"],
    eval_dataset=ds_encoded["val"],
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

In [54]:
trainer.train()

***** Running training *****
  Num examples = 433
  Num Epochs = 10
  Instantaneous batch size per device = 6
  Total train batch size (w. parallel, distributed & accumulation) = 6
  Gradient Accumulation steps = 1
  Total optimization steps = 730


Epoch,Training Loss,Validation Loss,F1,Roc Auc,Accuracy
1,No log,0.23426,0.87037,0.91358,0.87037
2,No log,0.232301,0.87037,0.91358,0.87037
3,No log,0.207488,0.87037,0.91358,0.87037
4,No log,0.176494,0.87037,0.91358,0.87037
5,No log,0.181101,0.895238,0.92284,0.87037
6,No log,0.186627,0.884615,0.91358,0.851852
7,0.186500,0.190547,0.895238,0.92284,0.87037
8,0.186500,0.193924,0.895238,0.92284,0.87037
9,0.186500,0.199165,0.90566,0.932099,0.888889
10,0.186500,0.197711,0.90566,0.932099,0.888889


***** Running Evaluation *****
  Num examples = 54
  Batch size = 6
Saving model checkpoint to bert-finetuned-tagging-german/checkpoint-73
Configuration saved in bert-finetuned-tagging-german/checkpoint-73/config.json
Model weights saved in bert-finetuned-tagging-german/checkpoint-73/pytorch_model.bin
tokenizer config file saved in bert-finetuned-tagging-german/checkpoint-73/tokenizer_config.json
Special tokens file saved in bert-finetuned-tagging-german/checkpoint-73/special_tokens_map.json
***** Running Evaluation *****
  Num examples = 54
  Batch size = 6
Saving model checkpoint to bert-finetuned-tagging-german/checkpoint-146
Configuration saved in bert-finetuned-tagging-german/checkpoint-146/config.json
Model weights saved in bert-finetuned-tagging-german/checkpoint-146/pytorch_model.bin
tokenizer config file saved in bert-finetuned-tagging-german/checkpoint-146/tokenizer_config.json
Special tokens file saved in bert-finetuned-tagging-german/checkpoint-146/special_tokens_map.json
*

TrainOutput(global_step=730, training_loss=0.1567417118647327, metrics={'train_runtime': 234.6702, 'train_samples_per_second': 18.451, 'train_steps_per_second': 3.111, 'total_flos': 143401073571840.0, 'train_loss': 0.1567417118647327, 'epoch': 10.0})

In [55]:
trainer.evaluate()

***** Running Evaluation *****
  Num examples = 54
  Batch size = 6


{'eval_loss': 0.19916515052318573,
 'eval_f1': 0.9056603773584906,
 'eval_roc_auc': 0.9320987654320987,
 'eval_accuracy': 0.8888888888888888,
 'eval_runtime': 0.3416,
 'eval_samples_per_second': 158.091,
 'eval_steps_per_second': 26.349,
 'epoch': 10.0}

## Predicting Test Set Labels

In [56]:
sigmoid = torch.nn.Sigmoid()

In [57]:
test_pred=pd.DataFrame(ds["test"])

In [58]:
def predict_text(text):
  encoding = tokenizer(text, return_tensors="pt")
  encoding = {k: v.to(trainer.model.device) for k,v in encoding.items()}
  outputs = trainer.model(**encoding)
  logits = outputs.logits
  probs = sigmoid(logits.squeeze().cpu())
  predictions = np.zeros(probs.shape)
  predictions[np.where(probs >= 0.5)] = 1
  # turn predicted id's into actual label names
  predicted_labels = [id2label[idx] for idx, label in enumerate(predictions) if label == 1.0]
  return predicted_labels
  

In [73]:
pred=[]
for t in ds["test"]["text"]:
  pred.append("*".join(predict_text(t)))
test_pred["prediction"]=pred

In [87]:
test_pred.head()

Unnamed: 0,text,Domestic Violence,Graphic,Sensationalist,Statement of responsibility,prediction
0,polizei samstagmorgen frau bezirk hilfe geeilt...,1,0,0,0,Domestic Violence
1,existiert frauenhaus menschen gelebt schicksal...,1,0,0,0,Domestic Violence
2,bundesregierung nimmt thema ernst gefragt poli...,1,0,0,0,Domestic Violence
3,fälle liegen dokumente ärztlichen untersuchung...,1,0,0,0,Domestic Violence
4,lnw frau machete schwere bauchverletzungen zug...,1,0,0,0,Domestic Violence


### Calculate Performance 
Calculate Hamming score as indication for predictions performance. 

In [85]:
def hamming_score(y_true, y_pred):
    return (
        (y_true & y_pred).sum(axis=1) / (y_true | y_pred).sum(axis=1)
    ).mean()


In [80]:
predicted_labels=pd.DataFrame(pred)[0].str.get_dummies()
actual_labels=test_pred.iloc[:,1:4]

In [86]:
hamming_score(actual_labels, predicted_labels)

0.8518518518518519