<a href="https://colab.research.google.com/github/blue-create/langlens/blob/main/models/bert.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### Imports

In [1]:
%%capture
!pip install transformers==4.20.0

In [2]:
%%capture
!pip install datasets

In [3]:
from tqdm import tqdm
import numpy as np
import pandas as pd
import os
import json
import spacy


#preparing
import datasets
from datasets import Dataset
from ast import literal_eval

#modelling

from transformers import TrainingArguments, Trainer
from transformers import BloomConfig, BloomModel
from transformers import AutoTokenizer

#evaluation
import torch
from transformers import EvalPrediction
from sklearn.metrics import f1_score, roc_auc_score, accuracy_score
from transformers import EvalPrediction

### Constants

In [4]:
# connect with google drive
from google.colab import drive
drive.mount('/content/drive')


Mounted at /content/drive


In [5]:
# change cwd
%cd drive/MyDrive/Work/Frontline/data

/content/drive/.shortcut-targets-by-id/1WfnZsqpG1r110J63sMbfS5TpsDOkveiV/data


In [6]:
from scripts import annotations

### Data

In [37]:
# list of dfs with all annotated datasets
dfs={}
for doc in os.listdir("annotated/new_ontology"):
  if doc.endswith(".json"):
    #read json data
    json_data=json.load(open("annotated/new_ontology/"+doc, encoding="utf-8"))
    #convert to dataframe
    data=pd.DataFrame(json_data["documents"])
    data.loc[:,"file"]=doc
    dfs[doc]=data

data=pd.concat(dfs,ignore_index=True)

In [38]:
data=data[data.annotations.apply(len)!=0]

In [39]:
data.loc[:,"artikel_id"]=data.attributes_flat.apply(lambda x: x["artikel_id"])
data.loc[:,"name"]=data.attributes_flat.apply(lambda x: x["name"])
data.loc[:,"titel"]=data.attributes_flat.apply(lambda x: x["titel"])
data.loc[:,"ressort"]=data.attributes_flat.apply(lambda x: x["ressort"])

In [40]:
data.loc[:,"annotations"]=data.annotations.apply(annotations.extract_annotations)
data.loc[:,"dice"]=data.annotations.apply(annotations.calculate_similarity,sim="dice")
data.loc[:,"annotations"]=data.apply(annotations.ground_truth_filter,min_coannotation=1,min_similarity=-1, similarity="dice",axis=1)

In [44]:

data.loc[:,"annotations"]=data.annotations.apply(list)
data.loc[:,"annotations"]=["*".join(i) for i in data.annotations]

In [49]:
# converting the annotation column with one hot encoding
df=pd.merge(data[["text","artikel_id"]],data.annotations.str.get_dummies(sep="*"), left_index=True, right_index=True)

In [51]:
# create a list of all labels
labels=df.columns.to_list()[-4:]

### Split data into train and test

In [54]:
#split train, dev , test sets
df_train, df_val, df_test = np.split(df.sample(frac=1, random_state=42),
                            [int(.8 * len(df)), int(.9 * len(df))])

#### Convert DataFrame to  Dataset

In [55]:
ds=datasets.DatasetDict({"train":Dataset.from_dict(df_train),"val":Dataset.from_dict(df_val),"test":Dataset.from_dict(df_test)})

### Tokenizing & Encoding

In [56]:
tokenizer = AutoTokenizer.from_pretrained("nlptown/bert-base-multilingual-uncased-sentiment")

Downloading:   0%|          | 0.00/39.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/953 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/851k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/112 [00:00<?, ?B/s]

In [57]:
def preprocess_data(df):
  text = df["text"]
  # encode them
  encoding = tokenizer(text, padding="max_length", truncation=True, max_length=128)
  # add labels

  labels_batch = {k: df[k] for k in df.keys() if k in labels}
  # create numpy array of shape (batch_size, num_labels)
  labels_matrix = np.zeros((len(text), len(labels)))
  # fill numpy array
  for idx, label in enumerate(labels):
    labels_matrix[:, idx] = labels_batch[label]
  encoding["labels"] = labels_matrix.tolist()
  return encoding

In [58]:
ds_encoded=ds.map(preprocess_data, batched=True, remove_columns=ds['train'].column_names)

Map:   0%|          | 0/1092 [00:00<?, ? examples/s]

Map:   0%|          | 0/137 [00:00<?, ? examples/s]

Map:   0%|          | 0/137 [00:00<?, ? examples/s]

In [59]:
ds_encoded.set_format("torch")


### Modelling

In [60]:
id2label = {idx:label for idx, label in enumerate(labels)}
label2id = {label:idx for idx, label in enumerate(labels)}

In [61]:
from transformers import AutoModelForSequenceClassification


In [62]:
model = AutoModelForSequenceClassification.from_pretrained("nlptown/bert-base-multilingual-uncased-sentiment",ignore_mismatched_sizes=True,
                                   problem_type="multi_label_classification",
                                   num_labels=len(labels),
                                   id2label=id2label,
                                   label2id=label2id)

Downloading:   0%|          | 0.00/638M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at nlptown/bert-base-multilingual-uncased-sentiment and are newly initialized because the shapes did not match:
- classifier.weight: found shape torch.Size([5, 768]) in the checkpoint and torch.Size([4, 768]) in the model instantiated
- classifier.bias: found shape torch.Size([5]) in the checkpoint and torch.Size([4]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


### Evaluate

In [63]:
# source: https://jesusleal.io/2021/04/21/Longformer-multilabel-classification/
def multi_label_metrics(predictions, labels, threshold=0.5):
    # first, apply sigmoid on predictions which are of shape (batch_size, num_labels)
    sigmoid = torch.nn.Sigmoid()
    probs = sigmoid(torch.Tensor(predictions))
    # next, use threshold to turn them into integer predictions
    y_pred = np.zeros(probs.shape)
    y_pred[np.where(probs >= threshold)] = 1
    # finally, compute metrics
    y_true = labels
    f1_micro_average = f1_score(y_true=y_true, y_pred=y_pred, average='micro')
    roc_auc = roc_auc_score(y_true, y_pred, average = 'micro')
    accuracy = accuracy_score(y_true, y_pred)
    # return as dictionary
    metrics = {'f1': f1_micro_average,
               'roc_auc': roc_auc,
               'accuracy': accuracy}
    return metrics

def compute_metrics(p: EvalPrediction):

    preds = p.predictions[0] if isinstance(p.predictions,
            tuple) else p.predictions
    result = multi_label_metrics(
        predictions=preds,
        labels=p.label_ids)
    return result

In [64]:
ds_encoded["train"]=ds_encoded["train"].rename_column("labels","label")
ds_encoded["test"]=ds_encoded["test"].rename_column("labels","label")
ds_encoded["val"]=ds_encoded["val"].rename_column("labels","label")


In [65]:
args = TrainingArguments(
    f"bloom-finetuned-tagging-german",
    evaluation_strategy = "epoch",
    save_strategy = "epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=6,
    per_device_eval_batch_size=6,
    num_train_epochs=10,
    weight_decay=0.05,
    load_best_model_at_end=True,
    metric_for_best_model="f1",
    #push_to_hub=True,
)

In [66]:
trainer = Trainer(
    model,
    args,
    train_dataset=ds_encoded["train"],
    eval_dataset=ds_encoded["val"],
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

In [67]:
trainer.train()

***** Running training *****
  Num examples = 1092
  Num Epochs = 10
  Instantaneous batch size per device = 6
  Total train batch size (w. parallel, distributed & accumulation) = 6
  Gradient Accumulation steps = 1
  Total optimization steps = 1820


Epoch,Training Loss,Validation Loss,F1,Roc Auc,Accuracy
1,No log,0.170574,0.866667,0.907543,0.854015
2,No log,0.143919,0.922509,0.945255,0.912409
3,0.163700,0.146431,0.900369,0.930657,0.890511
4,0.163700,0.171741,0.882353,0.919708,0.875912
5,0.163700,0.155985,0.915751,0.942822,0.912409
6,0.046300,0.184791,0.908425,0.937956,0.905109
7,0.046300,0.195461,0.908425,0.937956,0.905109
8,0.046300,0.177641,0.89781,0.931873,0.89781
9,0.020300,0.208203,0.905109,0.93674,0.905109
10,0.020300,0.216267,0.905109,0.93674,0.905109


***** Running Evaluation *****
  Num examples = 137
  Batch size = 6
Saving model checkpoint to bloom-finetuned-tagging-german/checkpoint-182
Configuration saved in bloom-finetuned-tagging-german/checkpoint-182/config.json
Model weights saved in bloom-finetuned-tagging-german/checkpoint-182/pytorch_model.bin
tokenizer config file saved in bloom-finetuned-tagging-german/checkpoint-182/tokenizer_config.json
Special tokens file saved in bloom-finetuned-tagging-german/checkpoint-182/special_tokens_map.json
***** Running Evaluation *****
  Num examples = 137
  Batch size = 6
Saving model checkpoint to bloom-finetuned-tagging-german/checkpoint-364
Configuration saved in bloom-finetuned-tagging-german/checkpoint-364/config.json
Model weights saved in bloom-finetuned-tagging-german/checkpoint-364/pytorch_model.bin
tokenizer config file saved in bloom-finetuned-tagging-german/checkpoint-364/tokenizer_config.json
Special tokens file saved in bloom-finetuned-tagging-german/checkpoint-364/special_

TrainOutput(global_step=1820, training_loss=0.06555591887170142, metrics={'train_runtime': 468.7247, 'train_samples_per_second': 23.297, 'train_steps_per_second': 3.883, 'total_flos': 718306079662080.0, 'train_loss': 0.06555591887170142, 'epoch': 10.0})

## Predicting

In [68]:
sigmoid = torch.nn.Sigmoid()

In [69]:
test_pred=pd.DataFrame(ds["test"])

In [70]:
def predict_text(text):
  encoding = tokenizer(text, return_tensors="pt")
  encoding = {k: v.to(trainer.model.device) for k,v in encoding.items()}
  outputs = trainer.model(**encoding)
  logits = outputs.logits
  probs = sigmoid(logits.squeeze().cpu())
  predictions = np.zeros(probs.shape)
  predictions[np.where(probs >= 0.5)] = 1
  # turn predicted id's into actual label names
  predicted_labels = [id2label[idx] for idx, label in enumerate(predictions) if label == 1.0]
  return predicted_labels


In [71]:
pred=[]
for t in ds["test"]["text"]:
  pred.append(predict_text(t))
test_pred["prediction"]=pred

### Calculate Performance
Calculate Hamming score as indication for predictions performance.

In [72]:
temp=pd.DataFrame(columns=["Domestic Violence",	"Graphic",	"Sensationalist",	"Statement of responsibility"])

In [73]:
def hamming_score(y_true, y_pred):
    return (
        (y_true & y_pred).sum(axis=1) / (y_true | y_pred).sum(axis=1)
    ).mean()


In [74]:
predicted_labels=pd.DataFrame(pred)[0].str.get_dummies()
predicted_labels=pd.concat([temp,predicted_labels]).fillna(0)
actual_labels=test_pred.iloc[:,-5:-1]

In [75]:
hamming_score(actual_labels, predicted_labels)

0.9416058394160584

Performance for Articles not labelled "Domestic Violence"

In [76]:
predicted_labels=predicted_labels[test_pred["Domestic Violence"]!=1]
actual_labels=actual_labels[test_pred["Domestic Violence"]!=1]

In [77]:
hamming_score(actual_labels, predicted_labels)

0.5333333333333333