In [1]:
!pip install -q emoji

In [8]:
import pandas as pd
from datasets import Dataset
from transformers import AutoTokenizer
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer
import numpy as np
import torch

# Load CSV into a Pandas DataFrame
df = pd.read_csv('initial_train_bert.csv')
df = df[['example', 'label']]
df = df[df['label'].notna()]
df['label'] = df['label'].astype(int)

In [9]:
df.head()

Unnamed: 0,example,label
0,hier twittert das Team von Petra Pau/Marzahn-H...,0
1,Die soziale Opposition: Feministisch. Sozialis...,0
2,MdB DIE LINKE l Stellvertretende Fraktionsvors...,0
3,MdB für die Menschen in der Städteregion Aache...,0
4,Mitglied des Bundestages | Sprecherin für Arbe...,0


In [11]:
# Define label mappings
id2label = {
    0: "ökonomisch neutral + libertär",
    1: "ökonomisch links + libertär",
    2: "ökonomisch neutral + libertär/autoritär",
    3: "ökonomisch neutral + autoritär",
    4: "ökonomisch rechts + libertär/autoritär",
    5: "ökonomisch rechts + autoritär",
    6: "Keine Kategorie"
}

label2id = {
    "ökonomisch neutral + libertär": 0,
    "ökonomisch links + libertär": 1,
    "ökonomisch neutral + libertär/autoritär": 2,
    "ökonomisch neutral + autoritär": 3,
    "ökonomisch rechts + libertär/autoritär": 4,
    "ökonomisch rechts + autoritär": 5,
    "Keine Kategorie": 6
}

In [12]:
df["LABEL"] = df["label"].map(id2label.get)

In [13]:
df.head()

Unnamed: 0,example,label,LABEL
0,hier twittert das Team von Petra Pau/Marzahn-H...,0,ökonomisch neutral + libertär
1,Die soziale Opposition: Feministisch. Sozialis...,0,ökonomisch neutral + libertär
2,MdB DIE LINKE l Stellvertretende Fraktionsvors...,0,ökonomisch neutral + libertär
3,MdB für die Menschen in der Städteregion Aache...,0,ökonomisch neutral + libertär
4,Mitglied des Bundestages | Sprecherin für Arbe...,0,ökonomisch neutral + libertär


#### Converting emojis to text using pip package emoji

In [31]:
import emoji
import string
import re

def demojize(text):
    # Convert emoji to text
    processed_text = emoji.demojize(text, language='de')
    # Remove ':' from converting the emojis to text
    for char in string.punctuation.replace('.', ''):
        processed_text = processed_text.replace(char, ' ')
    # Remove trailing whitespace
    processed_text = re.sub(r'\s+', ' ', processed_text).strip()
    return processed_text

df['example'] = df['example'].apply(lambda x: demojize(x))

In [34]:
df['example'][1]

'Die soziale Opposition Feministisch. Sozialistisch. Friedlich. Für dich taube sicherheitsweste laubbaum erhobene faust'

In [35]:
# Load the BERT tokenizer
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-german-cased")

def tokenize_function(examples):
    return tokenizer(examples["example"], padding="max_length", truncation=True, return_tensors="pt")

In [36]:
# Split into training and testing data
training_data = df.sample(frac=0.8, random_state=25)
testing_data = df.drop(training_data.index)

In [37]:
training_data['label'].value_counts()

label
6    76
3    56
2    54
5    48
1    43
0    35
4    23
Name: count, dtype: int64

In [38]:
testing_data['label'].value_counts()

label
6    26
3    15
1    12
2    11
5     8
4     7
0     5
Name: count, dtype: int64

In [39]:
# Load datasets
training_data = Dataset.from_pandas(training_data)
testing_data = Dataset.from_pandas(testing_data)

tokenized_training_data = training_data.map(tokenize_function, batched=True)
tokenized_testing_data = testing_data.map(tokenize_function, batched=True)

Map:   0%|          | 0/335 [00:00<?, ? examples/s]

Map:   0%|          | 0/84 [00:00<?, ? examples/s]

In [40]:
tokenized_training_data['example'][11]

'Fraktionsvorsitzende Grüne Sachsen Anhalt sonnenblume Klimaschutz Mobilitätswende und Demokratiesicherung'

In [41]:
from transformers import DataCollatorWithPadding
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)
data_collator

DataCollatorWithPadding(tokenizer=DistilBertTokenizerFast(name_or_path='distilbert-base-german-cased', vocab_size=31102, model_max_length=512, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'}, clean_up_tokenization_spaces=True), padding=True, max_length=None, pad_to_multiple_of=None, return_tensors='pt')

In [42]:
import evaluate
accuracy = evaluate.load("accuracy")

In [47]:
from sklearn.metrics import precision_recall_fscore_support, accuracy_score

# Good link about which average to choose: https://simonhessner.de/why-are-precision-recall-and-f1-score-equal-when-using-micro-averaging-in-a-multi-class-problem/
# micro: Calculate metrics globally by counting the total true positives, false negatives and false positives.
# macro: Calculate metrics for each label, and find their unweighted mean. This does not take label imbalance into account.

def compute_metrics(pred):
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    precision, recall, f1, _ = precision_recall_fscore_support(labels, preds, average='macro')
    acc = accuracy_score(labels, preds)
    return {
        'accuracy': acc,
        'f1': f1,
        'precision': precision,
        'recall': recall
    }

In [48]:
from torch import nn

# Model and training setup
model = AutoModelForSequenceClassification.from_pretrained(
    "distilbert-base-german-cased", num_labels=7, id2label=id2label, label2id=label2id
)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-german-cased and are newly initialized: ['classifier.weight', 'pre_classifier.weight', 'classifier.bias', 'pre_classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [49]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device

device(type='cuda')

In [50]:
torch.cuda.get_device_name(0)

'NVIDIA A30'

In [51]:
model.to(device) 

DistilBertForSequenceClassification(
  (distilbert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(31102, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0-5): 6 x TransformerBlock(
          (attention): MultiHeadSelfAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)
 

In [52]:
training_args = TrainingArguments(
    output_dir="bert420",              # output directory
    learning_rate=2e-5,                # total # of training epochs
    per_device_train_batch_size=16,    # batch size per device during training
    per_device_eval_batch_size=16,     # batch size for evaluation
    num_train_epochs=30,               # total # of training epochs
    weight_decay=0.01,                 # strength of weight decay  
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    push_to_hub=False,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_training_data,
    eval_dataset=tokenized_testing_data,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

In [53]:
# Fine-tune the model
trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,No log,1.850875,0.297619,0.065531,0.043029,0.137363
2,No log,1.81891,0.345238,0.153412,0.136112,0.21685
3,No log,1.708967,0.392857,0.21645,0.175429,0.288004
4,No log,1.515127,0.52381,0.34527,0.314218,0.396154
5,No log,1.343175,0.559524,0.398559,0.422902,0.431652
6,No log,1.181091,0.619048,0.505371,0.495991,0.527239
7,No log,1.055348,0.630952,0.537731,0.554616,0.557076
8,No log,0.932711,0.702381,0.617492,0.706012,0.63168
9,No log,0.832946,0.714286,0.650496,0.739177,0.680382
10,No log,0.758453,0.75,0.746473,0.771848,0.745635


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


TrainOutput(global_step=630, training_loss=0.44645624406754025, metrics={'train_runtime': 272.4757, 'train_samples_per_second': 36.884, 'train_steps_per_second': 2.312, 'total_flos': 1331416065484800.0, 'train_loss': 0.44645624406754025, 'epoch': 30.0})

In [54]:
trainer.save_model("models/bert420")

# Evaluation on labelled test data set, 200 profile descriptions

In [57]:
# Load CSV into a Pandas DataFrame
df_test = pd.read_csv('labeled_testdata_bert.csv')
df_test = df_test[['example', 'label']]
df_test = df_test[df_test['label'].notna()]
df_test['label'] = df_test['label'].astype(int)

In [58]:
df_test

Unnamed: 0,example,label
0,Offizieller Twitter Account von Außenministeri...,1
1,Dies ist ein Archiv des Twitter-Accounts von A...,1
2,"Landtagsabgeordnete der @cdufraktionmv, stellv...",3
3,Finanzsenator der Freien und Hansestadt Hambur...,2
4,Vorstehender der Bundestagsfraktion der Altern...,5
...,...,...
195,Sprecherin für Familie und Bildung der @gruene...,1
196,stellv. Vorsitzender Eisenbahn- und Verkehrsge...,2
197,Mitglied des Europäischen Parlaments,6
198,"#MdL, Parlamentarischer Geschäftsführer und Sp...",1


In [59]:
df_test['example'] = df_test['example'].apply(lambda x: demojize(x))

In [62]:
eval_data = Dataset.from_pandas(df_test)

In [65]:
tokenized_eval_data = eval_data.map(tokenize_function, batched=True)

Map:   0%|          | 0/200 [00:00<?, ? examples/s]

In [64]:
# put in testing mode (dropout modules are deactivated)
model.eval() 

DistilBertForSequenceClassification(
  (distilbert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(31102, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0-5): 6 x TransformerBlock(
          (attention): MultiHeadSelfAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)
 

In [66]:
trainer.predict(tokenized_eval_data)

PredictionOutput(predictions=array([[-1.2513319 , -0.17903946, -0.26790187, ..., -1.1659101 ,
        -1.7402118 ,  4.8527665 ],
       [-0.90500146, -0.7279141 ,  1.8236251 , ..., -1.5088848 ,
        -1.186973  ,  2.7102895 ],
       [-0.8262485 , -1.1469094 ,  0.2141004 , ..., -0.3664354 ,
        -1.1484729 , -1.0886905 ],
       ...,
       [-0.01743887, -0.7423294 , -0.3723109 , ...,  0.7087059 ,
        -1.3030632 ,  0.16954635],
       [ 0.03778141,  3.6756032 ,  0.4756799 , ..., -0.28607988,
        -2.2275503 , -1.5753125 ],
       [-0.6955915 , -2.1835642 ,  0.06707751, ..., -0.19645503,
         1.2705883 , -1.4830846 ]], dtype=float32), label_ids=array([1, 1, 3, 2, 5, 2, 4, 2, 6, 2, 3, 6, 3, 3, 4, 6, 6, 2, 6, 5, 5, 6,
       5, 5, 5, 5, 5, 5, 5, 5, 5, 2, 3, 2, 3, 6, 1, 4, 5, 6, 3, 6, 6, 1,
       1, 6, 4, 6, 5, 3, 2, 2, 0, 5, 1, 4, 3, 6, 5, 3, 2, 3, 3, 6, 0, 6,
       5, 6, 1, 4, 2, 3, 0, 6, 6, 6, 3, 2, 6, 2, 1, 1, 1, 4, 6, 3, 1, 2,
       3, 0, 1, 5, 6, 4, 6, 6, 1, 5, 2, 

* **Accuracy:** 0.84
* **F1:** 0.849
* **Precision:** 0.862
* **Recall:** 0.84