In [1]:
!pip install -q emoji

In [31]:
import pandas as pd
from datasets import Dataset
from transformers import AutoTokenizer
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer
import numpy as np
import torch

# Load CSV into a Pandas DataFrame
df = pd.read_csv('initial_train_bert.csv')
df = df[['example', 'label']]
df = df[df['label'].notna()]
df['label'] = df['label'].astype(int)

In [32]:
df.head()

Unnamed: 0,example,label
0,hier twittert das Team von Petra Pau/Marzahn-H...,0
1,Die soziale Opposition: Feministisch. Sozialis...,0
2,MdB DIE LINKE l Stellvertretende Fraktionsvors...,0
3,MdB für die Menschen in der Städteregion Aache...,0
4,Mitglied des Bundestages | Sprecherin für Arbe...,0


In [33]:
label_counts = df['label'].value_counts()
print(label_counts)

label
6    102
3     71
2     65
5     56
1     55
0     40
4     30
Name: count, dtype: int64


In [34]:
# Define label mappings
id2label = {
    0: "ökonomisch neutral + libertär",
    1: "ökonomisch links + libertär",
    2: "ökonomisch neutral + libertär/autoritär",
    3: "ökonomisch neutral + autoritär",
    4: "ökonomisch rechts + libertär/autoritär",
    5: "ökonomisch rechts + autoritär",
    6: "Keine Kategorie"
}

label2id = {
    "ökonomisch neutral + libertär": 0,
    "ökonomisch links + libertär": 1,
    "ökonomisch neutral + libertär/autoritär": 2,
    "ökonomisch neutral + autoritär": 3,
    "ökonomisch rechts + libertär/autoritär": 4,
    "ökonomisch rechts + autoritär": 5,
    "Keine Kategorie": 6
}

In [35]:
df["LABEL"] = df["label"].map(id2label.get)

In [36]:
df.head()

Unnamed: 0,example,label,LABEL
0,hier twittert das Team von Petra Pau/Marzahn-H...,0,ökonomisch neutral + libertär
1,Die soziale Opposition: Feministisch. Sozialis...,0,ökonomisch neutral + libertär
2,MdB DIE LINKE l Stellvertretende Fraktionsvors...,0,ökonomisch neutral + libertär
3,MdB für die Menschen in der Städteregion Aache...,0,ökonomisch neutral + libertär
4,Mitglied des Bundestages | Sprecherin für Arbe...,0,ökonomisch neutral + libertär


#### Converting emojis to text using pip package emoji

In [37]:
import emoji
import string
import re

def demojize(text):
    # Convert emoji to text
    processed_text = emoji.demojize(text, language='de')
    # Remove ':' from converting the emojis to text
    for char in string.punctuation.replace('.', ''):
        processed_text = processed_text.replace(char, ' ')
    # Remove trailing whitespace
    processed_text = re.sub(r'\s+', ' ', processed_text).strip()
    return processed_text

df['example'] = df['example'].apply(lambda x: demojize(x))

In [38]:
df['example'][1]

'Die soziale Opposition Feministisch. Sozialistisch. Friedlich. Für dich taube sicherheitsweste laubbaum erhobene faust'

In [39]:
# Load the BERT tokenizer
#pretrained_LM = "bert-base-german-cased"
#pretrained_LM = "dbmdz/bert-base-german-cased"
#pretrained_LM = "distilbert-base-german-cased"
pretrained_LM = "distilbert-base-multilingual-cased"
tokenizer = AutoTokenizer.from_pretrained(pretrained_LM)

def tokenize_function(examples):
    return tokenizer(examples["example"], padding="max_length", truncation=True, return_tensors="pt")

In [40]:
# Split into training and testing data
training_data = df.sample(frac=0.8, random_state=25)
testing_data = df.drop(training_data.index)

In [41]:
training_data['label'].value_counts()

label
6    76
3    56
2    54
5    48
1    43
0    35
4    23
Name: count, dtype: int64

In [42]:
testing_data['label'].value_counts()

label
6    26
3    15
1    12
2    11
5     8
4     7
0     5
Name: count, dtype: int64

In [43]:
# Load datasets
training_data = Dataset.from_pandas(training_data)
testing_data = Dataset.from_pandas(testing_data)

tokenized_training_data = training_data.map(tokenize_function, batched=True)
tokenized_testing_data = testing_data.map(tokenize_function, batched=True)

Map:   0%|          | 0/335 [00:00<?, ? examples/s]

Map:   0%|          | 0/84 [00:00<?, ? examples/s]

In [44]:
tokenized_training_data['example'][11]

'Fraktionsvorsitzende Grüne Sachsen Anhalt sonnenblume Klimaschutz Mobilitätswende und Demokratiesicherung'

In [45]:
from transformers import DataCollatorWithPadding
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)
data_collator

DataCollatorWithPadding(tokenizer=DistilBertTokenizerFast(name_or_path='distilbert-base-multilingual-cased', vocab_size=119547, model_max_length=512, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'}, clean_up_tokenization_spaces=True), padding=True, max_length=None, pad_to_multiple_of=None, return_tensors='pt')

In [46]:
import evaluate
accuracy = evaluate.load("accuracy")

In [47]:
from sklearn.metrics import precision_recall_fscore_support, accuracy_score

# Good link about which average to choose: https://simonhessner.de/why-are-precision-recall-and-f1-score-equal-when-using-micro-averaging-in-a-multi-class-problem/
# micro: Calculate metrics globally by counting the total true positives, false negatives and false positives.
# macro: Calculate metrics for each label, and find their unweighted mean. This does not take label imbalance into account.

def compute_metrics(pred):
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    precision, recall, f1, _ = precision_recall_fscore_support(labels, preds, average='macro')
    acc = accuracy_score(labels, preds)
    return {
        'accuracy': acc,
        'f1': f1,
        'precision': precision,
        'recall': recall
    }

In [48]:
from torch import nn

# Model and training setup
model = AutoModelForSequenceClassification.from_pretrained(
    pretrained_LM, num_labels=7, id2label=id2label, label2id=label2id
)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-multilingual-cased and are newly initialized: ['pre_classifier.weight', 'classifier.bias', 'pre_classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [49]:
import gc
torch.cuda.empty_cache()
gc.collect()

62

In [50]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device

device(type='cuda')

In [51]:
torch.cuda.get_device_name(0)

'NVIDIA A30'

In [52]:
model.to(device) 

DistilBertForSequenceClassification(
  (distilbert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(119547, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0-5): 6 x TransformerBlock(
          (attention): MultiHeadSelfAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)


In [53]:
training_args = TrainingArguments(
    output_dir="bert420",              # output directory
    learning_rate=2e-5,                # total # of training epochs
    per_device_train_batch_size=16,    # batch size per device during training
    per_device_eval_batch_size=16,     # batch size for evaluation
    num_train_epochs=20,               # total # of training epochs
    weight_decay=0.01,                 # strength of weight decay  
    warmup_steps=50,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    push_to_hub=False,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_training_data,
    eval_dataset=tokenized_testing_data,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics
)

In [None]:
# Fine-tune the model
torch.cuda.empty_cache()
trainer.train()

You're using a DistilBertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,No log,1.921275,0.309524,0.067532,0.044218,0.142857
2,No log,1.855819,0.333333,0.100156,0.117857,0.161905
3,No log,1.779944,0.333333,0.118821,0.091964,0.178022
4,No log,1.629839,0.452381,0.283436,0.405612,0.313278
5,No log,1.42092,0.547619,0.382013,0.474671,0.420313
6,No log,1.240649,0.619048,0.48501,0.497693,0.501399
7,No log,1.105238,0.678571,0.595006,0.614453,0.59642
8,No log,1.014739,0.702381,0.646383,0.764004,0.636226
9,No log,0.986463,0.72619,0.710732,0.772988,0.694974
10,No log,0.903014,0.75,0.738905,0.791486,0.72263


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [76]:
trainer.save_model("models/bert420")

# Evaluation on labelled test data set, 200 profile descriptions

In [25]:
# Load CSV into a Pandas DataFrame
df_test = pd.read_csv('labeled_testdata_bert.csv')
df_test = df_test[['example', 'label']]
df_test = df_test[df_test['label'].notna()]
df_test['label'] = df_test['label'].astype(int)

In [26]:
df_test

Unnamed: 0,example,label
0,Offizieller Twitter Account von Außenministeri...,1
1,Dies ist ein Archiv des Twitter-Accounts von A...,1
2,"Landtagsabgeordnete der @cdufraktionmv, stellv...",3
3,Finanzsenator der Freien und Hansestadt Hambur...,2
4,Vorstehender der Bundestagsfraktion der Altern...,5
...,...,...
195,Sprecherin für Familie und Bildung der @gruene...,1
196,stellv. Vorsitzender Eisenbahn- und Verkehrsge...,2
197,Mitglied des Europäischen Parlaments,6
198,"#MdL, Parlamentarischer Geschäftsführer und Sp...",1


In [27]:
df_test['example'] = df_test['example'].apply(lambda x: demojize(x))

In [28]:
eval_data = Dataset.from_pandas(df_test)

In [29]:
tokenized_eval_data = eval_data.map(tokenize_function, batched=True)
# put in testing mode (dropout modules are deactivated)
model.eval() 

Map:   0%|          | 0/200 [00:00<?, ? examples/s]

DistilBertForSequenceClassification(
  (distilbert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(119547, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0-5): 6 x TransformerBlock(
          (attention): MultiHeadSelfAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)


In [30]:
trainer.predict(tokenized_eval_data)

PredictionOutput(predictions=array([[-0.02492349, -0.18298766, -0.585895  , ..., -0.816182  ,
        -1.9694253 ,  3.287021  ],
       [ 1.3753511 ,  0.15123917,  0.6952315 , ..., -1.1125579 ,
        -1.4510939 ,  0.97719646],
       [-1.5012742 , -1.300993  , -0.17449248, ..., -0.31300247,
        -1.0340086 , -0.77178556],
       ...,
       [-0.5031463 , -1.1427137 , -0.9484498 , ...,  1.0021849 ,
        -0.82477826,  1.9257565 ],
       [ 0.03033947,  2.231876  ,  0.71226305, ...,  0.59481263,
        -1.2308972 , -1.2421623 ],
       [-1.406859  , -1.6610694 , -0.3533278 , ..., -0.34714845,
        -0.95134795, -0.2407422 ]], dtype=float32), label_ids=array([1, 1, 3, 2, 5, 2, 4, 2, 6, 2, 3, 6, 3, 3, 4, 6, 6, 2, 6, 5, 5, 6,
       5, 5, 5, 5, 5, 5, 5, 5, 5, 2, 3, 2, 3, 6, 1, 4, 5, 6, 3, 6, 6, 1,
       1, 6, 4, 6, 5, 3, 2, 2, 0, 5, 1, 4, 3, 6, 5, 3, 2, 3, 3, 6, 0, 6,
       5, 6, 1, 4, 2, 3, 0, 6, 6, 6, 3, 2, 6, 2, 1, 1, 1, 4, 6, 3, 1, 2,
       3, 0, 1, 5, 6, 4, 6, 6, 1, 5, 2, 