In [1]:
!pip install -q datasets transformers numpy torch pandas

In [44]:
import pandas as pd
from datasets import Dataset
from transformers import AutoTokenizer
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer
import numpy as np
import torch

In [92]:
# Load CSV into a Pandas DataFrame
df = pd.read_csv('profiledescriptions_withpartyanduserid.csv', sep=';')

In [93]:
df.head()

Unnamed: 0,X,twitter_handle,profile,user_id,party
0,0,ManjaSchuele,"Ministerin für Wissenschaft, Forschung und Kul...","8,27091E+17",SPD
1,1,TeamPetraPau,hier twittert das Team von Petra Pau/Marzahn-H...,1683845126,DIE LINKE
2,2,TeamDieSchmidt,"Hier tweetet, textet und publiziert das Team d...",1377117206,SPD
3,3,reinhardbrandl,Mitglied im Verteidigungsausschuss 🇺🇦 \nMitgli...,262730721,CSU
4,4,schwarz_spd,SPD Bundestagskandidat für Bamberg und Forchhe...,53617577,SPD


In [94]:
df.shape

(2049, 5)

In [95]:
df_test = pd.read_csv('labeled_testdata_bert.csv')

In [96]:
df_test.head()

Unnamed: 0,X,twitter_handle,example,user_id,party,label
0,1789,ABaerbock,Offizieller Twitter Account von Außenministeri...,1.269262e+18,Bündnis 90/Die Grünen,1
1,190,ABaerbockArchiv,Dies ist ein Archiv des Twitter-Accounts von A...,2179011000.0,Bündnis 90/Die Grünen,1
2,920,ACvA_MdL,"Landtagsabgeordnete der @cdufraktionmv, stellv...",1.218517e+18,CDU,3
3,822,ADressel,Finanzsenator der Freien und Hansestadt Hambur...,1.053199e+18,SPD,2
4,1202,AG_AfD,Vorstehender der Bundestagsfraktion der Altern...,9.126128e+17,AfD,5


In [97]:
df_test.shape

(200, 6)

In [98]:
df = df[~df['X'].isin(df_test['X'])]

In [99]:
df.head()

Unnamed: 0,X,twitter_handle,profile,user_id,party
0,0,ManjaSchuele,"Ministerin für Wissenschaft, Forschung und Kul...","8,27091E+17",SPD
1,1,TeamPetraPau,hier twittert das Team von Petra Pau/Marzahn-H...,1683845126,DIE LINKE
2,2,TeamDieSchmidt,"Hier tweetet, textet und publiziert das Team d...",1377117206,SPD
3,3,reinhardbrandl,Mitglied im Verteidigungsausschuss 🇺🇦 \nMitgli...,262730721,CSU
4,4,schwarz_spd,SPD Bundestagskandidat für Bamberg und Forchhe...,53617577,SPD


In [100]:
df.shape

(1860, 5)

In [101]:
# Get unique values in a specific column

def analyze_party_distribution(column):
    # Get unique values in a specific column
    unique_values = column.unique()
    print("Unique Parties:", unique_values)

    # Get count of each unique value in the column
    value_counts = column.value_counts()
    print("\nValue counts:\n", value_counts)
 
analyze_party_distribution(df['party'])

Unique Parties: ['SPD' 'DIE LINKE' 'CSU' 'AfD' 'Bündnis 90/Die Grünen' 'FDP' 'CDU'
 'Liberal-Konservative Reformer' 'Christliche Mitte'
 'Bürgerbewegung pro NRW' 'Bürgerrechtsbewegung Solidarität' 'ÖDP'
 'DIE PARTEI' 'Bayernpartei' 'Sozialistische Gleichheitspartei'
 'Piratenpartei' 'no party affiliation' 'Renew Europe'
 'Europäische Volkspartei' 'Freie Wähler' 'BIW' 'LKR' 'Die Föderalen'
 'SSW' 'Thüringer Heimatpartei' 'buendnis21' 'UNABHAENGIGE Partei'
 'Klimaliste Baden-Württemberg' 'dieBasis' 'Wir2020'
 'Deutsche Kommunistische Partei DKP' 'PdF' 'Volt Deutschland' 'MLPD'
 'Die Urbane' 'Demokratie in Bewegung' 'Partei für Gesundheitsforschung'
 'V-Partei³' 'Graue Panther' 'Bündnis C' 'Team Todenhöfer'
 'Partei der Humanisten' 'MENSCHLICHE WELT' 'Tierschutzpartei'
 'Dt. Konservative' 'Liberale Demokraten' 'FW' nan]

Value counts:
 party
SPD                                   417
Bündnis 90/Die Grünen                 380
CDU                                   314
FDP                    

In [102]:
party2id = {
    "DIE LINKE": 0,
    "Bündnis 90/Die Grünen": 1,
    "SPD": 2,
    "CSU": 3,
    "CDU": 3,
    "FDP": 4,
    "AfD": 5,
    "Keine Kategorie": 6
}

### Just keep the party names from above and change any other party to "Keine Kategorie"

In [103]:
# List of valid party names
valid_parties = ["DIE LINKE", "Bündnis 90/Die Grünen", "SPD", "CSU", "CDU", "FDP", "AfD"]


# Function to map party names
def map_parties(party):
    if party in valid_parties:
        return party
    else:
        return "Keine Kategorie"

# Apply the function to the 'Party' column
df.loc[:,'party'] = df['party'].apply(map_parties)

In [104]:
analyze_party_distribution(df['party'])

Unique Parties: ['SPD' 'DIE LINKE' 'CSU' 'AfD' 'Bündnis 90/Die Grünen' 'FDP' 'CDU'
 'Keine Kategorie']

Value counts:
 party
SPD                      417
Bündnis 90/Die Grünen    380
CDU                      314
FDP                      199
AfD                      190
DIE LINKE                165
Keine Kategorie          127
CSU                       68
Name: count, dtype: int64


### Map partys to label

In [105]:
df.loc[:,"label"] = df["party"].map(party2id.get)

In [106]:
# just to be sure that theres no na value and every value is an integer
# drop everything except of profile description and label
df = df[df['label'].notna()]
df['label'] = df['label'].astype(int)
df = df[['profile', 'label']]

In [107]:
df.head()

Unnamed: 0,profile,label
0,"Ministerin für Wissenschaft, Forschung und Kul...",2
1,hier twittert das Team von Petra Pau/Marzahn-H...,0
2,"Hier tweetet, textet und publiziert das Team d...",2
3,Mitglied im Verteidigungsausschuss 🇺🇦 \nMitgli...,3
4,SPD Bundestagskandidat für Bamberg und Forchhe...,2


#### Converting emojis to text using pip package emoji

In [108]:
import emoji
import string
import re

def demojize(text):
    # Convert emoji to text
    processed_text = emoji.demojize(text, language='de')
    # Remove ':' from converting the emojis to text
    for char in string.punctuation.replace('.', ''):
        processed_text = processed_text.replace(char, ' ')
    # Remove trailing whitespace
    processed_text = re.sub(r'\s+', ' ', processed_text).strip()
    return processed_text

df['profile'] = df['profile'].apply(lambda x: demojize(x))

In [109]:
df['profile'][1]

'hier twittert das Team von Petra Pau Marzahn Hellersdorf mahe PAUer für die LINKE'

In [110]:
# Load the BERT tokenizer
#pretrained_LM = "dbmdz/bert-base-german-cased"
#pretrained_LM = "bert-base-german-cased"
#pretrained_LM = "distilbert-base-german-cased"
pretrained_LM = "distilbert-base-multilingual-cased"
tokenizer = AutoTokenizer.from_pretrained(pretrained_LM)

def tokenize_function(examples):
    return tokenizer(examples["profile"], padding="max_length", truncation=True, return_tensors="pt")

In [111]:
# Split into training and testing data
training_data = df.sample(frac=0.8, random_state=25)
testing_data = df.drop(training_data.index)

In [112]:
# Load datasets
training_data = Dataset.from_pandas(training_data)
testing_data = Dataset.from_pandas(testing_data)

tokenized_training_data = training_data.map(tokenize_function, batched=True)
tokenized_testing_data = testing_data.map(tokenize_function, batched=True)

Map:   0%|          | 0/1488 [00:00<?, ? examples/s]

Map:   0%|          | 0/372 [00:00<?, ? examples/s]

In [113]:
from transformers import DataCollatorWithPadding
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)
data_collator

DataCollatorWithPadding(tokenizer=DistilBertTokenizerFast(name_or_path='distilbert-base-multilingual-cased', vocab_size=119547, model_max_length=512, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'}, clean_up_tokenization_spaces=True), padding=True, max_length=None, pad_to_multiple_of=None, return_tensors='pt')

In [114]:
from sklearn.metrics import precision_recall_fscore_support, accuracy_score

# Good link about which average to choose: https://simonhessner.de/why-are-precision-recall-and-f1-score-equal-when-using-micro-averaging-in-a-multi-class-problem/
# micro: Calculate metrics globally by counting the total true positives, false negatives and false positives.
# macro: Calculate metrics for each label, and find their unweighted mean. This does not take label imbalance into account.

def compute_metrics(pred):
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    precision, recall, f1, _ = precision_recall_fscore_support(labels, preds, average='macro')
    acc = accuracy_score(labels, preds)
    return {
        'accuracy': acc,
        'f1': f1,
        'precision': precision,
        'recall': recall
    }

In [115]:
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer

# Model and training setup
model = AutoModelForSequenceClassification.from_pretrained(
    pretrained_LM, num_labels=7
)

Downloading model.safetensors:   0%|          | 0.00/542M [00:00<?, ?B/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-multilingual-cased and are newly initialized: ['pre_classifier.weight', 'classifier.bias', 'classifier.weight', 'pre_classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [116]:
import gc
torch.cuda.empty_cache()
gc.collect()

69

In [117]:
training_args = TrainingArguments(
    output_dir="bert_full",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=10,
    weight_decay=0.01,
    warmup_steps=150,               # number of warmup steps for learning rate 
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    push_to_hub=False,
    #label_names=["LABEL"],
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_training_data,
    eval_dataset=tokenized_testing_data,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics  
)

In [118]:
#checkpointing
#use cuda
trainer.train()

You're using a DistilBertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,No log,1.817017,0.284946,0.136308,0.134747,0.182869
2,No log,0.984333,0.723118,0.694315,0.762528,0.661898
3,No log,0.673762,0.760753,0.735812,0.752589,0.732536
4,No log,0.642384,0.77957,0.76267,0.777531,0.756739
5,No log,0.678555,0.790323,0.772735,0.790513,0.76059
6,0.905300,0.708776,0.798387,0.779663,0.792624,0.770188
7,0.905300,0.759968,0.798387,0.777701,0.791321,0.769989
8,0.905300,0.790577,0.795699,0.775141,0.787366,0.7685
9,0.905300,0.825771,0.793011,0.774885,0.791458,0.764263
10,0.905300,0.83044,0.798387,0.779389,0.796739,0.767881


  _warn_prf(average, modifier, msg_start, len(result))


TrainOutput(global_step=930, training_loss=0.523200176608178, metrics={'train_runtime': 368.6652, 'train_samples_per_second': 40.362, 'train_steps_per_second': 2.523, 'total_flos': 1971290652180480.0, 'train_loss': 0.523200176608178, 'epoch': 10.0})

## Evaluation on labelled test data set, 200 profile descriptions

In [119]:
# Load CSV into a Pandas DataFrame
df_eval = pd.read_csv('labeled_testdata_bert.csv')
df_eval = df_eval[['example', 'label']]
df_eval = df_eval[df_eval['label'].notna()]
df_eval['label'] = df_eval['label'].astype(int)
# Rename the 'example' column to 'profile'
df_eval.rename(columns={'example': 'profile'}, inplace=True)

In [120]:
label_counts = df_eval['label'].value_counts()
print(label_counts)

label
6    77
1    28
3    27
5    24
2    23
4    11
0    10
Name: count, dtype: int64


In [121]:
df_eval

Unnamed: 0,profile,label
0,Offizieller Twitter Account von Außenministeri...,1
1,Dies ist ein Archiv des Twitter-Accounts von A...,1
2,"Landtagsabgeordnete der @cdufraktionmv, stellv...",3
3,Finanzsenator der Freien und Hansestadt Hambur...,2
4,Vorstehender der Bundestagsfraktion der Altern...,5
...,...,...
195,Sprecherin für Familie und Bildung der @gruene...,1
196,stellv. Vorsitzender Eisenbahn- und Verkehrsge...,2
197,Mitglied des Europäischen Parlaments,6
198,"#MdL, Parlamentarischer Geschäftsführer und Sp...",1


In [122]:
df_eval['profile'] = df_eval['profile'].apply(lambda x: demojize(x))
eval_data = Dataset.from_pandas(df_eval)
tokenized_eval_data = eval_data.map(tokenize_function, batched=True)
# put in testing mode (dropout modules are deactivated)
model.eval() 

Map:   0%|          | 0/200 [00:00<?, ? examples/s]

DistilBertForSequenceClassification(
  (distilbert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(119547, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0-5): 6 x TransformerBlock(
          (attention): MultiHeadSelfAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)


In [123]:
trainer.predict(tokenized_eval_data)

PredictionOutput(predictions=array([[ 1.99773954e-03, -4.13172394e-01, -1.07267988e+00, ...,
        -8.57515037e-01, -4.70943034e-01,  2.11565208e+00],
       [ 1.23985484e-01, -1.11623347e+00, -1.57514557e-01, ...,
        -1.01788104e+00, -1.01607606e-01,  1.69604027e+00],
       [-1.31383824e+00, -1.24294484e+00, -5.80123484e-01, ...,
        -1.28265870e+00, -7.13598132e-01, -9.98899579e-01],
       ...,
       [-6.52879849e-02, -1.98258176e-01, -8.07147145e-01, ...,
        -6.53106570e-01,  1.14889777e+00,  2.87228614e-01],
       [-1.39963239e-01,  4.44869137e+00, -6.10823154e-01, ...,
        -5.70658028e-01, -1.44613421e+00, -5.64688146e-01],
       [-1.15253246e+00, -1.32238626e+00, -8.69843543e-01, ...,
        -1.22957623e+00, -6.77889645e-01, -7.92777658e-01]], dtype=float32), label_ids=array([1, 1, 3, 2, 5, 2, 4, 2, 6, 2, 3, 6, 3, 3, 4, 6, 6, 2, 6, 5, 5, 6,
       5, 5, 5, 5, 5, 5, 5, 5, 5, 2, 3, 2, 3, 6, 1, 4, 5, 6, 3, 6, 6, 1,
       1, 6, 4, 6, 5, 3, 2, 2, 0, 5, 1, 4,