In [1]:
from nlp import load_dataset
# Load the corpus through the 'de_politik_news.py' dataset script, caching under 'data'.
# Later cells read the 'train' and 'test' splits and their 'text' / 'class' columns.
dataset = load_dataset('de_politik_news.py', cache_dir='data')

  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])
Using custom data configuration default


In [2]:
from sklearn.metrics import accuracy_score, f1_score, classification_report
def evaluate(model_test, class_test):
    """Print accuracy, micro/macro F1 and a classification report.

    Args:
        model_test: labels predicted by the model.
        class_test: reference labels; passed as the first argument to
            every metric function.

    Returns:
        None. The four results are written with ``print``.
    """
    y_pred, y_true = model_test, class_test

    # Compute everything first so nothing is printed if a metric fails.
    acc = accuracy_score(y_true, y_pred)
    micro_f1, macro_f1 = (
        f1_score(y_true, y_pred, average=avg) for avg in ('micro', 'macro')
    )
    per_class = classification_report(y_true, y_pred)

    print(f'accuracy: {acc}')
    print(f'F1-micro: {micro_f1}')
    print(f'F1-macro: {macro_f1}')
    print(f'Report: {per_class}')

In [None]:
from transformers import BertTokenizer, BertForSequenceClassification
import torch
# Run on the GPU when one is visible to torch, otherwise on the CPU.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'

tokenizer = BertTokenizer.from_pretrained('bert-base-german-cased')

# Load the 5-label classifier stored under models/BERT and keep only its
# `.bert` sub-module, which the following cells use to produce embeddings.
bert = BertForSequenceClassification.from_pretrained('models/BERT', num_labels=5).bert
bert.to(device)

In [None]:
import numpy as np
# Embed every test text with the encoder and cache the result as CSV.
# Look the text column up once rather than re-indexing the dataset on every iteration.
test_texts = dataset['test']['text']
predictions = []
with torch.no_grad():  # inference only: do not record an autograd graph
    for text in test_texts:
        ids = tokenizer(text, padding='max_length', truncation=True, return_tensors="pt")
        # Element [1] of the encoder output is used as the text embedding
        # (presumably BERT's pooled output -- confirm against the transformers version in use).
        prediction = bert(**{k: v.to(device) for k, v in ids.items()})[1].cpu()
        predictions.append(prediction)
# One row per text; concatenated along the first dimension before writing.
np.savetxt('data/test_embeddings.csv', torch.cat(predictions).numpy())

In [None]:
# Embed every training text with the encoder and cache the result as CSV.
# Look the text column up once rather than re-indexing the dataset on every iteration.
train_texts = dataset['train']['text']
predictions = []
with torch.no_grad():  # inference only: do not record an autograd graph
    for text in train_texts:
        ids = tokenizer(text, padding='max_length', truncation=True, return_tensors="pt")
        # Element [1] of the encoder output is used as the text embedding
        # (presumably BERT's pooled output -- confirm against the transformers version in use).
        prediction = bert(**{k: v.to(device) for k, v in ids.items()})[1].cpu()
        predictions.append(prediction)
# One row per text; concatenated along the first dimension before writing.
np.savetxt('data/train_embeddings.csv', torch.cat(predictions).numpy())

In [3]:
import numpy as np
# Targets: the 'class' column of each dataset split.
y_train = dataset['train']['class']
y_test = dataset['test']['class']

# Features: the embedding matrices previously written to CSV, read back in
# train-then-test order.
X_train, X_test = (
    np.genfromtxt(csv_path)
    for csv_path in ('data/train_embeddings.csv', 'data/test_embeddings.csv')
)

# Random forest

In [4]:
from sklearn.ensemble import RandomForestClassifier
# Small, shallow, class-weighted random forest trained on the cached embeddings.
# `fit` returns the estimator itself, so `model` ends up as the fitted forest.
model = RandomForestClassifier(
    n_estimators=8,
    criterion="entropy",
    max_depth=5,
    min_impurity_decrease=0.1,
    class_weight='balanced',
    random_state=0,
).fit(X_train, y_train)

In [8]:
# Score the current model's predictions for X_test against the true test labels.
evaluate(model.predict(X_test),y_test)

accuracy: 0.432012432012432
F1-micro: 0.43201243201243206
F1-macro: 0.3836674510189971
Report:               precision    recall  f1-score   support

      center       0.28      0.35      0.31      1349
 center-left       0.59      0.69      0.63      1159
center-right       0.45      0.41      0.43      1754
    far-left       0.21      0.11      0.14       215
   far-right       0.57      0.32      0.41       671

    accuracy                           0.43      5148
   macro avg       0.42      0.37      0.38      5148
weighted avg       0.44      0.43      0.43      5148



In [7]:
import pickle
# Persist the fitted classifier, then read it back from disk.
# The `with` blocks guarantee each file handle is flushed and closed.
with open('models/BERT/model.bin', 'wb') as model_file:
    pickle.dump(model, model_file)

# NOTE: unpickling can execute arbitrary code -- only load files you created yourself.
with open('models/BERT/model.bin', 'rb') as model_file:
    model = pickle.load(model_file)

In [86]:
from imblearn.ensemble import BalancedRandomForestClassifier
# Balanced random forest: 1000 gini trees, depth capped at 10, fixed seed.
model = BalancedRandomForestClassifier(max_depth=10, min_impurity_decrease=0.001,
                                       criterion="gini", n_estimators=1000, random_state=0)
# Fit on the cached training embeddings and labels (X_train / y_train), the same
# inputs the other classifiers in this notebook use, so the cell runs on a fresh kernel.
model = model.fit(X_train, y_train)

In [87]:
# `evaluate` takes (predicted labels, true labels): predict on the cached test
# embeddings first, then score against the test labels.
evaluate(model.predict(X_test), y_test)

accuracy: 0.38966588966588966
F1-micro: 0.38966588966588966
F1-macro: 0.39975995518118534
Report:               precision    recall  f1-score   support

      center       0.26      0.17      0.21      1349
 center-left       0.30      0.34      0.32      1159
center-right       0.48      0.56      0.52      1754
    far-left       0.49      0.63      0.55       215
   far-right       0.41      0.39      0.40       671

    accuracy                           0.39      5148
   macro avg       0.39      0.42      0.40      5148
weighted avg       0.38      0.39      0.38      5148



# EasyEnsemble

In [11]:
from imblearn.ensemble import EasyEnsembleClassifier
# EasyEnsemble with 10 estimators and a fixed seed, trained on the cached embeddings.
# `fit` returns the estimator itself, so `model` ends up as the fitted ensemble.
model = EasyEnsembleClassifier(
    random_state=0,
    n_estimators=10,
).fit(X_train, y_train)

In [12]:
# Score the current model's predictions for X_test against the true test labels.
evaluate(model.predict(X_test), y_test)

accuracy: 0.3646076146076146
F1-micro: 0.36460761460761465
F1-macro: 0.3431566460705905
Report:               precision    recall  f1-score   support

      center       0.28      0.54      0.37      1349
 center-left       0.52      0.46      0.49      1159
center-right       0.38      0.23      0.28      1754
    far-left       0.23      0.21      0.22       215
   far-right       0.60      0.25      0.35       671

    accuracy                           0.36      5148
   macro avg       0.40      0.34      0.34      5148
weighted avg       0.41      0.36      0.36      5148



# Logistic regression

In [86]:
from sklearn.linear_model import LogisticRegression
# L2-regularised logistic regression (C=0.1) using the SAGA solver with a fixed seed.
# `fit` returns the estimator itself, so `model` ends up as the fitted classifier.
model = LogisticRegression(
    solver='saga',
    penalty='l2',
    C=0.1,
    random_state=0,
).fit(X_train, y_train)



In [88]:
# Score the current model's predictions for X_test against the true test labels.
evaluate(model.predict(X_test), y_test)

accuracy: 0.4201631701631702
F1-micro: 0.42016317016317023
F1-macro: 0.33893479155053896
Report:               precision    recall  f1-score   support

      center       0.30      0.41      0.35      1349
 center-left       0.50      0.64      0.56      1159
center-right       0.46      0.42      0.44      1754
    far-left       0.15      0.03      0.05       215
   far-right       0.66      0.20      0.30       671

    accuracy                           0.42      5148
   macro avg       0.41      0.34      0.34      5148
weighted avg       0.44      0.42      0.41      5148



# Naive Bayes

In [5]:
from sklearn.naive_bayes import GaussianNB
# Gaussian naive Bayes with var_smoothing=1e-5, trained on the cached embeddings.
# `fit` returns the estimator itself, so `model` ends up as the fitted classifier.
model = GaussianNB(
    var_smoothing=1e-5,
).fit(X_train, y_train)

In [6]:
# Score the current model's predictions for X_test against the true test labels.
evaluate(model.predict(X_test), y_test)

accuracy: 0.4188034188034188
F1-micro: 0.4188034188034188
F1-macro: 0.3637430809340828
Report:               precision    recall  f1-score   support

      center       0.26      0.31      0.28      1349
 center-left       0.57      0.74      0.64      1159
center-right       0.41      0.39      0.40      1754
    far-left       0.21      0.08      0.12       215
   far-right       0.60      0.28      0.38       671

    accuracy                           0.42      5148
   macro avg       0.41      0.36      0.36      5148
weighted avg       0.42      0.42      0.41      5148

