# Zero Shot Classification, Validation and Inference

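In this study, this notebook is used for `PopBERT`: it creates predictions on the labeled ground-truth data, validates a populism score derived from those predictions, and runs inference on the news data.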

In [1]:
import torch
import glob

from transformers import AutoModel
from transformers import AutoTokenizer
from transformers import pipeline

import pandas as pd
import numpy as np

from sklearn.metrics import classification_report, accuracy_score, roc_curve, RocCurveDisplay
from sklearn.model_selection import train_test_split

from sklearn.linear_model import LogisticRegressionCV
from sklearn.ensemble import RandomForestClassifier


In [2]:
# Hugging Face Hub id shared by the tokenizer and the model weights.
POPBERT_CHECKPOINT = "luerhard/PopBERT"

tokenizer = AutoTokenizer.from_pretrained(POPBERT_CHECKPOINT)
# trust_remote_code=True allows transformers to run model code that is shipped
# with the checkpoint repository instead of the library itself.
model = AutoModel.from_pretrained(
    POPBERT_CHECKPOINT,
    trust_remote_code=True,
)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at deepset/gbert-large and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


# Creating Predictions

In [3]:
# header=None makes pandas read the first row as data; the column names are
# supplied explicitly and are the same for both files.
GROUND_TRUTH_COLUMNS = ["text", "label"]

all_data = pd.read_csv(
    '../../data/ground_truth/labeled_data.csv',
    header=None,
    names=GROUND_TRUTH_COLUMNS,
)
imbalanced_data = pd.read_csv(
    "../../data/ground_truth/labeled_data_unbalanced.csv",
    header=None,
    names=GROUND_TRUTH_COLUMNS,
)

# Texts / labels of labeled_data.csv.
X, y = all_data["text"], all_data["label"]

# Texts / labels of labeled_data_unbalanced.csv.
X_i, y_i = imbalanced_data["text"], imbalanced_data["label"]

In [4]:
# Score every text in X with PopBERT, one sample per forward pass.
y_preds = []
for sample in X:
    # truncation=True truncates over-long texts to the tokenizer's configured
    # maximum length, exactly as the inference loop on the news data does.
    # Without it an over-long text is passed to the model untruncated.
    encodings = tokenizer(sample, padding=True, truncation=True, return_tensors="pt")
    with torch.inference_mode():
        # The model call is unpacked as a pair; only the second element
        # (the prediction tensor) is used.
        _, prediction_tensor = model(**encodings)
    prediction = prediction_tensor.numpy()
    y_preds.append(prediction)

# One row per sample in X, four PopBERT scores per row.
y_preds = np.array(y_preds).reshape((len(y), 4))

In [5]:
# Score every text in X_i with PopBERT, one sample per forward pass.
y_ipreds = []
for sample in X_i:
    # truncation=True truncates over-long texts to the tokenizer's configured
    # maximum length, exactly as the inference loop on the news data does.
    # Without it an over-long text is passed to the model untruncated.
    encodings = tokenizer(sample, padding=True, truncation=True, return_tensors="pt")
    with torch.inference_mode():
        # The model call is unpacked as a pair; only the second element
        # (the prediction tensor) is used.
        _, prediction_tensor = model(**encodings)
    prediction = prediction_tensor.numpy()
    y_ipreds.append(prediction)

# One row per sample in X_i, four PopBERT scores per row.
y_ipreds = np.array(y_ipreds).reshape((len(y_i), 4))

# Populism Score

In [12]:
def _best_threshold(score_values, labels, thresholds, start_t=.5, start_f1=.5):
    """Grid-search the decision threshold that maximises the weighted F1.

    Parameters
    ----------
    score_values : numpy.ndarray
        One score per sample; a sample is predicted positive when
        ``score > t``.
    labels : array-like
        Ground-truth labels aligned with ``score_values``.
    thresholds : iterable of float
        Candidate thresholds, tried in the given order.
    start_t, start_f1 : float
        Returned unchanged when no candidate reaches a weighted F1 strictly
        above ``start_f1``.

    Returns
    -------
    tuple
        ``(best_t, best_f1)``: the first threshold that reaches the highest
        weighted F1, and that F1 value.
    """
    best_t, best_f1 = start_t, start_f1
    for t in thresholds:
        report = classification_report(labels, score_values > t, output_dict=True)
        weighted_f1 = report['weighted avg']['f1-score']
        # Strict ">" keeps the earliest threshold when several tie.
        if weighted_f1 > best_f1:
            best_t, best_f1 = t, weighted_f1
    return best_t, best_f1


random_states = [100,200,300,400,500]
output = []

# Repeated hold-out validation: tune the threshold on 100 stratified samples,
# then evaluate it on the remaining samples of the same split.
for rs in random_states:
    train_preds, test_preds, train_labels, test_labels = train_test_split(
        y_ipreds, y_i, train_size=100, stratify=y_i, random_state=rs)

    # The populism score is the sum of the first two PopBERT outputs.
    train_scores = train_preds[:, 0] + train_preds[:, 1]
    test_scores = test_preds[:, 0] + test_preds[:, 1]

    best_t, best_f1 = _best_threshold(train_scores, train_labels, np.linspace(0.5, 1.5, 201))
    output.append(classification_report(test_labels, test_scores > best_t, output_dict=True))
    print("Random state %d: Achieved f1 = %.2f for t = %.2f in training." % (rs, best_f1, best_t))

# One row per random state; nested report keys are flattened with ".".
scores = pd.json_normalize(output)

# Display name -> column of the flattened report.
metric_columns = {
    "accuracy": "accuracy",
    "precision": "weighted avg.precision",
    "recall": "weighted avg.recall",
    "f1-score": "weighted avg.f1-score",
}
# (mean, std) of each metric across the random states.
summary = {
    metric: (np.mean(scores[column]), np.std(scores[column]))
    for metric, column in metric_columns.items()
}

print("".join(
    "%s: \t %.4f +- %.4f \n" % (metric, mean, std)
    for metric, (mean, std) in summary.items()
))

# Threshold search on the complete imbalanced set (no hold-out).
# NOTE: this search uses a coarser grid (101 steps) than the per-split search
# above (201 steps); the grid sizes are kept as they were.
full_scores = y_ipreds[:, 0] + y_ipreds[:, 1]
best_t, best_f1 = _best_threshold(full_scores, y_i, np.linspace(.5, 1.5, 101))
print("Full Set: Achieved f1 = %.2f for t = %.2f with all data." % (best_f1, best_t))

print(classification_report(y_i, full_scores > best_t))

Random state 100: Achieved f1 = 0.76 for t = 1.04 in training.
Random state 200: Achieved f1 = 0.75 for t = 1.00 in training.
Random state 300: Achieved f1 = 0.72 for t = 0.94 in training.
Random state 400: Achieved f1 = 0.77 for t = 1.01 in training.
Random state 500: Achieved f1 = 0.83 for t = 1.04 in training.
accuracy: 	 0.7318 +- 0.0254 
precision: 	 0.7464 +- 0.0122 
recall: 	 0.7318 +- 0.0254 
f1-score: 	 0.7332 +- 0.0216 

Full Set: Achieved f1 = 0.75 for t = 1.00 with all data.
              precision    recall  f1-score   support

           0       0.85      0.76      0.80       426
           1       0.58      0.71      0.64       199

    accuracy                           0.75       625
   macro avg       0.72      0.74      0.72       625
weighted avg       0.77      0.75      0.75       625



# Inference

In [4]:
gc_news = pd.read_csv("../../data/gcnews/gc_news_labeled.csv")

# -1.0 marks a row as "not scored yet"; the inference loop checks this sentinel
# to skip rows that already carry predictions.
# The sentinel is a float so the columns start out as float64: the loop later
# writes float scores into them, and writing floats into an integer column
# relies on silent upcasting, which pandas has deprecated.
for score_column in (
    'PopBERT_anti_elite',
    'PopBERT_peop_centr',
    'PopBERT_left_wingh',  # spelling kept: the inference loop writes to this exact name
    'PopBERT_right_wing',
):
    gc_news[score_column] = -1.0

In [None]:
# Target columns, in the order of the model's four outputs.
score_columns = (
    'PopBERT_anti_elite',
    'PopBERT_peop_centr',
    'PopBERT_left_wingh',
    'PopBERT_right_wing',
)
n_rows = len(gc_news.index)

for i in gc_news.index:
    # Rows still holding the -1 sentinel have not been scored yet; rows that
    # already carry a prediction are skipped, so the cell can be re-run to
    # resume an interrupted pass.
    needs_scoring = gc_news.loc[i, 'PopBERT_anti_elite'] == -1
    if needs_scoring:
        encodings = tokenizer(
            gc_news["Content"][i],
            padding=True,
            truncation=True,
            return_tensors="pt",
        )
        with torch.inference_mode():
            # Only the second element of the model output is used.
            _, prediction_tensor = model(**encodings)
        # First (and only) row of the batch.
        preds = prediction_tensor.numpy()[0]
        for position, column in enumerate(score_columns):
            gc_news.loc[i, column] = preds[position]
    # Progress message every 100 index values, whether or not the row was scored.
    if i % 100 == 0:
        print("Sample %d / %d processed." % (i, n_rows))

Uncomment to export labeled data:

In [None]:
# gc_news.to_csv("gc_news_PopBERT_labels.csv", index=False)