This notebook uses the trained 3-class DistilBERT to classify/evaluate the US Congressional Tweets Dataset and the Politics.com dataset. 

### Load the saved checkpoint of BERT

In [1]:
import torch
import numpy as np
import pandas as pd
from datasets import Dataset
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    DataCollatorWithPadding,
    Trainer,
)

device = "cuda" if torch.cuda.is_available() else "cpu"
MODEL_DIR = "../models_distilbert/best"
MAX_LEN = 256

tok = AutoTokenizer.from_pretrained(MODEL_DIR)
model = AutoModelForSequenceClassification.from_pretrained(MODEL_DIR).to(device)
model.eval()

print(model.config.id2label)

{0: 'Left', 1: 'Right', 2: 'Neutral'}


### Load dataset

In [6]:
import orjson
import pandas as pd

path = "tweets.jsonl"
rows = []
with open(path, "rb") as f:
    for line in f:
        rows.append(orjson.loads(line))

df_congress = pd.DataFrame(rows)

print(df_congress.head())
print(df_congress.columns)


df_congress["created_at"] = pd.to_datetime(df_congress["created_at"], unit="s")
df_congress = df_congress[["text", "created_at", "screen_name", "user_id"]].copy()

  contributors coordinates  created_at display_text_range  \
0         None        None  1217870931            [0, 74]   
1         None        None  1218049485            [0, 25]   
2         None        None  1218054936            [0, 65]   
3         None        None  1218117172            [0, 37]   
4         None        None  1218121925            [0, 90]   

                                            entities  favorite_count  \
0  {'hashtags': [], 'symbols': [], 'urls': [], 'u...               0   
1  {'hashtags': [], 'symbols': [], 'urls': [], 'u...               0   
2  {'hashtags': [], 'symbols': [], 'urls': [], 'u...               0   
3  {'hashtags': [], 'symbols': [], 'urls': [], 'u...               0   
4  {'hashtags': [], 'symbols': [], 'urls': [], 'u...               0   

   favorited   geo         id     id_str  ...  \
0      False  None  877418565  877418565  ...   
1      False  None  879618172  879618172  ...   
2      False  None  879695803  879695803  ...   
3   

### Build a HF Dataset from Congressional tweets and tokenize

In [7]:
ds_congress = Dataset.from_pandas(df_congress, preserve_index=True)
def tokenize_congress(batch):
    return tok(
        batch["text"],
        truncation=True,
        max_length=MAX_LEN,
    )

ds_congress_tok = ds_congress.map(
    tokenize_congress,
    batched=True,
)

data_collator = DataCollatorWithPadding(tokenizer=tok)

Map: 100%|██████████| 1243370/1243370 [00:52<00:00, 23806.40 examples/s]


In [None]:
inference_trainer = Trainer(
    model=model,
    tokenizer=tok,
    data_collator=data_collator,
)

pred_output = inference_trainer.predict(ds_congress_tok)
logits = pred_output.predictions
pred_ids = logits.argmax(axis=-1)

id2label = model.config.id2label
pred_labels = [id2label[int(i)] for i in pred_ids]

probs = torch.softmax(torch.tensor(logits), dim=-1).numpy()

df_congress["stance_id"] = pred_ids
df_congress["stance"] = pred_labels

for i, lab in id2label.items():
    df_congress[f"p_{lab}"] = probs[:, int(i)]

In [None]:
cols = [
    "user_id", "screen_name", "created_at", "text",
    "stance_id", "stance", "p_Left", "p_Neutral", "p_Right",
]
df_out = df_congress[cols].copy()

# df_out.to_parquet("congress_labeled.parquet", index=False)

df_out.to_json(
    "congress_labeled.jsonl",
    orient="records",
    lines=True,
    force_ascii=False
)

### Evaluate on politics.com dataset (with gold labels)

In [9]:
import re
import orjson
import pandas as pd

poltics_posts = "posts_201908161514.json"

with open(poltics_posts, "rb") as f:
    raw = f.read()

# strip UTF-8 and leading whitespace/newlines
raw = raw.lstrip(b"\xef\xbb\xbf \t\r\n")

data = orjson.loads(raw)

posts = data["posts"]
df_politics = pd.DataFrame(posts)

print(df_politics.head())
print("polafil value counts:")
print(df_politics["polafil"].value_counts())

# clean HTML tags from text
TAG_RE = re.compile(r"<[^>]+>")

def clean_html(text: str) -> str:
    if text is None:
        return ""
    text = TAG_RE.sub(" ", text)          # remove tags
    text = text.replace("&nbsp;", " ")
    return " ".join(text.split())         # normalize whitespace

df_politics["text_raw"] = df_politics["text"].fillna("")
df_politics["text"] = df_politics["text_raw"].apply(clean_html)
df_politics["polafil"] = df_politics["polafil"].str.lower()

id2label = {int(k): v for k, v in model.config.id2label.items()}
label2id = {v: k for k, v in id2label.items()}
print("Model label mapping:", label2id)

polafil2stance = {
    "democrat": "Left",    # treat Dem as "Left"
    "republican": "Right", # treat Rep as "Right"
    "unknown": None,       # ignore unknown when computing metrics
}

df_politics["stance_gold"] = df_politics["polafil"].map(polafil2stance)

# keep only rows with a gold label
df_politics = df_politics[df_politics["stance_gold"].notna()].reset_index(drop=True)

df_politics["label_id_gold"] = df_politics["stance_gold"].map(label2id)

print(df_politics[["polafil", "stance_gold", "label_id_gold"]].head())
print("Final evaluation size:", len(df_politics))

# after mapping:
print("After mapping polafil -> stance_gold:")
print(df_politics["stance_gold"].value_counts(dropna=False))

# ensure no unmapped stance_gold
assert df_politics["label_id_gold"].notna().all(), "Some stance_gold not in model labels"
df_politics["label_id_gold"] = df_politics["label_id_gold"].astype(int)


   thread                      topic  index          date   uid     name  \
0       0  $100 Billion More, Please      0  2005/01/05 @  4740  cypress   
1       0  $100 Billion More, Please      1  2005/01/05 @     0    DrWho   
2       0  $100 Billion More, Please      2  2005/01/05 @  4740  cypress   
3       0  $100 Billion More, Please      3  2005/01/05 @  3508    DrWho   
4       0  $100 Billion More, Please      4  2005/01/05 @  4740  cypress   

    polafil                                               text textparts  
0  democrat  <br />From USA TODAY: Congress expects the Whi...   t0p0000  
1   unknown                                          cha ching   t0p0001  
2  democrat  Putting the total Cost of Bush's Iraq War near...   t0p0002  
3  l-fringe  "War costs complicate President Bush's plans f...   t0p0003  
4  democrat  How on earth is Bush going to pay for his war,...   t0p0004  
polafil value counts:
polafil
unknown         22232
democrat        19257
libertarian      96

In [None]:
# Build HF Dataset for polafil
ds_politics = Dataset.from_pandas(
    df_politics[["text", "label_id_gold"]],
    preserve_index=True,
)

def tokenize_polafil(batch):
    return tok(
        batch["text"],
        truncation=True,
        max_length=MAX_LEN,
    )

ds_politics_tok = ds_politics.map(tokenize_polafil, batched=True)

data_collator = DataCollatorWithPadding(tokenizer=tok)

In [None]:
from sklearn.metrics import accuracy_score, f1_score, classification_report

# reuse the same model and tokenizer
inference_trainer = Trainer(
    model=model,
    tokenizer=tok,
    data_collator=data_collator,
)

pred_output = inference_trainer.predict(
    ds_politics_tok.remove_columns(["label_id_gold"])
)

logits = pred_output.predictions
pred_ids = logits.argmax(axis=-1)
id2label = {int(k): v for k, v in model.config.id2label.items()}

pred_labels = [id2label[int(i)] for i in pred_ids]

y_true = df_politics["label_id_gold"].to_numpy()
y_pred = pred_ids

print("Accuracy:", accuracy_score(y_true, y_pred))
print("Macro F1:", f1_score(y_true, y_pred, average="macro"))

print("Per-class F1 (in id order):", f1_score(y_true, y_pred, average=None))
print()
print("Classification report:")
print(
    classification_report(
        y_true,
        y_pred,
        labels=sorted(id2label.keys()),
        target_names=[id2label[i] for i in sorted(id2label.keys())],
        digits=4,
    )
)

# Attach predictions and probabilities back to the dataframe
probs = torch.softmax(torch.tensor(logits), dim=-1).numpy()
df_politics["stance_id_pred"] = pred_ids
df_politics["stance_pred"] = pred_labels

for i, lab in id2label.items():
    df_politics[f"p_{lab}"] = probs[:, int(i)]

df_politics.to_json(
    "politics_labeled_with_gold.jsonl",
    orient="records",
    lines=True,
    force_ascii=False,
)

df_politics.head()

In [11]:
import re
import orjson
import pandas as pd
import numpy as np
import torch
from datasets import Dataset
from transformers import DataCollatorWithPadding, Trainer
from sklearn.metrics import accuracy_score, f1_score, classification_report

poltics_posts = "posts_201908161514.json"

with open(poltics_posts, "rb") as f:
    raw = f.read()

# strip UTF-8 BOM and leading whitespace/newlines
raw = raw.lstrip(b"\xef\xbb\xbf \t\r\n")

data = orjson.loads(raw)

posts = data["posts"]
df_politics = pd.DataFrame(posts)

print(df_politics.head())
print("polafil value counts:")
print(df_politics["polafil"].value_counts())

# normalize polafil to lowercase for mapping
df_politics["polafil"] = df_politics["polafil"].str.lower()

# clean HTML tags from text
TAG_RE = re.compile(r"<[^>]+>")

def clean_html(text: str) -> str:
    if text is None:
        return ""
    text = TAG_RE.sub(" ", text)
    text = text.replace("&nbsp;", " ")
    return " ".join(text.split())

df_politics["text_raw"] = df_politics["text"].fillna("")
df_politics["text"] = df_politics["text_raw"].apply(clean_html)

# map polafil to 3 class labels
id2label = {int(k): v for k, v in model.config.id2label.items()}
label2id = {v: k for k, v in id2label.items()}
print("Model label mapping:", label2id)

polafil2stance = {
    # Left
    "democrat":    "Left",
    "liberal":     "Left",
    "l-fringe":    "Left",
    "green":       "Left",

    # Right
    "republican":  "Right",
    "conservative":"Right",
    "r-fringe":    "Right",
    "libertarian": "Right",

    # Neutral
    "centrist":    "Neutral",
    "independent": "Neutral",

    "unknown":     None,
}

df_politics["stance_gold"] = df_politics["polafil"].map(polafil2stance)

print("After mapping polafil -> stance_gold:")
print(df_politics["stance_gold"].value_counts(dropna=False))

# keep only rows with a gold label
df_politics = df_politics[df_politics["stance_gold"].notna()].reset_index(drop=True)

df_politics["label_id_gold"] = df_politics["stance_gold"].map(label2id)

assert df_politics["label_id_gold"].notna().all(), "Some stance_gold not in model labels"
df_politics["label_id_gold"] = df_politics["label_id_gold"].astype(int)

print(df_politics[["polafil", "stance_gold", "label_id_gold"]].head())
print("Final evaluation size:", len(df_politics))

# build HF Dataset for polafil
ds_politics = Dataset.from_pandas(
    df_politics[["text", "label_id_gold"]],
    preserve_index=True,
)

def tokenize_politics(batch):
    return tok(
        batch["text"],
        truncation=True,
        max_length=MAX_LEN,
    )

ds_politics_tok = ds_politics.map(tokenize_politics, batched=True)
data_collator = DataCollatorWithPadding(tokenizer=tok)

inference_trainer = Trainer(
    model=model,
    tokenizer=tok,
    data_collator=data_collator,
)

pred_output = inference_trainer.predict(
    ds_politics_tok.remove_columns(["label_id_gold"])
)

logits = pred_output.predictions
pred_ids = logits.argmax(axis=-1)

id2label = {int(k): v for k, v in model.config.id2label.items()}
pred_labels = [id2label[int(i)] for i in pred_ids]

y_true = df_politics["label_id_gold"].to_numpy()
y_pred = pred_ids

print("Accuracy:", accuracy_score(y_true, y_pred))
print("Macro F1:", f1_score(y_true, y_pred, average="macro"))
print("Per-class F1 (in id order):", f1_score(y_true, y_pred, average=None))
print()
print("Classification report:")
print(
    classification_report(
        y_true,
        y_pred,
        labels=sorted(id2label.keys()),
        target_names=[id2label[i] for i in sorted(id2label.keys())],
        digits=4,
    )
)

# attach predictions and probabilities back to the dataframe
probs = torch.softmax(torch.tensor(logits), dim=-1).numpy()
df_politics["stance_id_pred"] = pred_ids
df_politics["stance_pred"] = pred_labels

for i, lab in id2label.items():
    df_politics[f"p_{lab}"] = probs[:, int(i)]

df_politics.to_json(
    "politics_labeled_with_gold.jsonl",
    orient="records",
    lines=True,
    force_ascii=False,
)

df_politics.head()


   thread                      topic  index          date   uid     name  \
0       0  $100 Billion More, Please      0  2005/01/05 @  4740  cypress   
1       0  $100 Billion More, Please      1  2005/01/05 @     0    DrWho   
2       0  $100 Billion More, Please      2  2005/01/05 @  4740  cypress   
3       0  $100 Billion More, Please      3  2005/01/05 @  3508    DrWho   
4       0  $100 Billion More, Please      4  2005/01/05 @  4740  cypress   

    polafil                                               text textparts  
0  democrat  <br />From USA TODAY: Congress expects the Whi...   t0p0000  
1   unknown                                          cha ching   t0p0001  
2  democrat  Putting the total Cost of Bush's Iraq War near...   t0p0002  
3  l-fringe  "War costs complicate President Bush's plans f...   t0p0003  
4  democrat  How on earth is Bush going to pay for his war,...   t0p0004  
polafil value counts:
polafil
unknown         22232
democrat        19257
libertarian      96

Map: 100%|██████████| 55622/55622 [00:02<00:00, 18970.66 examples/s]


Accuracy: 0.28510301679191685
Macro F1: 0.2695428434490326
Per-class F1 (in id order): [0.21748282 0.40969638 0.18144933]

Classification report:
              precision    recall  f1-score   support

        Left     0.5047    0.1386    0.2175     25345
       Right     0.4164    0.4032    0.4097     23073
     Neutral     0.1156    0.4221    0.1814      7204

    accuracy                         0.2851     55622
   macro avg     0.3455    0.3213    0.2695     55622
weighted avg     0.4176    0.2851    0.2925     55622



Unnamed: 0,thread,topic,index,date,uid,name,polafil,text,textparts,text_raw,stance_gold,label_id_gold,stance_id_pred,stance_pred,p_Left,p_Right,p_Neutral
0,0,"$100 Billion More, Please",0,2005/01/05 @,4740,cypress,democrat,From USA TODAY: Congress expects the White Hou...,t0p0000,<br />From USA TODAY: Congress expects the Whi...,Left,0,2,Neutral,0.001826,0.004635,0.993539
1,0,"$100 Billion More, Please",2,2005/01/05 @,4740,cypress,democrat,Putting the total Cost of Bush's Iraq War near...,t0p0002,Putting the total Cost of Bush's Iraq War near...,Left,0,0,Left,0.774734,0.035307,0.189959
2,0,"$100 Billion More, Please",3,2005/01/05 @,3508,DrWho,l-fringe,"""War costs complicate President Bush's plans f...",t0p0003,"""War costs complicate President Bush's plans f...",Left,0,2,Neutral,0.320784,0.195724,0.483492
3,0,"$100 Billion More, Please",4,2005/01/05 @,4740,cypress,democrat,"How on earth is Bush going to pay for his war,...",t0p0004,"How on earth is Bush going to pay for his war,...",Left,0,0,Left,0.928402,0.029768,0.041829
4,0,"$100 Billion More, Please",5,2005/01/05 @,3508,DrWho,l-fringe,Visa.,t0p0005,Visa.,Left,0,2,Neutral,0.004101,0.013329,0.98257


In [12]:
print("Gold label counts (Left/Neutral/Right):")
print(df_politics["stance_gold"].value_counts())

Gold label counts (Left/Neutral/Right):
stance_gold
Left       25345
Right      23073
Neutral     7204
Name: count, dtype: int64


In [13]:
from sklearn.metrics import confusion_matrix

cm = confusion_matrix(
    y_true,
    y_pred,
    labels=[label2id["Left"], label2id["Right"], label2id["Neutral"]],
)

print("Labels order:", ["Left", "Right", "Neutral"])
print(cm)

Labels order: ['Left', 'Right', 'Neutral']
[[ 3513  9801 12031]
 [ 2526  9304 11243]
 [  922  3241  3041]]


In [14]:
# restrict to clear D vs R authors
mask_dr = df_politics["polafil"].isin(["democrat", "republican"])
df_dr = df_politics[mask_dr].copy()

print("D/R subset size:", len(df_dr))
print(df_dr["polafil"].value_counts())

y_true_dr = df_dr["label_id_gold"].to_numpy()
y_pred_dr = df_dr["stance_id_pred"].to_numpy()

print("=== D vs R subset ===")
print("Accuracy:", accuracy_score(y_true_dr, y_pred_dr))
print("Macro F1:", f1_score(y_true_dr, y_pred_dr, average="macro"))
print(classification_report(
    y_true_dr,
    y_pred_dr,
    labels=[label2id["Left"], label2id["Right"]],
    target_names=["Left", "Right"],
    digits=4,
))


D/R subset size: 23576
polafil
democrat      19257
republican     4319
Name: count, dtype: int64
=== D vs R subset ===
Accuracy: 0.1740329148286393
Macro F1: 0.15522927024452837
              precision    recall  f1-score   support

        Left     0.8402    0.1262    0.2194     19257
       Right     0.1805    0.3874    0.2463      4319

   micro avg     0.3374    0.1740    0.2296     23576
   macro avg     0.5104    0.2568    0.2328     23576
weighted avg     0.7194    0.1740    0.2243     23576



In [15]:
# raw counts
tab_counts = pd.crosstab(df_politics["polafil"], df_politics["stance_pred"])
print("Counts:\n", tab_counts)

# row-normalized (distribution of predictions per polafil)
tab_props = pd.crosstab(
    df_politics["polafil"],
    df_politics["stance_pred"],
    normalize='index'
)
print("\nRow-normalized (P(pred | polafil)):\n", tab_props)

Counts:
 stance_pred   Left  Neutral  Right
polafil                           
centrist       143      408    593
conservative   896     3762   4384
democrat      2430     9232   7595
green          168      259    277
independent    779     2633   2648
l-fringe       241      735    499
liberal        674     1805   1430
libertarian   1165     5282   3225
r-fringe         3       15     22
republican     462     2184   1673

Row-normalized (P(pred | polafil)):
 stance_pred       Left   Neutral     Right
polafil                                   
centrist      0.125000  0.356643  0.518357
conservative  0.099093  0.416058  0.484848
democrat      0.126188  0.479410  0.394402
green         0.238636  0.367898  0.393466
independent   0.128548  0.434488  0.436964
l-fringe      0.163390  0.498305  0.338305
liberal       0.172423  0.461755  0.365822
libertarian   0.120451  0.546112  0.333437
r-fringe      0.075000  0.375000  0.550000
republican    0.106969  0.505673  0.387358


In [16]:
from sklearn.metrics import accuracy_score, f1_score

for paf, g in df_politics.groupby("polafil"):
    y_t = g["label_id_gold"].to_numpy()
    y_p = g["stance_id_pred"].to_numpy()
    acc = accuracy_score(y_t, y_p)
    f1  = f1_score(y_t, y_p, average="macro")
    print(f"{paf:12s}  n={len(g):5d}  acc={acc:.3f}  macroF1={f1:.3f}")

centrist      n= 1144  acc=0.357  macroF1=0.175
conservative  n= 9042  acc=0.485  macroF1=0.218
democrat      n=19257  acc=0.126  macroF1=0.075
green         n=  704  acc=0.239  macroF1=0.128
independent   n= 6060  acc=0.434  macroF1=0.202
l-fringe      n= 1475  acc=0.163  macroF1=0.094
liberal       n= 3909  acc=0.172  macroF1=0.098
libertarian   n= 9672  acc=0.333  macroF1=0.167
r-fringe      n=   40  acc=0.550  macroF1=0.237
republican    n= 4319  acc=0.387  macroF1=0.186
