In [None]:
!pip install transformers
!pip install datasets
!pip install --upgrade --no-cache-dir gdown==4.5.4

!gdown 18oZZ4jqRK-uF-Nz6ftRdgNjKix88hrnO
!unzip data_and_models.zip && rm data_and_models.zip

seed = 12
import numpy as np
np.random.seed(seed)
import torch
torch.manual_seed(seed)
import random
random.seed(seed)

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.23.1-py3-none-any.whl (5.3 MB)
[K     |████████████████████████████████| 5.3 MB 5.0 MB/s 
Collecting huggingface-hub<1.0,>=0.10.0
  Downloading huggingface_hub-0.10.1-py3-none-any.whl (163 kB)
[K     |████████████████████████████████| 163 kB 61.9 MB/s 
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1
  Downloading tokenizers-0.13.1-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.6 MB)
[K     |████████████████████████████████| 7.6 MB 49.4 MB/s 
Installing collected packages: tokenizers, huggingface-hub, transformers
Successfully installed huggingface-hub-0.10.1 tokenizers-0.13.1 transformers-4.23.1
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting datasets
  Downloading datasets-2.6.1-py3-none-any.whl (441 kB)
[K     |████████████████████████████████| 441 kB 5.0 MB/

In [None]:
from collections import Counter, defaultdict
import csv
import time
import pickle

from datasets import load_metric
import pandas as pd
from sklearn.model_selection import train_test_split
from transformers import RobertaForSequenceClassification, RobertaTokenizerFast, Trainer, TrainingArguments

def compute_metrics(eval_preds):
    """Compute plain accuracy for the `Trainer` evaluation loop.

    Args:
        eval_preds: a pair ``(logits, labels)``; ``logits`` holds one row of
            class scores per example and ``labels`` the gold class indices.

    Returns:
        dict: ``{"accuracy": float}`` -- the key is the one referenced by
        ``metric_for_best_model`` in the training arguments.
    """
    logits, labels = eval_preds
    predictions = np.argmax(logits, axis=-1)
    # Computed directly with NumPy instead of calling load_metric("accuracy")
    # on every evaluation pass; the result is the same fraction of matches.
    labels = np.asarray(labels)
    return {"accuracy": float(np.mean(predictions == labels))}

def multi_class_top_one_accuracy(predictions, labels, class_i):
  """
  Top-1 accuracy restricted to the samples whose gold label is `class_i`.

  Args:
    predictions: sequence of per-sample score rows; position j of a row is
      the score of class j.
    labels: sequence of gold class indices, same length as `predictions`.
    class_i: the class index to evaluate.

  Returns:
    The accuracy as a string with three decimals (e.g. "0.550", "1.000").
    When no sample carries `class_i` the result is "0.000" instead of a
    ZeroDivisionError.
  """
  assert len(predictions) == len(labels)
  total = 0
  correct = 0
  for scores, label in zip(predictions, labels):
    if label != class_i:
      continue
    total += 1
    # Index of the highest score; max() keeps the first maximum, so ties
    # resolve to the lowest class index.
    top_class = max(range(len(scores)), key=lambda j: scores[j])
    if top_class == label:
      correct += 1
  if total == 0:
    # No support for this class: report zero rather than dividing by zero.
    return "0.000"
  return f"{correct / total:.3f}"

class PSCDataset(torch.utils.data.Dataset):
    """Map-style dataset pairing tokenizer encodings with integer labels.

    `encodings` is a mapping from field name to a per-example sequence, and
    `labels` is a sequence aligned with it. Indexing returns a dict of
    tensors with one entry per encoding field plus a 'labels' entry.
    """

    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __len__(self):
        # The number of examples is defined by the label sequence.
        return len(self.labels)

    def __getitem__(self, idx):
        sample = {}
        for field, values in self.encodings.items():
            sample[field] = torch.tensor(values[idx])
        sample['labels'] = torch.tensor(self.labels[idx])
        return sample

# ---- Run-wide settings ------------------------------------------------------
directory = "./data_and_models/"  # prefix for the corpus CSV and the pickled baseline files
mlength = 512                     # max_length handed to the tokenizer (inputs are truncated to it)
start = time.time()               # reference point for the total run-time report at the end

# ---- Fine-tuning hyper-parameters -------------------------------------------
training_args = TrainingArguments(
    # where checkpoints and logs are written
    output_dir='./results',
    logging_dir='./logs',
    logging_steps=10,
    # optimisation schedule
    num_train_epochs=20,
    learning_rate=2e-5,
    weight_decay=0.01,
    warmup_steps=0,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=64,
    # evaluate and checkpoint once per epoch, then restore the checkpoint
    # with the best dev-set accuracy
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",
    seed=11,
)

# ---- Task definitions -------------------------------------------------------
# Keyed by task name. "label_column" is the index of the corpus CSV column
# that holds the label; "number_of_labels" is the size of the classifier head.
tasks = {
    task_name: {"number_of_labels": label_count, "label_column": column}
    for task_name, label_count, column in (
        ("44", 42, 1),
        ("8", 8, 2),
    )
}

def compute_task(task):
  """Fine-tune RoBERTa on one task and compare it per class with the pickled baseline.

  Steps:
    1. Read `target_corpus.csv`, taking the text from column 0 and the label
       from the column configured for `task`.
    2. Split off 625 test and 625 dev examples (fixed random_state).
    3. Fine-tune `roberta-base` with the module-level `training_args` and
       predict on the test split.
    4. Load the pickled baseline classifier and vectorizer for `task` and
       score the corpus rows whose text appears in the test split.
    5. Compute per-class top-1 accuracy for both systems.

  Args:
    task: key into the module-level `tasks` dict (e.g. "44" or "8").

  Returns:
    list[str]: one LaTeX table row per class found in the baseline's test
    rows, sorted by descending class count and then by name. Each row is
    "& name & count & baseline_acc & lm_acc" followed by a LaTeX line break,
    with the strictly higher of the two accuracies wrapped in textbf.
  """
  label_column = tasks[task]["label_column"]
  classes = {}            # raw topic string -> integer label id
  lm_reverse_mapper = {}  # integer label id -> display name
  texts = []
  labels = []

  with open(directory + "target_corpus.csv") as doc:
    reader = csv.reader(doc)
    next(reader)  # skip the header row
    for row in reader:
      topic = row[label_column]
      if topic not in classes:
        # Label ids are assigned densely in order of first appearance.
        classes[topic] = len(classes)
        lm_reverse_mapper[classes[topic]] = topic.capitalize()
      labels.append(classes[topic])
      texts.append(row[0])

  print("# classes", len(classes))
  X_train, X_test, y_train, y_test = train_test_split(texts, labels, test_size=625, random_state=11)
  X_train, X_dev, y_train, y_dev = train_test_split(X_train, y_train, test_size=625, random_state=11)
  print(len(X_train), len(X_dev), len(X_test))
  print("# classes in train", len(set(y_train)))
  print("# classes in dev", len(set(y_dev)))
  print("# classes in test", len(set(y_test)))

  tokenizer = RobertaTokenizerFast.from_pretrained('roberta-base')
  train_encodings = tokenizer(X_train, truncation=True, padding=True, max_length=mlength)
  dev_encodings = tokenizer(X_dev, truncation=True, padding=True, max_length=mlength)
  test_encodings = tokenizer(X_test, truncation=True, padding=True, max_length=mlength)

  train_dataset = PSCDataset(train_encodings, y_train)
  dev_dataset = PSCDataset(dev_encodings, y_dev)
  test_dataset = PSCDataset(test_encodings, y_test)

  def model_init():
    # Passed as a factory so the Trainer builds the model itself.
    return RobertaForSequenceClassification.from_pretrained("roberta-base", num_labels=tasks[task]["number_of_labels"])

  trainer = Trainer(
      model_init=model_init,               # factory for the model to be trained
      args=training_args,                  # training arguments, defined above
      train_dataset=train_dataset,         # training dataset
      eval_dataset=dev_dataset,            # evaluation dataset
      compute_metrics=compute_metrics,     # accuracy on the dev set
      )
  trainer.train()

  predictions = trainer.predict(test_dataset)

  # SECURITY: pickle.load can execute arbitrary code. These files come from
  # the downloaded archive; only run this against a source you trust.
  with open(directory + "logistic_model_" + task + ".pkl", "rb") as doc:
    model = pickle.load(doc)
  with open(directory + "tfidf_" + task + ".pkl", "rb") as doc:
    tfidf_vectorizer = pickle.load(doc)

  # Baseline class names <-> column index in the baseline's probability output.
  # Lookup keys have spaces and hyphens replaced by dots.
  class_mapper = {}
  class_reverse_mapper = {}
  for i, topic in enumerate(model.classes_):
    class_mapper[topic.replace(" ", ".").replace("-", ".")] = i
    class_reverse_mapper[i] = topic

  df = pd.read_csv(directory + "target_corpus.csv")
  # NOTE(review): rows are selected by text equality, so a text that occurs
  # more than once in the corpus is included every time -- confirm uniqueness.
  df = df[df["text"].isin(X_test)]
  X = df['text']
  Y = list(df["topic_"+task].transform(lambda x: class_mapper[x]))

  Xtfidf = tfidf_vectorizer.transform(X)
  policy_probs = model.predict_proba(Xtfidf)

  counter = Counter(Y)
  results = []
  for class_i, count in counter.items():
    result = [class_reverse_mapper[class_i].capitalize(), count]
    result.append(multi_class_top_one_accuracy(policy_probs, Y, class_i))
    #sample result:
    #['Political authority', 140, '0.550']
    #['Welfare state expansion', 49, '0.694']
    results.append(result)

  # Most frequent classes first; ties broken alphabetically.
  results.sort(key = lambda result: [-result[1], result[0]])

  # Normalise the LM-side display names so they can be matched against the
  # baseline-side names used in `results`.
  for index, topic in lm_reverse_mapper.items():
    topic = topic.replace(".", " ")
    if "demographic" in topic:
      topic = "Non-economic demographic groups"
    lm_reverse_mapper[index]=topic

  # NOTE: a baseline class name with no LM counterpart yields "" here, and
  # float("") in the loop below then raises ValueError.
  lm_per_class_predictions = defaultdict(str)
  counter = Counter(y_test)
  for class_i, count in counter.items():
    lm_per_class_predictions[lm_reverse_mapper[class_i]] = multi_class_top_one_accuracy(predictions.predictions, y_test, class_i)
  outputs = []
  for result in results:
    result += [lm_per_class_predictions[result[0]]]
    # Bold whichever accuracy is strictly higher; leave both plain on a tie.
    if float(result[-1]) > float(result[-2]):
      result[-1] = "\\textbf{" + result[-1] + "}"
    elif float(result[-1]) < float(result[-2]):
      result[-2] = "\\textbf{" + result[-2] + "}"
    str_result = [str(i) for i in result]
    outputs.append("& " +" & ".join(str_result)+"\\\\")
  return outputs

# Run every configured task in turn. Results are stored as soon as a task
# finishes, so a failure in a later task keeps the earlier ones.
results = dict()
for task in tasks.keys():
  results.update({task: compute_task(task)})

# classes 42
2915 625 625
# classes in train 42
# classes in dev 36
# classes in test 35


Downloading:   0%|          | 0.00/899k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/481 [00:00<?, ?B/s]

loading configuration file config.json from cache at /root/.cache/huggingface/hub/models--roberta-base/snapshots/ff46155979338ff8063cdad90908b498ab91b181/config.json
Model config RobertaConfig {
  "architectures": [
    "RobertaForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "classifier_dropout": null,
  "eos_token_id": 2,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "id2label": {
    "0": "LABEL_0",
    "1": "LABEL_1",
    "2": "LABEL_2",
    "3": "LABEL_3",
    "4": "LABEL_4",
    "5": "LABEL_5",
    "6": "LABEL_6",
    "7": "LABEL_7",
    "8": "LABEL_8",
    "9": "LABEL_9",
    "10": "LABEL_10",
    "11": "LABEL_11",
    "12": "LABEL_12",
    "13": "LABEL_13",
    "14": "LABEL_14",
    "15": "LABEL_15",
    "16": "LABEL_16",
    "17": "LABEL_17",
    "18": "LABEL_18",
    "19": "LABEL_19",
    "20": "LABEL_20",
    "21": "LABEL_21",
    "22": "LABEL_22",
    "23": "LABEL_23",
    "24": "LABEL_24",
    "25": "LABEL_25",

Downloading:   0%|          | 0.00/501M [00:00<?, ?B/s]

loading weights file pytorch_model.bin from cache at /root/.cache/huggingface/hub/models--roberta-base/snapshots/ff46155979338ff8063cdad90908b498ab91b181/pytorch_model.bin
Some weights of the model checkpoint at roberta-base were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.bias', 'lm_head.decoder.weight', 'lm_head.bias', 'lm_head.dense.bias', 'lm_head.layer_norm.weight', 'roberta.pooler.dense.weight', 'lm_head.layer_norm.bias', 'lm_head.dense.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some 

Epoch,Training Loss,Validation Loss,Accuracy
1,2.4303,2.288778,0.4176
2,1.9171,1.934722,0.5168
3,1.5825,1.823273,0.5424
4,1.3624,1.823743,0.5488
5,0.9723,1.889241,0.5232
6,0.848,1.888951,0.5344
7,0.5739,2.010903,0.5328
8,0.4092,1.98929,0.5536
9,0.3174,2.029555,0.5632
10,0.2309,2.128701,0.5328


***** Running Evaluation *****
  Num examples = 625
  Batch size = 64
  if sys.path[0] == '':


Downloading builder script:   0%|          | 0.00/1.65k [00:00<?, ?B/s]

Saving model checkpoint to ./results/checkpoint-183
Configuration saved in ./results/checkpoint-183/config.json
Model weights saved in ./results/checkpoint-183/pytorch_model.bin
***** Running Evaluation *****
  Num examples = 625
  Batch size = 64
Saving model checkpoint to ./results/checkpoint-366
Configuration saved in ./results/checkpoint-366/config.json
Model weights saved in ./results/checkpoint-366/pytorch_model.bin
***** Running Evaluation *****
  Num examples = 625
  Batch size = 64
Saving model checkpoint to ./results/checkpoint-549
Configuration saved in ./results/checkpoint-549/config.json
Model weights saved in ./results/checkpoint-549/pytorch_model.bin
***** Running Evaluation *****
  Num examples = 625
  Batch size = 64
Saving model checkpoint to ./results/checkpoint-732
Configuration saved in ./results/checkpoint-732/config.json
Model weights saved in ./results/checkpoint-732/pytorch_model.bin
***** Running Evaluation *****
  Num examples = 625
  Batch size = 64
Saving m

# classes 8
2915 625 625
# classes in train 8
# classes in dev 8
# classes in test 8


loading file vocab.json from cache at /root/.cache/huggingface/hub/models--roberta-base/snapshots/ff46155979338ff8063cdad90908b498ab91b181/vocab.json
loading file merges.txt from cache at /root/.cache/huggingface/hub/models--roberta-base/snapshots/ff46155979338ff8063cdad90908b498ab91b181/merges.txt
loading file tokenizer.json from cache at /root/.cache/huggingface/hub/models--roberta-base/snapshots/ff46155979338ff8063cdad90908b498ab91b181/tokenizer.json
loading file added_tokens.json from cache at None
loading file special_tokens_map.json from cache at None
loading file tokenizer_config.json from cache at None
loading configuration file config.json from cache at /root/.cache/huggingface/hub/models--roberta-base/snapshots/ff46155979338ff8063cdad90908b498ab91b181/config.json
Model config RobertaConfig {
  "_name_or_path": "roberta-base",
  "architectures": [
    "RobertaForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "classifier_dropout": null,
  "eos_toke

Epoch,Training Loss,Validation Loss,Accuracy
1,1.3199,1.205588,0.5776
2,0.9789,1.116255,0.6192
3,0.8163,1.130859,0.6336
4,0.8216,1.280719,0.632
5,0.4968,1.309494,0.6288
6,0.1466,1.554289,0.6096
7,0.1655,1.672504,0.6256
8,0.1786,1.918652,0.6288
9,0.0383,2.086765,0.6384
10,0.116,2.2943,0.6256


***** Running Evaluation *****
  Num examples = 625
  Batch size = 64
Saving model checkpoint to ./results/checkpoint-183
Configuration saved in ./results/checkpoint-183/config.json
Model weights saved in ./results/checkpoint-183/pytorch_model.bin
***** Running Evaluation *****
  Num examples = 625
  Batch size = 64
Saving model checkpoint to ./results/checkpoint-366
Configuration saved in ./results/checkpoint-366/config.json
Model weights saved in ./results/checkpoint-366/pytorch_model.bin
***** Running Evaluation *****
  Num examples = 625
  Batch size = 64
Saving model checkpoint to ./results/checkpoint-549
Configuration saved in ./results/checkpoint-549/config.json
Model weights saved in ./results/checkpoint-549/pytorch_model.bin
***** Running Evaluation *****
  Num examples = 625
  Batch size = 64
Saving model checkpoint to ./results/checkpoint-732
Configuration saved in ./results/checkpoint-732/config.json
Model weights saved in ./results/checkpoint-732/pytorch_model.bin
***** Ru

https://scikit-learn.org/stable/modules/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/modules/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/modules/model_persistence.html#security-maintainability-limitations


In [None]:
# Print the LaTeX table rows for each task under a plain-text banner.
for task in tasks:
  banner = "==== " + task + "-Topic Classification ===="
  print(banner)
  for result in results[task]:
    print(result)

==== 44-Topic Classification ====
& Political authority & 140 & 0.550 & \textbf{0.657}\\
& Welfare state expansion & 49 & 0.694 & \textbf{0.714}\\
& Democracy & 44 & 0.318 & \textbf{0.341}\\
& No topic & 32 & 0.000 & \textbf{0.438}\\
& Labour groups & 31 & 0.387 & \textbf{0.484}\\
& Education & 26 & \textbf{0.885} & 0.846\\
& Constitutionalism & 24 & 0.000 & \textbf{0.458}\\
& Economic orthodoxy & 21 & 0.238 & \textbf{0.571}\\
& Governmental and administrative efficiency & 21 & 0.238 & 0.238\\
& Technology and infrastructure & 21 & 0.333 & \textbf{0.524}\\
& Law and order & 20 & 0.650 & \textbf{0.700}\\
& Multiculturalism & 19 & 0.632 & \textbf{0.842}\\
& Equality & 18 & \textbf{0.389} & 0.278\\
& Free market economy & 15 & 0.000 & \textbf{0.267}\\
& Economic growth & 13 & 0.615 & \textbf{0.769}\\
& Freedom and human rights & 13 & 0.000 & \textbf{0.231}\\
& Market regulation & 12 & 0.167 & \textbf{0.333}\\
& Traditional morality & 12 & 0.250 & \textbf{0.333}\\
& Military & 11 & 0.727 &

In [None]:
end = time.time()
# Floor division of floats yields a float ("52.0"); convert so the message
# shows whole minutes as an integer.
print(f"The program took {int((end - start) // 60)} minutes in total.")

The program took 52.0 minutes in total.


In [None]:
# Release the Colab runtime once everything has finished. Outside Colab the
# `google.colab` package is not importable, so the step is skipped instead of
# ending the notebook with an ImportError.
try:
  from google.colab import runtime
except ImportError:
  runtime = None
else:
  runtime.unassign()