# RPT (Research Paper Tagger)


This Jupyter notebook contains the code that trains RPT. We take a pre-trained BERT model and fine-tune it on the ACL data that we have collected.

In [1]:
import os
import zipfile
import json
import random


import numpy as np
import pandas as pd

from helpers import tokenize_and_format, flat_accuracy

import torch
from transformers import BertForSequenceClassification, AdamW, BertConfig, get_linear_schedule_with_warmup

In [2]:
# Seed Python, NumPy and PyTorch with the same value so that repeated runs
# start from the same random state.
for _seed_fn in (random.seed, np.random.seed, torch.manual_seed):
    _seed_fn(0)

# Deterministic-algorithm enforcement is explicitly left off.
# NOTE(review): with this disabled, some GPU kernels may still be
# non-deterministic despite the seeds above — confirm this is intended.
torch.use_deterministic_algorithms(False)

# Everything below runs on CUDA, so stop right away if no GPU is visible.
assert torch.cuda.is_available()

# Report which GPU was found and how many are available.
n_gpu = torch.cuda.device_count()
device_name = torch.cuda.get_device_name()
print(f"Found device: {device_name}, n_gpu: {n_gpu}")

# Handle used later to move batches onto the GPU.
device = torch.device("cuda")

Found device: NVIDIA GeForce RTX 2060 with Max-Q Design, n_gpu: 1


In [3]:
def _read_json(path):
    """Open ``path`` for reading and parse its whole content with ``json.load``."""
    with open(path, "r") as handle:
        return json.load(handle)


# NOTE: the files carry a .jsonl extension but each one is parsed as a single
# JSON document, not line by line.

# The three data splits.
training_data = _read_json("Data/Raw data/training_data.jsonl")
validation_data = _read_json("Data/Raw data/validation_data.jsonl")
test_data = _read_json("Data/Raw data/test_data.jsonl")

# Lookup tables between label strings and label IDs (one per direction).
label_string_to_ID = _read_json("Data/Metadata/label_string_to_ID.jsonl")
label_ID_to_string = _read_json("Data/Metadata/label_ID_to_string.jsonl")

### 1) Prediction using only title

In [4]:
# For every split, the model input is example[0][0] (the title, per the
# section heading) and the label string is example[1].
training_inputs_1 = [example[0][0] for example in training_data]
training_label_strings_1 = [example[1] for example in training_data]

validation_inputs_1 = [example[0][0] for example in validation_data]
validation_label_strings_1 = [example[1] for example in validation_data]

test_inputs_1 = [example[0][0] for example in test_data]
test_label_strings_1 = [example[1] for example in test_data]

In [5]:
# Tokenize the title-only inputs of all three splits, in train/validation/test
# order. 32 is passed as the second argument of tokenize_and_format
# (presumably the maximum sequence length — TODO confirm against helpers.py).
(
    (training_input_ids_1, training_attention_masks_1),
    (validation_input_ids_1, validation_attention_masks_1),
    (test_input_ids_1, test_attention_masks_1),
) = [
    tokenize_and_format(split_texts, 32)
    for split_texts in (training_inputs_1, validation_inputs_1, test_inputs_1)
]

In [6]:
# NOTE: this cell rebinds the *_input_ids_1 / *_attention_masks_1 names from
# the tokenizer output to the concatenated tensors, so it is meant to be run
# once, right after the tokenization cell.

# Training split: concatenate the per-example tensors along dim 0 and map
# every label string to its integer ID.
training_input_ids_1 = torch.cat(training_input_ids_1, dim=0)
training_attention_masks_1 = torch.cat(training_attention_masks_1, dim=0)
training_label_IDs_1 = torch.tensor(
    [label_string_to_ID[label] for label in training_label_strings_1]
)

# Validation split.
validation_input_ids_1 = torch.cat(validation_input_ids_1, dim=0)
validation_attention_masks_1 = torch.cat(validation_attention_masks_1, dim=0)
validation_label_IDs_1 = torch.tensor(
    [label_string_to_ID[label] for label in validation_label_strings_1]
)

# Test split.
test_input_ids_1 = torch.cat(test_input_ids_1, dim=0)
test_attention_masks_1 = torch.cat(test_attention_masks_1, dim=0)
test_label_IDs_1 = torch.tensor(
    [label_string_to_ID[label] for label in test_label_strings_1]
)

In [7]:
# Each dataset is a list with one (input_ids, attention_mask, label_id)
# triple per example of the split.
train_set_1 = [
    (training_input_ids_1[row], training_attention_masks_1[row], training_label_IDs_1[row])
    for row in range(len(training_inputs_1))
]

val_set_1 = [
    (validation_input_ids_1[row], validation_attention_masks_1[row], validation_label_IDs_1[row])
    for row in range(len(validation_inputs_1))
]

test_set_1 = [
    (test_input_ids_1[row], test_attention_masks_1[row], test_label_IDs_1[row])
    for row in range(len(test_inputs_1))
]

In [None]:
# Debugging aid: uncomment to smoke-test the pipeline on the first five training examples.
#train_set_1 = train_set_1[:5]

In [8]:
# Options for the sequence-classification model.
classifier_options = {
    "num_labels": 20,               # The number of output labels.
    "output_attentions": False,     # Do not return attention weights.
    "output_hidden_states": False,  # Do not return all hidden states.
}

# Start from the pre-trained 12-layer BERT model with an uncased vocab.
model = BertForSequenceClassification.from_pretrained("bert-base-uncased", **classifier_options)

# Run the model on the GPU. Being the last expression of the cell, this also
# displays the module structure.
model.cuda()

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.bias', 'cls.predictions.bias', 'cls.seq_relationship.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, element

### Fine-tune the BERT model

In [9]:
# Fine-tuning hyperparameters

epochs = 10
batch_size = 16

# Both values below match the defaults quoted in the original comments
# (args.learning_rate = 5e-5, args.adam_epsilon = 1e-8).
# NOTE(review): the AdamW class shipped with transformers is deprecated in
# favour of torch.optim.AdamW — consider migrating (the two differ in their
# default weight decay, so check that when switching).
optimizer = AdamW(model.parameters(), lr=5e-5, eps=1e-8)


# function to get validation accuracy
def get_validation_performance(val_set):
    """Return the accuracy of the notebook-level ``model`` on ``val_set``.

    The function reads the notebook globals ``model``, ``batch_size`` and
    ``device``. It switches the model to evaluation mode and leaves it there;
    the caller must call ``model.train()`` before training again.

    Args:
        val_set: list of ``(input_ids, attention_mask, label)`` tensor
            triples, one per example.

    Returns:
        Number of correctly classified examples divided by ``len(val_set)``.
        An empty ``val_set`` yields ``0.0`` instead of a division by zero.
    """
    # Put the model in evaluation mode
    model.eval()

    if len(val_set) == 0:
        return 0.0

    total_correct = 0

    # Stepping by batch_size visits every example exactly once and never
    # produces an empty trailing batch.
    for start in range(0, len(val_set), batch_size):
        batch = val_set[start:start + batch_size]

        # Assemble the batch and move the model inputs to the target device.
        b_input_ids = torch.stack([data[0] for data in batch]).to(device)
        b_input_mask = torch.stack([data[1] for data in batch]).to(device)
        b_labels = torch.stack([data[2] for data in batch])

        # No compute graph is needed for evaluation. The labels are not
        # passed to the model because the loss is not used here.
        with torch.no_grad():
            outputs = model(b_input_ids,
                            token_type_ids=None,
                            attention_mask=b_input_mask)

        # Compare the arg-max prediction with the gold label on the CPU.
        logits = outputs.logits.detach().cpu().numpy()
        label_ids = b_labels.cpu().numpy()
        pred_flat = np.argmax(logits, axis=1).flatten()
        total_correct += np.sum(pred_flat == label_ids.flatten())

    # Report the final accuracy for this validation run.
    return total_correct / len(val_set)



# training loop
#
# Makes `epochs` passes over train_set_1 in fixed-order mini-batches of
# `batch_size`. After every epoch it prints the summed training loss and the
# accuracy on val_set_1.

# Ceiling division gives exactly the number of non-empty batches, so the
# progress counter stays correct even when the training set size is a
# multiple of batch_size.
num_batches = (len(train_set_1) + batch_size - 1) // batch_size

# For each epoch...
for epoch_i in range(epochs):
    # Perform one full pass over the training set.

    print("")
    print('======== Epoch {:} / {:} ========'.format(epoch_i + 1, epochs))
    print('Training...')

    # Reset the total loss for this epoch.
    total_train_loss = 0

    # Put the model into training mode (validation leaves it in eval mode).
    model.train()

    # For each batch of training data...
    for i in range(num_batches):

        print("Batch " + str(i) + " out of " + str(num_batches) + " batches.")

        # Slicing clamps at the end of the list, so the last batch may be short.
        batch = train_set_1[i * batch_size:(i + 1) * batch_size]

        input_id_tensors = torch.stack([data[0] for data in batch])
        input_mask_tensors = torch.stack([data[1] for data in batch])
        label_tensors = torch.stack([data[2] for data in batch])

        # Move tensors to the GPU
        b_input_ids = input_id_tensors.to(device)
        b_input_mask = input_mask_tensors.to(device)
        b_labels = label_tensors.to(device)

        # Clear the previously calculated gradient
        model.zero_grad()

        # Forward pass; passing the labels makes the model return the loss.
        outputs = model(b_input_ids,
                        token_type_ids=None,
                        attention_mask=b_input_mask,
                        labels=b_labels)
        loss = outputs.loss

        total_train_loss += loss.item()

        # Perform a backward pass to calculate the gradients.
        loss.backward()

        # Update parameters and take a step using the computed gradient.
        optimizer.step()

    # ========================================
    #               Validation
    # ========================================
    # After the completion of each training epoch, measure our performance on
    # our validation set.
    print(f"Total loss: {total_train_loss}")
    val_acc = get_validation_performance(val_set_1)
    print(f"Validation accuracy: {val_acc}")

print("")
print("Training complete!")


Training...
Batch 0 out of 87 batches.
Batch 1 out of 87 batches.
Batch 2 out of 87 batches.
Batch 3 out of 87 batches.
Batch 4 out of 87 batches.
Batch 5 out of 87 batches.
Batch 6 out of 87 batches.
Batch 7 out of 87 batches.
Batch 8 out of 87 batches.
Batch 9 out of 87 batches.
Batch 10 out of 87 batches.
Batch 11 out of 87 batches.
Batch 12 out of 87 batches.
Batch 13 out of 87 batches.
Batch 14 out of 87 batches.
Batch 15 out of 87 batches.
Batch 16 out of 87 batches.
Batch 17 out of 87 batches.
Batch 18 out of 87 batches.
Batch 19 out of 87 batches.
Batch 20 out of 87 batches.
Batch 21 out of 87 batches.
Batch 22 out of 87 batches.
Batch 23 out of 87 batches.
Batch 24 out of 87 batches.
Batch 25 out of 87 batches.
Batch 26 out of 87 batches.
Batch 27 out of 87 batches.
Batch 28 out of 87 batches.
Batch 29 out of 87 batches.
Batch 30 out of 87 batches.
Batch 31 out of 87 batches.
Batch 32 out of 87 batches.
Batch 33 out of 87 batches.
Batch 34 out of 87 batches.
Batch 35 out of 8

Batch 21 out of 87 batches.
Batch 22 out of 87 batches.
Batch 23 out of 87 batches.
Batch 24 out of 87 batches.
Batch 25 out of 87 batches.
Batch 26 out of 87 batches.
Batch 27 out of 87 batches.
Batch 28 out of 87 batches.
Batch 29 out of 87 batches.
Batch 30 out of 87 batches.
Batch 31 out of 87 batches.
Batch 32 out of 87 batches.
Batch 33 out of 87 batches.
Batch 34 out of 87 batches.
Batch 35 out of 87 batches.
Batch 36 out of 87 batches.
Batch 37 out of 87 batches.
Batch 38 out of 87 batches.
Batch 39 out of 87 batches.
Batch 40 out of 87 batches.
Batch 41 out of 87 batches.
Batch 42 out of 87 batches.
Batch 43 out of 87 batches.
Batch 44 out of 87 batches.
Batch 45 out of 87 batches.
Batch 46 out of 87 batches.
Batch 47 out of 87 batches.
Batch 48 out of 87 batches.
Batch 49 out of 87 batches.
Batch 50 out of 87 batches.
Batch 51 out of 87 batches.
Batch 52 out of 87 batches.
Batch 53 out of 87 batches.
Batch 54 out of 87 batches.
Batch 55 out of 87 batches.
Batch 56 out of 87 b

Batch 42 out of 87 batches.
Batch 43 out of 87 batches.
Batch 44 out of 87 batches.
Batch 45 out of 87 batches.
Batch 46 out of 87 batches.
Batch 47 out of 87 batches.
Batch 48 out of 87 batches.
Batch 49 out of 87 batches.
Batch 50 out of 87 batches.
Batch 51 out of 87 batches.
Batch 52 out of 87 batches.
Batch 53 out of 87 batches.
Batch 54 out of 87 batches.
Batch 55 out of 87 batches.
Batch 56 out of 87 batches.
Batch 57 out of 87 batches.
Batch 58 out of 87 batches.
Batch 59 out of 87 batches.
Batch 60 out of 87 batches.
Batch 61 out of 87 batches.
Batch 62 out of 87 batches.
Batch 63 out of 87 batches.
Batch 64 out of 87 batches.
Batch 65 out of 87 batches.
Batch 66 out of 87 batches.
Batch 67 out of 87 batches.
Batch 68 out of 87 batches.
Batch 69 out of 87 batches.
Batch 70 out of 87 batches.
Batch 71 out of 87 batches.
Batch 72 out of 87 batches.
Batch 73 out of 87 batches.
Batch 74 out of 87 batches.
Batch 75 out of 87 batches.
Batch 76 out of 87 batches.
Batch 77 out of 87 b

Batch 63 out of 87 batches.
Batch 64 out of 87 batches.
Batch 65 out of 87 batches.
Batch 66 out of 87 batches.
Batch 67 out of 87 batches.
Batch 68 out of 87 batches.
Batch 69 out of 87 batches.
Batch 70 out of 87 batches.
Batch 71 out of 87 batches.
Batch 72 out of 87 batches.
Batch 73 out of 87 batches.
Batch 74 out of 87 batches.
Batch 75 out of 87 batches.
Batch 76 out of 87 batches.
Batch 77 out of 87 batches.
Batch 78 out of 87 batches.
Batch 79 out of 87 batches.
Batch 80 out of 87 batches.
Batch 81 out of 87 batches.
Batch 82 out of 87 batches.
Batch 83 out of 87 batches.
Batch 84 out of 87 batches.
Batch 85 out of 87 batches.
Batch 86 out of 87 batches.
Total loss: 9.883835550397635
Validation accuracy: 0.6

Training complete!


### 4) Prediction using title, abstract, and the list of authors

In [None]:
def _join_fields(fields):
    """Build one model input string from the fields of an example.

    The pieces are joined with ' [SEP] ' in the order fields[0], fields[2],
    fields[1]; every ' |' in fields[1] is replaced by ','.
    Presumably fields[0] is the title, fields[2] the abstract and fields[1]
    the ' |'-separated author list — TODO confirm against the data files.
    """
    return ' [SEP] '.join((fields[0], fields[2], fields[1].replace(' |', ',')))


# For every split, the model input is built from example[0] and the label
# string is example[1].
training_inputs = [_join_fields(example[0]) for example in training_data]
training_label_strings = [example[1] for example in training_data]

validation_inputs = [_join_fields(example[0]) for example in validation_data]
validation_label_strings = [example[1] for example in validation_data]

test_inputs = [_join_fields(example[0]) for example in test_data]
test_label_strings = [example[1] for example in test_data]

In [None]:
# Tokenize the combined inputs of all three splits, in train/validation/test
# order, with tokenize_and_format's default settings (no second argument).
(
    (training_input_ids, training_attention_masks),
    (validation_input_ids, validation_attention_masks),
    (test_input_ids, test_attention_masks),
) = [
    tokenize_and_format(split_texts)
    for split_texts in (training_inputs, validation_inputs, test_inputs)
]

In [None]:
# NOTE: this cell rebinds the *_input_ids / *_attention_masks names from the
# tokenizer output to the concatenated tensors, so it is meant to be run
# once, right after the tokenization cell.

# Training split: concatenate the per-example tensors along dim 0 and map
# every label string to its integer ID.
training_input_ids = torch.cat(training_input_ids, dim=0)
training_attention_masks = torch.cat(training_attention_masks, dim=0)
training_label_IDs = torch.tensor(
    [label_string_to_ID[label] for label in training_label_strings]
)

# Validation split.
validation_input_ids = torch.cat(validation_input_ids, dim=0)
validation_attention_masks = torch.cat(validation_attention_masks, dim=0)
validation_label_IDs = torch.tensor(
    [label_string_to_ID[label] for label in validation_label_strings]
)

# Test split.
test_input_ids = torch.cat(test_input_ids, dim=0)
test_attention_masks = torch.cat(test_attention_masks, dim=0)
test_label_IDs = torch.tensor(
    [label_string_to_ID[label] for label in test_label_strings]
)

In [None]:
# Each dataset is a list with one (input_ids, attention_mask, label_id)
# triple per example of the split.
train_set = [
    (training_input_ids[row], training_attention_masks[row], training_label_IDs[row])
    for row in range(len(training_inputs))
]

val_set = [
    (validation_input_ids[row], validation_attention_masks[row], validation_label_IDs[row])
    for row in range(len(validation_inputs))
]

test_set = [
    (test_input_ids[row], test_attention_masks[row], test_label_IDs[row])
    for row in range(len(test_inputs))
]

In [None]:
# Options for the sequence-classification model.
classifier_options = {
    "num_labels": 20,               # The number of output labels.
    "output_attentions": False,     # Do not return attention weights.
    "output_hidden_states": False,  # Do not return all hidden states.
}

# Start again from the pre-trained 12-layer BERT model with an uncased vocab;
# this rebinds `model`, replacing any model created earlier in the notebook.
model = BertForSequenceClassification.from_pretrained("bert-base-uncased", **classifier_options)

# Run the model on the GPU. Being the last expression of the cell, this also
# displays the module structure.
model.cuda()

In [None]:
# Fine-tuning hyperparameters

epochs = 5
batch_size = 16

# Both values below match the defaults quoted in the original comments
# (args.learning_rate = 5e-5, args.adam_epsilon = 1e-8).
# NOTE(review): the AdamW class shipped with transformers is deprecated in
# favour of torch.optim.AdamW — consider migrating (the two differ in their
# default weight decay, so check that when switching).
optimizer = AdamW(model.parameters(), lr=5e-5, eps=1e-8)


# function to get validation accuracy
# NOTE: this notebook defines a function with this name in the title-only
# section as well; whichever cell ran last is the definition in effect.
def get_validation_performance(val_set):
    """Return the accuracy of the notebook-level ``model`` on ``val_set``.

    The function reads the notebook globals ``model``, ``batch_size`` and
    ``device``. It switches the model to evaluation mode and leaves it there;
    the caller must call ``model.train()`` before training again.

    Args:
        val_set: list of ``(input_ids, attention_mask, label)`` tensor
            triples, one per example.

    Returns:
        Number of correctly classified examples divided by ``len(val_set)``.
        An empty ``val_set`` yields ``0.0`` instead of a division by zero.
    """
    # Put the model in evaluation mode
    model.eval()

    if len(val_set) == 0:
        return 0.0

    total_correct = 0

    # Stepping by batch_size visits every example exactly once and never
    # produces an empty trailing batch.
    for start in range(0, len(val_set), batch_size):
        batch = val_set[start:start + batch_size]

        # Assemble the batch and move the model inputs to the target device.
        b_input_ids = torch.stack([data[0] for data in batch]).to(device)
        b_input_mask = torch.stack([data[1] for data in batch]).to(device)
        b_labels = torch.stack([data[2] for data in batch])

        # No compute graph is needed for evaluation. The labels are not
        # passed to the model because the loss is not used here.
        with torch.no_grad():
            outputs = model(b_input_ids,
                            token_type_ids=None,
                            attention_mask=b_input_mask)

        # Compare the arg-max prediction with the gold label on the CPU.
        logits = outputs.logits.detach().cpu().numpy()
        label_ids = b_labels.cpu().numpy()
        pred_flat = np.argmax(logits, axis=1).flatten()
        total_correct += np.sum(pred_flat == label_ids.flatten())

    # Report the final accuracy for this validation run.
    return total_correct / len(val_set)



# training loop
#
# Makes `epochs` passes over train_set in fixed-order mini-batches of
# `batch_size`. After every epoch it prints the summed training loss and the
# accuracy on val_set.

# Ceiling division gives exactly the number of non-empty batches, so the
# progress counter stays correct even when the training set size is a
# multiple of batch_size.
num_batches = (len(train_set) + batch_size - 1) // batch_size

# For each epoch...
for epoch_i in range(epochs):
    # Perform one full pass over the training set.

    print("")
    print('======== Epoch {:} / {:} ========'.format(epoch_i + 1, epochs))
    print('Training...')

    # Reset the total loss for this epoch.
    total_train_loss = 0

    # Put the model into training mode (validation leaves it in eval mode).
    model.train()

    # For each batch of training data...
    for i in range(num_batches):

        print("Batch " + str(i) + " out of " + str(num_batches) + " batches.")

        # Slicing clamps at the end of the list, so the last batch may be short.
        batch = train_set[i * batch_size:(i + 1) * batch_size]

        input_id_tensors = torch.stack([data[0] for data in batch])
        input_mask_tensors = torch.stack([data[1] for data in batch])
        label_tensors = torch.stack([data[2] for data in batch])

        # Move tensors to the GPU
        b_input_ids = input_id_tensors.to(device)
        b_input_mask = input_mask_tensors.to(device)
        b_labels = label_tensors.to(device)

        # Clear the previously calculated gradient
        model.zero_grad()

        # Forward pass; passing the labels makes the model return the loss.
        outputs = model(b_input_ids,
                        token_type_ids=None,
                        attention_mask=b_input_mask,
                        labels=b_labels)
        loss = outputs.loss

        total_train_loss += loss.item()

        # Perform a backward pass to calculate the gradients.
        loss.backward()

        # Update parameters and take a step using the computed gradient.
        optimizer.step()

    # ========================================
    #               Validation
    # ========================================
    # After the completion of each training epoch, measure our performance on
    # our validation set.
    print(f"Total loss: {total_train_loss}")
    val_acc = get_validation_performance(val_set)
    print(f"Validation accuracy: {val_acc}")

print("")
print("Training complete!")