# RPT (Research Paper Tagger)

In [1]:
import os
import zipfile
import json
import random
from tqdm import tqdm
import plotly
import plotly.express as px
import plotly.graph_objects as go

import numpy as np
import pandas as pd

from helpers import tokenize_and_format, flat_accuracy

import torch
from transformers import BertForSequenceClassification, AdamW, BertConfig, get_linear_schedule_with_warmup

from sklearn.metrics import precision_recall_fscore_support, top_k_accuracy_score

In [2]:
random.seed(0)
np.random.seed(0)

torch.manual_seed(0)
torch.use_deterministic_algorithms(False)
# Confirm that the GPU is detected

assert torch.cuda.is_available()

# Get the GPU device name.
device_name = torch.cuda.get_device_name()
n_gpu = torch.cuda.device_count()
print(f"Found device: {device_name}, n_gpu: {n_gpu}")
device = torch.device("cuda")

Found device: NVIDIA GeForce RTX 2060 with Max-Q Design, n_gpu: 1


In [3]:
with open("Data/Metadata/label_string_to_ID.jsonl", "r") as f:
    label_string_to_ID = json.load(f)
    
with open("Data/Metadata/label_ID_to_string.jsonl", "r") as f:
    label_ID_to_string = json.load(f)

In [4]:
with open("Title.txt", 'r', encoding='utf-8') as f:
    title = f.read()
    
with open("Abstract.txt", 'r', encoding='utf-8') as f:
    abstract = f.read()
    
with open("Authors.txt", 'r', encoding='utf-8') as f:
    authors = f.read()

In [5]:
title = title.lower()
abstract = abstract.lower()
authors = authors.lower()

### Bake the data

In [6]:
test_inputs = [title + '. ' + abstract + '. ' + authors]

In [7]:
max_seq_length = 396

test_input_ids, test_attention_masks = tokenize_and_format(test_inputs, max_seq_length)

In [8]:
test_input_ids = torch.cat(test_input_ids, dim=0)
test_attention_masks = torch.cat(test_attention_masks, dim=0)

In [9]:
test_set = [(test_input_ids[i], test_attention_masks[i]) for i in range(len(test_inputs))]

### Get the Predictions

In [10]:
best_hyperparameter_configuration = "Hyperparameter configuration 1"

model = BertForSequenceClassification.from_pretrained(
    "Saved models/" + best_hyperparameter_configuration + "/best validation accuracy model/",
    local_files_only = True,
    output_hidden_states = False, # Whether the model returns all hidden-states.
)

model.cuda()


batch_size = 8

def get_outputs(data_set):
    # Put the model in evaluation mode
    model.eval()

    num_batches = int(len(data_set)/batch_size) + 1

    total_correct = 0
    
    outputs = []
    
    all_predictions = []

    for i in range(num_batches):

        end_index = min(batch_size * (i+1), len(data_set))

        batch = data_set[i*batch_size:end_index]

        if len(batch) == 0: continue

        input_id_tensors = torch.stack([data[0] for data in batch])
        input_mask_tensors = torch.stack([data[1] for data in batch])

        # Move tensors to the GPU
        b_input_ids = input_id_tensors.to(device)
        b_input_mask = input_mask_tensors.to(device)

        # Tell pytorch not to bother with constructing the compute graph during
        # the forward pass, since this is only needed for backprop (training).
        with torch.no_grad():        

            # Forward pass, calculate logit predictions.
            
            outputs = model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask)
            
            logits = outputs.logits

            # Move logits and labels to CPU
            logits = logits.detach().cpu().numpy()

            # Calculate the number of correctly labeled examples in batch
            #pred_flat = np.argmax(logits, axis=1).flatten()
            preds_flat = [np.argpartition(logits[0], -3)[-3:]]
            
            for preds in preds_flat:
                for pred in preds:
                    print(label_ID_to_string[str(pred)])

    
    
get_outputs(test_set)

resources and evaluation
generation
nlp applications
