In [1]:
import logging
import progressbar
import os
from copy import deepcopy
import pandas as pd
from search_engine import Search

In [2]:
from transformers import BertTokenizer, AutoModelForSequenceClassification

In [3]:
from dotenv import load_dotenv
from pathlib import Path

# Locate the repository root (the nearest directory named
# 'scientific-knowledge-distiller' at or above the current working directory)
# and load the `.env` file stored there.
#
# The lookup walks the parent chain of the working directory instead of
# repeatedly calling os.chdir('..'): that loop never terminated when the
# notebook was started outside the repository (it got stuck at the filesystem
# root), and it changed the process-wide working directory while searching.
initial_wd = os.getcwd()
_start_dir = Path(initial_wd)
root_path = next(
    (
        str(candidate)
        for candidate in (_start_dir, *_start_dir.parents)
        if candidate.name == 'scientific-knowledge-distiller'
    ),
    None,
)
if root_path is None:
    # Fail loudly rather than hanging or silently loading no configuration.
    raise FileNotFoundError(
        "No 'scientific-knowledge-distiller' directory found at or above " + initial_wd
    )

# Kept as the last expression so the cell still displays load_dotenv's return value.
load_dotenv(dotenv_path=Path(root_path) / '.env')

True

In [4]:
# Free-text search query: passed to Search(...) and also embedded as the
# reference vector that every result title is compared against.
query = 'out-of-distribution detection in deep neural networks'
# Forwarded to Search(...) as its `limit` argument.
limit = 1000
# Similarity cut-off. Currently has no effect: the only comparison against it
# (`if sim_score > threshold_sim`) is commented out in the scoring loop.
threshold_sim = 0.0

In [None]:
# Build the search for `query`, forwarding `limit`, and execute it.
# NOTE(review): judging by the captured log output, `limit` does not cap the
# total number of results across sources — confirm against search_engine.Search.
s = Search(query, limit=limit)
s.perform()

INFO:arxiv.arxiv:Requesting 200 results at offset 0
INFO:arxiv.arxiv:Requesting page of results
INFO:root:unpaywall: 4
INFO:root:core: 40
INFO:root:crossref: 1000
INFO:root:semantic scholar: 100
INFO:arxiv.arxiv:Got first page; 2078126 of inf results available
INFO:root:arXiv: 1
INFO:root:arXiv: 2
INFO:root:arXiv: 3
INFO:root:arXiv: 4
INFO:root:arXiv: 5
INFO:root:arXiv: 6
INFO:root:arXiv: 7
INFO:root:arXiv: 8
INFO:root:arXiv: 9
INFO:root:arXiv: 10
INFO:root:arXiv: 11
INFO:root:arXiv: 12
INFO:root:arXiv: 13
INFO:root:arXiv: 14
INFO:root:arXiv: 15
INFO:root:arXiv: 16
INFO:root:arXiv: 17
INFO:root:arXiv: 18
INFO:root:arXiv: 19
INFO:root:arXiv: 20
INFO:root:arXiv: 21
INFO:root:arXiv: 22
INFO:root:arXiv: 23
INFO:root:arXiv: 24
INFO:root:arXiv: 25
INFO:root:arXiv: 26
INFO:root:arXiv: 27
INFO:root:arXiv: 28
INFO:root:arXiv: 29
INFO:root:arXiv: 30
INFO:root:arXiv: 31
INFO:root:arXiv: 32
INFO:root:arXiv: 33
INFO:root:arXiv: 34
INFO:root:arXiv: 35
INFO:root:arXiv: 36
INFO:root:arXiv: 37
INFO:roo

In [6]:
# Materialise the search results into a list; the scoring loop iterates over it.
# NOTE(review): the captured output suggests s.results() also deduplicates — confirm.
results = list(s.results())

total found: 5004
starting deduplication...


In [10]:
from keras.utils import pad_sequences
import torch


def convert_single_abstract_to_embedding(tokenizer, model, in_text, MAX_LEN = 510):
    """Embed one text as the last-layer hidden state of its first token.

    The text is tokenized with special tokens, explicitly truncated to at most
    ``MAX_LEN`` token ids, and run through ``model`` as a batch of one. The
    vector returned is the hidden state of token 0 (``[CLS]`` for BERT-style
    tokenizers) taken from the last entry of the model's hidden states.

    The sequence is deliberately not padded: with a single input there is
    nothing to align it with, and padding positions are masked out of attention
    anyway, so padding to ``MAX_LEN`` only adds computation.

    Args:
        tokenizer: Tokenizer exposing a Hugging Face style
            ``encode(text, add_special_tokens=..., max_length=..., truncation=...)``
            that returns a list of token ids.
        model: ``torch.nn.Module`` accepting ``input_ids``, ``token_type_ids``,
            ``attention_mask``, ``output_hidden_states`` and ``return_dict``
            keyword arguments and returning an object with a ``hidden_states``
            sequence. It is switched to evaluation mode as a side effect.
        in_text: The text to embed.
        MAX_LEN: Maximum number of token ids (special tokens included) fed to
            the model.

    Returns:
        numpy.ndarray: 1-D array holding the embedding (on the CPU).
    """
    # Ask the tokenizer to truncate explicitly so the closing special token is
    # kept and no "truncation not activated" warning is emitted.
    token_ids = tokenizer.encode(
        in_text,
        add_special_tokens=True,
        max_length=MAX_LEN,
        truncation=True,
    )

    # Shape (1, seq_len): a batch containing just this one sequence.
    input_ids = torch.tensor([token_ids], dtype=torch.long)
    # No padding was added, so every position is a real token.
    attention_mask = torch.ones_like(input_ids)

    # Run on whatever device the model lives on (CPU or GPU).
    first_param = next(model.parameters(), None)
    if first_param is not None:
        input_ids = input_ids.to(first_param.device)
        attention_mask = attention_mask.to(first_param.device)

    # Evaluation mode disables dropout so the embedding is deterministic.
    model.eval()

    with torch.no_grad():
        # Request hidden states explicitly so the result does not depend on
        # how the model was configured when it was loaded.
        outputs = model(
            input_ids=input_ids,
            token_type_ids=None,
            attention_mask=attention_mask,
            output_hidden_states=True,
            return_dict=True,
        )

    # hidden_states[-1] is the final layer; [0, 0] selects the only batch item
    # and its first token.
    embedding = outputs.hidden_states[-1][0, 0]

    # Move to the CPU and convert to a numpy ndarray.
    return embedding.detach().cpu().numpy()

In [14]:
from sentence_transformers import util

final_results = []

# Load the SciBERT tokenizer and model by their hub identifier.
pretrained_model = 'allenai/scibert_scivocab_uncased'
sciBERT_tokenizer = BertTokenizer.from_pretrained(pretrained_model, do_lower_case=True)
model = AutoModelForSequenceClassification.from_pretrained(
    pretrained_model,
    output_attentions=False,
    output_hidden_states=True,  # the embedding helper reads the hidden states
)

# The query is embedded once and then compared against every result title.
embedding1 = convert_single_abstract_to_embedding(sciBERT_tokenizer, model, query)

for doc in progressbar.progressbar(results):
    # Results without a title cannot be scored and are dropped.
    if doc.title:
        title = doc.title.lower()
        embedding2 = convert_single_abstract_to_embedding(sciBERT_tokenizer, model, title)
        cosine_scores = util.cos_sim(embedding1, embedding2)
        sim_score = cosine_scores.item()
        # Every scored document is kept; no similarity threshold is applied.
        final_results.append((deepcopy(doc), sim_score))

# Rank by similarity, best match first.
final_results = sorted(final_results, key=lambda scored: -scored[1])

Some weights of the model checkpoint at allenai/scibert_scivocab_uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.decoder.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification we

In [15]:
# Tabulate the ranked (document, similarity) pairs, one row per scored result.
# Note: the 'Title' column holds the copied document objects themselves, not
# their `title` strings; 'Sim' holds the cosine similarity to the query.
df = pd.DataFrame(final_results, columns=['Title', 'Sim'])

In [16]:
# Display the ranked table (highest similarity first).
df

Unnamed: 0,Title,Sim
0,Structured Prediction for Object Detection in ...,0.953022
1,Distribution Shift Detection for Deep Neural N...,0.949874
2,Uncertainty-Based Out-of-Distribution Classifi...,0.945775
3,Deep Hybrid Models for Out-of-Distribution Det...,0.943508
4,Uncertainty-Based Out-of-Distribution Detectio...,0.942280
...,...,...
4639,2021 Index IEEE Transactions on Parallel and D...,0.515682
4640,Dedication,0.507065
4641,Table of Content,0.504163
4642,Table of Contents,0.493734


In [17]:
# Persist the ranked table as CSV. The path is relative, so the file lands in
# the current working directory. The DataFrame index is written as the first,
# unnamed column (pandas default `index=True`).
df.to_csv('output-scibert.csv')