In [1]:
from copy import deepcopy

import progressbar
import spacy
from sentence_transformers import SentenceTransformer, util
from transformers import BertTokenizer, AutoModelForSequenceClassification
import os
w2v_model = spacy.load('en_core_web_lg')

In [2]:
from dotenv import load_dotenv
from pathlib import Path

initial_wd = os.getcwd()
while os.path.split(os.getcwd())[-1] != 'scientific-knowledge-distiller':
    os.chdir(os.path.join(os.getcwd(), '..'))
root_path = os.getcwd()
os.chdir(initial_wd)

load_dotenv(dotenv_path=Path(os.path.join(root_path, '.env')))

True

In [3]:
from keras.utils import pad_sequences
import torch


def convert_single_abstract_to_embedding(tokenizer, model, in_text, MAX_LEN = 510):
    input_ids = tokenizer.encode(
                        in_text,
                        add_special_tokens = True,
                        max_length = MAX_LEN,
                   )

    results = pad_sequences([input_ids], maxlen=MAX_LEN, dtype="long",
                              truncating="post", padding="post")

    # Remove the outer list.
    input_ids = results[0]

    # Create attention masks
    attention_mask = [int(i>0) for i in input_ids]

    # Convert to tensors.
    input_ids = torch.tensor(input_ids)
    attention_mask = torch.tensor(attention_mask)

    # Add an extra dimension for the "batch" (even though there is only one
    # input in this batch.)
    input_ids = input_ids.unsqueeze(0)
    attention_mask = attention_mask.unsqueeze(0)

    # Put the model in "evaluation" mode, meaning feed-forward operation.
    model.eval()

    #input_ids = input_ids.to(device)
    #attention_mask = attention_mask.to(device)

    # Run the text through BERT, and collect all of the hidden states produced
    # from all 12 layers.
    with torch.no_grad():
        logits, encoded_layers = model(
                                    input_ids = input_ids,
                                    token_type_ids = None,
                                    attention_mask = attention_mask,
                                    return_dict=False)

    layer_i = 12 # The last BERT layer before the classifier.
    batch_i = 0 # Only one input in the batch.
    token_i = 0 # The first token, corresponding to [CLS]

    # Extract the embedding.
    embedding = encoded_layers[layer_i][batch_i][token_i]

    # Move to the CPU and convert to numpy ndarray.
    embedding = embedding.detach().cpu().numpy()

    return embedding

In [10]:
query = 'neutral differential equations with periodic coefficients'
limit = 5000
threshold_sim = 0.0

In [11]:
from search_engine.databases.database_client import SupportedSources
from search_engine import Search

s = Search(
    query,
    limit=limit,
    sources=(
        SupportedSources.ARXIV,
        SupportedSources.CORE,
        SupportedSources.CROSSREF,
        SupportedSources.INTERNET_ARCHIVE,
        SupportedSources.SEMANTIC_SCHOLAR,
        SupportedSources.UNPAYWALL,
    )
)
s.perform()

INFO:arxiv.arxiv:Requesting 200 results at offset 0
INFO:arxiv.arxiv:Requesting page of results
INFO:root:unpaywall: 13
INFO:arxiv.arxiv:Got first page; 1668204 of inf results available
INFO:root:arXiv: 1
INFO:root:arXiv: 2
INFO:root:arXiv: 3
INFO:root:arXiv: 4
INFO:root:arXiv: 5
INFO:root:arXiv: 6
INFO:root:arXiv: 7
INFO:root:arXiv: 8
INFO:root:arXiv: 9
INFO:root:arXiv: 10
INFO:root:arXiv: 11
INFO:root:arXiv: 12
INFO:root:arXiv: 13
INFO:root:arXiv: 14
INFO:root:arXiv: 15
INFO:root:arXiv: 16
INFO:root:arXiv: 17
INFO:root:arXiv: 18
INFO:root:arXiv: 19
INFO:root:arXiv: 20
INFO:root:arXiv: 21
INFO:root:arXiv: 22
INFO:root:arXiv: 23
INFO:root:arXiv: 24
INFO:root:arXiv: 25
INFO:root:arXiv: 26
INFO:root:arXiv: 27
INFO:root:arXiv: 28
INFO:root:arXiv: 29
INFO:root:arXiv: 30
INFO:root:arXiv: 31
INFO:root:arXiv: 32
INFO:root:arXiv: 33
INFO:root:arXiv: 34
INFO:root:arXiv: 35
INFO:root:arXiv: 36
INFO:root:arXiv: 37
INFO:root:arXiv: 38
INFO:root:arXiv: 39
INFO:root:arXiv: 40
INFO:root:arXiv: 41
INF

In [12]:
results = list(s.results())

total found: 14340
starting deduplication...


In [13]:
pretrained_model = 'allenai/scibert_scivocab_uncased'
sciBERT_tokenizer = BertTokenizer.from_pretrained(pretrained_model,
                                          do_lower_case=True)

# Get the model
scibert_model = AutoModelForSequenceClassification.from_pretrained(pretrained_model,
                                                          output_attentions=False,
                                                          output_hidden_states=True)

final_results = []
embedded_query_w2v = w2v_model(query)
roberta_model = SentenceTransformer('stsb-roberta-large')
embedded_query_roberta = roberta_model.encode(query, convert_to_tensor=True, normalize_embeddings=True)
embedded_query_scibert = convert_single_abstract_to_embedding(sciBERT_tokenizer, scibert_model, query)

for doc in progressbar.progressbar(results):
    if not doc.title:
        continue
    title = doc.title.lower()

    embedded_title_w2v = w2v_model(title)
    sim_score_w2v = embedded_query_w2v.similarity(embedded_title_w2v)
    embedded_title_roberta = roberta_model.encode(title, convert_to_tensor=True, show_progress_bar=False, normalize_embeddings=True)
    cosine_scores = util.dot_score(embedded_query_roberta, embedded_title_roberta)
    sim_score_roberta = cosine_scores.item()

    embedded_title_scibert = convert_single_abstract_to_embedding(sciBERT_tokenizer, scibert_model, title)
    cosine_scores = util.cos_sim(embedded_query_scibert, embedded_title_scibert)
    sim_score_scibert = cosine_scores.item()

    # if sim_score > threshold_sim:
    final_results.append((deepcopy(doc), sim_score_roberta, sim_score_scibert, sim_score_w2v))

final_results = sorted(final_results, key=lambda x: x[3])

Some weights of the model checkpoint at allenai/scibert_scivocab_uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.bias', 'cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification we

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
  sim_score_w2v = embedded_query_w2v.similarity(embedded_title_w2v)
100% (10802 of 10802) |##################| Elapsed Time: 1:08:31 Time:  1:08:31


In [14]:
import pandas as pd

df = pd.DataFrame(final_results, columns=['title', 'roberta_sim', 'scibert_sim', 'w2v_sim'])
df

Unnamed: 0,title,roberta_sim,scibert_sim,w2v_sim
0,Az inga egyensúlyi helyzeteinek stabilizálása ...,0.259281,0.496522,-0.132913
1,Экстремальная динамика системы трех однонаправ...,0.228762,0.266258,-0.089049
2,معايير لتذبذب المعادلة التفاضلية نصف الخطية ال...,0.232086,0.340640,-0.082578
3,Динамика системы из двух простейших автогенера...,0.228754,0.542792,-0.082113
4,Положительные решения суперлинейных эллиптичес...,0.238447,0.459514,-0.080647
...,...,...,...,...
10324,Equations with Periodic Coefficients,0.734499,0.899293,0.976816
10325,Linear differential equations with periodic co...,0.923817,0.972731,0.979663
10326,Linear differential equations with periodic co...,0.923817,0.972731,0.979663
10327,Differential equations with periodic coefficients,0.882960,0.972664,0.989029


In [15]:
df.to_csv('output-1.csv')