In [1]:
from copy import deepcopy

import progressbar
import spacy
from sentence_transformers import SentenceTransformer, util
import os
w2v_model = spacy.load('en_core_web_lg')

In [2]:
from dotenv import load_dotenv
from pathlib import Path

initial_wd = os.getcwd()
while os.path.split(os.getcwd())[-1] != 'scientific-knowledge-distiller':
    os.chdir(os.path.join(os.getcwd(), '..'))
root_path = os.getcwd()
os.chdir(initial_wd)

load_dotenv(dotenv_path=Path(os.path.join(root_path, '.env')))

True

In [3]:
query = 'out-of-distribution detection in deep neural networks'
limit = 5000
threshold_sim = 0.0

In [None]:
from search_engine.databases.database_client import SupportedSources
from search_engine import Search

s = Search(
    query,
    limit=limit,
    sources=(
        SupportedSources.ARXIV,
        SupportedSources.CROSSREF,
        SupportedSources.INTERNET_ARCHIVE,
        SupportedSources.SEMANTIC_SCHOLAR,
        SupportedSources.UNPAYWALL,
    )
)
s.perform()

INFO:arxiv.arxiv:Requesting page of results
INFO:root:semantic scholar: 1000
INFO:arxiv.arxiv:Sleeping for 4.999953 seconds
INFO:root:internet archive: 1200
INFO:root:semantic scholar: 1100
INFO:arxiv.arxiv:Requesting page of results
INFO:root:semantic scholar: 1200
INFO:arxiv.arxiv:Sleeping for 4.999931 seconds
INFO:root:semantic scholar: 1300
INFO:root:internet archive: 1300
INFO:root:semantic scholar: 1400
INFO:arxiv.arxiv:Requesting page of results
INFO:root:semantic scholar: 1500
INFO:arxiv.arxiv:Sleeping for 4.999947 seconds
INFO:root:internet archive: 1400
INFO:root:semantic scholar: 1600
INFO:arxiv.arxiv:Requesting page of results
INFO:arxiv.arxiv:Sleeping for 4.999944 seconds
INFO:root:semantic scholar: 1700
INFO:root:semantic scholar: 1800
INFO:root:internet archive: 1500
INFO:arxiv.arxiv:Requesting page of results
INFO:root:semantic scholar: 1900
INFO:arxiv.arxiv:Sleeping for 4.999978 seconds
INFO:root:semantic scholar: 2000
INFO:arxiv.arxiv:Requesting page of results
INFO:a

KeyboardInterrupt: 

In [None]:
results = list(s.results())

In [7]:
import numpy as np

final_results = []
embedding_w2v_1 = w2v_model(query)
model = SentenceTransformer('stsb-roberta-large')
embedding1 = model.encode(query, convert_to_tensor=True, normalize_embeddings=True)

for doc in progressbar.progressbar(results):
    if not doc.title:
        continue
    title = doc.title.lower()

    embedding_w2v_2 = w2v_model(title)
    sim_score_w2v = embedding_w2v_1.similarity(embedding_w2v_2)
    embedding2 = model.encode(title, convert_to_tensor=True, show_progress_bar=False, normalize_embeddings=True)
    cosine_scores = util.dot_score(embedding1, embedding2)
    sim_score_roberta = cosine_scores.item()

    # if sim_score > threshold_sim:
    final_results.append((deepcopy(doc), sim_score_roberta, sim_score_w2v, np.abs(sim_score_roberta - sim_score_w2v)))

final_results = sorted(final_results, key=lambda x: x[3])

INFO:sentence_transformers.SentenceTransformer:Load pretrained SentenceTransformer: stsb-roberta-large
INFO:sentence_transformers.SentenceTransformer:Use pytorch device: cpu


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

  sim_score_w2v = embedding_w2v_1.similarity(embedding_w2v_2)
100% (18509 of 18509) |##################| Elapsed Time: 0:34:04 Time:  0:34:04


In [8]:
import pandas as pd

df = pd.DataFrame(final_results, columns=['title', 'roberta_sim', 'w2v_sim', 'diff'])
df

Unnamed: 0,title,roberta_sim,w2v_sim,diff
0,Outlier Detection for Multidimensional Time Se...,0.760481,0.761605,0.001124
1,Dynamic Network Anomaly Detection System by Us...,0.725599,0.727387,0.001788
2,Robust mixture of experts modeling using the <...,0.313494,0.316564,0.003070
3,What can linearized neural networks actually s...,0.461067,0.464407,0.003339
4,Network Traffic Anomaly Detection via Deep Lea...,0.689579,0.684817,0.004762
...,...,...,...,...
18359,Second-Generation Sequencing with Deep Reinfor...,0.092982,0.845946,0.752964
18360,LayoutReader: Pre-training of Text and Layout ...,0.038394,0.794956,0.756562
18361,Automatic monitoring and detection of tail-bit...,0.157831,0.922180,0.764349
18362,Automatic monitoring and detection of tail-bit...,0.157831,0.922180,0.764349


In [9]:
df.to_csv('output-1.csv')