In [None]:
import pandas as pd  # Handling data
import numpy as np  # Numerical operations
import torch  # PyTorch operations
from transformers import BertTokenizer, AutoModel  # Bert model and tokenizer
from sklearn.metrics.pairwise import cosine_similarity  # Cosine similarity computation

In [4]:
# Load the arXiv metadata from "arxiv_data.csv" into the pandas dataframe "data".
# The "id" column is read as text: arXiv identifiers such as "0704.0001" are
# otherwise parsed as floats and lose their leading/trailing zeros (704.0001).
data = pd.read_csv("./arxiv_data.csv", dtype={"id": str})

In [3]:
# Preview the first five rows of the freshly loaded dataframe
data.head(n=5)

Unnamed: 0,id,submitter,authors,title,comments,journal-ref,doi,report-no,categories,license,abstract,versions,update_date,merged,lang
0,704.0001,Pavel Nadolsky,"C. Bal\'azs, E. L. Berger, P. M. Nadolsky, C.-...",Calculation of prompt diphoton production cros...,"37 pages, 15 figures; published version","Phys.Rev.D76:013009,2007",10.1103/PhysRevD.76.013009,ANL-HEP-PR-07-12,hep-ph,,A fully differential calculation in perturba...,"[{'version': 'v1', 'created': 'Mon, 2 Apr 2007...",2008-11-26,Calculation of prompt diphoton production cros...,en
1,704.0002,Louis Theran,Ileana Streinu and Louis Theran,Sparsity-certifying Graph Decompositions,To appear in Graphs and Combinatorics,,,,math.CO cs.CG,http://arxiv.org/licenses/nonexclusive-distrib...,"We describe a new algorithm, the $(k,\ell)$-...","[{'version': 'v1', 'created': 'Sat, 31 Mar 200...",2008-12-13,Sparsity-certifying Graph Decompositions\n\n ...,en
2,704.0003,Hongjun Pan,Hongjun Pan,The evolution of the Earth-Moon system based o...,"23 pages, 3 figures",,,,physics.gen-ph,,The evolution of Earth-Moon system is descri...,"[{'version': 'v1', 'created': 'Sun, 1 Apr 2007...",2008-01-13,The evolution of the Earth-Moon system based o...,en
3,704.0004,David Callan,David Callan,A determinant of Stirling cycle numbers counts...,11 pages,,,,math.CO,,We show that a determinant of Stirling cycle...,"[{'version': 'v1', 'created': 'Sat, 31 Mar 200...",2007-05-23,A determinant of Stirling cycle numbers counts...,en
4,704.0005,Alberto Torchinsky,Wael Abu-Shammala and Alberto Torchinsky,From dyadic $\Lambda_{\alpha}$ to $\Lambda_{\a...,,"Illinois J. Math. 52 (2008) no.2, 681-689",,,math.CA math.FA,,In this paper we show how to compute the $\L...,"[{'version': 'v1', 'created': 'Mon, 2 Apr 2007...",2013-10-15,From dyadic $\Lambda_{\alpha}$ to $\Lambda_{\a...,en


In [5]:
# Load the SciBERT encoder and its matching tokenizer.
# Both are created from one checkpoint name so they cannot drift apart.
SCIBERT_CHECKPOINT = "allenai/scibert_scivocab_uncased"
# The model is only used for inference in this notebook, so evaluation mode
# (dropout disabled) is requested explicitly instead of relying on the default.
model = AutoModel.from_pretrained(SCIBERT_CHECKPOINT).eval()
tokenizer = BertTokenizer.from_pretrained(SCIBERT_CHECKPOINT)

In [6]:
# Define the text embedding function
def embed_text(text):
    """Embed text with SciBERT by mean-pooling the final hidden states.

    Args:
        text: A string (or a list of strings) handed to the tokenizer.

    Returns:
        A 2-D NumPy array with one embedding row per input text.
    """
    # return_tensors="pt" returns PyTorch tensors
    # truncation=True with max_length=512 cuts inputs longer than 512 tokens (limitation of the BERT model)
    # padding=True pads to the longest sequence in the batch (a single text is therefore not padded)
    inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True, max_length=512)

    # Pass the tokenized input to the BERT model and retrieve the output embeddings
    with torch.no_grad():
        outputs = model(**inputs)

    # Average only over real tokens: padded positions are zeroed out through the
    # attention mask so they cannot dilute the embedding when several texts of
    # different lengths are embedded together.
    hidden_states = outputs.last_hidden_state
    token_mask = inputs["attention_mask"].unsqueeze(-1).to(hidden_states.dtype)
    token_counts = token_mask.sum(dim=1).clamp(min=1)  # guard against division by zero
    pooled = (hidden_states * token_mask).sum(dim=1) / token_counts

    # Convert resulting embedding to a NumPy array and return it
    return pooled.cpu().numpy()

In [8]:
# Sanity check: show the embedding vector of a short phrase as a plain Python list
embed_text("machine learning")[0].tolist()

[0.17423900961875916,
 1.047127604484558,
 -0.2050633728504181,
 -0.9278192520141602,
 -0.014028340578079224,
 -0.29569801688194275,
 1.1805694103240967,
 0.2721199095249176,
 -0.4590453803539276,
 0.6893688440322876,
 -0.23956789076328278,
 0.23074930906295776,
 0.21747615933418274,
 -0.19735077023506165,
 -0.8601112365722656,
 0.32859620451927185,
 -0.822407603263855,
 0.4505489468574524,
 1.6983833312988281,
 0.19093436002731323,
 0.690409243106842,
 0.3492594361305237,
 -0.9499992728233337,
 -0.3382454216480255,
 0.5236876010894775,
 0.33761709928512573,
 -0.8201949596405029,
 0.16729643940925598,
 -0.16634607315063477,
 1.3821338415145874,
 -0.3001309633255005,
 -0.917592465877533,
 -0.48015791177749634,
 -1.1893104314804077,
 -0.5654113292694092,
 -1.046650767326355,
 0.7311017513275146,
 0.41746971011161804,
 0.3998907208442688,
 -0.757430911064148,
 0.06655566394329071,
 -0.2040015161037445,
 1.1291775703430176,
 -0.7667797803878784,
 0.02584405615925789,
 -0.4278639256954193,


In [7]:
# Embed every merged title+abstract string with SciBERT, one paper at a time
merged_embeddings = list(map(embed_text, data["merged"]))

In [8]:
# Attach the embeddings to the dataframe as the column "merged_embeddings"
data = data.assign(merged_embeddings=merged_embeddings)

In [9]:
# Preview the first five rows, now including the "merged_embeddings" column
data.head(n=5)

Unnamed: 0,id,submitter,authors,title,comments,journal-ref,doi,report-no,categories,license,abstract,versions,update_date,merged,lang,merged_embeddings
0,704.0001,Pavel Nadolsky,"C. Bal\'azs, E. L. Berger, P. M. Nadolsky, C.-...",Calculation of prompt diphoton production cros...,"37 pages, 15 figures; published version","Phys.Rev.D76:013009,2007",10.1103/PhysRevD.76.013009,ANL-HEP-PR-07-12,hep-ph,,A fully differential calculation in perturba...,"[{'version': 'v1', 'created': 'Mon, 2 Apr 2007...",2008-11-26,Calculation of prompt diphoton production cros...,en,"[[0.07953369, -0.2854601, 0.27501187, 0.049734..."
1,704.0002,Louis Theran,Ileana Streinu and Louis Theran,Sparsity-certifying Graph Decompositions,To appear in Graphs and Combinatorics,,,,math.CO cs.CG,http://arxiv.org/licenses/nonexclusive-distrib...,"We describe a new algorithm, the $(k,\ell)$-...","[{'version': 'v1', 'created': 'Sat, 31 Mar 200...",2008-12-13,Sparsity-certifying Graph Decompositions\n\n ...,en,"[[0.0328301, -0.28351074, 0.23665847, 0.209392..."
2,704.0003,Hongjun Pan,Hongjun Pan,The evolution of the Earth-Moon system based o...,"23 pages, 3 figures",,,,physics.gen-ph,,The evolution of Earth-Moon system is descri...,"[{'version': 'v1', 'created': 'Sun, 1 Apr 2007...",2008-01-13,The evolution of the Earth-Moon system based o...,en,"[[-0.46870053, -0.09688728, -0.031690042, 0.27..."
3,704.0004,David Callan,David Callan,A determinant of Stirling cycle numbers counts...,11 pages,,,,math.CO,,We show that a determinant of Stirling cycle...,"[{'version': 'v1', 'created': 'Sat, 31 Mar 200...",2007-05-23,A determinant of Stirling cycle numbers counts...,en,"[[0.463985, -0.40037638, -0.07055639, -0.07790..."
4,704.0005,Alberto Torchinsky,Wael Abu-Shammala and Alberto Torchinsky,From dyadic $\Lambda_{\alpha}$ to $\Lambda_{\a...,,"Illinois J. Math. 52 (2008) no.2, 681-689",,,math.CA math.FA,,In this paper we show how to compute the $\L...,"[{'version': 'v1', 'created': 'Mon, 2 Apr 2007...",2013-10-15,From dyadic $\Lambda_{\alpha}$ to $\Lambda_{\a...,en,"[[0.07784059, 0.13781649, 0.58904696, 0.118610..."


In [10]:
# Write the enriched dataframe to "arxiv_research_papers.csv" without the row index
# NOTE(review): "merged_embeddings" holds NumPy arrays, which presumably end up as
# their text form in the CSV - confirm downstream readers can parse that back.
data.to_csv(path_or_buf="./arxiv_research_papers.csv", index=False)

In [None]:
# Stack the per-paper embedding arrays row-wise into one 2-D matrix (one row per paper)
title_abstract_embeddings = np.concatenate(merged_embeddings, axis=0)

In [None]:
# Compute the pairwise cosine similarity between all paper embeddings
semantic_similarity_matrix = cosine_similarity(X=title_abstract_embeddings)

In [None]:
# Display the paper-to-paper cosine similarity matrix
semantic_similarity_matrix

In [None]:
# Look up the query paper by an exact match on its merged title + abstract text.
# The comparison is character-for-character (including line breaks and leading
# spaces), so the literal below must stay exactly as stored in data["merged"].
element = data[data["merged"] == """Experimental efforts in search of 76Ge Neutrinoless Double Beta Decay

  Neutrinoless double beta decay is one of the most sensitive approaches in
non-accelerator particle physics to take us into a regime of physics beyond the
standard model. This article is a brief review of the experiments in search of
neutrinoless double beta decay from 76Ge. Following a brief introduction of the
process of double beta decay from 76Ge, the results of the very first
experiments IGEX and Heidelberg-Moscow which give indications of the existence
of possible neutrinoless double beta decay mode has been reviewed. Then ongoing
efforts to substantiate the early findings are presented and the Majorana
experiment as a future experimental approach which will allow a very detailed
study of the neutrinoless decay mode is discussed.
"""]

In [None]:
# Display the row(s) of "data" whose merged text matched the query paper
element

In [None]:
# Rank every paper by its cosine similarity to the query paper held in "element"
# and fetch the three closest matches from the original data.
if element.empty:
    raise ValueError("Query paper not found in data['merged']; cannot rank similar papers.")
# Row position of the query paper; the similarity matrix rows follow the row order of "data"
query_position = data.index.get_loc(element.index[0])
# Similarity of the query paper to every paper (copied so the matrix itself stays untouched)
results = semantic_similarity_matrix[query_position].copy()
# The query paper trivially matches itself best, so exclude it from the ranking
results[query_position] = -np.inf
# argsort is ascending: keep the last three positions and reverse them (best match first)
var = results.argsort()[-3:][::-1]
# "var" holds row positions, hence positional indexing
data.iloc[var]