In [1]:
import pandas as pd
import pickle, json
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity

In [2]:
def get_title_vector(list):
    """Encode a collection of titles with the "all-mpnet-base-v2" model.

    Args:
        list: The titles to embed. The name shadows the builtin, so the
            builtin ``list`` must not be used inside this function.

    Returns:
        Whatever ``SentenceTransformer.encode`` returns for the given titles
        (one embedding per title).
    """
    # A fresh model instance is created on every call.
    encoder = SentenceTransformer("all-mpnet-base-v2")
    return encoder.encode(list, show_progress_bar=True)

def create_vectors(dataframe):
    """Embed the title of every unique paper id in ``dataframe``.

    For each unique value in the ``"id"`` column, the title is taken from the
    first row carrying that id. The title is read positionally from the
    second column (position 1), not by column name.

    The titles are collected in the iteration order of
    ``set(dataframe["id"])``. ``create_titles_file`` pairs vectors with ids by
    iterating the same set expression, so this order must not be changed.

    Args:
        dataframe: Frame with an ``"id"`` column and the title at column
            position 1.

    Returns:
        The result of ``get_title_vector`` for the collected titles.
    """
    # One pass over the rows instead of one boolean filter of the whole frame
    # per id; ``setdefault`` keeps the first row seen for a duplicated id.
    first_title_by_id = {}
    for paper_id, title in zip(dataframe["id"], dataframe.iloc[:, 1]):
        first_title_by_id.setdefault(paper_id, title)

    unique_ids = set(dataframe["id"])
    # Progress is measured against the unique ids actually processed, so it
    # reaches 100% even when the frame contains repeated ids.
    total = len(unique_ids)

    list_of_titles = []
    for current, paper_id in enumerate(unique_ids, start=1):
        percentage = current / total * 100
        print(f"Current percentage: {percentage: .2f}%", end = "\r")
        list_of_titles.append(first_title_by_id[paper_id])
    return get_title_vector(list_of_titles)

def save_data(filename: str, *data):
    """Pickle every item in ``data`` into ``filename``, one after another.

    The file is opened in binary write mode, so any existing content is
    replaced. Each item is written as its own pickle record, in the order
    given.

    Args:
        filename: Path of the file to write.
        *data: Objects to pickle.
    """
    # The context manager closes the file even when pickling an item raises.
    with open(filename, "wb") as file:
        for item in data:
            pickle.dump(item, file)
    
def load_data(filename: str):
    """Return the first pickled object stored in ``filename``.

    Only a single ``pickle.load`` is performed, so if the file holds several
    pickle records (``save_data`` can write more than one), the later ones
    are not read.

    Args:
        filename: Path of the pickle file to read.

    Returns:
        The first unpickled object.
    """
    # NOTE: unpickling can execute arbitrary code; only load trusted files.
    # The context manager closes the file even when unpickling raises.
    with open(filename, "rb") as file:
        return pickle.load(file)

def create_titles_file(dataframe, list):
    """Pickle a mapping from stringified paper id to its title vector.

    Ids are taken from ``set(dataframe["id"])`` and paired by position with
    the entries of ``list``: the n-th id yielded by the set gets ``list[n]``.
    The mapping is written to ``data/title_vectors.txt`` through ``save_data``.

    Args:
        dataframe: Object whose ``"id"`` entry yields the paper ids.
        list: Indexable sequence of vectors. The name shadows the builtin,
            so the builtin ``list`` must not be used inside this function.
    """
    unique_ids = set(dataframe["id"])
    vector_by_id = {
        str(paper_id): list[position]
        for position, paper_id in enumerate(unique_ids)
    }
    save_data("data/title_vectors.txt", vector_by_id)
    
def get_similarities(dataframe):
    """Compute title cosine similarity for every paper/reference pair.

    Loads the id -> vector mapping from ``data/title_vectors.txt``. For each
    unique id in ``dataframe["id"]``, it takes the references of the first
    row with that id, read positionally from the column at position 7. That
    column is assumed to hold an iterable of referenced paper ids; confirm
    this against the data schema. For every reference that is itself a paper
    in ``dataframe``, the cosine similarity of the two title vectors is
    recorded under the key ``"<paper_id>_<reference_id>"``.

    The result is written to ``data/similarities.json`` as a DataFrame with
    the columns ``"id"`` and ``"value"``.

    Args:
        dataframe: Frame with an ``"id"`` column and the references at
            column position 7.
    """
    couples = load_data("data/title_vectors.txt")

    unique_ids = set(dataframe["id"])
    # Ids are compared in string form because the vector mapping is keyed by
    # ``str(paper_id)``.
    nodes_list = {str(paper_id) for paper_id in unique_ids}

    # One pass over the rows instead of one boolean filter of the whole frame
    # per id; ``setdefault`` keeps the first row seen for a duplicated id.
    references_by_id = {}
    for paper_id, references in zip(dataframe["id"], dataframe.iloc[:, 7]):
        references_by_id.setdefault(paper_id, references)

    similarities = {"id": [], "value": []}
    # Progress is measured against the unique ids actually processed.
    total = len(unique_ids)
    for current, paper_id in enumerate(unique_ids, start=1):
        percentage = current / total * 100
        print(f"Current percentage: {percentage: .2f}%", end = "\r")

        references = references_by_id[paper_id]
        # A missing value (None or a float NaN) means "no references".
        if references is None or isinstance(references, float):
            continue

        source_key = str(paper_id)
        for reference_id in references:
            # Normalise before the membership test: ``nodes_list`` holds
            # strings, so a non-string reference id would never match.
            target_key = str(reference_id)
            if target_key in nodes_list:
                value = cosine_similarity([couples[source_key]], [couples[target_key]])
                similarities["id"].append(source_key + "_" + target_key)
                similarities["value"].append(float(value[0][0]))

    df = pd.DataFrame(similarities)
    df.to_json("data/similarities.json")

In [None]:
# Load the paper records from JSON, then embed the title of every unique
# paper id. Both names are reused by the next cell.
paper_dataframe = pd.read_json("data/nlp_papers.json")
embeddings = create_vectors(paper_dataframe)

In [None]:
# Order matters: create_titles_file writes data/title_vectors.txt, which
# get_similarities then loads before writing data/similarities.json.
create_titles_file(paper_dataframe, embeddings)
get_similarities(paper_dataframe)