# Text Preprocessing:


In [None]:
# !pip install nltk
# !pip install sentence-transformers

In [None]:
import ast
import csv
import os
import re

import matplotlib.pyplot as plt
import nltk
import numpy as np
import pandas as pd
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from sentence_transformers import SentenceTransformer, util
from tqdm import tqdm

nltk.download('stopwords')
nltk.download('averaged_perceptron_tagger')
nltk.download('wordnet')
from nltk.corpus import stopwords
from nltk.corpus import wordnet
from nltk.stem import WordNetLemmatizer

# Directory prefix prepended to every data file name.
path = "./data/"
# English stop words, kept in a set for fast membership tests.
stop_words = set(stopwords.words('english'))
# Matches a run of two or more consecutive commas.
pattern = re.compile(r'(,){2,}')

### Remark:
Due to restrictions on the final zip size, the text data is not included in the folder, **but it can be downloaded from our GitHub repository**.

In [None]:
# Output file for the processed abstracts and input file with the inverted
# abstracts. Both handles are left open here because the processing loop
# further down reads from f and writes to fw.
fw = open(path + "abstracts_processed.txt", "w", encoding="utf8")
f = open(path + "abstracts.txt", "r", encoding="utf8")

# Map every document id to its abstract text (one "id----text" line each).
doc_abst = {}
with open(path + "abstracts_documents_final.txt", "r", encoding="utf8") as ff:
    for l in tqdm(ff):
        # Split on the first separator only, so a "----" that occurs inside
        # the abstract text cannot break the two-way unpacking.
        id_, text = l.split("----", 1)
        doc_abst[id_] = text.rstrip("\n")

In [None]:
# Shared lemmatizer instance used by the abstract-processing loop.
lemmatizer = WordNetLemmatizer()


def get_wordnet_pos(treebank_tag):
    '''
    Map a treebank part-of-speech tag to the matching wordnet tag object.

    The decision is made on the first letter of the tag:
    'J' -> wordnet.ADJ, 'V' -> wordnet.VERB, 'R' -> wordnet.ADV.
    Tags starting with 'N', and every other tag, map to wordnet.NOUN,
    which serves as the default.
    '''
    if treebank_tag.startswith('J'):
        return wordnet.ADJ
    if treebank_tag.startswith('V'):
        return wordnet.VERB
    if treebank_tag.startswith('R'):
        return wordnet.ADV
    # Nouns and all unrecognised tags share the same result.
    return wordnet.NOUN

In [None]:
dic = {}
doc = []
lemmatize = True  # Lemmatize each kept word when True; otherwise only lowercase it.
# Rebuild every abstract from its inverted index, dropping stop words and
# non-alphabetic tokens. Each result is written to fw as an "id----w1,w2,..."
# line; the token lists are kept in dic (keyed by id) and in doc (file order).
for l in tqdm(f):
    if l == "\n":
        continue
    # Split on the first separator only: joining the remaining pieces with ""
    # would silently delete any "----" that appears inside the payload.
    paper_id, _, inv = l.partition("----")
    res = ast.literal_eval(inv)
    # One slot per word position; slots of dropped words stay empty.
    abstract = [""] * res["IndexLength"]
    inv_indx = res["InvertedIndex"]
    for word, positions in inv_indx.items():
        lowered = word.lower()
        if not word.isalpha() or lowered in stop_words:
            continue
        if lemmatize:
            w_n_tag = get_wordnet_pos(nltk.pos_tag([lowered])[0][1])
            lem_word = lemmatizer.lemmatize(lowered, pos=w_n_tag)
        else:
            lem_word = lowered
        for j in positions:
            abstract[j] = lem_word
    # Collapse the comma runs left behind by the empty slots.
    abstract = re.sub(pattern, ',', ",".join(abstract))
    fw.write(paper_id + "----" + abstract + "\n")
    # Keep only real tokens: a leading or trailing comma yields an empty
    # string, and slicing off the first element unconditionally would lose
    # the first word whenever the abstract does not start with a dropped word.
    tokens = [token for token in abstract.split(",") if token]
    dic[paper_id] = tokens
    doc.append(tokens)
fw.close()
f.close()

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
298162it [1:02:51, 81.79it/s][A
298171it [1:02:51, 82.05it/s][A
298180it [1:02:52, 80.39it/s][A
298189it [1:02:52, 82.09it/s][A
298198it [1:02:52, 78.74it/s][A
298208it [1:02:52, 83.26it/s][A
298217it [1:02:52, 83.33it/s][A
298226it [1:02:52, 84.17it/s][A
298235it [1:02:52, 79.63it/s][A
298244it [1:02:52, 78.44it/s][A
298252it [1:02:52, 78.78it/s][A
298260it [1:02:53, 70.60it/s][A
298268it [1:02:53, 71.86it/s][A
298278it [1:02:53, 77.18it/s][A
298286it [1:02:53, 76.73it/s][A
298294it [1:02:53, 74.95it/s][A
298302it [1:02:53, 74.78it/s][A
298312it [1:02:53, 79.18it/s][A
298321it [1:02:53, 68.28it/s][A
298329it [1:02:53, 70.09it/s][A
298338it [1:02:54, 73.79it/s][A
298347it [1:02:54, 77.88it/s][A
298356it [1:02:54, 79.73it/s][A
298365it [1:02:54, 79.75it/s][A
298374it [1:02:54, 79.22it/s][A
298383it [1:02:54, 76.49it/s][A
298391it [1:02:54, 70.76it/s][A
298399it [1:02:54, 71.54it/s][A
298407it [1

# Text Embedding using Doc2Vec:

## Papers Embedding:

In [None]:
# Wrap every token list in a TaggedDocument tagged with its position in doc.
tagged_data = [
    TaggedDocument(tokens, [position])
    for position, tokens in enumerate(doc)
]

In [None]:
# Size of the Doc2Vec paper vectors. 256 matches the length of the accumulator
# used when the author embeddings are averaged from these vectors.
dim = 256
model = Doc2Vec(tagged_data, vector_size=dim, window=5, min_count=2, epochs=100, workers=10)

# store the embeddings in "paperID":array format
with open(path + "paper_embeddings.txt", "w", encoding="utf8") as f:
    for tid, sentence in dic.items():
        # Fixed 8-decimal formatting keeps the text representation uniform.
        vector_text = np.array2string(
            model.infer_vector(sentence),
            formatter={'float_kind': lambda x: "%.8f" % x},
        )
        f.write(str(tid) + ":" + vector_text + "\n")

## Authors Embedding:

In [None]:
# read the file to create a dictionary with author key and paper list as value
papers_set = set()
d = {}
# Bracket, quote and newline characters that only decorate the id list.
_decoration = str.maketrans("", "", "[]\n'\"")
with open(path + "author_papers.txt", "r") as f:
    for l in f:
        author, colon, paper_field = l.partition(":")
        if not colon:
            # Skip lines without an "author:papers" separator (e.g. blank lines).
            continue
        # Drop empty entries so that an empty list does not produce a '' paper id.
        d[author] = [
            paper_id.strip()
            for paper_id in paper_field.translate(_decoration).split(",")
            if paper_id.strip()
        ]

# read the embeddings of each paper
def _parse_paper_record(record):
    """Split one "paperID:[v1 v2 ...]" record into (paper id, float array)."""
    key, _, body = record.partition(":")
    # The values are whitespace separated; brackets are pure decoration.
    values = body.replace("[", " ").replace("]", " ").split()
    return key.strip(), np.array(values, dtype=float)


papers = {}
# A single array may span several physical lines; a new record starts at
# every line that contains ":".
record_lines = []
with open(path + "paper_embeddings.txt", "r", encoding="utf8") as f:
    for l in f:
        line = l.rstrip("\n")
        if ":" in line and record_lines:
            paper_key, paper_vector = _parse_paper_record(" ".join(record_lines))
            papers[paper_key] = paper_vector
            record_lines = [line]
        else:
            record_lines.append(line)

# Store the final record too: no further ":" line follows it, so the loop
# above never flushes it.
last_record = " ".join(record_lines)
if last_record.strip():
    paper_key, paper_vector = _parse_paper_record(last_record)
    papers[paper_key] = paper_vector

# the author representation is set to be the average of its papers' representations
# Rebind the shared name `pattern` to the comma-collapsing regex.
pattern = re.compile(r'(,){2,}')
# Length of the paper vectors; falls back to 256 when no embedding was loaded.
embedding_dim = len(next(iter(papers.values()))) if papers else 256
with open(path + "author_embedding.csv", "w") as df:
    for author, author_papers in d.items():
        v = np.zeros(embedding_dim)
        c = 0
        for paper in author_papers:
            paper_vector = papers.get(paper)
            if paper_vector is None:
                # Papers without an embedding do not contribute to the mean.
                continue
            v += paper_vector
            c += 1
        # Authors with no embedded paper keep the all-zero vector.
        mean_vector = v / max(c, 1)
        df.write(author + "," + ",".join(map(lambda x: "{:.8f}".format(round(x, 8)), mean_vector)) + "\n")

# Text Embedding using SBERT:

## Papers Embedding:

In [None]:
# Load document languages
# NOTE: allow_pickle=True unpickles arbitrary objects; only load trusted files.
doc_lang = np.load(path + 'Doc_lang.npy', allow_pickle=True).item()

# Keep only english papers
doc_abst_eng = {
    doc_id: abstract_text
    for doc_id, abstract_text in doc_abst.items()
    if doc_lang[doc_id] == 'en'
}

# We then load the allenai-specter model with SentenceTransformers
model = SentenceTransformer('allenai-specter')

# The dimension of the output embedding is 768, so running the pretrained model
# on all papers at once needs a lot of RAM.
# To overcome this issue we split the paper ids into n = 100 portions and save
# the results of each portion separately.
n = 100
for mod in range(n):
    # Keep only the papers whose 1-based position modulo n equals mod.
    index_ = [
        paper_id
        for position, paper_id in enumerate(doc_abst_eng, start=1)
        if position % n == mod
    ]

    # Compute embeddings for the papers in the index_ set
    paper_bert = {}
    for paper_id in tqdm(index_):
        corpus_embeddings = model.encode(doc_abst_eng[paper_id], convert_to_tensor=True)
        # Move the tensor to host memory first: .numpy() raises on a CUDA tensor.
        paper_bert[paper_id] = corpus_embeddings.cpu().numpy()
    np.save(path + 'Bert_' + str(mod) + '.npy', paper_bert)

## Authors Embedding:

In [None]:
## Invert the author -> papers dictionary into paper -> authors.
# The document embeddings are spread over 100 files, so looking authors up by
# document is cheaper than searching those files once per author.
doc_auth = {}
for auth, auth_docs in d.items():
    for doc in auth_docs:
        # Create the author list on first sight of the document, then extend it.
        doc_auth.setdefault(doc, []).append(auth)

# Running sum of the document embeddings and number of contributing documents
# for every author.
auth_embedding = {auth: np.zeros(768) for auth in d}
auth_count = {auth: 0 for auth in d}
## Add each document embedding to all of its authors
for i in range(100):
    # NOTE: allow_pickle=True unpickles arbitrary objects; only load trusted files.
    paper_embed = np.load(path + 'Bert_' + str(i) + '.npy', allow_pickle=True).item()
    for doc_id, embedding in paper_embed.items():
        # Documents without any listed author are skipped instead of raising KeyError.
        for auth in doc_auth.get(doc_id, ()):
            auth_embedding[auth] += np.array(embedding)
            auth_count[auth] += 1

## Compute the mean of the document embeddings
for auth in auth_embedding:
    # Authors without any embedded document keep the all-zero vector.
    auth_embedding[auth] = auth_embedding[auth] / max(auth_count[auth], 1)
np.save(path + 'authors_Bert.npy', auth_embedding)