In [None]:
#!pip install langdetect
import networkx as nx
import numpy as np
import pandas as pd
from langdetect import detect
from langdetect import DetectorFactory
from langdetect.lang_detect_exception import LangDetectException
from tqdm import tqdm
# Fix langdetect's seed so that language detection gives the same result on every run.
DetectorFactory.seed = 0
# Directory prefix used for every input file read and every feature file written below.
path = './data/'

# Graph features

In [None]:
# Load the collaboration graph: fields are split on a single space and node
# ids are parsed as integers.
graph_path = path + 'collaboration_network.edgelist'
G = nx.read_edgelist(graph_path, delimiter=' ', nodetype=int)

# Report the size of the graph.
n_nodes, n_edges = G.number_of_nodes(), G.number_of_edges()
print(f'Number of nodes: {n_nodes}')
print(f'Number of edges: {n_edges}')

Number of nodes: 231239
Number of edges: 1777338


In [None]:
# Structural descriptors of every node in G; each result is indexed by node
# when the feature matrix is assembled later on.
cluster = nx.clustering(G)                            # clustering coefficient
avg_neighbor_degree = nx.average_neighbor_degree(G)   # mean degree of the neighbours
core_number = nx.core_number(G)                       # k-core number

# Feature extraction

In [None]:
# Characters stripped from the paper-id list before splitting it on commas.
_ID_NOISE = str.maketrans('', '', '[]\n\'"')

# d: author id (str) -> list of that author's paper ids (str).
# Each line is parsed as "<author_id>:<list of paper ids>".
d = {}
with open(path + 'author_papers.txt', "r") as f:
    for line in f:
        if not line.strip():
            continue  # tolerate blank lines (e.g. at the end of the file)
        author_id, raw_ids = line.split(":", 1)
        d[author_id] = [paper_id.strip() for paper_id in raw_ids.translate(_ID_NOISE).split(",")]

np.save(path + 'author_paper.npy', d)

# d_abstracts: paper id (str) -> processed abstract, a comma-separated token string.
d_abstracts = {}
with open(path + 'abstracts_processed.txt', "r", encoding="utf8") as fw:
    for line in fw:
        if not line.strip():
            continue
        # Split on the first separator only, so a '----' inside the abstract
        # does not break the unpacking.
        key, value = line.split('----', 1)
        # Drop the line terminator (as is done for the raw abstracts below);
        # otherwise the last token of every abstract ends with '\n' and is
        # treated as a different word from its newline-free twin.
        d_abstracts[key] = value.rstrip("\n")

# doc_abst: paper id (str) -> raw abstract text.
doc_abst = {}
with open(path + "abstracts_documents_final.txt", "r", encoding="utf8") as ff:
    for line in ff:
        if not line.strip():
            continue
        id_, text = line.split("----", 1)
        doc_abst[id_] = text.rstrip("\n")

## Co-authorship

#### Co_authorship_1

This feature is the number of papers that the two authors of each edge have in common; pairs with no recorded common paper are assigned 1.

In [None]:
# co_authorship: edge (u, v) -> number of paper ids shared by the two authors,
# with a floor of 1 for pairs that share no listed paper.
co_authorship = {}
for edge in tqdm(G.edges):
    first_author, second_author = edge
    papers_first = set(d[str(int(first_author))])
    papers_second = set(d[str(int(second_author))])
    co_authorship[edge] = max(len(papers_first & papers_second), 1)

In [None]:
# Copy the graph and store each edge's shared-paper count as its 'weight'.
G_weigthed = G.copy()
for source, target in tqdm(list(G_weigthed.edges)):
    G_weigthed.edges[source, target]['weight'] = co_authorship[(source, target)]

In [None]:
# co_authored: node -> list with the weight of every edge incident to the node.
co_authored = {}
for node in tqdm(list(G.nodes())):
    weights = [G_weigthed[u][v]['weight'] for u, v in G.edges(node)]
    co_authored[node] = weights
    # Sanity check: stop at the first node whose number of collected weights
    # differs from its degree.
    if len(weights) != G.degree(node):
        print('The length of the list is not same as the node degree')
        break

co_authorship_path = path + "dic_co_authorship_final.npy"
np.save(co_authorship_path, co_authored)

#### Co_authorship_2 

For every paper of an author, this feature counts how many authors in the dataset list that paper.

In [None]:
# doc_count_id: paper id -> number of times the paper appears across all
# authors' paper lists (i.e. how many authors list it).
# The original created this counter as `docs_count` but wrote to and read from
# `doc_count_id`, which raised NameError on a fresh kernel; a single name is
# used throughout now.
doc_count_id = {}
for paper_ids in d.values():
    for paper_id in paper_ids:
        doc_count_id[paper_id] = doc_count_id.get(paper_id, 0) + 1

# auth_abstract_count: author id -> one count per paper of that author.
auth_abstract_count = {
    auth: [doc_count_id.get(paper_id, 0) for paper_id in paper_ids]
    for auth, paper_ids in d.items()
}

# Save it
np.save(path+'auth_num_docs.npy', auth_abstract_count)

## Corpus_ratio

For each paper of an author, it is the ratio of the number of unique words in the paper's abstract to the number of unique words in the whole corpus.

In [None]:
# corpus: every non-empty token of every processed abstract (with repeats);
# corpus_unique: the distinct tokens.
corpus = [
    token
    for abstract in tqdm(d_abstracts.values())
    for token in abstract.split(',')
    if token
]
corpus_unique = set(corpus)

100%|██████████| 1056539/1056539 [00:30<00:00, 34658.01it/s]


In [None]:
# doc_abstract_count: paper id -> (distinct tokens in the abstract) / (distinct tokens in the corpus)
doc_abstract_count = {
    paper_id: len(set(abstract.split(','))) / len(corpus_unique)
    for paper_id, abstract in d_abstracts.items()
}

# papers_ratio_corpus: author id -> one ratio per paper; 0. for papers without an abstract.
papers_ratio_corpus = {}
for author_id in tqdm(d):
    papers_ratio_corpus[author_id] = [
        doc_abstract_count.get(paper_id, 0.) for paper_id in d[author_id]
    ]

# Save it
np.save(path+'Atr_words_count.npy', papers_ratio_corpus)

100%|██████████| 231239/231239 [00:04<00:00, 47524.25it/s]


## Abstract_ratio

For each paper of an author, it is the fraction of words in the abstract that are unique (number of unique words divided by the total number of words).

In [None]:
# abstract_words: paper id -> (distinct tokens) / (all tokens) of its abstract
abstract_words = {}
for paper_id, abstract in d_abstracts.items():
    tokens = abstract.split(',')
    abstract_words[paper_id] = len(set(tokens)) / len(tokens)

# papers_ratio: author id -> one ratio per paper; 0. for papers without an abstract.
papers_ratio = {}
for author_id in tqdm(d):
    papers_ratio[author_id] = [
        abstract_words.get(paper_id, 0.) for paper_id in d[author_id]
    ]

# Save it
np.save(path+'Atr_coprs_ratio.npy', papers_ratio)

## Abstract and Corpus frequency

It is the frequency of tokens that are repeated at least twice in the abstract, measured first relative to the abstract and then relative to the corpus.

In [None]:
# NOTE(review): `doc_freq` was never defined anywhere in this notebook, so this
# cell raised NameError on a fresh kernel. It is rebuilt here from the section
# description as {paper id: [distinct tokens occurring at least twice in that
# abstract]} -- confirm that this matches the intended definition.
doc_freq = {}
abstract_vocab_size = {}  # paper id -> number of distinct non-empty tokens
for doc, abstract in d_abstracts.items():
    token_counts = {}
    for token in abstract.rstrip("\n").split(','):
        if token:
            token_counts[token] = token_counts.get(token, 0) + 1
    abstract_vocab_size[doc] = len(token_counts)
    doc_freq[doc] = [token for token, n_occ in token_counts.items() if n_occ >= 2]

# Per paper: number of repeated tokens relative to the abstract's distinct
# tokens and relative to the total number of tokens in the corpus.
doc_freq_abs = {}
doc_freq_crps = {}
for doc in tqdm(doc_freq):
    n_repeated = len(doc_freq[doc])
    if n_repeated == 0:
        doc_freq_abs[doc] = 0
        doc_freq_crps[doc] = 0
    else:
        # The original divided by len(set(d_abstracts[doc])), i.e. the number
        # of distinct *characters* of the abstract string; the denominator is
        # now the number of distinct tokens.
        doc_freq_abs[doc] = n_repeated / abstract_vocab_size[doc]
        doc_freq_crps[doc] = n_repeated / len(corpus)

# Per author: one value per paper that has an abstract (other papers are skipped).
auth_freq_abs = {}
auth_freq_crps = {}
for auth, paper_ids in d.items():
    known = [paper_id for paper_id in paper_ids if paper_id in doc_freq_abs]
    auth_freq_abs[auth] = [doc_freq_abs[paper_id] for paper_id in known]
    auth_freq_crps[auth] = [doc_freq_crps[paper_id] for paper_id in known]

np.save(path+'auth_freq_abs.npy',auth_freq_abs)
np.save(path+'auth_freq_crps.npy',auth_freq_crps)

## Doc_language

It is the detected language of each document of each author.

In [None]:
# doc_lang: paper id -> language code detected from the raw abstract.
doc_lang = {}
for doc, text in tqdm(doc_abst.items()):
    try:
        doc_lang[doc] = detect(text)
    except LangDetectException:
        # detect() raises when the text offers no usable features (e.g. an
        # empty abstract). Leave the paper out of doc_lang so that it is
        # skipped below instead of aborting the whole pass.
        continue

# author_lang: author id -> detected language of each paper that has one.
author_lang = {}
for author_id in tqdm(d):
    author_lang[author_id] = [doc_lang[doc] for doc in d[author_id] if doc in doc_lang]

#Save it
np.save(path+'authors_langage.npy',author_lang)

# Data Features

The features explained above were generated and exported to be used in the model.

In [None]:
# Reload the feature dictionaries written by the cells above. Each file holds a
# pickled dict wrapped in a 0-d object array, hence allow_pickle and .item().
# allow_pickle is a boolean flag: the original passed the string 'TRUE', which
# only worked because any non-empty string is truthy ('FALSE' would have too).
# NOTE: unpickling can execute arbitrary code -- only load files that this
# notebook produced itself.
authors_count = np.load(path+'auth_num_docs.npy', allow_pickle=True).item()
papers_ratio_corpus = np.load(path+'Atr_words_count.npy', allow_pickle=True).item()
papers_ratio = np.load(path+'Atr_coprs_ratio.npy', allow_pickle=True).item()
authors_lang = np.load(path+'authors_langage.npy', allow_pickle=True).item()
co_author = np.load(path+'dic_co_authorship_final.npy', allow_pickle=True).item()

In [None]:
# One row per node: 20 features plus the author id in the last column.
# The original allocated only 20 columns; the out-of-range slice assignment
# `df_features[i,20:21] = ...` was then a silent no-op, so author_id was lost
# and the 21 column names assigned afterwards no longer matched the array.
n_feature = 21
df_features = np.zeros((n_nodes, n_feature))
for i, node in enumerate(G.nodes()):
    author_key = str(int(node))                  # text-derived dicts are keyed by string ids
    n_coauthors = authors_count[author_key]
    corpus_ratios = papers_ratio_corpus[author_key]
    abstract_ratios = papers_ratio[author_key]
    shared_papers = co_author[int(node)]         # keyed by integer node ids

    # Whole-row assignment: a wrong number of values raises instead of being
    # silently dropped.
    df_features[i, :] = [
        avg_neighbor_degree[node],                # avg_ngbr
        G.degree(node),                           # nd_degree
        core_number[node],                        # core_n
        cluster[node],                            # cluster
        len(d[author_key]),                       # docs_n
        n_coauthors.count(1),                     # one_co_authorship_2
        np.max(corpus_ratios),                    # max_papers_ratio_corpus
        np.mean(corpus_ratios),                   # mean_papers_ratio_corpus
        np.min(corpus_ratios),                    # min_papers_ratio_corpus
        np.max(abstract_ratios),                  # max_papers_ratio
        np.mean(abstract_ratios),                 # mean_papers_ratio
        np.min(abstract_ratios),                  # min_papers_ratio
        np.sum([x for x in n_coauthors if x != 1]),  # sum_co_authorship_2
        np.max(n_coauthors),                      # max_co_authorship_2
        int(np.round(np.mean(n_coauthors))),      # mean_co_authorship_2
        np.min(n_coauthors),                      # min_co_authorship_2
        np.mean(shared_papers),                   # mean_co_authorship_1
        np.max(shared_papers),                    # max_co_authorship_1
        shared_papers.count(2),                   # two_co_authorship_1
        len(set(authors_lang[author_key])),       # abstract languages
        int(node),                                # author_id
    ]

In [None]:
# Column names, in the order in which the feature matrix columns were filled.
feature_names = ['avg_ngbr', 'nd_degr', 'core_n', 'cluster', 'docs_n',
       'one_co_authorship_2', 'max_papers_ratio_corpus',
       'mean_papers_ratio_corpus', 'min_papers_ratio_corpus',
       'max_papers_ratio', 'mean_papers_ratio', 'min_papers_ratio',
       'sum_co_authorship_2', 'max_co_authorship_2', 'mean_co_authorship_2',
       'min_co_authorship_2', 'mean_co_authorship_1', 'max_co_authorship_1',
       'two_co_authorship_1','auth_lang_n','author_id']
df_features = pd.DataFrame(df_features, columns=feature_names)
df_features.to_csv(path+'df_features.csv', index=False)
# Preview goes last: only the final expression of a cell is displayed, so the
# original mid-cell head(5) was computed and thrown away.
df_features.head(5)