# Libraries
https://www.kaggle.com/code/ldegioanni/covid-19-papers-ranking/notebook#Part-I:-Data-Preparation

In [2]:
#Basic Sandbox
import os
import json
import pandas as pd
import numpy as np

import warnings
warnings.filterwarnings("ignore")

#To generate a refid for each paper (dataset + bib_entries)
import hashlib #for sha1

#To build network and compute pagerank
import networkx as nx
import math as math

#For Data viz
import seaborn as sns
import matplotlib.pyplot as plt

from datetime import date
from datetime import timedelta

# Constants

$P^{2}_{author}=(0.25)*P_{coauthornetwork}(author)+(0.75)*\sum^{publication}{P_{citationnetwork}(pubilication)}$


In [3]:
#Weight parameters for Approach 2 and 3 :

weights_InfluenceScore = [0.25, 0.25, 0.25, 0.25]

# 1. Load data

- os.walk()： **for directory tree** is a Python method that generates the file names in a directory tree by walking the tree either top-down or bottom-up. For each directory in the tree rooted at directory top (including top itself), it yields a 3-tuple (dirpath, dirnames, filenames)

In [4]:
#1. Get the data
datafiles = []
for dirname, _, filenames in os.walk('/Users/Shared/Files From e.localized/Singapore/Semester2/PC5253 Complex system modeling/PageRank/Kaggle/input'):
    
    for filename in filenames:
            ifile = os.path.join(dirname, filename)
            # datafiles: 存储json文件
            if ifile.split(".")[-1] == "json":  
                datafiles.append(ifile)
            
print("Number of Files Loaded: ", len(datafiles))


Number of Files Loaded:  85371


In [5]:
#Loading metadata csv file to get the publish time
metadata = pd.read_csv("/Users/Shared/Files From e.localized/Singapore/Semester2/PC5253 Complex system modeling/PageRank//Kaggle/input/metadata.csv")

In [6]:
#2. Creating of the two DataFrames:
#dfPaperList = df of Research Papers.. Variables: paper_id, paper_title, paper_authors
#dfCitationsFlat = df of all the citations . Variables: citationsId, paperId (where the citation is made),refid, title, year

authors = [] # paper id+ author name

citationsFlat = []
citationsCount = 0

for file in datafiles:
    with open(file,'r')as f:#opening a file in read mode
        doc = json.load(f)#doc is a dictionary, json.loas()return dictionary object
    paper_id = doc['paper_id']
    
    paper_authors = []

    for value in doc['metadata']['authors']:
        if len(doc['metadata']['authors']) == 0:
            paper_authors.append("NA")
        else:
            last = value["last"]
            first = value["first"]#first name ; last name
            paper_authors.append(first+" "+last)

    authors.append({"paper_id": paper_id, "authors" : paper_authors})

    for key,value in doc['bib_entries'].items():
        refid = key
        title = value['title'].lower()
        year = value['year']
        venue = value['venue'] 
        SHATitleCitation = hashlib.sha1(title.lower().encode()).hexdigest() #

        if (len(title) == 0):
            continue #there is noting we can do without any title

        citationsFlat.append({"citationId":citationsCount,\
                          "refid" : SHATitleCitation,\
                          "from": paper_id,\
                          "title": title.lower(),\
                          "year": year})
        citationsCount=citationsCount+1
        
#Conversion into DataFrame
dfCitationsFlat = pd.DataFrame(citationsFlat)# citationsId, paperId (where the citation is made),refid, title, year
authorsDf = pd.DataFrame(authors) # paper id+ author name

metadata_extract = metadata[["sha", "title", "abstract", "publish_time"]].rename(columns = {"sha" : "paper_id"})
dfPaperList = pd.merge(metadata_extract, authorsDf, on = "paper_id", how = "left")
# on the common column "paper_id" and use left join

dfPaperList["year"] = 0
dfPaperList["refid"] = ""

for i in range(len(dfPaperList)):
    
    dfPaperList["refid"][i] =  hashlib.sha1(str(dfPaperList["title"][i]).lower().encode()).hexdigest()
     #NB: We are building a custom identifier based on papers titles to ensure identification will be consistent between the papers in the Research Dataset and the papers extracted from the bib entries.
     #Unfortunately a paperId is not present for citations and doi is not provided for the whole dataset but title seem to be present for ~98% of the dataset. To enable and ease indexing capabilities we are hashing with SHA   
    dfPaperList["year"][i] = str(dfPaperList["publish_time"][i])[:4] # just want to get year
    
    try:
        dfPaperList["authors"][i] = dfPaperList["authors"][i].split(";")
    except:
        continue
        
quotationPapersFreq = pd.DataFrame({"refid" : dfCitationsFlat["refid"].value_counts().index, 
                       "nbQuotations" : dfCitationsFlat["title"].value_counts().values}) 

paperToScore = pd.merge(dfPaperList,quotationPapersFreq, on = "refid", how = "left")
paperToScore["nbQuotations"] = paperToScore["nbQuotations"].fillna(0)
#This is a Python code snippet that fills the missing values in the nbQuotations column of the paperToScore dataframe with 0.


#Adding list of references by papers according to the refid
refList = pd.DataFrame({"references" : dfCitationsFlat.groupby('from')['refid'].apply(list)}) 
refList["paper_id"] = refList.index; cols = ["paper_id","references"] ; refList = refList[cols].reset_index(drop = True) #Reformatting the reflist by papers
datasetForScoring = pd.merge(paperToScore, refList, how='left', on = 'paper_id').reset_index(drop = True)
#This code is grouping the refid column of the dfCitationsFlat dataframe by the values in the from column and applying the list function to the resulting groups. The list function creates a list of all the refid values in each group.

datasetForScoring = datasetForScoring[(datasetForScoring["authors"].isna() == False)].reset_index(drop = True)

In [7]:
#3. A few stats regarding number of papers loaded

print("Number of Papers in the CORD-19 dataset :",dfPaperList.shape[0])
#(05/14/2020) Number of Papers in the covid dataset : 63,571

print("Number of Citations found in the CORD-19 dataset :",dfCitationsFlat.shape[0])
#(05/14/2020) Number of Citations made in the covid dataset : 4,208,974

print("Citations with no title: ",sum(1 if x == "" else 0 for x in dfCitationsFlat["title"]))
#(05/14/2020) Citations with no title:  0

#How many duplicates? 
print("Number of duplicated research paper titles: ",len(dfPaperList["title"])-len(dfPaperList["title"].drop_duplicates()))
#(05/14/2020) Number of duplicated research paper titles:  1,421

print("Number of duplicated citations titles: ",len(dfCitationsFlat["title"])-len(dfCitationsFlat["title"].drop_duplicates()))
#(05/14/2020) Number of duplicated citations titles:  2,543,820

#Dataframe Visualization
print("Number of Papers that will be scored: ", datasetForScoring.shape[0])
datasetForScoring.head()

Number of Papers in the CORD-19 dataset : 63571
Number of Citations found in the CORD-19 dataset : 4208974
Citations with no title:  0
Number of duplicated research paper titles:  1421
Number of duplicated citations titles:  2543820
Number of Papers that will be scored:  46407


Unnamed: 0,paper_id,title,abstract,publish_time,authors,year,refid,nbQuotations,references
0,b2897e1277f56641193a6db73825f707eed3e4c9,Sequence requirements for RNA strand transfer ...,Nidovirus subgenomic mRNAs contain a leader se...,2001-12-17,"[Alexander Pasternak, Erwin Van Den Born, Will...",2001,7890bdcde2bc48da8b35296ee38c3aa6e6a549c5,79.0,"[63f53cf95376af6f781ae6c60df4887012432de5, 02c..."
1,e3d0d482ebd9a8ba81c254cc433f314142e72174,"Crystal structure of murine sCEACAM1a[1,4]: a ...",CEACAM1 is a member of the carcinoembryonic an...,2002-05-01,"[Kemin Tan, Bruce Zelus, Rob Meijers, Jin-Huan...",2002,9555d93a1e7d86c279a9cb7f40e1935ac998cb00,54.0,"[10e50d9f52a7b77e7d89559892a5898a5a202feb, 736..."
2,00b1d99e70f779eb4ede50059db469c65e8c1469,Synthesis of a novel hepatitis C virus protein...,Hepatitis C virus (HCV) is an important human ...,2001-07-16,"[Zhenming Xu, Jinah Choi, T Yen, Wen Lu, Anne ...",2001,592f6ad0e68ffa8c56a5cd5d2eb673bf092b02b3,7.0,"[11e6080d95677abd1aa156897750ce06c626df43, c35..."
3,cf584e00f637cbd8f1bb35f3f09f5ed07b71aeb0,Structure of coronavirus main proteinase revea...,The key enzyme in coronavirus polyprotein proc...,2002-07-01,"[Kanchan Anand, Gottfried Palm, Jeroen Mesters...",2002,6df07660b30c131c1d75c1a5974b75faada1044f,20.0,"[65886aa952fa05a113ca27ed8334a3adad3a4aa0, a9a..."
4,dde02f11923815e6a16a31dd6298c46b109c5dfa,Discontinuous and non-discontinuous subgenomic...,"Arteri-, corona-, toro- and roniviruses are ev...",2002-12-01,"[A Van Vliet, S Smits, P Rottier, R De Groot]",2002,72afbd0f45b04dc7c0bee89367a3d9813715e675,46.0,"[5d21964ac803893c1f15f6a32e740b94380a3199, cc9..."


# 2 Computation of Author Scoring and Publication pagerank

In [8]:
'''1. Creating an author dataset + Computation of the author page rank using an author network'''

#Variables for author dataset: id, name, co-authors, number of points linked to quotations, paper_count, citations, average citations,co_author_avg_citations,h-index

author_data = {}
author_id = {
    'start': 1,
    'curr': 1
}

assigned_ids = {}

def create_author_data(train_data, author_data, author_id, assigned_ids):
    for i in range(len(train_data)):
        authors = train_data.authors[i]
        #This code is accessing the i-th element of the nbQuotations list of the train_data object.
    
        try:
            citations = train_data.nbQuotations[i]/len(authors) #Number of times a paper have been quoted divided by len authors
        except:
            continue

        for author in authors:
            names = author.split(' ')
            unique_name = names[0] + "_" + names[len(names)-1]
            if unique_name not in author_data:
                author_data[unique_name] = {
                    'num_citations': citations,
                    'paper_count': 1,
                    'name': unique_name,
                    'author_id': author_id['curr'],
                    'co_authors': {},
                    'citations': [train_data.nbQuotations[i]]
                }
                assigned_ids[unique_name] = author_id['curr']
                author_id['curr'] += 1

            else:
                author_data[unique_name]['num_citations'] += citations
                author_data[unique_name]['paper_count'] += 1
                author_data[unique_name]['citations'].append(train_data.nbQuotations[i])

            for co_author in authors:
                co_author_names = co_author.split(' ')
                co_author_unique_name = co_author_names[0] + "_" + co_author_names[len(co_author_names)-1]
                if co_author_unique_name != unique_name:
                    author_data[unique_name]['co_authors'][co_author_unique_name] = 1
                        
            
            
# call for each data file
create_author_data(datasetForScoring, author_data, author_id, assigned_ids)

# add average citations
for data in author_data:
    author_data[data]['average_citations'] = author_data[data]['num_citations'] / author_data[data]['paper_count']
    
# adding h-index
def get_h_index(citations):
    return ([0] + [i + 1 for i, c in enumerate(sorted(citations, reverse = True)) if c >= i + 1])[-1]

data_to_df = []
for data in author_data:
    each_author = author_data[data]
    co_authors = each_author['co_authors']
    co_author_ids = []
    co_author_avg_citations = 0
    for co_author in co_authors:
        co_author_avg_citations += author_data[co_author]['average_citations']
        co_author_ids.append(assigned_ids[co_author])
    each_author['co_authors'] = co_author_ids
    each_author['co_author_avg_citations'] = co_author_avg_citations/len(co_author_ids) if len(co_author_ids) != 0 else 0
    data_to_df.append(each_author)
    
authorsData = pd.DataFrame.from_dict(data_to_df, orient='columns')
#h_index
authorsData['h_index'] = authorsData.apply(lambda x: get_h_index(x.citations), axis=1)

### 2. Computation of authors page rank

the explanation of How can I convert a Pandas DataFrame to a Python dictionary using the to_dict method: https://blog.gitnux.com/code/pandas-to_dict/

In [9]:
'''AUTHOR PAGE RANK'''

#Data Pre-processing: building the dataset on which the author network will be built
train = authorsData.copy().drop(columns=['num_citations', 'h_index','paper_count', 'citations']).dropna(axis = 0, subset=['co_authors'])
train = train[train.co_authors != '[]']
train['author_id'] = pd.to_numeric(train['author_id'])
#print(train.head(10))
print(train.shape)
print(len(train))

# Building up the network to compute author page rank: 
G = nx.Graph()
for i in range(len(train)):
    #select a specific row and column from a Pandas DataFrame1.
    auth = train.iloc[i]['author_id'] #select the row at index i
    G.add_node(auth)
    for neighbor in train.iloc[i]['co_authors']:
        if G.has_edge(auth, neighbor):
            G.add_edge(auth, neighbor, weight = G[auth][neighbor]['weight']+1)
        else:
            G.add_edge(auth, neighbor, weight = 1)

(171295, 5)
171295


In [10]:
'''form the edge list'''
from collections import defaultdict

edge_list=list(G.edges)
edges=defaultdict(list)
for edge in edge_list:
    _from,_to=edge[0],edge[1]
    edges[_from].append(_to)
    
Node_num=len(list(G.nodes))
print(max(list(G.nodes)))
print(len(list(G.nodes)))
#print(list(G.nodes))

171295
171295


In [11]:
score_authors = nx.pagerank(G, alpha=0.55, max_iter=100, tol=1.0e-6, nstart=None, weight='weight', dangling=None)

In [12]:
authorPRK = pd.DataFrame.from_dict(score_authors, orient = "index").reset_index(drop=False)
# The orient parameter is set to "index" to indicate that the keys of the dictionary should be used as the row labels of the DataFrame.
#print(authorPRK.head(10))
#num_rows=authorPRK.shape
#print(num_rows)

#plt.hist(R,bins=100,density=1)
authorPRK=authorPRK.sort_values(by='index')
authorPRK.set_index('index',inplace=True)
print(authorPRK.head(10))

data = authorPRK.values.tolist()
Author_pagerank_vector=np.array(data).reshape(-1)


              0
index          
1      0.000003
2      0.000005
3      0.000019
4      0.000041
5      0.000005
6      0.000005
7      0.000005
8      0.000005
9      0.000005
10     0.000005


### Trustrank method

In [13]:
import heapq
import numpy as np
#from graphs import plotGraph
from scipy.sparse import csr_matrix as SparseMatrix

class TrustRank:
    def __init__(self, beta, edges, epsilon, max_iterations, node_num,PageRank_vector):
        self.beta = beta
        self.edges = edges
        self.epsilon = epsilon
        self.node_num = node_num
        self.PageRank_vector = PageRank_vector
        self.MAX_ITERATIONS = max_iterations


    def get_trustedPages(self, node_number_threshold=100):

        # set number of trusted pages
        if self.node_num < node_number_threshold:
            ratio = 0.2
        else:
            ratio = 0.0002
        trusted_set_size = int(math.ceil(self.node_num * ratio))
            ## In Python, math.ceil() is a function that rounds up a given number to the smallest integer greater than or equal to that number


        # set and return trusted pages
        heaped_ranks = [(rank, node) for (node, rank) in 
            enumerate(self.PageRank_vector)]
            ##  function returns an iterator that generates tuples containing (index, element) pairs2.
        heapq._heapify_max(heaped_ranks)
        trusted_pages = [heapq._heappop_max(heaped_ranks)[1] 
            for _ in range(trusted_set_size)]
            ## _heappop_max function is used to pop the largest element from the heap and append it to the trusted_pages list for trusted_set_size times1.

        return trusted_pages

    def get_topicSpecificRank(self, teleport_set):


        diff = math.inf
        iterations = 0
        teleport_set_size = len(teleport_set)

        #pg = plotGraph(self.edges, interval=3000)#Time in milli-seconds for which graph is shown on screen
        ##jump action define
        final_rank_vector = np.zeros(self.node_num)
        initial_rank_vector = np.fromiter([1/teleport_set_size if node in teleport_set else 0 for node in
                range(self.node_num)], dtype='float')

        while(iterations < self.MAX_ITERATIONS and diff > self.epsilon):
            new_rank_vector = np.zeros(self.node_num)
            for parent in self.edges:
                for child in self.edges[parent]:
                    
                    new_rank_vector[child-1] += self.beta*(initial_rank_vector[parent-1] /len(self.edges[parent]))

            leaked_rank = (1 - sum(new_rank_vector)) / teleport_set_size
            leaked_rank_vector = np.array([leaked_rank if node in teleport_set
                else 0 for node in range(self.node_num)])

            final_rank_vector = new_rank_vector + leaked_rank_vector
            diff = sum(abs(final_rank_vector - initial_rank_vector))
            initial_rank_vector = final_rank_vector

            iterations += 1
            print("TrustRank iteration: " + str(iterations),"eps is: ",diff)
            #print(final_rank_vector)
            #pg.plot(9, final_rank_vector)

        return final_rank_vector

    def trustRank(self):

        trusted_pages = self.get_trustedPages()
        print("got seed set...")
        final_rank_vector = self.get_topicSpecificRank(trusted_pages)
            ##teleport set is a set of pages which are related to each other and belong to same topic.
        return final_rank_vector

In [20]:
tr = TrustRank(beta=0.55, edges=edges, epsilon=1e-9, max_iterations=50, node_num=Node_num, PageRank_vector=Author_pagerank_vector)
TrustRank_vector = tr.trustRank()
print(TrustRank_vector, sum(TrustRank_vector),sep='\n')
print(TrustRank_vector.max)

got seed set...
TrustRank iteration: 1  , epsilon: 0.2792616651372623
TrustRank iteration: 2  , epsilon: 0.05954281360564107
TrustRank iteration: 3  , epsilon: 0.013390212129405124
TrustRank iteration: 4  , epsilon: 0.0043142506665514195
TrustRank iteration: 5  , epsilon: 0.0014330208742650613
TrustRank iteration: 6  , epsilon: 0.0005120461956215936
TrustRank iteration: 7  , epsilon: 0.00018921237012757872
TrustRank iteration: 8  , epsilon: 7.173473278008748e-05
TrustRank iteration: 9  , epsilon: 2.7790886761925418e-05
TrustRank iteration: 10  , epsilon: 1.0927547636475961e-05
TrustRank iteration: 11  , epsilon: 4.3586148035261325e-06
TrustRank iteration: 12  , epsilon: 1.754931661043229e-06
TrustRank iteration: 13  , epsilon: 7.142882472935526e-07
TrustRank iteration: 14  , epsilon: 2.92845979418456e-07
TrustRank iteration: 15  , epsilon: 1.2104992798838934e-07
TrustRank iteration: 16  , epsilon: 5.032106040723907e-08
TrustRank iteration: 17  , epsilon: 2.1047051772880768e-08
TrustRan

# Get TrustRank

In [19]:
print(len(TrustRank_vector))
authorTRK = pd.DataFrame(TrustRank_vector)

#print(authorTRK[0].idxmax())
authorTRK.index = pd.RangeIndex(start=1, stop=len(authorTRK)+1)
#print(authorTRK.head(10))

print(authorTRK[0].nlargest(10))
print(authorPRK[0].nlargest(10))
#print(authorTRK[0].idxmax(10).index)
authorTRK["author_id"] = authorTRK.index
authorTRK.columns = ["trustrank_author", "author_id"]
#Put page rank value into a csv
authorTRK.to_csv("trustrank_author.csv",index = False)

171295
               0
1   6.968438e-07
2   2.422135e-06
3   4.023194e-05
4   4.120967e-05
5   3.351893e-06
6   5.627968e-06
7   1.403218e-05
8   1.774342e-05
9   1.669777e-05
10  6.067036e-07
992      0.000573
181      0.000340
260      0.000296
262      0.000261
259      0.000218
2855     0.000207
55487    0.000197
4278     0.000189
3013     0.000188
6449     0.000181
Name: 0, dtype: float64
index
2743     0.000965
3346     0.000559
2736     0.000313
354      0.000289
5324     0.000259
352      0.000240
9158     0.000205
17490    0.000202
19595    0.000202
5469     0.000196
Name: 0, dtype: float64


In [22]:
authorPRK["author_id"] = authorPRK.index
authorPRK.columns = ["pagerank_author", "author_id"]
#Put page rank value into a csv
authorPRK.to_csv("pagerank_author.csv",index = False)

### 3. Computation of publication page rank

In [23]:
'''Computation of publication page rank'''

# Building up the network to compute the pagerank for publication
G1 = nx.Graph()
for i in range(len(datasetForScoring)):
# for i in range(100): #Only on a sample
    G1.add_node(datasetForScoring['refid'][i])
    auth = datasetForScoring['refid'][i]
    
    for e in list(str(datasetForScoring["references"][i]).lstrip("[").rstrip("]").replace(" ","").split(",")):
        try:
            if G1.has_edge(auth, e):
                G1.add_edge(auth, e, weight = G[auth][e]['weight']+1)
            else:
                G1.add_edge(auth, e, weight = 1)
        except:
            continue
        
score_publication = nx.pagerank(G1, alpha=0.85, tol=1.0e-6, nstart=None, weight=1, dangling=None)

In [60]:
edge_list1=list(G1.edges)
print(edge_list1[:10])
edges1=defaultdict(list)
for edge in edge_list1:
    _from,_to=edge[0],edge[1]
    edges1[_from].append(_to)
    
Node_num=len(list(G1.nodes))
print(Node_num)
print(len(datasetForScoring))

[('7890bdcde2bc48da8b35296ee38c3aa6e6a549c5', "'63f53cf95376af6f781ae6c60df4887012432de5'"), ('7890bdcde2bc48da8b35296ee38c3aa6e6a549c5', "'02cc62567a75705831e809af741b9c6a47b2b186'"), ('7890bdcde2bc48da8b35296ee38c3aa6e6a549c5', "'b45f834b897ac6b39242087a9f37c58fd333c36f'"), ('7890bdcde2bc48da8b35296ee38c3aa6e6a549c5', "'6c0db1a2670ef286cd188498417422dd781aa88e'"), ('7890bdcde2bc48da8b35296ee38c3aa6e6a549c5', "'5d21964ac803893c1f15f6a32e740b94380a3199'"), ('7890bdcde2bc48da8b35296ee38c3aa6e6a549c5', "'7691d766309a4424cf53095848d799a6ac0729ab'"), ('7890bdcde2bc48da8b35296ee38c3aa6e6a549c5', "'cc9e3b01bf4c4c176ed5b53fe5d6d7d6e5f249a3'"), ('7890bdcde2bc48da8b35296ee38c3aa6e6a549c5', "'f680512ee0f89db57154ea8867ca53dc29058ab8'"), ('7890bdcde2bc48da8b35296ee38c3aa6e6a549c5', "'0f41ee565fbeff1e1535048f886e6f3042ebe96f'"), ('7890bdcde2bc48da8b35296ee38c3aa6e6a549c5', "'37a2fb1980e2a45cd9bd8826561c7eafb38276b3'")]
1371165
46407


In [63]:
#Saving the page rank by paper id
publiPRK = pd.DataFrame.from_dict(score_publication, orient = "index").reset_index(drop=False)
print(publiPRK.head(10))
publiPRK.index = pd.RangeIndex(start=1, stop=len(publiPRK)+1)
publiPRK["index1"] = publiPRK.index
print(publiPRK.head(10))
#publiPRK["publication_id"] = publiPRK['index']
#print(type(publiPRK["publication_id"]))
#publiPRK.columns = ["pageRankPublication","publication_id"]
publiPRK.columns = ["publication_id","pageRankPublication",'index1']
cols_forcal=[ "publication_id",'index1']
print(publiPRK.head())
#publiPRK["publication_id"] = publiPRK["publication_id"].str.replace("'","")
publiPRK_forcal = publiPRK[cols_forcal].reset_index(drop = True)
print(publiPRK_forcal.head())


                                        index             0
0    7890bdcde2bc48da8b35296ee38c3aa6e6a549c5  2.208890e-06
1  '63f53cf95376af6f781ae6c60df4887012432de5'  1.196131e-06
2  '02cc62567a75705831e809af741b9c6a47b2b186'  7.544144e-07
3  'b45f834b897ac6b39242087a9f37c58fd333c36f'  3.330688e-07
4  '6c0db1a2670ef286cd188498417422dd781aa88e'  5.808578e-07
5  '5d21964ac803893c1f15f6a32e740b94380a3199'  1.638114e-06
6  '7691d766309a4424cf53095848d799a6ac0729ab'  3.414348e-06
7  'cc9e3b01bf4c4c176ed5b53fe5d6d7d6e5f249a3'  5.168574e-07
8  'f680512ee0f89db57154ea8867ca53dc29058ab8'  1.907804e-06
9  '0f41ee565fbeff1e1535048f886e6f3042ebe96f'  3.330688e-07
                                         index             0  index1
1     7890bdcde2bc48da8b35296ee38c3aa6e6a549c5  2.208890e-06       1
2   '63f53cf95376af6f781ae6c60df4887012432de5'  1.196131e-06       2
3   '02cc62567a75705831e809af741b9c6a47b2b186'  7.544144e-07       3
4   'b45f834b897ac6b39242087a9f37c58fd333c36f'  3.330688e-07    

In [57]:
#authorPRK = pd.DataFrame.from_dict(score_authors, orient = "index").reset_index(drop=False)
# The orient parameter is set to "index" to indicate that the keys of the dictionary should be used as the row labels of the DataFrame.
#print(authorPRK.head(10))
#num_rows=authorPRK.shape
#print(num_rows)

#plt.hist(R,bins=100,density=1)
data1 = publiPRK_forcal.values.tolist()
#print(data1[:100])
Public_pagerank_vector=np.array(data1).reshape(-1)

In [72]:
import heapq
import numpy as np
#from graphs import plotGraph
from scipy.sparse import csr_matrix as SparseMatrix

class TrustRank:
    def __init__(self, beta, edges, epsilon, max_iterations, node_num,PageRank_vector,publiPRK_forcal):
        self.beta = beta
        self.edges = edges
        self.epsilon = epsilon
        self.node_num = node_num
        self.PageRank_vector = PageRank_vector
        self.MAX_ITERATIONS = max_iterations
        self.publiPRK_forcal=publiPRK_forcal


    def get_trustedPages(self, node_number_threshold=100):

        # set number of trusted pages
        if self.node_num < node_number_threshold:
            ratio = 0.2
        else:
            ratio = 0.0002
        trusted_set_size = int(math.ceil(self.node_num * ratio))
            ## In Python, math.ceil() is a function that rounds up a given number to the smallest integer greater than or equal to that number


        # set and return trusted pages
        heaped_ranks = [(rank, node) for (node, rank) in 
            enumerate(self.PageRank_vector)]
            ##  function returns an iterator that generates tuples containing (index, element) pairs2.
        heapq._heapify_max(heaped_ranks)
        trusted_pages = [heapq._heappop_max(heaped_ranks)[1] 
            for _ in range(trusted_set_size)]
            ## _heappop_max function is used to pop the largest element from the heap and append it to the trusted_pages list for trusted_set_size times1.

        return trusted_pages

    def get_topicSpecificRank(self, teleport_set):


        diff = math.inf
        iterations = 0
        teleport_set_size = len(teleport_set)

        #pg = plotGraph(self.edges, interval=3000)#Time in milli-seconds for which graph is shown on screen
        ##jump action define
        final_rank_vector = np.zeros(self.node_num)
        initial_rank_vector = np.fromiter([1/teleport_set_size if node in teleport_set else 0 for node in
                range(self.node_num)], dtype='float')

        while(iterations < self.MAX_ITERATIONS and diff > self.epsilon):
            new_rank_vector = np.zeros(self.node_num)
            for parent in self.edges:
                parent_value = self.publiPRK_forcal.loc[self.publiPRK_forcal["publication_id"] == parent, "index1"].values[0]
                for child in self.edges[parent]:
                    child_value = self.publiPRK_forcal.loc[self.publiPRK_forcal["publication_id"] == child, "index1"].values[0]
                    new_rank_vector[child_value-1] += self.beta*(initial_rank_vector[parent_value-1] /len(self.edges[parent]))

            leaked_rank = (1 - sum(new_rank_vector)) / teleport_set_size
            leaked_rank_vector = np.array([leaked_rank if node in teleport_set
                else 0 for node in range(self.node_num)])

            final_rank_vector = new_rank_vector + leaked_rank_vector
            diff = sum(abs(final_rank_vector - initial_rank_vector))
            initial_rank_vector = final_rank_vector

            iterations += 1
            print("TrustRank iteration: " + str(iterations),"eps is: ",diff)
            #print(final_rank_vector)
            #pg.plot(9, final_rank_vector)

        return final_rank_vector

    def trustRank(self):

        trusted_pages = self.get_trustedPages()
        print("got seed set...")
        final_rank_vector = self.get_topicSpecificRank(trusted_pages)
            ##teleport set is a set of pages which are related to each other and belong to same topic.
        return final_rank_vector

In [None]:
publi = TrustRank(beta=0.85, edges=edges1, epsilon=1e-9, max_iterations=50, node_num=Node_num, PageRank_vector=Public_pagerank_vector,publiPRK_forcal=publiPRK_forcal)
TrustRank_vector = publi.trustRank()
print(TrustRank_vector, sum(TrustRank_vector),sep='\n')
print(TrustRank_vector.max)

got seed set...


In [None]:
cols=["pageRankPublication", "publication_id"]
print(publiPRK.head())
publiPRK["publication_id"] = publiPRK["publication_id"].str.replace("'","")
publiPRK = publiPRK[cols].reset_index(drop = True)
print(publiPRK.head())

publiPRK.to_csv("pagerank_publication.csv",index = False)

#Integration of the variable Page Rank for publication datasetForScoring
enhancedDatasetForScoring = pd.merge(datasetForScoring,publiPRK, left_on = "refid", right_on = "publication_id", how = "left").drop(columns= ["publication_id"])
enhancedDatasetForScoring = enhancedDatasetForScoring.drop_duplicates(subset='refid', keep="last") #Temporary patch to manage the case where twice Page rank for some publications

In [65]:
mapped_list = list(map(lambda x: publiPRK_forcal.loc[publiPRK_forcal['publication_id'] == x, 'index1'].values[0], edge_list1))


IndexError: index 0 is out of bounds for axis 0 with size 0

In [68]:
# Create an empty list to store the mapped values
mapped_list = []

# Use nested loops and map function to apply the mapping to the 2D list
for i in range(len(edge_list1)):
    row = []
    for j in range(len(edge_list1[i])):
        mapped_value = publiPRK_forcal.loc[publiPRK_forcal['publication_id'] == edge_list1[i][j], 'index1'].values[0]
        row.append(mapped_value)
    mapped_list.append(row)

KeyboardInterrupt: 