In [72]:
import numpy as np
from sklearn.preprocessing import normalize
from numpy.linalg import norm
from numpy.linalg import norm
import pandas as pd
from tkinter import Tk
from tkinter.filedialog import askopenfilename
import time as time



# Author: Ashwin Srinivas
# Date: 21st Apr 2019
# Notebook Desc: PageRank in the sample input matrix

In [73]:
# Enter the adjacency matrix
num_journals = int(input("Enter the number of journals "))
  
print("Enter all",num_journals*num_journals,"citations in order separated by space: ") 
  
# User input of entries in a single line separated by space 
# Test Input: 1 0 2 0 4 3 3 0 1 1 0 0 2 0 4 0 1 0 0 0 1 0 0 1 8 0 3 0 5 2 0 0 0 0 0 0
entries1 = list(map(int, input().split())) 

# Reshape the list into a n x n matrix
mtx1 = np.array(entries1).reshape(num_journals, num_journals)

print("Adjacency matrix:")
print(mtx1)        

Enter the number of journals 6
Enter all 36 citations in order separated by space: 
1 0 2 0 4 3 3 0 1 1 0 0 2 0 4 0 1 0 0 0 1 0 0 1 8 0 3 0 5 2 0 0 0 0 0 0
Adjacency matrix:
[[1 0 2 0 4 3]
 [3 0 1 1 0 0]
 [2 0 4 0 1 0]
 [0 0 1 0 0 1]
 [8 0 3 0 5 2]
 [0 0 0 0 0 0]]


In [74]:
# Step 1: Make all self-references 0 (make diagonal elements 0)
for i in range(num_journals):
    for j in range(num_journals):
        if (i==j): 
            mtx1[i][j]=0
            break
            
print("Matrix after removing self-references: ")
print(mtx1)

Matrix after removing self-references: 
[[0 0 2 0 4 3]
 [3 0 1 1 0 0]
 [2 0 0 0 1 0]
 [0 0 1 0 0 1]
 [8 0 3 0 0 2]
 [0 0 0 0 0 0]]


In [75]:
# Step 2: Normalize this matrix. Divide each elememt by column_sum
norm_mtx1 = normalize(mtx1, axis=0, norm='l1') # imported from sklearn

#print the normalized matrix
print("Normalized matrix: ")
print(norm_mtx1)

Normalized matrix: 
[[0.         0.         0.28571429 0.         0.8        0.5       ]
 [0.23076923 0.         0.14285714 1.         0.         0.        ]
 [0.15384615 0.         0.         0.         0.2        0.        ]
 [0.         0.         0.14285714 0.         0.         0.16666667]
 [0.61538462 0.         0.42857143 0.         0.         0.33333333]
 [0.         0.         0.         0.         0.         0.        ]]


In [77]:
#Step 3: Compute the dangling-node vector. This will be the column sums of the normalized matrix
d=[]
for i in range(0,num_journals):
    d.append(0)
for i in range(0,num_journals):
    for j in range(0,num_journals):
        d[i] += norm_mtx1[j][i]     #d will have the column sums of all columns

# If column_sum=0, then make that value 1 in the vector
for i in range(0,num_journals):
    if(d[i]==1.0):
        d[i]=0.0
    else:
        d[i]=1.0

        
d = np.array(d).reshape(1,num_journals)
#print the dangling-node vector
print("Dangling-node vector: ")
print(d)

Dangling-node vector: 
[[0. 1. 0. 0. 0. 0.]]


In [78]:
# Generate the article vector and influence vector (for the first iteration)

a = [3/14,2/14,5/14,1/14,2/14,1/14] # this is the article vector
inf_vec_init = [] # this is the influence vector for the prev iteration
inf_vec = [] # this is the influence vector for the next iteration
for i in range(0,num_journals):
    inf_vec_init.append(float(1/num_journals))
    inf_vec.append(0.0)



# Convert both these lists into vectors of appropriate dimensions
a = np.array(a).reshape(num_journals, 1)
inf_vec_init = np.array(inf_vec_init).reshape(num_journals, 1)
inf_vec = np.array(inf_vec).reshape(num_journals, 1)


print("a: ",a)
print("influence vector: ",inf_vec_init)

#np.matmul(norm_mtx1,d1)

a:  [[0.21428571]
 [0.14285714]
 [0.35714286]
 [0.07142857]
 [0.14285714]
 [0.07142857]]
influence vector:  [[0.16666667]
 [0.16666667]
 [0.16666667]
 [0.16666667]
 [0.16666667]
 [0.16666667]]


In [10]:
# Inf_vec(k+1) = alpha*H*Inf_vec(k) + [alpha*d*Inf_vec(k) + (1-alpha)]a

# Here alpha=0.85; inf_vec(0)=[1/6,1/6,1/6,1/6,1/6,1/6]; H is the normalized matrix;
# d is the dangling node vector; a is article vector same as inf_vec_init


i=1
alpha = 0.85
while(i<=20):
    print("Iteration: ",i)
    inf_vec = (alpha * (np.matmul(norm_mtx1,inf_vec_init))) + np.matmul(a,(alpha * (np.dot(d,inf_vec_init)) + (1-alpha)))
    if(np.linalg.norm(inf_vec - inf_vec_init) <= 0.00001): # epsilon = 0.00001
        break
    else:
        print("Residual: ",np.linalg.norm(inf_vec - inf_vec_init))
        inf_vec_init = inf_vec
    i+=1
print("Influence vector: ")
print(inf_vec)

Iteration:  1
Residual:  0.2368453122091217
Iteration:  2
Residual:  0.08092008866629648
Iteration:  3
Residual:  0.03191071858184089
Iteration:  4
Residual:  0.005607834606594513
Iteration:  5
Residual:  0.0036292005059997416
Iteration:  6
Residual:  0.0015733983460622374
Iteration:  7
Residual:  0.0009248510671177797
Iteration:  8
Residual:  0.0005579643482496045
Iteration:  9
Residual:  0.00034763075755432735
Iteration:  10
Residual:  0.00021288182741747573
Iteration:  11
Residual:  0.0001314491222846803
Iteration:  12
Residual:  8.081769118034092e-05
Iteration:  13
Residual:  4.979979063586942e-05
Iteration:  14
Residual:  3.066549082426064e-05
Iteration:  15
Residual:  1.8892225266025046e-05
Iteration:  16
Residual:  1.1636776304398121e-05
Iteration:  17
Influence vector: 
[[0.30402454]
 [0.16360216]
 [0.18979672]
 [0.04661902]
 [0.2753102 ]
 [0.02064736]]


In [13]:
# Calculate the Eigenfactor:

# dot product of the Normalized matrix and the Influence vector
eigenfactor = []
for i in range(0,num_journals):
    eigenfactor.append(0.0)
eigenfactor = np.array(eigenfactor).reshape(num_journals, 1) #creating an Nx1 vector with 0.0 as its elements

eigenfactor = np.dot(norm_mtx1,inf_vec) # dot product
eigenfactor = 100*(normalize(eigenfactor, axis=0, norm='l1')) #multiply it by 100 to get percentages

eigenfactor # final eigenfactor matrix


array([[34.05071853],
       [17.20381588],
       [12.17543157],
       [ 3.65317104],
       [32.91686298],
       [ 0.        ]])

In [None]:
# --------------------------------------PART 2--------------------------------------------------------#
# ------------------ IMPORT THE TXT CSV FILE
Tk().withdraw() # we don't want a full GUI, so keep the root window from appearing
filename = askopenfilename() # show an "Open" dialog box and return the path to the selected file
journals_df=pd.read_csv(filename, header=None)


In [3]:
# test if the import has happened

print(journals_df.head(10))
# print(journals_df.shape)
# print(journals_df.dtypes)
print(journals_df.min())
print(journals_df.max())

     0     1    2
0  758  1476    5
1  758   758  150
2  758  5938    3
3  758  4972   13
4  758  2416    0
5  758  7067    1
6  758  4543    0
7  758  2722    1
8  758  2249    1
9  758  7531    1
0    0
1    0
2    0
dtype: int64
0    10747
1    10747
2    28293
dtype: int64


In [5]:
# Initialize an empty adjacency matrix with 0.0 as values

adj_mtx = []

n1 = 10748*10748 # there are 10748 unique. check with max and min values
for i in range(0,n1):
    adj_mtx.append(0.0)
adj_mtx = np.array(adj_mtx).reshape(10748, 10748)
print(adj_mtx)

[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]


In [6]:
# Populate the adj matrix
i=0
j=0
for index, row in journals_df.iterrows():
    j = row[0]; i = row[1]; value = row[2] # setting the j to col1 and i to col2 in the DF
    adj_mtx[i][j] = value

print(adj_mtx)

[[  34.    0.    0. ...    0.    0.    0.]
 [   0.   21.    0. ...    0.    0.    0.]
 [   0.    0. 1594. ...    0.    0.    0.]
 ...
 [   0.    0.    0. ...   20.    0.    0.]
 [   0.    0.    0. ...    0.    0.    0.]
 [   0.    0.    0. ...    0.    0.   40.]]


In [13]:
# Step 1:  Remove self references (diagonal elements=0.0) in the adjacency matrix
for i in range(0,10748):
    for j in range(0,10748):
        if (i==j): 
            adj_mtx[i][j]=0.0
            break
            
print("Matrix after removing self-references: ")
print(adj_mtx)

Matrix after removing self-references: 
[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]


In [23]:
# Step 2: Normalize adjacency matrix column-wise to get the share of each publisher

norm_adj_mtx = adj_mtx

j=0 # initialize for zeroth column index
while(j<10748):
    col_total = sum(norm_adj_mtx[:,j]) #for each column, sum the values in each row
    i = 0
    while(i<10748):
        if(col_total!=0):
            norm_adj_mtx[i][j] = (norm_adj_mtx[i][j])/col_total
        i+=1
    j+=1

# print the normalized adjacency matrix
print(norm_adj_mtx)

[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]


In [28]:
# Step 3: Compute the dangling-node vector. This will be the column sums of the normalized matrix
d=[]
for i in range(0,10748):
    d.append(0.0)
for i in range(0,10748):
    for j in range(0,10748):
        d[i] += norm_adj_mtx[j][i]     #d will have the column sums of all columns

# If column_sum=0, then make that value 1 in the vector
for i in range(0,10748):
    if(d[i]==0.0):
        d[i]=1
    else:
        d[i]=0

        
d = np.array(d).reshape(1,10748)
#print the dangling-node vector
print("Dangling-node vector: ")
print(d)


Dangling-node vector: 
[[0 0 0 ... 0 0 0]]


In [54]:
# Initialize the article vector and influence vector (for the first iteration)

art_vec = [] # this is the article vector
pi_k = [] # this is the influence vector for the prev iteration
pi_k_plus1 = [] # this is the influence vector for the next iteration
for i in range(0,10748):
    pi_k.append(float(1/10748)) 
    # here, we are assuming that each publication has only 1 journal. 
    # So the contribution of each publication will be 1/total num of publications
    art_vec.append(float(1/10748))
    pi_k_plus1.append(0.0) # initialize list elements to 0.0



# Convert both these lists into vectors of appropriate dimensions
art_vec = np.array(art_vec).reshape(10748, 1)
pi_k = np.array(pi_k).reshape(10748, 1)
pi_k_plus1 = np.array(pi_k_plus1).reshape(10748, 1)


print("Article vector: ",art_vec)
print("Initial influence vector: ",pi_k)
print("Pi(k+1): ",pi_k_plus1)

Article vector:  [[9.30405657e-05]
 [9.30405657e-05]
 [9.30405657e-05]
 ...
 [9.30405657e-05]
 [9.30405657e-05]
 [9.30405657e-05]]
Initial influence vector:  [[9.30405657e-05]
 [9.30405657e-05]
 [9.30405657e-05]
 ...
 [9.30405657e-05]
 [9.30405657e-05]
 [9.30405657e-05]]
Pi(k+1):  [[0.]
 [0.]
 [0.]
 ...
 [0.]
 [0.]
 [0.]]


In [55]:
# Inf_vec(k+1) = alpha*H*Inf_vec(k) + [alpha*d*Inf_vec(k) + (1-alpha)]a

i=1
alpha = 0.85
start = time.time()
while(i<=100):
    #print("Iteration: ",i)
    pi_k_plus1 = (alpha * (np.matmul(norm_adj_mtx,pi_k))) + np.matmul(art_vec,(alpha * (np.dot(d,pi_k)) + (1-alpha)))
    if(np.linalg.norm(pi_k_plus1 - pi_k) <= 0.00001): # epsilon = 0.00001
        break
    else:
        #print("Residual: ",np.linalg.norm(pi_k_plus1 - pi_k))
        pi_k = pi_k_plus1
    i+=1
end = time.time()
print("Converged at iteration: ",i)
print("Execution time: ",end-start,"sec")
print("Influence vector: ")
print(pi_k_plus1)

Converged at iteration:  21
Execution time:  0.9868021011352539 sec
Influence vector: 
[[4.38757550e-05]
 [2.74529615e-05]
 [1.47057756e-04]
 ...
 [4.08370309e-05]
 [1.56825108e-05]
 [3.67758982e-05]]


In [59]:
#Calculate the eigenfactor vector

eigenfactor = np.dot(norm_adj_mtx,pi_k_plus1)
norm_eigenfactor = eigenfactor

col_total = sum(eigenfactor[:,0])  #col_total of matrix before normalization
for j in range(0,10748):
    norm_eigenfactor[j] = eigenfactor[j]/col_total #normalize each element

norm_eigenfactor = 100 * norm_eigenfactor
print(norm_eigenfactor) # print the final normalized vector

[[0.00346127]
 [0.00150941]
 [0.01572399]
 ...
 [0.00310084]
 [0.00011093]
 [0.00261783]]


In [62]:
# scores for top 20 journals

top20indices = np.argpartition(a = norm_eigenfactor, kth = -20, axis = 0)[-20:] #get the indices of top 20 elements
print("Top top 20 journals with scores")
print("Index\tScore")
for i in top20indices:
    print(i,'\t',norm_eigenfactor[i])

Top top 20 journals with scores
Index	Score
[1212] 	 [[0.31121248]]
[5035] 	 [[0.31659069]]
[4598] 	 [[0.37262531]]
[2880] 	 [[0.33019386]]
[6569] 	 [[0.31919523]]
[3314] 	 [[0.32730623]]
[3480] 	 [[0.37952447]]
[8930] 	 [[0.47758936]]
[1995] 	 [[0.38598353]]
[6857] 	 [[0.4396224]]
[1935] 	 [[0.38504837]]
[5966] 	 [[0.42962702]]
[6523] 	 [[0.48060872]]
[6610] 	 [[1.23460582]]
[6667] 	 [[0.63425277]]
[2056] 	 [[0.67933464]]
[4024] 	 [[0.57686694]]
[6919] 	 [[0.66469197]]
[4408] 	 [[1.4475384]]
[4801] 	 [[1.41203757]]


Above are the list of top 20 journals with their scores.

Code took 0.9868021011352539 sec to converge. (This varies with how busy my system was. But usually it executed within 3 sec)

It converged at the 21st iteration.