In [3]:
import numpy as np
from scipy import sparse
import networkx as nx

## Amirhossein Bagheri 98105621
## Mohammad Sadegh majidi Yazdi  98106004
## Amirmahdi kousheshi 98171053


# page ranking and Hits
+ Implementation of HITS and PageRank using the networkx library.
+ Loading `authors_sparse` and `articles_sparse` gives us sparse matrices that represent weighted link graphs.
+ <b>We implemented both articles and authors; this was easy because we had both matrices and our class
`Page_Ranking_Hits` works dynamically for both.</b>



In [4]:
class Page_Ranking_Hits:
    """PageRank and HITS scoring over a link graph stored as a sparse matrix.

    The selected matrix is turned into a ``networkx.DiGraph`` with
    ``nx.from_numpy_array``, which reads entry ``[i, j]`` as the weight of the
    directed edge ``i -> j``. Row sums (``cal_ref``) are therefore outgoing
    weight and column sums (``cal_cites``) are incoming weight.
    """

    # Accepted spellings of ``objective`` mapped to the attribute that holds
    # the corresponding matrix.
    _OBJECTIVES = {
        "article": "articles",
        "articles": "articles",
        "author": "authors",
        "authors": "authors",
    }

    def __init__(self, objective = "article", data_dir = "./Module_data"):
        """Load both link matrices and select the one to analyse.

        Args:
            objective: ``"article"``/``"articles"`` selects the articles
                matrix, ``"author"``/``"authors"`` selects the authors matrix.
            data_dir: Directory containing ``authors_sparse.npz`` and
                ``articles_sparse.npz``.

        Raises:
            ValueError: If ``objective`` is not one of the accepted names.
                Previously any unknown value silently selected the authors
                matrix, so a typo such as ``"articles"`` analysed the wrong graph.
        """
        selected = self._OBJECTIVES.get(objective) if isinstance(objective, str) else None
        if selected is None:
            raise ValueError(
                f"unknown objective {objective!r}; expected one of {sorted(self._OBJECTIVES)}"
            )

        # Not used anywhere in this class; kept so existing attribute access still works.
        self.ref_matrix = None
        self.authors = sparse.load_npz(f"{data_dir}/authors_sparse.npz")
        self.articles = sparse.load_npz(f"{data_dir}/articles_sparse.npz")
        self.objective = getattr(self, selected)

        # Scores are filled in by compute_page_rank() / compute_hits().
        self.pr = None
        self.hub = None
        self.authority = None

    def _graph(self):
        """Return the directed graph for ``self.objective``, building it only once.

        The cache remembers which matrix object it was built from, so
        reassigning ``self.objective`` triggers a rebuild.
        """
        cached = getattr(self, "_graph_cache", None)
        if cached is not None and cached[0] is self.objective:
            return cached[1]
        graph = nx.from_numpy_array(A=self.objective.toarray(), create_using=nx.DiGraph)
        self._graph_cache = (self.objective, graph)
        return graph

    @staticmethod
    def _top_k(scores, k):
        """Return the ``k`` highest ``(node, score)`` pairs of ``scores``, best first."""
        return sorted(scores.items(), key = lambda item: item[1], reverse = True)[:k]

    def compute_page_rank(self, alpha = 0.9):
        """Run ``nx.pagerank`` with parameter ``alpha`` and store the result in ``self.pr``."""
        self.pr = nx.pagerank(G=self._graph(), alpha=alpha)

    def compute_hits(self):
        """Run ``nx.hits`` and store the two results in ``self.hub`` and ``self.authority``."""
        self.hub, self.authority = nx.hits(G=self._graph())

    def tops_pages(self, k = 10):
        """Return the ``k`` best ``(node, PageRank)`` pairs.

        Runs ``compute_page_rank()`` with its default ``alpha`` first if no
        scores have been computed yet.
        """
        if getattr(self, "pr", None) is None:
            self.compute_page_rank()
        return self._top_k(self.pr, k)

    def cal_cites(self):
        """Return the column sums of the selected matrix as a 1-D array."""
        return np.asarray(self.objective.sum(axis = 0)).reshape(-1)

    def top_hubs(self, k = 10):
        """Return the ``k`` best ``(node, hub score)`` pairs, running HITS first if needed."""
        if getattr(self, "hub", None) is None:
            self.compute_hits()
        return self._top_k(self.hub, k)

    def top_auth(self, k = 10):
        """Return the ``k`` best ``(node, authority score)`` pairs, running HITS first if needed."""
        if getattr(self, "authority", None) is None:
            self.compute_hits()
        return self._top_k(self.authority, k)

    def cal_ref(self):
        """Return the row sums of the selected matrix as a 1-D array."""
        return np.asarray(self.objective.sum(axis = 1)).reshape(-1)

## articles

In [5]:
# Build the analyser for the articles graph, then score it with PageRank (alpha = 0.9) and HITS.
page_hits_articles = Page_Ranking_Hits("article")
page_hits_articles.compute_page_rank(alpha=0.9)
page_hits_articles.compute_hits()

  A = nx.adjacency_matrix(G, nodelist=list(G), dtype=float)


Here are the top 10 pages the algorithm gave us. It is logical to expect that the article that is cited the most and references no other article will win.

In [6]:
# Ten highest PageRank scores for articles, as (node index, score) pairs.
page_hits_articles.tops_pages(10)

[(8, 0.0008656540101637704),
 (290, 0.0007120085541894802),
 (36, 0.0006847758695930043),
 (249, 0.0006323226542710098),
 (17, 0.000611611449381661),
 (5, 0.000559734763528736),
 (525, 0.0005106178948410967),
 (345, 0.0004920061879285661),
 (6, 0.0004883780903031131),
 (540, 0.00046156682881045826)]

You can see here that article 8 is the winner, in first place. It is cited 67 times, which is the maximum number of citations in the dataset, and it references no other article.
So our earlier assumption holds.

In [7]:
# Compare the top PageRank article with the raw column sums (cites) and row sums (references).
index = page_hits_articles.tops_pages(k=10)[0][0]
print(
    f"maximum cited graph {page_hits_articles.cal_cites().max()}",
    f"index {index} is best page algorithm founded",
    f"cites of index {index} = {page_hits_articles.cal_cites()[index]}",
    f"refrences of index {index} = {page_hits_articles.cal_ref()[index]}",
    sep="\n",
)

maximum cited graph 67.0
index 8 is best page algorithm founded
cites of index 8 = 67.0
refrences of index 8 = 0.0


For the HITS algorithm, the top authority is the same as the top-ranked PageRank page, but second place onward differs.

In [8]:
# Ten highest HITS authority scores for articles, as (node index, score) pairs.
page_hits_articles.top_auth(10)

[(8, 0.05899759850735505),
 (17, 0.03795623787682291),
 (36, 0.03720893021776358),
 (6, 0.02921601034711979),
 (5, 0.027878046189367258),
 (66, 0.024473872655129555),
 (32, 0.015517467201476558),
 (2, 0.01456622186167456),
 (21, 0.01188781546187709),
 (185, 0.011025246203961978)]

In [9]:
# Compare the top authority article with the raw column sums (cites) and row sums (references).
# Note: the first line prints the maximum column sum, not an authority score.
index = page_hits_articles.top_auth(k=10)[0][0]
print(
    f"maximum authority graph {page_hits_articles.cal_cites().max()}",
    f"index {index} is best authority algorithm founded",
    f"cites of index {index} = {page_hits_articles.cal_cites()[index]}",
    f"refrences of index {index} = {page_hits_articles.cal_ref()[index]}",
    sep="\n",
)

maximum authority graph 67.0
index 8 is best authority algorithm founded
cites of index 8 = 67.0
refrences of index 8 = 0.0


For hubs: because we were told to collect only the first 10 references of each article, every node points to around 10 articles, so the scores assigned to hubs are quite close to each other.

In [180]:
# Ten highest HITS hub scores for articles, as (node index, score) pairs.
page_hits_articles.top_hubs(10)

[(817, 0.016919708701678134),
 (998, 0.015173367938263075),
 (305, 0.014741094322541967),
 (242, 0.014599856211068513),
 (656, 0.014255014419600932),
 (4293, 0.013889709665425493),
 (240, 0.013710600553457642),
 (955, 0.01355607598659458),
 (952, 0.01351711024095247),
 (2506, 0.0132755140670792)]

Here you can see that the best hub has 11 references and is cited 0 times.

In [10]:
# Compare the top hub article with the raw column sums (cites) and row sums (references).
# Note: the first line prints the maximum row sum, not a hub score.
index = page_hits_articles.top_hubs(k=10)[0][0]
print(
    f"maximum hubs graph {page_hits_articles.cal_ref().max()}",
    f"index {index} is best hobs algorithm founded",
    f"cites of index {index} = {page_hits_articles.cal_cites()[index]}",
    f"refrences of index {index} = {page_hits_articles.cal_ref()[index]}",
    sep="\n",
)

maximum hobs graph 11.0
index 817 is best hobs algorithm founded
cites of index 817 = 0.0
refrences of index 817 = 11.0


## authors
Here is the same process for authors.
Note that the authors graph is weighted.

In [11]:
# Build the analyser for the authors graph, then score it with PageRank (alpha = 0.9) and HITS.
page_hits_authors = Page_Ranking_Hits("authors")
page_hits_authors.compute_page_rank(alpha=0.9)
page_hits_authors.compute_hits()

In [12]:
# Ten highest PageRank scores for authors, as (node index, score) pairs.
page_hits_authors.tops_pages(10)

[(125, 0.0013876996913499957),
 (89, 0.0009782027108704535),
 (1300, 0.0009569687773095832),
 (390, 0.0008823641826682489),
 (1591, 0.0008752411386493033),
 (61, 0.000769724146590484),
 (40, 0.0007360523743464498),
 (719, 0.0007104064236694909),
 (5, 0.0006863153378455461),
 (741, 0.0006796511855971072)]

In [13]:
# Compare the top PageRank author with the raw column sums (cites) and row sums (references).
index = page_hits_authors.tops_pages(k=10)[0][0]
print(
    f"maximum cited graph {page_hits_authors.cal_cites().max()}",
    f"index {index} is best page algorithm founded",
    f"cites of index {index} = {page_hits_authors.cal_cites()[index]}",
    f"refrences of index {index} = {page_hits_authors.cal_ref()[index]}",
    sep="\n",
)

maximum cited graph 570.0
index 125 is best page algorithm founded
cites of index 125 = 570.0
refrences of index 125 = 265.0


In [14]:
# Ten highest HITS authority scores for authors, as (node index, score) pairs.
page_hits_authors.top_auth(10)

[(125, 0.015249612104948136),
 (719, 0.012285288174642847),
 (40, 0.010882592837816734),
 (1291, 0.009718618171959768),
 (89, 0.008810532064713258),
 (741, 0.008132404883454894),
 (42, 0.007737231355160543),
 (1173, 0.007628077581311401),
 (39, 0.007575760369123254),
 (45, 0.0072182701859336874)]

In [15]:
# Compare the top authority author with the raw column sums (cites) and row sums (references).
# Note: the first line prints the maximum column sum, not an authority score.
index = page_hits_authors.top_auth(k=10)[0][0]
print(
    f"maximum authority graph {page_hits_authors.cal_cites().max()}",
    f"index {index} is best authority algorithm founded",
    f"cites of index {index} = {page_hits_authors.cal_cites()[index]}",
    f"refrences of index {index} = {page_hits_authors.cal_ref()[index]}",
    sep="\n",
)

maximum authority graph 570.0
index 125 is best authority algorithm founded
cites of index 125 = 570.0
refrences of index 125 = 265.0


In [16]:
# Ten highest HITS hub scores for authors, as (node index, score) pairs.
page_hits_authors.top_hubs(10)

[(719, 0.014150077507117848),
 (1291, 0.009556922060991353),
 (106, 0.009451976304943672),
 (10, 0.009117263343134759),
 (1173, 0.008474869913528103),
 (1172, 0.00675277669131662),
 (831, 0.006608821366366546),
 (110, 0.006581395814896911),
 (147, 0.006510907954792334),
 (716, 0.006343058702093629)]

In [17]:
# Compare the top hub author with the raw column sums (cites) and row sums (references).
# Note: the first line prints the maximum row sum, not a hub score.
index = page_hits_authors.top_hubs(k=10)[0][0]
print(
    f"maximum hobs graph {page_hits_authors.cal_ref().max()}",
    f"index {index} is best hobs algorithm founded",
    f"cites of index {index} = {page_hits_authors.cal_cites()[index]}",
    f"refrences of index {index} = {page_hits_authors.cal_ref()[index]}",
    sep="\n",
)

maximum hobs graph 503.0
index 719 is best hobs algorithm founded
cites of index 719 = 353.0
refrences of index 719 = 389.0
