# PageRank

In [42]:
import networkx as nx
import pandas as pd

In [43]:
from IPython.display import display_html
def display_df_sbs(*args):
    """Render several DataFrames next to each other in the cell output.

    Each frame is converted with ``DataFrame.to_html``; the opening
    ``<table`` tag of every rendered frame is then given an inline display
    style so the tables flow horizontally instead of stacking vertically.

    Parameters
    ----------
    *args : pandas.DataFrame
        Frames to show, in left-to-right order.

    Returns
    -------
    None
        The HTML is sent to the notebook front end via ``display_html``.
    """
    html_str = ''.join(df.to_html() for df in args)
    # Patch only the opening tags. Replacing the bare word 'table' would also
    # rewrite the closing tags (producing invalid '</table style=...>') and
    # any column header or cell whose text happens to contain "table".
    inline_html = html_str.replace('<table', '<table style="display:inline"')
    display_html(inline_html, raw=True)

Load edge list and create a graph

In [44]:
# Read the edge list into a directed graph. The context manager closes the
# file even if parsing the edge list raises.
with open("canvas/hamster.edgelist", 'rb') as fh:
    G = nx.read_edgelist(fh, create_using=nx.DiGraph())

Next we run the PageRank algorithm with a damping parameter of 0.85. The damping parameter represents the likelihood of following a link on the current page. With a damping parameter of 0.85 we indicate that there is an 85% chance of following a link on the page and a 15% chance of jumping to a random other node in the graph. We calculate the PageRank using the power iteration method.

In [45]:
# alpha is the damping factor described in the text above (0.85).
# The result is used below as a mapping from node to score.
pr = nx.pagerank(G, alpha=0.85)

In [46]:
# Per-node degree tables, one row per node of the graph.
df_edge_in = pd.DataFrame(list(G.in_degree), columns=['node', 'in edges'])
df_edge_out = pd.DataFrame(list(G.out_degree), columns=['node', 'out edges'])

# PageRank scores, best-scoring node first.
df_rank = pd.DataFrame(list(pr.items()), columns=['node', 'score'])
df_rank = df_rank.sort_values(by=['score'], ascending=False)

# Attach the in- and out-degree of every node to its score.
df_temp = df_rank.merge(df_edge_in, on='node')
df_total = df_temp.merge(df_edge_out, on='node')

# Shift the fresh 0-based index so it reads as a 1-based rank.
df_total.index = df_total.index + 1
df_total.columns.name = 'rank'

# Show the top 10, four 10-row windows further down the ranking, and the bottom 10.
rank_windows = [df_total.iloc[start : start + 10] for start in (500, 1000, 1500, 2000)]
display_df_sbs(df_total.head(10), *rank_windows, df_total.tail(10))

rank,node,score,in edges,out edges
1,404,0.042793,10,0
2,195,0.019961,80,1
3,77,0.018628,121,2
4,728,0.01553,10,0
5,36,0.011117,168,5
6,135,0.009544,49,8
7,192,0.009365,57,3
8,281,0.009304,32,0
9,136,0.008853,85,6
10,184,0.008296,80,3

rank,node,score,in edges,out edges
501,469,0.000328,26,48
502,1406,0.000328,13,26
503,610,0.000326,18,16
504,972,0.000326,7,4
505,129,0.000324,5,6
506,834,0.000322,11,2
507,1266,0.000322,3,4
508,27,0.000322,4,6
509,480,0.000321,4,6
510,1335,0.00032,5,1

rank,node,score,in edges,out edges
1001,708,0.000203,4,5
1002,896,0.000203,8,8
1003,270,0.000203,14,20
1004,799,0.000203,9,7
1005,811,0.000202,3,7
1006,56,0.000202,7,6
1007,290,0.000202,1,0
1008,142,0.000202,3,3
1009,944,0.000201,1,0
1010,695,0.000201,3,3

rank,node,score,in edges,out edges
1501,1597,0.000137,1,3
1502,1908,0.000137,1,3
1503,1442,0.000137,1,3
1504,1702,0.000137,1,3
1505,1048,0.000137,1,3
1506,1259,0.000137,1,7
1507,2172,0.000137,1,3
1508,1771,0.000137,1,3
1509,2035,0.000137,1,3
1510,2131,0.000137,1,3

rank,node,score,in edges,out edges
2001,2327,0.000113,0,4
2002,2091,0.000113,0,5
2003,2329,0.000113,0,3
2004,2330,0.000113,0,4
2005,2335,0.000113,0,4
2006,2309,0.000113,0,4
2007,1359,0.000113,0,1
2008,2006,0.000113,0,5
2009,1501,0.000113,0,4
2010,1148,0.000113,0,1

rank,node,score,in edges,out edges
2417,1739,0.000113,0,4
2418,918,0.000113,0,4
2419,1743,0.000113,0,1
2420,1744,0.000113,0,1
2421,1745,0.000113,0,1
2422,1746,0.000113,0,3
2423,1748,0.000113,0,4
2424,1749,0.000113,0,3
2425,1751,0.000113,0,2
2426,2426,0.000113,0,8


As expected, the higher-ranked pages have, on average, more incoming edges than the lower-ranked pages. It is important to note that a page being linked to by many other pages doesn't imply that it will rank high in PageRank. The rank of a page is mainly influenced by the quality of the links directed to the page. A page that is linked to from many other pages is, however, still far more likely to end up higher in the ranking than a page that is linked to less frequently. This is also shown in the data from the PageRank calculation above: the lower the rank, the fewer incoming edges the pages have. There are, however, some exceptions in the data. One of them is the number 1 ranked page. The score of that page far exceeds that of the other pages: it has a score of 0.042793 = 4.3%, while the second-best ranked page only has a score of 0.019961 = 2.0%. We will analyze this page by looking at the quality of the pages that link to it.

In [47]:
def gen_df(node, graph=None, scores=None):
    """Build a table of the pages that link to ``node``.

    Parameters
    ----------
    node : hashable
        Node whose predecessors (pages with an edge pointing at it) are listed.
    graph : optional
        Directed graph exposing ``predecessors(n)`` and ``out_degree(n)``.
        Defaults to the notebook-level graph ``G``.
    scores : mapping, optional
        Mapping from node to score. Defaults to the notebook-level ``pr``.

    Returns
    -------
    pandas.DataFrame
        Columns ``node``, ``score`` and ``out edges``, one row per
        predecessor, sorted by score in descending order. The index starts
        at 1 and the column-axis name is set to ``node`` so the rendered
        table is labelled with the page it describes.
    """
    graph = G if graph is None else graph
    scores = pr if scores is None else scores

    # Collect everything in a single pass over the predecessors instead of
    # building separate frames and merging them back together.
    rows = [(n, scores.get(n), graph.out_degree(n)) for n in graph.predecessors(node)]
    df_total = (
        pd.DataFrame(rows, columns=['node', 'score', 'out edges'])
        .sort_values(by=['score'], ascending=False)
        .reset_index(drop=True)
    )
    df_total.index = df_total.index + 1
    df_total.columns.name = node
    return df_total

def gen_sum_inc(node, graph=None, scores=None):
    """Sum the scores of all pages that link to ``node``.

    Parameters
    ----------
    node : hashable
        Node whose predecessors' scores are added up.
    graph : optional
        Directed graph exposing ``predecessors(n)``. Defaults to the
        notebook-level graph ``G``.
    scores : mapping, optional
        Mapping from node to score. Defaults to the notebook-level ``pr``.

    Returns
    -------
    number
        Total score of the predecessors; ``0`` when ``node`` has none.

    Raises
    ------
    KeyError
        If a predecessor has no entry in ``scores``.
    """
    graph = G if graph is None else graph
    scores = pr if scores is None else scores
    # Index directly rather than using .get(): a missing score should fail
    # with the offending node name, not with a TypeError from adding None.
    return sum(scores[n] for n in graph.predecessors(node))

In [48]:
# Predecessor tables for the two top-ranked nodes. The table for 195 is
# built once and sliced twice; the empty frame acts as a visual spacer.
df_404 = gen_df('404')
df_195 = gen_df('195')
# The side-by-side helper defined earlier in this notebook is display_df_sbs.
display_df_sbs(df_404, pd.DataFrame(), df_195.head(10), df_195.tail(10))
print("Summation incoming node score for node 404: {sum}".format(sum=gen_sum_inc('404')))
print("Summation incoming node score for node 195: {sum}".format(sum=gen_sum_inc('195')))

404,node,score,out edges
1,195,0.019961,1
2,77,0.018628,2
3,192,0.009365,3
4,126,0.008144,1
5,346,0.005487,3
6,403,0.004875,2
7,24,0.003894,3
8,246,0.002964,2
9,882,0.002327,1
10,775,0.000385,1

195,node,score,out edges
1,77,0.018628,2
2,36,0.011117,5
3,192,0.009365,3
4,181,0.005597,12
5,346,0.005487,3
6,182,0.004905,13
7,116,0.004715,54
8,125,0.003839,4
9,115,0.003329,45
10,101,0.003121,5

195,node,score,out edges
71,2019,0.000119,13
72,618,0.000119,16
73,2195,0.000113,6
74,684,0.000113,54
75,2135,0.000113,7
76,2097,0.000113,1
77,2018,0.000113,15
78,855,0.000113,3
79,911,0.000113,15
80,2352,0.000113,3


Summation incoming node score for node 404: 0.07603158841472238
Summation incoming node score for node 195: 0.1244965540211601


The data clearly shows that there are a lot more links to 195 than to 404. The data also shows that the sum of the scores of all the pages that link to 195 is roughly 1.6 times as high as the sum of the scores of all pages that link to 404. Even so, the score of 404 is much higher than the score of 195. The reason for this is that the pages that link to 195 also link to a lot of other pages, while this is not the case for 404: the number of outgoing edges of the pages that link to 404 is lower than for 195. Links from low-scoring pages also do not affect the score of a page by much. Most of the score that both pages 404 and 195 receive comes from a few pages with high scores and a low number of outgoing links.

We now compare nodes 404 and 728. They look very similar in terms of links both to and from the page: both have 10 links pointing to them and neither page contains any outgoing links. Although they look the same in terms of connected edges, the score of node 404 is a lot higher than the score of 728. The only explanation for this is that the quality of the incoming edges of 404 must be better than the quality of the incoming edges of 728. We confirm this by looking at the nodes with edges directed to both pages.

In [49]:
# Predecessor tables for nodes 404 and 728; the empty frame acts as a
# visual spacer. The side-by-side helper defined earlier is display_df_sbs.
display_df_sbs(gen_df('404'), pd.DataFrame(), gen_df('728'))
print("Summation incoming node score for node 404: {sum}".format(sum=gen_sum_inc('404')))
# The second sum is computed for node 728, so it is labelled as such.
print("Summation incoming node score for node 728: {sum}".format(sum=gen_sum_inc('728')))

404,node,score,out edges
1,195,0.019961,1
2,77,0.018628,2
3,192,0.009365,3
4,126,0.008144,1
5,346,0.005487,3
6,403,0.004875,2
7,24,0.003894,3
8,246,0.002964,2
9,882,0.002327,1
10,775,0.000385,1

728,node,score,out edges
1,697,0.007044,1
2,724,0.004799,4
3,222,0.003467,4
4,727,0.003348,1
5,726,0.002353,2
6,170,0.002058,12
7,221,0.002017,5
8,725,0.001993,1
9,723,0.001598,1
10,220,0.001575,4


Summation incoming node score for node 404: 0.07603158841472238
Summation incoming node score for node 195: 0.030253764353088613


From the summation of the incoming node scores we see that 404 scores better. However, the comparison between 404 and 195 already showed that a higher summation doesn't necessarily imply a higher score, so this alone does not guarantee that 404 will score better than 728. If we take a look at the number of outgoing edges of the incoming nodes, we see that the pages linking to 728 have slightly more in total. This also doesn't necessarily mean that the score of 728 should be lower than that of 404. A high number of outgoing edges on a node with a high score has far more impact than a high number of outgoing edges on a node with a low score. The summed score of the linking pages and their total number of outgoing edges are good indicators, but they are not always right. In this case, however, they are.