In [3]:
import pandas as pd
import numpy as np
import networkx as nx
import community as cm

In [4]:
from IPython.display import display_html
def display_df_sbs(*args):
    html_str=''
    for df in args:
        html_str+=df.to_html()
    display_html(html_str.replace('table','table style="display:inline"'),raw=True)

In [5]:
df = pd.read_table("./data/tgraph_real_wikiedithyperlinks.txt", header = None, sep = " ",
                        names = ["src", "trg", "start", "end"])

In [6]:
df.shape

(4729035, 4)

# Pagerank

We create a graph and run the pagerank algorithm with a dampening parameter of 0.85. The dampening parameter represents the likelyhood of clicking a link on the webpage. With a dampening parameter of 0.85 we indicate that there is a 85% of clicking a link on the webpage and 15% of going to a random other node in the graph. We calculate the page rank using the power iteration method. We import the data as a DigGraph meaning that it will ignore the timeframes and ignores edges which are already in G.

In [7]:
G = nx.from_pandas_dataframe(df, 'src', 'trg', create_using=nx.DiGraph())
G_und = nx.from_pandas_dataframe(df, 'src', 'trg')
pr = nx.pagerank(G, alpha=0.85)

Next we create some utility methods for displaying the data

In [8]:
def gen_df_from_graph(graph, pagerank):
    df_edge_in = pd.DataFrame(list(graph.in_degree), columns=['node', 'in edges'])
    df_edge_out = pd.DataFrame(list(graph.out_degree), columns=['node', 'out edges'])
    df_rank = pd.DataFrame(list(pagerank.items()), columns=['node', 'score']).sort_values(by=['score'], ascending=False)
    df_temp = pd.merge(df_rank, df_edge_in, on='node')
    df_total = pd.merge(df_temp, df_edge_out, on='node')
    df_total.index = df_total.index + 1
    df_total.columns.name = 'rank'
    return df_total

def gen_df_from_node(graph, node):
    df_pred = pd.DataFrame(list(graph.predecessors(node)), columns=['node'])
    scores = {}
    out_edges = {}
    for n in graph.predecessors(node):
        out_edges[n] = len(graph.out_edges(n))
        scores[n] = pr.get(n)
    df_out_edges = pd.DataFrame(list(out_edges.items()), columns=['node', 'out edges'])     
    df_score = pd.DataFrame(list(scores.items()), columns=['node', 'score']).sort_values(by=['score'], ascending=False)
    df_temp = pd.merge(df_score, df_pred, on='node')
    df_total = pd.merge(df_temp, df_out_edges, on='node')
    df_total.index = df_total.index + 1
    df_total.columns.name = node
    return df_total

def gen_sum_inc(graph, node):
    summation = 0
    for n in graph.predecessors(node):
        summation += pr.get(n)
    return summation

We display the highest, middle and lowest 10 ranked pages.

In [9]:
graph_df = gen_df_from_graph(G, pr)
display_df_sbs(graph_df.head(10), graph_df.iloc[339448 : 339458], graph_df.tail(10))

rank,node,score,in edges,out edges
1,149,0.002692,9655,545
2,146,0.00176,9264,0
3,394,0.001724,10521,1777
4,3546,0.001287,8561,0
5,15,0.001232,5308,609
6,363,0.00116,5749,1029
7,9416,0.001093,2899,0
8,10645,0.001071,2680,12
9,216,0.001043,5201,732
10,125,0.00104,6239,533

rank,node,score,in edges,out edges
339449,423027,7.231437e-07,1,2
339450,79646,7.231434e-07,2,1
339451,243890,7.231425e-07,1,0
339452,243885,7.231425e-07,1,0
339453,243888,7.231425e-07,1,0
339454,243884,7.231425e-07,1,1
339455,243905,7.231425e-07,1,1
339456,243889,7.231425e-07,1,0
339457,243883,7.231425e-07,1,1
339458,243882,7.231425e-07,1,0

rank,node,score,in edges,out edges
678898,577540,6.754513e-07,0,1
678899,213802,6.754513e-07,0,3
678900,577533,6.754513e-07,0,3
678901,642875,6.754513e-07,0,1
678902,642857,6.754513e-07,0,3
678903,579518,6.754513e-07,0,1
678904,584894,6.754513e-07,0,1
678905,584893,6.754513e-07,0,2
678906,580930,6.754513e-07,0,1
678907,377674,6.754513e-07,0,2


The data shows a big different between the high and low ranking pages. The high ranking pages have a lot more incoming edges and a lot more outgoing edges although not all of them have a lot of outgoing edges. This makes sense because the amount of outgoing edges shouldn't directly affect the pagerank. On the other hand does the higher amount of incoming edges affect the page rank in general. If a page is linked more often it is more likely to end up higher in the pagerank.

Next we compare two of the top ranking nodes (146 and 3546) to see what makes one of them rank better than the other. We do that by showing the nodes that link to 146 and 3546 and sort them by score.

In [10]:
df_146 = gen_df_from_node(G, 146)
df_3546 = gen_df_from_node(G, 3546)
display_df_sbs(df_146.head(), df_3546.head())
print("Summation incoming node score for node 146: {sum}".format(sum=gen_sum_inc(G, 146)))
print("Summation incoming node score for node 3546: {sum}".format(sum=gen_sum_inc(G, 3546)))

146,node,score,out edges
1,394,0.001724,1777
2,15,0.001232,609
3,216,0.001043,732
4,125,0.00104,533
5,740,0.00094,149

3546,node,score,out edges
1,7219,0.000349,46
2,3539,0.000343,4
3,7191,0.000131,2
4,7127,0.000125,101
5,160929,0.000125,91


Summation incoming node score for node 146: 0.06398768016680219
Summation incoming node score for node 3546: 0.009957472206417262


We can see that both nodes are probably not in the same community since in the top 5 best scoring nodes that link to it, none of them are the same. It is interesting to see that the summation of the scores of 3546 is a lot lower than 146. The reason why the pagerank of 3546 is so high is because the nodes that link to 3546 have a lot less outgoing edges than the nodes that link to 149.

# Comminity detection

Next we compare pagerank to communities. We will check if we can find relations between the community size and pagerank. We partition the graph into communities using the louvain method.

In [12]:
part = cm.community_louvain.best_partition(G_und)

We definte some utility functions for displaying the communitie data.

In [13]:
def gen_df_community_pagerank(graph, graph_und, pagerank, partition):
    df_community = pd.DataFrame({'node': range(1, graph_und.number_of_nodes() + 1), 'community': list(part.values())})
    df_community.head()
    df_edge_in = pd.DataFrame(list(graph.in_degree), columns=['node', 'in edges'])
    df_edge_out = pd.DataFrame(list(graph.out_degree), columns=['node', 'out edges'])
    df_rank = pd.DataFrame(list(pagerank.items()), columns=['node', 'score']).sort_values(by=['score'], ascending=False)
    df_temp = pd.merge(df_rank, df_edge_in, on='node')
    df_total = pd.merge(df_temp, df_edge_out, on='node')
    df_final = pd.merge(df_total, df_community, on='node')
    df_final.index = df_final.index + 1
    df_final.columns.name = 'rank'
    return df_final

In [14]:
df_com_pr = gen_df_community_pagerank(G, G_und, pr, part)
display_df_sbs(df_com_pr.head(10), df_com_pr.iloc[10 : 20])

rank,node,score,in edges,out edges,community
1,149,0.002692,9655,545,5
2,146,0.00176,9264,0,5
3,394,0.001724,10521,1777,3
4,3546,0.001287,8561,0,0
5,15,0.001232,5308,609,4
6,363,0.00116,5749,1029,3
7,9416,0.001093,2899,0,3
8,10645,0.001071,2680,12,4
9,216,0.001043,5201,732,9
10,125,0.00104,6239,533,5

rank,node,score,in edges,out edges,community
11,740,0.00094,6484,149,0
12,1795,0.000935,5207,46,0
13,4257,0.000928,1795,1,4
14,7554,0.000902,1626,724,12
15,19206,0.000883,1973,0,5
16,220255,0.000857,1902,0,2
17,51072,0.000847,167,0,0
18,455300,0.000824,2170,0,8
19,34297,0.000777,437,7,4
20,26735,0.000747,2202,2,0


In [15]:
def gen_df_community_pagerank2(graph, graph_und, pagerank, partition):
    

SyntaxError: unexpected EOF while parsing (<ipython-input-15-dd288d8e054a>, line 2)

In [None]:
df_com_pr2 = gen_df_community_pagerank2(G, G_und, pr, part)
display_df_sbs(df_com_pr2.head(10), df_com_pr2.iloc[10 : 20])

In [21]:
df_x = pd.DataFrame(part.items(), columns=['community', 'node'])

ValueError: DataFrame constructor not properly called!