# 1b Graph evolving

In [1]:
import networkx as nx
import plotly.plotly as py
import plotly.figure_factory as ff
import pandas as pd

Load edge list and create a graph

In [2]:
# Read the edge list into a directed graph. The context manager closes the
# file even if parsing raises, which the manual open()/close() pair did not.
with open("canvas/hamster.edgelist", 'rb') as fh:
    G = nx.read_edgelist(fh, create_using=nx.DiGraph())

Next we run the PageRank algorithm with a damping factor of 0.85. The damping factor represents the likelihood of following a link on the current page. With a damping factor of 0.85 there is an 85% chance of following a link on the page and a 15% chance of jumping to a random other node in the graph. We calculate the PageRank using the power iteration method.

In [3]:
def calc_pagerank(G, alpha = 0.85):
    """Return the PageRank scores of ``G`` as computed by ``nx.pagerank``.

    Parameters
    ----------
    G : graph
        Graph handed unchanged to ``nx.pagerank``.
    alpha : float, optional
        Damping factor forwarded to ``nx.pagerank`` (default 0.85).

    Returns
    -------
    Whatever ``nx.pagerank`` returns (used elsewhere in this notebook as a
    mapping of node -> score).
    """
    # Forward the caller's alpha; it used to be hard-coded to 0.85 here, so
    # any other value passed in was silently ignored.
    return nx.pagerank(G, alpha=alpha)

In [4]:
# Baseline PageRank scores of G, computed with calc_pagerank's default alpha.
pr_origin = calc_pagerank(G)

In [5]:
def create_dataframe(pr, graph=None):
    """Build a rank table combining PageRank scores with node degrees.

    Parameters
    ----------
    pr : mapping
        Node -> score mapping (its ``items()`` are used).
    graph : graph, optional
        Graph whose ``in_degree()`` / ``out_degree()`` supply the degree
        columns. Defaults to the notebook-level ``G`` so existing
        one-argument calls behave exactly as before; pass the graph that
        ``pr`` was computed on when that is a different object.

    Returns
    -------
    pandas.DataFrame
        Columns ``node``, ``score``, ``in edges``, ``out edges``, ordered by
        descending score, with a 1-based index and the column axis named
        ``rank``. Nodes missing from any of the three inputs are dropped by
        the (inner) merges.
    """
    if graph is None:
        # Backward-compatible default: fall back to the global graph.
        graph = G
    df_edge_in = pd.DataFrame(list(graph.in_degree()), columns=['node', 'in edges'])
    df_edge_out = pd.DataFrame(list(graph.out_degree()), columns=['node', 'out edges'])
    df_rank = pd.DataFrame(list(pr.items()), columns=['node', 'score']).sort_values(by=['score'], ascending=False)
    # The merges keep the score ordering of the left frame and reset the index to 0..n-1.
    df_temp = pd.merge(df_rank, df_edge_in, on='node')
    df_total = pd.merge(df_temp, df_edge_out, on='node')
    # Shift to a 1-based index so the index reads as the rank position.
    df_total.index = df_total.index + 1
    df_total.columns.name = 'rank'
    return df_total

In [6]:
# Rank table for the baseline scores; display the top-scoring rows.
df_origin = create_dataframe(pr_origin)
df_origin.head()

rank,node,score,in edges,out edges
1,404,0.042793,10,0
2,195,0.019961,80,1
3,77,0.018628,121,2
4,728,0.01553,10,0
5,36,0.011117,168,5


It looks like node 404 is the best-ranked page, followed by 195 and 77. This means that these pages should be shown at the top by search engines.

In [7]:
# Display the lowest-scoring rows of the baseline rank table.
df_origin.tail()

rank,node,score,in edges,out edges
2422,1746,0.000113,0,3
2423,1748,0.000113,0,4
2424,1749,0.000113,0,3
2425,1751,0.000113,0,2
2426,2426,0.000113,0,8


As expected, the higher-ranked pages have more incoming edges than the lower-ranked pages. Looking at the number of incoming edges for the 15 best-ranked pages, it is clear that the rank of the source of the incoming edges is more important than the number of incoming edges.

In [8]:
import networkx as nx
import random
import matplotlib.pyplot as plt

In [9]:
def random_edges(G, prob_add, prob_remove):
    """Randomly add and remove out-edges of every node of ``G``, in place.

    For each node, with probability ``prob_add`` one edge is added from it to
    a randomly chosen *other* node it does not already point to, and with
    probability ``prob_remove`` one randomly chosen existing out-edge is
    removed. Both may happen for the same node in one pass, so an edge that
    was just added can be removed again straight away.

    Parameters
    ----------
    G : graph
        Must provide ``nodes()``, ``edges(node)``, ``add_edge`` and
        ``remove_edge``. It is modified in place.
    prob_add : float
        Per-node probability of adding one edge.
    prob_remove : float
        Per-node probability of removing one edge.

    Returns
    -------
    The same ``G`` object that was passed in (not a copy).
    """
    # Only edges are edited, so the node list is fixed for the whole pass.
    all_nodes = list(G.nodes())

    for node in all_nodes:
        # nodes that this one already points to
        connected = [to for (fr, to) in G.edges(node)]
        connected_lookup = set(connected)  # constant-time membership test
        # Candidates for a new edge. The node itself is excluded so that no
        # self-loop is created.
        unconnected = [n for n in all_nodes
                       if n != node and n not in connected_lookup]

        # randomly add a new edge from the selected node (needs a candidate)
        if unconnected and random.random() < prob_add:
            new = random.choice(unconnected)
            G.add_edge(node, new)
            print("\tnew edge:\t {} -- {}".format(node, new))
            # keep the list current: the new edge is eligible for removal below
            connected.append(new)

        # randomly remove one of the node's out-edges (needs an edge)
        if connected and random.random() < prob_remove:
            remove = random.choice(connected)
            G.remove_edge(node, remove)
            print("\tedge removed:\t {} -- {}".format(node, remove))
    return G

In [10]:
# Seed the RNG so this stochastic step produces the same edge edits on every
# Restart & Run All.
random.seed(42)
prob_add = 0.050
prob_remove = 0.005
# NOTE: random_edges edits the graph it is given and returns that same object,
# so G_random and G refer to one and the same (now modified) graph.
G_random = random_edges(G, prob_add, prob_remove)

	new edge:	 5 -- 1429
	new edge:	 15 -- 1303
	new edge:	 16 -- 63
	new edge:	 24 -- 1356
	edge removed:	 32 -- 53
	new edge:	 33 -- 2325
	new edge:	 47 -- 923
	new edge:	 104 -- 354
	new edge:	 127 -- 1086
	new edge:	 132 -- 1522
	new edge:	 152 -- 950
	new edge:	 159 -- 1392
	new edge:	 188 -- 244
	new edge:	 216 -- 2213
	new edge:	 217 -- 1720
	new edge:	 226 -- 193
	new edge:	 239 -- 1591
	new edge:	 339 -- 819
	new edge:	 355 -- 1815
	new edge:	 376 -- 1868
	new edge:	 396 -- 2406
	new edge:	 416 -- 1129
	edge removed:	 431 -- 159
	new edge:	 456 -- 455
	new edge:	 470 -- 1508
	new edge:	 476 -- 1907
	new edge:	 527 -- 1930
	new edge:	 528 -- 415
	new edge:	 549 -- 566
	new edge:	 566 -- 468
	new edge:	 587 -- 330
	new edge:	 606 -- 114
	new edge:	 620 -- 1360
	edge removed:	 657 -- 71
	new edge:	 669 -- 814
	new edge:	 689 -- 334
	new edge:	 725 -- 1365
	new edge:	 733 -- 1462
	new edge:	 806 -- 2080
	new edge:	 817 -- 448
	new edge:	 828 -- 1438
	new edge:	 851 -- 17
	new edge:	 

In [11]:
# PageRank after the random edge edits. create_dataframe takes its degree
# columns from the global G, which random_edges modified in place, so the
# degrees shown belong to the edited graph.
pr_random = calc_pagerank(G_random)
df_random = create_dataframe(pr_random)
df_random.head()

rank,node,score,in edges,out edges
1,404,0.041393,10,0
2,195,0.019459,80,1
3,77,0.017877,121,2
4,728,0.014517,10,0
5,36,0.01087,168,5


In [12]:
#add a new node and pick random existing nodes for it to link to (unfinished)
#Edge Copying Model (slide 51 of Week6-SNA-Props)
def random_nodes(G, prob_select):
    """Add one new node to ``G`` (work in progress: no edges are added yet).

    The new node is labelled ``G.number_of_nodes() + 1``. With probability
    ``prob_select`` a single existing node is picked at random as a future
    link target; wiring up the edges is still a TODO, so the only visible
    effect at the moment is the added node.

    NOTE(review): if the graph already contains a node with that label,
    ``add_node`` presumably does not create a new one - confirm the labelling
    scheme against the loaded data.

    Parameters
    ----------
    G : graph
        Must provide ``nodes()``, ``number_of_nodes()`` and ``add_node``.
        Modified in place.
    prob_select : float
        Probability of selecting a random existing node.

    Returns
    -------
    None
    """
    #k is the number of edges to be added, a random integer between 1 and 5
    #(not used yet - see the to-dos below)
    k = random.randint(1,5)
    # Snapshot the existing nodes as a real list *before* adding the new one:
    # random.choice needs an index-addressable sequence, and the new node must
    # not be able to select itself.
    candidates = list(G.nodes())
    #add node to graph
    new_node = G.number_of_nodes() + 1
    G.add_node(new_node)
    k_random_selected_nodes = []
    # An empty graph has nothing to select from.
    if candidates and random.random() < prob_select:
        #select random vertex with probability prob_select
        #to-do: do this until we have k-nodes in k_random_selected_nodes
        k_random_selected_nodes.append(random.choice(candidates))
    #todo: add edges with prob 1-prob_select from the just 
    #added new_node and the nodes that are in k_random_selected_nodes

### Evolving Graphs: Removal and Addition of Edges proportional to the Node Degree

In [64]:
# Per the networkx docs, stochastic_graph returns a copy of G whose out-edge
# weights are normalised to sum to 1 per node; it does not add or remove edges.
# NOTE(review): the section heading describes degree-proportional edge edits -
# confirm this is the intended step. G here is presumably the graph already
# edited by random_edges (depends on cell execution order).
G_stoh_random = nx.stochastic_graph(G)

In [65]:
# PageRank of the stochastic graph. Note that create_dataframe reads the
# degree columns from the global G, not from G_stoh_random.
pagerank_stoh_random = calc_pagerank(G_stoh_random)
df_stoh_random = create_dataframe(pagerank_stoh_random)
df_stoh_random.head()

rank,node,score,in edges,out edges
1,404,0.041393,10,0
2,195,0.019459,80,1
3,77,0.017877,121,2
4,728,0.014517,10,0
5,36,0.01087,168,5


In [69]:
# Per the networkx docs, symmetric_difference yields a graph with the edges
# that exist in exactly one of the two inputs.
# NOTE(review): stochastic_graph presumably keeps G's edge set, so g_diff may
# contain no edges at all - confirm before ranking it.
g_diff = nx.symmetric_difference(G, G_stoh_random)
# create_dataframe(calc_pagerank(g_diff)).head(10)

In [63]:
#nx.Graph.adjacency_list(g_diff)

### Evolving Graphs: Removal and Addition of Edges proportional to the Node Degree