# 1b Graph evolving
### Joris & Abdel

In [1]:
import networkx as nx
import plotly.plotly as py
import plotly.figure_factory as ff
import pandas as pd
import numpy as np
import networkx as nx
import random
import matplotlib.pyplot as plt

Load edge list and create a graph

In [2]:
fh = open("canvas/hamster.edgelist", 'rb')
G = nx.read_edgelist(fh, create_using=nx.DiGraph())
fh.close()

In [3]:
random.seed(99)

Next we run the pagerank algorithm with a dampening parameter of 0.85. The dampening parameter represents the likelyhood of clicking a link on the webpage. With a dampening parameter of 0.85 we indicate that there is a 85% of clicking a link on the webpage and 15% of going to a random other node in the graph. We calculate the page rank using the power iteration method.

In [4]:
def calc_pagerank(G_in, alpha = 0.85):
    return nx.pagerank(G_in, alpha=0.85)

In [5]:
pr_origin = calc_pagerank(G)

In [6]:
def create_dataframe(pr, G_in):
    df_edge_in = pd.DataFrame(list(G_in.in_degree()), columns=['node', 'in edges'])
    df_edge_out = pd.DataFrame(list(G_in.out_degree()), columns=['node', 'out edges'])
    df_rank = pd.DataFrame(list(pr.items()), columns=['node', 'score']).sort_values(by=['score'], ascending=False)
    df_temp = pd.merge(df_rank, df_edge_in, on='node')
    df_total = pd.merge(df_temp, df_edge_out, on='node')
    df_total.index = df_total.index + 1
    df_total.columns.name = 'rank'
    return df_total

In [7]:
df_origin = create_dataframe(pr_origin, G)
df_origin.head()

rank,node,score,in edges,out edges
1,404,0.042793,10,0
2,195,0.019961,80,1
3,77,0.018628,121,2
4,728,0.01553,10,0
5,36,0.011117,168,5


It looks like node 404 is the best ranked page, following by 195 and 77. This means that these pages should be shown at the top by search engines.

In [8]:
df_origin.tail()

rank,node,score,in edges,out edges
2422,1746,0.000113,0,3
2423,1748,0.000113,0,4
2424,1749,0.000113,0,3
2425,1751,0.000113,0,2
2426,2426,0.000113,0,8


As expected the higher ranked pages have more incoming edges than the lower ranked pages. Looking at the amount of incoming edges for the 15 best ranked pages it is clear that the the rank of the source of the incoming edges is more important than the amount of incoming edges.

In [9]:
#add new connection to random other node with probability prob_add
#remove random connection of node with probability prob_remove
def random_edges(G_in, prob_add, prob_remove):
    add_edges = []
    removed_edges = []
    
    for node in G_in.nodes():
        #find notes that this one is connected to
        connected = [to for (fr, to) in G_in.edges(node)]
        #find candidates for new edges
        unconnected = [n for n in G_in.nodes() if not n in connected]
        
        #randomly add new edge to the selected node
        if len(unconnected): #only add when unconnected is not empty
            if random.random() < prob_add:
                new = random.choice(unconnected)
                G_in.add_edge(node, new)
                print("\tnew edge:\t {} -- {}".format(node, new))
                add_edges.append( (node, new) )
                #update lists, in case both add and remove done in same cycle
                unconnected.remove(new)    
                connected.append(new)
                
        if len(connected):
            if random.random() < prob_remove:
                remove = random.choice(connected)
                G_in.remove_edge(node, remove)
                print("\tedge removed:\t {} -- {}".format(node, remove))
                removed_edges.append((node, remove))
                #update lists, in case both add and remove done in same cycle
                connected.remove(remove)
                unconnected.append(remove)
    return G_in

### <font color="red">To-Do: Uniform (random.boolean())</font>

In [10]:
prob_add = 0.050
prob_remove = 0.005
G_random = random_edges(G.copy(), prob_add, prob_remove)

	new edge:	 15 -- 843
	edge removed:	 69 -- 79
	edge removed:	 77 -- 404
	new edge:	 108 -- 2209
	new edge:	 109 -- 1354
	new edge:	 111 -- 162
	new edge:	 142 -- 654
	new edge:	 143 -- 1176
	new edge:	 149 -- 1111
	new edge:	 151 -- 1190
	new edge:	 157 -- 120
	new edge:	 194 -- 1310
	new edge:	 207 -- 1096
	new edge:	 211 -- 1084
	new edge:	 223 -- 2320
	new edge:	 240 -- 289
	new edge:	 340 -- 724
	new edge:	 341 -- 1052
	new edge:	 393 -- 921
	new edge:	 411 -- 1745
	edge removed:	 413 -- 196
	new edge:	 441 -- 794
	new edge:	 455 -- 358
	new edge:	 467 -- 2267
	new edge:	 471 -- 1977
	new edge:	 478 -- 1809
	new edge:	 508 -- 1884
	new edge:	 521 -- 345
	new edge:	 529 -- 907
	new edge:	 555 -- 932
	new edge:	 575 -- 190
	new edge:	 626 -- 951
	edge removed:	 632 -- 79
	new edge:	 633 -- 1626
	edge removed:	 641 -- 642
	new edge:	 650 -- 1636
	new edge:	 683 -- 2280
	new edge:	 695 -- 2290
	new edge:	 717 -- 306
	new edge:	 741 -- 189
	new edge:	 761 -- 2177
	new edge:	 775 -- 116

In [11]:
pr_random = calc_pagerank(G_random)
df_random = create_dataframe(pr_random, G_random)
df_random.head()

rank,node,score,in edges,out edges
1,404,0.040968,9,0
2,195,0.027651,80,1
3,77,0.018416,121,1
4,728,0.01502,10,0
5,36,0.010911,168,5


In [12]:
#randomly add and remove nodes
#Edge Copying Model (slide 53 of Week6-SNA-Props)
def random_add_one_node(G_in, prob_select, k = 5):
    print("number of edges before :"+ str(len(G_in.edges())))
    #k is number of edges to be added, random integer 1 between 5
    #select k random vertices
    k = random.randint(1, k)
    #add node to graph
    new_node = nx.number_of_nodes(G_in) + 1
    G_in.add_node(new_node)

    #create list of nodes
    list_of_nodes = list(G_in)
    k_random_selected_nodes = []
    print("k = "+str(k))
    k_count = 0
    while (k_count < k):
        if random.random() < prob_select:
            #select random vertice with probability prob_select
            k_random_selected_nodes.append(random.choice(list_of_nodes))
            k_count += 1
    for node in k_random_selected_nodes:
         G_in.add_edge(new_node, node)
            
    prob_random_node = 1 - prob_select
    one = 0
    while (one < 1):
        if random.random() < prob_random_node:
            node = random.choice(list_of_nodes)
            neighbors = G_in.neighbors(node)
            one += 1
            for neighbor_node in neighbors:
                G_in.add_edge(new_node, neighbor_node)
                    
    #add edges with prob 1-prob_select from the just
    #added new_node and the nodes that are in k_random_selected_nodes
    print("number of edges after :"+str(len(G_in.edges())))
    return G_in

### Remove nodes

In [13]:
def random_node_removals(G_in, prob_remove, proportional = False, prop_n = None):
    if (proportional):
        n = prop_n
    else:
        n = int(0.1 * len(list(G_in)))  # max 10% of nodes
    #remove nodes and corresponding edges
    print("number of nodes before :"+ str(len(list(G_in))))
    to_be_removed = random.randint(1, n)
    node_counter = 0
    while(node_counter < to_be_removed):
        list_of_nodes = list(G_in)
        if random.random() < prob_remove:
            node_remove = random.choice(list_of_nodes)
            G_in.remove_node(node_remove)
            node_counter += 1
    print("number of nodes after :"+ str(len(list(G_in))))
    return G_in

In [14]:
prob_select = 0.1
G_random_add_nodes = random_add_one_node(G.copy(), prob_select)

k = 5
number of edges before :16632
number of edges after :16637


In [15]:
prob_remove = 0.1
G_random_node_removal = random_node_removals(G.copy(), prob_remove)

number of nodes before :2426
number of nodes after :2406


In [16]:
pr_random_nodes = calc_pagerank(G_random_add_nodes)
df_random_nodes = create_dataframe(pr_random_nodes, G_random_add_nodes)
df_random_nodes.head()

rank,node,score,in edges,out edges
1,404,0.042787,10,0
2,195,0.019958,80,1
3,77,0.018627,121,2
4,728,0.015526,10,0
5,36,0.011115,168,5


In [17]:
pr_random_nodes = calc_pagerank(G_random_node_removal)
df_random_nodes = create_dataframe(pr_random_nodes, G_random_node_removal)
df_random_nodes.head()

rank,node,score,in edges,out edges
1,404,0.042912,10,0
2,195,0.020021,80,1
3,77,0.018689,120,2
4,728,0.015578,10,0
5,36,0.011135,167,5


### Random but proportional to node degree or any other statistic.

#### Proportional to number of incoming/outgoing edges

In [18]:
def random_node_removals_proportional_indegree(G_in):
    #remove nodes and corresponding edges
    n = int(0.1 * len(list(G_in)))  # max 10% of nodes
    print("number of nodes before :"+ str(len(list(G_in))))
    to_be_removed = random.randint(1, n)
    node_counter = 0
    while(node_counter < to_be_removed):
        list_of_nodes = list(G_in)
        in_degrees = dict(G_in.in_degree()).values()
        prob_degree = [float(i)/sum(in_degrees) for i in in_degrees]
        node_remove = np.random.choice(list_of_nodes, p = prob_degree)
        G_in.remove_node(node_remove)
        node_counter += 1
    print("number of nodes after :"+ str(len(list(G_in))))
    return G_in

In [19]:
G_rand_prop_in_degree_removal = random_node_removals_proportional_indegree(G.copy())
pr_random_nodes = calc_pagerank(G_rand_prop_in_degree_removal)
df_random_nodes = create_dataframe(pr_random_nodes, G_rand_prop_in_degree_removal)
df_random_nodes.head()

number of nodes before :2426
number of nodes after :2369


rank,node,score,in edges,out edges
1,77,0.029733,109,0
2,728,0.019244,10,0
3,135,0.017024,45,4
4,192,0.011903,49,1
5,35,0.011779,135,2


In [20]:
def random_node_removals_proportional_hits(G_in, authorithy = False):
    #remove nodes and corresponding edges
    n = int(0.1 * len(list(G_in)))  # max 10% of nodes
    print("number of nodes before :"+ str(len(list(G_in))))
    to_be_removed = random.randint(1, n)
    node_counter = 0
    while(node_counter < to_be_removed):
        list_of_nodes = list(G_in)
        print(int((node_counter / to_be_removed) * 100), "%")
        if (authorithy):
            p = list(nx.hits(G_in)[0].values())
        else: # hub
            p = list(nx.hits(G_in)[1].values())
        node_remove = np.random.choice(list_of_nodes, p = p)
        G_in.remove_node(node_remove)
        node_counter += 1
    print("number of nodes after :"+ str(len(list(G_in))))
    return G_in

In [21]:
G_rand_prop_in_degree_removal = random_node_removals_proportional_hits(G.copy(), True)
pr_random_nodes = calc_pagerank(G_rand_prop_in_degree_removal)
df_random_nodes = create_dataframe(pr_random_nodes, G_rand_prop_in_degree_removal)
df_random_nodes.head()

number of nodes before :2426
0 %
2 %
4 %
6 %
9 %
11 %
13 %
15 %
18 %
20 %
22 %
25 %
27 %
29 %
31 %
34 %
36 %
38 %
40 %
43 %
45 %
47 %
50 %
52 %
54 %
56 %
59 %
61 %
63 %
65 %
68 %
70 %
72 %
75 %
77 %
79 %
81 %
84 %
86 %
88 %
90 %
93 %
95 %
97 %
number of nodes after :2382


rank,node,score,in edges,out edges
1,404,0.042865,10,0
2,195,0.01997,72,1
3,77,0.018685,110,2
4,728,0.015876,10,0
5,36,0.010972,157,5


In [None]:
#randomly add and remove nodes
#Edge Copying Model (slide 53 of Week6-SNA-Props)
def random_add_one_node_proportional_in_degree(G_in, k = 5):
    print("number of edges before :"+ str(len(G_in.edges())))
    #k is number of edges to be added, random integer 1 between 5
    #select k random vertices
    k = random.randint(1, k)
    #add node to graph
    new_node = nx.number_of_nodes(G_in) + 1
    G_in.add_node(new_node)

    #create list of nodes
    list_of_nodes = list(G_in)
    k_random_selected_nodes = []
    print("k = "+str(k))
    k_count = 0
    list_of_nodes = list(G_in)
    in_degrees = dict(G_in.in_degree()).values()
    prob_degree = [float(i)/sum(in_degrees) for i in in_degrees]
    while (k_count < k):
        node_to_add = np.random.choice(list_of_nodes, p = prob_degree)
        k_random_selected_nodes.append(node_to_add)
        k_count += 1
    
    for node in k_random_selected_nodes:
        G_in.add_edge(new_node, node)
        
    prob_degree_rev = [(1 - x) for x in prob_degree]
    one = 0
    while (one < 1):
        node_to_add = np.random.choice(list_of_nodes, p = prob_degree_rev)
        one += 1
        neighbors = G_in.neighbors(node_to_add)
        for neighbor_node in neighbors:
            G_in.add_edge(new_node, neighbor_node)          
    #add edges with prob 1-prob_select from the just
    #added new_node and the nodes that are in k_random_selected_nodes
    print("number of edges after :"+str(len(G_in.edges())))
    return G_in

In [120]:
np.random.choice([1, 2, 3], p = [0.2, 0.2, 0.6], size = 2, replace = False)

array([3, 2])

In [154]:
one = [1, 0, 2, 3, 0, 0, 0]
two = [2, 3]
[i for i in one if i != 0]

[1, 2, 3]

In [144]:
len(list(G))

2426

In [146]:
len(dict(G.in_degree()).values())

2426

In [148]:
len([float(i)/sum(dict(G.in_degree()).values()) for i in dict(G.in_degree()).values()])

2426

In [162]:
#randomly add and remove nodes
#Edge Copying Model (slide 53 of Week6-SNA-Props)
def random_add_one_node_proportional_in_degree(G_in, k = 5):
    print("number of edges before :"+ str(len(G_in.edges())))
    #k is number of edges to be added, random integer 1 between 5
    #select k random vertices
    k = random.randint(1, k)
    #add node to graph
    new_node = nx.number_of_nodes(G_in) + 1
    G_in.add_node(new_node)

    #create list of nodes
    list_of_nodes = list(G_in)
    k_random_selected_nodes = []
    print("k = "+str(k))
    k_count = 0
    list_of_nodes = list(G_in)
    in_degrees = dict(G_in.in_degree()).values()
    prob_degree = [float(i)/sum(in_degrees) for i in in_degrees]
    while (k_count < k):
        node_to_add = np.random.choice(list_of_nodes, p = prob_degree)
        k_random_selected_nodes.append(node_to_add)
        k_count += 1
    
    for node in k_random_selected_nodes:
        G_in.add_edge(new_node, node)
        
    one = 0
    non_zero_probs = [i for i in prob_degree if i != 0.0]
    highest_chance_nodes = np.random.choice(list_of_nodes, p = prob_degree,
                                            size = (len(non_zero_probs) - 1), replace = False)
    node_to_add = set(list_of_nodes).difference(set(highest_chance_nodes)).pop()
    neighbors = G_in.neighbors(node_to_add)
    for neighbor_node in neighbors:
        G_in.add_edge(new_node, neighbor_node)          
    #add edges with prob 1-prob_select from the just
    #added new_node and the nodes that are in k_random_selected_nodes
    print("number of edges after :"+str(len(G_in.edges())))
    return G_in

In [171]:
G_rand_prop_in_degree_removal = random_add_one_node_proportional_in_degree(G.copy())
pr_random_nodes = calc_pagerank(G_rand_prop_in_degree_removal)
df_random_nodes = create_dataframe(pr_random_nodes, G_rand_prop_in_degree_removal)
df_random_nodes.head()

number of edges before :16631
k = 3
number of edges after :16636


rank,node,score,in edges,out edges
1,404,0.042792,10,0
2,195,0.019962,80,1
3,77,0.018626,121,2
4,728,0.015556,10,0
5,36,0.011119,168,5
