# Task 1: PageRank Stability on Evolving Graphs
## Graph evolution and PageRank values comparison
### Joris & Abdel

### Imports and general set-up

In [1]:
import networkx as nx
import plotly.plotly as py
import plotly.figure_factory as ff
import pandas as pd
import numpy as np
import random
import matplotlib.pyplot as plt

Load edge list and create a graph

In [2]:
fh = open("canvas/hamster.edgelist", 'rb')
G = nx.read_edgelist(fh, create_using=nx.DiGraph())
fh.close()

In [3]:
random.seed(99)

In [4]:
def calc_pagerank(G_in, alpha = 0.85):
    return nx.pagerank(G_in, alpha=0.85)

In [5]:
pr_origin = calc_pagerank(G)

In [6]:
def create_dataframe(pr, G_in):
    df_edge_in = pd.DataFrame(list(G_in.in_degree()), columns=['node', 'in edges'])
    df_edge_out = pd.DataFrame(list(G_in.out_degree()), columns=['node', 'out edges'])
    df_rank = pd.DataFrame(list(pr.items()), columns=['node', 'score']).sort_values(by=['score'], ascending=False)
    df_temp = pd.merge(df_rank, df_edge_in, on='node')
    df_total = pd.merge(df_temp, df_edge_out, on='node')
    df_total.index = df_total.index + 1
    df_total.columns.name = 'rank'
    return df_total

In [7]:
df_origin = create_dataframe(pr_origin, G)
df_origin.head()

rank,node,score,in edges,out edges
1,404,0.042793,10,0
2,195,0.019961,80,1
3,77,0.018628,121,2
4,728,0.01553,10,0
5,36,0.011117,168,5


It looks like node 404 is the best ranked page, following by 195 and 77. This means that these pages should be shown at the top by search engines.

### <font color="red">To do: Update the PageRank value calculation and analysis in this notebook with the extended/corrected/completed <i>(this still needs to be done)</i> version of the notebook of 1a</font>

## 1b. Graph Evolution and Pagerank values comparison

In this section the effects of graph evolutions are going to be studied in relation to an evaluation of the stability of PageRank. In particular, various methodologies are going to be devised and exploited in which graphical represesentations of a social network are going to be altered by the removal and/or addition of nodes and edges in these graphs. The original graph $G$, represents a social network of friendships and familylinks between users of the website <a>hamsterster.com</a>. Various functions which make it possible to change this graph are going to be given and explained. Some of these functions focus on the addition or removal of edges, while other focus on nodes. Some of these functions are going to do make choices at random, while others are also going to exploit randomness, but proporotional to the node degree and other statistics. The choice is made to analyze the effects of the functions which evolve the graphs on the original graph $G$. So the evaluation of the various functions which add/remove graphs is going to be done starting from the full and original graph $G$ for each of the given functions. 

<i>Note: A social network would naturally be described with an undirected graph. The social network data is, however, treated as a combination of target and source id's which faciliate the usage of this data as a directed graph for the sake of implementing and testing graph evolutions methods to evaluate the stability of PageRank. No implications or conclusions should be directly related to the actual structure of the social networks of the website</i>

### A. Removing and adding edges uniformly at random

For $n$ number of nodes do the following:
* select 1 node uniformly at random
* add or remove an incoming/outgoing at random 

In [8]:
#add/remove edges for all the nodes uniformly at random
def random_edges_uniform_random(G_in, number_of_nodes = 1):
    list_of_nodes = list(G_in) # all the nodes
    # select uniformly at random nodes of which we are going to add/remove edges
    selected_nodes = list(np.random.choice(list_of_nodes, size = number_of_nodes, replace = False)) # default probability p is an uniform distribution
    
    for node in selected_nodes: 
        successors = list(G_in.successors(str(node))) # find the successors of this nodes
        predecessors = list(G_in.predecessors(str(node))) # find the predecessors of this node
        #find candidates for new edges
        unconnected_to = [n for n in list(G_in.nodes()) if not n in successors] # no outgoing edge to these nodes
        unconnected_from = [n for n in list(G_in.nodes()) if not n in predecessors] # no incoming edge from these nodes
        
        add = bool(random.getrandbits(1)) # randomly add or remove an edge of this node
        incoming =  bool(random.getrandbits(1)) # randomly add an outgoing/incoming edge
        if(add): # add an incoming/outgoing edge to node
            if(incoming): # add incoming edge
                if len(unconnected_from): #only add when unconnected_from is not empty
                    new = random.choice(unconnected_from)
                    G_in.add_edge(new, node)
                    print("\tnew edge:\t {} --> {}".format(new, node))
                    unconnected_from.remove(new)
                    predecessors.append(new)
            else: # add outgoing edge:
                if len(unconnected_to): #only add when unconnected_to is not empty
                    new = random.choice(unconnected_to)
                    G_in.add_edge(node, new)
                    print("\tnew edge:\t {} --> {}".format(node, new))
                    unconnected_to.remove(new)    
                    successors.append(new)
        else: # remove
            if(incoming): # remove incoming edge
                if len(predecessors): #only remove when predecessors is not empty
                    new = random.choice(predecessors)
                    G_in.remove_edge(new, node)
                    print("\tremove edge:\t {} --> {}".format(new, node))
                    predecessors.remove(new)
                    unconnected_from.append(new)
            else: # remove outgoing edge:
                if len(successors): #only remove when successors is not empty
                    new = random.choice(successors)
                    G_in.remove_edge(node, new)
                    print("\tremove edge:\t {} --> {}".format(node, new))
                    successors.remove(new)    
                    unconnected_to.append(new)
    return G_in

In [9]:
G_random_edges_uniform_random = random_edges_uniform_random(G.copy(), 100)

	remove edge:	 1751 --> 410
	new edge:	 1301 --> 944
	remove edge:	 65 --> 55
	new edge:	 2216 --> 189
	remove edge:	 2053 --> 322
	remove edge:	 2415 --> 72
	new edge:	 2418 --> 406
	remove edge:	 2143 --> 298
	new edge:	 811 --> 878
	remove edge:	 1299 --> 149
	new edge:	 1388 --> 2120
	remove edge:	 1868 --> 1266
	new edge:	 390 --> 28
	remove edge:	 1245 --> 1249
	new edge:	 960 --> 1754
	remove edge:	 1091 --> 430
	remove edge:	 328 --> 20
	new edge:	 1731 --> 211
	remove edge:	 11 --> 493
	new edge:	 659 --> 1335
	new edge:	 1821 --> 585
	new edge:	 728 --> 1752
	new edge:	 721 --> 1180
	remove edge:	 1493 --> 1494
	new edge:	 1717 --> 2267
	remove edge:	 1842 --> 1843
	remove edge:	 1938 --> 200
	remove edge:	 218 --> 182
	remove edge:	 2107 --> 2109
	remove edge:	 2346 --> 2348
	new edge:	 1697 --> 1817
	remove edge:	 344 --> 350
	new edge:	 518 --> 540
	remove edge:	 1692 --> 1693
	new edge:	 175 --> 550
	new edge:	 617 --> 1531
	new edge:	 1677 --> 1632
	remove edge:	 2227 --

In [10]:
pr_random_edges_uniform_random = calc_pagerank(G_random_edges_uniform_random)
df_random_edges_uniform_random = create_dataframe(pr_random_edges_uniform_random, G_random_edges_uniform_random)
df_random_edges_uniform_random.head()

rank,node,score,in edges,out edges
1,404,0.042304,10,0
2,195,0.019747,80,1
3,77,0.018392,121,2
4,728,0.01526,10,1
5,1752,0.013086,2,1


### B. Adding nodes uniformly at random (copying model)

For $n$ iterations do the following:
* Make a new node instance $n$
* with a uniform random distribution pick $k$ nodes in the original graph
* copy the incoming/outgoing edges of the $k$ nodes for $n$
* choose with an unifrom distribution another node $l$ and add its edges also to $n$

<i>The last step might seem redundant at the moment, but later when the the $k$ nodes are going to be chosen with at random but proportional to a certain statistic, it makes sense to have a step in which you pick another node $l$ that is chosen with the opposite property so that the generation/stability of communities is ensured (i.e. power-law degree). In this implementation. This step is omitted, but it will be thus added in the functions that take statistical measures into consideration when choosing nodes at random</i>

In [11]:
#randomly add and remove nodes
#Edge Copying Model (slide 53 of Week6-SNA-Props)
def random_add_nodes_uniform(G_in, number_of_nodes = 1, k = 5):
    print("number of edges before :"+ str(len(G_in.edges())))
    for i in range(0, number_of_nodes):
        #k is number of edges to be added, random integer 1 between 5
        k = random.randint(1, k) #select k random vertices
        print("k = " + str(k))
        new_node = nx.number_of_nodes(G_in) + 1 #add node to graph
        print("new node = " + str(new_node))
        
        list_of_nodes = list(G_in)  #create list of nodes
        
        G_in.add_node(str(new_node))   
        k_random_selected_nodes = np.random.choice(list_of_nodes, size = k, replace = False) # k nodes with a uniform distribution
        
        for node in k_random_selected_nodes:
            print("node in k_random_selected_nodes = " + str(node))
            successors = list(G_in.successors(str(node)))
            print("succesors are " + str(successors))
            for node_to in successors:
                G_in.add_edge(new_node, node_to) # add outgoing edges
            predecessors = list(G_in.predecessors(str(node)))
            for node_from in predecessors:
                G_in.add_edge(node_from, new_node) # add incoming edges
    print("number of edges after :"+str(len(G_in.edges())))
    return G_in

In [12]:
G_random_add_nodes_uniform = random_add_nodes_uniform(G.copy(), 100)

number of edges before :16631
k = 5
new node = 2427
node in k_random_selected_nodes = 186
succesors are ['11', '831', '18', '114', '188', '189', '145', '19']
node in k_random_selected_nodes = 445
succesors are ['35', '248', '452', '77', '116', '61', '454']
node in k_random_selected_nodes = 580
succesors are ['433']
node in k_random_selected_nodes = 2015
succesors are ['77', '19', '454', '2016']
node in k_random_selected_nodes = 1569
succesors are ['1570', '1571']
k = 3
new node = 2429
node in k_random_selected_nodes = 199
succesors are ['202', '173', '201', '19']
node in k_random_selected_nodes = 1396
succesors are ['630', '107', '1397', '1398', '1399', '1400', '1401', '1402', '1403', '1404', '1405', '1406']
node in k_random_selected_nodes = 1448
succesors are ['1449', '1450', '1451', '1452', '1453']
k = 1
new node = 2431
node in k_random_selected_nodes = 1585
succesors are ['1586']
k = 1
new node = 2433
node in k_random_selected_nodes = 411
succesors are ['58', '845', '119', '158', '5

In [13]:
pr_random_add_nodes_uniform = calc_pagerank(G_random_add_nodes_uniform)
df_random_add_nodes_uniform = create_dataframe(pr_random_add_nodes_uniform, G_random_add_nodes_uniform)
df_random_add_nodes_uniform.head()

rank,node,score,in edges,out edges
1,404,0.041949,10,0
2,195,0.019678,83,1
3,77,0.017997,123,2
4,728,0.015003,10,0
5,36,0.010767,173,5


### C. Removal of nodes uniformly at random

Let $n$ represent the number of nodes that should be removed. If the $number\_of\_nodes$ parameter is givem then $n = number\_of\_nodes$, if this parameter is not specfied by the caller we have $n = \lfloor(0.1 * total\_number\_of\_nodes(G\_in))\rfloor)$

For $n$ iterations do the following:
* Select a node $m$ uniformly at random (iterations are abstracted by $np.random.choice$)
* Remove this node and its respective edges from the graph

In [14]:
def random_removal_nodes_uniform(G_in, number_given = False, number_of_nodes = None):
    if (number_given & number_of_nodes < len(list(G_in))): # check if we do not remove too much nodes
        n = number_of_nodes
    else:
        n = int(0.1 * len(list(G_in)))  # max 10% of nodes
    #remove nodes and corresponding edges
    print("number of nodes before :"+ str(len(list(G_in))))
    list_of_nodes = list(G_in)
    selected_nodes = np.random.choice(list_of_nodes, size = n, replace = False)
    for m_remove in selected_nodes:
        G_in.remove_node(m_remove)
    print("number of nodes after :"+ str(len(list(G_in))))
    return G_in

In [15]:
G_random_removal_nodes_uniform = random_removal_nodes_uniform(G.copy(), True, 100)

number of nodes before :2426
number of nodes after :2326


In [16]:
pr_random_removal_nodes_uniform = calc_pagerank(G_random_removal_nodes_uniform)
df_random_removal_nodes_uniform = create_dataframe(pr_random_removal_nodes_uniform, G_random_removal_nodes_uniform)
df_random_removal_nodes_uniform.head()

rank,node,score,in edges,out edges
1,195,0.034664,79,0
2,77,0.023705,119,1
3,728,0.016163,10,0
4,36,0.011552,163,5
5,192,0.009822,55,2


## Graph evolution methodologies using statistical measures

### D. Removal of nodes at random but proportional to the degree of the nodes

Let $n$ represent the number of nodes that should be removed. If the $number\_of\_nodes$ parameter is givem then $n = number\_of\_nodes$, if this parameter is not specfied by the caller we have $n = \lfloor(0.1 * total\_number\_of\_nodes(G\_in))\rfloor)$

For $n$ iterations do the following:
* Select a node $m$ at random, but proportional to the in degree's of the nodes (iterations are abstracted by $np.random.choice$)
* Remove this node and its respective edges from the graph

In [17]:
def random_node_removals_proportional_indegree(G_in, number_given = False, number_of_nodes = None):
    if (number_given & number_of_nodes < len(list(G_in))): # check if we do not remove too much nodes
        n = number_of_nodes
    else:
        n = int(0.1 * len(list(G_in)))  # max 10% of nodes
    #remove nodes and corresponding edges
    print("number of nodes before :"+ str(len(list(G_in))))
    list_of_nodes = list(G_in)
    in_degrees = dict(G_in.in_degree()).values() # in_degrees of all the nodes
    prob_degree = [float(i)/sum(in_degrees) for i in in_degrees] # probabilities proportional to degree
    
    selected_nodes = np.random.choice(list_of_nodes, size = n, replace = False, p = prob_degree)
    for m_remove in selected_nodes:
        G_in.remove_node(m_remove)
    print("number of nodes after :"+ str(len(list(G_in))))
    return G_in

In [18]:
G_random_node_removals_proportional_indegree = random_node_removals_proportional_indegree(G.copy(), True, 100)

number of nodes before :2426
number of nodes after :2326


In [19]:
G_random_node_removals_proportional_indegree = random_node_removals_proportional_indegree(G.copy(), True, 100)
pr_random_node_removals_proportional_indegree = calc_pagerank(G_random_node_removals_proportional_indegree)
df_random_node_removals_proportional_indegree = create_dataframe(pr_random_node_removals_proportional_indegree,
                                                                 G_random_node_removals_proportional_indegree)
df_random_node_removals_proportional_indegree.head()

number of nodes before :2426
number of nodes after :2326


rank,node,score,in edges,out edges
1,728,0.026972,9,0
2,404,0.018741,5,0
3,77,0.016504,99,1
4,136,0.015112,59,6
5,697,0.01345,16,1


### E. Removal of nodes at random but proportional to the hubs/authorithy measures (HITS) of nodes

Let $n$ represent the number of nodes that should be removed. If the $number\_of\_nodes$ parameter is givem then $n = number\_of\_nodes$, if this parameter is not specfied by the caller we have $n = \lfloor(0.1 * total\_number\_of\_nodes(G\_in))\rfloor)$

For $n$ iterations do the following:
* Select a node $m$ at random, but proportional to HITS measures (i.e. hub or authority) of the nodes (iterations are abstracted by $np.random.choice$)
* Remove this node and its respective edges from the graph

In [20]:
def random_node_removals_proportional_HITS(G_in, authorithy = False, number_given = False, number_of_nodes = None):
    if (number_given & number_of_nodes < len(list(G_in))): # check if we do not remove too much nodes
        n = number_of_nodes
    else:
        n = int(0.1 * len(list(G_in)))  # max 10% of nodes
    #remove nodes and corresponding edges
    node_counter = 0
    while(node_counter < n):
        list_of_nodes = list(G_in)
        print(int((node_counter / n) * 100), "%")
        if (authorithy):
            p = list(nx.hits(G_in)[0].values()) # probabilities proportional to authority of nodes
        else: # hub
            p = list(nx.hits(G_in)[1].values()) # probabilities proportional to hub of nodes
        node_remove = np.random.choice(list_of_nodes, p = p)
        G_in.remove_node(node_remove)
        node_counter += 1
    print("number of nodes after :"+ str(len(list(G_in))))
    return G_in

In [21]:
G_random_node_removals_proportional_HITS = random_node_removals_proportional_HITS(G.copy(), True, True, 10)

0 %
10 %
20 %
30 %
40 %
50 %
60 %
70 %
80 %
90 %
number of nodes after :2416


In [22]:
pr_random_node_removals_proportional_HITS = calc_pagerank(G_random_node_removals_proportional_HITS)
df_random_node_removals_proportional_HITS = create_dataframe(pr_random_node_removals_proportional_HITS,
                                                             G_random_node_removals_proportional_HITS)
df_random_node_removals_proportional_HITS.head()

rank,node,score,in edges,out edges
1,404,0.042613,10,0
2,195,0.019752,78,1
3,77,0.018474,119,2
4,728,0.015707,10,0
5,36,0.011142,168,5


### F. Addition of nodes at random but proportional to the degree of the nodes

In [23]:
#Edge Copying Model (slide 53 of Week6-SNA-Props)
def random_node_additions_proportional_in_degree(G_in, number_given = False, number_of_nodes = 1, k = 5):
    if (number_given & number_of_nodes < len(list(G_in))): # check if we do not remove too much nodes
        n = number_of_nodes
    else:
        n = int(0.1 * len(list(G_in)))  # max 10% of nodes
    print("number of edges before :"+ str(len(G_in.edges())))
    for i in range(0, n):
        #k is number of edges to be added, random integer 1 between 5
        k = random.randint(1, k) #select k random vertices
        new_node = nx.number_of_nodes(G_in) + 1 #add node to graph
        G_in.add_node(new_node)
        
        list_of_nodes = list(G_in)  #create list of nodes
        in_degrees = dict(G_in.in_degree()).values() # in_degrees of all the nodes
        prob_degree = [float(i)/sum(in_degrees) for i in in_degrees] # probabilities proportional to degree
        
        k_random_selected_nodes = np.random.choice(list_of_nodes, size = k, p = prob_degree, replace = False) # selecte k nodes proportional to chosen measure
        for node in k_random_selected_nodes:
            successors = list(G_in.successors(str(node)))
            for node_to in successors:
                G_in.add_edge(new_node, node_to) # add outgoing edges
            predecessors = list(G_in.predecessors(str(node)))
            for node_from in predecessors:
                G_in.add_edge(node_from, new_node) # add incoming edges
        
        # pick one node that has a low probability (relatively low number of incoming edges)
        non_zero_probs = [i for i in prob_degree if i != 0.0]
        highest_chance_nodes = np.random.choice(list_of_nodes, p = prob_degree, 
                                                size = (len(non_zero_probs) - 1), replace = False)
        
        node_to_add = random.sample(set(list_of_nodes).difference(set(highest_chance_nodes)), 1)[0] # low prob node
        successors = list(G_in.successors(node_to_add)) # successors of the node
        predecessors = list(G_in.predecessors(node_to_add)) # predecessors of the node 
        
        succ_current_node = list(G_in.successors(new_node)) # find the successors of the new node 
        pred_current_node = list(G_in.predecessors(new_node)) # find the predecessors of the new node
                                 
        # remove nodes to which the new node is already connected from the successors/predecessors list
        successors = [n for n in successors if not n in succ_current_node]
        predecessors = [n for n in predecessors if not n in pred_current_node]
                                 
        for node_to in successors:
            G_in.add_edge(new_node, node_to) # add outgoing edges
        for node_from in predecessors:
            G_in.add_edge(node_from, new_node) # add incoming edges
            
    print("number of edges after :"+str(len(G_in.edges())))
    return G_in

In [24]:
G_random_node_additions_proportional_in_degree = random_node_additions_proportional_in_degree(G.copy(), True, 10, 5)

number of edges before :16631


NetworkXError: The node 2427 is not in the digraph.

In [None]:
pr_random_node_additions_proportional_in_degree = calc_pagerank(G_random_node_additions_proportional_in_degree)
df_random_node_additions_proportional_in_degree = create_dataframe(pr_random_node_additions_proportional_in_degree,
                                                             G_random_node_additions_proportional_in_degree)
df_random_node_additions_proportional_in_degree.head()

### G. Addition of nodes at random but proportional to the hubs/authorithy measures (HITS) of nodes

In [None]:
#Edge Copying Model (slide 53 of Week6-SNA-Props)
def random_node_additions_proportional_HITS(G_in, authority = False, number_given = False, number_of_nodes = 1, k = 5):
    if (number_given & number_of_nodes < len(list(G_in))): # check if we do not remove too much nodes
        n = number_of_nodes
    else:
        n = int(0.1 * len(list(G_in)))  # max 10% of nodes
    print("number of edges before :"+ str(len(G_in.edges())))
    for i in range(0, n):
        #k is number of edges to be added, random integer 1 between 5
        k = random.randint(1, k) #select k random vertices
        new_node = nx.number_of_nodes(G_in) + 1 #add node to graph
        G_in.add_node(new_node)
        
        list_of_nodes = list(G_in)  #create list of nodes
        if (authority):
            p = list(nx.hits(G_in)[0].values())
        else: # hub
            p = list(nx.hits(G_in)[1].values())
        
        k_random_selected_nodes = np.random.choice(list_of_nodes, size = k, p = p, replace = False) # selecte k nodes proportional to chosen measure
        for node in k_random_selected_nodes:
            successors = list(G_in.successors(str(node)))
            for node_to in successors:
                G_in.add_edge(new_node, node_to) # add outgoing edges
            predecessors = list(G_in.predecessors(str(node)))
            for node_from in predecessors:
                G_in.add_edge(node_from, new_node) # add incoming edges
        
        # pick one node that has a low probability (relatively low number of incoming edges)
        non_zero_probs = [i for i in p if i != 0.0]
        highest_chance_nodes = np.random.choice(list_of_nodes, p = p, 
                                                size = (len(non_zero_probs) - 1), replace = False)
        
        node_to_add = random.sample(set(list_of_nodes).difference(set(highest_chance_nodes)), 1)[0] # low prob node
        successors = list(G_in.successors(node_to_add)) # successors of the node
        predecessors = list(G_in.predecessors(node_to_add)) # predecessors of the node 
        
        succ_current_node = list(G_in.successors(new_node)) # find the successors of the new node 
        pred_current_node = list(G_in.predecessors(new_node)) # find the predecessors of the new node
                                 
        # remove nodes to which the new node is already connected from the successors/predecessors list
        successors = [n for n in successors if not n in succ_current_node]
        predecessors = [n for n in predecessors if not n in pred_current_node]
                                 
        for node_to in successors:
            G_in.add_edge(new_node, node_to) # add outgoing edges
        for node_from in predecessors:
            G_in.add_edge(node_from, new_node) # add incoming edges
            
    print("number of edges after :"+str(len(G_in.edges())))
    return G_in

In [None]:
G_random_node_additions_proportional_HITS = random_node_additions_proportional_HITS(G.copy(), True, True, 10, 5)

In [None]:
pr_random_node_additions_proportional_HITS = calc_pagerank(G_random_node_additions_proportional_HITS)
df_random_node_additions_proportional_HITS = create_dataframe(pr_random_node_additions_proportional_HITS, G_random_node_additions_proportional_HITS)
df_random_node_additions_proportional_HITS.head()