# Task 1: PageRank Stability on Evolving Graphs
## Graph evolution and PageRank values comparison
### Joris & Abdel

### Imports and general set-up

In [1]:
import networkx as nx
import plotly.plotly as py
import plotly.figure_factory as ff
import pandas as pd
import numpy as np
import random
import matplotlib.pyplot as plt

Load edge list and create a graph

In [2]:
fh = open("canvas/hamster.edgelist", 'rb')
G = nx.read_edgelist(fh, create_using=nx.DiGraph())
fh.close()

In [3]:
np.random.seed(98)
random.seed(99)

In [4]:
def calc_pagerank(G_in, alpha = 0.85):
    return nx.pagerank(G_in, alpha=0.85)

In [5]:
pr_origin = calc_pagerank(G)

In [6]:
def create_dataframe(pr, G_in):
    df_edge_in = pd.DataFrame(list(G_in.in_degree()), columns=['node', 'in edges'])
    df_edge_out = pd.DataFrame(list(G_in.out_degree()), columns=['node', 'out edges'])
    df_rank = pd.DataFrame(list(pr.items()), columns=['node', 'score']).sort_values(by=['score'], ascending=False)
    df_temp = pd.merge(df_rank, df_edge_in, on='node')
    df_total = pd.merge(df_temp, df_edge_out, on='node')
    df_total.index = df_total.index + 1
    df_total.columns.name = 'rank'
    return df_total

In [7]:
df_origin = create_dataframe(pr_origin, G)
df_origin.head(10)

rank,node,score,in edges,out edges
1,404,0.042793,10,0
2,195,0.019961,80,1
3,77,0.018628,121,2
4,728,0.01553,10,0
5,36,0.011117,168,5
6,135,0.009544,49,8
7,192,0.009365,57,3
8,281,0.009304,32,0
9,136,0.008853,85,6
10,184,0.008296,80,3


It looks like node 404 is the best ranked page, following by 195 and 77. This means that these pages should be shown at the top by search engines.

### <font color="red">To do: Update the PageRank value calculation and analysis in this notebook with the extended/corrected/completed <i>(this still needs to be done)</i> version of the notebook of 1a</font>

## 1b. Graph Evolution and Pagerank values comparison

In this section the effects of graph evolutions are going to be studied in relation to an evaluation of the stability of PageRank. In particular, various methodologies are going to be devised and exploited in which graphical represesentations of a social network are going to be altered by the removal and/or addition of nodes and edges in these graphs. The original graph $G$, represents a social network of friendships and familylinks between users of the website <a>hamsterster.com</a>. Various functions which make it possible to change this graph are going to be given and explained. Some of these functions focus on the addition or removal of edges, while other focus on nodes. Some of these functions are going to do make choices at random, while others are also going to exploit randomness, but proporotional to the node degree and other statistics. The choice is made to analyze the effects of the functions which evolve the graphs on the original graph $G$. So the evaluation of the various functions which add/remove graphs is going to be done starting from the full and original graph $G$ for each of the given functions. 

<i>Note: A social network would naturally be described with an undirected graph. The social network data is, however, treated as a combination of target and source id's which faciliate the usage of this data as a directed graph for the sake of implementing and testing graph evolutions methods to evaluate the stability of PageRank. No implications or conclusions should be directly related to the actual structure of the social networks of the website</i>

### A. Removing and adding edges uniformly at random

For $n$ number of nodes do the following:
* select 1 node uniformly at random
* add or remove an incoming/outgoing at random 

In [8]:
#add/remove edges for all the nodes uniformly at random
def random_edges_uniform_random(G_in, number_of_nodes = 1, choice_given = False, choice = False):
    nr_of_edges_added = 0
    nr_of_edges_removed = 0
    
    list_of_nodes = list(G_in) # all the nodes
    # select uniformly at random nodes of which we are going to add/remove edges
    selected_nodes = list(np.random.choice(list_of_nodes, size = number_of_nodes, replace = False)) # default probability p is an uniform distribution
    
    for node in selected_nodes: 
        successors = list(G_in.successors(str(node))) # find the successors of this nodes
        predecessors = list(G_in.predecessors(str(node))) # find the predecessors of this node
        #find candidates for new edges
        unconnected_to = [n for n in list(G_in.nodes()) if not n in successors] # no outgoing edge to these nodes
        unconnected_from = [n for n in list(G_in.nodes()) if not n in predecessors] # no incoming edge from these nodes
        
        if (choice_given):
            add = choice
        else:
            add = bool(random.getrandbits(1)) # randomly add or remove an edge of this node
        
        incoming =  bool(random.getrandbits(1)) # randomly add an outgoing/incoming edge
        if(add): # add an incoming/outgoing edge to node
            if(incoming): # add incoming edge
                if len(unconnected_from): #only add when unconnected_from is not empty
                    new = random.choice(unconnected_from)
                    G_in.add_edge(new, node)
                    print("\tnew edge:\t {} --> {}".format(new, node))
                    unconnected_from.remove(new)
                    predecessors.append(new)
            else: # add outgoing edge:
                if len(unconnected_to): #only add when unconnected_to is not empty
                    new = random.choice(unconnected_to)
                    G_in.add_edge(node, new)
                    print("\tnew edge:\t {} --> {}".format(node, new))
                    unconnected_to.remove(new)    
                    successors.append(new)
            nr_of_edges_added += 1
        else: # remove
            if(incoming): # remove incoming edge
                if len(predecessors): #only remove when predecessors is not empty
                    new = random.choice(predecessors)
                    G_in.remove_edge(new, node)
                    print("\tremove edge:\t {} --> {}".format(new, node))
                    predecessors.remove(new)
                    unconnected_from.append(new)
            else: # remove outgoing edge:
                if len(successors): #only remove when successors is not empty
                    new = random.choice(successors)
                    G_in.remove_edge(node, new)
                    print("\tremove edge:\t {} --> {}".format(node, new))
                    successors.remove(new)    
                    unconnected_to.append(new)
            nr_of_edges_removed += 1
            
    print("number of edges added: " + str(nr_of_edges_added))
    print("number of edges removed " + str(nr_of_edges_removed))
    
    return G_in

`random_edges_uniform_random` selects one node uniformly at random, and then for that specific node it uniformly at random adds or removes one outgoing or ingoing edge, which is also determined uniformly at random. Because the choice is made to analyze the effects of the functions which evolve the graphs on the original graph $G$, we call the function `random_edges_uniform_random` parameterized with a copy of $G$ and `number_of_nodes`$ = 100$. In other words, `random_edges_uniform_random` will either add or remove either an incoming or outgoing edge for each node of `number_of_nodes`. The result graph is stored in `G_random_edges_uniform_random`.

In [9]:
number_of_nodes_random = random.randint(1,  int(0.1 * nx.number_of_edges(G.copy()))) #max 10% of eges to add/remove
print("number of nodes: " + str(number_of_nodes_random))
G_random_edges_uniform_random = random_edges_uniform_random(G.copy(), number_of_nodes_random)

number of nodes: 828
	new edge:	 817 --> 945
	remove edge:	 301 --> 874
	remove edge:	 1777 --> 709
	new edge:	 2209 --> 1653
	new edge:	 2413 --> 823
	remove edge:	 756 --> 648
	new edge:	 909 --> 1545
	new edge:	 2107 --> 881
	remove edge:	 541 --> 375
	remove edge:	 2383 --> 2384
	remove edge:	 2246 --> 305
	new edge:	 360 --> 522
	remove edge:	 1887 --> 1889
	remove edge:	 697 --> 728
	remove edge:	 1801 --> 308
	new edge:	 2192 --> 1863
	remove edge:	 1631 --> 1638
	new edge:	 2328 --> 1903
	new edge:	 1292 --> 1467
	remove edge:	 1300 --> 249
	new edge:	 1810 --> 717
	new edge:	 728 --> 1889
	new edge:	 721 --> 1112
	remove edge:	 1123 --> 421
	remove edge:	 37 --> 60
	new edge:	 1234 --> 333
	new edge:	 189 --> 359
	remove edge:	 1724 --> 1725
	remove edge:	 964 --> 967
	new edge:	 1934 --> 825
	new edge:	 227 --> 460
	new edge:	 2049 --> 1817
	remove edge:	 470 --> 958
	new edge:	 522 --> 736
	remove edge:	 2113 --> 303
	new edge:	 121 --> 547
	new edge:	 617 --> 838
	new edge:

Next, we run `calc_pagerank` to calculate the new pagerank scores of `G_random_edges_uniform_random`. Thereafter a dataframe is created of the pagerank scores, together with for each node the number of incoming edges and outcoming edges. The nodes are sorted on the pagerank score, in descending order.

In [10]:
pr_random_edges_uniform_random = calc_pagerank(G_random_edges_uniform_random)
df_random_edges_uniform_random = create_dataframe(pr_random_edges_uniform_random, G_random_edges_uniform_random)
df_random_edges_uniform_random.head(10)

rank,node,score,in edges,out edges
1,404,0.03391,10,0
2,195,0.019782,81,2
3,77,0.018272,122,2
4,192,0.010309,56,3
5,281,0.009478,32,0
6,36,0.009187,164,5
7,135,0.009129,49,9
8,1555,0.008787,15,3
9,184,0.008769,81,3
10,126,0.008135,55,1


Before the results are analyzed, lets first talk about some intuition what could happen when edges are added or removed uniformly at random. The original graph $G$ is a scale-free network, i.e. its degree distribution follows a power law, at least asymptiotically. This means that in this type of network structure, there will be many nodes with very low level of connectivity. And very few or except one node with exceptionally high degree of connectivity. So the nodes are very unequal in terms of how connected and influential the different nodes in the network are. Scale free networks describes a power or exponential relationship between the degree of connectivity a node has and the frequency of its occurence. This results in a highly centralized network. In the social network that is loaded from hamsterster.com, we have some people who have very many links into them, but there are also many people that have very few links into them. The power law distribution is often explained with reference to preferential attachment. Preferential attachment describes how a new node is linked amongst a number of nodes according to how much they already have. So those who already have a lot of links will receive more than those who have litte: the so called "rich get richer model". In the paragraphs D-E-F-G, preferential attachment proporional to some statistical measures is elaborated and analyzed. Paragraphs A-B-C will analyze the effects of adding/removing nodes/adges uniformly at random. In paragraph A, we add or remove only edges uniformly at random. In paragraph B nodes and corresponding edges are added uniformly at random. In paragraph C nodes and corresponding edges are removed uniformly at random. All modifications that will take place in paragraph A-B-C are determined uniformly at random, i.e. all nodes have equal probability to be chosen or to be removed. Scale free networks can be very robust or very fragile, depending on how we remove nodes (randomly or strategically). If we remove nodes uniformly at random, the network will be very robust to failure. This is because the vast majority of nodes have a very low degree of connectivity. Therefore, it is very likely that we will modify one of these insignificant nodes with little effect on the overall network.
So real word networks, like the network from hamsterster.com, are resilient to random attacks.  

When we add or remove only edges, we expect a constant average degree, i.e. the number of edges grows linearly with the number of nodes. Also, we expect that as the network grows, the distances between nodes grow. From the output of the function above, we see that $52$ edges are added, and $48$ edges are removed. When we look at the top ten nodes, we see that the top ten of `G_random_edges_uniform_random` exactly matches the top ten of the original graph $G$. Let's dive deeper into both graphs to look if there changed else.

First, we compare the $density$ between the original graph G and the graph `G_random_edges_uniform_random`. The density for directed graphs is: $d = \frac{E}{V(V-1)}$, where $E$ denotes the total number of edges and $V$ denotes the total number of nodes in the particular graph.

In [11]:
nx.info(G)

'Name: \nType: DiGraph\nNumber of nodes: 2426\nNumber of edges: 16631\nAverage in degree:   6.8553\nAverage out degree:   6.8553'

In [12]:
nx.info(G_random_edges_uniform_random)

'Name: \nType: DiGraph\nNumber of nodes: 2426\nNumber of edges: 16637\nAverage in degree:   6.8578\nAverage out degree:   6.8578'

In [13]:
nx.density(G), nx.density(G_random_edges_uniform_random)

(0.002826935008201528, 0.0028279548873458497)

When we determine inside the function whether we add or remove an edge, the top ten of nodes are equal to the top ten nodes. This makes totally sense. The purpose of the function is adding or removing edges uniformaly at random to `number_of_nodes` nodes. These selected nodes all have the same probability to be selected. As explained earlier, there are a lot of nodes in the network that have a low connectivity and only a few nodes with a very high connectivity. It is very likely that we will modify one of these insignificant nodes with little effect on the overall network and therefore we can explain that the top ten ranked nodes are not changed after adding or removing only nodes. 

### only edges added

In [14]:
number_of_nodes_random = random.randint(1,  int(0.1 * nx.number_of_edges(G.copy()))) #max 10% of edges to add
print("number of nodes: " + str(number_of_nodes_random))
G_random_add_edges_uniform_random = random_edges_uniform_random(G.copy(), number_of_nodes_random, True, True)

number of nodes: 248
	new edge:	 1047 --> 2046
	new edge:	 70 --> 714
	new edge:	 1273 --> 1378
	new edge:	 566 --> 996
	new edge:	 2208 --> 30
	new edge:	 2138 --> 1947
	new edge:	 2239 --> 1848
	new edge:	 1670 --> 1873
	new edge:	 1140 --> 458
	new edge:	 829 --> 1631
	new edge:	 1821 --> 695
	new edge:	 315 --> 2310
	new edge:	 1894 --> 492
	new edge:	 1933 --> 779
	new edge:	 1434 --> 755
	new edge:	 1336 --> 1742
	new edge:	 374 --> 699
	new edge:	 762 --> 1019
	new edge:	 1594 --> 1329
	new edge:	 234 --> 1525
	new edge:	 658 --> 2274
	new edge:	 471 --> 1553
	new edge:	 2171 --> 2260
	new edge:	 1416 --> 115
	new edge:	 2425 --> 951
	new edge:	 118 --> 178
	new edge:	 1900 --> 1541
	new edge:	 2096 --> 1176
	new edge:	 226 --> 1539
	new edge:	 681 --> 424
	new edge:	 2231 --> 1547
	new edge:	 1173 --> 67
	new edge:	 826 --> 393
	new edge:	 2251 --> 1138
	new edge:	 16 --> 1154
	new edge:	 2423 --> 2071
	new edge:	 58 --> 729
	new edge:	 373 --> 2292
	new edge:	 1574 --> 1710
	n

In [15]:
pr_random_add_edges_uniform_random = calc_pagerank(G_random_add_edges_uniform_random)
df_random_add_edges_uniform_random = create_dataframe(pr_random_add_edges_uniform_random, G_random_add_edges_uniform_random)
df_random_add_edges_uniform_random.head(10)

rank,node,score,in edges,out edges
1,404,0.042067,10,0
2,195,0.019808,80,1
3,77,0.01793,121,2
4,36,0.011424,168,5
5,728,0.010619,10,0
6,192,0.009514,57,3
7,135,0.008965,49,8
8,136,0.00894,85,6
9,126,0.008835,56,1
10,281,0.00862,32,0


Above, we see the resulting top ten dataframe after running the function `random_edges_uniform_random`, parameterized with a copy of the original graph  GG , number_of_nodes = 248 and choice = True. In other words, the function adds 248 new edges (incoming or outgoing) to the network. The output is exaclty what we expect: the top ten is not changed that much. The only new node is node 281, which replaced node 184 of the original graph. Indeed, the new nodes that are added uniformly at random don't have a preferential attachment to the highly connected, because the number of incoming edges and outcoming edges for the remaining top nine nodes are exactly the same. 

In [16]:
nx.info(G)

'Name: \nType: DiGraph\nNumber of nodes: 2426\nNumber of edges: 16631\nAverage in degree:   6.8553\nAverage out degree:   6.8553'

In [17]:
nx.info(G_random_add_edges_uniform_random)

'Name: \nType: DiGraph\nNumber of nodes: 2426\nNumber of edges: 16879\nAverage in degree:   6.9575\nAverage out degree:   6.9575'

In [18]:
nx.density(G), nx.density(G_random_add_edges_uniform_random)

(0.002826935008201528, 0.0028690900128334794)

The density of the graphs increases when we add edges. This totally makes sense of course, because we only add edges and no new nodes.

### only edges removed

In [19]:
number_of_nodes_random = random.randint(1,  int(0.1 * nx.number_of_edges(G.copy()))) #max 10% of edges to remove to select
print("number of nodes: " + str(number_of_nodes_random))
G_random_remove_edges_uniform_random = random_edges_uniform_random(G.copy(), number_of_nodes_random, True, False)

number of nodes: 962
	remove edge:	 2391 --> 2393
	remove edge:	 653 --> 19
	remove edge:	 1126 --> 1127
	remove edge:	 2300 --> 866
	remove edge:	 2072 --> 908
	remove edge:	 384 --> 394
	remove edge:	 1776 --> 1779
	remove edge:	 206 --> 207
	remove edge:	 2019 --> 512
	remove edge:	 2138 --> 2139
	remove edge:	 1506 --> 1507
	remove edge:	 1410 --> 1411
	remove edge:	 459 --> 407
	remove edge:	 2137 --> 1866
	remove edge:	 689 --> 690
	remove edge:	 1144 --> 1145
	remove edge:	 509 --> 690
	remove edge:	 488 --> 489
	remove edge:	 1578 --> 603
	remove edge:	 889 --> 891
	remove edge:	 1666 --> 1671
	remove edge:	 2286 --> 733
	remove edge:	 1956 --> 436
	remove edge:	 410 --> 102
	remove edge:	 1870 --> 492
	remove edge:	 1414 --> 1416
	remove edge:	 2245 --> 412
	remove edge:	 2006 --> 870
	remove edge:	 889 --> 890
	remove edge:	 1156 --> 1157
	remove edge:	 2044 --> 2341
	remove edge:	 82 --> 83
	remove edge:	 637 --> 872
	remove edge:	 2188 --> 35
	remove edge:	 435 --> 743
	rem

In [20]:
pr_random_remove_edges_uniform_random = calc_pagerank(G_random_remove_edges_uniform_random)
df_random_remove_edges_uniform_random = create_dataframe(pr_random_remove_edges_uniform_random, G_random_remove_edges_uniform_random)
df_random_remove_edges_uniform_random.head(10)

rank,node,score,in edges,out edges
1,404,0.034888,7,0
2,77,0.021295,118,2
3,195,0.020118,78,1
4,728,0.01413,8,0
5,36,0.011473,166,4
6,192,0.010633,56,3
7,281,0.010608,28,0
8,135,0.009159,49,7
9,184,0.008499,77,3
10,136,0.008219,82,5


Above, we see the resulting top ten dataframe after running the function `random_edges_uniform_random`, parameterized with a copy of the original graph $G$, `number_of_nodes` = 962 and choice = False. In other words, the function only removes edges for 962 nodes in the graph uniformly at random. Also in this case we see that the top ten remains the same. This means that indeed the edges are removed uniformly at random, i.e. most of the edges that are removed are of the insignificant edges. Also we see that the score of the first node, node 404, is decreased, while the second and third nodes (77 and 195) are increased. This means that the function did remove some high ranked links from node 404. Removing nodes will decrease the average degree of the nodes, which is comfirmed by the two info cells below.

In [21]:
nx.info(G)

'Name: \nType: DiGraph\nNumber of nodes: 2426\nNumber of edges: 16631\nAverage in degree:   6.8553\nAverage out degree:   6.8553'

In [22]:
nx.info(G_random_remove_edges_uniform_random)

'Name: \nType: DiGraph\nNumber of nodes: 2426\nNumber of edges: 15844\nAverage in degree:   6.5309\nAverage out degree:   6.5309'

In [23]:
nx.density(G), nx.density(G_random_remove_edges_uniform_random)

(0.002826935008201528, 0.002693160860438038)

When edges are removed, the density of the network shrinks which is what we expected. 

### B. Adding nodes uniformly at random (copying model)

For $n$ iterations do the following:
* Make a new node instance $n$
* with a uniform random distribution pick $k$ nodes in the original graph
* copy the incoming/outgoing edges of the $k$ nodes for $n$
* choose with an unifrom distribution another node $l$ and add its edges also to $n$

<i>The last step might seem redundant at the moment, but later when the the $k$ nodes are going to be chosen with at random but proportional to a certain statistic, it makes sense to have a step in which you pick another node $l$ that is chosen with the opposite property so that the generation/stability of communities is ensured (i.e. power-law degree). In this implementation. This step is omitted, but it will be thus added in the functions that take statistical measures into consideration when choosing nodes at random</i>

In [24]:
#randomly add and remove nodes
#Edge Copying Model (slide 53 of Week6-SNA-Props)
def random_add_nodes_uniform(G_in, number_of_nodes = 1, k = 5):
    print("number of edges before :"+ str(len(G_in.edges())))
    for _ in range(number_of_nodes):
        #k is number of edges to be added, random integer 1 between 5
        k = random.randint(1, k) #select k random vertices
        #print("k = " + str(k))
        new_node = nx.number_of_nodes(G_in) + 1 #add node to graph
        # print("new node = " + str(new_node))
        
        list_of_nodes = list(G_in)  #create list of nodes
        
        G_in.add_node(str(new_node))   
        k_random_selected_nodes = np.random.choice(list_of_nodes, size = k, replace = False) # k nodes with a uniform distribution
        
        for node in k_random_selected_nodes:
            #print("node in k_random_selected_nodes = " + str(node))
            successors = list(G_in.successors(str(node)))
            #print("succesors are " + str(successors))
            for node_to in successors:
                G_in.add_edge(new_node, node_to) # add outgoing edges
            predecessors = list(G_in.predecessors(str(node)))
            for node_from in predecessors:
                G_in.add_edge(node_from, new_node) # add incoming edges
    print("number of edges after :"+str(len(G_in.edges())))
    return G_in

In [25]:
number_of_nodes_to_add_random = random.randint(1,  int(0.1 * len(list(G.copy()))))
print("number of nodes added: " + str(number_of_nodes_to_add_random))
G_random_add_nodes_uniform = random_add_nodes_uniform(G.copy(), number_of_nodes_to_add_random)

number of nodes added: 121
number of edges before :16631
number of edges after :17942


In [26]:
pr_random_add_nodes_uniform = calc_pagerank(G_random_add_nodes_uniform)
df_random_add_nodes_uniform = create_dataframe(pr_random_add_nodes_uniform, G_random_add_nodes_uniform)
df_random_add_nodes_uniform.head(10)

rank,node,score,in edges,out edges
1,404,0.041791,10,0
2,195,0.019638,84,1
3,77,0.01824,127,2
4,728,0.016094,11,0
5,36,0.010746,173,5
6,135,0.009359,52,8
7,281,0.009154,34,0
8,192,0.00892,59,3
9,136,0.008491,87,6
10,184,0.008313,86,3


`random_add_nodes_uniform` add nodes uniformly at random, while the degree distribution of the network still satisfies the power law distribution. We call the function, parameterized with `number_of_nodes` =  maximal 10% of the total number of nodes that are in $G$. What we see is that rank of some nodes in the top ten is changed. We even see a new node in top ten, node 126. This means that randomness can create new strong nodes, by adding new nodes and create new strong communities in the graph. However, since this all happens unformly at random, we can't guarantee that this happens every single run (which we can guarantee when we add proportional to some statistical measure, see D-E-F-G). The added nodes are linked, one by one, to $k$ random selected nodes. Note that every node in $G$ does have the same probability, so again, it is most likely that we select the insignificant nodes as nodes where we link the new nodes to. We see that node 126 is in the top then , but this is because node 184 has a lower pagerank score compared to the original graph $G$. This could be that the some of the new added nodes are linked to node 184, and because this nodes have a low pagerank score, it will have a negative effect on node 184. We can also look at the density of graph  `G_random_add_nodes_uniform`. Because we add nodes, the density should be lower compared to the original graph $G$. 

In [27]:
nx.info(G)

'Name: \nType: DiGraph\nNumber of nodes: 2426\nNumber of edges: 16631\nAverage in degree:   6.8553\nAverage out degree:   6.8553'

In [28]:
nx.info(G_random_add_nodes_uniform)

'Name: \nType: DiGraph\nNumber of nodes: 2663\nNumber of edges: 17942\nAverage in degree:   6.7375\nAverage out degree:   6.7375'

In [29]:
nx.density(G), nx.density(G_random_add_nodes_uniform)

(0.002826935008201528, 0.0025309970254930735)

The output indeed confirms our hypotheses. Another aspect that we can look at is whether the number of edges grows linearly with the number of nodes. The number of edges before are 16631 and after the addition of 235 nodes we have 18104 edges. 1473 nodes are therefore added to the graph `G_random_add_nodes_uniform`. So the number of nodes has increased with a factor of approximately $1.1$. However, the number of edges is increased with $\frac{18104}{16631} = 1.08$. So the number of edges does linearly increase with number of nodes.  (<font color = "red">to-do: shouldn't number of edges not be more than 1.1?</font>)

### C. Removal of nodes uniformly at random

Let $n$ represent the number of nodes that should be removed. If the $number\_of\_nodes$ parameter is givem then $n = number\_of\_nodes$, if this parameter is not specfied by the caller we have $n = \lfloor(0.1 * total\_number\_of\_nodes(G\_in))\rfloor)$

For $n$ iterations do the following:
* Select a node $m$ uniformly at random (iterations are abstracted by $np.random.choice$)
* Remove this node and its respective edges from the graph

In [30]:
def random_removal_nodes_uniform(G_in, number_given = False, number_of_nodes = None):
    if (number_given & number_of_nodes < len(list(G_in))): # check if we do not remove too much nodes
        n = number_of_nodes
    else:
        n = int(0.1 * len(list(G_in)))  # max 10% of nodes
    #remove nodes and corresponding edges
    print("number of nodes before :"+ str(len(list(G_in))))
    list_of_nodes = list(G_in)
    selected_nodes = np.random.choice(list_of_nodes, size = n, replace = False)
    for m_remove in selected_nodes:
        G_in.remove_node(m_remove)
    print("number of nodes after :"+ str(len(list(G_in))))
    return G_in

In [31]:
G_random_removal_nodes_uniform = random_removal_nodes_uniform(G.copy(), True, int(0.1 * len(list(G.copy()))))

number of nodes before :2426
number of nodes after :2184


In [32]:
pr_random_removal_nodes_uniform = calc_pagerank(G_random_removal_nodes_uniform)
df_random_removal_nodes_uniform = create_dataframe(pr_random_removal_nodes_uniform, G_random_removal_nodes_uniform)
df_random_removal_nodes_uniform.head(10)

rank,node,score,in edges,out edges
1,404,0.042637,9,0
2,195,0.01994,70,1
3,77,0.018827,105,2
4,36,0.012105,151,4
5,728,0.011899,9,0
6,135,0.0118,45,8
7,192,0.010481,51,3
8,281,0.010351,27,0
9,136,0.009547,77,6
10,184,0.009023,71,2


In [33]:
nx.info(G)

'Name: \nType: DiGraph\nNumber of nodes: 2426\nNumber of edges: 16631\nAverage in degree:   6.8553\nAverage out degree:   6.8553'

In [34]:
nx.info(G_random_removal_nodes_uniform)

'Name: \nType: DiGraph\nNumber of nodes: 2184\nNumber of edges: 13059\nAverage in degree:   5.9794\nAverage out degree:   5.9794'

In [35]:
nx.density(G), nx.density(G_random_removal_nodes_uniform)

(0.002826935008201528, 0.0027390726543268917)

In [36]:
avg_node_degree_full_graph = df_origin["in edges"].mean()
avg_node_degree_graph_removed_nodes = df_random_removal_nodes_uniform["in edges"].mean()
avg_node_degree_full_graph, avg_node_degree_graph_removed_nodes

(6.855317394888706, 5.979395604395604)

`random_removal_nodes_uniform` removes nodes from the graph $G$ uniformly at random, parameterized with `number_of_nodes` = maximal 10% of the total number of nodes in the original graph $G$. It's is important to check whether the nodes that are removed are removed uniformly at random, i.e. the nodes that are removed are mostly the insignificant nodes. When we look at the output of the cell above, we see that the average degree of the nodes that are removed is slightly lower than the average degree of the full orignal graph $G$. This is good, because it means that in most of the cases only the insignificant nodes are removed. When we compare the top ten of `G_random_removal_nodes_uniform` to the top ten of $G$, we see that they contain the same items, only the order of some nodes has changed. For example, node 36 has climbed one spot, to rank 4. What happened is that `random_removal_nodes_uniform` removed some nodes that were linked to node 36 with a low pagerank score, and therefore the overall pagerank score of 36 has increased slightly. The most important conclusion that we can draw after this function call is that scale free networks are very robust against removing of nodes uniformly at random. As explained earlier, all nodes that are possible candidates to be removed have an equally probability to be chosen. Because we have only a few nodes with a very large connectivity, it is most likely that we select the nodes with a very low connectivity. Therefore the overall end result does not differ much compared to the original graph $G$.

## Graph evolution methodologies using statistical measures

### D. Removal of nodes at random but proportional to the degree of the nodes

Let $n$ represent the number of nodes that should be removed. If the $number\_of\_nodes$ parameter is givem then $n = number\_of\_nodes$, if this parameter is not specfied by the caller we have $n = \lfloor(0.1 * total\_number\_of\_nodes(G\_in))\rfloor)$

For $n$ iterations do the following:
* Select a node $m$ at random, but proportional to the in degree's of the nodes (iterations are abstracted by $np.random.choice$)
* Remove this node and its respective edges from the graph

In [37]:
def random_node_removals_proportional_indegree(G_in, number_given = False, number_of_nodes = None):
    if (number_given & number_of_nodes < len(list(G_in))): # check if we do not remove too much nodes
        n = number_of_nodes
    else:
        n = int(0.1 * len(list(G_in)))  # max 10% of nodes
    #remove nodes and corresponding edges
    print("number of nodes before :"+ str(len(list(G_in))))
    list_of_nodes = list(G_in)
    in_degrees = dict(G_in.in_degree()).values() # in_degrees of all the nodes
    prob_degree = [float(i)/sum(in_degrees) for i in in_degrees] # probabilities proportional to degree
    
    selected_nodes = np.random.choice(list_of_nodes, size = n, replace = False, p = prob_degree)
    for m_remove in selected_nodes:
        G_in.remove_node(m_remove)
    print("number of nodes after :"+ str(len(list(G_in))))
    return G_in

In [38]:
G_random_node_removals_proportional_indegree = random_node_removals_proportional_indegree(G.copy(), True, 100)

number of nodes before :2426
number of nodes after :2326


In [39]:
G_random_node_removals_proportional_indegree = random_node_removals_proportional_indegree(G.copy(), True, 100)
pr_random_node_removals_proportional_indegree = calc_pagerank(G_random_node_removals_proportional_indegree)
df_random_node_removals_proportional_indegree = create_dataframe(pr_random_node_removals_proportional_indegree,
                                                                 G_random_node_removals_proportional_indegree)
df_random_node_removals_proportional_indegree.head()

number of nodes before :2426
number of nodes after :2326


rank,node,score,in edges,out edges
1,404,0.036285,8,0
2,192,0.013651,48,2
3,195,0.012935,61,1
4,728,0.011959,9,0
5,35,0.011818,129,2


### E. Removal of nodes at random but proportional to the hubs/authorithy measures (HITS) of nodes

Let $n$ represent the number of nodes that should be removed. If the $number\_of\_nodes$ parameter is givem then $n = number\_of\_nodes$, if this parameter is not specfied by the caller we have $n = \lfloor(0.1 * total\_number\_of\_nodes(G\_in))\rfloor)$

For $n$ iterations do the following:
* Select a node $m$ at random, but proportional to HITS measures (i.e. hub or authority) of the nodes (iterations are abstracted by $np.random.choice$)
* Remove this node and its respective edges from the graph

In [40]:
def random_node_removals_proportional_HITS(G_in, authorithy = False, number_given = False, number_of_nodes = None):
    if (number_given & number_of_nodes < len(list(G_in))): # check if we do not remove too much nodes
        n = number_of_nodes
    else:
        n = int(0.1 * len(list(G_in)))  # max 10% of nodes
    #remove nodes and corresponding edges
    node_counter = 0
    while(node_counter < n):
        list_of_nodes = list(G_in)
        print(int((node_counter / n) * 100), "%")
        if (authorithy):
            p = list(nx.hits(G_in)[0].values()) # probabilities proportional to authority of nodes
        else: # hub
            p = list(nx.hits(G_in)[1].values()) # probabilities proportional to hub of nodes
        node_remove = np.random.choice(list_of_nodes, p = p)
        G_in.remove_node(node_remove)
        node_counter += 1
    print("number of nodes after :"+ str(len(list(G_in))))
    return G_in

In [41]:
G_random_node_removals_proportional_HITS = random_node_removals_proportional_HITS(G.copy(), True, True, 10)

0 %
10 %
20 %
30 %
40 %
50 %
60 %
70 %
80 %
90 %
number of nodes after :2416


In [42]:
pr_random_node_removals_proportional_HITS = calc_pagerank(G_random_node_removals_proportional_HITS)
df_random_node_removals_proportional_HITS = create_dataframe(pr_random_node_removals_proportional_HITS,
                                                             G_random_node_removals_proportional_HITS)
df_random_node_removals_proportional_HITS.head()

rank,node,score,in edges,out edges
1,404,0.042751,10,0
2,195,0.019941,78,1
3,77,0.018609,120,2
4,728,0.01561,10,0
5,36,0.011095,162,5


### F. Addition of nodes at random but proportional to the degree of the nodes

In [43]:
#Edge Copying Model (slide 53 of Week6-SNA-Props)
def random_node_additions_proportional_in_degree(G_in, number_given = False, number_of_nodes = 1, k = 5):
    if (number_given & number_of_nodes < len(list(G_in))): # check if we do not remove too much nodes
        n = number_of_nodes
    else:
        n = int(0.1 * len(list(G_in)))  # max 10% of nodes
    print("number of edges before :"+ str(len(G_in.edges())))
    for _ in range(n):
        #k is number of edges to be added, random integer 1 between 5
        k = random.randint(1, k) #select k random vertices
        list_of_nodes = list(G_in)  #create list of nodes
        in_degrees = dict(G_in.in_degree()).values() # in_degrees of all the nodes
        prob_degree = [float(i)/sum(in_degrees) for i in in_degrees] # probabilities proportional to degree
        k_random_selected_nodes = np.random.choice(list_of_nodes, size = k, p = prob_degree, replace = False) # selecte k nodes proportional to chosen measure
        
        new_node = nx.number_of_nodes(G_in) + 1 #add node to graph
        G_in.add_node(str(new_node))
        
        for node in k_random_selected_nodes:
            successors = list(G_in.successors(str(node)))
            for node_to in successors:
                G_in.add_edge(new_node, node_to) # add outgoing edges
            predecessors = list(G_in.predecessors(str(node)))
            for node_from in predecessors:
                G_in.add_edge(node_from, new_node) # add incoming edges
        
        # pick one node that has a low probability (relatively low number of incoming edges)
        non_zero_probs = [i for i in prob_degree if i != 0.0]
        highest_chance_nodes = np.random.choice(list_of_nodes, p = prob_degree, 
                                                size = (len(non_zero_probs) - 1), replace = False)
        
        node_to_add = random.sample(set(list_of_nodes).difference(set(highest_chance_nodes)), 1)[0] # low prob node
        successors = list(G_in.successors(node_to_add)) # successors of the node
        predecessors = list(G_in.predecessors(node_to_add)) # predecessors of the node 
        
        succ_current_node = list(G_in.successors(new_node)) # find the successors of the new node 
        pred_current_node = list(G_in.predecessors(new_node)) # find the predecessors of the new node
                                 
        # remove nodes to which the new node is already connected from the successors/predecessors list
        successors = [n for n in successors if not n in succ_current_node]
        predecessors = [n for n in predecessors if not n in pred_current_node]
                                 
        for node_to in successors:
            G_in.add_edge(new_node, node_to) # add outgoing edges
        for node_from in predecessors:
            G_in.add_edge(node_from, new_node) # add incoming edges
            
    print("number of edges after :"+str(len(G_in.edges())))
    return G_in

In [44]:
G_random_node_additions_proportional_in_degree = random_node_additions_proportional_in_degree(G.copy(), True, 10, 5)

number of edges before :16631
number of edges after :17268


In [45]:
pr_random_node_additions_proportional_in_degree = calc_pagerank(G_random_node_additions_proportional_in_degree)
df_random_node_additions_proportional_in_degree = create_dataframe(pr_random_node_additions_proportional_in_degree,
                                                             G_random_node_additions_proportional_in_degree)
df_random_node_additions_proportional_in_degree.head()

rank,node,score,in edges,out edges
1,404,0.0411,10,0
2,195,0.019226,80,1
3,77,0.017743,124,2
4,728,0.014869,10,0
5,36,0.010564,170,5


### G. Addition of nodes at random but proportional to the hubs/authorithy measures (HITS) of nodes

In [46]:
#Edge Copying Model (slide 53 of Week6-SNA-Props)
def random_node_additions_proportional_HITS(G_in, authority = False, number_given = False, number_of_nodes = 1, k = 5):
    if (number_given & number_of_nodes < len(list(G_in))): # check if we do not remove too much nodes
        n = number_of_nodes
    else:
        n = int(0.1 * len(list(G_in)))  # max 10% of nodes
    print("number of edges before :"+ str(len(G_in.edges())))
    for _ in range(n):
        #k is number of edges to be added, random integer 1 between 5
        k = random.randint(1, k) #select k random vertices
        list_of_nodes = list(G_in)  #create list of nodes
        if (authority):
            p = list(nx.hits(G_in)[0].values())
        else: # hub
            p = list(nx.hits(G_in)[1].values())
        
        k_random_selected_nodes = np.random.choice(list_of_nodes, size = k, p = p, replace = False) # selecte k nodes proportional to chosen measure
        
        new_node = nx.number_of_nodes(G_in) + 1 #add node to graph
        G_in.add_node(str(new_node))
        
        for node in k_random_selected_nodes:
            successors = list(G_in.successors(node))
            for node_to in successors:
                G_in.add_edge(new_node, node_to) # add outgoing edges
            predecessors = list(G_in.predecessors(node))
            for node_from in predecessors:
                G_in.add_edge(node_from, new_node) # add incoming edges
        
        # pick one node that has a low probability (relatively low number of incoming edges)
        non_zero_probs = [i for i in p if i != 0.0]
        highest_chance_nodes = np.random.choice(list_of_nodes, p = p, 
                                                size = (len(non_zero_probs) - 1), replace = False)
        
        node_to_add = random.sample(set(list_of_nodes).difference(set(highest_chance_nodes)), 1)[0] # low prob node
        successors = list(G_in.successors(node_to_add)) # successors of the node
        predecessors = list(G_in.predecessors(node_to_add)) # predecessors of the node 
        
        succ_current_node = list(G_in.successors(new_node)) # find the successors of the new node 
        pred_current_node = list(G_in.predecessors(new_node)) # find the predecessors of the new node
                                 
        # remove nodes to which the new node is already connected from the successors/predecessors list
        successors = [n for n in successors if not n in succ_current_node]
        predecessors = [n for n in predecessors if not n in pred_current_node]
                                 
        for node_to in successors:
            G_in.add_edge(new_node, node_to) # add outgoing edges
        for node_from in predecessors:
            G_in.add_edge(node_from, new_node) # add incoming edges
            
    print("number of edges after :"+str(len(G_in.edges())))
    return G_in

In [72]:
G_random_node_additions_proportional_HITS = random_node_additions_proportional_HITS(G.copy(), True, True, 20, 5)

number of edges before :16631
number of edges after :17518


In [73]:
pr_random_node_additions_proportional_HITS = calc_pagerank(G_random_node_additions_proportional_HITS)
df_random_node_additions_proportional_HITS = create_dataframe(pr_random_node_additions_proportional_HITS, G_random_node_additions_proportional_HITS)
df_random_node_additions_proportional_HITS.head()

rank,node,score,in edges,out edges
1,404,0.043909,10,0
2,195,0.020703,84,1
3,77,0.018855,124,2
4,36,0.01173,177,5
5,192,0.009899,60,3


### More advanced evaluation methods of PageRank stability

#### rank-based error
In the analyses above the pagerank values of the different graph evolvement are compared, where only the absolute percentual change are chosen as comparison measure. For this paragraph, a more advanced method is implemented to evaluate the different evolvement on the original graph. The first measure to compare PageRank is rank-based error. For rank-based error, the error can be defined as: $$Error_{rank} = \sum_{i=1}^{n} \frac{|rank - rank_{baseline}|}{rank_{baseline}} $$

where $rank$ is generated by the used method. Before this measure can be applied, all the pagerank scores of the evolvement functions need to be combined into one single dataframe.

In [51]:
df_random_edges_uniform_random.rename(columns={'score': 'score_random_edges_uniform_random'}, inplace=True)
df_random_edges_uniform_random.insert(1, 'rank_random_edges_uniform_random', range(1, 1+len(df_random_edges_uniform_random)))
df_random_edges_uniform_random.drop(['in edges', 'out edges'], axis=1, inplace = True)

In [52]:
df_origin.rename(columns={'score': 'score_original'}, inplace=True)
df_origin.insert(1, 'rank_original', range(1, 1+len(df_origin))) #add rank score to dataframe, because first column 'rank'can't be accessed (actually is row number)
df_origin.drop(['in edges', 'out edges'], axis=1, inplace = True)

In [53]:
df_comparison_1 = pd.merge(df_random_edges_uniform_random, df_origin, on='node')

In [54]:
def compute_error_based_1(row):
    return abs(row['rank_random_edges_uniform_random'] - row['rank_original']) / row['rank_original'] 

df_comparison_1.apply(compute_error_based_1, axis = 1).sum()

324.19566859752683

$Error_{edges-uniform-random} = 324$

In [55]:
df_random_add_edges_uniform_random.rename(columns={'score': 'score_random_add_edges_uniform_random'}, inplace=True)
df_random_add_edges_uniform_random.insert(1, 'rank_random_add_edges_uniform_random', range(1, 1+len(df_random_add_edges_uniform_random)))
df_random_add_edges_uniform_random.drop(['in edges', 'out edges'], axis=1, inplace = True)

In [56]:
df_comparison_2 = pd.merge(df_origin, df_random_add_edges_uniform_random, on='node')
def compute_error_based_2(row):
    return abs(row['rank_random_add_edges_uniform_random'] - row['rank_original']) / row['rank_original'] 

df_comparison_2.apply(compute_error_based_2, axis = 1).sum()

219.16527398834387

$Error_{edges-add-uniform-random} = 219$

In [57]:
df_random_remove_edges_uniform_random.rename(columns={'score': 'score_random_remove_edges_uniform_random'}, inplace=True)
df_random_remove_edges_uniform_random.insert(1, 'rank_random_remove_edges_uniform_random', range(1, 1+len(df_random_remove_edges_uniform_random)))
df_random_remove_edges_uniform_random.drop(['in edges', 'out edges'], axis=1, inplace = True)

In [58]:
df_comparison_3 = pd.merge(df_origin, df_random_remove_edges_uniform_random, on='node')
def compute_error_based_3(row):
    return abs(row['rank_random_remove_edges_uniform_random'] - row['rank_original']) / row['rank_original'] 

df_comparison_3.apply(compute_error_based_3, axis = 1).sum()

352.96574519634083

$Error_{edges-remove-uniform-random} = 353$

In [59]:
df_random_add_nodes_uniform.rename(columns={'score': 'score_random_add_nodes_uniform'}, inplace=True)
df_random_add_nodes_uniform.insert(1, 'rank_random_add_nodes_uniform', range(1, 1+len(df_random_add_nodes_uniform)))
df_random_add_nodes_uniform.drop(['in edges', 'out edges'], axis=1, inplace = True)

In [60]:
df_comparison_4 = pd.merge(df_origin, df_random_add_nodes_uniform, on='node')
def compute_error_based_4(row):
    return abs(row['rank_random_add_nodes_uniform'] - row['rank_original']) / row['rank_original'] 

df_comparison_4.apply(compute_error_based_4, axis = 1).sum()

175.14339107801015

$Error_{nodes-add-uniform-random} = 175$

In [61]:
df_random_removal_nodes_uniform.rename(columns={'score': 'score_random_removal_nodes_uniform'}, inplace=True)
df_random_removal_nodes_uniform.insert(1, 'rank_random_removal_nodes_uniform', range(1, 1+len(df_random_removal_nodes_uniform)))
df_random_removal_nodes_uniform.drop(['in edges', 'out edges'], axis=1, inplace = True)

In [62]:
df_comparison_5 = pd.merge(df_origin, df_random_removal_nodes_uniform, on='node')
def compute_error_based_5(row):
    return abs(row['rank_random_removal_nodes_uniform'] - row['rank_original']) / row['rank_original'] 

df_comparison_5.apply(compute_error_based_5, axis = 1).sum()

293.6373458130011

$Error_{nodes-removal-uniform-random} = 294$

for function D-E-F-G:

In [63]:
df_random_node_removals_proportional_indegree.rename(columns={'score': 'score_random_node_removals_proportional_indegree'}, inplace=True)
df_random_node_removals_proportional_indegree.insert(1, 'rank_random_node_removals_proportional_indegree', range(1, 1+len(df_random_node_removals_proportional_indegree)))
df_random_node_removals_proportional_indegree.drop(['in edges', 'out edges'], axis=1, inplace = True)
df_comparison_6 = pd.merge(df_origin, df_random_node_removals_proportional_indegree, on='node')

In [64]:
def compute_error_based_6(row):
    return abs(row['rank_random_node_removals_proportional_indegree'] - row['rank_original']) / row['rank_original'] 
df_comparison_6.apply(compute_error_based_6, axis = 1).sum()

245.54260258431913

$Error_{nodes-removal-proportional-indegree} = 245$

In [65]:
df_random_node_removals_proportional_HITS.rename(columns={'score': 'score_random_node_removals_proportional_HITS'}, inplace=True)
df_random_node_removals_proportional_HITS.insert(1, 'rank_random_node_removals_proportional_HITS', range(1, 1+len(df_random_node_removals_proportional_HITS)))
df_random_node_removals_proportional_HITS.drop(['in edges', 'out edges'], axis=1, inplace = True)
df_comparison_7 = pd.merge(df_origin, df_random_node_removals_proportional_HITS, on='node')

In [66]:
def compute_error_based_7(row):
    return abs(row['rank_random_node_removals_proportional_HITS'] - row['rank_original']) / row['rank_original'] 
df_comparison_7.apply(compute_error_based_7, axis = 1).sum()

60.98962003870765

In [67]:
df_random_node_additions_proportional_in_degree.rename(columns={'score': 'score_random_node_additions_proportional_in_degree'}, inplace=True)
df_random_node_additions_proportional_in_degree.insert(1, 'rank_random_node_additions_proportional_in_degree', range(1, 1+len(df_random_node_additions_proportional_in_degree)))
df_random_node_additions_proportional_in_degree.drop(['in edges', 'out edges'], axis=1, inplace = True)
df_comparison_8 = pd.merge(df_origin, df_random_node_additions_proportional_in_degree, on='node')

In [68]:
def compute_error_based_8(row):
    return abs(row['rank_random_node_additions_proportional_in_degree'] - row['rank_original']) / row['rank_original'] 
df_comparison_8.apply(compute_error_based_8, axis = 1).sum()

100.71712278484891

$Error_{nodes-addition-proportional-indegree} = 101$

In [74]:
df_random_node_additions_proportional_HITS.rename(columns={'score': 'score_random_node_additions_proportional_HITS'}, inplace=True)
df_random_node_additions_proportional_HITS.insert(1, 'rank_random_node_additions_proportional_HITS', range(1, 1+len(df_random_node_additions_proportional_HITS)))
df_random_node_additions_proportional_HITS.drop(['in edges', 'out edges'], axis=1, inplace = True)
df_comparison_9 = pd.merge(df_origin, df_random_node_additions_proportional_HITS, on='node')

In [75]:
def compute_error_based_9(row):
    return abs(row['rank_random_node_additions_proportional_HITS'] - row['rank_original']) / row['rank_original'] 
df_comparison_9.apply(compute_error_based_9, axis = 1).sum()

106.32614202612108

$Error_{nodes-addition-proportional-indegree} = 106$