# 1-b Graph evolving - Joris

In [1]:
import networkx as nx
import plotly.plotly as py
import plotly.figure_factory as ff
import pandas as pd

Load edge list and create a graph

In [2]:
# Read the directed edge list. The context manager closes the file handle
# even if parsing raises, which the manual open()/close() pair did not.
with open("canvas/hamster.edgelist", 'rb') as fh:
    G = nx.read_edgelist(fh, create_using=nx.DiGraph())

Next we run the PageRank algorithm with a damping factor of 0.85. The damping factor represents the likelihood of following a link on the current page. With a damping factor of 0.85 there is an 85% chance of clicking a link on the page and a 15% chance of jumping to a random other node in the graph. We calculate the PageRank using the power iteration method.

In [3]:
def calc_pagerank(G, alpha = 0.85):
    """Return the PageRank score of every node in ``G``.

    Parameters
    ----------
    G : networkx graph
        Graph whose nodes are ranked.
    alpha : float, optional
        Damping factor forwarded to ``nx.pagerank`` (default 0.85).

    Returns
    -------
    dict
        The result of ``nx.pagerank``: a mapping of node -> score.
    """
    # Pass the caller-supplied damping factor through instead of a
    # hard-coded literal, so non-default values actually take effect.
    return nx.pagerank(G, alpha=alpha)

In [4]:
# PageRank scores for the graph as loaded, using calc_pagerank's default alpha
pr_origin = calc_pagerank(G)

In [5]:
def create_dataframe(pr, graph=None):
    """Build a rank table combining PageRank scores with node degrees.

    Parameters
    ----------
    pr : dict
        Mapping of node -> PageRank score.
    graph : graph object, optional
        Graph providing ``in_degree()`` and ``out_degree()``. Pass the graph
        that ``pr`` was computed from. When omitted, the notebook-level ``G``
        is used, which keeps existing one-argument calls working.

    Returns
    -------
    pandas.DataFrame
        Columns ``node``, ``score``, ``in edges``, ``out edges``; rows sorted
        by descending score with a 1-based index that serves as the rank.
    """
    if graph is None:
        # Backward-compatible fallback to the global graph
        graph = G

    df_edge_in = pd.DataFrame(list(graph.in_degree()), columns=['node', 'in edges'])
    df_edge_out = pd.DataFrame(list(graph.out_degree()), columns=['node', 'out edges'])
    df_rank = pd.DataFrame(list(pr.items()), columns=['node', 'score']).sort_values(by=['score'], ascending=False)

    # merge() keeps the left frame's (score-sorted) row order and returns a
    # fresh 0..n-1 index, so adding 1 turns the index into the rank.
    df_total = df_rank.merge(df_edge_in, on='node').merge(df_edge_out, on='node')
    df_total.index = df_total.index + 1
    df_total.columns.name = 'rank'
    return df_total

In [6]:
# Rank table for the unmodified graph; display the five highest-scoring nodes
df_origin = create_dataframe(pr_origin)
df_origin.head()

rank,node,score,in edges,out edges
1,404,0.042793,10,0
2,195,0.019961,80,1
3,77,0.018628,121,2
4,728,0.01553,10,0
5,36,0.011117,168,5


It looks like node 404 is the best-ranked page, followed by 195 and 77. This means that these pages should be shown at the top by search engines.

In [7]:
# Lowest-scoring rows of the same rank table
df_origin.tail()

rank,node,score,in edges,out edges
2422,1746,0.000113,0,3
2423,1748,0.000113,0,4
2424,1749,0.000113,0,3
2425,1751,0.000113,0,2
2426,2426,0.000113,0,8


As expected, the higher-ranked pages have more incoming edges than the lower-ranked pages. Looking at the number of incoming edges for the 15 best-ranked pages, it is clear that the rank of the sources of the incoming edges is more important than the number of incoming edges.

In [8]:
import networkx as nx
import random
import matplotlib.pyplot as plt

In [9]:
#add new connection to random other node with probability prob_add
#remove random connection of node with probability prob_remove
def random_edges(G, prob_add, prob_remove, verbose=True):
    """Randomly rewire the outgoing edges of every node, in place.

    For each node, independently:
      * with probability ``prob_add`` add one edge to a uniformly chosen
        *other* node it does not already link to (never to itself);
      * with probability ``prob_remove`` delete one uniformly chosen
        outgoing edge. An edge added in the same pass may be the one removed.

    Parameters
    ----------
    G : graph object
        Graph exposing ``nodes()``, ``edges(node)``, ``add_edge`` and
        ``remove_edge``. It is modified in place.
    prob_add : float
        Per-node probability of adding an edge.
    prob_remove : float
        Per-node probability of removing an edge.
    verbose : bool, optional
        Print one line per added/removed edge (default True, matching the
        historical behaviour). Pass False to avoid flooding the output.

    Returns
    -------
    The same ``G`` object that was passed in (not a copy).
    """
    # Snapshot of the node set; only edges change below, never nodes.
    all_nodes = list(G.nodes())

    for node in all_nodes:
        # Nodes this one currently links to
        connected = [to for (fr, to) in G.edges(node)]

        # Draw first so the O(n) candidate scan only runs when needed.
        if random.random() < prob_add:
            linked = set(connected)
            # Exclude the node itself so no self-loop is created.
            candidates = [n for n in all_nodes if n != node and n not in linked]
            if candidates:  # node may already link to every other node
                new = random.choice(candidates)
                G.add_edge(node, new)
                if verbose:
                    print("\tnew edge:\t {} -- {}".format(node, new))
                # Keep the list current in case a removal follows for this node
                connected.append(new)

        if connected and random.random() < prob_remove:
            remove = random.choice(connected)
            G.remove_edge(node, remove)
            if verbose:
                print("\tedge removed:\t {} -- {}".format(node, remove))
    return G

In [10]:
# Per-node probabilities for adding / removing one outgoing edge
prob_add = 0.050
prob_remove = 0.005
# NOTE: random_edges modifies G in place and returns that same object,
# so G_random and G refer to one (now modified) graph.
G_random = random_edges(G, prob_add, prob_remove)

	new edge:	 7 -- 667
	new edge:	 36 -- 901
	new edge:	 38 -- 406
	new edge:	 41 -- 1946
	new edge:	 58 -- 1903
	new edge:	 72 -- 565
	new edge:	 89 -- 661
	new edge:	 90 -- 782
	new edge:	 99 -- 1746
	new edge:	 104 -- 2354
	new edge:	 166 -- 2103
	new edge:	 175 -- 1704
	new edge:	 195 -- 2383
	new edge:	 206 -- 1397
	new edge:	 221 -- 1501
	new edge:	 225 -- 2086
	new edge:	 268 -- 1432
	new edge:	 286 -- 1769
	new edge:	 293 -- 1052
	new edge:	 336 -- 1102
	new edge:	 361 -- 672
	new edge:	 365 -- 2240
	new edge:	 450 -- 2014
	edge removed:	 452 -- 238
	new edge:	 456 -- 581
	new edge:	 469 -- 2394
	new edge:	 470 -- 753
	new edge:	 501 -- 1472
	new edge:	 503 -- 1163
	edge removed:	 513 -- 514
	new edge:	 514 -- 2048
	new edge:	 519 -- 26
	new edge:	 527 -- 1741
	new edge:	 539 -- 1760
	new edge:	 589 -- 1729
	new edge:	 600 -- 180
	new edge:	 603 -- 1017
	new edge:	 680 -- 571
	new edge:	 690 -- 2280
	new edge:	 699 -- 810
	new edge:	 715 -- 410
	new edge:	 734 -- 76
	new edge:	 7

In [11]:
# Re-rank after the random edge changes and show the top of the new table
pr_random = calc_pagerank(G_random)
df_random = create_dataframe(pr_random)
df_random.head()

rank,node,score,in edges,out edges
1,404,0.032768,10,0
2,195,0.019623,80,2
3,77,0.018723,121,2
4,728,0.014896,10,0
5,36,0.011467,168,6


In [12]:
#randomly add and remove nodes
#Edge Copying Model (slide 51 of Week6-SNA-Props)
def random_nodes(G, prob_select, prob_remove, verbose=True):
    """Grow ``G`` by one new node per existing node, then randomly delete nodes.

    Growth phase (repeated once per node present at call time):
      * draw ``k`` uniformly from 1..5;
      * add a new node with an unused integer label;
      * walk over the candidate pool and, with probability ``prob_select``
        per step, pick a random pool member as a link target until ``k``
        picks were made or the pool is exhausted;
      * add an edge from the new node to every picked target.
    Targets are drawn with replacement, so duplicates can leave the new node
    with fewer than ``k`` edges. The pool never contains the node being
    added, so no self-loop is created.

    Removal phase (repeated once per node present after growth): with
    probability ``prob_remove`` delete one uniformly chosen node.

    Parameters
    ----------
    G : graph object
        Graph supporting ``len``, ``in``, iteration, ``add_node``,
        ``add_edge``, ``remove_node`` and ``edges()``. Modified in place.
    prob_select : float
        Per-step probability of picking a link target.
    prob_remove : float
        Per-step probability of deleting a node.
    verbose : bool, optional
        Print progress lines (default True, matching the historical
        behaviour). Pass False to avoid flooding the output.

    Returns
    -------
    The same ``G`` object that was passed in (not a copy).
    """
    rounds = len(G)
    # Candidate link targets, maintained incrementally instead of rebuilding
    # list(G) every round. Previously added nodes are eligible targets.
    pool = list(G)

    for _ in range(rounds):
        k = random.randint(1, 5)

        # len(G) + 1 can already be a node (integer labels with gaps);
        # advance until the label is genuinely unused.
        new_node = len(G) + 1
        while new_node in G:
            new_node += 1
        G.add_node(new_node)
        if verbose:
            print("k = "+str(k))

        targets = []
        for _step in pool:
            if len(targets) >= k:
                if verbose:
                    print("breaked")
                break
            if random.random() < prob_select:
                targets.append(random.choice(pool))

        if verbose:
            print("number of edges before :"+ str(len(G.edges())))
        for target in targets:
            G.add_edge(new_node, target)
        if verbose:
            print("number of edges after :"+str(len(G.edges())))

        pool.append(new_node)

    # Remove nodes (and, via remove_node, their incident edges)
    for _ in range(len(G)):
        if random.random() < prob_remove:
            # Build the node list only when a removal actually happens
            G.remove_node(random.choice(list(G)))
    if verbose:
        print("number of edges after remove:"+str(len(G.edges())))
    return G

In [13]:
prob_select = 0.1
# NOTE: rebinds prob_remove (0.005 in the edge experiment above)
prob_remove = 0.1
# NOTE: G was already modified in place by random_edges; random_nodes modifies
# it again and returns the same object, so G_random_nodes is an alias of G.
G_random_nodes = random_nodes(G, prob_select, prob_remove)

k = 4
breaked
number of edges before :16760
number of edges after :16764
k = 1
breaked
number of edges before :16764
number of edges after :16765
k = 5
breaked
number of edges before :16765
number of edges after :16770
k = 3
breaked
number of edges before :16770
number of edges after :16773
k = 3
breaked
number of edges before :16773
number of edges after :16776
k = 3
breaked
number of edges before :16776
number of edges after :16779
k = 1
breaked
number of edges before :16779
number of edges after :16780
k = 3
breaked
number of edges before :16780
number of edges after :16783
k = 1
breaked
number of edges before :16783
number of edges after :16784
k = 3
breaked
number of edges before :16784
number of edges after :16787
k = 3
breaked
number of edges before :16787
number of edges after :16790
k = 2
breaked
number of edges before :16790
number of edges after :16792
k = 5
breaked
number of edges before :16792
number of edges after :16797
k = 2
breaked
number of edges before :16797
number 

k = 5
breaked
number of edges before :18029
number of edges after :18034
k = 2
breaked
number of edges before :18034
number of edges after :18036
k = 4
breaked
number of edges before :18036
number of edges after :18040
k = 3
breaked
number of edges before :18040
number of edges after :18043
k = 2
breaked
number of edges before :18043
number of edges after :18045
k = 4
breaked
number of edges before :18045
number of edges after :18049
k = 3
breaked
number of edges before :18049
number of edges after :18052
k = 2
breaked
number of edges before :18052
number of edges after :18054
k = 1
breaked
number of edges before :18054
number of edges after :18055
k = 1
breaked
number of edges before :18055
number of edges after :18056
k = 3
breaked
number of edges before :18056
number of edges after :18059
k = 3
breaked
number of edges before :18059
number of edges after :18062
k = 4
breaked
number of edges before :18062
number of edges after :18066
k = 2
breaked
number of edges before :18066
number 

number of edges before :19087
number of edges after :19092
k = 3
breaked
number of edges before :19092
number of edges after :19095
k = 3
breaked
number of edges before :19095
number of edges after :19098
k = 5
breaked
number of edges before :19098
number of edges after :19103
k = 3
breaked
number of edges before :19103
number of edges after :19106
k = 5
breaked
number of edges before :19106
number of edges after :19111
k = 2
breaked
number of edges before :19111
number of edges after :19113
k = 2
breaked
number of edges before :19113
number of edges after :19115
k = 3
breaked
number of edges before :19115
number of edges after :19118
k = 2
breaked
number of edges before :19118
number of edges after :19120
k = 1
breaked
number of edges before :19120
number of edges after :19121
k = 5
breaked
number of edges before :19121
number of edges after :19125
k = 3
breaked
number of edges before :19125
number of edges after :19128
k = 4
breaked
number of edges before :19128
number of edges after

k = 4
breaked
number of edges before :19906
number of edges after :19910
k = 3
breaked
number of edges before :19910
number of edges after :19913
k = 3
breaked
number of edges before :19913
number of edges after :19916
k = 5
breaked
number of edges before :19916
number of edges after :19921
k = 2
breaked
number of edges before :19921
number of edges after :19923
k = 1
breaked
number of edges before :19923
number of edges after :19924
k = 5
breaked
number of edges before :19924
number of edges after :19929
k = 5
breaked
number of edges before :19929
number of edges after :19934
k = 5
breaked
number of edges before :19934
number of edges after :19939
k = 1
breaked
number of edges before :19939
number of edges after :19940
k = 3
breaked
number of edges before :19940
number of edges after :19943
k = 3
breaked
number of edges before :19943
number of edges after :19946
k = 3
breaked
number of edges before :19946
number of edges after :19949
k = 1
breaked
number of edges before :19949
number 

k = 1
breaked
number of edges before :20919
number of edges after :20920
k = 1
breaked
number of edges before :20920
number of edges after :20921
k = 5
breaked
number of edges before :20921
number of edges after :20926
k = 3
breaked
number of edges before :20926
number of edges after :20929
k = 4
breaked
number of edges before :20929
number of edges after :20933
k = 2
breaked
number of edges before :20933
number of edges after :20935
k = 5
breaked
number of edges before :20935
number of edges after :20940
k = 3
breaked
number of edges before :20940
number of edges after :20943
k = 4
breaked
number of edges before :20943
number of edges after :20947
k = 5
breaked
number of edges before :20947
number of edges after :20952
k = 1
breaked
number of edges before :20952
number of edges after :20953
k = 1
breaked
number of edges before :20953
number of edges after :20954
k = 3
breaked
number of edges before :20954
number of edges after :20957
k = 1
breaked
number of edges before :20957
number 

k = 5
breaked
number of edges before :21753
number of edges after :21758
k = 1
breaked
number of edges before :21758
number of edges after :21759
k = 5
breaked
number of edges before :21759
number of edges after :21764
k = 4
breaked
number of edges before :21764
number of edges after :21768
k = 2
breaked
number of edges before :21768
number of edges after :21770
k = 3
breaked
number of edges before :21770
number of edges after :21773
k = 3
breaked
number of edges before :21773
number of edges after :21775
k = 4
breaked
number of edges before :21775
number of edges after :21779
k = 1
breaked
number of edges before :21779
number of edges after :21780
k = 5
breaked
number of edges before :21780
number of edges after :21785
k = 2
breaked
number of edges before :21785
number of edges after :21787
k = 3
breaked
number of edges before :21787
number of edges after :21790
k = 3
breaked
number of edges before :21790
number of edges after :21793
k = 5
breaked
number of edges before :21793
number 

breaked
number of edges before :22588
number of edges after :22593
k = 2
breaked
number of edges before :22593
number of edges after :22595
k = 3
breaked
number of edges before :22595
number of edges after :22598
k = 1
breaked
number of edges before :22598
number of edges after :22599
k = 5
breaked
number of edges before :22599
number of edges after :22604
k = 3
breaked
number of edges before :22604
number of edges after :22607
k = 5
breaked
number of edges before :22607
number of edges after :22612
k = 4
breaked
number of edges before :22612
number of edges after :22616
k = 3
breaked
number of edges before :22616
number of edges after :22619
k = 1
breaked
number of edges before :22619
number of edges after :22620
k = 3
breaked
number of edges before :22620
number of edges after :22623
k = 1
breaked
number of edges before :22623
number of edges after :22624
k = 1
breaked
number of edges before :22624
number of edges after :22625
k = 5
breaked
number of edges before :22625
number of edg

breaked
number of edges before :23217
number of edges after :23218
k = 2
breaked
number of edges before :23218
number of edges after :23220
k = 3
breaked
number of edges before :23220
number of edges after :23223
k = 5
breaked
number of edges before :23223
number of edges after :23228
k = 3
breaked
number of edges before :23228
number of edges after :23231
k = 1
breaked
number of edges before :23231
number of edges after :23232
k = 5
breaked
number of edges before :23232
number of edges after :23237
k = 5
breaked
number of edges before :23237
number of edges after :23242
k = 4
breaked
number of edges before :23242
number of edges after :23246
k = 3
breaked
number of edges before :23246
number of edges after :23249
k = 4
breaked
number of edges before :23249
number of edges after :23253
k = 2
breaked
number of edges before :23253
number of edges after :23255
k = 1
breaked
number of edges before :23255
number of edges after :23256
k = 1
breaked
number of edges before :23256
number of edg

In [14]:
# Re-rank after the node additions/removals and show the top of the new table
pr_random_nodes = calc_pagerank(G_random_nodes)
df_random_nodes = create_dataframe(pr_random_nodes)
df_random_nodes.head()

rank,node,score,in edges,out edges
1,404,0.032792,11,0
2,195,0.01988,79,2
3,77,0.019162,113,2
4,135,0.013056,47,8
5,36,0.011027,158,5
