-
Notifications
You must be signed in to change notification settings - Fork 0
/
walk.py
436 lines (359 loc) · 12.6 KB
/
walk.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
import math
import networkx as nx
import numpy as np
import random
import scipy.sparse as sparse
from networkx.classes.function import info, number_of_edges, number_of_nodes
from networkx.linalg.graphmatrix import adjacency_matrix
from networkx.readwrite.gpickle import read_gpickle
from networkx.relabel import convert_node_labels_to_integers
from numpy.random import choice
from scipy.sparse.linalg import eigs
from sklearn.preprocessing import normalize
def estimate_edges(G, u, run):
    """Estimate the number of edges of G from return times of a simple
    random walk started at vertex u.

    Writes three raw binary files (via ndarray.tofile) tagged with
    `run`: the running estimates, their std. devs., and the return
    times.

    Args:
        G: undirected networkx graph with consecutive integer labels.
        u: start vertex of the walk.
        run: integer tag used in the output file names.
    """
    t = 0        # current timestep
    k = 0        # number of returns to the start vertex so far
    Z_total = 0  # time of the k-th return to the start vertex
    deg_u = G.degree(u)
    # Transition matrix of the simple random walk (w(u,v) <- 1),
    # row-normalized.
    P = adjacency_matrix(G)
    P = normalize(P, norm='l1', axis=1)
    # Second-largest eigenvalue drives the Z_uu term of the std. dev.
    eigenvals, eigenvecs = eigs(P, k=2, which='LM')
    lambda_2 = float(sorted(eigenvals)[-2])
    Z_uu = 1.0 / (1.0 - lambda_2)
    # NOTE(review): the stationary probability of u is deg(u)/(2m);
    # this uses deg(u)/m — kept as in the original derivation, confirm.
    pi_u = float(deg_u) / G.number_of_edges()
    ct_factor = (2 * Z_uu + pi_u - 1.0) / (pi_u ** 2)
    m_est = []      # estimate of m after each return
    m_std_Zuu = []  # matching std.-dev. estimates based on Z_uu
    Tus = []        # individual return times
    vertex = u
    while k < 100000:
        # Simple random walk: uniform neighbor. list() is required
        # because networkx >= 2 returns an iterator from neighbors().
        vertex = random.choice(list(G.neighbors(vertex)))
        # Check if we have a new return to the start vertex.
        if vertex == u:
            Tus.append(t - Z_total)
            Z_total = t
            k += 1
            # Estimator: m ~ Z_total * deg(u) / (2k).
            curr_est = float(Z_total * deg_u) / (2 * k)
            m_est.append(curr_est)
            # FIX: keep the division in float arithmetic (the original
            # applied float() after a possibly-integer division).
            curr_std_Zuu = math.sqrt(float(deg_u ** 2) / (4 * k) * ct_factor)
            m_std_Zuu.append(curr_std_Zuu)
            if k % 100 == 0:
                print("Estimate of m at", k, ":", curr_est, "std:", curr_std_Zuu)
        t += 1
    np.array(m_est).tofile('m_est_' + str(run) + '.npy')
    np.array(m_std_Zuu).tofile('m_std_Zuu_' + str(run) + '.npy')
    np.array(Tus).tofile('Tus' + str(run) + '.npy')
def estimate_nodes(G, u, run):
    """Estimate the number of nodes of G from return times of a weighted
    random walk (edge weight 1/deg(x) + 1/deg(y)) started at vertex u.

    Writes the running estimates, std. devs. and return times to raw
    binary files tagged with `run`.

    Args:
        G: undirected networkx graph with consecutive integer labels.
        u: start vertex of the walk.
        run: integer tag used in the output file names.
    """
    t = 0        # current timestep
    k = 0        # number of returns to the start vertex so far
    Z_total = 0  # time of the k-th return to the start vertex
    # Weight of the start vertex: 1.0 accounts for deg(u) * 1/deg(u).
    w_u = 1.0
    for v in G.neighbors(u):
        w_u += 1.0 / float(G.degree(v))
    # Transition matrix of the weighted walk: w(x,y) <- 1/deg(x) + 1/deg(y).
    # FIX: the loop variables used to be (u, v), which clobbered the
    # start vertex u before the walk below even started.
    P = sparse.lil_matrix((G.number_of_nodes(), G.number_of_nodes()))
    for (x, y) in G.edges():
        P[x, y] = 1.0 / G.degree(x) + 1.0 / G.degree(y)
        P[y, x] = P[x, y]
    P = normalize(P, norm='l1', axis=1)
    P = sparse.csr_matrix(P)
    eigenvals, eigenvecs = eigs(P)
    lambda_2 = float(sorted(eigenvals)[-2])
    #lambda_2 = 0.933389228491 # generated graph
    # HACK: hard-coded second eigenvalue for the Google graph overrides
    # the computed value above; remove when running on other graphs.
    lambda_2 = 0.999876848583 # Google graph
    Z_uu = 1.0 / (1.0 - lambda_2)
    pi_u = w_u / float(2 * G.number_of_nodes())
    ct_factor = (2 * Z_uu + pi_u - 1.0) / (pi_u ** 2)
    # Pre-compute degrees for fast lookup inside the walk loop.
    degs = np.empty((G.number_of_nodes(),), dtype=int)
    for node in G.nodes():
        degs[node] = G.degree(node)
    n_est = []      # estimate of n after each return
    n_std_Zuu = []  # matching std.-dev. estimates based on Z_uu
    Tus = []        # individual return times
    vertex = u
    while k < 1000:
        # list() because networkx >= 2 returns an iterator of neighbors.
        neighbors = list(G.neighbors(vertex))
        # Unnormalized transition probabilities out of the current vertex.
        w_vertex = 1.0 / degs[vertex]
        p = np.array([w_vertex + (1.0 / degs[nb]) for nb in neighbors])
        vertex = np.random.choice(neighbors, p=p / p.sum())
        # Check if we have a new return to the start vertex.
        if vertex == u:
            Tus.append(t - Z_total)
            Z_total = t
            k += 1
            # Estimator: n ~ Z_total * w(u) / (2k).
            curr_est = float(Z_total * w_u) / (2 * k)
            n_est.append(curr_est)
            curr_std_Zuu = math.sqrt(w_u ** 2 / (4 * k) * ct_factor)
            n_std_Zuu.append(curr_std_Zuu)
            if k % 100 == 0:
                print("Estimate of n at", k, ":", curr_est, "std:", curr_std_Zuu)
        t += 1
    np.array(n_est).tofile('n_est_' + str(run) + '.npy')
    np.array(n_std_Zuu).tofile('n_std_Zuu_' + str(run) + '.npy')
    np.array(Tus).tofile('n_Tus' + str(run) + '.npy')
def estimate_triangles(G, u, c, run):
    """Estimate the number of triangles in G from return times of a
    weighted random walk (edge weight 1 + c * triangles(e)) started at
    vertex u.

    Writes the running estimates, std. devs. and return times to raw
    binary files tagged with `run` and `c`.

    Args:
        G: undirected networkx graph with consecutive integer labels.
        u: start vertex of the walk.
        c: weight given to the per-edge triangle count.
        run: integer tag used in the output file names.
    """
    t = 0        # current timestep
    k = 0        # number of returns to the start vertex so far
    Z_total = 0  # time of the k-th return to the start vertex
    deg_u = G.degree(u)
    t_u = nx.triangles(G, u)  # triangles containing the start vertex
    # Pre-compute the weights for all edges for a faster random walk:
    # weight(e) <- 1 + c * triangles(e), stored for both directions.
    edges = G.edges()
    d1 = dict(((x, y), 1.0) for (x, y) in edges)
    d2 = dict(((y, x), 1.0) for (x, y) in edges)
    # FIX: merge via unpacking — `d1.items() + d2.items()` was
    # Python-2-only.
    edge_weights = {**d1, **d2}
    for (x, y) in edge_weights:
        # Triangles through edge (x, y) = number of common neighbors.
        # FIX: set intersection instead of repeated list membership
        # (was O(deg^2) per edge).
        edge_weights[(x, y)] += \
            c * len(set(G.neighbors(x)) & set(G.neighbors(y)))
    np.save('WRW_t' + str(c) + '.npy', edge_weights)
    # Build the transition matrix P.
    # FIX: loop variables renamed from (u, v), which clobbered the
    # start vertex u used by the walk below.
    N = G.number_of_nodes()
    P = sparse.lil_matrix((N, N))
    for (x, y) in edge_weights.keys():
        P[x, y] = edge_weights[(x, y)]
    P = normalize(P, norm='l1', axis=1)
    P = sparse.csr_matrix(P)
    eigenvals, eigenvecs = eigs(P, k=2, which='LM')
    # With k=2, index 0 is the smaller of the top two eigenvalues.
    lambda_2 = float(sorted(eigenvals)[0])
    Z_uu = 1.0 / (1.0 - lambda_2)
    m = G.number_of_edges()
    ct_numerator = float(deg_u + 2 * c * t_u)
    # Total triangles: the node-wise sum counts each triangle 3 times.
    t_G = sum(nx.triangles(G).values()) // 3
    pi_u = ct_numerator / (2 * m + 6 * c * t_G)
    ct_factor = (2 * Z_uu + pi_u - 1.0) / (pi_u ** 2)
    t_est = []      # estimate of t after each return
    t_std_Zuu = []  # matching std.-dev. estimates based on Z_uu
    Tus = []        # individual return times
    vertex = u
    while k < 10000:
        # list() because networkx >= 2 returns an iterator of neighbors.
        neighbors = list(G.neighbors(vertex))
        # Weighted transition probabilities out of the current vertex.
        p = np.array([edge_weights[(vertex, nb)] for nb in neighbors])
        vertex = np.random.choice(neighbors, p=p / p.sum())
        # Check if we have a new return to the start vertex.
        if vertex == u:
            Tus.append(t - Z_total)
            Z_total = t
            k += 1
            # Estimator: t ~ Z_total*(deg(u)+2c*t_u)/(6k) - m/3,
            # clamped at zero.
            curr_est = max(0,
                float(Z_total * (deg_u + 2 * c * t_u)) / (6 * k) -
                float(m) / 3)
            t_est.append(curr_est)
            curr_std_Zuu = math.sqrt(ct_numerator ** 2 / (36 * k) *
                                     ct_factor)
            t_std_Zuu.append(curr_std_Zuu)
            if k % 100 == 0:
                print("Estimate of t at", k, ":", curr_est, "std:", curr_std_Zuu)
        t += 1
    np.array(t_est).tofile('t_est_' + str(run) + str(c) + '.npy')
    np.array(t_std_Zuu).tofile('t_std_Zuu_' + str(run) + str(c) + '.npy')
    np.array(Tus).tofile('t_Tus' + str(run) + str(c) + '.npy')
def cycle_edges(G, u, run):
    """Estimate the number of edges of G with the cycle formula,
    averaging return-cycle lengths of a simple random walk from u.

    Writes the running estimates and std. devs. to raw binary files
    tagged with `run`.

    Args:
        G: undirected networkx graph with consecutive integer labels.
        u: start vertex of the walk.
        run: integer tag used in the output file names.
    """
    prev_R_u = 0.0  # running average of cycle lengths so far
    R_u = 0.0       # length of the current (in-progress) cycle
    # Quantities needed for estimating the variance of R_u.
    deg_u = G.degree(u)
    pi_u = deg_u / (2.0 * G.number_of_edges())
    #lambda_2 = 0.91063444938 # taken from estimate_edges() - generated
    # HACK: hard-coded second eigenvalue (from estimate_edges on the
    # Google graph); adjust when running on other graphs.
    lambda_2 = 0.999589228299 # taken from estimate_edges() - Google
    Z_vv = 1.0 / (1.0 - lambda_2)
    ct_factor = (2 * Z_vv + pi_u - 1.0) / (pi_u ** 2)
    k = 0      # number of completed cycles (returns to u)
    m_est = [] # estimate of m after each return
    m_std = [] # matching std.-dev. estimates
    vertex = u
    while k < 8000:
        # Simple random walk: uniform neighbor. list() is required
        # because networkx >= 2 returns an iterator from neighbors().
        vertex = random.choice(list(G.neighbors(vertex)))
        R_u += 1.0
        # Check if we have a new return to the start vertex.
        if vertex == u:
            # Fold the finished cycle into the running average.
            R_u = (prev_R_u * k + R_u) / (k + 1)
            k += 1
            # Estimator: m ~ deg(u) * E[cycle length] / 2.
            curr_est = 0.5 * deg_u * R_u
            m_est.append(curr_est)
            curr_std = math.sqrt(ct_factor / k)
            m_std.append(curr_std)
            if k % 100 == 0:
                print("Estimate of m at", k, ":", curr_est, "w/ std:", curr_std)
            prev_R_u = R_u
            R_u = 0.0
    np.array(m_est).tofile('m_est_' + str(run) + '_cycle.npy')
    np.array(m_std).tofile('m_std_' + str(run) + '_cycle.npy')
def cycle_nodes(G, u, run):
    """Estimate the number of nodes of G with the cycle formula,
    accumulating 1/deg(v) along return cycles of a simple random walk
    from u.

    Writes the running estimates to a raw binary file tagged with `run`.

    Args:
        G: undirected networkx graph with consecutive integer labels.
        u: start vertex of the walk.
        run: integer tag used in the output file name.
    """
    prev_R_u = 0.0  # running average of per-cycle sums so far
    R_u = 0.0       # sum of 1/deg over the current (in-progress) cycle
    deg_u = G.degree(u)
    k = 0           # number of completed cycles (returns to u)
    # Pre-compute degrees for fast lookup inside the walk loop.
    degs = np.empty((G.number_of_nodes(),), dtype=int)
    for node in G.nodes():
        degs[node] = G.degree(node)
    n_est = []  # estimate of n after each return
    vertex = u
    while k < 1000:
        # Simple random walk: uniform neighbor. list() is required
        # because networkx >= 2 returns an iterator from neighbors().
        vertex = random.choice(list(G.neighbors(vertex)))
        R_u += 1.0 / float(degs[vertex])
        # Check if we have a new return to the start vertex.
        if vertex == u:
            # Fold the finished cycle into the running average.
            R_u = (prev_R_u * k + R_u) / (k + 1)
            k += 1
            # Estimator: n ~ deg(u) * E[sum of 1/deg over a cycle].
            curr_est = deg_u * R_u
            n_est.append(curr_est)
            if k % 100 == 0:
                print("Estimate of n at", k, ":", curr_est)
            prev_R_u = R_u
            R_u = 0.0
    np.array(n_est).tofile('n_est_' + str(run) + '_cycle.npy')
def cycle_triangles(G, u, run):
    """Estimate the number of triangles in G with the cycle formula,
    accumulating triangles(v)/deg(v) along return cycles of a simple
    random walk from u.

    Writes the running estimates to a raw binary file tagged with `run`.

    Args:
        G: undirected networkx graph with consecutive integer labels.
        u: start vertex of the walk.
        run: integer tag used in the output file name.
    """
    prev_R_u = 0.0  # running average of per-cycle sums so far
    R_u = 0.0       # sum over the current (in-progress) cycle
    deg_u = G.degree(u)
    k = 0           # number of completed cycles (returns to u)
    # Pre-compute degrees for fast lookup inside the walk loop.
    degs = np.empty((G.number_of_nodes(),), dtype=float)
    for node in G.nodes():
        degs[node] = G.degree(node)
    # Pre-compute per-node triangle counts.
    ts = np.empty((G.number_of_nodes(),), dtype=float)
    for node in G.nodes():
        ts[node] = nx.triangles(G, node)
    t_est = []  # estimate of t after each return
    vertex = u
    while k < 1000:
        # Simple random walk: uniform neighbor. list() is required
        # because networkx >= 2 returns an iterator from neighbors().
        vertex = random.choice(list(G.neighbors(vertex)))
        R_u += ts[vertex] / degs[vertex]
        # Check if we have a new return to the start vertex.
        if vertex == u:
            # Fold the finished cycle into the running average.
            R_u = (prev_R_u * k + R_u) / (k + 1)
            k += 1
            # Estimator: t ~ deg(u)/3 * E[sum of tri(v)/deg(v)].
            curr_est = 1.0 / 3.0 * deg_u * R_u
            t_est.append(curr_est)
            if k % 100 == 0:
                print("Estimate of t at", k, ":", curr_est)
            prev_R_u = R_u
            R_u = 0.0
    np.array(t_est).tofile('t_est_' + str(run) + '_cycle.npy')
def evaluation(G, u):
    """Run every estimator ten times on graph G, starting each walk at
    vertex u. Each call writes its own output files tagged by run id.
    """
    runs = range(1, 11)
    # Return-time based estimators (random walk).
    for run in runs:
        estimate_edges(G, u, run)
    for run in runs:
        estimate_nodes(G, u, run)
    for run in runs:
        estimate_triangles(G, u, 0.1, run)
        estimate_triangles(G, u, 1.0, run)
    # Cycle-formula based estimators.
    for run in runs:
        cycle_edges(G, u, run)
    for run in runs:
        cycle_nodes(G, u, run)
    for run in runs:
        cycle_triangles(G, u, run)
def load_graph(gen=True):
    """Load a pickled graph, relabel its nodes to consecutive integers,
    print basic statistics, and return it.

    Args:
        gen: if True load the generated graph ('HTCM.gpickle'),
            otherwise the Google graph ('Google.gpickle').

    Returns:
        The loaded graph with integer node labels.
    """
    if gen:
        G = read_gpickle('HTCM.gpickle')
    else:
        G = read_gpickle('Google.gpickle')
    G = convert_node_labels_to_integers(G)
    print(info(G))
    # Node-wise triangle counts sum to 3x the number of triangles.
    print("Triangles in gen. graph:", sum(nx.triangles(G).values()) // 3)
    # FIX: the original `max_deg1 = 0, max_deg2 = 0` / `u1 = 0, u2 = 0`
    # lines were invalid syntax, and the loop referenced the undefined
    # names `max_deg` and `u`; track a single maximum-degree node.
    max_deg = 0
    u = 0
    for node in G.nodes():
        if max_deg < G.degree(node):
            max_deg = G.degree(node)
            u = node
    print("Max degree", max_deg, "at node", u, "(belonging to",
          nx.triangles(G, u), "triangles)")
    return G
if __name__ == '__main__':
    # Load generated graph.
    G1 = load_graph()
    # Load Google graph.
    G2 = load_graph(gen=False)
    # FIX: `u` was undefined here. Start each evaluation from the
    # highest-degree node of its own graph, matching the node that
    # load_graph reports.
    u1 = max(G1.nodes(), key=G1.degree)
    u2 = max(G2.nodes(), key=G2.degree)
    evaluation(G1, u1)
    evaluation(G2, u2)