In [1]:
import matplotlib.pyplot as plt
import numpy as np
import requests
import os

In [2]:
# Download the two example hyperlink-graph files, parse them, and always
# remove the temporary local copies afterwards.
BASE_URL = "http://webdatacommons.org/hyperlinkgraph/data"
DATA_FILES = ("example_index", "example_arcs")

try:
    for filename in DATA_FILES:
        # Fetch first and only then open the file, so a failed request does
        # not leave an empty file behind.
        response = requests.get(f"{BASE_URL}/{filename}", timeout=30)
        # Fail loudly on an HTTP error instead of saving the error page and
        # letting np.loadtxt choke on it later.
        response.raise_for_status()
        with open(filename, "wb") as f:
            f.write(response.content)

    # Only the first column of the index file is kept (the node names).
    nodes = np.loadtxt("example_index", dtype=object)[:, 0]
    # Arcs are read as integers; presumably one (source, target) pair per row.
    edges = np.loadtxt("example_arcs", dtype=int)
finally:
    # Portable cleanup (no shell `rm`) that also runs when the download or
    # the parsing raises.
    for filename in DATA_FILES:
        if os.path.exists(filename):
            os.remove(filename)

n_nodes = nodes.shape[0]
n_edges = edges.shape[0]

### Graph Init

In [33]:
# Out-degree of every node: the number of arcs leaving it.
degrees = [0] * n_nodes
# In-neighbour lists: adj[v] collects every node u that has an arc u -> v.
adj = [[] for _ in range(n_nodes)]

for src, dst in edges:
    adj[dst].append(src)
    degrees[src] += 1

### Algorithm

In [40]:
import tqdm

EPOCH = 100  # number of update sweeps over all nodes
beta = 0.85  # weight of rank arriving over links; 1 - beta is teleported
r = np.ones(n_nodes) / n_nodes  # start from the uniform distribution
residuals = []  # L1 change of the rank vector after each sweep

# Every node receives the same teleport share on every sweep.
teleport = (1 - beta) / n_nodes

for epoch in tqdm.trange(EPOCH):
    new_r = np.zeros_like(r)

    for n, sources in enumerate(adj):
        # Each in-neighbour v splits its current rank evenly over its
        # outgoing arcs; node n collects its share from all of them.
        inflow = sum(beta * r[v] / degrees[v] for v in sources)
        new_r[n] = inflow + teleport

    # Rank sent to dead ends (nodes without outgoing arcs) is lost, so the
    # vector is rescaled to sum to 1 again.
    new_r /= new_r.sum()

    residuals.append(np.abs(r - new_r).sum())
    r = new_r

100%|██████████| 100/100 [00:00<00:00, 3739.51it/s]


In [41]:
# Sort (score, page) pairs from highest to lowest; equal scores are ordered
# by the second tuple element.
ranking = sorted(zip(r, nodes), reverse=True)

# Show the 11 best-ranked pages (positions 0 through 10).
for score, page in ranking[:11]:
    print(f"{score:.6f}\t{page}")

0.123900	blogspot.com
0.038960	creativecommons.org
0.032904	canalblog.com
0.031180	rea-group.com
0.025555	wikipedia.org
0.024800	wikimedia.org
0.023818	yahoo.com
0.021117	wordpress.com
0.020645	youtube.com
0.020454	flickr.com
0.019235	tumblr.com
