In [1]:
import random
import scipy as sp
import timeit
import sys
import networkx as nx
import scipy


In [2]:
import numpy as np
def wikipr(M, eps=1.0e-8, d=0.85):
    """Dense power-iteration PageRank.

    Parameters
    ----------
    M : 2-D array-like with a ``shape`` attribute
        Link matrix; it is multiplied from the left onto a column vector, so
        presumably it is expected to be column-stochastic (verify with caller).
    eps : float
        Iteration stops once the Euclidean norm of the change between two
        successive iterates is no longer greater than ``eps``.
    d : float
        Damping factor.

    Returns
    -------
    The final iterate, a column vector with ``M.shape[1]`` rows.

    Notes
    -----
    The start vector is drawn from ``np.random.rand`` and scaled to unit L1
    norm, so intermediate iterates depend on the global NumPy random state.
    There is no iteration cap.
    """
    n_nodes = M.shape[1]
    teleport = (1 - d) / n_nodes

    rank = np.random.rand(n_nodes, 1)
    rank = rank / np.linalg.norm(rank, 1)

    # Sentinel far away from any normalised vector so the first check passes.
    previous = np.full((n_nodes, 1), 100, dtype=np.float32)

    while True:
        if not np.linalg.norm(rank - previous, 2) > eps:
            break
        previous = rank
        rank = d * (M @ rank) + teleport
    return rank
#wikipr(A.todense())

In [24]:
def pagerank_scipy(G, M=None, alpha=0.85, personalization=None,
                   max_iter=100, tol=1.0e-6, weight='weight',
                   dangling=None):
    """PageRank of the nodes of ``G`` via sparse power iteration.

    Parameters
    ----------
    G : NetworkX graph
        Graph whose nodes are ranked.
    M : ignored, optional
        Kept only so that existing positional calls ``pagerank_scipy(G, A, ...)``
        keep working. The adjacency matrix is always rebuilt from ``G``.
    alpha : float
        Damping factor.
    personalization : dict, optional
        Node -> weight. Must contain every node; it is normalised to sum to 1.
        Defaults to the uniform distribution.
    max_iter : int
        Maximum number of power-iteration steps.
    tol : float
        Convergence threshold on the Euclidean norm of the change between two
        successive iterates.
    weight : str
        Edge attribute passed to ``nx.to_scipy_sparse_matrix`` as the weight key.
    dangling : dict, optional
        Node -> weight used to redistribute the rank held by nodes whose
        outgoing weight sums to zero. Must contain every node; it is
        normalised to sum to 1. Defaults to the personalization vector.

    Returns
    -------
    dict
        Node -> PageRank score (Python floats). Empty dict for an empty graph.

    Raises
    ------
    nx.NetworkXError
        If ``personalization`` or ``dangling`` misses a node, or if the
        iteration does not converge within ``max_iter`` steps.
    """
    import scipy.sparse

    N = len(G)
    if N == 0:
        return {}

    nodelist = list(G.nodes())
    adjacency = nx.to_scipy_sparse_matrix(G, nodelist=nodelist, weight=weight,
                                          dtype=float)

    # Row-normalise: scale every row with a non-zero sum so that it sums to 1.
    S = np.asarray(adjacency.sum(axis=1)).flatten()
    S[S != 0] = 1.0 / S[S != 0]
    Q = scipy.sparse.spdiags(S, 0, *adjacency.shape, format='csr')
    transition = Q * adjacency

    # Initial vector: uniform distribution.
    x = np.repeat(1.0 / N, N)

    # Personalization vector
    if personalization is None:
        p = np.repeat(1.0 / N, N)
    else:
        missing = set(nodelist) - set(personalization)
        if missing:
            raise nx.NetworkXError('Personalization vector dictionary '
                                   'must have a value for every node. '
                                   'Missing nodes %s' % missing)
        p = np.array([personalization[n] for n in nodelist], dtype=float)
        p = p / p.sum()

    # Dangling nodes
    if dangling is None:
        dangling_weights = p
    else:
        missing = set(nodelist) - set(dangling)
        if missing:
            raise nx.NetworkXError('Dangling node dictionary '
                                   'must have a value for every node. '
                                   'Missing nodes %s' % missing)
        # Convert the dangling dictionary into an array in nodelist order
        dangling_weights = np.array([dangling[n] for n in nodelist],
                                    dtype=float)
        dangling_weights /= dangling_weights.sum()
    # Rows whose sum was zero are left at zero by the inversion above.
    is_dangling = np.where(S == 0)[0]

    # power iteration: make up to max_iter iterations
    for iteration in range(max_iter):
        xlast = x
        # ``x * transition`` is a vector-matrix product (sparse matrix semantics).
        x = alpha * (x * transition
                     + x[is_dangling].sum() * dangling_weights) \
            + (1 - alpha) * p
        # check convergence, Euclidean (L2) norm of the update
        err = np.linalg.norm(x - xlast)
        # Per-iteration trace; the recorded notebook outputs rely on it.
        print (iteration, err)
        if err < tol:
            return dict(zip(nodelist, map(float, x)))
    raise nx.NetworkXError('pagerank_scipy: power iteration failed to converge '
                           'in %d iterations.' % max_iter)

In [25]:
from __future__ import division

import scipy as sp
import scipy.sparse as sprs
import scipy.spatial
import scipy.sparse.linalg

def pagerank_power(G, p=0.85, max_iter=100,
                   tol=1e-06, personalize=None, reverse=False):
    """ Calculates pagerank given a csr graph

    Inputs:
    -------
    G: a csr graph.
    p: damping factor
    max_iter: maximum number of iterations
    tol: the iteration stops once the Euclidean norm of the change between
         two successive iterates drops below this value
    personalize: if not None, should be an array (or sequence) with the size
                 of the nodes containing probability distributions.
                 It will be normalized automatically.
    reverse: If true, returns the reversed-pagerank

    Returns:
    --------
    Pagerank Scores for the nodes (1-D array, normalised to sum to 1).
    If ``max_iter`` is exhausted the last iterate is returned.

    """
    # In Moler's algorithm, $G_{ij}$ represents the existences of an edge
    # from node $j$ to $i$, while we have assumed the opposite!
    if not reverse:
        G = G.T

    n, _ = G.shape
    # Column sums of the (possibly transposed) matrix.
    c = np.asarray(G.sum(axis=0)).reshape(-1)

    # Columns with a non-zero sum; only those get an inverse scaling entry.
    k = c.nonzero()[0]

    D = sprs.csr_matrix((1 / c[k], (k, k)), shape=(n, n))

    if personalize is None:
        personalize = np.ones(n)
    # asarray lets callers pass a plain list as well as an ndarray.
    personalize = np.asarray(personalize, dtype=float).reshape(n, 1)
    e = (personalize / personalize.sum()) * n

    # Per-column teleport weight: (1 - p) / n where the column sum is
    # non-zero, 1 / n where it is zero.
    z = (((1 - p) * (c != 0) + (c == 0)) / n)[np.newaxis, :]
    G = p * G.dot(D)

    x = e / n

    for iteration in range(max_iter):
        oldx = x
        x = G.dot(x) + e.dot(z.dot(x))
        err = np.linalg.norm(x - oldx)
        # Per-iteration trace; the recorded notebook outputs rely on it.
        print (iteration, err)
        if err < tol:
            break
    x = x / x.sum()

    return x.reshape(-1)

In [19]:
# NOTE(review): `passed`, `G_size` and `p` are not read by any cell visible
# here; they are kept in case other cells rely on them.
passed=True

G_size = 100
p=0.6

# Benchmark graph: directed G(n, m) random graph with 1000 nodes and 500000
# edges. The seed is fixed so that Restart & Run All rebuilds the same graph
# and the timings / convergence traces below are comparable between runs.
GRAPH_SEED = 42
G = nx.gnm_random_graph(1000, 500000, seed=GRAPH_SEED, directed=True)

# Sparse adjacency matrix in the node order of G; this is the matrix handed
# to both PageRank implementations.
nodelist = G.nodes()
A = nx.to_scipy_sparse_matrix(G, nodelist=nodelist, weight='weight',
                              dtype=float)

A.todense()

matrix([[0., 0., 1., ..., 0., 0., 0.],
        [0., 0., 1., ..., 0., 1., 0.],
        [0., 0., 0., ..., 0., 1., 1.],
        ...,
        [0., 0., 0., ..., 0., 0., 1.],
        [0., 0., 1., ..., 0., 0., 0.],
        [1., 0., 0., ..., 0., 0., 0.]])

In [26]:
# Mean wall-clock seconds per call of pagerank_scipy, averaged over 5 runs.
timeit.timeit(lambda: pagerank_scipy(G, A, alpha=0.85, tol=1e-6, max_iter=1000),
              number=5) / 5

0 0.0008138007236708838
1 2.1738062879568022e-05
2 5.635881352469846e-07
0 0.0008138007236708838
1 2.1738062879568022e-05
2 5.635881352469846e-07
0 0.0008138007236708838
1 2.1738062879568022e-05
2 5.635881352469846e-07
0 0.0008138007236708838
1 2.1738062879568022e-05
2 5.635881352469846e-07
0 0.0008138007236708838
1 2.1738062879568022e-05
2 5.635881352469846e-07


1.5154891875999965

In [23]:
# Mean wall-clock seconds per call of pagerank_power, averaged over 5 runs.
timeit.timeit(
    lambda: pagerank_power(A, p=0.85, tol=1e-6, max_iter=1000),
    number=5,
) / 5

0 0.0008138007236708885
1 2.1738062879566033e-05
2 5.635881352460045e-07
0 0.0008138007236708885
1 2.1738062879566033e-05
2 5.635881352460045e-07
0 0.0008138007236708885
1 2.1738062879566033e-05
2 5.635881352460045e-07
0 0.0008138007236708885
1 2.1738062879566033e-05
2 5.635881352460045e-07
0 0.0008138007236708885
1 2.1738062879566033e-05
2 5.635881352460045e-07


0.033681842200030586

In [None]:
import cProfile
# Profile one run of pagerank_scipy. The second positional argument (the
# adjacency matrix) is passed explicitly to match the function's signature.
cProfile.run('pagerank_scipy(G, A, alpha=0.85, tol=1e-6, max_iter=100)')