# AG-15

In [7]:
import numpy as np
import torch as torch
from torch_cluster import random_walk
from torch.distributions.geometric import Geometric
import networkit as nk
import math
import torch
import random
from torch_geometric.utils import from_networkit
from torch_geometric import EdgeIndex
import networkit as nk
import torch
import random as rd


def get_in_neighbors(G, u) -> list:
    """Return the sources of all edges in ``G`` that end at node ``u``.

    ``G`` is indexed as a two-row edge list: row 0 holds the source of each
    edge and row 1 holds its target. The result keeps edge order and
    contains one entry per matching edge, so parallel edges yield
    duplicates. An empty list is returned when no edge ends at ``u``.
    """
    sources, targets = G[0], G[1]
    # Keep only the sources whose edge points at ``u``.
    return sources[targets == u].tolist()

def get_out_degree(G, u) -> int:
    """Return how many edges in ``G`` start at node ``u``.

    Row 0 of ``G`` is read as the source of each edge; the result is the
    number of entries in that row equal to ``u`` (0 if there are none).
    """
    starts_at_u = G[0] == u
    return int(starts_at_u.sum().item())

class PPR:
    """Personalized PageRank (PPR) estimation for a single (source, target) pair.

    The estimator combines a reverse local-push phase from the target
    (``_frontier``) with forward random walks from the source
    (``fast_ppr``), following the FAST-PPR scheme.

    Args:
        G: Edge list of the graph (an ``EdgeIndex``). Row 0 holds edge
            sources, row 1 edge targets; ``G.num_rows`` is used as the
            number of nodes.
        alpha: Success probability of the geometric distribution the walk
            lengths are drawn from; also the initial estimate/residual
            placed on the target node.
        delta: PPR significance threshold. The reverse threshold is
            ``sqrt(delta)`` and the number of walks is
            ``ceil(c * sqrt(delta) / delta)``.
    """

    def __init__(self, G, alpha, delta):
        self._G = G  # EdgeIndex
        self._alpha = alpha
        self._delta = delta
        self._epsilon = np.sqrt(delta)  # reverse threshold epsilon_r
        self._beta = 1 / 6  # epsilon_inv = beta * epsilon_r
        self._c = 350  # scales the number of random walks

    def fast_ppr(self, s, t):
        """Estimate the PPR of target ``t`` with respect to source ``s``.

        Args:
            s: Source node index.
            t: Target node index.

        Returns:
            A tuple ``(estimate, explored)`` where ``estimate`` is the PPR
            estimate (a float) and ``explored`` is the combined size of the
            target set and the frontier set computed for ``t``.
        """
        t_set, f_set, pi_inv = self._frontier(t)
        explored = len(t_set | f_set)
        if s in t_set:
            # The reverse phase already reached s: use its estimate directly.
            return pi_inv[s].item(), explored

        number_of_walks = int(np.ceil(self._c * self._epsilon / self._delta))
        # Both objects are loop-invariant, so build them once.
        walk_length_dist = Geometric(torch.tensor([self._alpha]))
        start = torch.tensor([s])

        sum_pi_inv = 0.0
        for _ in range(number_of_walks):
            # A geometric sample can be 0; the walk is kept at least one
            # step long.
            L = max(1, int(np.ceil(walk_length_dist.sample().item())))
            walk = random_walk(self._G[0], self._G[1], start, walk_length=L).flatten()
            # Each walk contributes the estimate of the FIRST frontier node
            # it hits (0 if it never hits the frontier). Summing over every
            # frontier visit would count a single walk several times and
            # inflate the estimate.
            sum_pi_inv += self._first_frontier_hit_value(walk, f_set, pi_inv)
        return sum_pi_inv / number_of_walks, explored

    @staticmethod
    def _first_frontier_hit_value(walk, f_set, pi_inv):
        """Return ``pi_inv`` at the first node of ``walk`` that lies in ``f_set``.

        Args:
            walk: 1-D tensor of node indices in visiting order.
            f_set: Set of frontier node indices (Python ints).
            pi_inv: Tensor of inverse-PPR estimates indexed by node.

        Returns:
            The estimate of the first frontier node on the walk as a float,
            or ``0.0`` if the walk never touches the frontier.
        """
        for node in walk.tolist():
            if node in f_set:
                return pi_inv[node].item()
        return 0.0

    def _frontier(self, t):
        """Run the reverse local-push phase from target ``t``.

        Repeatedly picks a node whose residual exceeds
        ``alpha * epsilon_inv`` and pushes that residual to its
        in-neighbours, until no such node remains.

        Args:
            t: Target node index.

        Returns:
            A tuple ``(target_set, frontier_set, estimate_vec)``:
            ``target_set`` contains ``t`` and every node whose estimate
            passed the threshold, ``frontier_set`` contains the
            in-neighbours of target-set nodes that are not themselves in
            the target set, and ``estimate_vec`` is a tensor of inverse-PPR
            estimates indexed by node.
        """
        error_inv = self._beta * self._epsilon
        residual_threshold = error_inv * self._alpha

        estimate_vec = torch.zeros(self._G.num_rows)
        estimate_vec[t] = self._alpha
        residual_vec = torch.clone(estimate_vec)
        target_set = {t}
        frontier_set = set()
        residual_vec_bigger_ix = (residual_vec > residual_threshold).nonzero().flatten()

        while residual_vec_bigger_ix.size(dim=0) > 0:
            w = residual_vec_bigger_ix[0].item()
            for u in get_in_neighbors(self._G, w):
                # u has at least the edge u -> w, so its out-degree is >= 1.
                capital_delta = (1.0 - self._alpha) * residual_vec[w].item() / get_out_degree(self._G, u)
                estimate_vec[u] = estimate_vec[u] + capital_delta
                residual_vec[u] = residual_vec[u] + capital_delta
                # NOTE(review): membership is thresholded at epsilon_inv here;
                # the FAST-PPR paper's FRONTIER routine appears to use
                # epsilon_r (self._epsilon) instead — confirm before changing.
                if estimate_vec[u].item() > error_inv:
                    target_set.add(u)
                    frontier_set.update(get_in_neighbors(self._G, u))
            residual_vec[w] = 0
            residual_vec_bigger_ix = (residual_vec > residual_threshold).nonzero().flatten()

        # Frontier nodes are by definition outside the target set.
        frontier_set = frontier_set - target_set
        return target_set, frontier_set, estimate_vec

In [8]:
# Load the food-web graph with NetworKit, then wrap its edge list in a
# PyG EdgeIndex that records the node count and directedness.
G_nk = nk.readGraph("./foodweb-baydry.konect")
n, is_undir = G_nk.numberOfNodes(), not G_nk.isDirected()
G_pyg = EdgeIndex(
    from_networkit(G_nk)[0],  # first element of the returned tuple is the edge index
    sparse_size=(n, n),
    is_undirected=is_undir,
)

In [19]:
def run_for_alpha(alpha):
    """Run ``PPR.fast_ppr`` on 20 random node pairs and print summary statistics.

    Uses the module-level graph ``G_pyg`` and node count ``n``, with
    ``delta = 1 / n``. The ``random`` module is re-seeded with 0 on every
    call, so the same node pairs are drawn each time.

    Args:
        alpha: Value forwarded to ``PPR`` as its ``alpha`` parameter.
    """
    print("--------------------------------")

    delta = 1 / n
    print("delta:", delta)
    print("alpha:", alpha)

    estimator = PPR(G_pyg, alpha=alpha, delta=delta)

    random.seed(0)
    num_node_pairs = 20
    scores = []
    explored_sizes = []
    for _ in range(num_node_pairs):
        s = random.randint(0, n - 1)
        # Redraw the target until it differs from the source.
        t = random.randint(0, n - 1)
        while t == s:
            t = random.randint(0, n - 1)

        score, target_set_size = estimator.fast_ppr(s, t)
        scores.append(score)
        explored_sizes.append(target_set_size)

    num_non_zero = sum(1 for score in scores if score > 0)
    num_below_delta = sum(1 for score in scores if score < delta)
    # fast_ppr reports the combined size of target set and frontier set.
    average_size_target_set = sum(explored_sizes) / num_node_pairs

    print("below delta:", int(100 * num_below_delta / num_node_pairs ), "%")
    print("num below delta:", num_below_delta)
    print("non zero:", int(100 * num_non_zero / num_node_pairs ), "%")
    print("average size target set:", average_size_target_set)

In [10]:
# Experiment with alpha = 0.0001; the printed summary follows.
run_for_alpha(0.0001)

--------------------------------
delta: 0.0078125
alpha: 0.0001
below delta: 100 %
non zero: 0 %
average size target set: 1.0


In [20]:
# Experiment with alpha = 0.1; the printed summary follows.
run_for_alpha(0.1)

--------------------------------
delta: 0.0078125
alpha: 0.1
below delta: 100 %
num below delta: 20
non zero: 25 %
average size target set: 3.95


In [21]:
# Experiment with alpha = 0.3; the printed summary follows.
run_for_alpha(0.3)

--------------------------------
delta: 0.0078125
alpha: 0.3
below delta: 95 %
num below delta: 19
non zero: 60 %
average size target set: 24.75


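## a) What is the average combined size of the target set and frontier set? How many (computed) PPR values are smaller than δ?
In the recorded run with α = 0.3 and δ = 1/n = 0.0078125 (20 random node pairs), the average combined size of the target set and frontier set is 24.75, and 19 of the 20 computed PPR values (95 %) are smaller than δ. The raw output (where "average size target set" denotes the combined size of both sets) was:

delta: 0.0078125\
alpha: 0.3\
below delta: 95 %\
num below delta: 19\
non zero: 60 %\
average size target set: 24.75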

## b) What is the influence of α in the algorithm?
α can be understood as the probability that the random walk from s towards t stops at each step. It affects the frontier's diameter and the lengths of the walks from the source that are going to hit the frontier. Therefore, it should be chosen such that the random walks and the frontier meet in the "middle".

## c) What is the role of δ regarding the expected error bounds of the algorithm?
If the `fast_ppr(s, t)` score is higher than δ, then with probability higher than 0.99 the estimated PPR is at most $\max(\delta, \pi_s(t))/4$ away from the exact value.