In [1]:
import numpy as np
import pandas as pd
import sys
import re
from scipy import sparse

### Read SNAP data into list of lists

In [18]:
# fname = './data/NCAA_football.csv'
fname = './data/amazon0505.txt'
with open(fname, 'r') as f:
    lines = [[int(node) for node in re.split('\t',edge.strip('\n'))[:2]] for edge in f.readlines() if edge[0][0] != '#']

In [19]:
edges = np.array(lines)

### Get unique list of nodes

In [20]:
nodes = np.unique(np.array(lines))

### Convert lines to scipy.sparse.lil

In [21]:
adj = sparse.lil_matrix((len(nodes), len(nodes)))

In [22]:
adj[edges[:,0], edges[:,1]] = 1

### Connect sink nodes to themselves

In [23]:
degOut = adj.getnnz(axis = 1) # num of non zero values in row

adj.setdiag(degOut == 0) # more efficient with lilmatrix
adj = adj.tocsr()
degOut = adj.getnnz(axis = 1) # num of non zero values in row

### Scale matrix by outgoing edges

In [24]:
degOutRep = np.repeat(degOut, degOut) # degOut is the same as number of data points in row
adj.data = adj.data / degOutRep

### Initialize pageranks

In [None]:
p = []

# --- testing ---

In [25]:
fname = './data/eg.txt'
with open(fname, 'r') as f:
    lns = [[int(node) for node in re.split('\t',edge.strip('\n'))[:2]] for edge in f.readlines() if edge[0][0] != '#']
    
e = np.array(lns)
n = np.unique(e)

In [26]:
p = [1/len(n)]*len(n) # page rank

m = sparse.lil_matrix((len(n),len(n)))
m[e[:,0],e[:,1]] =  1
msumrow = m.sum(axis = 1) 
m.setdiag(msumrow == 0)

In [27]:
c = sparse.csr_matrix([[2,4,6], [5,10,15]])
d = np.array([2,5])

val = np.repeat(d, c.getnnz(axis=1))
c.data = c.data / val
print(c.todense())

[[1. 2. 3.]
 [1. 2. 3.]]


In [12]:
c = m.tocsr()
d = np.array(m.sum(axis=1))

val = np.repeat(d, c.getnnz(axis=1))

c.data = c.data / val

print(c.todense())

[[0.  1.  0.  0. ]
 [0.  0.  0.5 0.5]
 [0.  0.  1.  0. ]
 [0.5 0.  0.5 0. ]]


In [13]:
m = m.tocsr()
m /= msumrow
print(m)

[[0.  1.  0.  0. ]
 [0.  0.  0.5 0.5]
 [nan nan inf nan]
 [0.5 0.  0.5 0. ]]


  return np.true_divide(self.todense(), other)
  return np.true_divide(self.todense(), other)


In [14]:
# print(m.todense())
# print(np.array((m.sum(axis=1) == 0).flatten())[0])
mask = np.array((m.sum(axis=1) == 0).flatten())[0]
# print(mask)
print(m[mask])

[]


In [15]:
def set_sink_diags(adjm):
    # takes adjacency matrix
    # returns updated matrix whose diagonal will be 1 if that row is a sink node
    sums = adjm.sum(axis=1)
    for i in range(adjm.shape[0]):
        if sums[i] == 0:
            adjm[i,i] = 1

In [16]:
m /= m.sum(axis=1)
print(m)

[[0.  1.  0.  0. ]
 [0.  0.  0.5 0.5]
 [nan nan nan nan]
 [0.5 0.  0.5 0. ]]
