In [1]:
import numpy as np
import pandas as pd
import sys
import re
from scipy import sparse
from tabulate import tabulate

### Read SNAP data into list of lists

In [2]:
# fname = './data/NCAA_football.csv'
# with open(fname, 'r') as f:
#     lines = [[int(node) for node in re.split('\t',edge.strip('\n'))[:2]] for edge in f.readlines() if edge[0][0] != '#']
# edges = np.array(lines)
# numItems = len(np.unique(np.array(lines)))

### Read small datasets into list of lists

In [3]:
# fname = './data/karate.csv'
fname = './data/NCAA_football.csv'
# fname = './data/eg.txt'
df = pd.read_csv(fname, header=None, usecols=[i for i in range(4)])
if type(df[2][0]) == str:
    df[2] = df[2].str.replace('"', '').str.strip()

names = sorted(np.unique(np.concatenate((df[0].unique(),df[2].unique()))))

a = np.array(df[0].apply(names.index))   
b = np.array(df[2].apply(names.index))
edges = np.array([b,a]).T
numItems = len(names)

### Create adjacency matrix 

In [4]:
adj = sparse.lil_matrix((numItems, numItems))

In [5]:
adj[edges[:,0], edges[:,1]] = 1

### Connect sink nodes to themselves

In [6]:
degOut = adj.getnnz(axis = 1) # num of non zero values in row

adj.setdiag(degOut == 0) # more efficient with lilmatrix
adj = adj.tocsr()
degOut = adj.getnnz(axis = 1) # num of non zero values in row

### Scale matrix by outgoing edges

In [7]:
degOutRep = np.repeat(degOut, degOut) # degOut is the same as number of data points in row
adj.data = adj.data / degOutRep

### Initialize pageranks

In [22]:
print(names)

['Adams State', 'Air Force', 'Akron', 'Alabama', 'Alabama A&M', 'Alabama State', 'Albany', 'Albion', 'Alcorn State', 'Angelo State', 'Appalachian State', 'Arizona', 'Arizona State', 'Arkansas', 'Arkansas State', 'Arkansas-Monticello', 'Arkansas-Pine Bluff', 'Army', 'Assumption', 'Auburn', 'Austin Peay', 'Azusa Pacific', 'Baker University', 'Ball State', 'Baylor', 'Benedict', 'Bentley College', 'Bethune-Cookman', 'Birmingham Southern', 'Boise State', 'Boston College', 'Bowling Green', 'Brigham Young', 'Brown', 'Bryant University', 'Bucknell', 'Buffalo', 'Butler', 'Cal Poly', 'California', 'Campbell', 'Carthage', 'Central Arkansas', 'Central Connecticut State', 'Central Methodist', 'Central Michigan', 'Central State', 'Central Washington', 'Charleston Southern', 'Chattanooga', 'Chowan', 'Cincinnati', 'Citadel', 'Clark Atlanta', 'Clemson', 'Coastal Carolina', 'Colgate', 'Colorado', 'Colorado State', 'Columbia', 'Concordia College', 'Concordia University (WI)', 'Connecticut', 'Cornell', 'C

In [26]:
print(df[df[0] == "Utah"])
print(df[df[2] == "Utah"])

         0   1                2   3
47    Utah  25         Michigan  23
227   Utah  42             UNLV  21
332   Utah  58       Utah State  10
397   Utah  30        Air Force  23
551   Utah  37      Weber State  21
567   Utah  31     Oregon State  28
706   Utah  40          Wyoming   7
807   Utah  49   Colorado State  16
1100  Utah  13       New Mexico  10
1108  Utah  13              TCU  10
1321  Utah  63  San Diego State  14
1410  Utah  48    Brigham Young  24
1506  Utah  31          Alabama  17
Empty DataFrame
Columns: [0, 1, 2, 3]
Index: []


### Iterate pagerank

In [8]:
p = [1/adj.shape[0]] * adj.shape[0] 

In [9]:
d = 0.9
jProb = [(1-d)/len(p)] * len(p)

In [10]:
for i in range(30):
    p = adj.T*p*d + jProb

In [11]:
print(sum(p))

1.000000000000001


In [12]:
output = []
for i in range(len(p)):
    output.append([names[i],p[i]])
output = pd.DataFrame(output, columns = ['name', 'pageRank'])
output = output.sort_values(by=['pageRank'], ascending=False)
output = output.reset_index().drop(columns=['index'])

print(tabulate(output, headers='keys', tablefmt='psql'))
# i=1
# for r in output.itertuples():
#     print(f"{i} {r[1]} with pagerank: {r[2]}")
#     i+=1

+-----+-----------------------------+-------------+
|     | name                        |    pageRank |
|-----+-----------------------------+-------------|
|   0 | Utah                        | 0.168811    |
|   1 | Mississippi                 | 0.0342903   |
|   2 | Florida                     | 0.0270402   |
|   3 | Wake Forest                 | 0.0170967   |
|   4 | Oklahoma                    | 0.0155623   |
|   5 | Alabama                     | 0.0152552   |
|   6 | Texas Tech                  | 0.0152301   |
|   7 | Virginia Tech               | 0.0145567   |
|   8 | Oregon State                | 0.0139949   |
|   9 | Vanderbilt                  | 0.0135798   |
|  10 | Boston College              | 0.0127457   |
|  11 | Texas                       | 0.0126105   |
|  12 | Georgia Tech                | 0.0120252   |
|  13 | North Carolina Pembroke     | 0.012012    |
|  14 | South Carolina              | 0.0113403   |
|  15 | Virginia                    | 0.0107675   |
|  16 | USC 

# --- testing ---

In [13]:
fname = './data/eg.txt'
with open(fname, 'r') as f:
    lns = [[int(node) for node in re.split('\t',edge.strip('\n'))[:2]] for edge in f.readlines() if edge[0][0] != '#']
    
e = np.array(lns)
n = np.unique(e)

In [14]:
p = [1/len(n)]*len(n) # page rank

m = sparse.lil_matrix((len(n),len(n)))
m[e[:,0],e[:,1]] =  1
msumrow = m.sum(axis = 1) 
m.setdiag(msumrow == 0)

In [15]:
c = sparse.csr_matrix([[2,4,6], [5,10,15]])
d = np.array([2,5])

val = np.repeat(d, c.getnnz(axis=1))
c.data = c.data / val
print(c.todense())

[[1. 2. 3.]
 [1. 2. 3.]]


In [16]:
c = m.tocsr()
d = np.array(m.sum(axis=1))

val = np.repeat(d, c.getnnz(axis=1))

c.data = c.data / val

print(c.todense())

[[0.  1.  0.  0. ]
 [0.  0.  0.5 0.5]
 [0.  0.  1.  0. ]
 [0.5 0.  0.5 0. ]]


In [17]:
m = m.tocsr()
m /= msumrow
print(m)

[[0.  1.  0.  0. ]
 [0.  0.  0.5 0.5]
 [nan nan inf nan]
 [0.5 0.  0.5 0. ]]


  return np.true_divide(self.todense(), other)
  return np.true_divide(self.todense(), other)


In [18]:
# print(m.todense())
# print(np.array((m.sum(axis=1) == 0).flatten())[0])
mask = np.array((m.sum(axis=1) == 0).flatten())[0]
# print(mask)
print(m[mask])

[]


In [19]:
def set_sink_diags(adjm):
    # takes adjacency matrix
    # returns updated matrix whose diagonal will be 1 if that row is a sink node
    sums = adjm.sum(axis=1)
    for i in range(adjm.shape[0]):
        if sums[i] == 0:
            adjm[i,i] = 1

In [20]:
m /= m.sum(axis=1)
print(m)

[[0.  1.  0.  0. ]
 [0.  0.  0.5 0.5]
 [nan nan nan nan]
 [0.5 0.  0.5 0. ]]
