## Q2.8 ##

### a) ###

In [20]:
import numpy as np
import os
import scipy.io as sio

# Helper functions.
def norm(x):
    """Return the square root of the sum of the squared entries of ``x``.

    The sum runs over every element of ``x``, so for a 1-D array this is
    the Euclidean length of the vector.
    """
    sum_of_squares = np.sum(x ** 2)
    return np.sqrt(sum_of_squares)

# Read the matrix stored under key 'J' in the .mat file and cast it to float64.
data_folder = 'PS02_dataSet'
adjacency_path = os.path.join(data_folder, 'pagerank_adj.mat')
J = np.float64(sio.loadmat(adjacency_path)['J'])

# Build A by dividing J by its column sums (J is assumed to be 2-D, so each
# column is scaled by its own sum), then confirm that every column of A sums
# to 1 within max_allowable_err.
J_col_sums = np.sum(J, axis=0)
A = J / J_col_sums
max_allowable_err = 1E-5
A_col_sums = np.sum(A, axis=0)
col_sum_errors = np.abs(A_col_sums - 1.0)
assert np.all(col_sum_errors < max_allowable_err), 'No way!'
print('Each column in A sums to 1.')

Each column in A sums to 1.


### b) ###

In [42]:
import plotly.offline as py
import plotly.graph_objs as go
# Initialise plotly's notebook mode before the py.iplot calls further down.
# NOTE(review): `py` is already bound to `plotly.offline` by the import above,
# so this looks up `offline` on that module a second time;
# `py.init_notebook_mode(...)` is probably what was meant -- confirm it
# behaves the same before changing it.
py.offline.init_notebook_mode(connected=True)

def init_x(length):
    """Draw a random starting vector for the iterative eigen-solvers.

    Args:
        length: Number of entries to draw (passed to NumPy as ``size``).

    Returns:
        An array of samples from the uniform distribution on [0, 1),
        taken from NumPy's global random state.
    """
    return np.random.uniform(low=0, high=1, size=length)

# Vanilla power iteration.
# Seed NumPy's global RNG so the random starting vectors drawn by init_x
# (here and in the later cells) are identical on every Restart & Run All.
RANDOM_SEED = 0
np.random.seed(RANDOM_SEED)

power_iters = 10
# One log-residual is appended per iteration.
# NOTE(review): the leading 0 is a placeholder, not a measured residual; it
# only makes the plotted curve start at x = 0. Consider replacing it with the
# log-residual of the starting vector.
power_iter_errors = [0]
x_1 = init_x(len(A))
x_1 /= norm(x_1)
# Identity matrix of the same size as A; also used by the shifted solvers
# in part c).
identity = np.identity(len(A))
for i in range(power_iters):
    # Multiply by A and rescale back to unit length.
    y = np.matmul(A, x_1)
    x_1 = y / norm(y)
    A_x = np.matmul(A, x_1)
    # Rayleigh-quotient estimate x^T A x of the eigenvalue (x_1 has unit
    # length); it is not used by the error computed below.
    eigenval = np.matmul(x_1.T, A_x)
    # Log of ||A x - x||, i.e. the residual measured against eigenvalue 1.
    err = np.log(norm(A_x - x_1))
    power_iter_errors.append(err)

# Plotting: log-residual of the power iteration against iteration number.
trace1 = go.Scatter(
    x = np.arange(len(power_iter_errors)),
    y = np.array(power_iter_errors).astype(np.float32),
    mode = 'lines',
    name = 'Vanilla Power Iteration'
)

data = [trace1]
# Give the figure a title and axis labels so it can be read on its own.
layout = go.Layout(
    title = 'Power iteration convergence',
    xaxis = dict(title = 'Iteration'),
    yaxis = dict(title = 'log ||Ax - x||')
)
fig = go.Figure(data=data, layout=layout)
py.iplot(fig, filename='power-iteration-errors')

### c) ###

In [47]:
# Shifted inverse power iteration: repeatedly apply (A - sigma*I)^-1 and
# renormalise. The shift is fixed, so the inverse is formed once up front.
sigma = 0.99
shifted_power_iter_errors = [0]
x_2 = init_x(len(A))
x_2 = x_2 / norm(x_2)
shifted_inverse = np.linalg.inv(A - sigma * identity)
for _ in range(power_iters):
    y = shifted_inverse @ x_2
    x_2 = y / norm(y)
    A_x = A @ x_2
    # Record the log of ||A x - x|| for this iteration.
    shifted_power_iter_errors.append(np.log(norm(A_x - x_2)))

# Rayleigh quotient iteration algorithm.
# This one gives the lowest error so we will use its result
# for our PageRank analysis.
rayleigh_iter_errors = [0]
sigma = 0.99
x_3 = init_x(len(A))
x_3 /= norm(x_3)
for i in range(power_iters):
    # The initial shift is kept for the first two iterations; after that it
    # is replaced by the Rayleigh quotient x^T A x / x^T x of the current
    # iterate. Once sigma is within 1e-10 of 1 it is no longer updated
    # (presumably to keep A - sigma*I from becoming numerically singular).
    if i > 1 and np.abs(sigma - 1.0) >= 1E-10:
        numerator = np.matmul(x_3.T, A_x)
        sigma = numerator / np.matmul(x_3.T, x_3)
    print(sigma)
    # Solve (A - sigma*I) y = x_3 directly instead of forming the explicit
    # inverse and multiplying: the shift changes between iterations, so the
    # inverse cannot be reused, and a direct solve is cheaper and behaves
    # better as the shifted matrix approaches singularity.
    y = np.linalg.solve(A - sigma * identity, x_3)
    x_3 = y / norm(y)
    A_x = np.matmul(A, x_3)
    # Log of ||A x - x||, i.e. the residual measured against eigenvalue 1.
    err = np.log(norm(A_x - x_3))
    rayleigh_iter_errors.append(err)

# Plotting: log-residuals of all three algorithms on one figure.
trace2 = go.Scatter(
    x = np.arange(len(shifted_power_iter_errors)),
    y = np.array(shifted_power_iter_errors).astype(np.float32),
    mode = 'lines',
    name = 'Shifted Inverse'
)
trace3 = go.Scatter(
    x = np.arange(len(rayleigh_iter_errors)),
    y = np.array(rayleigh_iter_errors).astype(np.float32),
    mode = 'lines',
    name = 'Rayleigh'
)
data = [trace1, trace2, trace3]
# Give the figure a title and axis labels so it can be read on its own.
layout = go.Layout(
    title = 'Convergence of power, shifted inverse and Rayleigh iterations',
    xaxis = dict(title = 'Iteration'),
    yaxis = dict(title = 'log ||Ax - x||')
)
fig = go.Figure(data=data, layout=layout)
py.iplot(fig, filename='all-algos-iteration-errors')

0.99
0.99
1.0003825451392023
1.0001625687802904
1.0000533764371777
1.0000046238680838
1.0000000383288954
1.000000000002663
1.000000000002663
1.000000000002663


### d) ###

In [48]:
# Turn the Rayleigh-iteration vector into scores: take absolute values and
# rescale so that the entries sum to 1.
k = 5
magnitudes = np.abs(x_3)
x = magnitudes / np.sum(magnitudes)

# Indices ordered by ascending score; both selections below are reversed so
# that they are listed from higher score to lower score.
sorted_indices = np.argsort(x)
max_indices = sorted_indices[-k:][::-1]
min_indices = sorted_indices[:k][::-1]


def _print_scores(page_indices):
    """Print the 1-based page index and score for the first k entries."""
    for position in range(k):
        page = page_indices[position]
        print('Page Index: ' + str(1 + page) + ', Score: ' + str(x[page]))


# Top scores.
print('The top ' + str(k) + ' page indices along with PageRank scores:')
_print_scores(max_indices)

print(' ')
print(' ')

# Bottommost scores.
print('The bottom ' + str(k) + ' page indices along with PageRank scores:')
_print_scores(min_indices)

The top 5 page indices along with PageRank scores:
Page Index: 424, Score: 0.08393629338252721
Page Index: 986, Score: 0.041968146691263614
Page Index: 987, Score: 0.04196814669126361
Page Index: 985, Score: 0.031476110018447696
Page Index: 930, Score: 0.023265929339250525
 
 
The bottom 5 page indices along with PageRank scores:
Page Index: 48, Score: 2.9332147503515276e-67
Page Index: 23, Score: 2.5078709879776977e-67
Page Index: 4, Score: 2.2574711521138488e-67
Page Index: 1, Score: 2.228120094592619e-67
Page Index: 10, Score: 2.1566811917873028e-67


The top 5 webpages listed here are all homepages. This makes sense, since homepages are hubs that visitors must pass through to reach either general information or the information on other pages.

Some of the unpopular ones are web pages that are either obsolete / not supposed to be used, or that simply hold fringe information that no one looks at. Unfortunately, I had trouble even loading some of these pages, but from the URL "http://www1.hollins.edu/security/Default.htm" one can guess that it is not commonly visited by students.