In [1]:
import numpy as np
import scipy as sp
from scipy import stats
import pandas as pd
import sympy as sy
import scipy.linalg as la 
import scipy.sparse.linalg as spla
import time 
import matplotlib.pyplot as plt
import cmath
from utils import helper
sy.init_printing()
%matplotlib inline

### Page Rank Algorithm

### Problem 1. 
Write the following function that creates an adjacency matrix from a file.

    def to_matrix( filename, n ):
        ''' 
        Return the nxn adjacency matrix described by the file.
        INPUTS:
            filename - Name of a .txt file describing a directed graph. Lines
                      describing edges should have the form '<from node>\t<to node>'.
                      The file may also include comments.
            n   - The number of nodes in the graph described by filename
        RETURN:
        Return a NumPy array. 
        '''
Hints:

1. The file matrix.txt included with this lab describes the matrix in Figure 15.1 and has the adjacency matrix A given above. You may use it to test your function.

2. You can open a file in Python using the with syntax. Then, you can iterate through the lines using a for loop. Here is an example.

        # Open `matrix.txt` for read-only
        with open('./matrix.txt', 'r') as myfile:
            for line in myfile:
                print(line)

3. Here is an example of how to process a line of the form in datafile.

        >>> line = '0\t4\n'
        # strip() removes leading and trailing whitespace from a line.
        # split() returns a list of the whitespace-separated pieces of the line.
        >>> line.strip().split()
        ['0', '4']
        
4. Rather than testing for lines of matrix.txt that contain comments, put all your string operations in a `try` block with an `except` block following.

In [19]:
def to_matrix( filename, n ):
    ''' 
    Return the nxn adjacency matrix described by the file.

    The file is read line by line. A line whose first two whitespace-separated
    fields are integers '<from node> <to node>' sets entry [from, to] to 1.
    Every other line (comments, blank lines, headers) is skipped, so no line
    of the file is silently consumed as a header.

    INPUTS:
        filename - Name of a .txt file describing a directed graph. Lines
                  describing edges should have the form '<from node>\t<to node>'.
                  The file may also include comments.
        n   - The number of nodes in the graph described by filename
    RETURN:
        An (n, n) NumPy float array with a 1 at [i, j] for every edge i -> j
        listed in the file and 0 elsewhere.
    RAISES:
        IndexError if an edge refers to a node index outside the nxn matrix.
    '''
    #TODO: make adj_mat a sparse matrix type
    adj_mat = np.zeros((n, n))
    with open(filename, 'r') as infile:
        for line in infile:
            try:
                fields = line.strip().split()
                src, dst = int(fields[0]), int(fields[1])
            except (IndexError, ValueError):
                # Not an edge: blank line, comment, or other non-numeric text.
                continue
            # Kept outside the try block so an out-of-range node index is
            # reported to the caller instead of being skipped like a comment.
            adj_mat[src, dst] = 1
    return adj_mat

In [60]:
# Build the adjacency matrix of the 8-node example graph stored in matrix.txt.
n = 8
filename = "../Data/Volume1/matrix.txt"
Am = to_matrix(filename, n)

Am

array([[ 0.,  0.,  0.,  0.,  0.,  0.,  0.,  1.],
       [ 1.,  0.,  0.,  0.,  0.,  0.,  0.,  0.],
       [ 0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.],
       [ 1.,  0.,  1.,  0.,  0.,  0.,  1.,  0.],
       [ 1.,  0.,  0.,  0.,  0.,  1.,  1.,  0.],
       [ 1.,  0.,  0.,  0.,  0.,  0.,  1.,  0.],
       [ 1.,  0.,  0.,  0.,  0.,  0.,  0.,  0.],
       [ 1.,  0.,  0.,  0.,  0.,  0.,  0.,  0.]])

In [61]:
# Sink pages are the rows of Am that contain no ones (row sum of zero).
# Overwrite each such row with all ones.
sink_index, = np.where(Am.sum(axis=1) == 0)

Am[sink_index, :] = 1.0
Am

array([[ 0.,  0.,  0.,  0.,  0.,  0.,  0.,  1.],
       [ 1.,  0.,  0.,  0.,  0.,  0.,  0.,  0.],
       [ 1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.],
       [ 1.,  0.,  1.,  0.,  0.,  0.,  1.,  0.],
       [ 1.,  0.,  0.,  0.,  0.,  1.,  1.,  0.],
       [ 1.,  0.,  0.,  0.,  0.,  0.,  1.,  0.],
       [ 1.,  0.,  0.,  0.,  0.,  0.,  0.,  0.],
       [ 1.,  0.,  0.,  0.,  0.,  0.,  0.,  0.]])

In [62]:
# K: scale every row of Am by the reciprocal of its row sum (via the diagonal
# matrix D_inv), then transpose the result.
D_inv = np.diag(Am.sum(axis=1) ** -1)
K = D_inv.dot(Am).T

### Problem 2. 

Write a function that computes the K matrix given an adjacency matrix.

1. Compute the diagonal matrix $D$.
2. Compute the modified adjacency matrix where the rows corresponding to sinks all have ones instead of zeros. 
3. Compute $K$ using array broadcasting.

In [63]:
def make_K(adj_mat):
    '''
    Return the matrix K computed from an adjacency matrix.

    Rows of the adjacency matrix that sum to zero (sinks) are replaced by rows
    of ones, every row is divided by its row sum, and the result is
    transposed. For a 0/1 adjacency matrix each column of K therefore sums
    to 1.

    INPUTS:
        adj_mat - Square array-like adjacency matrix. Integer and float dtypes
                  are both accepted. The input is NOT modified.
    RETURN:
        A NumPy float array with the same shape as adj_mat.
    '''
    # Work on a float copy: the caller's array is left untouched, and integer
    # input no longer fails (NumPy rejects integer arrays raised to -1).
    modified = np.array(adj_mat, dtype=float)

    # Sink rows become rows of ones.
    modified[modified.sum(axis=1) == 0] = 1

    # Broadcasting an (n, 1) column of row sums divides each row by its own
    # sum without building a dense diagonal matrix.
    row_sums = modified.sum(axis=1, keepdims=True)
    return (modified / row_sums).T

In [65]:
# Solve the linear system (I - d*K) p = ((1 - d)/n) * ones(n) for p.
I = np.eye(n)
d = 0.85
# The right-hand side is scaled by n (not a hard-coded 8) so the cell stays
# correct when the number of nodes changes.
la.solve(I - d*K, ((1 - d)/n)*np.ones(n))

array([ 0.43869288,  0.02171029,  0.02786154,  0.02171029,  0.02171029,
        0.02786154,  0.04585394,  0.39459924])

### Problem 3. 

Implement the function below, using the iterative method to find the steady state of the PageRank algorithm. When the argument `N` is not None, work with only the upper-left $N \times N$ block of the array `adj` (i.e., the first `N` nodes). Test your function against the example in the lab.


    def iter_solve( adj, N=None, d=.85, tol=1E-5): 
        '''
        Return the page ranks of the network described by 'adj' using the iterative method.
        INPUTS:
            adj - A NumPy array representing the adjacency matrix of a directed graph
            N - Restrict the computation to the first `N` nodes of the graph.
                    Defaults to N=None; in this case, the entire matrix is used.
            d     - The damping factor, a float between 0 and 1.
                    Defaults to .85.
            tol  - Stop iterating when the change in approximations to the
                   solution is less than 'tol'. Defaults to 1E-5.
        OUTPUTS:
            Return the approximation to the steady state of p.
        '''
Hints:

1. Try making your initial guess for $p(0)$ a random vector.
2. NumPy can do unexpected things with the dimensions when performing matrix-vector multiplication. When debugging, check at each iteration that all arrays have the dimensions you expect.