# Project task 04:  Restaurant ranking

In [None]:
import numpy as np
import scipy.sparse as sp

The goal of this task is to rank restaurants using the **PageRank** algorithm. You are given a directed weighted graph where each node represents one restaurant. The edges in this graph are based on user reviews.

Additionally for each restaurant you are given the categories it belongs to, i.e. 'Mexican', 'Italian', etc. Note that each restaurant can belong to multiple categories.

Considering these categories as topics you will perform **topic-sensitive PageRank**, enabling you to e.g. find the top 10 'Mexican' restaurants.

## 1. Load data

* The graph is stored as a sparse adjacency matrix $A$
* The categories are stored in a binary sparse matrix $C$, with $C_{ij}=1$ indicating that restaurant $i$ belongs to category $j$
* We also provide you with a dictionary mapping each category to its corresponding column index in $C$
* The name of each restaurant is provided as a list, with the i-th element in the list corresponding to the i-th node in the graph

In [None]:
# Load the restaurant graph (sparse adjacency matrix) from its .npz file.
A = sp.load_npz('restaurant_graph.npz')
A  # bare expression so the notebook echoes the matrix as the cell output

In [None]:
# Load the restaurant/category membership matrix from its .npz file.
C = sp.load_npz('restaurant_categories.npz')
C  # bare expression so the notebook echoes the matrix as the cell output

In [None]:
# The category -> column-index dictionary is stored as a Python object inside
# the .npy file, so NumPy (>= 1.16.3) refuses to load it unless pickling is
# explicitly allowed.
# NOTE(security): allow_pickle=True unpickles arbitrary objects; only load this
# file when it comes from a trusted source.
categories = np.load('categories.npy', allow_pickle=True).tolist()
categories['Mexican'], categories['Chinese']

In [None]:
# Restaurant names; per the task description the i-th name belongs to node i.
names = np.load('restaurant_names.npy')
names[:3]  # peek at the first three names

In [None]:
# Sanity checks: the graph, the name list and the category matrix must all
# describe the same number of restaurants, and C must have exactly one column
# per entry of the category dictionary.
assert A.shape[0] == len(names) == C.shape[0]
assert C.shape[1] == len(categories)

 ## 2. Determine the teleport set
 

Given a list of topics of interest, e.g. `['Mexican', 'Italian', ...]`, implement a helper function to return all the restaurants that belong to **at least one** of these topics. These restaurants will become part of the teleport set in topic-sensitive PageRank.

In [None]:
def teleport_set(C, topics, categories):
    """
    Finds the teleport set consisting of restaurants that belong to at least one of the specified topics.

    Each restaurant appears at most once in the result, even if it belongs to
    several of the requested topics.

    Parameters
    ----------
    C             : sp.spmatrix, shape [num_restaurants, num_categories]
                    Binary matrix encoding which restaurants belong to which categories.
    topics        : List[string]
                    List of topics of interest.
    categories    : dict(string, int)
                    Dictionary mapping each category to its corresponding column index in C.

    Returns
    -------
    teleport_idx : np.array, shape [S]
                   The sorted, unique indices of the nodes in the teleport set
                   (integer dtype; empty if no topic is given or no restaurant matches).

    Raises
    ------
    KeyError
        If a topic is not a key of `categories`.
    """
    topic_columns = [categories[topic] for topic in topics]
    if not topic_columns:
        # No topic requested: return an empty *integer* array so the result can
        # still be used for indexing.
        return np.array([], dtype=np.int64)

    # Select all requested category columns at once; the row index of every
    # non-zero entry is a restaurant that belongs to at least one topic.
    member_rows = sp.csc_matrix(C)[:, topic_columns].nonzero()[0]

    # A restaurant in several requested topics shows up once per topic, so
    # de-duplicate (np.unique also sorts the indices).
    return np.unique(member_rows)

 ## 3. Implement topic-sensitive PageRank

In [None]:
def page_rank(A, beta, teleport_idx=None, eps=1e-12):
    """
    Implements topic-sensitive PageRank using power iteration and sparse matrix operations.

    The adjacency matrix is rescaled column-wise (``M = A @ diag(1 / column_sums)``)
    and the update ``r <- beta * M @ r + (1 - beta) * t`` is repeated, where ``t``
    is the uniform distribution over the teleport set, until the L2 norm of the
    change in ``r`` drops below ``eps``.

    Parameters
    ----------
    A           : sp.spmatrix, shape [num_restaurants, num_restaurants]
                  The adjacency matrix representing the graph of restaurants.
    beta        : float,
                  0 < beta < 1, (1-beta) is the probability of teleporting to the nodes in the teleport set
    teleport_idx: np.array, shape [S]
                  The indices of the nodes in the teleport set. If it equals None,
                  standard PageRank is run, i.e. all nodes are in the teleport set.
                  Duplicate indices are ignored.
    eps         : float
                  Convergence threshold on the L2 norm of the difference between
                  two consecutive iterates.

    Returns
    -------
    r          : np.array, shape [num_restaurants]
                 The page rank vector containing the page rank scores for each restaurant.

    Notes
    -----
    * Columns of A whose sum is zero are not rescaled and contribute nothing to
      the update, so the scores need not sum to one when such columns exist.
    * An empty teleport set yields an all-zero score vector.
    * NOTE(review): normalising columns treats A[i, j] as the weight of an edge
      from node j to node i. Confirm this matches how the graph file was built.
    """
    num_nodes = A.shape[0]

    # Column sums as a flat float array (A.sum gives a 1 x N matrix for spmatrix input).
    col_sums = np.asarray(A.sum(axis=0), dtype=float).ravel()

    # Invert only the non-zero sums; zero-sum columns keep a scale factor of 0
    # instead of triggering a division by zero.
    inv_col_sums = np.zeros(num_nodes)
    has_mass = col_sums != 0
    inv_col_sums[has_mass] = 1.0 / col_sums[has_mass]

    # Sparse diagonal scaling keeps M sparse; a dense N x N diagonal matrix
    # would need O(N^2) memory.
    M = sp.csc_matrix(A).dot(sp.diags(inv_col_sums))

    if teleport_idx is None:
        # Standard PageRank: teleport uniformly to any node.
        teleport = np.full(num_nodes, 1.0 / num_nodes)
    else:
        # Topic-sensitive PageRank: teleport uniformly within the teleport set.
        # De-duplicate first so the teleport vector always sums to one.
        members = np.unique(np.asarray(teleport_idx, dtype=np.intp))
        teleport = np.zeros(num_nodes)
        if members.size:
            teleport[members] = 1.0 / members.size

    # Power iteration, started from the teleport distribution.
    r = teleport.copy()
    while True:
        r_next = beta * M.dot(r) + (1 - beta) * teleport
        if np.linalg.norm(r_next - r) < eps:
            return r_next
        r = r_next

### 3.1 Calculate the standard PageRank scores and print the names of the top 5 restaurants overall

In [None]:
# Reverse lookup: column index in C -> category name.
idx_to_category = dict(zip(categories.values(), categories.keys()))

In [None]:
r = page_rank(A=A, beta=0.6, teleport_idx=None)

# argsort is ascending, so reverse it: rank 1 must be the highest-scoring restaurant.
for rank, x in enumerate(r.argsort()[::-1][:5], start=1):
    print(rank, names[x], '\n  Categories: ', [idx_to_category[cat] for cat in C[x].nonzero()[1]])

### 3.2 Calculate the topic-sensitive PageRank scores and print the names of top 5 Mexican restaurants

In [None]:
teleport_idx = teleport_set(C, ['Mexican'], categories)
r = page_rank(A=A, beta=0.6, teleport_idx=teleport_idx)

# argsort is ascending, so reverse it: rank 1 must be the highest-scoring restaurant.
for rank, x in enumerate(r.argsort()[::-1][:5], start=1):
    print(rank, names[x], '\n  Categories: ', [idx_to_category[cat] for cat in C[x].nonzero()[1]])

### 3.3 Calculate the topic-sensitive PageRank scores and print the names of top 5 Italian or French restaurants


In [None]:
teleport_idx = teleport_set(C, ['Italian', 'French'], categories)
r = page_rank(A=A, beta=0.6, teleport_idx=teleport_idx)

# argsort is ascending, so reverse it: rank 1 must be the highest-scoring restaurant.
for rank, x in enumerate(r.argsort()[::-1][:5], start=1):
    print(rank, names[x], '\n  Categories: ', [idx_to_category[cat] for cat in C[x].nonzero()[1]])