# Project task 04:  Restaurant ranking

In [1]:
import numpy as np
import scipy.sparse as sp

The goal of this task is to rank restaurants using the **PageRank** algorithm. You are given a directed weighted graph where each node represents one restaurant. The edges in this graph are based on users reviews.

Additionally for each restaurant you are given the categories it belongs to, i.e. 'Mexican', 'Italian', etc. Note that each restaurant can belong to multiple categories.

Considering these categories as topics you will perform **topic-sensitive PageRank**, enabling you to e.g. find the top 10 'Mexican' restaurants.

## 1. Load data

* The graph is stored as a sparse adjacency matrix $A$
* The categories are stored in a binary sparse matrix $C$, with $C_{ij}=1$ indicating that restaurant $i$ belongs to category $j$
* We also provide you with a dictionary mapping each category to its corresponding column index in $C$
* The name of each restaurant is provided as a list, with the i-th element in the list corresponding to the i-th node in the graph

In [3]:
A = sp.load_npz('task_04_data/restaurant_graph.npz')
A

<7073x7073 sparse matrix of type '<class 'numpy.float64'>'
	with 1682844 stored elements in Compressed Sparse Row format>

In [4]:
C = sp.load_npz('task_04_data/restaurant_categories.npz')
C

<7073x138 sparse matrix of type '<class 'numpy.float64'>'
	with 19047 stored elements in Compressed Sparse Row format>

In [5]:
categories = np.load('task_04_data/categories.npy').tolist()
categories['Mexican'], categories['Chinese']

(3, 14)

In [6]:
names = np.load('task_04_data/restaurant_names.npy')
names[:3]

array(['Alize Catering', 'Chula Taberna Mexicana', 'Sunnyside Grill'],
      dtype='<U50')

In [7]:
assert A.shape[0] == len(names) == C.shape[0]
assert C.shape[1] == len(categories)

In [16]:
A.shape, C.shape

((7073, 7073), (7073, 138))

 ## 2. Determine the teleport set
 

Given a list of topics of intereset, i.e. `['Mexican', 'Italian', ...]`, implement a helper function to return all the restaurants that belong to **at least one** of these topics. These restaurants will become part of the teleport set in topic-sensitive PageRank.

In [29]:
def teleport_set(C, topics, categories):
    """
    Finds the teleport set consisting of restaurants that belong to at least one of the specified topics.
    
    Parameters
    ----------
    C             : sp.spmatrix, shape [num_restaurants, num_categories]
                    Binary matrix encoding which restaurants belongs to which categories.
    topics        : List[string]
                    List of topics of interest.
    categories    : dict(string, int)
                    Dictionary mapping each category to its corresponding column index in C.
        
    Returns
    -------
    teleport_idx : np.array, shape [S]
                   The indicies of the nodes in the teleport set.
    """
    #### YOUR CODE ####

    # Extract the column indices of 'C' from 'categories' by relevant categories in topics
    c_col_idx = [categories[c] for c in topics]
    
    # Compute the vector that contains number of categories each restaurant has
    num_cat = np.sum(C[:,c_col_idx], axis=1)

    # get the teleport indices by extracting indices of restaurants having nonzero number of categories
    teleport_idx = np.flatnonzero(num_cat)
    
    return teleport_idx

In [30]:
## FOR TESTING

# teleport_idx = teleport_set(C, ['Mexican'], categories)
# print(teleport_idx)

[   1   11   34   39   47  135  144  172  181  184  206  276  289  387
  388  424  432  446  467  553  564  579  583  599  625  643  669  727
  789  817  905  930 1033 1046 1131 1166 1273 1275 1286 1293 1340 1352
 1382 1402 1416 1486 1540 1561 1569 1614 1733 1743 1756 1794 1886 1898
 1902 1909 1924 1925 1926 1941 2031 2043 2069 2070 2101 2146 2206 2226
 2311 2329 2390 2496 2537 2538 2572 2573 2576 2579 2591 2608 2628 2734
 2770 2815 2839 2847 2852 2880 2887 2897 2949 2955 3018 3149 3165 3211
 3221 3235 3239 3261 3274 3311 3332 3341 3364 3416 3430 3439 3441 3447
 3456 3459 3460 3464 3479 3492 3545 3584 3606 3617 3693 3728 3791 3794
 3849 3874 3885 3919 3932 3935 3939 3942 3975 3988 3992 4007 4008 4019
 4020 4045 4049 4170 4191 4254 4268 4320 4321 4332 4343 4382 4395 4402
 4417 4418 4426 4478 4542 4569 4571 4587 4598 4604 4609 4615 4624 4689
 4695 4796 4800 4815 4886 4925 4946 4958 4999 5019 5020 5052 5116 5157
 5187 5230 5233 5271 5300 5315 5324 5391 5442 5480 5498 5534 5546 5574
 5621 

 ## 3. Implement topic-sensitive PageRank

In [8]:
def page_rank(A, beta, teleport_idx=None, eps=1e-12):
    """
    Implements topic-sensitive PageRank using power iteration and sparse matrix operations.
    
    Parameters
    ----------
    A           : sp.spmatrix, shape [num_restaurants, num_restaurants]
                  The adjacency matrix representing the graph of restaurants.
    beta        : float, 
                  0 < beta < 1, (1-beta) is the probabilty of teleporting to the nodes in the teleport set
    teleport_idx: np.array, shape [S]
                  The indicies of the nodes in the teleport set. If it equals to None
                  it means runs standard PageRank, i.e. all nodes are in the teleport set.
    
    Returns
    -------
    r          : np.array, shape [num_restaurants]
                 The page rank vector containing the page rank scores for each restaurant.
    """
    
    #### YOUR CODE ####
    
    return r

### 3.1 Calculate the standard PageRank scores and print the names of the top 5 restaurants overall

In [9]:
idx_to_category = {v:k for k, v in categories.items()}

In [10]:
r = page_rank(A=A, beta=0.6, teleport_idx=None)

for i, x in enumerate(r.argsort()[-5:]):
    print(i+1, names[x], '\n  Categories: ', [idx_to_category[cat] for cat in C[x].nonzero()[1]])

1 Congee Me 
  Categories:  ['Korean']
2 Go Go China 
  Categories:  ['Chinese']
3 Sushi Making For the Soul 
  Categories:  ['Japanese']
4 Spring Rolls 
  Categories:  ['African']
5 Happy Tummy Filipino Cuisine 
  Categories:  ['Chinese']


### 3.2 Calculate the topic-sensitive PageRank scores and print the names of top 5 Mexican restaurants

In [11]:
teleport_idx = teleport_set(C, ['Mexican'], categories)
r = page_rank(A=A, beta=0.6, teleport_idx=teleport_idx)

for i, x in enumerate(r.argsort()[-5:]):
    print(i+1, names[x], '\n  Categories: ', [idx_to_category[cat] for cat in C[x].nonzero()[1]])

1 Chill 
  Categories:  ['Mexican']
2 El Taquito 
  Categories:  ['Mexican']
3 The Atlantic 
  Categories:  ['Fast Food', 'Mexican']
4 Burrito Loco 
  Categories:  ['Mexican']
5 El Takito 
  Categories:  ['Mexican']


### 3.3 Calculate the topic-sensitive PageRank scores and print the names of top 5 Italian or French restaurants


In [12]:
teleport_idx = teleport_set(C, ['Italian', 'French'], categories)
r = page_rank(A=A, beta=0.6, teleport_idx=teleport_idx)

for i, x in enumerate(r.argsort()[-5:]):
    print(i+1, names[x], '\n  Categories: ', [idx_to_category[cat] for cat in C[x].nonzero()[1]])

1 Ali Baba's Middle Eastern Cuisine 
  Categories:  ['Sandwiches', 'Pizza', 'Italian']
2 New May Hong Yuen BBQ 
  Categories:  ['Italian']
3 Sunnyside Café 
  Categories:  ['French']
4 IPho Vietnamese Cuisine 
  Categories:  ['Italian']
5 McDonald's 
  Categories:  ['Italian']
