# Restaurant Recommendation system for YELP users using Collaborative Filtering

In [1]:
import os
import numpy as np
import scipy.sparse as sp
import scipy.linalg as la
import matplotlib
import time
from collections import Counter
from sklearn.model_selection import train_test_split
import pandas as pd

# Select the SVG backend before pyplot is imported.
matplotlib.use("svg")
# Enable inline plots unless DISABLE_TESTING is set; note that any non-empty
# value of the variable (even "0" or "false") is truthy and skips the magic.
if not os.environ.get("DISABLE_TESTING", False):
    %matplotlib inline

import matplotlib.pyplot as plt
# Use the ggplot style sheet for every figure in the notebook.
plt.style.use("ggplot")

## Dataset

The recommendation system is built from data on restaurants in Shadyside, Pittsburgh, collected with the Yelp API and web scraping.

For this project, we will only be looking at the ratings data, and ignoring the restaurant data and user reviews for restaurants which could be used to improve the ability of the recommendation system.

In [67]:
def read_csv(file="complete_data_shadyside.csv"):
    """Read the ratings CSV and encode restaurants and users as integer IDs.

    IDs are assigned in order of first appearance in the file, starting at 0,
    so they can be used directly as row/column indices of a ratings matrix.
    The file must contain the columns "restaurant", "author" and "rating";
    any other columns are ignored.

    kwargs:
        file : path to the data file
    
    returns: Tuple[ratings, restaurants, users] where
      ratings : Tuple[np.ndarray[int], np.ndarray[int], np.ndarray[float]] -- arrays of user ids, restaurant ids, and corresponding ratings (one entry per CSV row)
      restaurants : Dict[int, str] -- the lookup table from restaurant ID to restaurant name
      users : Dict[str, int] -- the lookup table from user name to user ID
    """
    data = pd.read_csv(file)

    # unique() keeps first-appearance order, which fixes the ID numbering.
    restaurants = dict(enumerate(data["restaurant"].unique()))
    restaurant_ids = {name: idx for idx, name in restaurants.items()}
    users = {name: idx for idx, name in enumerate(data["author"].unique())}

    # Dictionary lookups keep the encoding linear in the number of ratings
    # (a list.index() call per row is quadratic).
    user_id = np.array([users[name] for name in data["author"]], dtype=np.int64)
    restaurant_id = np.array(
        [restaurant_ids[name] for name in data["restaurant"]], dtype=np.int64
    )
    rating = np.array(data["rating"], np.float32)

    return (user_id, restaurant_id, rating), restaurants, users

In [68]:
# Load the dataset: data = (ratings, restaurants, users) as returned by read_csv.
data = read_csv()

## Data preparation

Matrix factorization requires the ratings to be stored in a users × restaurants matrix, with one row per user and one column per restaurant, so the first task is to convert the rating records into this format. Such matrices are typically very large and sparse (most users rate only a few restaurants), so we work with sparse matrices here.

In [90]:
def process(ratings_userid, ratings_restaurantid, ratings_rating, test_size=0.1, random_state=0xCAFE):
    """ Given rating data, split the data into training and testing sets and convert them to sparse matrices.

        Both matrices share the shape (max user id + 1, max restaurant id + 1),
        so ids do not have to be contiguous. Ratings are stored as float64.
        Note that if the same (user, restaurant) pair occurs more than once in
        a split, converting the COO matrix to a dense array sums the entries.

        args: 
            ratings_userid  : np.ndarray[num_ratings] -- vector of user Ids
            ratings_restaurantid : np.ndarray[num_ratings] -- vector of restaurant Ids
            ratings_rating  : np.ndarray[num_ratings] -- vector of rating values

        kwargs:
            test_size : float -- fraction of dataset to place in the test set
            random_state : int -- the random seed for dataset splitting

        return: Tuple[X_train, X_test] 
            X_train : sp.coo_matrix -- the training data, as a sparse matrix
            X_test : sp.coo_matrix -- the test data, as a sparse matrix 
    """
    # Keep ids as integers instead of stacking them with the ratings, which
    # would silently convert the matrix indices to floats.
    user_ids = np.asarray(ratings_userid, dtype=np.int64)
    restaurant_ids = np.asarray(ratings_restaurantid, dtype=np.int64)
    ratings = np.asarray(ratings_rating, dtype=np.float64)

    num_users = int(user_ids.max()) + 1 if user_ids.size else 0
    num_restaurants = int(restaurant_ids.max()) + 1 if restaurant_ids.size else 0
    shape = (num_users, num_restaurants)

    # Splitting several equally long arrays applies one common partition to
    # all of them, so every rating stays aligned with its user and restaurant.
    (users_train, users_test,
     restaurants_train, restaurants_test,
     ratings_train, ratings_test) = train_test_split(
        user_ids, restaurant_ids, ratings,
        test_size=test_size, random_state=random_state)

    X_train = sp.coo_matrix((ratings_train, (users_train, restaurants_train)), shape=shape)
    X_test = sp.coo_matrix((ratings_test, (users_test, restaurants_test)), shape=shape)

    return X_train, X_test

In [91]:
# data[0] is the (user ids, restaurant ids, ratings) triple; unpack it straight into process().
data_train, data_test = process(*data[0])

In [92]:
# Show the shape, dtype and number of stored ratings of the training matrix.
repr(data_train)

"<16512x826 sparse matrix of type '<class 'numpy.float64'>'\n\twith 67061 stored elements in COOrdinate format>"

In [93]:
# Show the shape, dtype and number of stored ratings of the test matrix.
repr(data_test)

"<16512x826 sparse matrix of type '<class 'numpy.float64'>'\n\twith 7452 stored elements in COOrdinate format>"

## Alternating Minimization for Collaborative Filtering
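Now we build the collaborative filtering recommendation system using a method known as alternating least squares. We approximate the ratings matrix $X$ by $UV^T$, where row $u_i$ of $U$ holds the features of user $i$ and row $v_j$ of $V$ holds the features of restaurant $j$, by minimizing $\sum_{(i,j)\,:\,X_{ij} \neq 0} (X_{ij} - u_i^T v_j)^2 + \lambda (\|U\|_F^2 + \|V\|_F^2)$ over the observed ratings. We alternate between optimizing $U$ and $V$ while holding the other constant: with one matrix fixed, each row of the other is the solution of a small regularized least squares problem, which we can solve in closed form.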

In [95]:
def error(X, U, V):
    """ Compute the mean squared error between the observed ratings in X and their estimated values. 

        An entry of X counts as observed when it is non-zero; the estimate
        for entry (i, j) is the dot product of row i of U and row j of V.

        args: 
            X : np.array[num_users, num_restaurants] -- the ratings matrix
            U : np.array[num_users, num_features] -- a matrix of features for each user
            V : np.array[num_restaurants,num_features] -- a matrix of features for each restaurant

        return: float -- the mean squared error between non-zero entries of X and the ratings
            predicted by U and V (nan when X has no non-zero entry); as this is an error and
            not a loss function, the regularizing terms are not included.
        """
    observed = X != 0
    num_observed = observed.sum()
    if num_observed == 0:
        # The mean over zero ratings is undefined; return nan explicitly
        # instead of triggering a 0/0 runtime warning.
        return np.nan

    residuals = np.dot(U, V.T)[observed] - X[observed]
    return (residuals ** 2).sum() / num_observed

In [60]:
def train(X_train, X_test, k, niters=20, lam=5., verbose=True, random_state=None):
    """ Train a collaborative filtering model with alternating least squares.

        Each iteration first re-solves the regularized least squares problem
        for every user's feature vector with V fixed, then for every
        restaurant's feature vector with U fixed. Users or restaurants
        without any training rating keep their random initial features.

        Args: 
            X_train : np.array[num_users, num_restaurants] -- the training ratings matrix, assumed dense
            X_test : np.array[num_users, num_restaurants] -- the test ratings matrix, assumed dense
            k : int -- the number of features in the CF model
            niters : int -- number of iterations to run
            lam : float -- regularization parameter, shown as lambda
            verbose : boolean -- if true, print the error on train and test sets after every iteration
            random_state : int or None -- seed for the random initialization of U and V;
                None (the default) draws from NumPy's global random state

        return : Tuple[U, V]
            U : np.array[num_users,  num_features] -- the user-feature matrix
            V : np.array[num_restaurants, num_features] -- the restaurant-feature matrix
    """
    rng = np.random if random_state is None else np.random.RandomState(random_state)
    U = rng.normal(scale=0.1,size=(X_train.shape[0],k))
    V = rng.normal(scale=0.1,size=(X_train.shape[1],k))

    # Loop invariants: which ratings are observed and the ridge term lam * I.
    observed = X_train != 0
    ridge = lam * np.eye(k)

    if verbose:
        print("| Time    | Iter  | Train Err | Test Err |")
        print("| ------- | ----- | --------- | -------- |")

    start_time = time.perf_counter()
    for i in range(niters):
        # Update each user's features from the restaurants that user rated.
        for j in range(U.shape[0]):
            rated = observed[j]
            if rated.any():
                v = V[rated]
                U[j] = la.solve(v.T @ v + ridge, v.T @ X_train[j, rated])

        # Update each restaurant's features from the users who rated it.
        for j in range(V.shape[0]):
            raters = observed[:, j]
            if raters.any():
                u = U[raters]
                V[j] = la.solve(u.T @ u + ridge, u.T @ X_train[raters, j])
        
        if verbose: 
            print(f"| {time.perf_counter() - start_time: 7.3f} |{i+1: 6d} |{error(X_train, U, V):10.4f} |{error(X_test, U, V):9.4f} |")
    
    if verbose: 
        print("")
    return U, V

In [61]:
# Fit a model with 3 features; the sparse matrices are converted to dense arrays because train() expects dense input.
u,v = train(data_train.toarray(), data_test.toarray(), 3)

| Time    | Iter  | Train Err | Test Err |
| ------- | ----- | --------- | -------- |
|   1.174 |     1 |   14.0255 |  17.6904 |
|   2.613 |     2 |    2.2391 |   6.5662 |
|   4.022 |     3 |    1.1114 |   4.0377 |
|   5.433 |     4 |    1.0412 |   3.8771 |
|   6.839 |     5 |    1.0185 |   3.8662 |
|   8.257 |     6 |    1.0064 |   3.8722 |
|   9.681 |     7 |    0.9986 |   3.8776 |
|  11.089 |     8 |    0.9930 |   3.8801 |
|  12.526 |     9 |    0.9887 |   3.8809 |
|  14.033 |    10 |    0.9854 |   3.8807 |
|  15.470 |    11 |    0.9826 |   3.8802 |
|  16.961 |    12 |    0.9803 |   3.8795 |
|  18.368 |    13 |    0.9783 |   3.8786 |
|  19.770 |    14 |    0.9764 |   3.8773 |
|  21.177 |    15 |    0.9747 |   3.8760 |
|  22.584 |    16 |    0.9730 |   3.8749 |
|  23.983 |    17 |    0.9715 |   3.8741 |
|  25.399 |    18 |    0.9702 |   3.8736 |
|  26.823 |    19 |    0.9691 |   3.8734 |
|  28.253 |    20 |    0.9682 |   3.8735 |



## Recommendations

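Finally, we need to be able to make recommendations given a matrix factorization. We do this with the estimated ratings matrix $UV^T$: for each user we pick the restaurants with the highest estimated rating among those the user has not rated yet.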

In [69]:
def recommend_restaurant(X, U, V):
    """Recommend a new restaurant for every user.

        For each user, returns the restaurant with the highest estimated
        rating among those the user has not rated (zero entries of X).
        A user who has rated every restaurant gets restaurant 0.

        args: 
            X : np.array[num_users, num_restaurants] -- the ratings matrix
            U : np.array[num_users, num_features] -- a matrix of features for each user
            V : np.array[num_restaurants,num_features] -- a matrix of features for each restaurant

        return: List[int] -- a list of restaurant Ids for each user
    """
    scores = np.asarray(np.dot(U, V.T), dtype=float)

    # Exclude rated restaurants with -inf rather than 0: estimated ratings can
    # be negative, and a 0 would then outrank every unrated restaurant.
    scores[np.asarray(X) != 0] = -np.inf

    return scores.argmax(axis=1).tolist()

In [89]:
# Recommend one restaurant per user. Train and test ratings are combined so
# that every restaurant a user has rated is excluded, then the five most
# frequently recommended restaurants are listed by name.
recommendation = recommend_restaurant(data_train.toarray() + data_test.toarray(), u, v)
counts = Counter(recommendation)
print("Most Recommended Restaurants:\n")
print([(data[1][i], c) for i, c in counts.most_common(5)])

Most Recommended Restaurants:

[('Church Brew Works', 3672), ("DiAnoia's Eatery", 3003), ('Point Brugge Café', 1770), ('Hofbrauhaus Pittsburgh', 1546), ('Carmi Soul Food', 1240)]


In [86]:
def recommend_user(X, U, V, users, restaurants, user_name, n):
    """Recommend up to 'n' restaurants for a given user.

        Only restaurants the user has not rated (zero entries in the user's
        row of X) are considered, ordered by estimated rating, best first.
        Fewer than 'n' ids are returned when the user has fewer than 'n'
        unrated restaurants. The recommended names are also printed.

        args: 
            X : np.array[num_users, num_restaurants] -- the ratings matrix
            U : np.array[num_users, num_features] -- a matrix of features for each user
            V : np.array[num_restaurants,num_features] -- a matrix of features for each restaurant
            users : Dict[str, int] -- the lookup table from user name to user ID
            restaurants : Dict[int, str] -- the lookup table from restaurant ID to restaurant name
            user_name : str -- user name for whom recommendations are required
            n : int -- number of recommendations
            
        return: np.ndarray[int] -- restaurant Ids for the given user, best first

        raises: KeyError -- if user_name is not in users
    """
    user_id = users[user_name]

    # Only this user's row of the estimated ratings matrix is needed.
    scores = np.asarray(np.dot(U[user_id], V.T), dtype=float)

    # Rank the unrated restaurants only, so a rated one can never be
    # recommended even when every estimated rating is negative.
    unrated = np.flatnonzero(np.asarray(X[user_id]) == 0)
    ranked = unrated[np.argsort(scores[unrated])[::-1]]

    # max() guards n <= 0, which must yield no recommendations.
    recommendations = ranked[:max(n, 0)]
    
    print("Recommendations for user ",user_name,":\n")
    print([restaurants[i] for i in recommendations])
    
    return recommendations

In [88]:
# Five recommendations for one user; data[2] maps user names to ids and data[1] maps restaurant ids to names.
recommendations = recommend_user(data_train.toarray() + data_test.toarray(), u, v, data[2],data[1], "Jeremy L.", 5)

Recommendations for user  Jeremy L. :

["Carmella's Plates & Pints", 'Double Wide Grill', 'The Porch at Schenley', 'Girasole Restaurant', 'Caffe Mona La Bistro']
