# User-based Collaborative Filtering Restaurant Recommender
### Linsten Han, Pui Ling (Amy) Ching
### CSC 478 Final Project 

In [1]:
cd C:\Users\lhan3\Downloads

C:\Users\lhan3\Downloads


#Preprocessing

In [2]:
import pandas as pd
import numpy as np

In [3]:
import scipy.sparse as sp

In [4]:
# Upload the Yelp Phoenix Review dataset
phxreview =pd.read_csv("phxReview.csv", delimiter=',',index_col=0)

In [5]:
# Upload the Yelp Phoenix User dataset
uDB=pd.read_csv("phxUser.csv", delimiter=',')
print "User Database shape is:",uDB.shape
# Upload the Yelp Phoenix Business dataset
busDB=pd.read_csv("phxB.csv", delimiter=',')
print "Business Database shape is:",busDB.shape

User Database shape is: (45277, 24)
Business Database shape is: (2244, 59)


In [6]:
# Create np arrays for the user_id and business_id columns in the Review dataset
phxuser=np.array(phxreview["user_id"])
phxbusiness=np.array(phxreview['business_id'])

In [7]:
# Extract stars column for values in sparse matrix 
stars=phxreview['stars']

In [8]:
# create key, value pairs for user_ids where key=user_id  and value=index
# create key, value pairs for business_ids where key=business_id  and value=index 
users = np.unique(phxreview['user_id'])
businesses = np.unique(phxreview['business_id'])
 
number_of_rows = len(users)
number_of_columns = len(businesses)

business_indices, user_indices = {}, {}

In [9]:
for i in range(len(businesses)):
    business_indices[businesses[i]] = i
    
for i in range(len(users)):
    user_indices[users[i]] = i

In [10]:
# create a matrix to fill in the sparse data
V = sp.lil_matrix((45277, 2043))

In [11]:
# adds the stars data into the sparse matrix
for line in phxreview.values:
    business, date, review_id, stars, user = map(str, line)
    V[user_indices[user], business_indices[business]] = stars

In [12]:
# make our sparse matrix a dense one. 
newV = V.todense()

In [13]:
newV.shape

(45277L, 2043L)

In [14]:
newVarray=np.array(newV)

In [15]:
np.set_printoptions(threshold='nan')

In [16]:
def pearsSim(inA,inB):
    if len(inA) < 3 : return 1.0
    return 0.5 + 0.5 * np.corrcoef(inA, inB, rowvar = 0)[0][1]

In [17]:
from scipy import spatial

In [18]:
from scipy.stats.stats import pearsonr

# Functions 
## Predicting User Ratings

In [19]:
def userpred1(u,b):
    '''This function predicts user u"s rating for business b using user-based collaborative filtering. Predictions are made
    using the average user u"s rating + weighted average for all neighbors'similarity (formula in slide 13 of Recommender PPT)'''
    uavg=newVarray[u][np.nonzero(newVarray[u])].mean()
    totalR =0 
    totalsim =0
    for x in range(len(newVarray)):# iterate over all users
        if newVarray[x][b]>0: #if that user's rating for b >0
            #print 'n_idx', x
            navg=newVarray[x][np.nonzero(newVarray[x])].mean()# average rating for neighbor
            #print 'navg',navg
            nbrating= newVarray[x][b] # neighbor's rating for b
            #print 'nbrating',nbrating
            uxsim,pvalue=pearsonr(newVarray[u],newVarray[x])# cosine similarity between u and neighbor 
            #print 'uxsim',(uxsim)
            totalR+=(nbrating-navg)*(uxsim)
            totalsim+=(uxsim)
            #print 'totalR:',totalR,'totalsim:',totalsim 
    final = uavg + totalR/totalsim
    return final

In [20]:
# test userpred1 function
print 'Rating prediction for user0 for restaurant 7:',userpred1(0,7)
print 'Rating prediction for user1 for restaurant 130:',userpred1(1,30)
print 'Rating prediction for user199 for restaurant 6:',userpred1(199,6)
print 'Rating prediction for user250 for restaurant 6:',userpred1(250,6)
print 'Rating prediction for user7 for restaurant 173:',userpred1(70,173)

Rating prediction for user0 for restaurant 7: 3.7017088815
Rating prediction for user1 for restaurant 130: 2.4968480621
Rating prediction for user199 for restaurant 6: 3.80271312997
Rating prediction for user250 for restaurant 6: 3.54881879171
Rating prediction for user7 for restaurant 173: 2.54685973377


# Functions 
## Recommend Restaurants

In [21]:
''' this function recommends k number of business's for user, using the k highest predictions generated from the userpred1 function.'''
def userrecom(user, k):
    nbus=newVarray.shape[1]
    zerorating =[]
    for nb in range(nbus):
        if newVarray[user,nb]==0:
            zerorating.append(nb)
    ratings=[]
    for i in zerorating:
        z=userpred1(user, i)
        ratings.append(z)
    businessratings=np.column_stack((zerorating, ratings))
    sortedBratings = businessratings[np.argsort(businessratings[:,1])][::-1]
    for j in range(k):
        bindex=sortedBratings[j][0]
        key=business_indices.keys()[business_indices.values().index(bindex)]
        name = np.array(busDB[busDB["business_id"]==key].name)
        if sortedBratings[j][1]>5:
            sortedBratings[j][1]=5
        print name

In [22]:
import timeit

In [23]:
# test userrecom function 
userrecom(0,5)

['Grinders Coffee Company' 'Grinders Coffee Company']
["Lin's Grand Buffet"]
['Breakfast Club']
["Rico's American Grill"]
["Gallagher's Sports Grill" "Gallagher's Sports Grill"]


In [24]:
userrecom(139,5)

["Bobby D's Restaurant & Lounge" "Bobby D's Restaurant & Lounge"]
["Rinaldi's Italian Deli"]
['Del Taco']
['Hillside Spot']
["Macayo's Mexican Kitchen"]


In [25]:
userrecom(10,3)

["Ingo's Tasty Food"]
['Chicago Hamburger Co']
["Barro's Pizza"]


# Evaluation 
## Timing


In [26]:
%timeit userrecom(0,5)

['Grinders Coffee Company' 'Grinders Coffee Company']
["Lin's Grand Buffet"]
['Breakfast Club']
["Rico's American Grill"]
["Gallagher's Sports Grill" "Gallagher's Sports Grill"]
['Grinders Coffee Company' 'Grinders Coffee Company']
["Lin's Grand Buffet"]
['Breakfast Club']
["Rico's American Grill"]
["Gallagher's Sports Grill" "Gallagher's Sports Grill"]
['Grinders Coffee Company' 'Grinders Coffee Company']
["Lin's Grand Buffet"]
['Breakfast Club']
["Rico's American Grill"]
["Gallagher's Sports Grill" "Gallagher's Sports Grill"]
['Grinders Coffee Company' 'Grinders Coffee Company']
["Lin's Grand Buffet"]
['Breakfast Club']
["Rico's American Grill"]
["Gallagher's Sports Grill" "Gallagher's Sports Grill"]
1 loops, best of 3: 55.1 s per loop


In [27]:
%timeit userrecom(139,5)

["Bobby D's Restaurant & Lounge" "Bobby D's Restaurant & Lounge"]
["Rinaldi's Italian Deli"]
['Del Taco']
['Hillside Spot']
["Macayo's Mexican Kitchen"]
["Bobby D's Restaurant & Lounge" "Bobby D's Restaurant & Lounge"]
["Rinaldi's Italian Deli"]
['Del Taco']
['Hillside Spot']
["Macayo's Mexican Kitchen"]
["Bobby D's Restaurant & Lounge" "Bobby D's Restaurant & Lounge"]
["Rinaldi's Italian Deli"]
['Del Taco']
['Hillside Spot']
["Macayo's Mexican Kitchen"]
["Bobby D's Restaurant & Lounge" "Bobby D's Restaurant & Lounge"]
["Rinaldi's Italian Deli"]
['Del Taco']
['Hillside Spot']
["Macayo's Mexican Kitchen"]
1 loops, best of 3: 54 s per loop


# Evaluation 
## Validating users

In [28]:
def twenty_validate_user(dataMat, user, test_ratio, estMethod=userpred1, simMeas=pearsSim):
    '''This function tests error of userpred1 for a ratio of the user"s ratings'''
    number_of_items = np.shape(dataMat)[1]
    rated_items_by_user = np.array([i for i in range(number_of_items) if dataMat[user,i]>0]) # nonzero ratings for that user
    test_size = math.ceil(test_ratio * len(rated_items_by_user)) # size of portion of nonzero ratings for that user. Ceiling rounds up.
    '''Note: We are using ceiling because some users only have 1 rating. These users will have "test" size =1 and this 
    function will return the error for that 1 rating. '''
    test_indices = np.random.randint(0, len(rated_items_by_user), test_size)
    withheld_items = rated_items_by_user[test_indices] # testing set
    original_user_profile = np.copy(dataMat[user]) 
    
    if len(withheld_items)==1:
        error_u = estMethod(user, withheld_items) - original_user_profile[withheld_items]
        count_u = 1
        MAE = error_u
    else:
        dataMat[user, withheld_items] = 0 # So that the withheld test items is not used in the rating estimation below
        error_u = 0.0
        count_u = len(withheld_items) # size of testing set 
        #print "withheld", withheld_items
        
    # Compute absolute error for user u over all test items
        for item in withheld_items:
            # Estimate rating on the withheld item
            estimatedScore = estMethod(user,item)
            error_u = error_u + abs(estimatedScore - original_user_profile[item])
            #print "original", original_user_profile[item]
            #print item
            #print "estScore", estimatedScore
            #print "current error", error_u
        MAE=error_u/len(withheld_items)

    # Now restore ratings of the withheld items to the user profile
        for item in withheld_items:
            dataMat[user, item] = original_user_profile[item]

    # Return sum of absolute errors, count of test cases, and MAE for this user
    # these will have to be accumulated for each user to compute the MAE in the test function that follows 
    return error_u, count_u, MAE

In [29]:
error_u, count_u, MAE = twenty_validate_user(newVarray, 0, 0.2, estMethod=userpred1, simMeas=pearsSim)
print "Validate User 0: total User0 error is", error_u, ', total rating count is:', count_u, ", MAE :", MAE
error_u, count_u, MAE = twenty_validate_user(newVarray, 1, 0.2, estMethod=userpred1)
print "Validate User 1: total User 1 error is", error_u, ', total rating count is:', count_u, ", MAE :", MAE
error_u, count_u, MAE=twenty_validate_user(newVarray, 31, 0.2, estMethod=userpred1)
print "Validate User 31: total User 31 error is", error_u, ', total rating count is:', count_u, ", MAE :", MAE
error_u, count_u, MAE=twenty_validate_user(newVarray, 109, 0.2, estMethod=userpred1)
print "Validate User 109: total User 109 error is", error_u, ', total rating count is:', count_u, ", MAE :", MAE
error_u, count_u, MAE=twenty_validate_user(newVarray, 72, 0.2, estMethod=userpred1)
print "Validate User 72: total User 72 error is", error_u, ', total rating count is:', count_u, ", MAE :", MAE

NameError: global name 'math' is not defined

#Evaluation
##Test

In [104]:
import math as math

In [135]:
def test(dataMat, test_ratio, estMethod=userpred1):
    '''This function gives overall MAE for estMethod on all users.'''
    totalerror=0 # sum of errors across all test cases for all users 
    totalcount=0 # total number of test cases for all users 
    for user in range(len(dataMat)): # iterate over all users
        error, count, MAE = twenty_validate_user(dataMat, user, test_ratio, estMethod)
        totalerror+=error # error is the sum of absolute errors for this user
        totalcount+=count # count is the number of test cases for this user 
    TotMAE=totalerror/totalcount # the ratio of total error across all test cases to the total number of test cases, for all users
    print 'Mean Absoloute Error for ',estMethod,' : ', TotMAE

In [136]:
test(newVarray, 0.2, estMethod=userpred1)

Mean Absoloute Error for  <function userpred1 at 0x0000000011AF9C18>  :  [ 0.38646296]


In [None]:
## took 2 hr to ran the test functions. 