<a href="https://colab.research.google.com/github/Zaryn-Ooi/Food-Recommendation-System/blob/main/RecoSys.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Building a Restaurant Recommendation System 
Aim: Build a recommendation system that helps users decide where to eat based on their preferences.  

Data Source: Yelp Dataset

Data Content: 

- 1st dataset: User review data 
- 2nd dataset: Restaurant description (Eg.name, address, location, ratings)



## Import Packages and Dataset 


In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.corpus import stopwords 
from nltk.tokenize import WordPunctTokenizer

#### User Review Dataset Overview

In [None]:
# Load the user reviews from the mounted Google Drive folder.
df_review = pd.read_csv('/content/drive/My Drive/review.csv')
# Drop the first column (presumably an index column saved by an earlier export — TODO confirm).
df_review = df_review.iloc[:, 1:]
# Show the first review as a quick check of the available columns.
df_review.iloc[0]

review_id                                 h_BiERAWsreJmpEnwg9gmw
user_id                                   J5sT9OGM_HIEO1jnyAItgw
business_id                               _C7QiQQc47AOEv4PE3Kong
stars                                                          2
useful                                                         0
funny                                                          1
cool                                                           0
text           I had heard so much about this place but never...
date2                                                 2015-06-23
Name: 0, dtype: object

In [None]:
df_review.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 246622 entries, 0 to 246621
Data columns (total 9 columns):
 #   Column       Non-Null Count   Dtype 
---  ------       --------------   ----- 
 0   review_id    246622 non-null  object
 1   user_id      246622 non-null  object
 2   business_id  246622 non-null  object
 3   stars        246622 non-null  int64 
 4   useful       246622 non-null  int64 
 5   funny        246622 non-null  int64 
 6   cool         246622 non-null  int64 
 7   text         246622 non-null  object
 8   date2        246622 non-null  object
dtypes: int64(4), object(5)
memory usage: 16.9+ MB


In [None]:
len(df_review)

246622

In [None]:
df_review.shape

(246622, 9)

In [None]:
df_review.head()

Unnamed: 0,review_id,user_id,business_id,stars,useful,funny,cool,text,date2
0,h_BiERAWsreJmpEnwg9gmw,J5sT9OGM_HIEO1jnyAItgw,_C7QiQQc47AOEv4PE3Kong,2,0,1,0,I had heard so much about this place but never...,2015-06-23
1,abFdLPoawYFKOF4sYv2rdA,oGFSMLwrVZFe4JL5b_YOfw,al3Ri6TEqa2rBzjHsn0T_g,3,1,1,0,I was really looking forward to Merchant becau...,2013-07-16
2,k-rcig_9hU-tRHts-P_Vtw,88IP8keHlnsCT4ksZNs5mw,M2h3vq8WeSiQAmyJtrPvbQ,5,0,0,0,I had an excellent meal here and the service w...,2016-03-20
3,GNk0ybEjcpAF8rf8H5LqdA,whASopgK0XTo4fKA78uSaw,Om9eoEcwPK1lp1-HEjBzeQ,2,0,0,0,Very disappointing. We give it a 2 star instea...,2017-03-06
4,kZ7_MR9-XEkhnzzU2Li2eg,WNRmgAmR_LR-ey8bOzJ8Og,U3grYFIeu6RgAAQgdriHww,3,0,0,0,"Long line way before it opens\nLots of Stairs,...",2017-08-18


#### Restaurant Description Dataset Overview

In [None]:
df_business = pd.read_json('/content/drive/My Drive/business.json', lines=True)

In [None]:
len(df_business)

150346

In [None]:
# Business IDs that appear in the review data (one entry per review, duplicates included).
business_list = df_review['business_id'].tolist()

In [None]:
df_business = df_business[df_business['business_id'].isin(business_list)]
len(df_business)

11797

In [None]:
df_business.shape

(11797, 14)

In [None]:
df_business.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 11797 entries, 30070 to 45104
Data columns (total 14 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   business_id   11797 non-null  object 
 1   name          11797 non-null  object 
 2   address       11797 non-null  object 
 3   city          11797 non-null  object 
 4   state         11797 non-null  object 
 5   postal_code   11797 non-null  object 
 6   latitude      11797 non-null  float64
 7   longitude     11797 non-null  float64
 8   stars         11797 non-null  float64
 9   review_count  11797 non-null  int64  
 10  is_open       11797 non-null  int64  
 11  attributes    10993 non-null  object 
 12  categories    11791 non-null  object 
 13  hours         9891 non-null   object 
dtypes: float64(3), int64(2), object(9)
memory usage: 1.4+ MB


In [None]:
df_business.head()

Unnamed: 0,business_id,name,address,city,state,postal_code,latitude,longitude,stars,review_count,is_open,attributes,categories,hours
30070,zOllte2g-139fzkcdgnu6w,Judge's Bar-B-Que,2104 W Michigan St,Indianapolis,IN,46222,39.774852,-86.197124,4.0,55,0,"{'BusinessParking': '{'garage': False, 'street...","Restaurants, Barbeque, Event Planning & Servic...","{'Monday': '11:0-16:0', 'Tuesday': '11:0-16:0'..."
30071,qMBL1awgwxreifGsm-kZ8g,Starlight Moving Co.,2801 Foster Ave,Nashville,TN,37210,36.154232,-86.737061,4.0,16,0,{'BusinessAcceptsCreditCards': 'True'},"Movers, Home Services","{'Monday': '9:0-18:0', 'Tuesday': '9:0-18:0', ..."
30074,tmqVB8UchkzsMFq9xnpyGQ,Life is Good Chiropractic,"190 W Magee Rd, Ste 152",Oro Valley,AZ,85704,32.352788,-110.972523,5.0,17,1,"{'ByAppointmentOnly': 'False', 'RestaurantsPri...","Chiropractors, Massage Therapy, Massage, Vitam...","{'Monday': '13:0-16:0', 'Wednesday': '13:0-16:..."
30075,tQUkoSqK59m0jXNWL4QyiQ,East Side Smiles,7 N 10th St,Nashville,TN,37206,36.178629,-86.751675,4.0,40,1,"{'ByAppointmentOnly': 'True', 'AcceptsInsuranc...","Health & Medical, General Dentistry, Dentists,...","{'Monday': '0:0-0:0', 'Tuesday': '8:0-17:0', '..."
30076,t72SvAiBRX1dKO8D8PvWrg,United States Post Office,"132 N Kansas St, Ste 1",Edwardsville,IL,62025,38.812945,-89.954296,3.0,5,1,,"Public Services & Government, Post Offices","{'Monday': '8:30-17:30', 'Tuesday': '8:30-17:3..."


## Data Cleaning

In [None]:
df = df_review[['business_id', 'user_id', 'stars', 'text']]

In [None]:
len(df)

246622

In [None]:
# Keep only the first 30,000 reviews (presumably to keep the dense TF-IDF matrices
# and the training loop manageable — adjust if more memory/time is available).
# .copy() detaches the subset from df_review, so the column assignments in later
# cells modify an independent frame instead of raising SettingWithCopyWarning.
df = df.head(30000).copy()

In [None]:
len(df)

30000

In [None]:
# Remove punctuation
import string 
punctuation = string.punctuation
# Translation table that deletes every ASCII punctuation character; built once
# instead of on every call.
_PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation)

def remove_punctuation(sentence):
  """Return `sentence` with all ASCII punctuation characters removed.

  The review text contains literal escape sequences (a backslash followed by
  `n`, `r` or `t`). Deleting only the backslash would glue the neighbouring
  words together ("opens\\nLots" -> "opensnLots"), so these sequences are
  replaced by a space before punctuation is stripped.

  Args:
    sentence: The text to clean (must be a `str`).

  Returns:
    The cleaned text; real whitespace characters are left untouched.
  """
  for escaped in ('\\n', '\\r', '\\t'):
    sentence = sentence.replace(escaped, ' ')
  return sentence.translate(_PUNCTUATION_TABLE)

df['text'] = df['text'].apply(lambda sentence: remove_punctuation(sentence))


In [None]:
df.head(6)

Unnamed: 0,business_id,user_id,stars,text
0,_C7QiQQc47AOEv4PE3Kong,J5sT9OGM_HIEO1jnyAItgw,2,I had heard so much about this place but never...
1,al3Ri6TEqa2rBzjHsn0T_g,oGFSMLwrVZFe4JL5b_YOfw,3,I was really looking forward to Merchant becau...
2,M2h3vq8WeSiQAmyJtrPvbQ,88IP8keHlnsCT4ksZNs5mw,5,I had an excellent meal here and the service w...
3,Om9eoEcwPK1lp1-HEjBzeQ,whASopgK0XTo4fKA78uSaw,2,Very disappointing We give it a 2 star instead...
4,U3grYFIeu6RgAAQgdriHww,WNRmgAmR_LR-ey8bOzJ8Og,3,Long line way before it opensnLots of Stairs n...
5,dNFGbezaxeBR71BdpspKqQ,qxV2hic1JnrwEU_8afWZng,4,This place definitely exceeds my expectation I...


In [None]:
# Drop rows whose business or user ID is '#NAME?' or '#VALUE!' (these look like
# spreadsheet error values that replaced the real ID — TODO confirm the origin).
# A single mask replaces four separate filtering passes, and .copy() makes the
# result an independent frame so later column assignments do not warn.
invalid_ids = ['#NAME?', '#VALUE!']
has_invalid_id = df['business_id'].isin(invalid_ids) | df['user_id'].isin(invalid_ids)
df = df[~has_invalid_id].copy()

In [None]:
df["text"] = df["text"].str.lower()

In [None]:
import spacy
# Load spaCy's small English pipeline.
nlp = spacy.load('en_core_web_sm')
# NOTE(review): `nlp` is not referenced by any later cell visible here; the
# stop-word removal below uses NLTK's list instead — confirm whether this cell is still needed.
# Add 'i' to the set of stop words.
nlp.Defaults.stop_words.add('i')

# Set the stop_word tag on the lexeme
nlp.vocab['i'].is_stop = True

In [None]:
# remove stopwords
import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')

# NLTK's English list spells contractions with an apostrophe ("wasn't", "don't"),
# but punctuation has already been stripped from the reviews at this point, so
# the apostrophe-free spellings ("wasnt", "dont") are added as well.
STOPWORDS = set(stopwords.words('english'))
STOPWORDS |= {word.replace("'", "") for word in STOPWORDS}

def remove_stopwords(sentence):
    """Return `sentence` without the words listed in STOPWORDS.

    The input is converted with `str()`, split on whitespace, filtered, and
    re-joined with single spaces. Matching is case-sensitive, so the text is
    expected to be lower-cased beforehand.
    """
    return " ".join(word for word in str(sentence).split() if word not in STOPWORDS)

df["text"] = df["text"].apply(lambda sentence: remove_stopwords(sentence))

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [None]:
df.head(10)

Unnamed: 0,business_id,user_id,stars,text
0,_C7QiQQc47AOEv4PE3Kong,J5sT9OGM_HIEO1jnyAItgw,2,heard much place never opportunity spare money...
1,al3Ri6TEqa2rBzjHsn0T_g,oGFSMLwrVZFe4JL5b_YOfw,3,really looking forward merchant reviews wasnt ...
2,M2h3vq8WeSiQAmyJtrPvbQ,88IP8keHlnsCT4ksZNs5mw,5,excellent meal service amazing highly recommen...
3,Om9eoEcwPK1lp1-HEjBzeQ,whASopgK0XTo4fKA78uSaw,2,disappointing give 2 star instead 1 tacos bad ...
4,U3grYFIeu6RgAAQgdriHww,WNRmgAmR_LR-ey8bOzJ8Og,3,long line way opensnlots stairs elevators stro...
5,dNFGbezaxeBR71BdpspKqQ,qxV2hic1JnrwEU_8afWZng,4,place definitely exceeds expectation would def...
6,kbUeqMV5NzSHVVzJURSuBA,XGQUbwATeYtqoOTZqr3jEA,5,hotdog menu unique different easy understand p...
7,eBadstkXafVkRxZTL1Ya6A,ZNRLwPKNzp9ckkmukXREDw,4,work locally go matzoh ball soup comfort food ...
8,09izIShLukoy36bBOWfKAg,KhEVvIWMAqNPQkETvPCj3A,4,love breadsticks good also cheese dip one favo...
9,HH9x7WcNQR3cnwkHo_YL0w,vYlm2yKpeo_-oNRGoy-5dA,2,great food awful service blonde server busser ...


For each user, we combine all of their reviews into a single paragraph and then apply the TF-IDF vectorizer to extract features from the text. We take the same approach for each restaurant, and we set `max_features` on both vectorizers so that the two matrices have matching dimensions.

In [None]:
# Keep only the ID and review-text columns needed to build one document per user
# and one document per business.
user_review = df[['user_id','text']]
business_review = df[['business_id', 'text']]


In [None]:
# Join all review texts that share the same user_id into one space-separated string
# (the result is indexed by user_id).
user_review = user_review.groupby('user_id').agg({'text':' '.join})
# Same aggregation per business_id.
business_review = business_review.groupby('business_id').agg({'text':' '.join})

In [None]:
user_review.head()

Unnamed: 0_level_0,text
user_id,Unnamed: 1_level_1
-0AyZxS5C--WySnbW_Q8yQ,lot choices menu including list maxines best m...
-0KosxqqMBYNfJ7VuvBYDQ,try osso buco bacco deliciously rich melt mout...
-0LGLx8LP5dq3zcGO4Bebw,delicious meal great service many things menu ...
-0YrXUvXz8112yHap35V2g,wordoutstanding huge fan place since bought gr...
-0lknh8CLIp8XFYwnU59Ag,chriss great menu lots choices especially brea...


In [None]:
user_review.loc[['ZwVz20be-hOZnyAbevyMyQ']]['text']

user_id
ZwVz20be-hOZnyAbevyMyQ    food usually good hobbit menu back fun items t...
Name: text, dtype: object

### TFIDF Vectorizer 
To extract features from the text.

With the help of the [nltk.tokenize.WordPunctTokenizer()](https://www.geeksforgeeks.org/python-nltk-tokenize-wordpuncttokenizer/) method, we can split a string of words or sentences into tokens made of alphabetic and non-alphabetic characters.


In [None]:
# User ID vectorizer
# max_features limits the vocabulary to the 5000 most frequent terms in the corpus.
user_vectorizer = TfidfVectorizer(tokenizer=WordPunctTokenizer().tokenize, max_features=5000)
user_vectors = user_vectorizer.fit_transform(user_review['text'])

# Business ID vectorizer
# Reuse the vocabulary learned from the user documents so that column k stands for
# the same term in both matrices. Two independently fitted vectorizers only
# guarantee the same number of columns, not the same terms, which would make the
# user-by-business dot products in the factorization meaningless.
business_vectorizer = TfidfVectorizer(tokenizer=WordPunctTokenizer().tokenize, vocabulary=user_vectorizer.vocabulary_)
business_vectors = business_vectorizer.fit_transform(business_review['text'])

In [None]:
df['stars'] = df['stars'].astype(int)

In [None]:
print(df.info())

<class 'pandas.core.frame.DataFrame'>
Int64Index: 29225 entries, 0 to 29999
Data columns (total 4 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   business_id  29225 non-null  object
 1   user_id      29225 non-null  object
 2   stars        29225 non-null  int64 
 3   text         29225 non-null  object
dtypes: int64(1), object(3)
memory usage: 1.1+ MB
None


In [None]:
print(user_vectors.shape)
print(business_vectors.shape)

(26425, 5000)
(6466, 5000)


In [None]:
df

Unnamed: 0,business_id,user_id,stars,text
0,_C7QiQQc47AOEv4PE3Kong,J5sT9OGM_HIEO1jnyAItgw,2,heard much place never opportunity spare money...
1,al3Ri6TEqa2rBzjHsn0T_g,oGFSMLwrVZFe4JL5b_YOfw,3,really looking forward merchant reviews wasnt ...
2,M2h3vq8WeSiQAmyJtrPvbQ,88IP8keHlnsCT4ksZNs5mw,5,excellent meal service amazing highly recommen...
3,Om9eoEcwPK1lp1-HEjBzeQ,whASopgK0XTo4fKA78uSaw,2,disappointing give 2 star instead 1 tacos bad ...
4,U3grYFIeu6RgAAQgdriHww,WNRmgAmR_LR-ey8bOzJ8Og,3,long line way opensnlots stairs elevators stro...
...,...,...,...,...
29995,rSYyGcZZziJLsqKl5hMcDw,tYzp_iXqSZes7SgfhUDPHg,4,stayed north tower everyone customer service m...
29996,61NCACGHsNPhbi2DgJAWmg,zNz3l0bSJEAjS17j-2wA8A,4,love place clean cheerful nice staff plenty ch...
29997,Z6f5SQmGTckZ_KkKbib2VQ,rqHmh0RpDqW4rHkd5JajxQ,1,looking buffet area gps found place closed the...
29998,HJduAXxpms1kGJlpx_udoA,lbmyKWjxRFpWOIiaEkWvTw,4,actually nice park considering location entran...


In [None]:
def _vocabulary_terms(vectorizer):
    """Return the fitted vectorizer's terms in column order on old and new scikit-learn."""
    # get_feature_names() was removed in scikit-learn 1.2; get_feature_names_out()
    # is its replacement (available since 1.0).
    if hasattr(vectorizer, 'get_feature_names_out'):
        return vectorizer.get_feature_names_out()
    return vectorizer.get_feature_names()

# P: one row per user_id, one column per TF-IDF term of the user vectorizer.
P = pd.DataFrame(user_vectors.toarray(), index=user_review.index, columns=_vocabulary_terms(user_vectorizer))
# Q: one row per business_id, one column per TF-IDF term of the business vectorizer.
Q = pd.DataFrame(business_vectors.toarray(), index=business_review.index, columns=_vocabulary_terms(business_vectorizer))



In [None]:
# P = P.rename(index=df['user_id'])
# Q = Q.rename(index=df['business_id'])

In [None]:
# Create a matrix of users and business with the ratings:
# rows are user_ids, columns are business_ids, values are the star ratings;
# user/business pairs without a review are NaN.
# pivot_table's default aggfunc is the mean, so several reviews by the same user
# for the same business are averaged into one value.
userid_rating_matrix = pd.pivot_table(df, values='stars', index=['user_id'], columns=['business_id'])

In [None]:
userid_rating_matrix

business_id,-0E708CsutslDLUuuzWGYQ,-16EH6b1ho0xQqP0Bzm9Mg,-1PG6k_iezwJmRZLB7f6og,-2KK9cqa26KZ1xrfOm9A7w,-2VYztMXVorktljCdQNPmQ,-2aGyAUenQEZWAtNUnMHzg,-3LGekWQ5iHMQO1a9_DDbw,-7KAng29RoHr87mvOFbK9w,-7ezd9LpmVkow-eSUHIG9A,-8dM09lp35K28Qn8oaZPdA,...,zv1mH2hoGOuu6ALzQ-14Xg,zvzmKaltuHKPeEcBkiUp1w,zwGzwkVeYXE-tRisb8if7A,zwjWB0vnJhVcSehP_LS82A,zx5mdLeWxqTdcNyIGmaE9A,zxRmQ_FWVowh8rlzLCSURQ,zxWeLWsv6Ebrkb-MDMbD-g,zxqCNze-DDqEZIYrEMJLxg,zz0l4dUf28wzPAaTdGqsSw,zzbZtgPYZS8sTIWQH6DwEw
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
-0AyZxS5C--WySnbW_Q8yQ,,,,,,,,,,,...,,,,,,,,,,
-0KosxqqMBYNfJ7VuvBYDQ,,,,,,,,,,,...,,,,,,,,,,
-0LGLx8LP5dq3zcGO4Bebw,,,,,,,,,,,...,,,,,,,,,,
-0YrXUvXz8112yHap35V2g,,,,,,,,,,,...,,,,,,,,,,
-0lknh8CLIp8XFYwnU59Ag,,,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
zz5IHe3r6TIpXBGjQy8m7A,,,,,,,,,,,...,,,,,,,,,,
zz9YDSRIvJQkhMbs0JA2cg,,,,,,,,,,,...,,,,,,,,,,
zzSSEPbr4hkvPBOq1ZM9Yw,,,,,,,,,,,...,,,,,,,,,,
zzZTbKRe1B3VwIajVhIOaw,,,,,,,,,,,...,,,,,,,,,,


In [None]:
# Truncated Singular Value Decomposition (SVD)

# svd_preds_df = pd.DataFrame(df, columns = userid_rating_matrix.columns, index=df['user_id']).transpose()

# svd_preds_df

In [None]:
# Creating Item-Item Matrix based on Cosine Similarity
# from sklearn.metrics.pairwise import cosine_similarity
# item_item_matrix = cosine_similarity(svd_preds_df)
# item_item_matrix= pd.DataFrame(item_item_matrix, columns=svd_preds_df.index, index = svd_preds_df.index)
# item_item_matrix

In [None]:
P.loc['-0AyZxS5C--WySnbW_Q8yQ']

0       0.0
1       0.0
10      0.0
100     0.0
1000    0.0
       ... 
√       0.0
√©      0.0
√®      0.0
√°      0.0
√±      0.0
Name: -0AyZxS5C--WySnbW_Q8yQ, Length: 5000, dtype: float64

In [None]:
Q.loc['kcOAfvwLUOAYQO3x4gocqA']

0       0.000000
1       0.000000
10      0.012293
100     0.000000
1000    0.000000
          ...   
√       0.000000
√©      0.017030
√®      0.000000
√°      0.000000
√±      0.000000
Name: kcOAfvwLUOAYQO3x4gocqA, Length: 5000, dtype: float64

### Matrix Factorization
Matrix factorization is a collaborative filtering method that models the relationship between users and restaurants. 
Latent features, which capture the association between the user and restaurant matrices, are learned so that ratings can be predicted from both the user side and the item side.
The user and restaurant factor matrices are obtained by minimizing a regularized squared-error cost function (closely related to RMSE). ***Gradient descent*** is the method used to minimize this cost function.

In [None]:
def matrix_factorization(R, P, Q, steps=25, gamma=0.001,lamda=0.02):
    """Fit user and item factor matrices to the observed ratings by gradient descent.

    Each observed rating R[i, j] is approximated by the dot product of user row
    P[i] and item row Q[j]. Ratings are visited in row-major order of R and both
    rows are updated after every rating (the item update uses the already
    updated user row).

    Parameters
    ----------
    R : pandas.DataFrame
        Rating matrix with users as index and items as columns. Only entries
        greater than 0 are treated as observed; NaN and non-positive entries
        are skipped.
    P : pandas.DataFrame
        Initial user factors, indexed by the user labels used in R.
    Q : pandas.DataFrame
        Initial item factors, indexed by the item labels used in R. Columns of
        P and Q are matched by position, so both must list the same features
        in the same order.
    steps : int
        Maximum number of passes over the observed ratings.
    gamma : float
        Learning rate.
    lamda : float
        L2 regularisation strength.

    Returns
    -------
    (pandas.DataFrame, pandas.DataFrame)
        Updated copies of P and Q with the original index and columns. The
        input frames are left unmodified.

    Raises
    ------
    KeyError
        If an observed rating refers to a user missing from P or an item
        missing from Q.
    """
    ratings = R.to_numpy(dtype=float)
    # Collect the observed cells once instead of scanning the full
    # users x items grid with .loc on every step. NaN > 0 is simply False.
    with np.errstate(invalid='ignore'):
        observed = np.argwhere(ratings > 0)

    # Translate R's labels into row positions of P and Q (-1 means "missing").
    user_pos = P.index.get_indexer(R.index)
    item_pos = Q.index.get_indexer(R.columns)
    samples = []
    for r, c in observed:
        if user_pos[r] < 0:
            raise KeyError(R.index[r])
        if item_pos[c] < 0:
            raise KeyError(R.columns[c])
        samples.append((user_pos[r], item_pos[c], ratings[r, c]))

    # Work on plain float arrays: faster than Series arithmetic, and it avoids
    # label alignment silently inserting NaN when column labels differ.
    p_arr = P.to_numpy(dtype=float, copy=True)
    q_arr = Q.to_numpy(dtype=float, copy=True)

    for step in range(steps):
        for u, b, rating in samples:
            eij = rating - np.dot(p_arr[u], q_arr[b])
            p_arr[u] += gamma * (eij * q_arr[b] - lamda * p_arr[u])
            q_arr[b] += gamma * (eij * p_arr[u] - lamda * q_arr[b])

        # Regularised squared error over the observed ratings, used for early stopping.
        e = 0.0
        for u, b, rating in samples:
            residual = rating - np.dot(p_arr[u], q_arr[b])
            e += residual ** 2 + lamda * (np.dot(p_arr[u], p_arr[u]) + np.dot(q_arr[b], q_arr[b]))
        if e < 0.001:
            break

    P_fit = pd.DataFrame(p_arr, index=P.index, columns=P.columns)
    Q_fit = pd.DataFrame(q_arr, index=Q.index, columns=Q.columns)
    return P_fit, Q_fit

P, Q = matrix_factorization(userid_rating_matrix, P, Q, steps=25, gamma=0.001,lamda=0.02)

In [None]:
words = "i want to have chinese food with beautiful interior"
test_df = pd.DataFrame([words], columns=['text'])
# Clean the query the same way as the training reviews:
# strip punctuation -> lower-case -> remove stop words.
test_df['text'] = test_df['text'].apply(remove_punctuation).str.lower().apply(remove_stopwords)
test_vectors = user_vectorizer.transform(test_df['text'])

# get_feature_names() was removed in scikit-learn 1.2; prefer its replacement when available.
if hasattr(user_vectorizer, 'get_feature_names_out'):
    query_terms = user_vectorizer.get_feature_names_out()
else:
    query_terms = user_vectorizer.get_feature_names()
test_v_df = pd.DataFrame(test_vectors.toarray(), index=test_df.index, columns=query_terms)

# Line the query up with Q's columns by term, so that every query weight is
# multiplied with the factor of the same word (terms unknown to Q are dropped,
# terms missing from the query count as 0).
query_vector = test_v_df.loc[0].reindex(Q.columns, fill_value=0.0)
predictItemRating = pd.DataFrame(np.dot(query_vector, Q.T), index=Q.index, columns=['Rating'])
topRecommendations = predictItemRating.sort_values('Rating', ascending=False)[:7]

# Index the business table once instead of filtering it four times per result;
# the first row is kept for any repeated business_id.
business_by_id = df_business.drop_duplicates('business_id').set_index('business_id')
for i in topRecommendations.index:
    business = business_by_id.loc[i]
    print(business['name'])
    print(business['categories'])
    print(str(business['stars']) + ' ' + str(business['review_count']))
    print('')