In [24]:
import numpy as np
import pandas as pd
from sklearn.metrics.pairwise import pairwise_distances
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import CountVectorizer

# Load the review data and discard rows whose user_id is the literal '#NAME?' placeholder
df_train = (
    pd.read_csv('Data/restaurant_review.csv')
    .loc[lambda frame: frame['user_id'] != '#NAME?']
)

In [25]:
# Columns needed to relate users to the restaurants they rated.
# The column order matters: find_users_who_may_like_similar_business reads them by position.
df_bus_user = df_train.loc[:, ['business_id', 'name', 'user_id', 'review_rating']]
df_bus_user.tail()

Unnamed: 0,business_id,name,user_id,review_rating
2851,a6,res_6,,
2852,a7,res_7,,
2853,a8,res_8,,
2854,a9,res_9,,
2855,a10,res_10,,


In [26]:
# Keep one row per restaurant with the text fields used to build its keywords.
# De-duplicate on business_id *before* it becomes the index: drop_duplicates()
# ignores the index, so calling it afterwards would merge distinct restaurants
# that merely share the same name and categories (and lose their ids).
# Chaining also avoids the in-place set_index on a slice of df_train.
df_idNameCat = (
    df_train[['business_id', 'name', 'categories']]
    .drop_duplicates(subset = 'business_id')
    .set_index('business_id')
)
df_idNameCat.tail()

Unnamed: 0_level_0,name,categories
business_id,Unnamed: 1_level_1,Unnamed: 2_level_1
a6,res_6,"Asian Fusion,Thai,Sports Bars"
a7,res_7,"Pubs,German,Beer Bar"
a8,res_8,"Asian Fusion,Ramen,Noodles"
a9,res_9,"Indian,Himalayan/Nepalese,Vegan"
a10,res_10,"Sushi Bars,Japanese"


In [27]:
# Treat every keyword as a unigram: categories are lower-cased and split on commas,
# names are split on spaces.
def _split_keywords(text, sep, lowercase = False):
    """Split ``text`` on ``sep`` and return the pieces as a list.

    A value that is already a list is returned unchanged, so re-running this
    cell does not fail. Missing or other non-string values (e.g. NaN) become
    an empty list instead of raising AttributeError.
    """
    if isinstance(text, list):
        return text
    if not isinstance(text, str):
        return []
    if lowercase:
        text = text.lower()
    return text.split(sep)

df_idNameCat['categories'] = df_idNameCat['categories'].map(lambda cats: _split_keywords(cats, ',', lowercase = True))
df_idNameCat['name'] = df_idNameCat['name'].map(lambda name: _split_keywords(name, ' '))

In [28]:
# Combine the name and category tokens into one space-separated 'keywords' string.
# The column is built column-wise rather than by assigning into the rows yielded
# by iterrows(): those rows may be copies, in which case the assignment never
# reaches the DataFrame and 'keywords' would stay empty.
df_idNameCat['keywords'] = (
    df_idNameCat['name'].map(' '.join)
    + ' '
    + df_idNameCat['categories'].map(' '.join)
)

# Only the combined text is needed from here on
df_idNameCat = df_idNameCat[['keywords']]

In [29]:
# Inspect the result: only the combined 'keywords' column remains, indexed by business_id
df_idNameCat.tail()

Unnamed: 0_level_0,keywords
business_id,Unnamed: 1_level_1
a6,res_6 asian fusion thai sports bars
a7,res_7 pubs german beer bar
a8,res_8 asian fusion ramen noodles
a9,res_9 indian himalayan/nepalese vegan
a10,res_10 sushi bars japanese


In [30]:
# Turn the keyword strings into a matrix of token counts (one row per restaurant)
count = CountVectorizer()
count_matrix = count.fit_transform(df_idNameCat.loc[:, 'keywords'])

# Row position -> business_id lookup, in the same order as the rows of count_matrix
indices = pd.Series(df_idNameCat.index)
indices.head()

0    f-m7-hyFzkf0HSEeQ2s-9A
1    HHtpR0RslupSQ99GIIwW5A
2    ZoZjbOYR-apY8XvommlNUA
3    gqVl3RprESEqkIPeJH0yOg
4    8kck3-K4zYKTJbJko0JlXQ
Name: business_id, dtype: object

In [31]:
# Pairwise cosine similarity between the keyword count vectors of all restaurants.
# count_matrix is compared against itself, so entry [i, j] scores restaurant row i
# against restaurant row j and the diagonal is 1 (see the output below).
cosine_sim = cosine_similarity(count_matrix, count_matrix)
cosine_sim

array([[1.        , 0.        , 0.20412415, ..., 0.        , 0.        ,
        0.20412415],
       [0.        , 1.        , 0.        , ..., 0.36514837, 0.        ,
        0.        ],
       [0.20412415, 0.        , 1.        , ..., 0.4472136 , 0.        ,
        0.        ],
       ...,
       [0.        , 0.36514837, 0.4472136 , ..., 1.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 1.        ,
        0.        ],
       [0.20412415, 0.        , 0.        , ..., 0.        , 0.        ,
        1.        ]])

In [32]:
# function that takes in restaurant as input and returns the top 10 recommended restaurants
def Similar_restaurants(res_id, cosine_sim = cosine_sim, top_n = 10):
    """Return the ids of the restaurants most similar to ``res_id``.

    Parameters
    ----------
    res_id : business id of the query restaurant; looked up in the
        module-level ``indices`` Series.
    cosine_sim : square similarity matrix whose row order matches ``indices``.
    top_n : maximum number of recommendations to return (default 10).

    Returns
    -------
    list of business ids, most similar first. The query restaurant itself is
    never included.

    Raises
    ------
    IndexError if ``res_id`` is not present in ``indices``.
    """
    # Row position of the query restaurant. `indices` carries a default integer
    # index, so its labels double as row numbers of cosine_sim.
    matches = indices[indices == res_id].index
    if len(matches) == 0:
        raise IndexError('restaurant id {!r} not found in indices'.format(res_id))
    idx = matches[0]

    # Remove the query restaurant explicitly instead of assuming it sorts first:
    # another restaurant with identical keywords also scores 1.0 and could take
    # position 0, which would leave the query itself in the recommendations.
    scores = pd.Series(cosine_sim[idx]).drop(idx)

    # A stable sort keeps tied scores in row order, so results are reproducible
    ranked = scores.sort_values(ascending = False, kind = 'mergesort')

    return [indices.iloc[position] for position in ranked.iloc[:top_n].index]

In [33]:
# Recommendations for restaurant 'a6'; the list is reused below to find interested users
rec_bus_list = Similar_restaurants('a6')
rec_bus_list

['2FR395iJImCphQNGWubUtw',
 'aZ_YuxbdMB_ViExPkiXTbg',
 'TvPheRa8TfhyET_vsN2uKQ',
 'METk5cHv8bBJGxkOosw2fw',
 's10mApVcsXi6GHxOuVHEGw',
 'TdSMgDvprOcvMurDHFZdZw',
 'jON2lBVp77IT5l5AcCMjqw',
 'cjfJOMF6hpE7bX0ESWdAEg',
 'rMuLW1qVB-deU5NStYerWA',
 '8rc-DM2AqXx46X2hZq7W1A']

In [34]:
# For each similar restaurant, collect the users in df_bus_user who rated it above 3

def find_users_who_may_like_similar_business(df_bus_user, rec_bus_list, min_rating = 3):
    """Collect the users who rated any recommended restaurant above ``min_rating``.

    Parameters
    ----------
    df_bus_user : DataFrame read by column position: column 0 holds the
        business id, column 2 the user id and column 3 the review rating.
    rec_bus_list : iterable of business ids to look for.
    min_rating : a review counts only if its rating is strictly greater than
        this value (default 3).

    Returns
    -------
    set of user ids. Rows with a missing user id are ignored.
    """
    wanted_ids = list(rec_bus_list)
    if df_bus_user.empty or not wanted_ids:
        return set()

    business_ids = df_bus_user.iloc[:, 0]
    user_ids = df_bus_user.iloc[:, 2]
    ratings = df_bus_user.iloc[:, 3]

    # One vectorised pass over the frame instead of a full iterrows() scan per
    # recommended restaurant. A missing rating compares as False, so it is skipped.
    liked_a_recommendation = business_ids.isin(wanted_ids) & (ratings > min_rating)

    return set(user_ids[liked_a_recommendation & user_ids.notna()])

In [35]:
# Users who rated at least one of the recommended restaurants above 3
high_star_rating_users_who_may_like = find_users_who_may_like_similar_business(df_bus_user, rec_bus_list)

In [37]:
# Users who might like res_id: each rated a similar restaurant above 3,
# so res_id is the restaurant that would be recommended to them
high_star_rating_users_who_may_like

{'1gdZQ47zus0DsW-F9hdGmQ',
 '3hSGiWeU55-t3ef_dsIqkw',
 'C3YcMYonAvBKZAD2uUnfig',
 'Lf1vReyrKC0fI63AWiP23g',
 'LljZPVdvobWxgbAaXDwM6Q',
 'MOgz0_VkT9AOJFQ5r33Y7A',
 'PrkYAAWHpcNLn0X1sVP2Ig',
 'RG91_Obi7yhHKAs5tUYgDQ',
 'ReoITf9K798Y_1aat7zh-Q',
 'SgUv6nrd1uKtDvppvOmP-A',
 'WV7LEO40IMjWBrKXzGMlmQ',
 'ZfUM2xZQcq2ymMyxnh4skg',
 'eIa4QJXp5ZNmuFORnuPvDA',
 'jCaKs16lbEjuc9n02GTrsQ',
 'pYLh_NnbD0kY8Gxa6R8egA',
 'rrEP6uLkw5QPCcC8JNSqxw',
 'uQCqaGoFCIbZdZgtLmNWnw',
 'x4CKCX971-iuoKFgfDUh1w',
 'xkoRni5Ubm-Ic9Bj6uDuiQ',
 'yZM8bO0dbhffVp6RCTFN5w',
 'yZY3qBVGRlS-m-iEp-Zwdw'}