In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.corpus import stopwords 
from nltk.tokenize import WordPunctTokenizer

In [2]:
import warnings
warnings.filterwarnings('ignore')

In [3]:
df = pd.read_csv('Merged_dataset_recent.csv', encoding = "ISO-8859-1")

In [4]:
df= df.head(20000)

In [5]:
##for cleaning the text by removing stop words and coverting to readable format using regex
import string
import re
def Clean_the_pooled_text(text):
    text = text.translate(string.punctuation)
    
    ## Convert words to lower case and split them
    text = text.lower().split()
    
    ## Remove stop words
    stops = set(stopwords.words("english"))
    text = [w for w in text if not w in stops and len(w) >= 3]
    
    text = " ".join(text)
    
    
    text = re.sub(r"[^A-Za-z0-9^,!.\/'+-=]", " ", text)
    text = re.sub(r"what's", "what is ", text)
    text = re.sub(r"\'s", " ", text)
    text = re.sub(r"\'ve", " have ", text)
    text = re.sub(r"n't", " not ", text)
    text = re.sub(r"i'm", "i am ", text)
    text = re.sub(r"\'re", " are ", text)
    text = re.sub(r"\'d", " would ", text)
    text = re.sub(r"\'ll", " will ", text)
    text = re.sub(r",", " ", text)
    text = re.sub(r"\.", " ", text)
    text = re.sub(r"!", " ! ", text)
    text = re.sub(r"\/", " ", text)
    text = re.sub(r"\^", " ^ ", text)
    text = re.sub(r"\+", " + ", text)
    text = re.sub(r"\-", " - ", text)
    text = re.sub(r"\=", " = ", text)
    text = re.sub(r"'", " ", text)
    text = re.sub(r"(\d+)(k)", r"\g<1>000", text)
    text = re.sub(r":", " : ", text)
    text = re.sub(r" e g ", " eg ", text)
    text = re.sub(r" b g ", " bg ", text)
    text = re.sub(r" u s ", " american ", text)
    text = re.sub(r"\0s", "0", text)
    text = re.sub(r" 9 11 ", "911", text)
    text = re.sub(r"e - mail", "email", text)
    text = re.sub(r"j k", "jk", text)
    text = re.sub(r"\s{2,}", " ", text)    
    return text

In [6]:
final_data = df[['name', 'user_id', 'stars', 'text']]

In [7]:
%%time
final_data['text'] = final_data['text'].apply(Clean_the_pooled_text)

Wall time: 17.5 s


In [8]:
#Split train test
vld_size=0.15
X_train, X_valid, y_train, y_valid = train_test_split(final_data['text'], df['name'], test_size = vld_size) 

In [9]:
user_dataframe = final_data[['user_id','text']]
business_dataframe = final_data[['name', 'text']]

In [11]:
user_dataframe = user_dataframe.groupby('user_id').agg({'text': ' '.join})
business_dataframe = business_dataframe.groupby('name').agg({'text': ' '.join})

In [13]:
#user text vectorizer
user_text_vectorizer = TfidfVectorizer(tokenizer = WordPunctTokenizer().tokenize, max_features=1000)
user_text_vectors = user_text_vectorizer.fit_transform(user_dataframe['text'])
user_text_vectors.shape


(17914, 1000)

In [16]:
#Business text vectorizer
business_text_vectorizer = TfidfVectorizer(tokenizer = WordPunctTokenizer().tokenize, max_features=1000)
business_text_vectors = business_text_vectorizer.fit_transform(business_dataframe['text'])
business_text_vectors.shape

(117, 1000)

In [17]:
user_rating_matrix = pd.pivot_table(final_data, values='stars', index=['user_id'], columns=['name'])
user_rating_matrix.shape

(17914, 117)

In [18]:
user_rating_matrix.head()

name,8 Noodle Bar,Aces & Ales,Argana,BLT Steak,Barista Bagels,Bavette's Steakhouse & Bar,Beer Park,Black Bear Diner,Blizz,Blueberry Hill Family Restaurant,...,Toddy Shop,Vegas Uncork'd: The Grand Tasting,Viva Salsa,Voodoo Pizza,Wildburger,Winchell's Donut House,Winchell's Pub & Grill,"Wok & Roll ""TAPSILOGAN""",Woo Chun Korean BBQ,Yummy Grill & Sushi
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
--IFH_sbTkfXxbXO4nSEyQ,,,,,,,,,,,...,,,,,,,,,,
--RlSfc-QmcHFGHyX6aVjA,,,,,,,,,,,...,,,,,,,,3.0,,
--ZNfWKj1VyVElRx6-g1fg,,,,,,,,,,,...,,,,,,,,,,
-0DzTWU5ZKy-4yErEziMUQ,,,,,,,,,,,...,,,,,,,,,,
-0GfW6bw64XRcI6fWvUQ6g,,,,,,,,,,,...,,,,,,,,,,


In [19]:
user_matrix = pd.DataFrame(user_text_vectors.toarray(), index=user_dataframe.index, columns=user_text_vectorizer.get_feature_names())
business_matrix = pd.DataFrame(business_text_vectors.toarray(), index=business_dataframe.index, columns=business_text_vectorizer.get_feature_names())


In [20]:
user_matrix

Unnamed: 0_level_0,!,+,-,00,1,10,100,11,12,15,...,write,wrong,year,years,yelp,yes,yet,you,yum,yummy
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
--IFH_sbTkfXxbXO4nSEyQ,0.634751,0.181406,0.000000,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,...,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
--RlSfc-QmcHFGHyX6aVjA,0.000000,0.491293,0.212330,0.0,0.0,0.117587,0.0,0.0,0.0,0.0,...,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
--ZNfWKj1VyVElRx6-g1fg,0.158169,0.508537,0.065935,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,...,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
-0DzTWU5ZKy-4yErEziMUQ,0.000000,0.298149,0.128856,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,...,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
-0GfW6bw64XRcI6fWvUQ6g,0.000000,0.111151,0.000000,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,...,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
zzNDm2chjIw0WAN21CRPWg,0.000000,0.000000,0.155403,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,...,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
zzVUc2funSKmaIcJtiRIIQ,0.535175,0.000000,0.000000,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,...,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
zzVgjcvRlPHUcddrAnYNIg,0.130475,0.000000,0.000000,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,...,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
zza0_wCO1Q4bw0ZBtXnlsQ,0.000000,0.000000,0.000000,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,...,0.0,0.326893,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [21]:
business_matrix

Unnamed: 0_level_0,!,+,-,00,1,10,100,11,12,15,...,write,wrong,year,years,yelp,yes,yet,you,yum,yummy
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
8 Noodle Bar,0.290974,0.421741,0.063942,0.008538,0.004857,0.010490,0.001233,0.009864,0.008628,0.004444,...,0.002314,0.010587,0.003143,0.003780,0.006927,0.007560,0.006675,0.011375,0.002338,0.013599
Aces & Ales,0.330067,0.365222,0.053892,0.000000,0.011084,0.016322,0.000000,0.000000,0.000000,0.000000,...,0.006602,0.002746,0.011953,0.016174,0.005646,0.016174,0.013602,0.016224,0.006671,0.005542
Argana,0.363976,0.446698,0.193143,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.022834,0.000000,0.000000,0.000000,0.000000
BLT Steak,0.287805,0.523309,0.112230,0.008147,0.011354,0.010350,0.013382,0.004632,0.007204,0.002783,...,0.004348,0.010044,0.008309,0.009862,0.006610,0.005917,0.007564,0.009157,0.000488,0.006893
Barista Bagels,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Winchell's Donut House,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
Winchell's Pub & Grill,0.500576,0.338774,0.059029,0.000000,0.010761,0.021129,0.000000,0.004553,0.003983,0.008206,...,0.000000,0.014216,0.003868,0.034894,0.003654,0.000000,0.000000,0.012001,0.017270,0.025109
"Wok & Roll ""TAPSILOGAN""",0.371133,0.669208,0.139564,0.000000,0.008292,0.004070,0.005263,0.005263,0.004604,0.004743,...,0.000000,0.012324,0.000000,0.000000,0.016896,0.004033,0.000000,0.013872,0.000000,0.024877
Woo Chun Korean BBQ,0.233486,0.077829,0.206498,0.000000,0.000000,0.000000,0.070080,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.056248,0.000000,0.054203,0.046180,0.000000,0.000000


In [22]:
def matrix_factorization(R, P, Q, steps=1, gamma=0.001,lamda=0.02):
    for step in range(steps):
        for i in R.index:
            for j in R.columns:
                if R.loc[i,j]>0:
                    eij=R.loc[i,j]-np.dot(P.loc[i],Q.loc[j])
                    P.loc[i]=P.loc[i]+gamma*(eij*Q.loc[j]-lamda*P.loc[i])
                    Q.loc[j]=Q.loc[j]+gamma*(eij*P.loc[i]-lamda*Q.loc[j])
        e=0
        for i in R.index:
            for j in R.columns:
                if R.loc[i,j]>0:
                    e= e + pow(R.loc[i,j]-np.dot(P.loc[i],Q.loc[j]),2)+lamda*(pow(np.linalg.norm(P.loc[i]),2)+pow(np.linalg.norm(Q.loc[j]),2))
        if e<0.001:
            break
        
    return P,Q


In [23]:

P, Q = matrix_factorization(user_rating_matrix, user_matrix, business_matrix, steps=1, gamma=0.001,lamda=0.02)

In [24]:

#Store P, Q and vectorizer in pickle file
import pickle
output = open('recommendation.pkl', 'wb')
pickle.dump(P,output)
pickle.dump(Q,output)
pickle.dump(user_text_vectorizer,output)
output.close()

In [28]:
input_text = 'I am carving for japanese food'
test_dataframe= pd.DataFrame([input_text], columns=['text'])
test_dataframe['text'] = test_dataframe['text'].apply(Clean_the_pooled_text)
test_vectors = user_text_vectorizer.transform(test_dataframe['text'])
test_vector_df = pd.DataFrame(test_vectors.toarray(), index=test_dataframe.index, columns=user_text_vectorizer.get_feature_names())

predictItemRating=pd.DataFrame(np.dot(test_vector_df.loc[0],Q.T),index=Q.index,columns=['Rating'])
topRecommendations=pd.DataFrame.sort_values(predictItemRating,['Rating'],ascending=[0])[:5]

topRecommendations

Unnamed: 0_level_0,Rating
name,Unnamed: 1_level_1
Taiga Modern Japanese & Thai Restaurant,0.337171
Sushi Takashi,0.234392
HEIGHT japanese cutlet,0.231047
Yummy Grill & Sushi,0.161671
Le Thai,0.152989


In [29]:
topRecommendations.to_csv('cuban_food.csv', encoding = 'utf-8', index = True)

In [30]:
f = open('recommendation.pkl', 'rb')
P, Q, userid_vectorizer = pickle.load(f), pickle.load(f), pickle.load(f)

In [32]:
test_df = X_valid.to_frame()
test_df['text'] = test_df['text'].apply(Clean_the_pooled_text)
test_vectors = userid_vectorizer.transform(test_df['text'])
test_v_df = pd.DataFrame(test_vectors.toarray(), index=test_df.index,
                         columns=userid_vectorizer.get_feature_names())


In [33]:
y_pred = []
for key, row in test_v_df.iterrows():
    predictItemRating=pd.DataFrame(np.dot(row,Q.T),index=Q.index,columns=['Rating'])
    topRecommendations=pd.DataFrame.sort_values(predictItemRating,['Rating'],ascending=[0])[:1]
    y_pred.append(topRecommendations.index[0])

In [34]:
#Calculate Accuracy
from sklearn.metrics import accuracy_score
print('Accuracy for validation set is: ',accuracy_score(y_valid, y_pred))

Accuracy for validation set is:  0.5023333333333333


In [35]:
cat = pd.read_csv('business.csv', encoding = "ISO-8859-1")
cat.head()

Unnamed: 0.1,Unnamed: 0,business_id,name,address,city,state,postal_code,latitude,longitude,stars,review_count,is_open,attributes,categories,hours
0,0,f9NumwFMBDn751xgFiRbNA,The Range At Lake Norman,10913 Bailey Rd,Cornelius,NC,28031,35.462724,-80.852612,3.5,36,1,"{'BusinessAcceptsCreditCards': 'True', 'BikePa...","Active Life, Gun/Rifle Ranges, Guns & Ammo, Sh...","{'Monday': '10:0-18:0', 'Tuesday': '11:0-20:0'..."
1,1,Yzvjg0SayhoZgCljUJRF9Q,"Carlos Santo, NMD","8880 E Via Linda, Ste 107",Scottsdale,AZ,85258,33.569404,-111.890264,5.0,4,1,"{'GoodForKids': 'True', 'ByAppointmentOnly': '...","Health & Medical, Fitness & Instruction, Yoga,...",
2,2,XNoUzKckATkOD1hP6vghZg,Felinus,3554 Rue Notre-Dame O,Montreal,QC,H4C 1P4,45.479984,-73.58007,5.0,5,1,,"Pets, Pet Services, Pet Groomers",
3,3,6OAZjbxqM5ol29BuHsil3w,Nevada House of Hose,1015 Sharp Cir,North Las Vegas,NV,89030,36.219728,-115.127725,2.5,3,0,"{'BusinessAcceptsCreditCards': 'True', 'ByAppo...","Hardware Stores, Home Services, Building Suppl...","{'Monday': '7:0-16:0', 'Tuesday': '7:0-16:0', ..."
4,4,51M2Kk903DFYI6gnB5I6SQ,USE MY GUY SERVICES LLC,4827 E Downing Cir,Mesa,AZ,85205,33.428065,-111.726649,4.5,26,1,"{'BusinessAcceptsCreditCards': 'True', 'ByAppo...","Home Services, Plumbing, Electricians, Handyma...","{'Monday': '0:0-0:0', 'Tuesday': '9:0-16:0', '..."


In [36]:

cat[cat['name']=="Cannery Row Buffet"]


Unnamed: 0.1,Unnamed: 0,business_id,name,address,city,state,postal_code,latitude,longitude,stars,review_count,is_open,attributes,categories,hours
30958,30958,-P8dGzSVhJi-5oZ-8U2y0w,Cannery Row Buffet,5255 Boulder Hwy,Las Vegas,NV,89122,36.108417,-115.057084,2.5,63,0,"{'BusinessAcceptsCreditCards': 'True', 'Restau...","Buffets, Restaurants, Diners","{'Monday': '16:0-20:0', 'Tuesday': '16:0-20:0'..."
173453,173453,oqSl1UbxD6n9aFnQiYY96w,Cannery Row Buffet,2121 E Craig Rd,North Las Vegas,NV,89030,36.23855,-115.119088,2.5,139,1,"{'RestaurantsGoodForGroups': 'True', 'NoiseLev...","Restaurants, Buffets","{'Monday': '11:0-20:0', 'Tuesday': '11:0-20:0'..."


In [227]:
for i in topRecommendations.index:
    print('Restaurant name is  '+cat[cat['name']==i]['name'].iloc[0])
    print('Categories  are '+cat[cat['name']==i]['categories'].iloc[0])
    print('rating ='+str(cat[cat['name']==i]['stars'].iloc[0])+'   '+ 'number of reviews= '+str(cat[cat['name']==i]['review_count'].iloc[0]))
    print('')
topRecommendations

Restaurant name is  Taiga Modern Japanese & Thai Restaurant
Categories  are Restaurants, Thai, Japanese
rating =4.0   number of reviews= 48

Restaurant name is  Sushi Takashi
Categories  are Restaurants, Sushi Bars, Izakaya, Japanese, Tapas/Small Plates
rating =4.5   number of reviews= 455

Restaurant name is  HEIGHT japanese cutlet
Categories  are Restaurants, Food, Japanese Curry, Japanese, Specialty Food
rating =4.5   number of reviews= 73

Restaurant name is  Yummy Grill & Sushi
Categories  are Chinese, Restaurants, Japanese, Hawaiian
rating =3.5   number of reviews= 208

Restaurant name is  Le Thai
Categories  are Nightlife, Restaurants, Beer Gardens, Thai
rating =4.0   number of reviews= 1951



Unnamed: 0_level_0,Rating
name,Unnamed: 1_level_1
Taiga Modern Japanese & Thai Restaurant,0.337171
Sushi Takashi,0.234392
HEIGHT japanese cutlet,0.231047
Yummy Grill & Sushi,0.161671
Le Thai,0.152989
