In [1]:
import pandas as pd
import numpy as np
import nltk
import re
from nltk.corpus import stopwords
from sklearn.metrics.pairwise import linear_kernel
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
import tkinter as tk
from tkinter import messagebox
import warnings
warnings.filterwarnings('ignore')

In [2]:
data = pd.read_csv('zomato.csv')

In [3]:
data.head(3)

Unnamed: 0,url,address,name,online_order,book_table,rate,votes,phone,location,rest_type,dish_liked,cuisines,approx_cost(for two people),reviews_list,menu_item,listed_in(type),listed_in(city)
0,https://www.zomato.com/bangalore/jalsa-banasha...,"942, 21st Main Road, 2nd Stage, Banashankari, ...",Jalsa,Yes,Yes,4.1/5,775,080 42297555\r\n+91 9743772233,Banashankari,Casual Dining,"Pasta, Lunch Buffet, Masala Papad, Paneer Laja...","North Indian, Mughlai, Chinese",800,"[('Rated 4.0', 'RATED\n A beautiful place to ...",[],Buffet,Banashankari
1,https://www.zomato.com/bangalore/spice-elephan...,"2nd Floor, 80 Feet Road, Near Big Bazaar, 6th ...",Spice Elephant,Yes,No,4.1/5,787,080 41714161,Banashankari,Casual Dining,"Momos, Lunch Buffet, Chocolate Nirvana, Thai G...","Chinese, North Indian, Thai",800,"[('Rated 4.0', 'RATED\n Had been here for din...",[],Buffet,Banashankari
2,https://www.zomato.com/SanchurroBangalore?cont...,"1112, Next to KIMS Medical College, 17th Cross...",San Churro Cafe,Yes,No,3.8/5,918,+91 9663487993,Banashankari,"Cafe, Casual Dining","Churros, Cannelloni, Minestrone Soup, Hot Choc...","Cafe, Mexican, Italian",800,"[('Rated 3.0', ""RATED\n Ambience is not that ...",[],Buffet,Banashankari


In [4]:
#Droping NaN and duplicates
data = data.drop(['url','dish_liked','phone'],axis=1) 
data.dropna(how='any',inplace=True)
data.drop_duplicates().reset_index(drop=True)
data.shape

(43533, 14)

In [5]:
#Changing the column names
data = data.rename(columns = {'approx_cost(for two people)':'cost', 'reviews_list':'reviews',
                       'listed_in(type)':'type', 'listed_in(city)':'city'})

#Transformations
data['cost'] = data['cost'].astype(str) #Changing cost to string
data['cost'] = data['cost'].apply(lambda x: x.replace(',','.')) #Replace ',' in cost
data['cost'] = data['cost'].astype(float)

#Removing '/5' from Rates
data = data.loc[data.rate !='NEW']
data = data.loc[data.rate !='-'].reset_index(drop=True)
remove_slash = lambda x: x.replace('/5', '') if type(x) == np.str else x
data.rate = data.rate.apply(remove_slash).astype('float')

# Adjust the column names
data.name = data.name.apply(lambda x:x.title())
data.online_order.replace('Yes',True,inplace=True)
data.online_order.replace('No',False,inplace=True)
data.book_table.replace('Yes',True,inplace=True)
data.book_table.replace('No',False,inplace=True)

## Compute Mean Rating
restaurants = list(data['name'].unique())
data['Mean Rating'] = 0

for i in range(len(restaurants)):
    data['Mean Rating'][data['name'] == restaurants[i]] = data['rate'][data['name'] == restaurants[i]].mean()
    
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler(feature_range = (1,5)) #scales the rating to a value between 1 and 5
data[['Mean Rating']] = scaler.fit_transform(data[['Mean Rating']]).round(2)

In [6]:
# Lower casing
data["reviews"] = data["reviews"].str.lower()

#Removing Puctuation
import string
PUNCT_TO_REMOVE = string.punctuation
def remove_punctuation(text):
    return text.translate(str.maketrans('', '', PUNCT_TO_REMOVE))

data["reviews"] = data["reviews"].apply(lambda text: remove_punctuation(text))

#Removing Stopwords
STOPWORDS = set(stopwords.words('english'))
def remove_stopwords(text):
    return " ".join([word for word in str(text).split() if word not in STOPWORDS])

data["reviews"] = data["reviews"].apply(lambda text: remove_stopwords(text))

#Removing URLS
def remove_urls(text):
    url_pattern = re.compile(r'https?://\S+|www\.\S+')
    return url_pattern.sub(r'', text)

data["reviews"] = data["reviews"].apply(lambda text: remove_urls(text))

data[['reviews', 'cuisines']].sample(5)

Unnamed: 0,reviews,cuisines
12167,rated 50 ratedn mangalore pearl place go cravi...,"Mangalorean, Seafood"
71,rated 50 ratedn people always hunt biryanis pe...,"Biryani, Chinese, Kebab"
1962,rated 40 ratedn looking healthy yet tasty food...,"Cafe, Continental, Beverages, Healthy Food, De..."
2022,rated 50 ratedn quite easy pocket place serves...,Biryani
29223,rated 30 ratedn wellnhad meetup friendsnthis c...,"Mughlai, North Indian"


In [7]:
# Filter df to only unique restaurant names 
restaurant_names = list(data['name'].unique())

# Remove cols 
data=data.drop(['address','rest_type', 'type', 'menu_item', 'votes'],axis=1)

# Randomly sample dataframe
df_percent = data.sample(frac=0.5)

## TF-IDF Vectorization

TF-IDF (Term Frequency-Inverse Document Frequency) vectors for each document. This will give you a matrix where each column represents a word in the general vocabulary (all words that appear in at least one document) and each column represents a restaurant, as before.

TF-IDF is the statistical method of assessing the meaning of a word in a given document.

In [8]:
#Set name as the index column 
df_percent.set_index('name', inplace=True)
indices = pd.Series(df_percent.index)

#Creating tf-idf matrix
tfidf = TfidfVectorizer(analyzer='word', ngram_range=(1, 2), min_df=0, stop_words='english')
tfidf_matrix = tfidf.fit_transform(df_percent['reviews'])

#Calculate simularity value 
similarities = linear_kernel(tfidf_matrix, tfidf_matrix)

In [11]:
def recommend(name, filters, similarities = similarities):
    
    # Create a list to put top restaurants
    recommend_restaurant = []
    
    # Find the index of the hotel entered
    idx = indices[indices == name].index[0]
    
    # Find the restaurants with a similar similarity value and order them ascending
    score_series = pd.Series(similarities[idx]).sort_values(ascending=False)
    
    # Extract top 30 restaurant indexes with a similar simularity value
    top30_indexes = list(score_series.iloc[0:151].index)
    
    # Names of the top 30 restaurants
    for each in top30_indexes:
        recommend_restaurant.append(list(df_percent.index)[each])
    
    # Creating the new data set to show similar restaurants
    df_new = pd.DataFrame(columns=['cuisines', 'Mean Rating', 'cost'])
    
    # Create the top 30 similar restaurants with some of their columns and filters
    for each in recommend_restaurant:
        df_new = df_new.append(pd.DataFrame(df_percent[['cuisines','Mean Rating', 'cost']][df_percent.index == each].sample()))
        
    # Adding filters
    if filters[0] != '':
        df_new = df_new[df_new['cuisines'].str.contains(filters[0])]
    if filters[1] != '':
        df_new = df_new[df_new['Mean Rating'] >= float(filters[1])]
    if filters[2] != '':
        df_new = df_new[df_new['cost'] >= float(filters[2])]
    if filters[3] != '':
        df_new = df_new[df_new['cost'] <= float(filters[3])]
        
    # Drop the same named restaurants and sort only the top 10 by the highest rating
    df_new = df_new.drop_duplicates(subset=['cuisines','Mean Rating', 'cost'], keep=False)
    df_new = df_new.sort_values(by='Mean Rating', ascending=False).head(10)
    
    print('TOP %s RESTAURANTS LIKE %s WITH SIMILAR REVIEWS: ' % (str(len(df_new)), name))
    
    return df_new

#recommend('Pai Vihar',['Indian','3.5','','400'])

In [12]:
window = tk.Tk()

        
frame = tk.Frame(
            master=window,
            relief=tk.RAISED,
            borderwidth=1
        )

frame.pack()

label = tk.Label(master=frame, text='Find restaurants like:')
label.grid(row=0, column=0)

restaurant = tk.Entry(master=frame)
restaurant.grid(row=0, column=1)

label = tk.Label(master=frame, text='Cuisine:')
label.grid(row=1, column=0)

cuisine = tk.Entry(master=frame)
cuisine.grid(row=1, column=1)

label = tk.Label(master=frame, text='Rate:')
label.grid(row=1, column=2)

Rate= tk.Entry(master=frame)
Rate.grid(row=1, column=3)

label = tk.Label(master=frame, text='Cost from:')
label.grid(row=3, column=0)

cost_f = tk.Entry(master=frame)
cost_f.grid(row=3, column=1)

label = tk.Label(master=frame, text='Cost to:')
label.grid(row=3, column=2)

cost_to = tk.Entry(master=frame)
cost_to.grid(row=3, column=3)

        
def get_recommend():
    x = restaurant.get()
    filters = [cuisine.get(), Rate.get(), cost_f.get(), cost_to.get()]
    
    if x == '':
        df_new = df_percent[['cuisines','Mean Rating', 'cost']].sort_values(by='Mean Rating', ascending=False)
        
        if filters[0] != '':
            df_new = df_new[df_new['cuisines'].str.contains(filters[0])]
        if filters[1] != '':
            df_new = df_new[df_new['Mean Rating'] >= float(filters[1])]
        if filters[2] != '':
            df_new = df_new[df_new['cost'] >= float(filters[2])]
        if filters[3] != '':
            df_new = df_new[df_new['cost'] <= float(filters[3])]
            
        df_new = df_new.drop_duplicates(subset=['cuisines','Mean Rating', 'cost'], keep=False)
        y = df_new.head(10)
    else:
        y = recommend(x,filters)
    
    tk.Label(master=frame, text='You should try...', font='Helvetica 14 bold').grid(row=5, column=0)
        
    n=6
    for index, row in y.iterrows():
        tk.Label(master=frame, text = index).grid(row=n+1, column=0) 
        n+=1 

find_button = tk.Button(master=frame, text='Find', command=get_recommend)
find_button.grid(row=4, column=0)

       
window.mainloop()

TOP 10 RESTAURANTS LIKE Pai Vihar WITH SIMILAR REVIEWS: 
