import numpy as np
import pandas as pd
import seaborn as sb
import re
import matplotlib.pyplot as plt
import plotly.offline as py
import plotly.graph_objs as go
import seaborn as sns
import warnings
warnings.filterwarnings('always')
warnings.filterwarnings('ignore')

from nltk.corpus import stopwords
from sklearn.metrics.pairwise import linear_kernel
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

In [8]:
zomato_real=pd.read_csv('zomato.csv')
zomato_real.head() 

Unnamed: 0,url,address,name,online_order,book_table,rate,votes,phone,location,rest_type,dish_liked,cuisines,approx_cost(for two people),reviews_list,menu_item,listed_in(type),listed_in(city)
0,https://www.zomato.com/bangalore/jalsa-banasha...,"942, 21st Main Road, 2nd Stage, Banashankari, ...",Jalsa,Yes,Yes,4.1/5,775,080 42297555\r\n+91 9743772233,Banashankari,Casual Dining,"Pasta, Lunch Buffet, Masala Papad, Paneer Laja...","North Indian, Mughlai, Chinese",800,"[('Rated 4.0', 'RATED\n A beautiful place to ...",[],Buffet,Banashankari
1,https://www.zomato.com/bangalore/spice-elephan...,"2nd Floor, 80 Feet Road, Near Big Bazaar, 6th ...",Spice Elephant,Yes,No,4.1/5,787,080 41714161,Banashankari,Casual Dining,"Momos, Lunch Buffet, Chocolate Nirvana, Thai G...","Chinese, North Indian, Thai",800,"[('Rated 4.0', 'RATED\n Had been here for din...",[],Buffet,Banashankari
2,https://www.zomato.com/SanchurroBangalore?cont...,"1112, Next to KIMS Medical College, 17th Cross...",San Churro Cafe,Yes,No,3.8/5,918,+91 9663487993,Banashankari,"Cafe, Casual Dining","Churros, Cannelloni, Minestrone Soup, Hot Choc...","Cafe, Mexican, Italian",800,"[('Rated 3.0', ""RATED\n Ambience is not that ...",[],Buffet,Banashankari
3,https://www.zomato.com/bangalore/addhuri-udupi...,"1st Floor, Annakuteera, 3rd Stage, Banashankar...",Addhuri Udupi Bhojana,No,No,3.7/5,88,+91 9620009302,Banashankari,Quick Bites,Masala Dosa,"South Indian, North Indian",300,"[('Rated 4.0', ""RATED\n Great food and proper...",[],Buffet,Banashankari
4,https://www.zomato.com/bangalore/grand-village...,"10, 3rd Floor, Lakshmi Associates, Gandhi Baza...",Grand Village,No,No,3.8/5,166,+91 8026612447\r\n+91 9901210005,Basavanagudi,Casual Dining,"Panipuri, Gol Gappe","North Indian, Rajasthani",600,"[('Rated 4.0', 'RATED\n Very good restaurant ...",[],Buffet,Banashankari


In [9]:
# Dropping the column "dish_liked", "phone", "url"
zomato_real = zomato_real.drop(['rest_type', 'cuisines', 'url'], axis=1)

# Remove the NaN values from the dataset
zomato_real.dropna(how='any', inplace=True)

# Removing the Duplicates
zomato_real.drop_duplicates(inplace=True)

# Changing the column names
zomato_real = zomato_real.rename(columns={'approx_cost(for two people)': 'cost', 'listed_in(type)': 'type', 'listed_in(city)': 'city'})

# Removing '/5' from Rates
zomato_real = zomato_real.loc[zomato_real.rate != 'NEW']
zomato_real = zomato_real.loc[zomato_real.rate != '-'].reset_index(drop=True)

remove_slash = lambda x: x.replace('/5', '') if type(x) == str else x
zomato_real.rate = zomato_real.rate.apply(remove_slash).str.strip().astype('float')

# Changing the cost to string
zomato_real['cost'] = zomato_real['cost'].astype(str)
zomato_real['cost'] = zomato_real['cost'].apply(lambda x: x.replace(',', '.'))
zomato_real['cost'] = zomato_real['cost'].astype(float)


In [10]:
zomato_real .shape

(23104, 14)

In [11]:
zomato_real.isnull().sum()

address         0
name            0
online_order    0
book_table      0
rate            0
votes           0
phone           0
location        0
dish_liked      0
cost            0
reviews_list    0
menu_item       0
type            0
city            0
dtype: int64

In [12]:
## Computing Mean Rating
restaurants = list(zomato_real['name'].unique())
zomato_real['Mean Rating'] = 0
for i in range(len(restaurants)):
    zomato_real['Mean Rating'][zomato_real['name'] == restaurants[i]] = zomato_real['rate'][zomato_real['name'] == restaurants[i]].mean()   
#Scaling the mean rating values
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler(feature_range = (1,5))
zomato_real[['Mean Rating']] = scaler.fit_transform(zomato_real[['Mean Rating']]).round(2)

In [13]:
zomato_real[['name','rate','Mean Rating']].head()

Unnamed: 0,name,rate,Mean Rating
0,Jalsa,4.1,3.99
1,Spice Elephant,4.1,3.97
2,San Churro Cafe,3.8,3.58
3,Addhuri Udupi Bhojana,3.7,3.45
4,Grand Village,3.8,3.58


In [None]:
## Lower Casing
zomato_real["reviews_list"] = zomato_real["reviews_list"].str.lower()

## Removal of Puctuations

import string
PUNCT_TO_REMOVE = string.punctuation
def remove_punctuation(text):
    """custom function to remove the punctuation"""
    return text.translate(str.maketrans('', '', PUNCT_TO_REMOVE))

zomato_real["reviews_list"] = zomato_real["reviews_list"].apply(lambda text: remove_punctuation(text))

# Removal of Stopwords

from nltk.corpus import stopwords
STOPWORDS = set(stopwords.words('english'))
def remove_stopwords(text):
    """custom function to remove the stopwords"""
    return " ".join([word for word in str(text).split() if word not in STOPWORDS])

zomato_real["reviews_list"] = zomato_real["reviews_list"].apply(lambda text: remove_stopwords(text))

#Cleaning URL

def remove_urls(text):
    url_pattern = re.compile(r'https?://\S+|www\.\S+')
    return url_pattern.sub(r'', text)

zomato_real["reviews_list"] = zomato_real["reviews_list"].apply(lambda text: remove_urls(text))

In [None]:
zomato_real[['reviews_list']][:5]

In [None]:
# Most Famous restaurant chains in Bangalore
plt.figure(figsize=(8, 5))
chains = zomato_real['name'].value_counts()[:10]
sns.barplot(x=chains, y=chains.index, palette='deep')
plt.title("Most famous restaurants chains in Bangalore")
plt.xlabel("Number of outlets")

# Types of Restaurant
counts = zomato_real["reviews_list"].value_counts()[:10] 
p = counts.sort_values().plot.barh(figsize=(8, 5), fontsize=18) 
p.set_xlabel("Number of Restaurant", fontsize=18) 
p.set_ylabel("Restaurant Type", fontsize=18)
p.set_title("Types of Restaurant", fontsize=20)

# Distribution of Restaurant Rating 
fig, ax = plt.subplots(nrows=1, ncols=1, figsize=(15, 5))
sns.distplot(zomato_real['rate'], kde=False, color='g', ax=ax, bins=20)
ax.axvline(zomato_real['rate'].mean(), 0, 1, color='r', label='Mean')
ax.legend()
ax.set_ylabel('Count', size=20)
ax.set_xlabel('Rate', size=20)
ax.set_title('Distribution(count) of Restaurant rating', size=20)

# Top 10 Rated Restaurants
df_rating = zomato_real.drop_duplicates(subset='name')
df_rating = df_rating.sort_values(by='rate', ascending=False).head(10)
plt.figure(figsize=(7, 5))
sns.barplot(data=df_rating, x='rate', y='name', palette='RdBu')
plt.title('Top Rated 10 Restaurants');

In [None]:
def get_top_words(column, top_nu_of_words, nu_of_word):
    
    vec = CountVectorizer(ngram_range= nu_of_word, stop_words='english')
    
    bag_of_words = vec.fit_transform(column)
    
    sum_words = bag_of_words.sum(axis=0)
    
    words_freq = [(word, sum_words[0, idx]) for word, idx in vec.vocabulary_.items()]
    
    words_freq =sorted(words_freq, key = lambda x: x[1], reverse=True)
    
    return words_freq[:top_nu_of_words]


In [None]:
# Top 15 two-word frequencies for Cuisines
lst = get_top_words(zomato_real['reviews_list'], 15, (2, 2))
df_words = pd.DataFrame(lst, columns=['Word', 'Count'])

plt.figure(figsize=(7, 6))
sns.barplot(data=df_words, x='Count', y='Word')
plt.title('Word Couple Frequency for Cuisines');

In [None]:
df_percent = pd.DataFrame({'name': ['A', 'B', 'C'], 'value': [10, 20, 30]})
df_percent.set_index('name', inplace=True)
# Creating tf-idf matrix
tfidf = TfidfVectorizer(analyzer='word', ngram_range=(1, 2), min_df=0, stop_words='english')
tfidf_matrix = tfidf.fit_transform(df_percent['reviews_list'])

In [None]:
cosine_similarities = linear_kernel(tfidf_matrix, tfidf_matrix)

In [None]:
def recommend(name, cosine_similarities = cosine_similarities):
   
    recommend_restaurant = []
    
    # Find the index of the hotel entered
    idx = indices[indices == name].index[0]
    
    # Find the restaurants with a similar cosine-sim value and order them from bigges number
    score_series = pd.Series(cosine_similarities[idx]).sort_values(ascending=False)
    
    # Extract top 30 restaurant indexes with a similar cosine-sim value
    top30_indexes = list(score_series.iloc[0:31].index)
    
    # Names of the top 30 restaurants
    for each in top30_indexes:
        recommend_restaurant.append(list(df_percent.index)[each])
    
    # Creating the new data set to show similar restaurants
    df_new = pd.DataFrame(columns=['cuisines', 'Mean Rating', 'cost'])
    
    # Create the top 30 similar restaurants with some of their columns
    for each in recommend_restaurant:
        df_new = df_new.append(pd.DataFrame(df_percent[['cuisines','Mean Rating', 'cost']][df_percent.index == each].sample()))
    
    # Drop the same named restaurants and sort only the top 10 by the highest rating
    df_new = df_new.drop_duplicates(subset=['cuisines','Mean Rating', 'cost'], keep=False)
    df_new = df_new.sort_values(by='Mean Rating', ascending=False).head(10)
    
    print('TOP %s RESTAURANTS LIKE %s WITH SIMILAR REVIEWS: ' % (str(len(df_new)), name))
    
    return df_new

In [19]:
zomato_real.loc[(zomato_real.name == 'Marwa Restaurant')][:1]

Unnamed: 0,address,name,online_order,book_table,rate,votes,phone,location,dish_liked,cost,reviews_list,menu_item,type,city,Mean Rating
494,"21, Ground Floor, B.H.C.S.L Layout, Stage 2, B...",Marwa Restaurant,Yes,Yes,3.5,551,080 49653014,Bannerghatta Road,"Kalmi Kabab, Gulab Jamun, Chicken Grill, Shawa...",600.0,rated 50 ratedn disappointed decision shifting...,[],Delivery,Bannerghatta Road,3.19


In [20]:
zomato_real.loc[(zomato_real.name == 'Jalsa')][:1]

Unnamed: 0,address,name,online_order,book_table,rate,votes,phone,location,dish_liked,cost,reviews_list,menu_item,type,city,Mean Rating
0,"942, 21st Main Road, 2nd Stage, Banashankari, ...",Jalsa,Yes,Yes,4.1,775,080 42297555\r\n+91 9743772233,Banashankari,"Pasta, Lunch Buffet, Masala Papad, Paneer Laja...",800.0,rated 40 ratedn beautiful place dine inthe int...,[],Buffet,Banashankari,3.99
