# Recommendation System

In [2]:
# Importing necessary Libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

In [3]:
# Loading the dataset
dataset = pd.read_csv("zomato.csv", encoding='latin-1')

In [4]:
dataset.head(1)

Unnamed: 0,Restaurant ID,Restaurant Name,Country Code,City,Address,Locality,Locality Verbose,Longitude,Latitude,Cuisines,...,Currency,Has Table booking,Has Online delivery,Is delivering now,Switch to order menu,Price range,Aggregate rating,Rating color,Rating text,Votes
0,6317637,Le Petit Souffle,162,Makati City,"Third Floor, Century City Mall, Kalayaan Avenu...","Century City Mall, Poblacion, Makati City","Century City Mall, Poblacion, Makati City, Mak...",121.027535,14.565443,"French, Japanese, Desserts",...,Botswana Pula(P),Yes,No,No,No,3,4.8,Dark Green,Excellent,314


In [6]:
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9551 entries, 0 to 9550
Data columns (total 21 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   Restaurant ID         9551 non-null   int64  
 1   Restaurant Name       9551 non-null   object 
 2   Country Code          9551 non-null   int64  
 3   City                  9551 non-null   object 
 4   Address               9551 non-null   object 
 5   Locality              9551 non-null   object 
 6   Locality Verbose      9551 non-null   object 
 7   Longitude             9551 non-null   float64
 8   Latitude              9551 non-null   float64
 9   Cuisines              9542 non-null   object 
 10  Average Cost for two  9551 non-null   int64  
 11  Currency              9551 non-null   object 
 12  Has Table booking     9551 non-null   object 
 13  Has Online delivery   9551 non-null   object 
 14  Is delivering now     9551 non-null   object 
 15  Switch to order menu 

In [8]:
# removing duplicates
dataset.duplicated().sum()

0

In [9]:
dataset.isnull().sum()

Restaurant ID           0
Restaurant Name         0
Country Code            0
City                    0
Address                 0
Locality                0
Locality Verbose        0
Longitude               0
Latitude                0
Cuisines                9
Average Cost for two    0
Currency                0
Has Table booking       0
Has Online delivery     0
Is delivering now       0
Switch to order menu    0
Price range             0
Aggregate rating        0
Rating color            0
Rating text             0
Votes                   0
dtype: int64

In [10]:
# filling the null values:
for features in dataset.columns:
    if(dataset[features].isnull().sum() > 0):
        dataset[features].fillna(dataset[features].median, inplace=True)

In [11]:
# Now, finding the null values again to see if its thier using list comprehension
[features for features in dataset.columns if dataset[features].isnull().sum() > 0]
# Return the null values which represent we have removed all the null values from dataset

[]

In [12]:
dataset = dataset.rename(columns={'Average Cost for two':'cost', 'City':'city'})

In [13]:
dataset['cost'] = dataset['cost'].astype(str) #Changing the cost to string
dataset['cost'] = dataset['cost'].apply(lambda x: x.replace(',','.')) #Using lambda function to replace ',' from cost
dataset['cost'] = dataset['cost'].astype(float)

In [14]:
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9551 entries, 0 to 9550
Data columns (total 21 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   Restaurant ID         9551 non-null   int64  
 1   Restaurant Name       9551 non-null   object 
 2   Country Code          9551 non-null   int64  
 3   city                  9551 non-null   object 
 4   Address               9551 non-null   object 
 5   Locality              9551 non-null   object 
 6   Locality Verbose      9551 non-null   object 
 7   Longitude             9551 non-null   float64
 8   Latitude              9551 non-null   float64
 9   Cuisines              9551 non-null   object 
 10  cost                  9551 non-null   float64
 11  Currency              9551 non-null   object 
 12  Has Table booking     9551 non-null   object 
 13  Has Online delivery   9551 non-null   object 
 14  Is delivering now     9551 non-null   object 
 15  Switch to order menu 

In [19]:
# Adjust the column names
dataset['Restaurant Name'] = dataset['Restaurant Name'].apply(lambda x:x.title())
dataset['Has Online delivery'].replace(('Yes','No'),(True, False),inplace=True)
dataset['Has Table booking'].replace(('Yes','No'),(True, False),inplace=True)

In [22]:
## Computing Mean Rating
restaurants = list(dataset['Restaurant Name'].unique())
dataset['Mean Rating'] = 0

In [23]:
restaurants

['Le Petit Souffle',
 'Izakaya Kikufuji',
 'Heat - Edsa Shangri-La',
 'Ooma',
 'Sambo Kojin',
 'Din Tai Fung',
 'Buffet 101',
 'Vikings',
 'Spiral - Sofitel Philippine Plaza Manila',
 'Locavore',
 'Silantro Fil-Mex',
 "Mad Mark'S Creamery & Good Eats",
 "Guevarra'S",
 'Sodam Korean Restaurant',
 'Cafe Arabelle',
 "Nonna'S Pasta & Pizzeria",
 'Balay Dako',
 'Hobing Korean Dessert Cafe',
 'Wildflour Cafe + Bakery',
 'Niu By Vikings',
 'The Food Hall By Todd English',
 'Chez Michou',
 'Cafí© Daniel Briand',
 'Casa Do Biscoito Mineiro',
 'Maori',
 'Pizza Íæ Bessa',
 'Sushi Loko',
 'Beirute',
 'New Koto',
 'Sandubas Cafí©',
 'Villa Tevere',
 'Rovereto',
 'Buena Carne',
 'Taco Pep',
 'Coco Bambu',
 'Taypíç',
 'Outback Steakhouse',
 'Manzuíç',
 'Gero',
 'Brazilian American Burgers',
 'Pesqueiro Eco Gourmet',
 'Confeitaria Colombo',
 'Bibi',
 'Cervantes',
 'Amir',
 'Tt Burger',
 'Braseiro Da Gíçvea',
 'Balada Mix',
 'Garota De Ipanema',
 'Zazíç Bistríç Tropical',
 'Filí© De Ouro',
 'D.O.C Rist

In [27]:
for i in range(len(restaurants)):
    dataset['Mean Rating'][dataset['Restaurant Name'] == restaurants[i]] = dataset['Aggregate rating'][dataset['Restaurant Name'] == restaurants[i]].mean()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dataset['Mean Rating'][dataset['Restaurant Name'] == restaurants[i]] = dataset['Aggregate rating'][dataset['Restaurant Name'] == restaurants[i]].mean()


In [28]:
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler(feature_range = (1,5))
dataset[['Mean Rating']] = scaler.fit_transform(dataset[['Mean Rating']]).round(2)

In [34]:
dataset["Rating text"] = dataset["Rating text"].str.lower()

In [35]:
dataset['Rating text']

0       excellent
1       excellent
2       very good
3       excellent
4       excellent
          ...    
9546    very good
9547    very good
9548         good
9549    very good
9550    very good
Name: Rating text, Length: 9551, dtype: object

In [36]:
## Removal of Puctuations
import string
PUNCT_TO_REMOVE = string.punctuation

def remove_punctuation(text):
    return text.translate(str.maketrans('', '', PUNCT_TO_REMOVE))

In [37]:
dataset["Rating text"] = dataset["Rating text"].apply(lambda text: remove_punctuation(text))

In [41]:
dataset[['Rating text', 'Cuisines']].sample(5)

Unnamed: 0,Rating text,Cuisines
7325,average,Chinese
282,good,Mexican
497,excellent,"Latin American, Mexican, Southwestern"
7182,good,Cafe
7197,very good,"Cafe, Continental, Italian, Mexican"


In [42]:
restaurant_names = list(dataset['Restaurant Name'].unique())

In [43]:
def get_top_words(column, top_nu_of_words, nu_of_word):
    vec = CountVectorizer(ngram_range= nu_of_word, stop_words='english')
    bag_of_words = vec.fit_transform(column)
    sum_words = bag_of_words.sum(axis=0)
    words_freq = [(word, sum_words[0, idx]) for word, idx in vec.vocabulary_.items()]
    words_freq =sorted(words_freq, key = lambda x: x[1], reverse=True)
    return words_freq[:top_nu_of_words]

In [45]:
import pandas

# Randomly sample 60% of your dataframe
df_percent = dataset.sample(frac=0.5)

In [47]:
df_percent.set_index('Restaurant Name', inplace=True)
indices = pd.Series(df_percent.index)

In [50]:

from sklearn.metrics.pairwise import linear_kernel
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
# Creating tf-idf matrix
tfidf = TfidfVectorizer(analyzer='word', ngram_range=(1, 2), min_df=0, stop_words='english')
tfidf_matrix = tfidf.fit_transform(df_percent['Rating text'])

cosine_similarities = linear_kernel(tfidf_matrix, tfidf_matrix)

In [57]:
def recommend(name, cosine_similarities = cosine_similarities):
    
    # Create a list to put top restaurants
    recommend_restaurant = []
    
    # Find the index of the hotel entered
    idx = indices[indices == name].index[0]
    
    # Find the restaurants with a similar cosine-sim value and order them from bigges number
    score_series = pd.Series(cosine_similarities[idx]).sort_values(ascending=False)
    
    # Extract top 30 restaurant indexes with a similar cosine-sim value
    top30_indexes = list(score_series.iloc[0:31].index)
    
    # Names of the top 30 restaurants
    for each in top30_indexes:
        recommend_restaurant.append(list(df_percent.index)[each])
    
    # Creating the new data set to show similar restaurants
    df_new = pd.DataFrame(columns=['Cuisines', 'Mean Rating', 'Cost'])
    
    # Create the top 30 similar restaurants with some of their columns
    for each in recommend_restaurant:
        df_new = df_new.append(pd.DataFrame(df_percent[['Cuisines','Mean Rating', 'cost']][df_percent.index == each].sample()))
    
    # Drop the same named restaurants and sort only the top 10 by the highest rating
    df_new = df_new.drop_duplicates(subset=['Cuisines','Mean Rating', 'cost'], keep=False)
    df_new = df_new.sort_values(by='Mean Rating', ascending=False).head(10)
    
    print('TOP %s RESTAURANTS LIKE %s WITH SIMILAR REVIEWS: ' % (str(len(df_new)), name))
    
    return df_new


recommend('Izakaya Kikufuji')

TOP 10 RESTAURANTS LIKE Izakaya Kikufuji WITH SIMILAR REVIEWS: 


  df_new = df_new.append(pd.DataFrame(df_percent[['Cuisines','Mean Rating', 'cost']][df_percent.index == each].sample()))
  df_new = df_new.append(pd.DataFrame(df_percent[['Cuisines','Mean Rating', 'cost']][df_percent.index == each].sample()))
  df_new = df_new.append(pd.DataFrame(df_percent[['Cuisines','Mean Rating', 'cost']][df_percent.index == each].sample()))
  df_new = df_new.append(pd.DataFrame(df_percent[['Cuisines','Mean Rating', 'cost']][df_percent.index == each].sample()))
  df_new = df_new.append(pd.DataFrame(df_percent[['Cuisines','Mean Rating', 'cost']][df_percent.index == each].sample()))
  df_new = df_new.append(pd.DataFrame(df_percent[['Cuisines','Mean Rating', 'cost']][df_percent.index == each].sample()))
  df_new = df_new.append(pd.DataFrame(df_percent[['Cuisines','Mean Rating', 'cost']][df_percent.index == each].sample()))
  df_new = df_new.append(pd.DataFrame(df_percent[['Cuisines','Mean Rating', 'cost']][df_percent.index == each].sample()))
  df_new = df_new.append

Unnamed: 0,Cuisines,Mean Rating,Cost,cost
Draft Gastro Pub,Bar Food,5.0,,130.0
Carnival By Tresind,Indian,5.0,,500.0
Talaga Sampireun,"Sunda, Indonesian",5.0,,200000.0
Ab'S Absolute Barbecues,"Continental, Indian",4.96,,160.0
Kopper Kadai,North Indian,4.92,,1400.0
Spice Kraft,"Continental, Middle Eastern, Asian",4.92,,1200.0
Yauatcha,"Chinese, Dim Sum",4.84,,90.0
The Fatty Bao - Asian Gastro Bar,Asian,4.84,,2400.0
Karakí_Y Gí_Llí_Oûôlu,"Desserts, Bí_rek",4.84,,40.0
Earl Of Sandwich,"American, Sandwich, Salad",4.84,,35.0
