In [1]:
#Importing Libraries
import numpy as np #NumPy is a general-purpose array-processing package.
import pandas as pd #It contains high-level data structures and manipulation tools designed to make data analysis fast and easy.
import matplotlib.pyplot as plt #It is a Plotting Library
import seaborn as sns #Seaborn is a Python data visualization library based on matplotlib.
from sklearn.linear_model import LogisticRegression #Logistic Regression is a Machine Learning classification algorithm
from sklearn.linear_model import LinearRegression #Linear Regression is a Machine Learning regression algorithm
from sklearn.model_selection import train_test_split #Splitting of Dataset
from sklearn.metrics import classification_report 
from sklearn.metrics import confusion_matrix
from sklearn.metrics import r2_score

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [2]:
# Load the raw Zomato Bangalore listing and preview its first rows
df = pd.read_csv(filepath_or_buffer='../input/zomato-bangalore-restaurants/zomato.csv')
df.head(5)

In [3]:
# (rows, columns) of the raw frame as loaded from the CSV
df.shape

# **EDA and Cleaning**

In [4]:
# Column dtypes and non-null counts of the raw frame
df.info()

In [5]:
# Number of missing values in each column of the raw frame
df.isna().sum()

In [6]:
# Keep only the columns used in the analysis; `df` itself is left untouched
zomato = df.drop(
    columns=['url', 'address', 'phone', 'dish_liked', 'menu_item', 'reviews_list']
)

In [7]:
#Removing the Duplicates
# Report the count explicitly: a bare expression that is not the last line of a
# cell is evaluated and silently discarded, so the number was never shown.
print(f"Duplicate rows removed: {zomato.duplicated().sum()}")
# Reassign instead of inplace=True so the cell reads as input -> output
zomato = zomato.drop_duplicates()

In [8]:
#Remove the NaN values from the dataset
# Show the per-column null counts before dropping; as a bare mid-cell expression
# the result was computed and then thrown away.
print(zomato.isnull().sum())
# Drop every row that has a missing value in any remaining column
zomato = zomato.dropna(how='any')
zomato.info()

In [9]:
# Give the verbose source columns short, attribute-friendly names
column_renames = {
    'approx_cost(for two people)': 'cost',
    'listed_in(type)': 'type',
    'listed_in(city)': 'city',
    'rate': 'rating',
}
zomato = zomato.rename(columns=column_renames)
zomato.columns

In [10]:
#Some Transformations
# The cost column is text with a comma as thousands separator (e.g. "1,200").
# The comma must be REMOVED: replacing it with '.' parsed "1,200" as 1.2, which
# made the most expensive restaurants look like the cheapest ones.
# NOTE(review): any frame scored by models trained on this column must express
# cost in the same units — confirm for the prediction CSV loaded later.
zomato['cost'] = (
    zomato['cost']
    .astype(str)
    .str.replace(',', '', regex=False)
    .astype(float)
)
zomato.info()

In [11]:
#Removing '/5' from Rates
# 'NEW' and '-' are non-numeric placeholder values that cannot be cast to float
zomato = zomato.loc[~zomato['rating'].isin(['NEW', '-'])].reset_index(drop=True)


def remove_slash(x):
    """Strip the '/5' suffix from a rating string; return non-strings unchanged."""
    # isinstance(x, str) replaces the `np.str` alias, which NumPy removed in 1.24
    # (accessing it raises AttributeError on current NumPy).
    return x.replace('/5', '') if isinstance(x, str) else x


zomato['rating'] = zomato['rating'].apply(remove_slash).str.strip().astype('float')
zomato['rating'].head()

In [12]:
# Normalise restaurant names and turn the Yes/No flags into booleans
zomato['name'] = zomato['name'].str.title()

# Assign the converted column back explicitly. The previous
# `zomato.online_order.replace(..., inplace=True)` modified a Series obtained via
# attribute access, which does not update the frame under pandas Copy-on-Write.
yes_no = {'Yes': True, 'No': False}
for flag_column in ['online_order', 'book_table']:
    # .get(value, value) leaves any other value (including already-converted
    # booleans) untouched, so re-running the cell is harmless.
    zomato[flag_column] = zomato[flag_column].map(lambda value: yes_no.get(value, value))
zomato.head()

# **Visualization**

**Top 10 most popular restaurant types**

In [13]:
from collections import Counter
# Ten most frequent restaurant types, drawn as horizontal bars
r_type = zomato['rest_type'].value_counts().iloc[:10]
sns.barplot(y=r_type.index, x=r_type)
plt.title("Most In-Demand Restaurant Types")
plt.xlabel("count")

**Proportion of Restaurants that provide online service**

In [14]:
# Donut chart: a pie of the online-order flag with a white disc over its centre
donut_hole = plt.Circle((0, 0), 0.7, color='white')
zomato.online_order.value_counts().plot.pie()
plt.gcf().gca().add_artist(donut_hole)

**Proportion of restaurants that allow booking tables**

In [15]:
# Donut chart: a pie of the table-booking flag with a white disc over its centre
donut_hole = plt.Circle((0, 0), 0.7, color='white')
zomato.book_table.value_counts().plot.pie()
plt.gcf().gca().add_artist(donut_hole)

**Most popular restaurant chains**

In [16]:
# Count rows per restaurant name on the cleaned frame (`zomato`), like every
# other chart in this section. The raw `df` still contains the duplicate and
# incomplete rows removed above and names that were not yet title-cased.
zomato['name'].value_counts()[:10].plot(kind='bar')
plt.xlabel("Restaurant Name")
plt.ylabel("No. of branches")
plt.title("Top 10 Chain Restaurants")

**Impact of Online Ordering on Ratings**

It can be seen from the chart below that restaurants that allow ordering online tend to have higher ratings

In [17]:
sns.set_style('white')
# Rating distribution split by the ONLINE-ORDER flag. This cell previously
# cross-tabulated `book_table`, duplicating the table-booking chart that follows.
y=pd.crosstab(zomato.rating,zomato.online_order)
y.plot(kind='bar',stacked=True)

**Impact of Table Booking on Ratings**

It can be seen from the chart below that restaurants that allow table bookings tend to have higher ratings.

In [18]:
# Stacked bars: number of restaurants per rating, split by the table-booking flag
sns.set_style('white')
y = pd.crosstab(index=zomato.rating, columns=zomato.book_table)
y.plot.bar(stacked=True)

**Top 10 restaurant locations**

In [19]:
# Ten locations with the most listed outlets, drawn as horizontal bars
sns.set_style('darkgrid')
ch = zomato['location'].value_counts().iloc[:10]
sns.barplot(y=ch.index, x=ch, palette='viridis')
plt.ylabel('Location')
plt.xlabel('Number of Outlets')
plt.show()

# **Predictive Models**

We now proceed to build a model that can predict ratings for new restaurants on the market to enable informed decision making by the stakeholders. The first step for this is to encode the columns containing string types into some sort of code.

In [66]:
def Encode(zomato, exclude=('rating', 'cost', 'votes')):
    """Return a copy of `zomato` with every non-excluded column integer-encoded.

    Each encoded column is replaced by the codes from `Series.factorize()`:
    values are numbered 0, 1, 2, ... in order of first appearance and missing
    values become -1.

    Because the numbering depends on the order of the rows passed in, codes
    produced by two separate calls on different frames are NOT comparable.

    Parameters
    ----------
    zomato : pandas.DataFrame
        Frame to encode. It is not modified.
    exclude : list-like of str, default ('rating', 'cost', 'votes')
        Columns left as they are.

    Returns
    -------
    pandas.DataFrame
        New frame with the same columns and index as the input.
    """
    # Work on a copy so the caller's frame is never mutated as a side effect
    encoded = zomato.copy()
    to_encode = encoded.columns[~encoded.columns.isin(list(exclude))]
    for column in to_encode:
        encoded[column] = encoded[column].factorize()[0]
    return encoded

# Encode a copy so the original `zomato` frame keeps its readable values
encodedZomato = Encode(zomato.copy())

Next we try to plot a heatmap to get the correlation between the various available variables

In [101]:
# Kendall rank correlation between all encoded columns, shown as an annotated heatmap
plt.figure(figsize=(15, 8))
corr = encodedZomato.corr(method='kendall')
sns.heatmap(data=corr, annot=True)

## **Splitting the Dataset**

Next comes the important step of deciding the train-test split for the models. After careful consideration, a train-test split of 4:1 is chosen. The dependent and independent variables are also declared here. The target variable is rating, which is what we aim to predict for future restaurants.

In [68]:
# Predictors (x) and target (y) shared by every model below
x = encodedZomato.loc[:, [
    "online_order",
    "book_table",
    "votes",
    "location",
    "rest_type",
    "cuisines",
    "cost",
    "type",
]]
y = encodedZomato.loc[:, 'rating']
# Hold out 20% of the rows for testing; the seed keeps the split repeatable
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.2, random_state=353
)
x_train.shape, y_train.shape

## **Extra Tree Regressor**

Extra Trees Regressor is an ensemble learning method fundamentally based on decision trees. Extra Trees Regressor, like RandomForest, randomizes certain decisions and subsets of data to minimize over-learning from the data and overfitting

For more details, visit https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.ExtraTreesRegressor

In [69]:
#Extra Tree Regression
from sklearn.ensemble import  ExtraTreesRegressor
# Extra-Trees draws its split thresholds at random; without a fixed seed the
# R² below (and this model's row in the comparison table) changes on every run.
ETree=ExtraTreesRegressor(n_estimators = 200, random_state = 50)
ETree.fit(x_train,y_train)
y_predict=ETree.predict(x_test)
# R² on the held-out test set
r2_score(y_test,y_predict)

## **Decision Tree Regressor**

Decision Trees (DTs) are a non-parametric supervised learning method used for classification and regression. The goal is to create a model that predicts the value of a target variable by learning simple decision rules inferred from the data features. A tree can be seen as a piecewise constant approximation.

For more details, visit https://scikit-learn.org/stable/modules/tree.html

also, https://scikit-learn.org/stable/modules/generated/sklearn.tree.DecisionTreeRegressor

In [70]:
#DTree Regression
from sklearn.tree import DecisionTreeRegressor
# A fractional min_samples_leaf is interpreted as a share of the training rows.
# random_state pins the feature permutation used to break ties between equally
# good splits, so the score is reproducible across runs.
DTree=DecisionTreeRegressor(min_samples_leaf=.00001, random_state=50)
DTree.fit(x_train,y_train)
y_predict=DTree.predict(x_test)
# R² on the held-out test set
r2_score(y_test,y_predict)

## **Linear Regression Model**

In statistics, linear regression is a linear approach for modelling the relationship between a scalar response and one or more explanatory variables. The case of one explanatory variable is called simple linear regression.

Visit: https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LinearRegression

In [71]:
# Ordinary least-squares baseline; fit() returns the fitted estimator itself
reg = LinearRegression().fit(x_train, y_train)
y_pred = reg.predict(x_test)
# R² on the held-out test set
r2_score(y_true=y_test, y_pred=y_pred)

## **Gradient Boosting Regressor**

GB builds an additive model in a forward stage-wise fashion; it allows for the optimization of arbitrary differentiable loss functions. In each stage a regression tree is fit on the negative gradient of the given loss function.

For more details, visit https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.GradientBoostingRegressor

In [72]:
from sklearn.ensemble import GradientBoostingRegressor
# random_state seeds the per-stage trees and the feature permutation at each
# split, making the reported score reproducible across runs.
gbr = GradientBoostingRegressor(random_state=50)
gbr.fit(x_train, y_train)
y_predict=gbr.predict(x_test)
# R² on the held-out test set
r2_score(y_test,y_predict)

## **XGBoost Regressor**

XGBoost is a decision-tree-based ensemble Machine Learning algorithm that uses a gradient boosting framework. In prediction problems involving unstructured data (images, text, etc.) artificial neural networks tend to outperform all other algorithms or frameworks. However, when it comes to small-to-medium structured/tabular data, decision tree based algorithms are considered best-in-class right now.
Visit https://xgboost.readthedocs.io/en/latest/python/python_api.html?highlight=xgbregressor#xgboost.XGBRegressor to learn more

In [73]:
from xgboost import XGBRegressor

# Tree booster: 200 boosting rounds, trees up to depth 15, learning rate 0.1
xgb = XGBRegressor(
    booster='gbtree',
    learning_rate=0.1,
    max_depth=15,
    n_estimators=200,
)
xgb.fit(x_train, y_train)
y_predict = xgb.predict(x_test)

# R² on the held-out test set
r2_score(y_true=y_test, y_pred=y_predict)

## **Random Forest Regressor**

A random forest regressor is a meta estimator that fits a number of decision tree regressors on various sub-samples of the dataset and uses averaging to improve the predictive accuracy and control over-fitting.

Visit https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestRegressor to learn more

In [74]:
from sklearn.ensemble import RandomForestRegressor
# 200 seeded trees; the fractional min_samples_leaf is a share of the training rows
rf = RandomForestRegressor(
    n_estimators=200,
    random_state=50,
    min_samples_leaf=1e-5,
)
rf.fit(x_train, y_train)
y_pred = rf.predict(x_test)
# R² on the held-out test set
r2_score(y_true=y_test, y_pred=y_pred)

Now let us compare and contrast the performance of each of our models by arranging their r-squared scores in a tabulated format

In [75]:
# Pair each display name with its fitted estimator, then score all of them on
# the held-out test set (score() returns R² for regressors).
fitted_models = [
    ('Linear Regression', reg),
    ('Decision Tree', DTree),
    ('Random Forest', rf),
    ('Extra Tree Regressor', ETree),
    ('Gradient Boost', gbr),
    ('XgBoost', xgb),
]
models = pd.DataFrame({
    'Model': [label for label, _ in fitted_models],
    'Score': [estimator.score(x_test, y_test) for _, estimator in fitted_models],
})

# Best-scoring model first
models.sort_values(by='Score', ascending=False)

# **Inferences**

It is clearly visible that the XGBoost regressor works best with our dataset, which means that we will use this model to predict the ratings for our future dataset.

Now we need to use our model, the XGBoost regressor, to predict the ratings for new restaurants

# **Predicting Ratings for New Restaurants**

In [76]:
# Load the restaurants to be scored and make sure cost is numeric (float)
df_new = pd.read_csv(filepath_or_buffer='../input/zomato-prediction/Zomato_predictions.csv')
df_new = df_new.astype({'cost': float})
df_new

In [77]:
# Encode the new restaurants with the SAME integer codes the models were trained on.
# factorize() numbers values by order of first appearance, so running it again on
# df_new would give codes unrelated to the training ones (e.g. code 0 would be
# whatever location happens to come first in this file).
yes_no = {'Yes': True, 'No': False}
zomatoNew = df_new.copy()
for column in zomatoNew.columns[~zomatoNew.columns.isin(['rating', 'cost', 'votes'])]:
    values = zomatoNew[column]
    if column not in zomato.columns:
        # No training counterpart to align with: fall back to per-frame codes
        zomatoNew[column] = values.factorize()[0]
        continue
    if column in ('online_order', 'book_table'):
        # The training frame stores these flags as booleans.
        # NOTE(review): assumes the new file uses 'Yes'/'No' or booleans — confirm.
        values = values.map(lambda value: yes_no.get(value, value))
    # Unique training values, in the order that defined their integer codes
    training_categories = pd.Index(zomato[column].factorize()[1])
    # Position of each new value in that list; values unseen in training get -1
    zomatoNew[column] = training_categories.get_indexer(values)
zomatoNew

In [96]:

# Select the predictors in the same column order used for training
x_pred = zomatoNew.loc[:, [
    "online_order",
    "book_table",
    "votes",
    "location",
    "rest_type",
    "cuisines",
    "cost",
    "type",
]]
y_predict = xgb.predict(x_pred)
# Ratings are reported to one decimal place
y_pred_list = list(map(lambda elem: round(elem, 1), y_predict))
y_pred_list

In [98]:
# Attach the predicted ratings to the readable (un-encoded) frame
df_new = df_new.assign(rating=y_pred_list)
df_new

In [100]:
# Write the predictions to the output directory, without the index column
df_new.to_csv(path_or_buf="../working/zomato_predicted_ratings.csv", index=False)

# **Conclusions**

We have successfully trained and built multiple models on the dataset. We also found that the XGBoost regressor works incredibly well for this data. This analysis shows that once the model has been trained and tested on the data, we can actually predict the ratings for new restaurants as well with the independent variables being available to us.