# Zomato Dataset Exploratory Data Analysis

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Load the Zomato data set, decoding the file as latin-1.
# The path uses a forward slash: '\z' is not a valid string escape (recent
# Python versions warn about it) and a backslash separator only works on Windows.
df = pd.read_csv('Zomatodataset/zomato.csv', encoding='latin-1')

In [None]:
# Preview the first 10 rows of the raw data.
df.head(10)

In [None]:
# List the column names of df.
df.columns # or [feature for feature in df.columns]

In [None]:
# Concise summary of df (column dtypes and non-null counts).
df.info()

In [None]:
df.describe()   # summary statistics; by default this covers the numeric (int and float) columns, not only integers

# DATA ANALYSIS STEPS
1. Missing Values - feature engineering
2. Explore the numerical variables
3. Explore categorical variables
4. Finding Relations between features

## MISSING VALUES

In [None]:
# (number of rows, number of columns) of df.
df.shape

In [None]:
df.isnull().sum() # number of null values in each column

In [None]:
# List the names of the columns that contain at least one null value.
df.columns[df.isnull().any()].tolist()

In [None]:
# Heatmap of the boolean null mask of df, to see where values are missing.
sns.heatmap(df.isnull(), yticklabels = False, cbar = True, cmap = 'viridis')

In [None]:
# Same null-mask heatmap, drawn on an explicitly sized (12 x 6) figure.
plt.figure(figsize = (12,6)) # the author noted a problem with (8,8)
sns.heatmap(df.isnull(), yticklabels = False, cbar = True, cmap = 'viridis') # tip: press Shift+Tab in Jupyter for documentation

In [None]:
# Load the country-code lookup table and preview it.
# The path uses a forward slash: '\C' is not a valid string escape (recent
# Python versions warn about it) and a backslash separator only works on Windows.
df_country = pd.read_excel('Zomatodataset/Country-Code.xlsx')
df_country.head()

In [None]:
# List the column names of df.
df.columns

In [None]:
# Merge df_country into df on 'Country Code' using a left join, so the
# columns of df_country are attached to the rows of df.
final_df = pd.merge(df, df_country, on = 'Country Code', how = 'left')

In [None]:
# List the column names of the merged frame.
final_df.columns

In [None]:
# Show the first row of the merged frame.
final_df.head(1)

In [None]:
# Data type of each column of the merged frame.
final_df.dtypes

In [None]:
# Concise summary of the merged frame (column dtypes and non-null counts).
final_df.info()

In [None]:
# List the column names of the merged frame.
final_df.columns

In [None]:
# Number of rows for each value of the 'Country' column.
final_df.Country.value_counts()

In [None]:
# Number of null values in each column of the merged frame.
final_df.isnull().sum()

In [None]:
# (number of rows, number of columns) of the merged frame.
final_df.shape

In [None]:
# Number of rows for each value of the 'Country' column.
final_df.Country.value_counts()

In [None]:
# The country names, i.e. the index of the value_counts() result.
final_df.Country.value_counts().index

In [None]:
# Country names, taken from the index of the per-country row counts.
country_names = final_df.Country.value_counts().index

In [None]:
# Per-country row counts as an array, from the same value_counts() call on 'Country'.
country_values = final_df.Country.value_counts().values

In [None]:
# Pie chart of the first five entries of country_values, labelled with the
# first five country_names; autopct prints each share with two decimals.
plt.figure(figsize=(25,15))
plt.pie(country_values[:5], labels=country_names[:5], autopct='%1.2f%%')

Observation: Most of the Zomato records (transactions) are from
1. India
2. USA
3. UK

In [None]:
# List the column names of the merged frame.
final_df.columns

In [None]:
# Count the rows for each (Aggregate rating, Rating color, Rating text)
# combination and store the count in a column named 'Rating Count'.
ratings = (
    final_df
    .groupby(['Aggregate rating', 'Rating color', 'Rating text'])
    .size()
    .reset_index(name='Rating Count')
)

In [None]:
# Preview the first rows of the ratings summary table.
ratings.head()

## Observation
1. When the rating is from 4.5 to 4.9 -> Excellent
2. When the rating is from 4.0 to 4.4 -> Very good
3. When the rating is from 3.5 to 3.9 -> Good
4. When the rating is from 3.0 to 3.4 -> Average
5. When the rating is from 2.5 to 2.9 -> Average
6. When the rating is from 2.0 to 2.4 -> Poor

In [None]:
# Bar chart of 'Rating Count' for each 'Aggregate rating' value in the ratings table.
plt.figure(figsize=(15,5))
sns.barplot(x="Aggregate rating", y="Rating Count", data=ratings)

In [None]:
# Same bar chart, with the bars grouped and coloured by 'Rating color'.
# NOTE(review): the palette is a positional list of six colours - confirm that
# each entry lands on the intended 'Rating color' level.
plt.figure(figsize=(15,5))
sns.barplot(x="Aggregate rating", y="Rating Count", hue="Rating color", data=ratings, palette=['Blue','Red','Orange','Yellow','Green','Green'])

## Observations :
1. The "Not rated" count is the highest.
2. Most of the ratings lie between 2.7 and 4.0.

In [None]:
# Count plot of 'Rating color' over the `ratings` table.
# NOTE: `ratings` holds one row per (Aggregate rating, Rating color, Rating text)
# combination, so each bar counts those combinations, not the rows of final_df.
plt.figure(figsize=(10,5))
sns.countplot(x="Rating color", data=ratings, palette =['Blue','Red','Orange','Yellow','Green','Green'])

### Find the countries that gave a 0 rating
### We can filter on either the rating color or `Aggregate rating == 0.0` to check

In [None]:
# Count, per country, the rows whose 'Rating color' is 'White'.
(
    final_df
    .loc[final_df['Rating color'] == 'White']
    .groupby('Country')
    .size()
    .reset_index()
)

In [None]:
# Count, per country, the rows whose 'Aggregate rating' equals 0.0
# (the numeric counterpart of filtering on the rating colour).
(
    final_df
    .loc[final_df['Aggregate rating'].eq(0.0)]
    .groupby('Country')
    .size()
    .reset_index()
)

Observation: The maximum number of 0 ratings is from Indian customers.

### Find out which currency is used by which country ?

In [None]:
# List the column names of the merged frame.
final_df.columns

In [None]:
# Row counts for each (Country, Currency) pair, which shows the currency
# that appears alongside each country.
final_df.groupby(['Country','Currency']).size().reset_index()

### Which countries have an online delivery option?

In [None]:
# List the column names of the merged frame.
final_df.columns

In [None]:
# Per-country row counts, restricted to rows where 'Has Online delivery' is "Yes".
final_df.loc[final_df['Has Online delivery'] == "Yes", 'Country'].value_counts()

In [None]:
# Row counts for every (Has Online delivery, Country) pair.
final_df.groupby(['Has Online delivery', 'Country']).size().reset_index()

### Observation: Online delivery is available in India and the UAE

### Create a pie chart for top 5 cities distribution.

In [None]:
# Count the rows per city once and reuse the result, so the values and the
# labels are taken from the same Series instead of two separate computations.
city_counts = final_df.City.value_counts()
city_values = city_counts.values
city_labels = city_counts.index

In [None]:
# Pie chart of the first five entries of city_values, labelled with the
# first five city_labels; autopct prints each share with two decimals.
plt.figure(figsize=(15,15))
plt.pie(city_values[:5], labels=city_labels[:5], autopct ="%1.2f%%")

### Find the top 10 cuisines