In [None]:
# Install the packages listed in requirements.txt via the %pip magic.
%pip install -r requirements.txt

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import matplotlib
%matplotlib inline

In [None]:
# Load the raw Zomato export. The file is decoded as latin-1
# (presumably because it is not valid UTF-8 — TODO confirm).
df = pd.read_csv(
    'zomato.csv',
    encoding='latin-1',
)
# Preview the first five rows (head() defaults to n=5).
df.head()

In [None]:
# List the column labels of the data loaded from zomato.csv.
df.columns

In [None]:
# Print the per-column dtype and non-null count summary.
df.info()

In [None]:
# Descriptive statistics; with default arguments only numeric columns are summarised.
df.describe()

## Things to do in Data Analysis
1. Check for missing values
2. Explore the numerical variables
3. Explore the categorical variables
4. Find relationships among features

In [None]:
# Number of missing values in each column.
df.isna().sum()

In [None]:
# Names of the columns that contain at least one missing value.
print([column for column, has_missing in df.isnull().any().items() if has_missing])

In [None]:
# Visualise the boolean missing-value mask: one cell per (row, column),
# with the row tick labels and the colour bar switched off.
sns.heatmap(
    data=df.isnull(),
    cmap='viridis',
    cbar=False,
    yticklabels=False,
)

In [None]:
# Load the country lookup sheet; it is joined to the main table on
# 'Country Code' in the merge cell.
df_countries = pd.read_excel(
    'Country-Code.xlsx',
)
df_countries.head()

In [None]:
# Left-join the country lookup onto the raw data so every row of `df`
# is kept, matched on the shared 'Country Code' column.
final_df = df.merge(
    df_countries,
    how='left',
    on='Country Code',
)
final_df.head()

In [None]:
# List the column labels of the merged table.
final_df.columns

In [None]:
# Number of rows per country, most frequent first.
final_df['Country'].value_counts()

In [None]:
# Country labels ordered by descending row count (used as pie labels).
country_names = final_df['Country'].value_counts().index
country_names

In [None]:
# Row counts per country, in the same descending order as the value_counts() index.
country_values = final_df['Country'].value_counts().to_numpy()
country_values

In [None]:
# Pie chart of the row share of every country in the merged table.
plt.pie(
    x=country_values,
    labels=country_names,
    startangle=140,
    autopct='%1.1f%%',
)

In [None]:
# Pie chart of the three countries with the most rows.
# Only the three slices are passed in, so the printed percentages are
# shares of the top-3 total, not of the whole dataset.
plt.pie(
    x=country_values[:3],
    labels=country_names[:3],
    startangle=140,
    autopct='%1.2f%%',
)

### Observation
Most of Zomato's records in this dataset come from India, followed by the US and then the UK.

In [None]:
# Count the rows for every (rating value, colour, text) combination and
# expose the group sizes as a 'Rating Count' column.
ratings = (
    final_df
    .groupby(['Aggregate rating', 'Rating color', 'Rating text'])
    .size()
    .reset_index(name='Rating Count')
)
ratings

### Observation
1. 0 rating -> White | Not rated
2. 1.8 - 2.4 rating -> Red | Poor
3. 2.5 - 3.4 rating -> Orange | Average
4. 3.5 - 3.9 rating -> Yellow | Good
5. 4.0 - 4.4 rating -> Green | Very Good
6. 4.5 - 4.9 rating -> Dark Green | Excellent


In [None]:
# Use a wide default figure so every distinct rating value fits on the x-axis.
matplotlib.rcParams['figure.figsize'] = (12, 6)

# Map each 'Rating color' label to its matplotlib colour by name. A positional
# palette list only lines up with the hue levels by order of appearance in
# `ratings`, which silently breaks if that order ever changes.
rating_palette = {
    'White': 'white',
    'Red': 'red',
    'Orange': 'orange',
    'Yellow': 'yellow',
    'Green': 'green',
    'Dark Green': 'darkgreen',
}

# One bar per rating value, coloured by its rating colour.
# dodge=False keeps each bar centred on its tick: every rating value has a
# single colour, so shifting bars per hue level only makes them thin.
# edgecolor outlines the bars so the white one stays visible on a white background.
sns.barplot(
    x='Aggregate rating',
    y='Rating Count',
    data=ratings,
    hue='Rating color',
    palette=rating_palette,
    dodge=False,
    edgecolor='black',
)

### Observation
1. The count of "Not rated" entries is very high.
2. Most ratings fall between 2.5 and 3.4.

In [None]:
# Number of records per rating colour.
# The count is taken over `final_df` (one row per record). Counting the rows of
# the aggregated `ratings` table would only show how many distinct rating
# values share a colour, not how many records have it.
rating_color_palette = {
    'White': 'white',
    'Red': 'red',
    'Orange': 'orange',
    'Yellow': 'yellow',
    'Green': 'green',
    'Dark Green': 'darkgreen',
}
rating_color_order = list(rating_color_palette)

# `palette` is paired with `hue` because passing a palette without a hue
# variable is deprecated in recent seaborn releases. dodge=False keeps the
# bars centred since hue repeats the x variable. edgecolor outlines the white bar.
sns.countplot(
    x='Rating color',
    data=final_df,
    order=rating_color_order,
    hue='Rating color',
    hue_order=rating_color_order,
    palette=rating_color_palette,
    dodge=False,
    edgecolor='black',
)

In [None]:
# Countries with zero-rated records (rating colour 'White'), ordered from the
# highest count to the lowest.
zero_rated_countries = (
    final_df.loc[final_df['Rating color'] == 'White']
    .groupby('Country')
    .size()
    .reset_index(name='Zero Rating Count')
    .sort_values(by='Zero Rating Count', ascending=False)
)
zero_rated_countries

### Observation
1. The largest number of zero ratings comes from India

In [None]:
# Which currency appears for which country: one row per distinct
# (Country, Currency) pair together with the number of rows it covers.
currency_used = (
    final_df[['Country', 'Currency']]
    .groupby(['Country', 'Currency'])
    .size()
    .reset_index(name='Count')
)
currency_used

In [None]:
# Find out which countries have the online order option.
# Both output columns are named explicitly: the labels produced by a bare
# value_counts().reset_index() differ between pandas 1.x ('index', 'Country')
# and pandas 2.x ('Country', 'count'). The 2.x labels are used here.
online_order_countries = (
    final_df.loc[final_df['Has Online delivery'] == 'Yes', 'Country']
    .value_counts()
    .rename_axis('Country')
    .reset_index(name='count')
)
online_order_countries

In [None]:
# For every country, count the rows with and without the online order option,
# listed alphabetically by country.
online_order = (
    final_df[['Country', 'Has Online delivery']]
    .groupby(['Country', 'Has Online delivery'])
    .size()
    .reset_index(name='Count')
    .sort_values(by='Country', ascending=True)
)
online_order

### Observation
1. Online delivery is available in India and the UAE
2. In both India and the UAE, online delivery is not available in some regions

In [None]:
# Pie chart of the five cities with the most rows.
# value_counts() is computed once and reused. The bare `city_names` /
# `city_values` expressions were dropped because only the last expression
# of a cell is displayed, so they had no effect.
city_counts = final_df['City'].value_counts()
city_names = city_counts.index
city_values = city_counts.values

# Only the top five slices are passed in, so the percentages are shares of
# the top-5 total rather than of the whole dataset.
plt.pie(city_values[:5], labels=city_names[:5], autopct='%1.1f%%', startangle=140)

In [None]:
# Find the top 10 cuisines.
# Both output columns are named explicitly so the result has the same labels
# ('Cuisines', 'count') on pandas 1.x and 2.x. The variable keeps its original
# spelling in case later cells refer to it.
top_cousines = (
    final_df['Cuisines']
    .value_counts()
    .rename_axis('Cuisines')
    .reset_index(name='count')
    .head(10)
)
top_cousines
