Doing all the necessary imports and launching a new web driver with Chrome. This notebook uses a combination of Selenium and Beautiful Soup

In [1]:
import requests
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import Select
from webdriver_manager.chrome import ChromeDriverManager
import time
import pandas as pd
pd.set_option("display.max_colwidth", None)



In [2]:
driver = webdriver.Chrome(ChromeDriverManager().install())



Current google-chrome version is 98.0.4758
Get LATEST chromedriver version for 98.0.4758 google-chrome
Driver [/Users/areena.arora/.wdm/drivers/chromedriver/mac64/98.0.4758.80/chromedriver] found in cache
  driver = webdriver.Chrome(ChromeDriverManager().install())


This is to scroll through the dynamically loading page

In [3]:
def get_biryani_restaurants(city):
    link = "https://www.zomato.com/" + city + "/restaurants/biryani"
    driver.get(link)
    current_height = driver.execute_script('return document.body.scrollHeight')
    while True:
        # Scroll to the bottom, wait for a couple seconds
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight - 3000)")
        time.sleep(10)

        # If it loaded new content, the height should have changed
        prev_height = current_height
        current_height = driver.execute_script('return document.body.scrollHeight')
        if prev_height == current_height:
            print("Height didn't change, end of page, exiting")
            break
        else:
            print("New page height is", current_height, "continuing to try to scroll")
        
    all_restaurants = []
    
    html = driver.page_source
    city_soup = BeautifulSoup(html)

    restaurants_details = city_soup.find_all('h4')

    for each_restaurant_name in restaurants_details:
        each_restaurant_dict = {}
        each_restaurant_dict['name'] = each_restaurant_name.string
        try:
            rating = each_restaurant_name.find_next_sibling().div.div.div.div.div.string
        except:
            rating = "Not available"
        each_restaurant_dict['rating'] = rating
        try:
            price = each_restaurant_name.parent.find_next_sibling().find_all('p')[1].string
        except:
            price = "Range not given"
        each_restaurant_dict['price'] = price
        each_restaurant_dict['city'] = city
        all_restaurants.append(each_restaurant_dict)
    return all_restaurants

In [4]:
master_cities_list = []

cities = ['ncr','mumbai', 'hyderabad', 'lucknow', 'kolkata', 'bangalore']

for each_city in cities:
    restaurants = get_biryani_restaurants(each_city)
    master_cities_list.extend(restaurants)

New page height is 5694 continuing to try to scroll
New page height is 7440 continuing to try to scroll
New page height is 9185 continuing to try to scroll
New page height is 10931 continuing to try to scroll
New page height is 12676 continuing to try to scroll
New page height is 14422 continuing to try to scroll
New page height is 15388 continuing to try to scroll
New page height is 15460 continuing to try to scroll
Height didn't change, end of page, exiting
New page height is 5662 continuing to try to scroll
New page height is 7408 continuing to try to scroll
New page height is 9154 continuing to try to scroll
New page height is 10899 continuing to try to scroll
New page height is 12645 continuing to try to scroll
New page height is 14390 continuing to try to scroll
New page height is 15793 continuing to try to scroll
Height didn't change, end of page, exiting
New page height is 5703 continuing to try to scroll
New page height is 7449 continuing to try to scroll
New page height is 91

In [67]:
df = pd.DataFrame(master_cities_list)
df

Unnamed: 0,name,rating,price,city
0,Moti Mahal Delux,3.8,₹300 for one,ncr
1,Gulati,4.3,₹300 for one,ncr
2,Veg Gulati,4.0,₹300 for one,ncr
3,Bikkgane Biryani,3.9,₹300 for one,ncr
4,Paradise Muradabadi Chicken Biryani,4.1,₹300 for one,ncr
5,Dhaba On Wheels,3.1,₹300 for one,ncr
6,Mr Naan Mrs Curry,3.4,₹300 for one,ncr
7,Irshad Hussain Dhaba,4.0,₹300 for one,ncr
8,Zaika Muradabadi Chicken Biryani,3.5,₹300 for one,ncr
9,Best In Town,-,₹250 for one,ncr


In [68]:
df.price.value_counts()

₹100 for one       142
₹200 for one        98
₹150 for one        84
₹250 for one        81
₹300 for one        56
₹400 for one        36
₹350 for one        21
₹500 for one        12
₹50 for one          9
Range not given      6
Name: price, dtype: int64

In [69]:
df["price"] = df["price"].str.replace(" for one", "")
df["price"] = df["price"].str.replace("Range not given", "0")
df["price"] = df["price"].str.replace("₹", "")

In [70]:
df.dtypes

name      object
rating    object
price     object
city      object
dtype: object

In [71]:
df.astype({'price': 'float'}).dtypes

name       object
rating     object
price     float64
city       object
dtype: object

In [72]:
df.price.value_counts()

100    142
200     98
150     84
250     81
300     56
400     36
350     21
500     12
50       9
0        6
Name: price, dtype: int64

In [73]:
df.rating.value_counts()

4.0              100
-                 96
3.9               53
4.1               50
3.8               45
4.3               29
3.7               28
4.2               28
3.6               27
3.5               22
3.4               14
3.2               13
3.3               10
3.1                8
Not available      6
New                5
4.5                3
4.4                3
3.0                2
2.6                1
2.9                1
2.7                1
Name: rating, dtype: int64

In [74]:
df["rating"] = df["rating"].str.replace("-", "0")
df["rating"] = df["rating"].str.replace("New", "0")
df["rating"] = df["rating"].str.replace("Not available", "0")

In [75]:
pd.set_option("display.max_rows", None)
df

Unnamed: 0,name,rating,price,city
0,Moti Mahal Delux,3.8,300,ncr
1,Gulati,4.3,300,ncr
2,Veg Gulati,4.0,300,ncr
3,Bikkgane Biryani,3.9,300,ncr
4,Paradise Muradabadi Chicken Biryani,4.1,300,ncr
5,Dhaba On Wheels,3.1,300,ncr
6,Mr Naan Mrs Curry,3.4,300,ncr
7,Irshad Hussain Dhaba,4.0,300,ncr
8,Zaika Muradabadi Chicken Biryani,3.5,300,ncr
9,Best In Town,0.0,250,ncr


- Average prices
- Average ratings

- Number of places in each city
- Number of places in each city with rating above 4
- Number of places in each city with rating above 4 and price below 200