In [3]:
!pip install selenium
!pip install bs4
!pip install pandas

In [4]:
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from bs4 import BeautifulSoup
import pandas as pd
from time import sleep

In [5]:
def _expand_read_more(driver):
    """Click every " read more" span on the page currently loaded in ``driver``."""
    # Imported locally so this block does not depend on the import cell changing.
    from selenium.common.exceptions import TimeoutException

    wait = WebDriverWait(driver, 10)
    try:
        # Wait until all "read more" buttons are present
        buttons = wait.until(EC.presence_of_all_elements_located((By.XPATH, "//span[text()=' read more']")))
    except TimeoutException:
        # A page without truncated descriptions is not an error: there is simply
        # nothing to expand, and the page can still be saved as it is.
        print("No 'read more' buttons found; saving the page as-is.")
        buttons = []

    print(f"Expanding all the {len(buttons)} descriptions...\n")

    for i, button in enumerate(buttons):
        try:
            print(f"{i+1} Expended ...")
            # Scroll into view
            driver.execute_script("arguments[0].scrollIntoView();", button)
            sleep(1)

            # Ensure the element is clickable
            wait.until(EC.element_to_be_clickable(button))

            # Click through JavaScript rather than WebElement.click()
            driver.execute_script("arguments[0].click();", button)

            sleep(1)
        except Exception as e:
            # One button failing must not stop the remaining ones.
            print(f"Error clicking button: {e}")


def get_source(url):
    """Open ``url`` in Chrome, expand the "read more" descriptions and save the page.

    The rendered HTML is written to ``page_source.html`` (UTF-8) in the current
    working directory. If expanding the descriptions or reading the page source
    fails, the error is printed and an empty file is written.

    Args:
        url: Address of the page to load.

    Returns:
        None. The HTML is deliberately not returned so that calling this as the
        last statement of a notebook cell does not echo the whole page.
    """
    # Set up Chrome options
    chrome_options = Options()
    chrome_options.add_argument('log-level=3')
    chrome_options.add_argument("--start-maximized")
    chrome_options.add_argument('--blink-settings=imagesEnabled=false')
    chrome_options.add_argument('--disable-images')
    chrome_options.add_argument('--no-sandbox')
    chrome_options.add_argument('--disable-dev-shm-usage')

    # Initialize the WebDriver
    driver = webdriver.Chrome(options=chrome_options)

    html = ""
    try:
        driver.get(url)

        try:
            _expand_read_more(driver)

            # Get page source after clicking all buttons (single round-trip)
            html = driver.page_source or ""
        except Exception as e:
            print(f"Error: {e}")

        # Save the page source to a file
        with open('page_source.html', 'w', encoding='utf-8') as file:
            file.write(html)
    finally:
        # Always close the browser, even if loading the page or writing the file failed.
        driver.quit()

In [6]:
# Page to scrape.
url = "https://www.zomato.com/mumbai/tru-falafel-khar/order"

# Drives Chrome through the page and writes the rendered HTML to page_source.html.
get_source(url)

Expanding all the 40 descriptions...

1 Expended ...
2 Expended ...
3 Expended ...
4 Expended ...
5 Expended ...
6 Expended ...
7 Expended ...
8 Expended ...
9 Expended ...
10 Expended ...
11 Expended ...
12 Expended ...
13 Expended ...
14 Expended ...
15 Expended ...
16 Expended ...
17 Expended ...
18 Expended ...
19 Expended ...
20 Expended ...
21 Expended ...
22 Expended ...
23 Expended ...
24 Expended ...
25 Expended ...
26 Expended ...
27 Expended ...
28 Expended ...
29 Expended ...
30 Expended ...
31 Expended ...
32 Expended ...
33 Expended ...
34 Expended ...
35 Expended ...
36 Expended ...
37 Expended ...
38 Expended ...
39 Expended ...
40 Expended ...


In [7]:
# Read back the HTML snapshot written by get_source(), using the same codec
# name ('utf-8') that was used when the file was written.
with open("page_source.html", encoding='utf-8') as source_file:
    content = source_file.read()

In [8]:
# Parse the saved HTML with Python's built-in parser.
soup = BeautifulSoup(markup=content, features='html.parser')

In [9]:
# Echo the parsed document as the cell output (this is the whole page, so the output is large).
soup

<html data-rh="lang" lang="en-in"><head>
<meta content="width=device-width, initial-scale=1" name="viewport"/>
<link href="https://googleads.g.doubleclick.net" rel="preconnect"/>
<link href="https://jumbo.zomato.com" rel="preconnect"/>
<link href="https://accounts.google.com" rel="preconnect"/>
<link href="https://securepubads.g.doubleclick.net" rel="preconnect"/>
<link href="https://www.google-analytics.com" rel="preconnect"/>
<link href="https://www.googleadservices.com" rel="preconnect"/>
<link href="https://bat.bing.com" rel="preconnect"/>
<link href="https://8391443.fls.doubleclick.net" rel="preconnect"/>
<title>Tru Falafel, Khar order online - Zomato</title>
<meta charset="utf-8" data-rh="true"/><meta content="NOODP,NOYDIR" data-rh="true" name="robots"/><meta content="Order food online from Tru Falafel, Khar, Mumbai. Get great offers and super fast food delivery when you order food online from Tru Falafel on Zomato." data-rh="true" name="description"/><meta content="summary" data

In [10]:
# Every dish is introduced by a marker <div> whose `type` attribute is
# 'veg' or 'non-veg'; the dish details are read from the element that
# follows the marker's parent.
food_items = soup.find_all('div', {'type': ['veg', 'non-veg']})
data = []
for elm in food_items:
    # Determine the food type
    food_type = elm.get('type')

    # Find the sibling element
    sibling = elm.find_parent().find_next_sibling()
    if not sibling:
        continue

    # Reset per item so a dish without a section heading neither raises
    # NameError nor inherits the heading of the previous dish.
    section_text = None
    sub_heading = ""

    # Nearest enclosing <section> of the dish
    section_tag = sibling.find_parent('section')
    if section_tag:
        # Extract the h4 tag text within the section
        section = section_tag.find('h4')
        if section:
            section_text = section.get_text().strip()
        else:
            print("No section found within the section.")

        # Extract the sub heading: the last matching paragraph wins
        sub_heading_tags = section_tag.select('section > div > p')
        if sub_heading_tags:
            sub_heading = sub_heading_tags[-1].get_text().strip()
    else:
        print("No section tag found.")

    # Extract the title
    title_h = sibling.select_one("div > div > div > h4")
    title = title_h.get_text().strip() if title_h else None

    # Extract the votes (`:-soup-contains` replaces the deprecated `:contains`)
    vote_span = sibling.select_one("span:-soup-contains('votes')")
    vote = vote_span.get_text().strip() if vote_span else None

    # Extract the price: first nested span that is not the votes span
    price_span = sibling.select_one("div > div > div > div > span:not(:-soup-contains('votes'))")
    price = price_span.get_text().replace('₹', '').strip() if price_span else None

    # Extract the description
    description_p = sibling.select_one("p")
    description = description_p.get_text().strip() if description_p else None

    data.append({
        'Title': title,
        'Type': food_type,
        'Price': price,
        'Votes': vote,
        'Description': description,
        'Sub Headding': sub_heading,
        'Section': section_text
    })



In [11]:
# One row per scraped dish, one column per dictionary key.
df = pd.DataFrame(data)

# Best-effort export: a failed write is reported instead of stopping the notebook.
try:
    df.to_csv(path_or_buf='output.csv', index=False)
except Exception as err:
    print(err)