In [10]:
import os
import time
import pandas as pd
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import NoSuchElementException, TimeoutException
from openpyxl import load_workbook
import warnings
from datetime import datetime

# Suppress openpyxl warnings
warnings.filterwarnings('ignore', category=UserWarning, module='openpyxl')

class YogaScraper:
    def __init__(self):
        self.url = "https://www.sulekha.com/yoga-classes/chennai"
        self.output_file = "yoga_classes_chennai.xlsx"
        self.scraped_urls = set()
        self.driver = None
        self.initialize_scraper()
        
    def initialize_scraper(self):
        # Set up Chrome options
        options = webdriver.ChromeOptions()
        options.add_argument("--start-maximized")
        options.add_argument("--disable-notifications")
        
        # Initialize the WebDriver
        self.driver = webdriver.Chrome(options=options)
        
        # Load existing data if file exists
        if os.path.exists(self.output_file):
            try:
                df = pd.read_excel(self.output_file)
                self.scraped_urls = set(df['URL'].tolist())
                print(f"Resuming scraping. Found {len(self.scraped_urls)} existing records.")
            except Exception as e:
                print(f"Could not read existing file: {e}. Starting fresh scrape.")
    
    def scroll_to_load_content(self):
        """Scroll the page to ensure all content is loaded"""
        scroll_pause = 2.5
        screen_height = self.driver.execute_script("return window.innerHeight")
        scroll_step = int(screen_height * 0.8)
        
        last_height = self.driver.execute_script("return document.body.scrollHeight")
        scroll_attempts = 0
        
        while scroll_attempts < 5:
            # Scroll down by step amount
            self.driver.execute_script(f"window.scrollBy(0, {scroll_step});")
            time.sleep(scroll_pause)
            
            # Check for "View More" button
            try:
                view_more = self.driver.find_element(By.ID, "btnviewmorebiz")
                if view_more.is_displayed():
                    self.driver.execute_script("arguments[0].scrollIntoView({block: 'center'});", view_more)
                    time.sleep(1)
                    self.driver.execute_script("arguments[0].click();", view_more)
                    time.sleep(3)
            except:
                pass
            
            # Check if we've reached the bottom
            new_height = self.driver.execute_script("return document.body.scrollHeight")
            if new_height == last_height:
                scroll_attempts += 1
            else:
                scroll_attempts = 0
                last_height = new_height
            
            current_pos = self.driver.execute_script("return window.pageYOffset + window.innerHeight")
            if current_pos >= new_height:
                break
    
    def extract_listing_data(self, listing):
        """Extract all required data from a single listing"""
        try:
            # Extract name and URL
            name_element = listing.find_element(By.CSS_SELECTOR, "div.name h3.title-xlarge")
            name = name_element.text.strip()
            url = listing.find_element(By.CSS_SELECTOR, "div.name a").get_attribute("href").strip()
            
            if url in self.scraped_urls:
                return None
                
            # Extract locality
            try:
                locality = listing.find_element(By.CSS_SELECTOR, "div.locality span").text.strip()
            except:
                locality = ""
                
            # Extract rating
            try:
                rating = listing.find_element(By.CSS_SELECTOR, "div.ratings-group b").text.strip()
            except:
                rating = ""
                
            # Extract number of reviews
            try:
                reviews = listing.find_element(By.CSS_SELECTOR, "div.ratings-group span").text\
                    .replace("Based on", "").replace("reviews", "").strip()
            except:
                reviews = ""
                
            # Extract phone number
            try:
                phone = listing.find_element(By.CSS_SELECTOR, "div.mobile a[href^='tel:']").text.strip()
            except:
                phone = ""
            
            return {
                "Class Name": name,
                "URL": url,
                "Locality": locality,
                "Rating": rating,
                "Number of Reviews": reviews,
                "Phone Number": phone,
                "Scraped At": datetime.now().strftime("%Y-%m-%d %H:%M:%S")
            }
            
        except Exception as e:
            print(f"Error processing listing: {e}")
            return None
    
    def scrape_visible_listings(self):
        """Scrape all currently visible listings on the page"""
        listings = self.driver.find_elements(By.CSS_SELECTOR, "div.sk-card")
        data = []
        
        for listing in listings:
            listing_data = self.extract_listing_data(listing)
            if listing_data:
                data.append(listing_data)
                self.scraped_urls.add(listing_data['URL'])
        
        return data
    
    def save_to_excel(self, data):
        """Save the scraped data to Excel file"""
        if not data:
            return
            
        try:
            # Create DataFrame from new data
            new_df = pd.DataFrame(data)
            
            # Load existing data if file exists
            if os.path.exists(self.output_file):
                existing_df = pd.read_excel(self.output_file)
                combined_df = pd.concat([existing_df, new_df], ignore_index=True)
                combined_df.drop_duplicates(subset=['URL'], keep='last', inplace=True)
            else:
                combined_df = new_df
            
            # Save to Excel
            combined_df.to_excel(self.output_file, index=False)
            print(f"Saved {len(combined_df)} total records.")
            
        except PermissionError:
            print("Excel file is locked. Waiting to save...")
            time.sleep(10)
            self.save_to_excel(data)
        except Exception as e:
            print(f"Error saving data: {e}")
    
    def run(self):
        try:
            print("Starting scraping...")
            self.driver.get(self.url)
            time.sleep(5)  # Initial page load
            
            print("Scrolling and scraping content...")
            while True:
                # Scroll to load content
                self.scroll_to_load_content()
                
                # Scrape current content
                new_data = self.scrape_visible_listings()
                if new_data:
                    self.save_to_excel(new_data)
                    print(f"Added {len(new_data)} new listings")
                
                # Check if we've reached the end
                try:
                    self.driver.find_element(By.CSS_SELECTOR, "div.no-more-results")
                    print("Reached end of results")
                    break
                except:
                    pass
                
        except Exception as e:
            print(f"Scraping failed: {e}")
        finally:
            self.driver.quit()
            print("Scraping completed successfully!")

if __name__ == "__main__":
    scraper = YogaScraper()
    scraper.run()

Starting scraping...
Scrolling and scraping content...
Error processing listing: Message: no such element: Unable to locate element: {"method":"css selector","selector":"div.name h3.title-xlarge"}
  (Session info: chrome=134.0.6998.179); For documentation on this error, please visit: https://www.selenium.dev/documentation/webdriver/troubleshooting/errors#no-such-element-exception
Stacktrace:
	GetHandleVerifier [0x00007FF70009FE45+26629]
	(No symbol) [0x00007FF700006010]
	(No symbol) [0x00007FF6FFE9931A]
	(No symbol) [0x00007FF6FFEEF8E7]
	(No symbol) [0x00007FF6FFEEFB1C]
	(No symbol) [0x00007FF6FFEE228C]
	(No symbol) [0x00007FF6FFF17AEF]
	(No symbol) [0x00007FF6FFEE2156]
	(No symbol) [0x00007FF6FFF17CC0]
	(No symbol) [0x00007FF6FFF40169]
	(No symbol) [0x00007FF6FFF17883]
	(No symbol) [0x00007FF6FFEE0550]
	(No symbol) [0x00007FF6FFEE1803]
	GetHandleVerifier [0x00007FF7003F72BD+3529853]
	GetHandleVerifier [0x00007FF70040DA22+3621858]
	GetHandleVerifier [0x00007FF7004024D3+3575443]
	GetHan

KeyboardInterrupt: 