In [1]:
%pip install selenium pandas openpyxl

Defaulting to user installation because normal site-packages is not writeable
Note: you may need to restart the kernel to use updated packages.


In [2]:
import time
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import NoSuchElementException, TimeoutException
import pandas as pd
import os
from datetime import datetime

# Configure Chrome options
options = webdriver.ChromeOptions()
options.add_argument("--start-maximized")
# Drop Chrome's "enable-automation" switch and the automation extension.
# NOTE(review): presumably meant to make the session look less like an
# automated one - confirm this is still needed for the target site.
options.add_experimental_option("excludeSwitches", ["enable-automation"])
options.add_experimental_option('useAutomationExtension', False)

# Initialize the driver
# This launches a browser as soon as the cell runs; main_scraping_loop()
# is the code that eventually calls driver.quit().
driver = webdriver.Chrome(options=options)
# Shared explicit wait (15 second timeout) used by all scraping functions below.
wait = WebDriverWait(driver, 15)

# Main URL
# Listing page whose venue links (and "See more" button) drive the scrape.
main_url = "https://turftown.in/chennai/sports-venues"
driver.get(main_url)

# File to store scraped data
# output_file: final workbook; temp_file: per-venue checkpoint that is merged
# into output_file at the end of a run; log_file: append-only text log.
output_file = 'turf.xlsx'
temp_file = 'turf_temp.xlsx'
log_file = 'scraping_log.txt'

def log_message(message):
    """Print a timestamped log line and append it to the log file.

    The line has the form ``[YYYY-MM-DD HH:MM:SS] <message>`` followed by a
    newline. It is echoed to stdout and appended to the file named by the
    module-level ``log_file``.

    Args:
        message: Text to record. It is interpolated with an f-string, so any
            object with a string representation is accepted.
    """
    timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
    log_entry = f"[{timestamp}] {message}\n"
    print(log_entry, end='')
    # Pin the encoding: messages embed exception text and URLs, and the
    # platform default codec (e.g. cp1252 on Windows) raises
    # UnicodeEncodeError on characters it cannot represent.
    with open(log_file, 'a', encoding='utf-8') as f:
        f.write(log_entry)

def load_scraped_urls():
    """Collect the URLs already recorded in the output and temp workbooks.

    Returns:
        set: every value found in the ``URL`` column of ``output_file`` and
        ``temp_file``. A missing file or a workbook without a ``URL`` column
        contributes nothing; a workbook that cannot be read is logged as a
        warning and skipped.
    """
    seen = set()
    for workbook in (output_file, temp_file):
        if not os.path.exists(workbook):
            continue
        try:
            frame = pd.read_excel(workbook)
            if 'URL' not in frame.columns:
                continue
            seen |= set(frame['URL'].tolist())
        except Exception as exc:
            log_message(f"Warning: Could not read {workbook}: {str(exc)}")
    return seen

def save_data(data, file_path):
    """Write rows to an Excel workbook, merging with what is already there.

    When ``file_path`` exists, its rows are loaded, the new rows are appended,
    and duplicates on the ``URL`` column are dropped keeping the last
    occurrence. Otherwise the new rows are written as a fresh workbook.

    Args:
        data: Rows to store, in any form accepted by ``pd.DataFrame``.
        file_path: Destination workbook path.

    Returns:
        bool: True once the workbook has been written. False when a
        ``PermissionError`` persists through all three attempts (two seconds
        apart) or when any other exception occurs; both cases are logged.
    """
    max_retries = 3
    attempt = 0
    while attempt < max_retries:
        try:
            new_rows = pd.DataFrame(data)
            if not os.path.exists(file_path):
                new_rows.to_excel(file_path, index=False)
            else:
                previous_rows = pd.read_excel(file_path)
                merged = pd.concat([previous_rows, new_rows], ignore_index=True)
                merged = merged.drop_duplicates(subset=['URL'], keep='last')
                merged.to_excel(file_path, index=False)
            return True
        except PermissionError:
            # Only PermissionError is retried; give up after the last attempt.
            out_of_attempts = attempt == max_retries - 1
            if out_of_attempts:
                log_message(f"Error: Could not save to {file_path} after {max_retries} attempts")
                return False
            time.sleep(2)
        except Exception as exc:
            log_message(f"Error saving data: {str(exc)}")
            return False
        attempt += 1
    return True

def scrape_phone_number_with_retry(max_retries=2):
    """Reveal and read the phone number on the venue page currently loaded.

    Works on the module-level ``driver`` and ``wait``. Each attempt scrolls to
    the bottom of the page, clicks the call button, reads the number from the
    modal that opens, and dismisses the modal again.

    Args:
        max_retries: Maximum number of attempts.

    Returns:
        str: the modal title text with the "Contact Number: " prefix removed
        and surrounding whitespace stripped, or one of these sentinels:
        "Modal timeout" when the final attempt hit a ``TimeoutException``,
        "Scraping error" when it failed in any other way (including the call
        button not being found), and "Not found after retries" when no
        attempt ran at all (``max_retries`` < 1).
    """
    # Candidate locators for the call button, tried in this order.
    call_button_selectors = (
        "div.address_button_cont2__N6Sn9 button",  # Specific container for call button
        "button img[alt='call_icon']",  # Button containing call icon image
        "button[class*='primaryButton_dark_container'] img[alt*='call']",  # More specific
    )
    modal_selector = "div.address_modal__TMCLW"

    for attempt in range(max_retries):
        try:
            # Scroll to bottom where the buttons are located
            driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
            time.sleep(1.5)

            # Find the call button with the first locator that becomes clickable
            call_button = None
            for selector in call_button_selectors:
                try:
                    call_button = wait.until(
                        EC.element_to_be_clickable((By.CSS_SELECTOR, selector))
                    )
                    break
                except Exception:
                    # Best effort: fall through to the next locator. This is
                    # `Exception` rather than a bare `except` so that
                    # KeyboardInterrupt / SystemExit still stop the scrape.
                    continue

            if not call_button:
                raise NoSuchElementException("Call button not found")

            # Scroll the button into view properly
            driver.execute_script(
                "arguments[0].scrollIntoView({block: 'center', behavior: 'smooth'});",
                call_button,
            )
            time.sleep(0.5)

            # Click using JavaScript to avoid interception
            driver.execute_script("arguments[0].click();", call_button)
            log_message(f"Attempt {attempt + 1}: Clicked call button")

            # Wait for the modal itself before looking for the number inside it
            wait.until(
                EC.visibility_of_element_located((By.CSS_SELECTOR, modal_selector))
            )

            # Extract phone number
            phone_element = wait.until(
                EC.visibility_of_element_located(
                    (By.CSS_SELECTOR, f"{modal_selector} p.address_modalTitle__Bzh08")
                )
            )
            phone_number = phone_element.text.replace("Contact Number: ", "").strip()

            # Click OK to close modal
            ok_button = wait.until(
                EC.element_to_be_clickable(
                    (By.CSS_SELECTOR, f"{modal_selector} button.primaryButton_container__u4V1M")
                )
            )
            driver.execute_script("arguments[0].click();", ok_button)

            # Wait for modal to disappear so it cannot cover later interactions
            wait.until(
                EC.invisibility_of_element_located((By.CSS_SELECTOR, modal_selector))
            )

            return phone_number

        except TimeoutException:
            log_message(f"Attempt {attempt + 1}: Timeout waiting for modal")
            if attempt == max_retries - 1:
                return "Modal timeout"
            time.sleep(2)

        except Exception as e:
            log_message(f"Attempt {attempt + 1}: Error - {str(e)}")
            if attempt == max_retries - 1:
                return "Scraping error"
            time.sleep(2)

    return "Not found after retries"

def scrape_venue_data(venue_url):
    """Scrape one venue page in a temporary browser tab and return its fields.

    A new tab is opened, the venue page is loaded in it and scraped, and the
    tab is always closed again with focus returned to the window that was
    active on entry.

    Args:
        venue_url: Absolute URL of the venue page.

    Returns:
        dict: keys 'URL', 'Scraped At', 'Image URLs', 'Venue Name',
        'Sports Type', 'Rating', 'Number of Ratings', 'Address', 'Amenities'
        and 'Phone Number'. A field that could not be read keeps the
        placeholder "Not found" (the phone number may instead carry one of
        the sentinels of ``scrape_phone_number_with_retry``).

    Raises:
        RuntimeError: if no new tab appeared after ``window.open``.
        Exception: whatever ``driver.get`` raises when navigation fails; the
            tab is closed before the exception propagates.
    """
    # Identify the new tab by handle rather than assuming it sits at index 1:
    # with a tab left over from an earlier failure, index 1 is the wrong tab.
    main_handle = driver.current_window_handle
    handles_before = set(driver.window_handles)
    driver.execute_script("window.open('');")
    opened = [handle for handle in driver.window_handles if handle not in handles_before]
    if not opened:
        raise RuntimeError("Could not open a new tab for the venue page")
    driver.switch_to.window(opened[-1])

    try:
        driver.get(venue_url)
        time.sleep(2)  # Let page load completely
        return _scrape_venue_fields(venue_url)
    finally:
        # Cleanup must not `return`: doing so would discard an in-flight
        # exception such as KeyboardInterrupt.
        try:
            driver.close()
        finally:
            driver.switch_to.window(main_handle)


def _scrape_venue_fields(venue_url):
    """Read every field from the venue page loaded in the active tab.

    Each field is scraped best-effort: a failure is logged and leaves that
    field at "Not found" without affecting the others.
    """
    venue_data = {
        'URL': venue_url,
        'Scraped At': datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
        'Image URLs': "Not found",
        'Venue Name': "Not found",
        'Sports Type': "Not found",
        'Rating': "Not found",
        'Number of Ratings': "Not found",
        'Address': "Not found",
        'Amenities': "Not found",
        'Phone Number': "Not found"
    }

    try:
        # 1. Image URLs (images without a src attribute are skipped)
        try:
            images = wait.until(EC.presence_of_all_elements_located(
                (By.CSS_SELECTOR, ".GridImageSection_row__YVgOa img")
            ))
            sources = [img.get_attribute('src') for img in images]
            venue_data['Image URLs'] = ", ".join(src for src in sources if src)
        except Exception as e:
            log_message(f"Image URLs error: {str(e)}")

        # 2. Venue name and area (joined onto a single line)
        try:
            name_element = wait.until(EC.presence_of_element_located(
                (By.CSS_SELECTOR, ".authenticatedVenue_name__oo_V3")
            ))
            venue_data['Venue Name'] = name_element.text.replace('\n', ' ').strip()
        except Exception as e:
            log_message(f"Venue name error: {str(e)}")

        # 3. Sports type
        try:
            sport_type = wait.until(EC.presence_of_element_located(
                (By.CSS_SELECTOR, ".rating_sport_name__xnRpa.rating_white1__13nIV")
            ))
            venue_data['Sports Type'] = sport_type.text.strip()
        except Exception as e:
            log_message(f"Sports type error: {str(e)}")

        # 4. Rating - main selector first, then two progressively looser fallbacks
        try:
            rating = wait.until(EC.presence_of_element_located(
                (By.CSS_SELECTOR, "div.rating_row__x4_93 p.rating_orange__6Il0F")
            ))
            venue_data['Rating'] = rating.text.strip()
        except TimeoutException:
            try:
                # Fallback to more general selector
                rating = driver.find_element(
                    By.CSS_SELECTOR, "p[class*='rating_orange']"
                )
                venue_data['Rating'] = rating.text.strip()
            except NoSuchElementException:
                try:
                    # Last fallback - short orange-classed text
                    rating = driver.find_element(
                        By.XPATH, "//p[contains(@class, 'orange') and string-length(text()) < 5]"
                    )
                    venue_data['Rating'] = rating.text.strip()
                except Exception:
                    venue_data['Rating'] = "Not found"
                    log_message(f"Rating not found for {venue_url}")
        except Exception as e:
            venue_data['Rating'] = "Not found"
            log_message(f"Rating error for {venue_url}: {str(e)}")

        # 5. Number of ratings (strip the surrounding "(… ratings)" wrapper)
        try:
            num_ratings = wait.until(EC.presence_of_element_located(
                (By.CSS_SELECTOR, ".rating_sport_name__xnRpa.rating_white1__13nIV.rating_ml-8__CZHjt")
            ))
            venue_data['Number of Ratings'] = num_ratings.text.strip() \
                .replace('(', '').replace(' ratings)', '')
        except Exception as e:
            log_message(f"Number of ratings error: {str(e)}")

        # 6. Address
        try:
            address = wait.until(EC.presence_of_element_located(
                (By.CSS_SELECTOR, ".address_caption__tWtid")
            ))
            venue_data['Address'] = address.text.strip()
        except Exception as e:
            log_message(f"Address error: {str(e)}")

        # 7. Amenities ("amenties" is the id this code looks up on the page)
        try:
            amenities_section = wait.until(EC.presence_of_element_located(
                (By.ID, "amenties")
            ))
            amenities = amenities_section.find_elements(
                By.CSS_SELECTOR, ".amenities_sub__JlZ_n"
            )
            venue_data['Amenities'] = ", ".join(
                amenity.text.strip() for amenity in amenities
            )
        except Exception as e:
            log_message(f"Amenities error: {str(e)}")

        # 8. Phone number (scrape last with retries; it opens a modal)
        venue_data['Phone Number'] = scrape_phone_number_with_retry()

    except Exception as e:
        log_message(f"General error scraping {venue_url}: {str(e)}")

    return venue_data

def main_scraping_loop():
    """Scrape every venue on the listing page, resuming from earlier runs.

    URLs already present in the output or temp workbook are skipped. Each new
    venue is scraped and checkpointed to ``temp_file`` straight away. After
    the venues currently listed are done, the "See more" button is clicked
    and the listing is scanned again; the loop ends when that button can no
    longer be clicked or a page-level error occurs.

    On exit (including on errors) the temp workbook is merged into
    ``output_file`` and the browser is quit.
    """
    scraped_urls = load_scraped_urls()
    log_message(f"Loaded {len(scraped_urls)} previously scraped URLs")

    try:
        # Created inside the try so a failure here still quits the browser.
        if not os.path.exists(temp_file):
            pd.DataFrame(columns=['URL']).to_excel(temp_file, index=False)

        while True:
            try:
                new_urls = [url for url in _collect_venue_urls() if url not in scraped_urls]

                if not new_urls:
                    log_message("No new venues found on this page")
                else:
                    log_message(f"Found {len(new_urls)} new venues to scrape on this page")

                for url in new_urls:
                    log_message(f"Scraping: {url}")
                    venue_data = scrape_venue_data(url)

                    if not save_data([venue_data], temp_file):
                        log_message("Warning: Failed to save data for this venue")

                    scraped_urls.add(url)
                    time.sleep(1.5)  # Slightly longer delay between venues

                if not _click_see_more():
                    break

            except Exception as e:
                log_message(f"Error in page processing: {str(e)}")
                break

    except Exception as e:
        log_message(f"Fatal error in main loop: {str(e)}")
    finally:
        try:
            # Final cleanup and data consolidation
            try:
                _merge_temp_into_output()
            except Exception as e:
                log_message(f"Error during final cleanup: {str(e)}")

            log_message(f"Scraping complete. Final data saved to {output_file}")
        finally:
            # Quit the browser even if consolidation or logging fails.
            driver.quit()


def _collect_venue_urls():
    """Return the distinct absolute venue URLs currently listed, in page order."""
    venue_links = []
    for _ in range(3):  # Try 3 times to find links
        try:
            venue_links = wait.until(EC.presence_of_all_elements_located(
                (By.CSS_SELECTOR, ".venueItemAlt_container__FiIAZ a.venueItemAlt_deco-none__M2A4x")
            ))
            break
        except Exception:
            # `Exception`, not a bare `except`, so Ctrl-C still interrupts.
            time.sleep(2)
            driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")

    # Skip anchors without an href instead of crashing on None.startswith.
    hrefs = [link.get_attribute('href') for link in venue_links]
    absolute = [
        href if href.startswith('http') else f"https://turftown.in{href}"
        for href in hrefs
        if href
    ]
    # dict.fromkeys drops repeated links while keeping first-seen order, so a
    # venue listed twice is scraped only once.
    return list(dict.fromkeys(absolute))


def _click_see_more():
    """Click the "See more" button; return False when the scrape should stop."""
    try:
        see_more = wait.until(EC.element_to_be_clickable(
            (By.XPATH, "//button[contains(., 'See more')]")
        ))
        driver.execute_script("arguments[0].scrollIntoView();", see_more)
        time.sleep(0.5)
        driver.execute_script("arguments[0].click();", see_more)
        log_message("Clicked 'See more' button")
        time.sleep(3)  # Wait for new content to load
        return True
    except (NoSuchElementException, TimeoutException):
        log_message("No more 'See more' button found. Finished scraping.")
        return False
    except Exception as e:
        log_message(f"Error clicking 'See more': {str(e)}")
        return False


def _merge_temp_into_output():
    """Fold the temp workbook into the output workbook, then delete the temp file."""
    if not os.path.exists(temp_file):
        return
    temp_df = pd.read_excel(temp_file)
    if os.path.exists(output_file):
        main_df = pd.read_excel(output_file)
        combined_df = pd.concat([main_df, temp_df]).drop_duplicates(subset=['URL'], keep='last')
        combined_df.to_excel(output_file, index=False)
    else:
        temp_df.to_excel(output_file, index=False)
    os.remove(temp_file)

# Entry point: log the start of the run, then hand over to the scraping loop
# (which merges the results and quits the browser when it finishes).
if __name__ == "__main__":
    log_message("Starting scraping process")
    main_scraping_loop()

[2025-04-02 13:34:02] Starting scraping process
[2025-04-02 13:34:03] Loaded 439 previously scraped URLs
[2025-04-02 13:34:04] No new venues found on this page
[2025-04-02 13:34:05] Clicked 'See more' button
[2025-04-02 13:34:08] No new venues found on this page
[2025-04-02 13:34:09] Clicked 'See more' button
[2025-04-02 13:34:12] No new venues found on this page
[2025-04-02 13:34:13] Clicked 'See more' button
[2025-04-02 13:34:16] No new venues found on this page
[2025-04-02 13:34:16] Clicked 'See more' button
[2025-04-02 13:34:20] No new venues found on this page
[2025-04-02 13:34:20] Clicked 'See more' button
[2025-04-02 13:34:24] No new venues found on this page
[2025-04-02 13:34:24] Clicked 'See more' button
[2025-04-02 13:34:28] No new venues found on this page
[2025-04-02 13:34:29] Clicked 'See more' button
[2025-04-02 13:34:32] No new venues found on this page
[2025-04-02 13:34:33] Clicked 'See more' button
[2025-04-02 13:34:37] No new venues found on this page
[2025-04-02 13:3