In [1]:
import time
import csv
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException, NoSuchElementException, StaleElementReferenceException

# === CONFIG ===
maps_url = "https://www.google.com/maps/place/Pearl+Continental+Hotel+Rawalpindi/data=!4m10!3m9!1s0x38df9363b8ad0a09:0xb00ace25d4922b2c!5m2!4m1!1i2!8m2!3d33.588719!4d73.056722!16s%2Fg%2F12cnws5r7!19sChIJCQqtuGOT3zgRLCuS1CXOCrA?authuser=0&hl=en&rclk=1"
chromedriver_path = r"E:\Downloads\chromedriver-win64\chromedriver-win64\chromedriver.exe"
output_file = "cheesecake_factory_reviews.csv"

# === SETUP SELENIUM WITH ANTI-DETECTION ===
chrome_options = Options()
# Keep browser visible for debugging - comment out headless for now
# chrome_options.add_argument("--headless=new")

# Anti-detection measures
chrome_options.add_argument("--disable-blink-features=AutomationControlled")
chrome_options.add_experimental_option("excludeSwitches", ["enable-automation"])
chrome_options.add_experimental_option('useAutomationExtension', False)
chrome_options.add_argument("--disable-gpu")
chrome_options.add_argument("--window-size=1920,1080")
chrome_options.add_argument("--no-sandbox")
chrome_options.add_argument("--disable-dev-shm-usage")
chrome_options.add_argument("--user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36")
chrome_options.add_argument("--disable-web-security")
chrome_options.add_argument("--allow-running-insecure-content")

try:
    service = Service(chromedriver_path)
    driver = webdriver.Chrome(service=service, options=chrome_options)
    
    # Execute CDP commands to hide webdriver
    driver.execute_cdp_cmd('Network.setUserAgentOverride', {
        "userAgent": 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36'
    })
    driver.execute_script("Object.defineProperty(navigator, 'webdriver', {get: () => undefined})")
    
    wait = WebDriverWait(driver, 20)
    
    print("üåê Opening Google Maps...")
    driver.get(maps_url)
    time.sleep(8)  # Longer initial wait
    
    # === CLOSE COOKIE POPUP IF PRESENT ===
    try:
        reject_btn = driver.find_element(By.XPATH, "//button[contains(., 'Reject all')]")
        reject_btn.click()
        print("‚úÖ Closed cookie popup")
        time.sleep(2)
    except:
        print("‚ÑπÔ∏è No cookie popup detected")
    
    # === OPEN ALL REVIEWS ===
    print("üîç Looking for reviews button...")
    try:
        # Wait and try to find reviews button
        possible_xpaths = [
            "//button[contains(@aria-label,'Reviews')]",
            "//button[contains(@aria-label,'reviews')]",
            "//button[contains(@data-tab-index,'1')]",
            "//button[.//span[contains(text(),'Reviews')]]",
            "//button[.//span[contains(text(),'reviews')]]"
        ]
        
        all_reviews_btn = None
        for xpath in possible_xpaths:
            try:
                all_reviews_btn = WebDriverWait(driver, 5).until(
                    EC.element_to_be_clickable((By.XPATH, xpath))
                )
                print(f"   Found button with xpath: {xpath[:50]}...")
                break
            except:
                continue
        
        if all_reviews_btn:
            # Scroll to button first
            driver.execute_script("arguments[0].scrollIntoView({behavior: 'smooth', block: 'center'});", all_reviews_btn)
            time.sleep(1)
            all_reviews_btn.click()
            print("‚úÖ Clicked reviews button")
            time.sleep(5)
        else:
            print("‚ö†Ô∏è Could not find reviews button - reviews might already be visible")
    except Exception as e:
        print(f"‚ö†Ô∏è Error with reviews button: {e}")
        print("   Continuing anyway - reviews might already be visible")
    
    # === FIND SCROLL CONTAINER ===
    print("üìú Looking for scrollable reviews container...")
    scroll_div = None
    scroll_xpaths = [
        '//div[contains(@class,"m6QErb") and contains(@class,"DxyBCb")]',
        '//div[@role="main"]//div[contains(@class,"m6QErb")]',
        '//div[contains(@aria-label,"Reviews")]',
    ]
    
    for xpath in scroll_xpaths:
        try:
            scroll_div = driver.find_element(By.XPATH, xpath)
            print(f"   Found scroll container")
            break
        except:
            continue
    
    if not scroll_div:
        print("‚ùå Could not find scroll container. Exiting.")
        driver.quit()
        exit()
    
    # === SCROLL AND LOAD REVIEWS ===
    print("üîÅ Scrolling through reviews...")
    last_height = 0
    same_count = 0
    max_same = 5
    scroll_attempts = 0
    max_scrolls = 100
    
    while scroll_attempts < max_scrolls:
        try:
            driver.execute_script('arguments[0].scrollTop = arguments[0].scrollHeight', scroll_div)
            time.sleep(2)
            
            new_height = driver.execute_script('return arguments[0].scrollHeight', scroll_div)
            
            # Expand "More" buttons
            try:
                more_buttons = driver.find_elements(By.XPATH, '//button[@aria-label="See more" or contains(@class,"w8nwRe")]')
                for btn in more_buttons[:3]:
                    try:
                        if btn.is_displayed():
                            driver.execute_script("arguments[0].click();", btn)
                            time.sleep(0.5)
                    except:
                        pass
            except:
                pass
            
            if new_height == last_height:
                same_count += 1
            else:
                same_count = 0
            
            if same_count >= max_same:
                print(f"   Reached end after {scroll_attempts} scrolls")
                break
            
            last_height = new_height
            scroll_attempts += 1
            
            if scroll_attempts % 10 == 0:
                current_reviews = len(driver.find_elements(By.XPATH, '//div[@data-review-id]'))
                print(f"   Scrolled {scroll_attempts} times... Found {current_reviews} reviews so far")
                
        except Exception as e:
            print(f"   Error during scroll: {e}")
            break
    
    print("‚úÖ Scrolling complete. Extracting reviews...")
    
    # === EXTRACT REVIEWS ===
    reviews_elements = driver.find_elements(By.XPATH, '//div[@data-review-id]')
    print(f"üßæ Found {len(reviews_elements)} review containers")
    
    if len(reviews_elements) == 0:
        print("‚ö†Ô∏è No reviews found. Saving page source for debugging...")
        with open("debug_page_source.html", "w", encoding="utf-8") as f:
            f.write(driver.page_source)
        print("   Page source saved to debug_page_source.html")
    
    reviews_data = []
    
    for idx, r in enumerate(reviews_elements):
        try:
            # Extract rating
            rating = None
            try:
                rating_elem = r.find_element(By.XPATH, './/*[contains(@aria-label,"star")]')
                rating = rating_elem.get_attribute("aria-label")
            except:
                try:
                    rating_elem = r.find_element(By.XPATH, './/span[@role="img"]')
                    rating = rating_elem.get_attribute("aria-label")
                except:
                    rating = "No rating"
            
            # Extract review text
            text = ""
            try:
                text_elem = r.find_element(By.XPATH, './/span[@class="wiI7pd"]')
                text = text_elem.text.strip()
            except:
                try:
                    text_elem = r.find_element(By.XPATH, './/div[contains(@class,"MyEned")]')
                    text = text_elem.text.strip()
                except:
                    pass
            
            # Debug first few reviews
            if idx < 3:
                print(f"\n--- Review {idx+1} ---")
                print(f"Rating: {rating}")
                print(f"Text: {text[:100] if text else '(empty)'}...")
            
            if rating or text:
                reviews_data.append([rating, text])
                
        except StaleElementReferenceException:
            continue
        except Exception as e:
            if idx < 5:
                print(f"   Error on review {idx}: {e}")
            continue
    
    print(f"\nüìä Successfully extracted {len(reviews_data)} reviews")
    
    driver.quit()
    
    # === SAVE CSV ===
    if len(reviews_data) > 0:
        with open(output_file, "w", newline="", encoding="utf-8-sig") as f:
            writer = csv.writer(f)
            writer.writerow(["Rating", "Review"])
            writer.writerows(reviews_data)
        print(f"‚úÖ Done! {len(reviews_data)} reviews saved to '{output_file}'")
    else:
        print("‚ùå No reviews were extracted. Check debug_page_source.html to see what the page looks like.")

except Exception as e:
    print(f"\n‚ùå Fatal error: {e}")
    import traceback
    traceback.print_exc()
    try:
        driver.quit()
    except:
        pass

ModuleNotFoundError: No module named 'selenium'

In [1]:
import time
import csv
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException, NoSuchElementException, StaleElementReferenceException

# === CONFIG ===
maps_url = "https://www.google.com/maps/place/Semicolon+Cafe/@47.6119559,-122.200889,17z/data=!3m1!4b1!4m6!3m5!1s0x54906d666ec16e35:0x7b19a782bd3e0877!8m2!3d47.6119559!4d-122.200889!16s%2Fg%2F11rwmdyt7w?entry=ttu&g_ep=EgoyMDI1MTEwMi4wIKXMDSoASAFQAw%3D%3D"
chromedriver_path = r"E:\Downloads\chromedriver-win64\chromedriver-win64\chromedriver.exe"
output_file = "cheesecake_factory_reviews.csv"

# === SETUP SELENIUM WITH ANTI-DETECTION ===
chrome_options = Options()
# Keep browser visible for debugging
# chrome_options.add_argument("--headless=new")

# Anti-detection measures
chrome_options.add_argument("--disable-blink-features=AutomationControlled")
chrome_options.add_experimental_option("excludeSwitches", ["enable-automation"])
chrome_options.add_experimental_option('useAutomationExtension', False)
chrome_options.add_argument("--disable-gpu")
chrome_options.add_argument("--window-size=1920,1080")
chrome_options.add_argument("--no-sandbox")
chrome_options.add_argument("--disable-dev-shm-usage")
chrome_options.add_argument("--user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36")

try:
    service = Service(chromedriver_path)
    driver = webdriver.Chrome(service=service, options=chrome_options)
    
    # Execute CDP commands to hide webdriver
    driver.execute_cdp_cmd('Network.setUserAgentOverride', {
        "userAgent": 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36'
    })
    driver.execute_script("Object.defineProperty(navigator, 'webdriver', {get: () => undefined})")
    
    wait = WebDriverWait(driver, 20)
    
    print("üåê Opening Google Maps...")
    driver.get(maps_url)
    time.sleep(8)
    
    # === CLOSE COOKIE POPUP IF PRESENT ===
    try:
        reject_btn = driver.find_element(By.XPATH, "//button[contains(., 'Reject all')]")
        reject_btn.click()
        print("‚úÖ Closed cookie popup")
        time.sleep(2)
    except:
        print("‚ÑπÔ∏è No cookie popup detected")
    
    # === OPEN ALL REVIEWS ===
    print("üîç Looking for reviews button...")
    try:
        possible_xpaths = [
            "//button[contains(@aria-label,'Reviews')]",
            "//button[contains(@aria-label,'reviews')]",
            "//button[contains(@data-tab-index,'1')]",
            "//button[.//span[contains(text(),'Reviews')]]",
            "//button[.//span[contains(text(),'reviews')]]"
        ]
        
        all_reviews_btn = None
        for xpath in possible_xpaths:
            try:
                all_reviews_btn = WebDriverWait(driver, 5).until(
                    EC.element_to_be_clickable((By.XPATH, xpath))
                )
                break
            except:
                continue
        
        if all_reviews_btn:
            driver.execute_script("arguments[0].scrollIntoView({behavior: 'smooth', block: 'center'});", all_reviews_btn)
            time.sleep(1)
            all_reviews_btn.click()
            print("‚úÖ Clicked reviews button")
            time.sleep(5)
        else:
            print("‚ö†Ô∏è Could not find reviews button")
    except Exception as e:
        print(f"‚ö†Ô∏è Error with reviews button: {e}")
    
    # === SORT BY NEWEST (to get consistent results) ===
    try:
        print("üîΩ Setting sort order to 'Newest'...")
        sort_button = WebDriverWait(driver, 10).until(
            EC.element_to_be_clickable((By.XPATH, "//button[contains(@aria-label,'Sort') or contains(@data-value,'Sort')]"))
        )
        sort_button.click()
        time.sleep(2)
        
        # Click "Newest" option
        newest_option = WebDriverWait(driver, 5).until(
            EC.element_to_be_clickable((By.XPATH, "//div[@role='menuitemradio' and contains(.,'Newest')]"))
        )
        newest_option.click()
        time.sleep(3)
        print("‚úÖ Sorted by Newest")
    except Exception as e:
        print(f"‚ÑπÔ∏è Could not change sort order: {e}")
    
    # === FIND SCROLL CONTAINER ===
    print("üìú Looking for scrollable reviews container...")
    scroll_div = None
    scroll_xpaths = [
        '//div[contains(@class,"m6QErb") and contains(@class,"DxyBCb")]',
        '//div[@role="main"]//div[contains(@class,"m6QErb")]',
        '//div[contains(@aria-label,"Reviews")]',
    ]
    
    for xpath in scroll_xpaths:
        try:
            scroll_div = driver.find_element(By.XPATH, xpath)
            print(f"   Found scroll container")
            break
        except:
            continue
    
    if not scroll_div:
        print("‚ùå Could not find scroll container. Exiting.")
        driver.quit()
        exit()
    
    # === SCROLL AND LOAD REVIEWS WITH DEDUPLICATION ===
    print("üîÅ Scrolling through reviews (this may take several minutes)...")
    
    seen_review_ids = set()  # Track unique review IDs
    last_count = 0
    no_new_reviews_count = 0
    max_no_new = 10  # Stop if no new reviews after 10 scrolls
    scroll_attempts = 0
    max_scrolls = 300  # Increase max scrolls to get all reviews
    
    while scroll_attempts < max_scrolls:
        try:
            # Scroll down
            driver.execute_script('arguments[0].scrollTop = arguments[0].scrollHeight', scroll_div)
            time.sleep(2.5)  # Wait for content to load
            
            # Expand "More" buttons to get full review text
            try:
                more_buttons = driver.find_elements(By.XPATH, '//button[@aria-label="See more" or contains(@class,"w8nwRe")]')
                for btn in more_buttons[:5]:  # Expand first 5 visible
                    try:
                        if btn.is_displayed():
                            driver.execute_script("arguments[0].click();", btn)
                            time.sleep(0.3)
                    except:
                        pass
            except:
                pass
            
            # Count unique reviews
            current_reviews = driver.find_elements(By.XPATH, '//div[@data-review-id]')
            current_unique_ids = set()
            for r in current_reviews:
                try:
                    review_id = r.get_attribute('data-review-id')
                    if review_id:
                        current_unique_ids.add(review_id)
                except:
                    pass
            
            current_count = len(current_unique_ids)
            
            # Check if we found new reviews
            if current_count == last_count:
                no_new_reviews_count += 1
            else:
                no_new_reviews_count = 0
                seen_review_ids.update(current_unique_ids)
            
            # Stop if no new reviews for a while
            if no_new_reviews_count >= max_no_new:
                print(f"   No new reviews found after {no_new_reviews_count} scrolls. Stopping.")
                break
            
            last_count = current_count
            scroll_attempts += 1
            
            # Progress update every 20 scrolls
            if scroll_attempts % 20 == 0:
                print(f"   Scrolled {scroll_attempts} times... Found {current_count} unique reviews")
                
        except Exception as e:
            print(f"   Error during scroll: {e}")
            break
    
    print(f"‚úÖ Scrolling complete after {scroll_attempts} scrolls")
    print(f"üßæ Found {len(seen_review_ids)} unique review IDs")
    
    # === EXTRACT REVIEWS USING UNIQUE IDs ===
    print("üìù Extracting review data...")
    
    reviews_data = []
    reviews_dict = {}  # Use dict to ensure no duplicates
    
    reviews_elements = driver.find_elements(By.XPATH, '//div[@data-review-id]')
    print(f"   Processing {len(reviews_elements)} review elements...")
    
    for idx, r in enumerate(reviews_elements):
        try:
            # Get unique review ID
            review_id = r.get_attribute('data-review-id')
            if not review_id or review_id in reviews_dict:
                continue  # Skip if no ID or already processed
            
            # Extract rating
            rating = None
            try:
                rating_elem = r.find_element(By.XPATH, './/*[contains(@aria-label,"star")]')
                rating = rating_elem.get_attribute("aria-label")
            except:
                try:
                    rating_elem = r.find_element(By.XPATH, './/span[@role="img"]')
                    rating = rating_elem.get_attribute("aria-label")
                except:
                    rating = "No rating"
            
            # Extract review text
            text = ""
            try:
                text_elem = r.find_element(By.XPATH, './/span[@class="wiI7pd"]')
                text = text_elem.text.strip()
            except:
                try:
                    text_elem = r.find_element(By.XPATH, './/div[contains(@class,"MyEned")]')
                    text = text_elem.text.strip()
                except:
                    pass
            
            # Extract reviewer name (optional, for verification)
            reviewer = ""
            try:
                reviewer_elem = r.find_element(By.XPATH, './/div[contains(@class,"d4r55")]')
                reviewer = reviewer_elem.text.strip()
            except:
                pass
            
            # Extract date (optional)
            date = ""
            try:
                date_elem = r.find_element(By.XPATH, './/span[contains(@class,"rsqaWe")]')
                date = date_elem.text.strip()
            except:
                pass
            
            # Store in dict using review_id as key to prevent duplicates
            if rating or text:
                reviews_dict[review_id] = {
                    'rating': rating,
                    'text': text,
                    'reviewer': reviewer,
                    'date': date
                }
            
            # Show progress
            if (idx + 1) % 100 == 0:
                print(f"   Processed {idx + 1}/{len(reviews_elements)} elements, {len(reviews_dict)} unique reviews")
                
        except StaleElementReferenceException:
            continue
        except Exception as e:
            continue
    
    # Convert dict to list
    for review_id, data in reviews_dict.items():
        reviews_data.append([data['rating'], data['text'], data['reviewer'], data['date']])
    
    print(f"\nüìä Successfully extracted {len(reviews_data)} unique reviews")
    
    # Debug: Show first few reviews
    print("\n--- Sample Reviews ---")
    for i, review in enumerate(reviews_data[:3]):
        print(f"\nReview {i+1}:")
        print(f"  Rating: {review[0]}")
        print(f"  Reviewer: {review[2]}")
        print(f"  Date: {review[3]}")
        print(f"  Text: {review[1][:100]}...")
    
    driver.quit()
    
    # === SAVE CSV ===
    if len(reviews_data) > 0:
        with open(output_file, "w", newline="", encoding="utf-8-sig") as f:
            writer = csv.writer(f)
            writer.writerow(["Rating", "Review", "Reviewer", "Date"])
            writer.writerows(reviews_data)
        print(f"\n‚úÖ Done! {len(reviews_data)} unique reviews saved to '{output_file}'")
        print(f"üìÅ File location: {output_file}")
    else:
        print("‚ùå No reviews were extracted.")

except Exception as e:
    print(f"\n‚ùå Fatal error: {e}")
    import traceback
    traceback.print_exc()
    try:
        driver.quit()
    except:
        pass

üåê Opening Google Maps...
‚ÑπÔ∏è No cookie popup detected
üîç Looking for reviews button...
‚úÖ Clicked reviews button
üîΩ Setting sort order to 'Newest'...
‚ÑπÔ∏è Could not change sort order: Message: invalid session id: session deleted as the browser has closed the connection
from disconnected: not connected to DevTools
  (Session info: chrome=142.0.7444.177); For documentation on this error, please visit: https://www.selenium.dev/documentation/webdriver/troubleshooting/errors#invalidsessionidexception
Stacktrace:
Symbols not available. Dumping unresolved backtrace:
	0x7ff68cda7a35
	0x7ff68cda7a90
	0x7ff68cb216ad
	0x7ff68cb0d1c5
	0x7ff68cb32a5a
	0x7ff68cbaa306
	0x7ff68cbcb222
	0x7ff68cb6b068
	0x7ff68cb6be93
	0x7ff68d0629d0
	0x7ff68d05ce50
	0x7ff68d07cc45
	0x7ff68cdc30ce
	0x7ff68cdcadbf
	0x7ff68cdb0c14
	0x7ff68cdb0dcf
	0x7ff68cd96828
	0x7ffd510153e0
	0x7ffd529a485b

üìú Looking for scrollable reviews container...
‚ùå Could not find scroll container. Exiting.
üîÅ Scrolling throug

Traceback (most recent call last):
  File "e:\Google_map_data_scraping\google map reviews scraping\mapscraper\Lib\site-packages\urllib3\connection.py", line 198, in _new_conn
    sock = connection.create_connection(
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "e:\Google_map_data_scraping\google map reviews scraping\mapscraper\Lib\site-packages\urllib3\util\connection.py", line 85, in create_connection
    raise err
  File "e:\Google_map_data_scraping\google map reviews scraping\mapscraper\Lib\site-packages\urllib3\util\connection.py", line 73, in create_connection
    sock.connect(sa)
ConnectionRefusedError: [WinError 10061] No connection could be made because the target machine actively refused it

The above exception was the direct cause of the following exception:

Traceback (most recent call last):
  File "e:\Google_map_data_scraping\google map reviews scraping\mapscraper\Lib\site-packages\urllib3\connectionpool.py", line 787, in urlopen
    response = self._make_request(
     

: 

In [2]:
import time
import json
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import StaleElementReferenceException

# === CONFIG ===
maps_url = "https://www.google.com/maps/place/The+Cheesecake+Factory,+401+Bellevue+Square,+Bellevue,+WA+98004,+United+States"
chromedriver_path = r"E:\Downloads\chromedriver-win64\chromedriver-win64\chromedriver.exe"
output_file = "cheesecake_factory_reviews.json"

# === SETUP SELENIUM ===
chrome_options = Options()
chrome_options.add_argument("--disable-blink-features=AutomationControlled")
chrome_options.add_experimental_option("excludeSwitches", ["enable-automation"])
chrome_options.add_experimental_option('useAutomationExtension', False)
chrome_options.add_argument("--disable-gpu")
chrome_options.add_argument("--window-size=1920,1080")
chrome_options.add_argument("--no-sandbox")
chrome_options.add_argument("--disable-dev-shm-usage")
chrome_options.add_argument("--user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36")

try:
    service = Service(chromedriver_path)
    driver = webdriver.Chrome(service=service, options=chrome_options)
    
    driver.execute_cdp_cmd('Network.setUserAgentOverride', {
        "userAgent": 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36'
    })
    driver.execute_script("Object.defineProperty(navigator, 'webdriver', {get: () => undefined})")
    
    wait = WebDriverWait(driver, 20)
    
    print("üåê Opening Google Maps...")
    driver.get(maps_url)
    time.sleep(6)
    
    # Close cookie popup if present
    try:
        reject_btn = driver.find_element(By.XPATH, "//button[contains(., 'Reject all')]")
        reject_btn.click()
        time.sleep(2)
    except:
        pass
    
    # === SCRAPE COMPANY NAME ===
    print("üè¢ Extracting company details...")
    company_name = ""
    phone_number = ""

    try:
        # Company name is usually in the h1 tag or div with fontHeadlineLarge
        name_el = wait.until(EC.presence_of_element_located((By.XPATH, '//h1[contains(@class,"DUwDvf")]')))
        company_name = name_el.text.strip()
    except:
        print("‚ö†Ô∏è Could not extract company name")

    # Phone number (inside call button or info section)
    try:
        phone_el = driver.find_element(By.XPATH, '//button[contains(@aria-label, "Phone") or contains(@data-item-id, "phone:tel") or contains(@aria-label, "Call")]')
        phone_number = phone_el.text.strip()
    except:
        try:
            phone_el = driver.find_element(By.XPATH, '//div[contains(text(), "+") and contains(text(), " ")]')
            phone_number = phone_el.text.strip()
        except:
            print("‚ö†Ô∏è Could not extract phone number")

    print(f"‚úÖ Company: {company_name}")
    print(f"üìû Phone: {phone_number or 'Not found'}")
    
    # === OPEN REVIEWS TAB ===
    print("üîç Opening reviews tab...")
    try:
        review_tab = wait.until(EC.element_to_be_clickable((By.XPATH, '//button[contains(@aria-label,"reviews")]')))
        driver.execute_script("arguments[0].click();", review_tab)
        time.sleep(4)
    except Exception as e:
        print(f"‚ö†Ô∏è Could not open reviews section: {e}")
    
    # === FIND SCROLL CONTAINER ===
    scroll_div = wait.until(EC.presence_of_element_located(
        (By.XPATH, '//div[contains(@class,"m6QErb") and contains(@class,"DxyBCb")]')
    ))
    
    # === SCROLL TO LOAD REVIEWS (LIMITED TO FIRST 100 FOR TEST) ===
    print("üîÅ Scrolling to load reviews...")
    seen_ids = set()
    scroll_count = 0

    while len(seen_ids) < 100 and scroll_count < 30:
        driver.execute_script('arguments[0].scrollTop = arguments[0].scrollHeight', scroll_div)
        time.sleep(1)
        elements = driver.find_elements(By.XPATH, '//div[@data-review-id]')
        for el in elements:
            rid = el.get_attribute('data-review-id')
            if rid:
                seen_ids.add(rid)
        scroll_count += 1
        print(f"   Scroll {scroll_count}: {len(seen_ids)} reviews loaded...")

    print(f"‚úÖ Loaded {len(seen_ids)} reviews for extraction.\n")
    
    # === EXPAND "SEE MORE" BUTTONS ===
    more_buttons = driver.find_elements(By.XPATH, '//button[@aria-label="See more"]')
    for btn in more_buttons:
        try:
            driver.execute_script("arguments[0].click();", btn)
        except:
            continue

    # === EXTRACT REVIEWS DATA ===
    print("üìù Extracting reviews...")
    reviews_data = []
    elements = driver.find_elements(By.XPATH, '//div[@data-review-id]')

    for idx, r in enumerate(elements):
        if idx >= 100:  # Limit for testing
            break
        try:
            review_id = r.get_attribute('data-review-id')
            if not review_id:
                continue

            # Rating
            rating = "No rating"
            try:
                rating_el = r.find_element(By.XPATH, './/*[contains(@aria-label,"star")]')
                rating = rating_el.get_attribute("aria-label")
            except:
                pass

            # Text
            text = ""
            try:
                text_el = r.find_element(By.XPATH, './/span[@class="wiI7pd"]')
                text = text_el.text.strip()
            except:
                pass

            # Reviewer
            reviewer = ""
            try:
                reviewer_el = r.find_element(By.XPATH, './/div[contains(@class,"d4r55")]')
                reviewer = reviewer_el.text.strip()
            except:
                pass

            # Date
            date = ""
            try:
                date_el = r.find_element(By.XPATH, './/span[contains(@class,"rsqaWe")]')
                date = date_el.text.strip()
            except:
                pass

            reviews_data.append({
                "review_id": review_id,
                "reviewer": reviewer,
                "rating": rating,
                "review_text": text,
                "date": date,
                "company_name": company_name,
                "phone_number": phone_number
            })
        except StaleElementReferenceException:
            continue
        except Exception:
            continue

    driver.quit()
    
    # === SAVE TO JSON ===
    with open(output_file, "w", encoding="utf-8") as f:
        json.dump(reviews_data, f, indent=4, ensure_ascii=False)

    print(f"\n‚úÖ DONE! Extracted {len(reviews_data)} reviews saved to {output_file}")

except Exception as e:
    print(f"\n‚ùå Error: {e}")
    import traceback
    traceback.print_exc()
    try:
        driver.quit()
    except:
        pass


üåê Opening Google Maps...
üîç Opening reviews...
‚úÖ Opened reviews
üîÅ Scrolling through reviews...
   (This will stop automatically when all available reviews are loaded)
   Scroll 10: 50 unique reviews (no new: 1/5)
   Scroll 20: 120 unique reviews (no new: 0/5)

‚úÖ Reached end! Found 120 unique reviews after 25 scrolls
üìñ Expanding truncated reviews...
   Found 109 'See more' buttons
   Expanded 50/109...
   Expanded 100/109...

üìù Extracting review data...

üìä Extracted 120 unique reviews

--- First 3 Reviews ---

1. Hyung Kim - 5 stars
   Date: 15 hours ago
   Text: ...

2. Maria D - 5 stars
   Date: 2 days ago
   Text: Amazing place to have dinner! Has a good amount of gluten-free options, only not...

3. Cole Littrell - 2 stars
   Date: 3 days ago
   Text: Great services. Cheesecake Serives a lot of type of food but they can‚Äôt cook any...

‚úÖ SUCCESS! 120 reviews saved to 'cheesecake_factory_reviews.csv'

üìä Rating Distribution:
   5: 71 reviews
   4: 24 reviews

In [2]:
import time
import json
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import StaleElementReferenceException

# === CONFIG ===
maps_url = "https://www.google.com/maps/place/Semicolon+Cafe/@47.6119559,-122.200889,17z/data=!3m1!4b1!4m6!3m5!1s0x54906d666ec16e35:0x7b19a782bd3e0877!8m2!3d47.6119559!4d-122.200889!16s%2Fg%2F11rwmdyt7w?entry=ttu&g_ep=EgoyMDI1MTEwMi4wIKXMDSoASAFQAw%3D%3D"
chromedriver_path = r"E:\Downloads\chromedriver-win64\chromedriver-win64\chromedriver.exe"
output_file = "cheesecake_factory_reviews.json"

# === SETUP SELENIUM ===
chrome_options = Options()
chrome_options.add_argument("--disable-blink-features=AutomationControlled")
chrome_options.add_experimental_option("excludeSwitches", ["enable-automation"])
chrome_options.add_experimental_option('useAutomationExtension', False)
chrome_options.add_argument("--disable-gpu")
chrome_options.add_argument("--window-size=1920,1080")
chrome_options.add_argument("--no-sandbox")
chrome_options.add_argument("--disable-dev-shm-usage")
chrome_options.add_argument("--user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36")

try:
    service = Service(chromedriver_path)
    driver = webdriver.Chrome(service=service, options=chrome_options)
    
    driver.execute_cdp_cmd('Network.setUserAgentOverride', {
        "userAgent": 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36'
    })
    driver.execute_script("Object.defineProperty(navigator, 'webdriver', {get: () => undefined})")
    
    wait = WebDriverWait(driver, 20)
    
    print("üåê Opening Google Maps...")
    driver.get(maps_url)
    time.sleep(6)
    
    # Close cookie popup if present
    try:
        reject_btn = driver.find_element(By.XPATH, "//button[contains(., 'Reject all')]")
        reject_btn.click()
        time.sleep(2)
    except:
        pass
    
    # === SCRAPE COMPANY NAME ===
    print("üè¢ Extracting company details...")
    company_name = ""
    phone_number = ""

    try:
        # Company name is usually in the h1 tag or div with fontHeadlineLarge
        name_el = wait.until(EC.presence_of_element_located((By.XPATH, '//h1[contains(@class,"DUwDvf")]')))
        company_name = name_el.text.strip()
    except:
        print("‚ö†Ô∏è Could not extract company name")

    # Phone number (inside call button or info section)
    try:
        phone_el = driver.find_element(By.XPATH, '//button[contains(@aria-label, "Phone") or contains(@data-item-id, "phone:tel") or contains(@aria-label, "Call")]')
        phone_number = phone_el.text.strip()
    except:
        try:
            phone_el = driver.find_element(By.XPATH, '//div[contains(text(), "+") and contains(text(), " ")]')
            phone_number = phone_el.text.strip()
        except:
            print("‚ö†Ô∏è Could not extract phone number")

    print(f"‚úÖ Company: {company_name}")
    print(f"üìû Phone: {phone_number or 'Not found'}")
    
    # === OPEN REVIEWS TAB ===
    print("üîç Opening reviews tab...")
    try:
        review_tab = wait.until(EC.element_to_be_clickable((By.XPATH, '//button[contains(@aria-label,"reviews")]')))
        driver.execute_script("arguments[0].click();", review_tab)
        time.sleep(4)
    except Exception as e:
        print(f"‚ö†Ô∏è Could not open reviews section: {e}")
    
    # === FIND SCROLL CONTAINER ===
    scroll_div = wait.until(EC.presence_of_element_located(
        (By.XPATH, '//div[contains(@class,"m6QErb") and contains(@class,"DxyBCb")]')
    ))
    
    # === SCROLL TO LOAD REVIEWS (LIMITED TO FIRST 100 FOR TEST) ===
    print("üîÅ Scrolling to load reviews...")
    seen_ids = set()
    scroll_count = 0

    while len(seen_ids) < 100 and scroll_count < 30:
        driver.execute_script('arguments[0].scrollTop = arguments[0].scrollHeight', scroll_div)
        time.sleep(1)
        elements = driver.find_elements(By.XPATH, '//div[@data-review-id]')
        for el in elements:
            rid = el.get_attribute('data-review-id')
            if rid:
                seen_ids.add(rid)
        scroll_count += 1
        print(f"   Scroll {scroll_count}: {len(seen_ids)} reviews loaded...")

    print(f"‚úÖ Loaded {len(seen_ids)} reviews for extraction.\n")
    
    # === EXPAND "SEE MORE" BUTTONS ===
    more_buttons = driver.find_elements(By.XPATH, '//button[@aria-label="See more"]')
    for btn in more_buttons:
        try:
            driver.execute_script("arguments[0].click();", btn)
        except:
            continue

    # === EXTRACT REVIEWS DATA ===
    print("üìù Extracting reviews...")
    reviews_data = []
    elements = driver.find_elements(By.XPATH, '//div[@data-review-id]')

    for idx, r in enumerate(elements):
        if idx >= 100:  # Limit for testing
            break
        try:
            review_id = r.get_attribute('data-review-id')
            if not review_id:
                continue

            # Rating
            rating = "No rating"
            try:
                rating_el = r.find_element(By.XPATH, './/*[contains(@aria-label,"star")]')
                rating = rating_el.get_attribute("aria-label")
            except:
                pass

            # Text
            text = ""
            try:
                text_el = r.find_element(By.XPATH, './/span[@class="wiI7pd"]')
                text = text_el.text.strip()
            except:
                pass

            # Reviewer
            reviewer = ""
            try:
                reviewer_el = r.find_element(By.XPATH, './/div[contains(@class,"d4r55")]')
                reviewer = reviewer_el.text.strip()
            except:
                pass

            # Date
            date = ""
            try:
                date_el = r.find_element(By.XPATH, './/span[contains(@class,"rsqaWe")]')
                date = date_el.text.strip()
            except:
                pass

            reviews_data.append({
                "review_id": review_id,
                "reviewer": reviewer,
                "rating": rating,
                "review_text": text,
                "date": date,
                "company_name": company_name,
                "phone_number": phone_number
            })
        except StaleElementReferenceException:
            continue
        except Exception:
            continue

    driver.quit()
    
    # === SAVE TO JSON ===
    with open(output_file, "w", encoding="utf-8") as f:
        json.dump(reviews_data, f, indent=4, ensure_ascii=False)

    print(f"\n‚úÖ DONE! Extracted {len(reviews_data)} reviews saved to {output_file}")

except Exception as e:
    print(f"\n‚ùå Error: {e}")
    import traceback
    traceback.print_exc()
    try:
        driver.quit()
    except:
        pass


üåê Opening Google Maps...
üè¢ Extracting company details...
‚ö†Ô∏è Could not extract company name
‚ö†Ô∏è Could not extract phone number
‚úÖ Company: 
üìû Phone: Not found
üîç Opening reviews tab...
üîÅ Scrolling to load reviews...
   Scroll 1: 20 reviews loaded...
   Scroll 2: 20 reviews loaded...
   Scroll 3: 30 reviews loaded...
   Scroll 4: 40 reviews loaded...
   Scroll 5: 50 reviews loaded...
   Scroll 6: 60 reviews loaded...
   Scroll 7: 70 reviews loaded...
   Scroll 8: 80 reviews loaded...
   Scroll 9: 90 reviews loaded...
   Scroll 10: 100 reviews loaded...
‚úÖ Loaded 100 reviews for extraction.

üìù Extracting reviews...

‚úÖ DONE! Extracted 100 reviews saved to cheesecake_factory_reviews.json


In [1]:
import time
import json
import re
import os
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC


# ========== CONFIG ==========

maps_url = "https://www.google.com/maps/place/Sintra+Hotel+Islamabad/@33.7155856,73.0848467,17z/data=!3m1!4b1!4m9!3m8!1s0x38dfbf4eb4ff030b:0xc101bff92da0ee59!5m2!4m1!1i2!8m2!3d33.7155856!4d73.0848467!16s%2Fg%2F11t2lwwsdv?authuser=0&hl=en&entry=ttu&g_ep=EgoyMDI1MTEyMy4xIKXMDSoASAFQAw%3D%3D"
chromedriver_path = r"E:\Downloads\chromedriver-win64\chromedriver-win64\chromedriver.exe"

REVIEWS_TO_SCRAPE = 20


# ========== SELENIUM SETUP ==========

chrome_options = Options()
chrome_options.add_argument("--disable-blink-features=AutomationControlled")
chrome_options.add_experimental_option("excludeSwitches", ["enable-automation"])
chrome_options.add_argument("--disable-gpu")
chrome_options.add_argument("--window-size=1920,1080")
chrome_options.add_argument("--user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36")

service = Service(chromedriver_path)
driver = webdriver.Chrome(service=service, options=chrome_options)

wait = WebDriverWait(driver, 20)



# ========== RATING EXTRACTOR (NEW GOOGLE MAPS 2025) ==========

def extract_rating_from_wrapper(wrapper):
    """
    Rating is no longer inside the review block.
    It is located inside the wrapper <div class='jftiEf'>.
    This function handles all Google Maps DOM variations.
    """

    # 1) aria-label rating: <span aria-label="5.0">
    try:
        rating_el = wrapper.find_element(By.XPATH, './/span[@aria-label]')
        aria = rating_el.get_attribute("aria-label")
        m = re.search(r"([0-9]+(?:[.,][0-9])?)", aria)
        if m:
            return m.group(1).replace(",", ".")
    except:
        pass

    # 2) alt="5 stars"
    try:
        img = wrapper.find_element(By.XPATH, './/img[contains(@alt,"star")]')
        alt = img.get_attribute("alt")
        m = re.search(r"([0-9]+)", alt)
        if m:
            return m.group(1)
    except:
        pass

    # 3) FINAL FALLBACK ‚Äì count star icons if present
    try:
        stars = wrapper.find_elements(By.XPATH, './/*[contains(@class,"kvMYJc")]')
        if stars:
            return str(len(stars))
    except:
        pass

    return "No rating"



# ========== CLEAN PHONE FUNCTION ==========

def clean_phone(phone):
    if not phone:
        return ""
    phone = phone.replace("ÓÇ∞", "").strip()
    return re.sub(r"[^0-9+]", "", phone)



# ========== MAIN EXECUTION ==========

try:
    print("üåç Opening Google Maps...")
    driver.get(maps_url)
    time.sleep(5)

    # Close Cookies Popup
    try:
        reject_btn = driver.find_element(By.XPATH, "//button[contains(., 'Reject all')]")
        reject_btn.click()
        time.sleep(1)
    except:
        pass

    # ==== Extract Business Name ====
    print("üè¢ Extracting business details...")
    try:
        name_el = wait.until(
            EC.visibility_of_element_located((By.XPATH, '//h1[contains(@class,"DUwDvf")]'))
        )
        company_name = name_el.text.strip()
    except:
        company_name = "Unknown"

    # ==== Extract Phone Number ====
    phone_number = ""
    phone_selectors = [
        '//button[contains(@aria-label,"Phone")]',
        '//button[contains(@data-item-id,"phone:tel")]',
        '//a[contains(@href,"tel:")]'
    ]

    for sel in phone_selectors:
        try:
            el = driver.find_element(By.XPATH, sel)
            phone_number = el.text or el.get_attribute("href")
            break
        except:
            continue

    phone_number = clean_phone(phone_number)

    print(f"‚úî Company: {company_name}")
    print(f"‚úî Phone: {phone_number}")

    # ==== Open Reviews Tab ====
    print("üü¶ Opening reviews section...")
    try:
        review_tab = wait.until(
            EC.element_to_be_clickable((By.XPATH, '//button[contains(@aria-label,"reviews")]'))
        )
        driver.execute_script("arguments[0].click();", review_tab)
        time.sleep(4)
    except Exception as e:
        print("‚ö† Could not open reviews section:", e)

    # ==== Scroll to load reviews ====
    print("üîÅ Scrolling to load reviews...")

    scroll_box = wait.until(
        EC.presence_of_element_located(
            (By.XPATH, '//div[contains(@class,"m6QErb") and contains(@class,"DxyBCb")]')
        )
    )

    wrappers = {}  # store wrapper divs uniquely
    scroll_attempts = 0

    while len(wrappers) < REVIEWS_TO_SCRAPE and scroll_attempts < 80:
        driver.execute_script("arguments[0].scrollTop = arguments[0].scrollHeight", scroll_box)
        time.sleep(1.2)

        # Google Maps review wrapper is "jftiEf"
        found = driver.find_elements(By.XPATH, '//div[contains(@class,"jftiEf")]')

        for w in found:
            try:
                rid = w.find_element(By.XPATH, './/div[@data-review-id]').get_attribute("data-review-id")
                wrappers[rid] = w
            except:
                pass

        scroll_attempts += 1
        print(f"Scroll {scroll_attempts}: {len(wrappers)} unique reviews loaded...")

    print(f"‚úî Loaded {len(wrappers)} wrapper reviews.")

    # ==== Extract Reviews ====
    print("üìù Extracting review data...")
    reviews_data = []

    for rid, wrapper in list(wrappers.items())[:REVIEWS_TO_SCRAPE]:

        # Rating from wrapper
        rating = extract_rating_from_wrapper(wrapper)

        # Reviewer name
        try:
            reviewer = wrapper.find_element(By.XPATH, './/div[contains(@class,"d4r55")]').text
        except:
            reviewer = ""

        # Review text
        try:
            text = wrapper.find_element(By.XPATH, './/span[@class="wiI7pd"]').text
        except:
            text = ""

        # Date
        try:
            date = wrapper.find_element(By.XPATH, './/span[contains(@class,"rsqaWe")]').text
        except:
            date = ""

        reviews_data.append({
            "review_id": rid,
            "reviewer": reviewer,
            "rating": rating,
            "review_text": text,
            "date": date
        })

    # ==== Save Output ====
    os.makedirs("output", exist_ok=True)
    file_name = re.sub(r'[^A-Za-z0-9 ]+', '', company_name).replace(" ", "_") + "_reviews.json"

    path = os.path.join("output", file_name)

    with open(path, "w", encoding="utf-8") as f:
        json.dump({
            "company_name": company_name,
            "phone_number": phone_number,
            "reviews": reviews_data
        }, f, indent=4, ensure_ascii=False)

    print(f"\nüéâ DONE! Extracted {len(reviews_data)} TRUE reviews ‚Üí {path}")

except Exception as e:
    print("‚ùå ERROR:", e)

finally:
    try:
        driver.quit()
    except:
        pass


üåç Opening Google Maps...
üè¢ Extracting business details...
‚úî Company: Unknown
‚úî Phone: 
üü¶ Opening reviews section...
üîÅ Scrolling to load reviews...
Scroll 1: 20 unique reviews loaded...
‚úî Loaded 20 wrapper reviews.
üìù Extracting review data...

üéâ DONE! Extracted 20 TRUE reviews ‚Üí output\Unknown_reviews.json
