Getting insights about businesses and locations from Google Maps

#https://towardsdatascience.com/getting-insights-about-businesses-and-locations-from-google-maps-3f8a5739059a

Install >> conda install selenium
           conda install -c conda-forge folium
           

In [1]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.keys import Keys
from selenium.common.exceptions import NoSuchElementException, TimeoutException, ElementNotInteractableException, ElementClickInterceptedException
from webdriver_manager.chrome import ChromeDriverManager

from tqdm import tqdm.notebook.tqdm as tqdmn

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

import folium

import time, re

In [2]:
%%HTML
<style>.dataframe th,td:first-child{background: rgb(63,87,124);background: linear-gradient(180deg, rgba(63,87,124,1) 0%, rgba(101,124,161,1) 100%, rgba(0,212,255,1) 100%);;
padding: 10px;font-family: monospace;font-size: 110%;color: white;border:1px dashed white;text-align:left !important;
-moz-border-radius: 3x;-webkit-border-radius: 3px;}.dataframe thead{border:none; !important;}</style>

In [3]:

# Make sure to supply the path to where you put the chromedriver.exe file. Use \\ instead of \ :
#driver = webdriver.Chrome("/Users/karinaalem/Downloads/chromedriver")
custom_path=r'/Users/karinaalem/Downloads/'

driver = webdriver.Chrome(ChromeDriverManager(path=custom_path).install())
# This is the list where we'll capture the historical landmarks (names and addresses)
landmarks = []

# The first search we'll do for 'rome touriste attraction'
url = 'https://www.google.com/maps/search/shopping en cordoba'

# Opening the search URL. You'll notice a chrome window opening :
driver.get(url)

[WDM] - Downloading: 100%|██████████| 8.08M/8.08M [00:00<00:00, 19.8MB/s]


In [7]:
# We repeat the code below 3 times if we arrive at the end of the navgation list the loop breaks on its own) :
for i in tqdmn(range(2), leave=False, desc='1. Rounding the historical landmarks' ) :
    
    # Waiting for the results to appear :
    WebDriverWait(driver,20).until(EC.visibility_of_element_located((By.XPATH, "section-result-title")))

    # Capturing the names of the competitors and adding it to our historical landmarks' :
    result_names = [i.text for i in driver.find_elements_by_css_selector('h3[class=section-result-title]')]
    result_addresses = [i.text for i in driver.find_elements_by_css_selector('span[class=section-result-location]')] 
    
    for name, address in zip(result_names, result_addresses) :
        landmarks.append(name + ' ' + address)
        
    # Waiting for the 'Next' button to be visible and then click it. If it's not there anymore, break the loop :
    try :
        WebDriverWait(driver,20).until(EC.visibility_of_element_located((By.CSS_SELECTOR, "//*[@id='n7lv7yjyC35__section-pagination-button-next']"))).click()
    except ElementClickInterceptedException :
        break
        
    # Waiting 5 seconds before looping (otherwise we get the error ElementClickInterceptedException). If you get the
    #exception, make it wait for a little longer than 5 seconds :
    time.sleep(45)


# We want to remove any '/' character in the names and addresses in the landmarks list (because they'll break URLs) :
landmarks = [i.replace('/', ' ') for i in landmarks]

# These are the empty lists we will populate with the extracted data :
full_name = []
rating = []
total_ratings = []
landmark_cat = []
description = []
address = []
hours = []
lat = []
long = []
    
# Here's the big loop iterating over the landmarks list :
for landmark in tqdmn(landmarks, leave=False, desc='2. Extracting the data') :
    
    # URL making :
    url = 'https://www.google.com/maps/search/' + landmark
    driver.get(url)

    # Waiting for the name of the landmark to load and be visible. If it fails, skip to next one :
    try :
        WebDriverWait(driver,15).until(EC.visibility_of_element_located((By.CSS_SELECTOR, "section-hero-header-title-title")))
    except (NoSuchElementException, TimeoutException) as e :
        continue
        
    # Extracting the data and putting it into the empty lists we defined earlier :
    try:
        full_name.append(driver.find_element_by_xpath('//*[@id="pane"]/div/div[1]/div/div/div[2]/div[1]/div[1]').text)
    except NoSuchElementException :
        full_name.append(np.nan)

    try:
        rating.append(driver.find_element_by_xpath('//*[@id="pane"]/div/div[1]/div/div/div[2]/div[1]/div[2]/div/div[1]/span[1]/span/span').text)
    except NoSuchElementException :
        rating.append(np.nan)

    try:
        total_ratings.append(driver.find_element_by_xpath('//*[@id="pane"]/div/div[1]/div/div/div[2]/div[1]/div[2]/div/div[1]/span[2]/span/span[1]/span[2]/span[1]/button').text)
    except NoSuchElementException:
        total_ratings.append(np.nan)

    try:
        landmark_cat.append(driver.find_element_by_xpath('//*[@id="pane"]/div/div[1]/div/div/div[2]/div[1]/div[2]/div/div[2]/span[1]/span[1]/button').text)
    except NoSuchElementException:
        landmark_cat.append(np.nan)
    
    try:
        description.append(driver.find_element_by_css_selector('div[class=section-editorial-quote]').text)
    except NoSuchElementException:
        description.append(np.nan
                          )
    try:
        address.append(driver.find_element_by_css_selector('div[data-tooltip="Copy address"]').text)
    except NoSuchElementException :
        address.append(np.nan)

    # Here we capture the popular hours for all 7 days starting with Sunday :
    try:
        hours.append([i.get_attribute('aria-label') for i in driver.find_elements_by_xpath("//*[contains(@aria-label, 'busy at')]")])
    except NoSuchElementException:
        hours.append(np.nan)
        
    try:
        coordinates = driver.find_element_by_css_selector('meta[itemprop=image]').get_attribute('content')
        coordinates = coordinates.split('?center=')[1].split('&zoom=')[0].split('%2C')
        lat.append(coordinates[0])
        long.append(coordinates[1])
    except NoSuchElementException:
        lat.append(np.nan)
        long.append(np.nan)
print(hours)
# Closing the Chrome window
driver.close()


Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for i in tqdmn(range(2), leave=False, desc='1. Rounding the historical landmarks' ) :


1. Rounding the historical landmarks:   0%|          | 0/2 [00:00<?, ?it/s]

TimeoutException: Message: timeout: Timed out receiving message from renderer: 600.000
  (Session info: chrome=106.0.5249.61)


In [None]:
HL = pd.DataFrame(data={'full_name':full_name, 'rating':rating, 'total_ratings':total_ratings, 'landmark_category':landmark_cat, 'description':description, 'address':address, 'hours':hours})

In [None]:
HL.head()