In [1]:
from IPython.display import display, HTML

display(HTML(data="""
<style>
    div#notebook-container    { width: 95%; }
    div#menubar-container     { width: 65%; }
    div#maintoolbar-container { width: 99%; }a
</style>
"""))

# 7. Web Scraping

Web  scraping  is  the  practice  of  gathering  data  through  any  means  otherthan a program interacting with an API (or, obviously, through a human using a webbrowser).  This  is  most  commonly  accomplished  by  writing  an  automated  programthat queries a web server, requests data (usually in the form of the HTML and otherfiles  that  comprise  web  pages),  and  then  parses  that  data  to  extract  needed  information.

# 7.1 Selenium
Selenium automates browsers. That's it! <br>
Selenium is a Python library and tool used for automating web browsers to do a number of tasks. One of such is web-scraping to extract useful data and information that may be otherwise unavailable. <br>
**For this course, we use Chrome.**

## 7.1 Installing Libraries
We need to install these two libraries

In [2]:
!pip install selenium
!pip install webdriver-manager



## 7.2 Calling Libraries

In [3]:
# this library is to manipulate browser
from selenium import webdriver

# it allows you to work with differen versions of drivers
# We call ChromeDriver
from webdriver_manager.chrome import ChromeDriverManager
import re # regular expressions
import time # para calcular tiempo de corrida

## 7.3 Launch/Set the Driver
This code opens a Chrome Driver. We are going to use it to go navigate on the web.

In [4]:
pwd # sirve para ubicar dónde nos encontramos

'C:\\Users\\jesus\\Documents\\GitHub\\Diplomado_PUCP\\Lecture_7'

Checar este chat: https://chatgpt.com/share/56ebeb2f-3dbd-4b6f-b984-13f3d35f864c

In [5]:
from selenium import webdriver
from webdriver_manager.chrome import ChromeDriverManager

In [6]:
# ChromeDriverManager permite manejar Chrome con código
# descargar según versión de SO y de Google Chrome: https://googlechromelabs.github.io/chrome-for-testing/
# para ver la versión: chrome://settings/help
# Una vez descargado, pegar la carpeta de Descargas en la carpeta del GitHub donde se correrá el ipynb

# Case 1 - Download the driver using ChromeDriverManager
driver = webdriver.Chrome(executable_path = r'chromedriver-win64/chromedriver.exe') # busca el ejecutable

# Este bloque de código da este error: TypeError: WebDriver.__init__() got an unexpected keyword argument 'executable_path'

TypeError: WebDriver.__init__() got an unexpected keyword argument 'executable_path'

In [7]:
# Inicializa el controlador de Chrome con `ChromeDriverManager`
driver = webdriver.Chrome(service = webdriver.chrome.service.Service(ChromeDriverManager().install()))

In [8]:
driver # es una clase

<selenium.webdriver.chrome.webdriver.WebDriver (session="fc1a3325a6f7e2707c9c79d4868e41cc")>

In [11]:
from selenium.webdriver.chrome.service import Service
#from selenium.webdriver.chrome.options import Options
driver  = webdriver.Chrome(service = Service(executable_path="chromedriver-win64/chromedriver.exe"), options=webdriver.ChromeOptions())

In [12]:
url = 'https://resultadoshistorico.onpe.gob.pe/EG2021/'
driver.get(url)

In [13]:
url = 'https://larepublica.pe/politica/2024/08/14/el-vaticano-expulsa-al-fundador-del-sodalicio-luis-fernando-figari-833406'
driver.get(url) 

## Chrome is being controlled by automated test software

![Chrome is controlled by automated software](Images/chrome_automated.png)

In [14]:
# Access to the title
print('Title: ', driver.title)

Title:  El Vaticano expulsa al fundador del Sodalicio, Luis Fernando Figari | Política | La República


In [15]:
# Access to the curent url 
print('Current Page URL: ', driver.current_url)

Current Page URL:  https://larepublica.pe/politica/2024/08/14/el-vaticano-expulsa-al-fundador-del-sodalicio-luis-fernando-figari-833406


In [16]:
# Make screenshot of the webpage
driver.save_screenshot('Images/resultados_presidenciales_2024.png')

True

In [17]:
if re.search(r'resultadoshistorico', driver.current_url):
    driver.save_screenshot('Images/resultados_presidenciales_ok.png') #save screenshot with provided name
    print('Resultados Presidenciales saved!')
else:
    print("Page not found")

Page not found


In [18]:
# get cookie information
cookies = driver.get_cookies() 
print('Cookies obtained from resultados_presidenciales')
print(cookies)

Cookies obtained from resultados_presidenciales
[{'domain': '.larepublica.pe', 'expiry': 1758457181, 'httpOnly': False, 'name': '_ga_K5929ZXSSV', 'path': '/', 'sameSite': 'Lax', 'secure': False, 'value': 'GS1.1.1723897166.1.0.1723897181.45.0.0'}, {'domain': '.larepublica.pe', 'expiry': 1739449179, 'httpOnly': False, 'name': '__eoi', 'path': '/', 'sameSite': 'None', 'secure': True, 'value': 'ID=c7fdb00a9313a4a9:T=1723897179:RT=1723897179:S=AA-AfjatFaEIAyTrvgdflwzfUCyg'}, {'domain': 'larepublica.pe', 'httpOnly': False, 'name': 'MgidStorage', 'path': '/', 'sameSite': 'Lax', 'secure': False, 'value': '%7B%220%22%3A%7B%22svspr%22%3A%22%22%2C%22svsds%22%3A1%7D%2C%22C1446817%22%3A%7B%22page%22%3A1%2C%22time%22%3A%221723897167395%22%7D%7D'}, {'domain': '.larepublica.pe', 'expiry': 1739794767, 'httpOnly': False, 'name': 'compass_uid', 'path': '/', 'sameSite': 'Lax', 'secure': True, 'value': '2f7b63a3-a49e-481d-b9eb-c17b718a3acd'}, {'domain': '.larepublica.pe', 'expiry': 1757593166, 'httpOnly': 

In [19]:
# Get page source
type(driver.page_source)
driver.page_source # para ver el código fuente

'<html lang="es"><head><meta charset="utf-8"><meta name="viewport" content="width=device-width"><title>El Vaticano expulsa al fundador del Sodalicio, Luis Fernando Figari | Política | La República</title><link rel="preconnect" href="https://fonts.gstatic.com"><link rel="preconnect" href="https://imgmedia.larepublica.pe"><link rel="preconnect" href="https://www.googletagmanager.com"><link rel="preload" as="font" type="font/woff2" crossorigin="crossorigin" href="https://fonts.gstatic.com/s/ptserif/v17/EJRSQgYoZZY2vCFuvAnt66qSVys.woff2"><link rel="preload" as="font" type="font/woff2" crossorigin="crossorigin" href="https://fonts.gstatic.com/s/roboto/v27/KFOmCnqEu92Fr1Mu4mxK.woff2"><meta name="viewport" content="width=device-width, initial-scale=1"><link rel="alternate" type="application/rss+xml" href="https://larepublica.pe/rss/politica.xml"><meta name="robots" content="max-image-preview:large"><meta name="robots" content="index,follow"><meta name="googlebot" content="index,follow"><meta 

In [20]:
# Refresh the page - 
driver.refresh() #reload or refresh the browser

In [21]:
# Manera 1 de abrir driver
driver = webdriver.Chrome(service = webdriver.chrome.service.Service(ChromeDriverManager().install()))
driver.maximize_window()

In [22]:
# Manera 2 de abrir driver
driver  = webdriver.Chrome(service = Service(executable_path="chromedriver-win64/chromedriver.exe"), options=webdriver.ChromeOptions())
driver.maximize_window()

In [23]:
service = Service(executable_path="chromedriver-win64/chromedriver.exe")
options = webdriver.ChromeOptions()
driver  = webdriver.Chrome(service = service, options = options)
driver.maximize_window()

In [52]:
driver  = webdriver.Chrome(service = Service(executable_path="chromedriver-win64/chromedriver.exe"), options=webdriver.ChromeOptions())
driver.maximize_window() # para maximizar ventana

url_1 = "https://resultadoshistorico.onpe.gob.pe/EG2021/"
driver.get( url_1 )
time.sleep(2) # cuando abras la página, tómate 2 segundos congelado en la carga, aún no la siguiente

url_2 = "https://www.google.com/"
driver.get( url_2 )
time.sleep(1)

driver.back() # regresa a la página anterior

In [53]:
driver2  = webdriver.Chrome(service = Service(executable_path="chromedriver-win64/chromedriver.exe"), options=webdriver.ChromeOptions())
driver2.maximize_window()

url_1 = "https://resultadoshistorico.onpe.gob.pe/EG2021/"
driver2.get(url_1)
time.sleep(3) # cuando abras la página, tómate 3 segundos congelado en la carga, aún no la siguiente

url_2 = "https://www.elcomercio.pe/"
driver2.get(url_2)
time.sleep(1)

In [54]:
driver.close() # cierra el navegador (si ya fue cerrado manualmente, dará MaxRetryError)

In [55]:
driver2.close() # cierra el navegador (si ya fue cerrado manualmente, dará MaxRetryError)

In [50]:
driver.quit()

In [31]:
driver2.quit() # cierra TODOS los browsers abiertos con webdriver. También puede ser driver2.quit--.

![Quite and Close](Images/quite_close.png)

In [40]:
type(driver)

selenium.webdriver.chrome.webdriver.WebDriver

`driver` is an `selenium.webdriver.chrome.webdriver.WebDriver` object. This object has some attributes that will help us to navigate on the web.

Now, you can see in the driver that we are in [this link](https://www.convocatoriascas.com/).

# Extra - Best Practices before working

1. Maximize the browser

In [56]:
service = Service(executable_path="chromedriver-win64/chromedriver.exe")
driver = webdriver.Chrome(service = service)
url = 'https://www.kaspersky.com/resource-center/definitions/cookies'
driver.get( url )
driver.maximize_window()

2. Set the Browser Zoom Level to 100 percent. Es importante considerar que el código puede funcionar distinto según tamaño de pantalla

In [57]:
driver.execute_script("document.body.style.zoom='100%'")

### 7.4.1. HTML
HTML stands for HyperText Markup Language. You can deduce that it’s a language for creating web pages. It’s not a programming language like Python or Java, but it’s a markup language. It describes the elements of a page through tags characterized by angle brackets.

1. The document always begins and ends using `<html>` and `</html>`.
2. `<body></body>` constitutes the visible part of HTML document.
3. `<h1>` to `<h3>` tags are defined for the headings.

#### 7.4.1.1. HTML Headings
HTML headings are defined with the `<h1>` to `<h6>` tags.
`<h1>` defines the most important heading. `<h6>` defines the least important heading.

We can use text cells since markdown reads html tags.

<h1>This is heading 1</h1>
<h2>This is heading 2</h2>
<h3>This is heading 3</h3>

In [None]:
# No correr
<h1>This is heading 1</h1>
<h2>This is heading 2</h2>
<h3>This is heading 3</h3>

#### 7.4.1.2. HTML Paragraphs
HTML paragraphs are defined with the `<p>` tag.
`<br>` tag is similar to `"\n"`.

<html>
<br>
<p>My first paragraph.</p> <br>
<p>This is another paragraph for this text cell.</p>
<html>

#### 7.4.1.3. HTML Links
HTML links are defined with the <a> tag:

<a href="http://bayes.cs.ucla.edu/jp_home.html">This is a link for Judea Pearl Website</a>

#### 7.4.1.3. Unordered HTML List
An unordered list starts with the `<ul>` tag. Each list item starts with the `<li>` tag.

<ul>
  <li>Coffee</li>
  <li>Tea</li>
  <li>Milk</li>
</ul>

#### 7.4.1.4. Ordered HTML List
An ordered list starts with the `<ol>` tag. Each list item starts with the `<li>` tag.

<ol>
  <li>Coffee</li>
  <li>Tea</li>
  <li>Milk</li>
</ol>

#### 7.4.1.4. HTML Tables

A table in HTML consists of table cells inside rows and columns. Each table cell is defined by a `<td>` and a `</td>` tag. Each table row starts with a `<tr>` and end with a `</tr>` tag.

<table>
  <tr>
    <th>Manager</th>
    <th>Club</th>
    <th>Nationality</th>
  </tr>
  <tr>
    <td>Mikel Arteta</td>
    <td>Arsenal</td>
    <td>Spain</td>
  </tr>
  <tr>
    <td>Thomas Tuchel</td>
    <td>Chelsea</td>
    <td>Germany</td>
  </tr>
</table>

#### 7.4.1.5. HTML Iframes

An HTML iframe is used to display a web page within a web page.


<!DOCTYPE html>
<html>
  
<head>
    <title>HTML iframe src Attribute</title>
</head>
  
<body style="text-align: center">
    <h1>Diploma</h1>
    <h2>HTML iframe</h2>
    <iframe>
          
        <!DOCTYPE html>
        <html>

        <head>
            <title>New html</title>
        </head>

        <body style="text-align: center">
            <h1>Diploma2</h1>
            <h2>HTML iframe</h2>
            <iframe>

            </iframe>
        </body>

        </html>
    </iframe>
</body>
  
</html>

#### 7.4.1.6. HTML Tags - Key

|Tag|Description|
|---|---|
|`<h1>` to `<h6>`|	Defines HTML headings|
|`<ul>`|	Defines an unordered list|
|`<ol>`|	Defines an ordered list|
|`<p>`|	Defines a paragraph|
|`<a>`|	It is termed as anchor tag and it creates a hyperlink or link.|
|`<div>`|	It defines a division or section within HTML document.|
|`<strong>`|	It is used to define important text.|
|`<table>`|	It is used to present data in tabular form or to create a table within HTML document.|
|`<td>`|	It is used to define cells of an HTML table which contains table data|
|`<iframe>`|	Defines an inline frame|

### 7.4. Identifying elements in a web page

To identify elements of a webpage, we need to inspect the webpage. Open the driver and press `Ctrl`+ `Shift` + `I`.

#### One Element
|Method|Description|
|---|---|
|find_element_by_id| Use id.|
|find_element_by_name| Use name.|
|find_element_by_xpath| Use Xpath.|
|find_element_by_tag_name| Use HTML tag.|
|find_element_by_class_name| Use class name.|
|find_element_by_css_selector| Use css selector.|

#### Multiple  elements
|Method|Description|
|---|---|
|find_elements_by_id| Use id.|
|find_elements_by_name| Use name.|
|find_elements_by_xpath| Use Xpath.|
|find_elements_by_tag_name| Use HTML tag.|
|find_elements_by_class_name| Use class name.|
|find_elements_by_css_selector| Use css selector.|

### 7.4.1. Xpath
XPath in Selenium is an XML path used for navigation through the HTML structure of the page. It is a syntax or language for finding any element on a web page using XML path expression.

The basic format of XPath in selenium is explained below with screen shot.
<img src="../_images/x_path.png">

**DO NOT COMPLICATE!**
Finding the XPath of a element:
1. Go to the element
2. Right click
3. Inspect - You may have to do it twice.
4. Go to the selected line
5. Right click
7. Copy 
8. Copy Full Xpath

**Example**

We are going to select `Economistas` option and make a click. Use `find_element_by_xpath` and click.

In [61]:
from selenium.webdriver.common.by import By

In [59]:
service = Service(executable_path="chromedriver-win64/chromedriver.exe")
driver  = webdriver.Chrome(service=service, options=options)
driver.maximize_window()
url_1 = "https://resultadoshistorico.onpe.gob.pe/EG2021/"
driver.get( url_1 )

Busco lo que quiero extraer en la página. Click derecho, Inspect (dos veces).
En el código HTML: Copy, Copy Full XPath.

In [63]:
# Resumen General: /html/body/onpe-root/onpe-home-onpe/div[1]/div/div/div/div[2]/div[1]/div/div/a/div[2]/div
# Elecciones presidenciales: /html/body/onpe-root/onpe-home-onpe/div[1]/div/div/div/div[2]/div[2]/div/div/a/div[2]
# Elecciones congresales:    /html/body/onpe-root/onpe-home-onpe/div[1]/div/div/div/div[2]/div[3]/div/div/a/div[2]

In [64]:
resumen_general = driver.find_element( By.XPATH  , '/html/body/onpe-root/onpe-home-onpe/div[1]/div/div/div/div[2]/div[1]/div/div/a/div[2]/div')
resumen_general.click()

In [65]:
service = Service(executable_path="chromedriver-win64/chromedriver.exe")
driver = webdriver.Chrome(service=service, options=options)
driver.maximize_window()
url_1 = "https://resultadoshistorico.onpe.gob.pe/EG2021/"
driver.get( url_1 )
time.sleep(3) # sin time.sleep da error porque la página no carga lo suficientemente rápido
resumen_general = driver.find_element( By.XPATH  , '/html/body/onpe-root/onpe-home-onpe/div[1]/div/div/div/div[2]/div[1]/div/div/a/div[2]/div')
resumen_general.click()

In [66]:
service = Service(executable_path="chromedriver-win64/chromedriver.exe")
driver = webdriver.Chrome(service=service, options=options)
driver.maximize_window()
url_1 = "https://resultadoshistorico.onpe.gob.pe/EG2021/"
driver.get( url_1 )
time.sleep(3) # sin time.sleep da error porque la página no carga lo suficientemente rápido
elec_pres = driver.find_element( By.XPATH  , '/html/body/onpe-root/onpe-home-onpe/div[1]/div/div/div/div[2]/div[2]/div/div/a/div[2]')
elec_pres.click()

In [70]:
service = Service(executable_path="chromedriver-win64/chromedriver.exe")
driver = webdriver.Chrome(service=service, options=options)
driver.maximize_window()
url_1 = "https://resultadoshistorico.onpe.gob.pe/EG2021/"
driver.get( url_1 )
resumen_general = driver.find_element(By.XPATH,'/html/body/onpe-root/onpe-home-onpe/div[1]/div/div/div/div[2]/div[1]/div/div/a/div[1]/img')
resumen_general.click()

NoSuchElementException: Message: no such element: Unable to locate element: {"method":"xpath","selector":"/html/body/onpe-root/onpe-home-onpe/div[1]/div/div/div/div[2]/div[1]/div/div/a/div[1]/img"}
  (Session info: chrome=127.0.6533.120); For documentation on this error, please visit: https://www.selenium.dev/documentation/webdriver/troubleshooting/errors#no-such-element-exception
Stacktrace:
	GetHandleVerifier [0x00007FF6D6C49642+30946]
	(No symbol) [0x00007FF6D6BFE3D9]
	(No symbol) [0x00007FF6D6AF6FDA]
	(No symbol) [0x00007FF6D6B4822C]
	(No symbol) [0x00007FF6D6B4850C]
	(No symbol) [0x00007FF6D6B8DCB7]
	(No symbol) [0x00007FF6D6B6CAAF]
	(No symbol) [0x00007FF6D6B8B041]
	(No symbol) [0x00007FF6D6B6C813]
	(No symbol) [0x00007FF6D6B3A6E5]
	(No symbol) [0x00007FF6D6B3B021]
	GetHandleVerifier [0x00007FF6D6D7F84D+1301229]
	GetHandleVerifier [0x00007FF6D6D8BDC7+1351783]
	GetHandleVerifier [0x00007FF6D6D82A13+1313971]
	GetHandleVerifier [0x00007FF6D6C7DD16+245686]
	(No symbol) [0x00007FF6D6C0759F]
	(No symbol) [0x00007FF6D6C03814]
	(No symbol) [0x00007FF6D6C039A2]
	(No symbol) [0x00007FF6D6BFA3FF]
	BaseThreadInitThunk [0x00007FFACE04257D+29]
	RtlUserThreadStart [0x00007FFACF14AF28+40]


* ID = "id"
* NAME = "name"
* XPATH = "xpath"
* LINK_TEXT = "link text"
* PARTIAL_LINK_TEXT = "partial link text"
* TAG_NAME = "tag name"
* CLASS_NAME = "class name"
* CSS_SELECTOR = "css selector" 

In [63]:
driver.find_element(By.ID, 'select_ambito').click()
driver.find_element(By.NAME, 'cod_ambito').click()

<bound method WebElement.click of <selenium.webdriver.remote.webelement.WebElement (session="8a4e0f1dd2d11a6eb4a6bad617a19a21", element="2EFDE05B32D6EA4F017A654BAF16E7D7_element_2579")>>

In [58]:
# Best practices
driver.find_element_by_id('select_ambito').click

  driver.find_element_by_id('select_ambito').click


<bound method WebElement.click of <selenium.webdriver.remote.webelement.WebElement (session="8a4e0f1dd2d11a6eb4a6bad617a19a21", element="2EFDE05B32D6EA4F017A654BAF16E7D7_element_2579")>>

In [60]:
driver.find_element_by_name('cod_ambito').click()
#driver.find_element_by_xpath('/html/body/onpe-root/onpe-layout-container/onpe-onpe-epres-re/div[1]/div[3]/div[1]/div[1]/div/div/div/select')

  driver.find_element_by_name('cod_ambito').click()


In [44]:
driver.find_element_by_class_name('select_ubigeo')

  driver.find_element_by_class_name('select_ubigeo')


<selenium.webdriver.remote.webelement.WebElement (session="fcb117569a03c401035e92fbc09659db", element="FA2D5037E60912FE13DFE1E09C17F29E_element_11974")>

In [46]:
searchBox = driver.find_element_by_id('select_ambito')
# searchBox = driver.find_element_by_xpath('//*[@id="select_ambito"]')
# searchBox = driver.find_element_by_css_selector('#select_ambito')

  searchBox = driver.find_element_by_id('select_ambito')


![Web Element](Images/Web_Elementpng.png)

In [49]:
searchBox.get_attribute('value')

'T'

In [48]:
searchBox.get_attribute('value')

'T'

In [50]:
driver = webdriver.Chrome(executable_path=r'chromedriver-win64/chromedriver.exe')

url = 'https://resultadoshistorico.onpe.gob.pe/EG2021/'
driver.get( url )

driver.maximize_window()

  driver = webdriver.Chrome(executable_path=r'chromedriver-win64/chromedriver.exe')


In [51]:
driver.find_element_by_xpath("/html/body/onpe-root/onpe-home-onpe/div[1]/div/div/div/div[2]/div[1]/div/div/a/div[1]/img").click()

  driver.find_element_by_xpath("/html/body/onpe-root/onpe-home-onpe/div[1]/div/div/div/div[2]/div[1]/div/div/a/div[1]/img").click()


In [53]:
searchBox = driver.find_element_by_xpath('/html/body/onpe-root/onpe-layout-container/onpe-onpe-rgen-rsgr/div/div[2]/div[1]/div[1]/div/div/div[1]/select/option[2]')
searchBox.click()

  searchBox = driver.find_element_by_xpath('/html/body/onpe-root/onpe-layout-container/onpe-onpe-rgen-rsgr/div/div[2]/div[1]/div[1]/div/div/div[1]/select/option[2]')


In [56]:
searchBox = driver.find_element_by_xpath('/html/body/onpe-root/onpe-layout-container/onpe-onpe-rgen-rsgr/div/div[2]/div[1]/div[1]/div/div/div[1]/select/option[2]')
searchBox.text

  searchBox = driver.find_element_by_xpath('/html/body/onpe-root/onpe-layout-container/onpe-onpe-rgen-rsgr/div/div[2]/div[1]/div[1]/div/div/div[1]/select/option[2]')


'PERÚ'

In [41]:
searchBox = driver.find_element_by_id('select_ambito')
searchBox

  searchBox = driver.find_element_by_id('select_ambito')


<selenium.webdriver.remote.webelement.WebElement (session="b5ca8d1ebed563e3c66a5bd8dcd59bc6", element="F4712B94759471922218DCBC330F7E1A_element_91")>

**Suggestion** <br>
We do not recomend to use `tag` at first time since most web pages use nested tags and it is difficult to define a element using HTML tag. However, it is great to find elements that is inside another located element in the web. Let's see the example.

# EXAMPLE USING ONPE WEBPAGE

## Example Alex

In [64]:
driver = webdriver.Chrome(executable_path=r'chromedriver-win64/chromedriver.exe')
driver.maximize_window()

url_1 = "https://resultadoshistorico.onpe.gob.pe/EG2021/"
driver.get( url_1 )

  driver = webdriver.Chrome(executable_path=r'chromedriver-win64/chromedriver.exe')


In [65]:
driver.find_element(By.XPATH, '/html/body/onpe-root/onpe-home-onpe/div[1]/div/div/div/div[2]/div[1]/div/div/a/div[2]/div').click()

In [66]:
driver.find_element(By.XPATH, '/html/body/onpe-root/onpe-layout-container/onpe-menu/div/nav/div/div/div[2]/div/div[2]/a').click()


In [67]:
driver.find_element(By.XPATH, '/html/body/onpe-root/onpe-layout-container/onpe-onpe-epres-re/div[1]/div[3]/div[1]/div[1]/div/div/div[1]/select/option[2]').click()

In [70]:
table_path = driver.find_element(By.XPATH, '/html/body/onpe-root/onpe-layout-container/onpe-onpe-epres-re/div[1]/div[4]/div[1]/div[3]')

In [77]:
table_html = table_path.get_attribute( 'innerHTML' )
#table_html

In [74]:
import pandas as pd

In [76]:
table = pd.read_html( table_html )
table[0]

Unnamed: 0,TOTAL DE VOTOS,TOTAL DE VOTOS.1,TOTAL DE VOTOS.2,TOTAL DE VOTOS.3,TOTAL DE VOTOS.4,TOTAL DE VOTOS.5
0,ORGANIZACIONES POLÍTICAS,ORGANIZACIONES POLÍTICAS,ORGANIZACIONES POLÍTICAS,TOTAL,%VÁLIDOS,%EMITIDOS
1,,,PARTIDO NACIONALISTA PERUANO,228955,1.608%,1.309%
2,,,"EL FRENTE AMPLIO POR JUSTICIA, VIDA Y LIBERTAD",64217,0.451%,0.367%
3,,,PARTIDO MORADO,319176,2.242%,1.825%
4,,,PERU PATRIA SEGURA,54341,0.382%,0.311%
5,,,VICTORIA NACIONAL,802957,5.639%,4.592%
6,,,ACCION POPULAR,1294681,9.092%,7.404%
7,,,AVANZA PAIS - PARTIDO DE INTEGRACION SOCIAL,1652682,11.607%,9.452%
8,,,PODEMOS PERU,808559,5.678%,4.624%
9,,,JUNTOS POR EL PERU,1111407,7.805%,6.356%


In [79]:
driver = webdriver.Chrome(executable_path=r'chromedriver-win64/chromedriver.exe')
driver.maximize_window()

url_1 = "https://resultadoshistorico.onpe.gob.pe/EG2021/"
driver.get( url_1 )

driver.find_element(By.XPATH, '/html/body/onpe-root/onpe-home-onpe/div[1]/div/div/div/div[2]/div[1]/div/div/a/div[2]/div').click()
time.sleep(2)
driver.find_element(By.XPATH, '/html/body/onpe-root/onpe-layout-container/onpe-menu/div/nav/div/div/div[2]/div/div[2]/a').click()
time.sleep(2)
driver.find_element(By.XPATH, '/html/body/onpe-root/onpe-layout-container/onpe-onpe-epres-re/div[1]/div[3]/div[1]/div[1]/div/div/div[1]/select/option[2]').click()
time.sleep(2)

table_path = driver.find_element(By.XPATH, '/html/body/onpe-root/onpe-layout-container/onpe-onpe-epres-re/div[1]/div[4]/div[1]/div[3]')

table_html = table_path.get_attribute( 'innerHTML' )

table = pd.read_html( table_html )
table[0]

  driver = webdriver.Chrome(executable_path=r'chromedriver-win64/chromedriver.exe')


Unnamed: 0,TOTAL DE VOTOS,TOTAL DE VOTOS.1,TOTAL DE VOTOS.2,TOTAL DE VOTOS.3,TOTAL DE VOTOS.4,TOTAL DE VOTOS.5
0,ORGANIZACIONES POLÍTICAS,ORGANIZACIONES POLÍTICAS,ORGANIZACIONES POLÍTICAS,TOTAL,%VÁLIDOS,%EMITIDOS
1,,,PARTIDO NACIONALISTA PERUANO,228955,1.608%,1.309%
2,,,"EL FRENTE AMPLIO POR JUSTICIA, VIDA Y LIBERTAD",64217,0.451%,0.367%
3,,,PARTIDO MORADO,319176,2.242%,1.825%
4,,,PERU PATRIA SEGURA,54341,0.382%,0.311%
5,,,VICTORIA NACIONAL,802957,5.639%,4.592%
6,,,ACCION POPULAR,1294681,9.092%,7.404%
7,,,AVANZA PAIS - PARTIDO DE INTEGRACION SOCIAL,1652682,11.607%,9.452%
8,,,PODEMOS PERU,808559,5.678%,4.624%
9,,,JUNTOS POR EL PERU,1111407,7.805%,6.356%


In [None]:
row_new_columns = table[ 0 ].iloc[ 0 , 2: ]
clean_columns = row_new_columns \
                      .str.replace( " ", "_") \
                      .str.lower().str.replace( "%", "share_") \
                      .apply( lambda x : unidecode.unidecode( x ) ) \
                      .tolist()

# Selecting specific columns
table_clean = table[0].iloc[ 1:, 2: ].copy()

# rename columns
table_clean.columns = clean_columns

## [First Round](https://resultadoshistorico.onpe.gob.pe/EG2021/ResumenGeneral/10/T)

In [42]:
# pip install lxml
# pip install lxmunidecodel

In [43]:
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.common.exceptions import NoSuchElementException
from selenium.webdriver import ActionChains
import pandas as pd
import numpy as np
import os
import time
import re
import unidecode
import time 
from selenium.webdriver.support.ui import Select
from webdriver_manager.chrome import ChromeDriverManager

# Driver Path Address

In [44]:
driver = webdriver.Chrome(executable_path=r'chromedriver-win64/chromedriver.exe')
# Maximize window
driver.maximize_window()

  driver = webdriver.Chrome(executable_path=r'chromedriver-win64/chromedriver.exe')


# Extracting all tables

In [57]:
driver = webdriver.Chrome(executable_path=r'chromedriver-win64/chromedriver.exe')
# Maximize window
driver.maximize_window()

# go to the link
url_1 = "https://resultadoshistorico.onpe.gob.pe/EG2021/"
driver.get( url_1 )

resumen_general = driver.find_element_by_xpath('/html/body/onpe-root/onpe-home-onpe/div[1]/div/div/div/div[2]/div[1]/div/div/a/div[1]/img')
resumen_general.click()

  driver = webdriver.Chrome(executable_path=r'chromedriver-win64/chromedriver.exe')
  resumen_general = driver.find_element_by_xpath('/html/body/onpe-root/onpe-home-onpe/div[1]/div/div/div/div[2]/div[1]/div/div/a/div[1]/img')


In [58]:
presidential = driver.find_element_by_xpath('/html/body/onpe-root/onpe-layout-container/onpe-onpe-rgen-rsgr/div/div[2]/div[2]/ul/li[1]/a')
presidential.click()

  presidential = driver.find_element_by_xpath('/html/body/onpe-root/onpe-layout-container/onpe-onpe-rgen-rsgr/div/div[2]/div[2]/ul/li[1]/a')


In [61]:
opt_peru = driver.find_element_by_xpath('/html/body/onpe-root/onpe-layout-container/onpe-onpe-rgen-rsgr/div/div[2]/div[1]/div[1]/div/div/div[1]/select/option[2]')
opt_peru.click()

  opt_peru = driver.find_element_by_xpath('/html/body/onpe-root/onpe-layout-container/onpe-onpe-rgen-rsgr/div/div[2]/div[1]/div[1]/div/div/div[1]/select/option[2]')


## Pesidential results

In [48]:
# presidential = driver.find_element_by_xpath('/html/body/onpe-root/onpe-layout-container/onpe-menu/div/nav/div/div/div[2]/div/div[2]/a/span')
# presidential.click

In [49]:
# # presidential section
# presidential = driver.find_element_by_xpath( "/html/body/onpe-root/onpe-layout-container/onpe-menu/div/nav/div/div/div[2]/div/div[2]/a" )
# presidential.click()

### Get all elements from all options

In [50]:
# scope = driver.find_element_by_xpath( "/html/body/onpe-root/onpe-layout-container/onpe-onpe-epres-re/div[1]/div[3]/div[1]/div[1]/div/div/div/select" )
# scope.click()

In [70]:
#Actualización de las funciones para usar objetos con selenium
regiones = driver.find_element(By.XPATH, '/html/body/onpe-root/onpe-layout-container/onpe-onpe-rgen-rsgr/div/div[2]/div[1]/div[1]/div/div/div[2]/select')
regiones

<selenium.webdriver.remote.webelement.WebElement (session="150dbc982fccd844d1f13ad43e1035b2", element="41616776DD4C61A8DC4C3BA36E17EA26_element_6028")>

In [69]:
driver.find_element(By.XPATH, "/html/body/onpe-root/onpe-layout-container/onpe-onpe-rgen-rsgr/div/div[2]/div[1]/div[1]/div/div/div[2]/select/option[2]").text

'AMAZONAS'

In [75]:
#Actualización de las funciones para usar objetos con selenium
regiones = driver.find_element(By.XPATH, '/html/body/onpe-root/onpe-layout-container/onpe-onpe-rgen-rsgr/div/div[2]/div[1]/div[1]/div/div/div[2]/select')
regiones.find_elements(By.TAG_NAME, "option")[1].text

'AMAZONAS'

In [52]:
regiones.find_elements(By.TAG_NAME,"option")[1].text

'AMAZONAS'

In [57]:
scope_options = driver.find_element_by_xpath('/html/body/onpe-root/onpe-layout-container/onpe-onpe-rgen-rsgr/div/div[2]/div[1]/div[1]/div/div/div[1]/select')
scope_options.find_elements_by_tag_name("option")[2].text

  scope_options = driver.find_element_by_xpath('/html/body/onpe-root/onpe-layout-container/onpe-onpe-rgen-rsgr/div/div[2]/div[1]/div[1]/div/div/div[1]/select')


'EXTRANJERO'

In [59]:
scope_options = driver.find_element_by_xpath('/html/body/onpe-root/onpe-layout-container/onpe-onpe-rgen-rsgr/div/div[2]/div[1]/div[1]/div/div/div[1]/select')

  scope_options = driver.find_element_by_xpath('/html/body/onpe-root/onpe-layout-container/onpe-onpe-rgen-rsgr/div/div[2]/div[1]/div[1]/div/div/div[1]/select')


In [62]:
scope_options.find_elements_by_tag_name("option")[0].text
scope_options.find_elements_by_tag_name("option")[1].text
scope_options.find_elements_by_tag_name("option")[2].text

'EXTRANJERO'

In [77]:
scope = driver.find_element_by_xpath( "/html/body/onpe-root/onpe-layout-container/onpe-onpe-rgen-rsgr/div/div[2]/div[1]/div[1]/div/div/div[1]/select" )
scope

  scope = driver.find_element_by_xpath( "/html/body/onpe-root/onpe-layout-container/onpe-onpe-rgen-rsgr/div/div[2]/div[1]/div[1]/div/div/div[1]/select" )


<selenium.webdriver.remote.webelement.WebElement (session="150dbc982fccd844d1f13ad43e1035b2", element="41616776DD4C61A8DC4C3BA36E17EA26_element_7852")>

In [81]:
scope.find_elements_by_tag_name("option")[2].text

'EXTRANJERO'

In [64]:
scope_options = scope.find_elements_by_tag_name("option")

In [65]:
scope_options

[<selenium.webdriver.remote.webelement.WebElement (session="7044b58859814a98da64965ce17db995", element="98CC869DA3BC59F8EC849A0A10F8CCE8_element_371")>,
 <selenium.webdriver.remote.webelement.WebElement (session="7044b58859814a98da64965ce17db995", element="98CC869DA3BC59F8EC849A0A10F8CCE8_element_376")>,
 <selenium.webdriver.remote.webelement.WebElement (session="7044b58859814a98da64965ce17db995", element="98CC869DA3BC59F8EC849A0A10F8CCE8_element_380")>]

In [66]:
dict_scope_options = { i.text : i for i in scope_options }
dict_scope_options

{'TODOS': <selenium.webdriver.remote.webelement.WebElement (session="7044b58859814a98da64965ce17db995", element="98CC869DA3BC59F8EC849A0A10F8CCE8_element_371")>,
 'PERÚ': <selenium.webdriver.remote.webelement.WebElement (session="7044b58859814a98da64965ce17db995", element="98CC869DA3BC59F8EC849A0A10F8CCE8_element_376")>,
 'EXTRANJERO': <selenium.webdriver.remote.webelement.WebElement (session="7044b58859814a98da64965ce17db995", element="98CC869DA3BC59F8EC849A0A10F8CCE8_element_380")>}

In [67]:
# There are three options
dict_scope_options.keys()
dict_scope_options

{'TODOS': <selenium.webdriver.remote.webelement.WebElement (session="7044b58859814a98da64965ce17db995", element="98CC869DA3BC59F8EC849A0A10F8CCE8_element_371")>,
 'PERÚ': <selenium.webdriver.remote.webelement.WebElement (session="7044b58859814a98da64965ce17db995", element="98CC869DA3BC59F8EC849A0A10F8CCE8_element_376")>,
 'EXTRANJERO': <selenium.webdriver.remote.webelement.WebElement (session="7044b58859814a98da64965ce17db995", element="98CC869DA3BC59F8EC849A0A10F8CCE8_element_380")>}

In [68]:
# We click on Peru
dict_scope_options['PERÚ'].click()

We have to be careful since everytime we make a click, the url changes.

### Loop over all departments

In [91]:
from selenium.webdriver.support.ui import Select  # Import Select class

In [95]:
# Store all_tables
all_tables = {}

dept_0 = driver.find_element_by_xpath( "/html/body/onpe-root/onpe-layout-container/onpe-onpe-rgen-rsgr/div/div[2]/div[1]/div[1]/div/div/div[2]/select" )
dept_0

  dept_0 = driver.find_element_by_xpath( "/html/body/onpe-root/onpe-layout-container/onpe-onpe-rgen-rsgr/div/div[2]/div[1]/div[1]/div/div/div[2]/select" )


<selenium.webdriver.remote.webelement.WebElement (session="150dbc982fccd844d1f13ad43e1035b2", element="41616776DD4C61A8DC4C3BA36E17EA26_element_6028")>

In [99]:
# All selenium objects in department select
dpt = Select( dept_0 )
#dpt.options[15].text

In [100]:
dpt.options

[<selenium.webdriver.remote.webelement.WebElement (session="150dbc982fccd844d1f13ad43e1035b2", element="41616776DD4C61A8DC4C3BA36E17EA26_element_4358")>,
 <selenium.webdriver.remote.webelement.WebElement (session="150dbc982fccd844d1f13ad43e1035b2", element="41616776DD4C61A8DC4C3BA36E17EA26_element_4359")>,
 <selenium.webdriver.remote.webelement.WebElement (session="150dbc982fccd844d1f13ad43e1035b2", element="41616776DD4C61A8DC4C3BA36E17EA26_element_4360")>,
 <selenium.webdriver.remote.webelement.WebElement (session="150dbc982fccd844d1f13ad43e1035b2", element="41616776DD4C61A8DC4C3BA36E17EA26_element_4361")>,
 <selenium.webdriver.remote.webelement.WebElement (session="150dbc982fccd844d1f13ad43e1035b2", element="41616776DD4C61A8DC4C3BA36E17EA26_element_4362")>,
 <selenium.webdriver.remote.webelement.WebElement (session="150dbc982fccd844d1f13ad43e1035b2", element="41616776DD4C61A8DC4C3BA36E17EA26_element_4363")>,
 <selenium.webdriver.remote.webelement.WebElement (session="150dbc982fccd844

In [101]:
# Get number of total options
num_prov_options = len( dpt.options )
num_prov_options

26

In [102]:
# we can loop over all departments
# for dpt_idx in range( num_prov_options ):
# but it will take too much time
# We are going to do it over two departments
for dpt_idx in range( num_prov_options ):
    
    # Get again all departments since HTML is refreshing
    # all elements
    # Click on one specific department
    dpt = Select( driver.find_element_by_xpath( "/html/body/onpe-root/onpe-layout-container/onpe-onpe-rgen-rsgr/div/div[2]/div[1]/div[1]/div/div/div[2]/select" ) )
    department = dpt.options[ dpt_idx ]
    
    # Get departmant name
    dpt_name = department.text
    print(dpt_name)

  dpt = Select( driver.find_element_by_xpath( "/html/body/onpe-root/onpe-layout-container/onpe-onpe-rgen-rsgr/div/div[2]/div[1]/div[1]/div/div/div[2]/select" ) )


--TODOS--
AMAZONAS
ANCASH
APURIMAC
AREQUIPA
AYACUCHO
CAJAMARCA
CALLAO
CUSCO
HUANCAVELICA
HUANUCO
ICA
JUNIN
LA LIBERTAD
LAMBAYEQUE
LIMA
LORETO
MADRE DE DIOS
MOQUEGUA
PASCO
PIURA
PUNO
SAN MARTIN
TACNA
TUMBES
UCAYALI


# Dynamic Pages

In [83]:
driver = webdriver.Chrome( ChromeDriverManager().install() )
# Maximize window
driver.maximize_window()
driver.get('https://www.legacy.com/obituaries/legacy/obituary-search.aspx?isnew=1&affiliateId=0&stateid=17')

name = driver.find_element(By.XPATH, '/html/body/div[2]/div[2]/div[2]/form/div[3]/div[1]/div[1]/div/div/div[1]/div[2]/div[3]/div/div[1]/input[1]')
name.send_keys("Maria")

lastname = driver.find_element(By.XPATH, '/html/body/div[2]/div[2]/div[2]/form/div[3]/div[1]/div[1]/div/div/div[1]/div[2]/div[3]/div/div[1]/input[2]')
lastname.send_keys("Brown")


search = driver.find_element(By.XPATH, '/html/body/div[2]/div[2]/div[2]/form/div[3]/div[1]/div[1]/div/div/div[1]/div[2]/div[5]/div/div[2]/a')
search.click()


  driver = webdriver.Chrome( ChromeDriverManager().install() )


In [84]:

driver = webdriver.Chrome( ChromeDriverManager().install() )
# Maximize window
driver.maximize_window()
driver.get('https://www.legacy.com/obituaries/legacy/obituary-search.aspx?isnew=1&affiliateId=0&stateid=17')

# range of death
driver.find_element_by_xpath('//*[@id="ctl00_ctl00_ContentPlaceHolder1_ContentPlaceHolder1_uxSearchWideControl_ddlSearchRange"]/option[10]').click()


death_begin = driver.find_element_by_xpath('//*[@id="ctl00_ctl00_ContentPlaceHolder1_ContentPlaceHolder1_uxSearchWideControl_txtStartDate"]')
death_begin.send_keys('10/10/1994')

death_end = driver.find_element_by_xpath('//*[@id="ctl00_ctl00_ContentPlaceHolder1_ContentPlaceHolder1_uxSearchWideControl_txtEndDate"]')    
death_end.send_keys('10/10/2005')

# type the Firstname 
keyword = driver.find_element_by_xpath('//*[@id="ctl00_ctl00_ContentPlaceHolder1_ContentPlaceHolder1_uxSearchWideControl_txtFirstName"]')
keyword.send_keys('robert')

# type the Lastname 
keyword = driver.find_element_by_xpath('//*[@id="ctl00_ctl00_ContentPlaceHolder1_ContentPlaceHolder1_uxSearchWideControl_txtLastName"]')
keyword.send_keys('brown')

# type the Title 
keyword = driver.find_element_by_xpath('//*[@id="ctl00_ctl00_ContentPlaceHolder1_ContentPlaceHolder1_uxSearchWideControl_txtKeyword"]')
keyword.send_keys('professor')

 # Set the state of last residence
driver.find_element_by_xpath('//*[@id="ctl00_ctl00_ContentPlaceHolder1_ContentPlaceHolder1_uxSearchWideControl_ddlCountry"]/option[11]').click()
        
# Send information
driver.find_element_by_xpath('//*[@id="lnkSearch"]').click()




  driver = webdriver.Chrome( ChromeDriverManager().install() )
  driver.find_element_by_xpath('//*[@id="ctl00_ctl00_ContentPlaceHolder1_ContentPlaceHolder1_uxSearchWideControl_ddlSearchRange"]/option[10]').click()
  death_begin = driver.find_element_by_xpath('//*[@id="ctl00_ctl00_ContentPlaceHolder1_ContentPlaceHolder1_uxSearchWideControl_txtStartDate"]')
  death_end = driver.find_element_by_xpath('//*[@id="ctl00_ctl00_ContentPlaceHolder1_ContentPlaceHolder1_uxSearchWideControl_txtEndDate"]')
  keyword = driver.find_element_by_xpath('//*[@id="ctl00_ctl00_ContentPlaceHolder1_ContentPlaceHolder1_uxSearchWideControl_txtFirstName"]')
  keyword = driver.find_element_by_xpath('//*[@id="ctl00_ctl00_ContentPlaceHolder1_ContentPlaceHolder1_uxSearchWideControl_txtLastName"]')
  keyword = driver.find_element_by_xpath('//*[@id="ctl00_ctl00_ContentPlaceHolder1_ContentPlaceHolder1_uxSearchWideControl_txtKeyword"]')
  driver.find_element_by_xpath('//*[@id="ctl00_ctl00_ContentPlaceHolder1_ContentPlace

NoSuchElementException: Message: no such element: Unable to locate element: {"method":"xpath","selector":"//*[@id="ctl00_ctl00_ContentPlaceHolder1_ContentPlaceHolder1_uxSearchWideControl_ddlCountry"]/option[11]"}
  (Session info: chrome=120.0.6099.217)
Stacktrace:
	GetHandleVerifier [0x00D56EE3+174339]
	(No symbol) [0x00C80A51]
	(No symbol) [0x00996FF6]
	(No symbol) [0x009C9876]
	(No symbol) [0x009C9C2C]
	(No symbol) [0x009FBD42]
	(No symbol) [0x009E7054]
	(No symbol) [0x009FA104]
	(No symbol) [0x009E6DA6]
	(No symbol) [0x009C1034]
	(No symbol) [0x009C1F8D]
	GetHandleVerifier [0x00DF4B1C+820540]
	sqlite3_dbdata_init [0x00EB53EE+653550]
	sqlite3_dbdata_init [0x00EB4E09+652041]
	sqlite3_dbdata_init [0x00EA97CC+605388]
	sqlite3_dbdata_init [0x00EB5D9B+656027]
	(No symbol) [0x00C8FE6C]
	(No symbol) [0x00C883B8]
	(No symbol) [0x00C884DD]
	(No symbol) [0x00C75818]
	BaseThreadInitThunk [0x76F3FCC9+25]
	RtlGetAppContainerNamedObjectPath [0x77DB7C6E+286]
	RtlGetAppContainerNamedObjectPath [0x77DB7C3E+238]


In [10]:
# we can loop over all departments
# for dpt_idx in range( num_prov_options ):
# but it will take too much time
# We are going to do it over two departments
for dpt_idx in range( 2 ):
    
    # Get again all departments since HTML is refreshing
    # all elements
    # Click on one specific department
    dpt = Select( driver.find_element_by_xpath( "/html/body/onpe-root/onpe-layout-container/onpe-onpe-epres-re/div[1]/div[3]/div[1]/div[1]/div/div/div[2]/select" ) )
    department = dpt.options[ dpt_idx ]
    
    # Get departmant name
    dpt_name = department.text
    
    # We select a different department name
    if dpt_name != "--TODOS--" :
        
        # click on department
        department.click()
        
        # Get all elements of province
        prov = Select( driver.find_element_by_xpath( "/html/body/onpe-root/onpe-layout-container/onpe-onpe-epres-re/div[1]/div[3]/div[1]/div[1]/div/div/div[3]/select" ) )
        num_prov_options = len( prov.options )
        
        for prov_idx in range( num_prov_options ):
            
            # Get again all districts since HTML is refreshing
            # all elements
            prov = Select( driver.find_element_by_xpath( "/html/body/onpe-root/onpe-layout-container/onpe-onpe-epres-re/div[1]/div[3]/div[1]/div[1]/div/div/div[3]/select" ) )
            province = prov.options[ prov_idx ]
                
            # Get province name
            prov_name = province.text
            
            if prov_name != "--TODOS--" :
                
                # click on province
                province.click()
                
                # Get all elements from district
                dist = Select( driver.find_element_by_xpath( "/html/body/onpe-root/onpe-layout-container/onpe-onpe-epres-re/div[1]/div[3]/div[1]/div[1]/div/div/div[4]/select" ) )
                num_dist_options = len( dist.options )
                
                for dist_idx in range( num_dist_options ):
                    
                    # Get again all districts since HTML is refreshing
                    # all elements
                    dist = Select( driver.find_element_by_xpath( "/html/body/onpe-root/onpe-layout-container/onpe-onpe-epres-re/div[1]/div[3]/div[1]/div[1]/div/div/div[4]/select" ) )
                    district = dist.options[ dist_idx ]
                    
                    # Get district name
                    dist_name = district.text
                    
                    if dist_name != "-- SELECCIONE --" :
                        
                        # click on district
                        district.click()
                        
                        # Get UBIGEO
                        ubigeo = driver.current_url.split("/")[ -1 ]
                        
                        ## Get table of presidential votes
                        # Get html at this point
                        table_path = driver.find_element_by_xpath( "/html/body/onpe-root/onpe-layout-container/onpe-onpe-epres-re/div[1]/div[4]/div[1]/div[3]/div" )
                        table_html = table_path.get_attribute( 'innerHTML' )
                        # Read the table using pandas
                        table = pd.read_html( table_html )
                        
                        # Cleaning tables
                        row_new_columns = table[ 0 ].iloc[ 0 , 2: ]
                        clean_columns = row_new_columns \
                                              .str.replace( " ", "_") \
                                              .str.lower().str.replace( "%", "share_") \
                                              .apply( lambda x : unidecode.unidecode( x ) ) \
                                              .tolist()
                        
                        # Selecting specific columns
                        table_clean = table[0].iloc[ 1:, 2: ].copy()
                        
                        # rename columns
                        table_clean.columns = clean_columns
                        
                        # New values to columns 
                        table_clean[ 'department' ] = dpt_name
                        table_clean[ 'province' ]   = prov_name
                        table_clean[ 'district' ]   = dist_name
                        table_clean[ 'ubigeo' ]     = ubigeo
                        
                        # store tables
                        all_tables[ ubigeo ] = table_clean

In [12]:
final_data = pd.concat( all_tables.values() ).reset_index( drop = True )

In [15]:
final_data.to_excel( r'example_round.xlsx' , index = False )