## Ejercicio Web Scraping bs4/Selenium/Helium

_**url** = https://www.20minutos.es/_

Vamos a hacer Web Scraping de las primeras 3 páginas de una de las siguientes categorías de noticias: **Ciencia**, **Deporte**, **Gente**, **Economía**, **Gastronomía** y **Opinión**.

Y de cada noticia/artículo vamos a obtener:

- **Título**
- **Hora**
- **Fecha**
- **Autor**
- **Texto completo**
- **Categoría de la noticia**

Genera un DataFrame con esta información y guárdalo en el archivo **`20minutos_1.csv`**.

In [None]:
import numpy as np
import pandas as pd

import requests
from  bs4 import BeautifulSoup

from selenium import webdriver

from time import sleep

In [None]:
# 1 category - 3 pages
#
# Drives Chrome through the site's menu to reach each category, then walks
# the first three listing pages and stores [article_url, category] pairs in
# `lista_url`.

# Path to the ChromeDriver binary and the site's landing page
chrome_driver = "../../10_10_2023_dia_18/chromedriver.exe"

url_20minutos = "https://www.20minutos.es/"

categorias = ["Ciencia"]

# Created before the browser starts so that links gathered before a failure
# are still available afterwards.
lista_url = list()

# Start the driver
browser = webdriver.Chrome(executable_path = chrome_driver)

try:

    browser.get(url = url_20minutos)

    browser.maximize_window()
    sleep(2)

    # Accept cookies
    browser.find_element_by_css_selector('#didomi-notice-agree-button > span').click()

    for categoria in categorias:

        # Open the options menu
        browser.find_element_by_css_selector('#ui-toggle-menu > a > svg.ui-unfold-menu.icon-bars > use').click()
        sleep(1)

        # Follow the menu entry whose link text contains the category name
        browser.find_element_by_partial_link_text(link_text = categoria).click()
        sleep(2)

        page_url = browser.current_url

        for i in range(1, 4):

            # The page number is appended directly to the category URL
            browser.get(url = f"{page_url}{i}")

            # Beautiful Soup
            soup = BeautifulSoup(browser.page_source, "html.parser")

            articulos = soup.find_all("article", class_ = "media")

            # The last four matches are dropped (presumably teasers that do
            # not belong to the listing - TODO confirm against the live page).
            for articulo in articulos[:-4]:

                # Skip teasers without a link instead of aborting the crawl
                enlace = articulo.find("a", href = True)

                if enlace is not None:
                    lista_url.append([enlace["href"], categoria])

        # Return to the landing page so the menu is available again
        browser.get(url = url_20minutos)
        sleep(2)

finally:

    # quit() ends the whole WebDriver session (window and driver process),
    # and runs even when one of the steps above raises.
    browser.quit()

In [None]:
# Show every collected article link, one per line.
for enlace, _categoria in lista_url:
    print(enlace)

In [None]:
# Download every collected article and build the DataFrame `df` with one row
# per entry of `lista_url` (same order).

def _parse_article(soup):
    """Return [titulo, fecha, hora, autor, texto] extracted from an article page.

    Each field whose element is not present in the page is returned as NaN.
    """
    nodo_titulo = soup.find("h1", class_ = "article-title")
    titulo = nodo_titulo.text if nodo_titulo is not None else np.nan

    nodo_fecha = soup.find("span", class_ = "article-date")

    try:
        # The date element is expected to read "<fecha> - <hora>"
        fecha, hora = nodo_fecha.text.split(" - ")

    except (AttributeError, ValueError):
        # AttributeError: element missing; ValueError: unexpected format
        fecha, hora = np.nan, np.nan

    nodo_autor = soup.find("span", class_ = "article-author")
    autor = nodo_autor.text.strip() if nodo_autor is not None else np.nan

    # find_all never raises: it returns an empty result when nothing matches,
    # so the missing-body case has to be detected explicitly.
    parrafos = [x.text.strip() for x in soup.find_all("p", class_ = "paragraph")]
    texto = "\n".join(parrafos) if parrafos else np.nan

    return [titulo, fecha, hora, autor, texto]


data = list()

for url in lista_url:

    try:
        # A timeout prevents one unresponsive server from hanging the crawl
        response = requests.get(url[0], timeout = 10)
        html = response.text

    except requests.RequestException as error:
        # Keep the row (all fields NaN) so `data` stays aligned with `lista_url`
        print(f"Could not download {url[0]}: {error}")
        html = ""

    soup = BeautifulSoup(html, "html.parser")

    data.append(_parse_article(soup))

    # Pause between requests to avoid hammering the site
    sleep(1)

df = pd.DataFrame(data = data, columns = ["titulo", "fecha", "hora", "autor", "texto"])

df["categoria"] = [x[1] for x in lista_url]
df["url"] = [x[0] for x in lista_url]

In [None]:
# Show the assembled DataFrame as the cell's output.
df

In [None]:
# Write the scraped articles to disk, leaving out the DataFrame index column.
df.to_csv(path_or_buf="20minutos_1.csv", index=False)

- **Escribe el código para sacar la información de las primeras 5 páginas de las 6 categorías anteriores. Guarda esta información en el archivo `20minutos_2.csv`**

In [None]:
# All categories - 5 pages
#
# Drives Chrome through the site's menu to reach each category, then walks
# the first five listing pages and stores [article_url, category] pairs in
# `lista_url`.

# Path to the ChromeDriver binary and the site's landing page
chrome_driver = "chromedriver.exe"

url_20minutos = "https://www.20minutos.es/"

categorias = ["Ciencia", "Deportes", "Gente", "Economía", "Gastro", "Opinión"]

# Created before the browser starts so that links gathered before a failure
# are still available afterwards.
lista_url = list()

# Start the driver
browser = webdriver.Chrome(executable_path = chrome_driver)

try:

    browser.get(url = url_20minutos)

    browser.maximize_window()
    sleep(2)

    # Accept cookies
    browser.find_element_by_css_selector('#didomi-notice-agree-button > span').click()

    for categoria in categorias:

        # Open the options menu
        browser.find_element_by_css_selector('#ui-toggle-menu > a > svg.ui-unfold-menu.icon-bars > use').click()
        sleep(1)

        # Follow the menu entry whose link text contains the category name
        browser.find_element_by_partial_link_text(link_text = categoria).click()
        sleep(2)

        page_url = browser.current_url

        for i in range(1, 6):

            # The page number is appended directly to the category URL
            browser.get(url = f"{page_url}{i}")

            # Beautiful Soup
            soup = BeautifulSoup(browser.page_source, "html.parser")

            articulos = soup.find_all("article", class_ = "media")

            # The last four matches are dropped (presumably teasers that do
            # not belong to the listing - TODO confirm against the live page).
            for articulo in articulos[:-4]:

                # Skip teasers without a link instead of aborting the crawl
                enlace = articulo.find("a", href = True)

                if enlace is not None:
                    lista_url.append([enlace["href"], categoria])

        # Return to the landing page so the menu is available again
        browser.get(url = url_20minutos)
        sleep(2)

finally:

    # quit() ends the whole WebDriver session (window and driver process),
    # and runs even when one of the steps above raises.
    browser.quit()

In [None]:
# Download every collected article and build the DataFrame `df` with one row
# per entry of `lista_url` (same order).

def _parse_article(soup):
    """Return [titulo, fecha, hora, autor, texto] extracted from an article page.

    Each field whose element is not present in the page is returned as NaN.
    """
    nodo_titulo = soup.find("h1", class_ = "article-title")
    titulo = nodo_titulo.text if nodo_titulo is not None else np.nan

    nodo_fecha = soup.find("span", class_ = "article-date")

    try:
        # The date element is expected to read "<fecha> - <hora>"
        fecha, hora = nodo_fecha.text.split(" - ")

    except (AttributeError, ValueError):
        # AttributeError: element missing; ValueError: unexpected format
        fecha, hora = np.nan, np.nan

    nodo_autor = soup.find("span", class_ = "article-author")
    autor = nodo_autor.text.strip() if nodo_autor is not None else np.nan

    # find_all never raises: it returns an empty result when nothing matches,
    # so the missing-body case has to be detected explicitly.
    parrafos = [x.text.strip() for x in soup.find_all("p", class_ = "paragraph")]
    texto = "\n".join(parrafos) if parrafos else np.nan

    return [titulo, fecha, hora, autor, texto]


data = list()

for url in lista_url:

    try:
        # A timeout prevents one unresponsive server from hanging the crawl
        response = requests.get(url[0], timeout = 10)
        html = response.text

    except requests.RequestException as error:
        # Keep the row (all fields NaN) so `data` stays aligned with `lista_url`
        print(f"Could not download {url[0]}: {error}")
        html = ""

    soup = BeautifulSoup(html, "html.parser")

    data.append(_parse_article(soup))

    # Pause between requests to avoid hammering the site
    sleep(2)

df = pd.DataFrame(data = data, columns = ["titulo", "fecha", "hora", "autor", "texto"])

df["categoria"] = [x[1] for x in lista_url]
df["url"] = [x[0] for x in lista_url]

In [None]:
# Write the scraped articles to disk, leaving out the DataFrame index column.
df.to_csv(path_or_buf="20minutos_2.csv", index=False)