# Scraper funcional para páginas individuales de La Voz del Sur

In [None]:
import requests
from bs4 import BeautifulSoup
import csv
from datetime import datetime

# URL of the article to scrape
url = "https://www.lavozdelsur.com.mx/laguneros-y-ciudad-guzman-participan-en-la-amdi-cup-en-tapalpa/"

# Browser-like User-Agent sent with the request
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
}

# Date formats tried, in order, by every date-extraction strategy below.
# NOTE: %B matches month names of the active locale (English unless the
# process changes LC_TIME), so Spanish month names will not parse by default.
date_formats_to_try = [
    '%Y-%m-%d',  # ISO format
    '%d/%m/%Y',  # Day-first format
    '%B %d, %Y',  # e.g. "January 15, 2023"
]


def parse_date(text, formats=None):
    """Return *text* re-formatted as ``dd/mm/YYYY``, or ``None``.

    Args:
        text: Candidate date string; surrounding whitespace is ignored.
        formats: ``strptime`` formats to try in order. Defaults to
            ``date_formats_to_try``.

    Returns:
        The date as a ``dd/mm/YYYY`` string for the first format that
        matches, or ``None`` when *text* is empty or no format matches.
    """
    if not text:
        return None
    if formats is None:
        formats = date_formats_to_try
    candidate = text.strip()
    for fmt in formats:
        try:
            return datetime.strptime(candidate, fmt).strftime('%d/%m/%Y')
        except ValueError:
            continue
    return None


def date_after_published_marker(text, formats=None):
    """Parse the date that follows the last "Publicado el" in *text*.

    The marker is matched case-insensitively. If the marker is absent the
    whole text is used instead. Both the first word and the first three
    words after the marker are tried, so single-token dates
    ("15/01/2023") and multi-word dates ("January 15, 2023") can match.

    Args:
        text: Text of the element that mentions "publicado".
        formats: ``strptime`` formats to try; see ``parse_date``.

    Returns:
        The date as ``dd/mm/YYYY``, or ``None`` when nothing parseable
        follows the marker (including when nothing follows it at all).
    """
    if not text:
        return None
    # Work on the lower-cased text so the marker search and the slice use the
    # same string; strptime matches month names case-insensitively.
    lowered = text.lower()
    _, marker, remainder = lowered.rpartition('publicado el')
    if not marker:
        remainder = lowered
    tokens = remainder.split()
    if not tokens:
        # Nothing after the marker: report "no date" instead of raising.
        return None
    for candidate in (tokens[0], ' '.join(tokens[:3])):
        parsed = parse_date(candidate, formats)
        if parsed:
            return parsed
    return None


try:
    response = requests.get(url, headers=headers, timeout=10)
    response.raise_for_status()
    soup = BeautifulSoup(response.content, 'html.parser')

    # 1. Title: text of the first <h1>, with a placeholder when absent
    h1_tag = soup.find('h1')
    title = h1_tag.get_text(strip=True) if h1_tag else 'Título no encontrado'

    # 2. Date: try several strategies and stop at the first that succeeds
    date = None

    # Strategy 1: a <time> tag carrying a datetime attribute; only the part
    # before the "T" separator is parsed.
    time_tag = soup.find('time', {'datetime': True})
    if time_tag:
        date = parse_date(time_tag['datetime'].split('T')[0])

    # Strategy 2: span/div/p elements whose class list contains "date"
    if not date:
        for element in soup.find_all(['span', 'div', 'p'], class_=True):
            if 'date' in element.get('class', []):
                date = parse_date(element.get_text(strip=True))
                if date:
                    break

    # Strategy 3: text nodes mentioning "publicado"; parse what follows
    # "Publicado el" in the parent element's text.
    if not date:
        mentions = soup.find_all(
            string=lambda s: s is not None and 'publicado' in s.lower()
        )
        for element in mentions:
            date = date_after_published_marker(element.parent.get_text(strip=True))
            if date:
                # Keep the first match; later nodes must not overwrite it.
                break

    date = date or 'Fecha no encontrada'

    # 3. Content: paragraphs of <article>, else of div.entry-content
    content_div = soup.find('article') or soup.find('div', class_='entry-content')
    if content_div:
        # Remove nested div/script/style/iframe elements before collecting
        # text; paragraphs inside a removed <div> are dropped with it.
        for element in content_div.find_all(['div', 'script', 'style', 'iframe']):
            element.decompose()
        content = '\n'.join(p.get_text(strip=True) for p in content_div.find_all('p') if p.get_text(strip=True))
    else:
        content = 'Contenido no encontrado'

    # Export a single-row CSV (header + article)
    with open('articulo_la_voz_del_sur_mejorado.csv', 'w', newline='', encoding='utf-8') as csvfile:
        writer = csv.writer(csvfile)
        writer.writerow(['Título', 'Fecha', 'Contenido'])
        writer.writerow([title, date, content])

    print("¡Extracción completada con éxito!")
    print(f"Título: {title}")
    print(f"Fecha encontrada: {date}")

except Exception as e:
    # Top-level boundary of the script: report any failure (network, parsing,
    # file I/O) instead of letting the cell crash.
    print(f"Error: {str(e)}")