In [1]:
from bs4 import BeautifulSoup, NavigableString, Comment
import re

#### Función de carga del HTML

In [2]:
def open_html(file_path):
    """Read an HTML file and return its whole content as one string.

    Args:
        file_path: Path of the file to read; it is decoded as UTF-8.

    Returns:
        The complete text content of the file.
    """
    with open(file_path, mode='r', encoding='utf-8') as html_file:
        return html_file.read()

#### Funciones de limpieza del HTML

In [3]:
def erase_tags(soup, tags=('footer', 'form', 'script', 'style', 'nav', 'img', 'i')):
    """Delete every occurrence of the given tags, with their contents, from the soup.

    Args:
        soup: Parsed document; it is modified in place.
        tags: Names of the tags to delete. Defaults to the list that used to
            be hard-coded in this function.

    Returns:
        The same ``soup`` object. The removal is best effort: if anything
        raises while searching or deleting, the soup is returned as it is at
        that moment, possibly only partially cleaned.
    """
    try:
        # Calling the soup object is BeautifulSoup's shorthand for find_all().
        for unwanted in soup(list(tags)):
            unwanted.decompose()
    except Exception:
        # Deliberately best effort: hand back whatever could be cleaned
        # instead of aborting the whole pipeline.
        return soup
    return soup

def erase_comments(soup):
    """Remove every HTML comment node from the soup.

    Args:
        soup: Parsed document; it is modified in place.

    Returns:
        The same ``soup`` object, without its comment nodes.
    """
    # `string=` is the current name of the filter that used to be called
    # `text=`; the old keyword is deprecated in BeautifulSoup.
    for comment in soup(string=lambda node: isinstance(node, Comment)):
        comment.extract()
    return soup

def remove_isolated_links(soup):
    """Strip every <a> tag from the soup, keeping its text only in some cases.

    For each link, the closest preceding and following sibling text nodes
    are looked up (any non-empty string counts, including whitespace):

    * if either one is missing, the link is replaced by its plain text;
    * if both exist, the link is deleted together with its text.

    NOTE(review): given the function name, the two branches look swapped
    (links surrounded by text are the ones being dropped). Confirm the
    intended behavior; the logic is deliberately left unchanged here.

    Args:
        soup: Parsed document; it is modified in place.

    Returns:
        The same ``soup`` object.
    """
    # find_all is the supported spelling; findAll is a deprecated alias.
    for link in soup.find_all('a'):
        text_before = link.find_previous_sibling(string=True)
        text_after = link.find_next_sibling(string=True)

        if text_before and text_after:
            link.decompose()
        else:
            link.replace_with(NavigableString(link.text))

    return soup

def href_for_text(soup):
    """Replace every link by its text, deleting the ones that point to '#0'.

    A link whose ``href`` contains the substring ``'#0'`` is deleted
    entirely. Any other link, including one with no ``href`` at all, is
    replaced by its plain text.

    Args:
        soup: Parsed document; it is modified in place.

    Returns:
        The same ``soup`` object.
    """
    for link in soup.find_all('a'):
        target = link.get('href', '')
        if '#0' not in target:
            # Keep the visible text, drop the <a> wrapper.
            link.replace_with(link.text)
            continue
        # The href contains '#0': remove the link and its text.
        link.decompose()
    return soup

def remove_specific_classes(soup, classes_to_remove):
    """Delete every tag that carries one of the given CSS classes.

    The classes are processed one after another: the soup is searched for a
    class, all matches are deleted, and only then is the next class searched.

    Args:
        soup: Parsed document; it is modified in place.
        classes_to_remove: Iterable of class names to look for.

    Returns:
        The same ``soup`` object.
    """
    for css_class in classes_to_remove:
        # True as the tag name means "any tag".
        matches = soup.find_all(True, {'class': css_class})
        for element in matches:
            element.decompose()
    return soup

def remove_empty_structures(soup,parametros):
    """Delete the tags of the given names that contain no text.

    A tag counts as empty when its text, after stripping whitespace, is an
    empty string.

    Args:
        soup: Parsed document; it is modified in place.
        parametros: Iterable of tag names to inspect (e.g. 'span', 'p').

    Returns:
        The same ``soup`` object.
    """
    for tag_name in parametros:
        for candidate in soup.find_all(tag_name):
            if candidate.get_text(strip=True):
                continue  # has visible text: keep it
            candidate.decompose()
    return soup

def remove_specific_text(text_to_remove, soup):
    """Delete the given phrases from every text node of the soup.

    Args:
        text_to_remove: Phrase, or iterable of phrases, to delete. A single
            string is treated as one phrase.
        soup: Parsed document; it is modified in place.

    Returns:
        The same ``soup`` object.
    """
    # A bare string would otherwise be iterated character by character and
    # every one of its characters would be stripped from the document.
    if isinstance(text_to_remove, str):
        phrases = [text_to_remove]
    else:
        phrases = text_to_remove

    for phrase in phrases:
        # `string=` is the current name of the deprecated `text=` filter.
        for node in soup(string=lambda content: phrase in content):
            node.replace_with(node.replace(phrase, ''))
    return soup

def clear_blankspaces_after_href(soup):
    """Pull a period left on its own line back onto the preceding text.

    The soup is serialized, every "line break + optional whitespace + '.'"
    sequence is collapsed to a single '.', and the result is parsed again.

    Two things differ from the previous version:

    * the line break may be LF or CRLF. The old pattern required CRLF, but
      files read in text mode have their line endings normalized to LF, so
      it never matched markup loaded that way;
    * the period itself is kept. The old replacement deleted it along with
      the whitespace.

    Args:
        soup: Parsed document (anything whose ``str()`` is HTML markup).

    Returns:
        A new soup parsed from the cleaned markup with 'html.parser'.
    """
    markup = re.sub(r'\r?\n\s*\.', '.', str(soup))
    return BeautifulSoup(markup, 'html.parser')

#### Funciones de conversión de tipos

In [4]:
def erase_empty_lines(clean_text):
    """Normalize text to one non-empty, trimmed fragment per line.

    Each line is trimmed and split on every double space; the resulting
    pieces are trimmed again, empty ones are dropped, and the rest are
    joined with a newline.

    Args:
        clean_text: Text to normalize.

    Returns:
        The normalized text.
    """
    kept = []
    for raw_line in clean_text.splitlines():
        for fragment in raw_line.strip().split("  "):
            fragment = fragment.strip()
            if fragment:
                kept.append(fragment)
    return '\n'.join(kept)

def html_to_text(soup):
    """Extract the text of the soup and normalize it with erase_empty_lines.

    Args:
        soup: Parsed document.

    Returns:
        The document text after normalization.
    """
    return erase_empty_lines(soup.get_text())

def soup_to_html(soup):
    """Return the prettified markup of the soup, or the input if that fails.

    Args:
        soup: Object expected to offer a ``prettify()`` method.

    Returns:
        The result of ``soup.prettify()``; if calling it raises any
        ``Exception``, the ``soup`` argument itself is returned unchanged.
    """
    try:
        rendered = soup.prettify()
    except Exception:
        # Best effort: fall back to whatever was passed in.
        rendered = soup
    return rendered

def html_to_soup(filepath):
    """Load an HTML file and parse it with BeautifulSoup's 'html.parser'.

    Args:
        filepath: Path of the HTML file, read through open_html.

    Returns:
        The parsed soup.
    """
    return BeautifulSoup(open_html(filepath), 'html.parser')

#### Función de guardado

In [5]:
def save_txt(text, titulo, tipo):
    """Write the given content to '<titulo>.txt', encoded as UTF-8.

    Args:
        text: Content to write. When ``tipo`` is not 'texto' it is first
            passed through soup_to_html.
        titulo: Output path without extension; '.txt' is always appended,
            so it should not already end in '.txt'.
        tipo: 'texto' to write ``text`` as is; any other value triggers the
            soup_to_html conversion.
    """
    if tipo != 'texto':
        text = soup_to_html(text)
    # The context manager closes the file even if write() raises.
    with open(f"{titulo}.txt", "w", encoding='utf-8') as archivo:
        archivo.write(text)

#### Sacar texto limpio

In [6]:
def main_html_scrapping(file_path):
    """Load an HTML file and run the cleaning steps over it, in order.

    Args:
        file_path: Path of the HTML file to clean.

    Returns:
        The cleaned soup.
    """
    soup = html_to_soup(file_path)
    soup = erase_tags(soup)
    soup = remove_isolated_links(soup)
    soup = erase_comments(soup)
    # NOTE(review): remove_isolated_links already replaces or deletes every
    # <a> tag, so this step appears to find nothing to do. Confirm the
    # intended order of these two steps.
    soup = href_for_text(soup)
    # Optional steps, disabled by default:
    # soup = remove_specific_text([''], soup)  # delete specific text, e.g. from the header or the bottom of the page
    # soup = remove_specific_classes(soup, [''])
    soup = remove_empty_structures(soup, ['span', 'p', 'div'])  # drop span, p and div tags that hold no text
    # Remove the line break introduced when hrefs are replaced by text.
    return clear_blankspaces_after_href(soup)


In [11]:
def main_scrapping(file_path, file_name):
    """Clean an HTML file and save its text to disk.

    Args:
        file_path: Path of the HTML file to process.
        file_name: Output name handed to save_txt, which appends '.txt'.
    """
    cleaned_soup = main_html_scrapping(file_path)
    save_txt(html_to_text(cleaned_soup), file_name, 'texto')

In [12]:
# save_txt appends '.txt' itself, so the output name is given without the
# extension (passing 'BOE-A-1902-8161.txt' produced 'BOE-A-1902-8161.txt.txt').
file_path = 'data/BOE-A-1902-8161.html'
main_scrapping(file_path, 'BOE-A-1902-8161')