## 2.2 HTML to text conversion

NOTE: it is gzcat on MacOS to view the files…

In [1]:
from resiliparse.parse.encoding import detect_encoding
from resiliparse.extract.html2text import extract_plain_text

def run_extract_text_from_html_bytes(html_bytes: bytes) -> str | None:

    # Detect encoding of the byte string
    enc = detect_encoding(html_bytes)
    print(enc)

    # Decode the byte string into a Unicode string
    html = html_bytes.decode('utf-8')
    print(html)

    # If the encoding is not UTF-8, try to decode it using the detected encoding
    if enc != 'utf-8':
        try:
            html = html_bytes.decode(enc)
        except UnicodeDecodeError:
            return None

    # Extract text from the HTML string
    text = extract_plain_text(html)

    return text

In [2]:
# run_extract_text_from_html_bytes(b'\xff\xfeH\x00e\x00l\x00l\x00o\x00 \x00W\x00o\x00r\x00l\x00d\x00')
run_extract_text_from_html_bytes(b'<html><head><title>Test</title></head><body><h1>Hello World</h1></body></html>')

cp1252
<html><head><title>Test</title></head><body><h1>Hello World</h1></body></html>


'Hello World'

In [7]:
from fastwarc import ArchiveIterator, WarcRecordType
import gzip

from resiliparse.parse.encoding import detect_encoding
from resiliparse.extract.html2text import extract_plain_text

# Function to process a WARC file
def process_warc_file(warc_path):
    with gzip.open(warc_path, 'rb') as warc_gz:
        for record in ArchiveIterator(warc_gz):
            if record.record_type == WarcRecordType.response:
                html_bytes = record.reader.read()
                text = run_extract_text_from_html_bytes(html_bytes)
                if text:
                    print(text[:500])  # Print first 500 characters as a preview

process_warc_file("/data/CC-MAIN-20210722174000-20210722194000-00111.warc.gz")

FileNotFoundError: [Errno 2] No such file or directory: '/data/CC-MAIN-20210722174000-20210722194000-00111.warc.gz'

# 2.3

In [None]:
import fasttext

def run_identify_language(text: str) -> tuple[str, float]:
    
    model = fasttext.load_model('lid.176.bin')

    # Predict the language of the text
    text = text.replace('\n', ' ') # Remove newlines
    predictions = model.predict(text, k=1) # k=1 means we only want the top prediction

    return (predictions[0][0], predictions[1][0]) # Return the language code and the confidence score