In [None]:
%%html
<link rel="stylesheet" href="cc-jupyter.css"/>

# HTML Parsing

## Some Prerequisites

[ipython magic
commands](https://ipython.readthedocs.io/en/stable/interactive/magics.html)

In [None]:
%pip install -q requests lxml cssselect beautifulsoup4

# Sample page shared by the parser demos below. Some cells feed it to an
# XML parser, so keep the markup well-formed when editing it.
doc = ''.join([
    '<html><head><title>Test</title></head>',
    '<body><h1>Parse me!</h1>',
    '<a href="/admin.html">admin area</a>',
    '<a href="/help.html">help pages</a>',
    '</body></html>',
])

## html.parser, the standard parser

[homepage](https://docs.python.org/3/library/html.parser.html)

In [None]:
from html.parser import HTMLParser
class MyHTMLParser(HTMLParser):
    """Tracing parser that prints every parse event it receives.

    Each event is written to stdout followed by ``', '`` instead of a
    newline, so a whole document produces a single line of output.
    """

    def handle_starttag(self, tag, attrs):
        """Print the name of an opening tag; ``attrs`` is accepted but not shown."""
        print(f"start tag '{tag}'", end=', ')

    def handle_endtag(self, tag):
        """Print the name of a closing tag."""
        print(f"end tag '{tag}'", end=', ')

    def handle_data(self, data):
        """Print a run of text found between tags."""
        # Exactly one closing quote, matching the tag handlers (the previous
        # format string emitted a stray second apostrophe).
        print(f"data '{data}'", end=', ')

parser = MyHTMLParser()
parser.feed(doc)
# feed() may hold back trailing text until it sees the next tag; close()
# forces any buffered input to be processed so no event is silently dropped.
parser.close()

## lxml.html, the fast parser

[homepage](https://lxml.de)

## lxml, local content

In [None]:
# `etree` is no longer needed by this cell; it stays imported in case other
# cells still reference the name.
from lxml.html import etree, fromstring

# Parse with lxml's HTML parser. The XML parser used previously accepts only
# well-formed markup and raises on ordinary HTML such as an unclosed <br>.
for link in fromstring(doc).cssselect('a'):
    print('%s: %s' % (link.text, link.get('href')))

## lxml, external content

In [None]:
from lxml.html import parse

# lxml fetches the URL itself; getroot() yields the root element of the
# parsed page. Only the first ten anchors are listed.
google_doc = parse('http://www.google.com').getroot()
first_links = google_doc.cssselect('a')[:10]
for link in first_links:
    print('%s: %s' % (link.text_content(), link.get('href')))

## BeautifulSoup, the user-friendly parser

[homepage](https://www.crummy.com/software/BeautifulSoup/)

## BeautifulSoup, local content

In [None]:
from bs4 import BeautifulSoup

# Name the parser explicitly: without it Beautiful Soup picks whichever
# parser happens to be installed and warns about the guess, so results can
# differ between environments. lxml is installed by the %pip cell above.
soup = BeautifulSoup(doc, 'lxml')
for link in soup.find_all('a'):
    print('%s: %s' % (link.get_text(), link.get('href')))

## BeautifulSoup, remote content

In [None]:
from bs4 import BeautifulSoup
import requests

# requests waits indefinitely unless a timeout is given; 10 seconds keeps a
# stalled connection from hanging the notebook.
response = requests.get('http://www.google.com', timeout=10)

# Explicit parser: avoids Beautiful Soup guessing (and warning about) which
# installed parser to use.
soup = BeautifulSoup(response.text, 'lxml')
for link in soup.find_all('a')[:10]:
    print('%s: %s' % (link.get_text(), link.get('href')))

## Requests, HTTP for Humans

[homepage](https://requests.readthedocs.io/en/master/)

In [None]:
import requests

# requests has no default timeout; without one a stalled connection would
# block this cell forever.
external_doc = requests.get('http://www.google.com', timeout=10)
# Show only the first 200 characters of the response body.
print(external_doc.text[:200], "...")

## xslt, local content

In [None]:
import lxml.etree as ET

# Stylesheet source: copies the <html> root and emits one
# <p>link text: href</p> for every <a> directly under <body>.
LINK_LIST_XSLT = '''
<xsl:stylesheet version="1.0" xmlns:xsl="http://www.w3.org/1999/XSL/Transform">
    <xsl:template match="/html">
        <xsl:copy>
            <xsl:for-each select="body/a">
                <p><xsl:value-of select="node()"/>: <xsl:value-of select="@href"/></p>
            </xsl:for-each>
        </xsl:copy>
    </xsl:template>
</xsl:stylesheet>
'''

# The sample page is parsed as XML here, so it must be well-formed.
dom = ET.fromstring(doc)
xslt = ET.fromstring(LINK_LIST_XSLT)

In [None]:
# Compile the stylesheet, apply it to the parsed document, and print the
# pretty-printed result.
transform = ET.XSLT(xslt)
newdom = transform(dom)
serialized = ET.tostring(newdom, pretty_print=True)
print(serialized.decode("ascii"))

## xslt, real content

`data/teleschau_21.10.2020.dat`, just one item

In [None]:
import lxml.etree as ET

# Load the document from disk and preview the first 1200 characters of its
# pretty-printed serialization.
dom = ET.parse("data/teleschau_21.10.2020.dat")
preview = ET.tostring(dom, pretty_print=True).decode("ascii")
print(preview[:1200], "...")

In [None]:
# Stylesheet source: wraps the output in <artikel-liste> and turns every
# NewsML/NewsItem into an <artikel> carrying its item id, copyright line,
# byline and headline.
NEWSML_TO_ARTIKEL_XSLT = '''<?xml version="1.0"?>
<xsl:stylesheet version="1.0" xmlns:xsl="http://www.w3.org/1999/XSL/Transform">
    <xsl:template match="/">
        <artikel-liste>
            <xsl:apply-templates select="NewsML/NewsItem"/>
        </artikel-liste>
    </xsl:template>

    <xsl:template match="NewsItem">
        <artikel>
            <metadaten>
                <artikel-id>
                    <xsl:value-of select="Identification/NewsIdentifier/NewsItemId"/>
                </artikel-id>
                <urheberinformation>
                    <xsl:value-of select="NewsLines/CopyrightLine"/>
                </urheberinformation>
                <autor>
                    <autor-name>
                        <xsl:value-of select="NewsLines/ByLine"/>
                    </autor-name>
                </autor>
            </metadaten>
            <inhalt>
                <titel-liste>
                    <titel>
                        <xsl:value-of select="NewsLines/HeadLine"/>
                    </titel>
                </titel-liste>
            </inhalt>
        </artikel>
    </xsl:template>
</xsl:stylesheet>
'''

xslt = ET.fromstring(NEWSML_TO_ARTIKEL_XSLT)

In [None]:
# Build the transformer from the stylesheet defined in the previous cell and
# run it over the loaded document.
transform = ET.XSLT(xslt)
newdom = transform(dom)

# Serialize to bytes first, then decode for printing.
result_bytes = ET.tostring(newdom, pretty_print=True)
print(result_bytes.decode("ascii"))