# Extract artist information from new articles

The objective is to get an url and fetch a news article about an artist or release. We want to classify the following elements for automatic extraction

* title
* release date
* artist (this is about)
* author

We will focus on elements from the top



In [13]:
from bs4 import BeautifulSoup
import urllib


In [26]:
import ssl
def parse_page(url):
    r = urllib.request.urlopen(url, context=ssl._create_unverified_context()).read()
    soup = BeautifulSoup(r, 'html.parser')
    return fetch_title(soup)

In [286]:
import csv
results = []
with open('data/prod_titles.csv', newline='') as csvfile:
    spamreader = csv.reader(csvfile, delimiter='|', quotechar='"')
    next(spamreader, None)
    for row in spamreader:
        try: 
            results.append({
                'parsed': parse_page(row[0]).strip(),
                'target': row[1],
                'url': row[0],
            })
        except urllib.error.HTTPError:
            pass
            
        

In [287]:
failed = 0
for v in results:
    if v['parsed'] != v['target']:
        print(f"wanted: #{v['target']}# but got #{v['parsed']}#")
        print(f"for: {v['url']}")
        failed += 1
print(f"of {len(results)}  we failed {failed} results")

wanted: #Kendrick Lamar in Berlin: Königliche Audienz # but got #Kendrick Lamar in Berlin: Königliche Audienz#
for: https://juice.de/kendrick-lamar-in-berlin-live/
wanted: #Eher ein Messias als ein Rap-König# but got #rbb|24 (logo)#
for: https://www.rbb24.de/kultur/beitrag/2018/03/konzertkritik-kendrick-lamar-berlin.html
wanted: #Nur einmal den Messias berühren# but got #Rap-Musik: Nur einmal den Messias berühren#
for: http://www.fnp.de/nachrichten/kultur/Nur-einmal-den-Messias-beruehren;art679,2910348
wanted: #Konzertbericht: Kendrick Lamar in Frankfurt am Main # but got #Konzertbericht: Kendrick Lamar in Frankfurt am Main (15.02.2018)#
for: https://www.tonspion.de/news/konzertbericht-kendrick-lamar-frankfurt-am-main-15022018
wanted: #James Brown, Michael Jackson - diese Liga# but got #Kendrick Lamar live in Frankfurt: James Brown, Micheal Jackson - diese Liga#
for: http://www.spiegel.de/kultur/musik/kendrick-lamar-live-in-frankfurt-james-brown-micheal-jackson-diese-liga-a-1193846.htm

In [304]:
url = "http://www.deutschlandfunk.de/us-band-incubus-mit-aller-macht-zu-album-nummer-acht.2588.de.html?dram:article_id=384680"
target = 'Demut, bitte!'
r = urllib.request.urlopen(url, context=ssl._create_unverified_context()).read()
soup = BeautifulSoup(r, 'html.parser')


In [305]:
import bs4
substitutes = [
    ("// Track", ""),
    ("feat.", "ft"),
    ("–", "-"),
    ("inkl. Video",""),
    ("|  WR.de  | Kultur",""),
    (" // Live", ""),
]
translation = "".maketrans("»«",'""')

def fetch_title_string(string_list):
    if len(string_list) == 0:
        return None
    out = ''
    index = -1
    while len(out.strip()) < 3:
        out = string_list[index]
        index -= 1
    return out

def fetch_title(soup):
    h1 = None
    # 1 rule
    try:
        for h in soup.find_all("h1",limit=3):
        
            if 'header' not in [c.name for c in h.parents] and 'site-title' not in h['class'] and "is-hidden" not in h['class']:
                h1 = h
                break
    except KeyError:
        h1 = h

    title = h1 and (h1.string or fetch_title_string(list(h1.strings)))
    
    depth = 0
    if h1:
        for i in h1.previous_elements:
            depth += 1
            if depth > 6:
                break
            if type(i) == bs4.element.Tag and i.name == 'h2':
                title = f'{i.string}: {title}'
                break
    
    if not title or len(title) < 3:
        title = soup.h2 and (soup.h2.string or fetch_title_string(list(soup.h2.strings)))
        for i in soup.h2.parents:
            try:
                if i['id'] == 'header':
                    title = None
                    break
            except KeyError:
                pass
        
    if not title or len(title) < 3:
        title = soup.h3 and (soup.h3.string or fetch_title_string(list(soup.h3.strings)))
        for i in soup.h3.parents:
            try:
                if i['id'] == 'header':
                    title = None
                    break
            except KeyError:
                pass
        
    
    # 2nd rule
    if not title or len(title) < 3:
        title = soup.title.string


    # clean up
    if title:
        for f, t in substitutes:
            title = title.replace(f, t)
        title = title.translate(translation).strip()
        
    return title

In [306]:
result = fetch_title(soup)
if result == target:
    print("##success##")
else:
    print(target)
    print(result)
    print("##FAIL##")

Demut, bitte!
Entdecken Sie den Deutschlandfunk
##FAIL##


In [319]:
h1 = None
# 1 rule
try:
    for h in soup.find_all("h1",limit=3):

        if 'header' not in [c.name for c in h.parents] and 'site-title' not in h['class'] and "is-hidden" not in h['class']:
            h1 = h
            break
except KeyError:
    h1 = h


title = h1 and (h1.string or fetch_title_string(list(h1.strings)))
title

'Mit aller Macht zu Album Nummer Acht'

In [122]:
soup


<!DOCTYPE html>

<html class="no-js" lang="de" xmlns="http://www.w3.org/1999/html">
<head>
<meta charset="utf-8"/>
<meta content="width=device-width" name="viewport"/>
<meta content="Live-Review zum ersten Deutschland-Konzert der US-Band Greta van Fleet am 22. März 2018 im Bürgerhaus Stollwerck in Köln mit Sheafs als Support-Band." name="description">
<link href="https://plus.google.com/u/0/100962721244913334751/posts" rel="author"/>
<meta content="@c-schumacher" name="twitter:creator">
<meta content="https://www.facebook.com/carsten.schumacher" property="article:author"/>
<meta content="Storys,Greta Van Fleet,Sheafs,Konzert-Nachlese,Live-Review,Nachbericht,Köln,Bürgerhaus Stollwerck,Sheafs,Konzertbericht,Livebericht,Nachlesen" name="keywords"/>
<meta content="Storys,Greta Van Fleet,Sheafs,Konzert-Nachlese,Live-Review,Nachbericht,Köln,Bürgerhaus Stollwerck,Sheafs,Konzertbericht,Livebericht,Nachlesen" name="article:tag"/>
<meta content="Greta van Fleet live in Köln" property="og:title"