# Wahldaten-Crawler für alle Bundestags-, Landtags- und Europawahlen
Dieses Skript lädt Wahlergebnisse für alle Wahlen in Deutschland aus dem [Wahlarchiv der Tagesschau](https://wahl.tagesschau.de/wahlen/chronologie/chronologie.shtml) herunter. Die Ergebnisse werden als JSON-Datei mit dem Namen `elections_germany.<JJMMTT>.json` im aktuellen Verzeichnis gespeichert.

Das Ausgabeformat ist eine Liste mit Objekten. Jedes Objekt hat folgende Felder:

| Feld       | Beschreibung                                              |
|------------|-----------------------------------------------------------|
| title      | Titel der Wahl, z.B. Bundestagswahl 1990                  |
| url        | URL der Quelle                                            |
| date       | Datum der Wahl in ISO-8601                                |
| territory  | Bundesland, "Deutschland", "BRD", "DDR", oder "Europa"    |
| kind       | Art der Wahl, z.B. Landtagswahl                           |
| government | Liste der Parteien, die die Regierung stellen, als Kürzel |
| turnout    | Wahlbeteiligung in Prozent (optional)                     |
| results    | Wahlergebnisse der einzelnen Parteien                     |

Das Feld `results` enthält ein Objekt mit einem Schlüssel für jedes Parteikürzel. Der Wert ist jeweils ein Objekt mit folgenden Feldern:

| Feld      | Beschreibung                                        |
|-----------|-----------------------------------------------------|
| pct       | Wahlergebnis mit einer oder zwei Nachkommastellen   |
| votes     | Anzahl Stimmen (optional)                           |
| long_name | Voller Name oder Beschreibung der Partei (optional) |
| color     | Hexcode der Parteifarbe (optional)                  |

## Quellcode

Zunächst wird aus einer Übersichtsseite eine Liste der URLs aller Wahl-Seiten generiert.

In [1]:
import requests
from bs4 import BeautifulSoup

URL_BASE = "https://wahl.tagesschau.de/wahlen/"

# Fetch the chronological overview page that links to every election page.
# The timeout keeps the crawler from hanging forever on a stalled connection,
# and raise_for_status() prevents an HTTP error page from being parsed as if
# it were the overview.
overview_raw = requests.get(URL_BASE + "chronologie/chronologie.shtml", timeout=30)
overview_raw.raise_for_status()
page = BeautifulSoup(overview_raw.content, 'html.parser')

# Collect link text and absolute URL of every election in the <ul class="list">
# listings of the overview page.
election_urls = []
for listing in page.find_all('ul', {"class": "list"}):
    for entry in listing.find_all('a'):
        election_urls.append({
            "title": entry.text,
            # The first three characters of the href are dropped before it is
            # appended to URL_BASE (presumably a leading "../" -- verify
            # against the live page).
            "url": URL_BASE + entry["href"][3:]
        })

print(f"Found {len(election_urls)} elections")

Found 270 elections


Als Nächstes wird die Seite jeder Wahl geladen, und die Wahlergebnisse und Metadaten werden extrahiert. Mehrere Hilfsfunktionen extrahieren die Metadaten aus der Seite.

In [None]:
import json
from datetime import datetime
from time import sleep

# strptime pattern for day.month.year dates.
DATE_FORMAT = "%d.%m.%Y"
# Two-digit year, month and day of this run; becomes part of the output name.
TIMESTAMP = datetime.now().strftime("%y%m%d")
# JSON file the collected elections are written to.
OUT_FNAME = f"elections_germany.{TIMESTAMP}.json"

def get_metadata(base_title, date_format=None):
    """Derive title, date, kind and territory from an overview link text.

    The link text is expected to be ``"<date> <kind> <territory>"`` or, for
    elections without an explicit territory, ``"<date> <kind>"``. In the
    two-part form the territory is derived from the kind: ``Europawahl`` ->
    ``"Europa"``, ``Bundestagswahl`` -> ``"Deutschland"`` from 1990 onwards
    and ``"BRD"`` before, ``Volkskammerwahl`` -> ``"DDR"``.

    Args:
        base_title: Link text of the election, e.g.
            ``"14.10.1990 Landtagswahl Bayern"``.
        date_format: ``strptime`` pattern of the leading date. Defaults to the
            module-level ``DATE_FORMAT``.

    Returns:
        Tuple ``(title, date, kind, territory)`` where ``date`` is the
        ``datetime.isoformat()`` string of the parsed date.

    Raises:
        ValueError: If the text has fewer than two parts, the date does not
            match ``date_format``, or a two-part title has an unknown kind.
    """
    if date_format is None:
        date_format = DATE_FORMAT

    # Split at most twice so a territory containing spaces stays in one piece.
    parts = base_title.strip().split(" ", 2)
    if len(parts) < 2:
        raise ValueError(f"Cannot parse election title: {base_title!r}")

    date_text, kind = parts[0], parts[1]
    parsed_date = datetime.strptime(date_text, date_format)
    year = parsed_date.year

    if len(parts) == 3:
        territory = parts[2].strip()
        title = f"{kind} {territory} {year}"
    elif kind == "Europawahl":
        territory = "Europa"
        title = f"{kind} in Deutschland {year}"
    elif kind == "Bundestagswahl":
        territory = "Deutschland" if year >= 1990 else "BRD"
        title = f"{kind} {year}"
    elif kind == "Volkskammerwahl":
        territory = "DDR"
        title = f"{kind} {year}"
    else:
        # Fail loudly instead of hitting unbound locals further down.
        raise ValueError(f"Unknown election kind without territory: {base_title!r}")

    return title, parsed_date.isoformat(), kind, territory

def select_government(page):
    """Return the texts of the government spans on an election page.

    Looks up the first ``div`` with class ``regierung`` and returns the text of
    every ``span`` inside it except the first one. Returns ``None`` when the
    page has no such ``div``.
    """
    gov_box = page.find("div", {"class": "regierung"})
    if not gov_box:
        return None
    # The first span is skipped (presumably a label rather than a party).
    spans = gov_box.find_all("span")
    return [span.get_text() for span in spans[1:]]

def select_turnout(page):
    """Return the turnout figure found on an election page as a float.

    Scans every ``div`` with class ``modStopper`` whose text contains
    ``"Wahlbeteiligung"``. If several match, the last one wins. Returns
    ``None`` when none matches.
    """
    turnout = None
    for stopper in page.find_all("div", {"class": "modStopper"}):
        text = stopper.get_text()
        if "Wahlbeteiligung" not in text:
            continue
        # Take the second space-separated token, drop its last character and
        # turn the decimal comma into a point before converting.
        figure = text.strip().split(" ")[1]
        turnout = float(figure[:-1].replace(",", "."))
    return turnout

def select_party_info(glossary, party):
    """Look up a party's long name and colour in a page glossary.

    Args:
        glossary: Parsed glossary element of the election page, or ``None``
            if the page has no glossary.
        party: Party abbreviation to look for.

    Returns:
        A dict that is empty unless exactly one ``div`` with class
        ``glossarText`` contains ``party`` in its text. For a unique match it
        holds ``long_name`` (the text after the first colon) and, unless the
        colour is the placeholder grey, ``color`` as ``"#RRGGBB"``.
    """
    if glossary is None:
        return {}

    # Search the glossary that was passed in rather than a module-level name.
    candidates = [
        item for item in glossary.find_all("div", {"class": "glossarText"})
        if party in item.get_text()
    ]
    # The match is a substring test, so only an unambiguous hit is trusted.
    if len(candidates) != 1:
        return {}

    match = candidates[0]
    rv = {"long_name": match.get_text().split(":")[1].strip()}

    # The colour is read from the inline style of the element two siblings
    # before the glossary text: the six characters following the first "#".
    swatch = match.previous_sibling.previous_sibling
    color = swatch["style"].split("#")[1][:6]
    # #707173 is a placeholder grey color
    if color != "707173":
        rv["color"] = "#" + color

    return rv
    
def extract_results_from_tables(tables, gloss):
    """Collect per-party results from the rows of the given result tables.

    Every ``tr`` with class ``row`` yields one entry keyed by the text of its
    ``labelshort`` cell. ``pct`` is parsed from the ``perc`` cell (decimal
    comma) and ``votes`` from the ``votes`` cell (dots removed). Whatever
    ``select_party_info`` returns for the party is merged into the entry.
    """
    def cell_text(row, css_class):
        # Text of the row's first <td> carrying the given class.
        return row.find("td", {"class": css_class}).get_text()

    results = {}
    for table in tables:
        for row in table.find_all("tr", {"class": "row"}):
            party = cell_text(row, "labelshort")
            figures = {
                "pct": float(cell_text(row, "perc").replace(",", ".")),
                "votes": int(cell_text(row, "votes").replace(".", "")),
            }
            figures.update(select_party_info(gloss, party))
            results[party] = figures
    return results
        
# Extract results from image
def extract_results_alt(page, gloss):
    """Extract party results from the alt text of a result chart.

    Only small party results are shown in a table; the results of the big
    parties are read from the ``alt`` attribute of an element whose alt text
    contains ``"Ergebnis"``. The alt text must contain exactly one ``"%:"``
    and one ``"Quelle:"``; the part in between is a ``;``-separated list of
    ``"<party> <pct>"`` pairs.

    Args:
        page: Parsed election page.
        gloss: Glossary element passed on to ``select_party_info`` (may be
            ``None``).

    Returns:
        Dict mapping party abbreviation to ``{"pct": float, ...}`` plus the
        glossary info for that party. The ``"Andere"`` entry is dropped.

    Raises:
        ValueError: If no element with a parsable alt text is found.
    """
    def parse_alt_text(alt_text):
        # Both unpackings raise ValueError unless the marker occurs exactly once.
        before_source, _source = alt_text.split("Quelle:")
        _caption, figures = before_source.split("%:")
        return figures.split(";")

    charts = page.find_all(lambda tag: tag.has_attr("alt") and "Ergebnis" in tag["alt"])

    # Use the first candidate whose alt text has the expected layout.
    raw_results = None
    for chart in charts:
        try:
            raw_results = parse_alt_text(chart["alt"])
        except ValueError:
            continue
        break
    if raw_results is None:
        raise ValueError(f"No parsable result chart found; candidates: {charts}")

    rv = {}
    for item in raw_results:
        try:
            party, pct_text = item.strip().split(" ")
            pct = float(pct_text.replace(",", "."))
        except ValueError:
            # Not a "<party> <pct>" pair (e.g. an empty fragment): skip it.
            continue
        rv[party] = {"pct": pct}
        rv[party].update(select_party_info(gloss, party))

    rv.pop("Andere", None)
    return rv

def save_results():
    """Write the module-level ``elections`` list to ``OUT_FNAME`` as JSON.

    The file is rewritten from scratch on every call.
    """
    # ensure_ascii=False writes non-ASCII characters verbatim, so the encoding
    # is pinned to UTF-8 instead of depending on the platform default.
    with open(OUT_FNAME, "w", encoding="utf-8") as f:
        json.dump(elections, f, indent=2, ensure_ascii=False)

# Crawl every election page and collect one entry per election.
# All entries of election_urls are processed; slice the list here to debug or
# re-run only a subset.
elections = []
for i, election in enumerate(election_urls):
    print(f"{i}-{election['title']}")
    # Timeout and status check: do not hang on a stalled connection and do not
    # parse an HTTP error page as if it were an election page.
    d = requests.get(election["url"], timeout=30)
    d.raise_for_status()
    page = BeautifulSoup(d.content, 'html.parser')

    title, date, kind, territory = get_metadata(election["title"])

    entry = {
        "title": title,
        "url": election["url"],
        "date": date,
        "territory": territory,
        "kind": kind,
        "government": select_government(page),
        "turnout": select_turnout(page),
    }

    # The glossary list is looked up two siblings after the <h3 class="glossar">
    # heading; any missing link in that chain means there is no glossary.
    try:
        gloss_heading = page.find("h3", {"class": "glossar"})
        gloss = gloss_heading.next_sibling.next_sibling.find("ul")
    except AttributeError:
        print("No glossary here")
        gloss = None

    # Preferred layout: exactly two result tables. Anything else makes the
    # unpacking raise ValueError and falls back to the first "fivepercent"
    # table (if any) plus the figures from the chart's alt text.
    # NOTE(review): the class "fivepercentX" differs from the "fivepercent"
    # used in the fallback -- confirm it exists on the live pages; if it never
    # matches, every page takes the fallback path.
    try:
        tableA, tableB = page.find_all("table", {"class": "fivepercentX"})
        results = extract_results_from_tables([tableA, tableB], gloss)
    except ValueError:
        tableB = page.find("table", {"class": "fivepercent"})
        results = extract_results_from_tables([tableB], gloss) if tableB else {}
        resultsB = extract_results_alt(page, gloss)
        results.update(resultsB)
    entry["results"] = results
    elections.append(entry)

    # Save after every election so an interrupted run keeps what it has.
    save_results()

    # Be polite to the server: pause between requests.
    sleep(.5)