In [None]:
from collections import Counter
import requests
from nltk.corpus import stopwords
import string # "string" module is already installed with Python
from nltk.tokenize import word_tokenize
from nltk.stem.wordnet import WordNetLemmatizer
from datetime import datetime
import time
from requests.exceptions import ChunkedEncodingError, ConnectionError

In [None]:
# API key that is sent as the `apikey` query parameter of the request URL built below.
# Original author's note: this is a PUBLIC key, so there is no need to hide it.
key = 'OSOegLs.PR2lwJ1dwCeje9vTj7FPOt3hvpYKtwKkhw'

### Retrieve parliamentary speech minutes (API)

In [None]:
# Crawl configuration and mutable crawl state.
# NOTE: `ep` and `end_date` are overwritten by the fetch loop below, so re-run
# this cell before re-running the fetch loop.
earliest_ep = 20  # earliest election period of interest
ep = 22  # latest election period of interest

# Initial value for the `f.datum.end` filter: today's date as YYYY-MM-DD.
end_date = datetime.today().date().isoformat()

# Collected JSON payloads, one entry per successful request.
responses = []

In [None]:
# Fetch plenary-protocol texts page by page, walking backwards in time.
#
# Each request filters on `f.datum.end={end_date}`; after a page is stored,
# `end_date` is moved to the oldest 'datum' on that page and `ep` is set to the
# 'wahlperiode' of the page's first document. The loop ends once `ep` is no
# longer greater than `earliest_ep`.
#
# NOTE: `ep` and `end_date` are mutated here, so re-running only this cell
# after it has finished does nothing; re-run the configuration cell first.
# NOTE(review): the date filter is presumably inclusive, so documents dated on
# the boundary day may show up on two consecutive pages - TODO confirm and
# de-duplicate if needed.
max_attempts = 3          # tries per request for transient network errors
max_failed_requests = 3   # consecutive failed requests tolerated before aborting
failed_requests = 0

while ep > earliest_ep:
    url = f"https://search.dip.bundestag.de/api/v1/plenarprotokoll-text?f.zuordnung=BT&f.datum.end={end_date}&apikey={key}"

    payload = None  # stays None unless a request succeeds and its body parses
    for attempt in range(1, max_attempts + 1):
        try:
            response = requests.get(url, timeout=30)
            response.raise_for_status()
            payload = response.json()
            break
        except requests.HTTPError as exc:
            # An HTTP error status is not retried within this request.
            print(f"HTTP error {exc.response.status_code}: {exc}")
            break
        except (ChunkedEncodingError, ConnectionError, requests.Timeout, ValueError) as exc:
            # requests.Timeout covers read timeouts triggered by `timeout=30`;
            # ValueError covers a response body that is not valid JSON.
            print(f"Attempt {attempt}/{max_attempts} failed: {exc}")
            if attempt == max_attempts:
                print("Giving up on this request, continuing with next loop iteration.")
            else:
                time.sleep(2 * attempt)  # simple backoff

    if payload is None:
        # The crawl state is unchanged, so the next iteration repeats the same
        # request. Bound the number of repeats instead of looping forever.
        failed_requests += 1
        if failed_requests >= max_failed_requests:
            print(f"{failed_requests} consecutive requests failed, stopping.")
            break
        time.sleep(2 * failed_requests)
        continue
    failed_requests = 0

    docs = payload.get("documents", [])
    if not docs:
        print("No documents returned, stopping.")
        break

    # Only pages that actually contain documents are stored, so later cells can
    # rely on a non-empty 'documents' list in every entry.
    responses.append(payload)

    dates = [doc["datum"] for doc in docs if "datum" in doc]
    if not dates:
        print("Documents missing 'datum', stopping.")
        break

    oldest_date = min(dates)
    # If the oldest date did not move below the current filter value, the next
    # request would be identical to this one.
    stalled = oldest_date >= end_date
    end_date = oldest_date
    ep = docs[0]["wahlperiode"]

    if ep <= earliest_ep:
        break

    if stalled:
        print("Date filter did not advance, stopping to avoid repeating the same request.")
        break

In [None]:
# Flatten the 'datum' values of all documents across all fetched pages.
# Pages without a 'documents' entry and documents without a 'datum' are skipped,
# mirroring the tolerance the fetch loop applies to the same fields.
dates = [
    doc['datum']
    for page in responses
    for doc in page.get('documents', [])
    if 'datum' in doc
]

In [None]:
# Show the oldest and newest document date found in `dates`.
first_date, last_date = min(dates), max(dates)
f'The dataset ranges from {first_date} to {last_date}'

In [None]:
# Total number of fetched documents, counted page by page. Pages may differ in
# size, so the first page's length must not be multiplied by the page count.
# Also yields 0 (instead of raising IndexError) when `responses` is empty.
n_doc = sum(len(page.get('documents', [])) for page in responses)

In [None]:
# Display the document count computed in `n_doc`.
f'The dataset contains {n_doc} documents '

In [None]:
# Sanity check: show the Python type of the 'text' field of the first document
# on the second fetched page (requires at least two entries in `responses`).
type(responses[1]['documents'][0]['text'])

In [None]:
# Export a single document to a text file for manual inspection (disabled; kept for reference)
# with open("test_speech_output.txt", "w") as text_file:
#     text_file.write(responses[3]['documents'][0]['text'])

### Getting a feeling for the number of tokens: splitting and cleaning an example document to approximate its length


In [None]:
# Example text: the first document on the second fetched page.
sample_text = responses[1]['documents'][0]['text']

# Start of discussion: position of the phrase that opens the session.
# `str.index` raises ValueError if the phrase does not occur in the text.
opening_phrase = 'Die Sitzung ist hiermit eröffnet.'
start = sample_text.index(opening_phrase)

# End of discussion (disabled):
#end = responses[1]['documents'][0]['text'].index('Einen schönen restlichen Tag noch!')

# Slice the corpus: keep everything from the opening phrase to the end of the text.
core_speeches = sample_text[start:]

In [None]:
def cleaning(sentence):
    """Normalise a text and return its remaining tokens.

    Steps, in order: strip surrounding whitespace, lowercase, drop digit
    characters, drop every character listed in ``string.punctuation``,
    tokenize with ``word_tokenize``, remove tokens found in NLTK's German
    stop-word list, and pass each remaining token through
    ``WordNetLemmatizer.lemmatize`` with ``pos="v"``.

    Args:
        sentence (str): Text to clean.

    Returns:
        list: The lemmatized tokens, in their original order.

    NOTE(review): ``string.punctuation`` only contains ASCII punctuation, so
    typographic quotes and dashes are not removed by this function.
    NOTE(review): WordNet is an English lexical resource, so the lemmatizer
    presumably leaves most German tokens unchanged - confirm whether a German
    lemmatizer is intended here.
    """
    # Basic cleaning: surrounding whitespace, case, digits.
    sentence = sentence.strip().lower()
    sentence = ''.join(char for char in sentence if not char.isdigit())

    # Remove all punctuation characters in a single pass over the text
    # (instead of one full-string replace per punctuation character).
    sentence = sentence.translate(str.maketrans('', '', string.punctuation))

    tokens = word_tokenize(sentence)
    stop_words = set(stopwords.words('german'))

    # Build the lemmatizer once per call rather than once per token.
    lemmatizer = WordNetLemmatizer()

    return [
        lemmatizer.lemmatize(token, pos="v")
        for token in tokens
        if token not in stop_words
    ]

In [None]:
# Length of the sliced example text in characters (this is not a token count).
len(core_speeches)