## 0. Imports & Settings
All imports are consolidated here. We also configure pandas display options to aid debugging (optional).

In [None]:
import re
from pathlib import Path

import pandas as pd

# Optional: widen pandas' display limits so long text columns stay readable
# when DataFrames are inspected interactively.
_display_settings = {
    'display.max_colwidth': 200,
    'display.width': 120,
}
for _option, _value in _display_settings.items():
    pd.set_option(_option, _value)

In [None]:
# Directory that receives the generated output files
base_path = r"../../data/"

# Input CSV containing the raw speeches (raw-string literal, forward slashes only)
file_path = r"../../data/speeches.csv"

## 2. Load Data
Reads the CSV at `file_path` into a pandas DataFrame named `df`, which the following sections build on.

In [None]:
# Read the speeches CSV located at `file_path` into the working DataFrame `df`
df = pd.read_csv(filepath_or_buffer=file_path)

# Quick sanity check (uncomment to preview the first rows)
# df.head()

## 3. Party Mapping & Initial Features
Creates a `Party` column based on `factionId` and computes a first pass of `count_words`.

In [None]:
# Lookup table: numeric factionId -> party name
party_map = {
    0: "AFD",
    3: "Greens",
    4: "CDU/CSU",
    6: "DIE LINKE.",
    13: "FDP",
    23: "SPD",
}

# Derive the 'Party' column by looking up every row's factionId in party_map
df['Party'] = df['factionId'].map(party_map)

# First-pass word count: number of whitespace-separated tokens per speech.
# Missing speeches are treated as empty text and therefore count as 0.
df['count_words'] = (
    df['speechContent']
    .fillna('')
    .astype(str)
    .str.count(r'\S+')
)

## 4. Create Analysis Dataset `df_test`
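Keeps only speeches with more than 250 words, standardizes `positionShort` for specific speakers whose position is recorded as `Not found`, and then drops rows whose position is `Presidium of Parliament`, `Not found`, or `Guest`.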

In [None]:
## create dataset used for the analysis
def _missing_position_mask(frame, first_name, last_name):
    """Return a boolean mask of rows for the given speaker whose position is 'Not found'."""
    return (
        frame['firstName'].eq(first_name)
        & frame['lastName'].eq(last_name)
        & frame['positionShort'].eq('Not found')
    )


# Keep only speeches with more than 250 words; work on a copy so `df` stays untouched
df_test = df.loc[df['count_words'].gt(250)].copy()

# Fill in the known position for speakers whose position is recorded as 'Not found'
mask_eva = _missing_position_mask(df_test, 'Eva', 'Högl')
df_test.loc[mask_eva, 'positionShort'] = 'Member of Parliament'

mask_anja = _missing_position_mask(df_test, 'Anja', 'Karliczek')
df_test.loc[mask_anja, 'positionShort'] = 'Minister'

mask_marco = _missing_position_mask(df_test, 'Marco', 'Wanderwitz')
df_test.loc[mask_marco, 'positionShort'] = 'Secretary of State'

# Drop rows whose position is excluded from the analysis
_excluded_positions = ['Presidium of Parliament', 'Not found', 'Guest']
df_test = df_test.loc[~df_test['positionShort'].isin(_excluded_positions)].copy()

## 5. Text Cleaning – Numbers in Brackets & Whitespace
Removes `({0})`, `({1})`, etc., collapses whitespace, and trims.

In [None]:
## delete bracketed numbers like ({0}), ({1}), etc. and replace multiple whitespace/newlines with single spaces
# Patterns are compiled once here instead of on every call.
_BRACKETED_NUMBER_RE = re.compile(r'\(\{[0-9]+\}\)')
_WHITESPACE_RUN_RE = re.compile(r'\s+')


def clean_speech_content(text: str) -> str:
    """Strip bracketed number placeholders and normalize whitespace.

    Args:
        text: Raw speech text. ``None`` and float NaN (the value pandas uses
            for missing text) are treated as missing; any other non-string
            value is converted with ``str()``.

    Returns:
        The text with every ``({<digits>})`` placeholder removed, each run of
        whitespace (including newlines) collapsed to a single space, and
        leading/trailing whitespace stripped. Missing input yields ``''``.
    """
    # NaN is the only float that differs from itself. Without this guard,
    # str(NaN) would inject the literal word "nan" into the corpus.
    if text is None or (isinstance(text, float) and text != text):
        return ''

    without_placeholders = _BRACKETED_NUMBER_RE.sub('', str(text))
    return _WHITESPACE_RUN_RE.sub(' ', without_placeholders).strip()

# Run the cleaner over every speech and write the result back to the same column
df_test['speechContent'] = df_test['speechContent'].map(clean_speech_content)

## 6. Remove Preambles/Closings from `speechContent`
Deletes salutations, formal addresses, and common closing thanks using a curated phrase list. Longer forms are matched before shorter ones to avoid partial-match leftovers.

In [None]:
## clean up speechContent column by deleting salutations and other non-speech content
# Fixed salutations and closing formulas, matched literally (case-insensitive).
# The order here is only for readability: the matcher sorts the phrases
# longest-first itself, so a short phrase can never pre-empt a longer one.
_LITERAL_PHRASES = (
    # Presidential addresses
    "Sehr geehrter Herr Präsident",
    "Sehr geehrte Frau Präsidentin",
    "Herr Präsident",
    "Frau Präsidentin",
    "Frau Präsident",

    # General formal addresses
    "Meine sehr verehrten Damen und Herren",
    "Meine sehr geehrten Damen und Herren",
    "Sehr geehrte Damen und Herren",
    "Sehr verehrte Damen und Herren",
    "Meine Damen und Herren",
    "Meine Herren und Damen",
    "Meine Damen! Meine Herren",
    "Meine sehr verehrten Damen",
    "Meine Damen",
    "Meine Herren",

    # Colleague addresses
    "Meine sehr geehrten Kolleginnen und Kollegen",
    "Meine sehr verehrten Kolleginnen und Kollegen",
    "Meine lieben Kolleginnen und Kollegen",
    "Meine verehrten Kolleginnen und Kollegen",
    "Liebe Kolleginnen und Kollegen",
    "Verehrte Kolleginnen und Kollegen",
    "Geehrte Kolleginnen und Kollegen",
    "Sehr geehrte Kolleginnen und Kollegen",
    "Sehr verehrte Kolleginnen und Kollegen",
    "Liebe Kolleginnen, liebe Kollegen",
    "Sehr geehrte Kollegen",

    # Short fragments that are also prefixes/suffixes of longer phrases
    "Sehr geehrte",
    "Sehr geehrter",
    "Sehr verehrte",
    "Kollegen",

    # Other addresses
    "Sehr verehrte Gäste",

    # Thanks and closing phrases
    "Ich danke für Ihre Aufmerksamkeit",
    "Ich danke für die Aufmerksamkeit",
    "Herzlichen Dank",
    "Vielen Dank",
    "Danke",
)

# Addresses to an individual colleague; `\w+` consumes the single word that follows.
_NAMED_ADDRESS_PATTERNS = (
    r"Lieber Kollege \w+",
    r"Liebe Kollegin \w+",
    r"Herr Kollege \w+",
    r"Frau Kollegin \w+",
)


def _build_phrase_regex():
    """Compile one case-insensitive regex that matches any removable phrase."""
    # Alternatives are tried left to right, so longer literals must come first;
    # otherwise "Sehr verehrte" would win over "Sehr verehrte Gäste".
    literals = sorted(_LITERAL_PHRASES, key=len, reverse=True)
    alternatives = list(_NAMED_ADDRESS_PATTERNS)
    alternatives.extend(re.escape(phrase) for phrase in literals)
    # Word boundaries prevent matches inside longer words; the tail swallows
    # optional whitespace and the punctuation that follows the phrase.
    return re.compile(
        r"\b(?:" + "|".join(alternatives) + r")\b\s*[.,!?;:]*",
        flags=re.IGNORECASE,
    )


# Built once at definition time instead of once per speech.
_PHRASE_RE = _build_phrase_regex()
_PHRASE_WHITESPACE_RE = re.compile(r"\s+")


def remove_phrases(text: str) -> str:
    """Remove salutations, formal addresses and closing thanks from a speech.

    Every occurrence of a listed phrase is deleted, matched case-insensitively
    and as whole words, together with any whitespace and ``.,!?;:`` characters
    that directly follow it. This also applies to short entries such as
    "Kollegen" and "Danke" wherever they occur as whole words.

    Args:
        text: Speech text. ``None`` and float NaN are treated as missing; any
            other non-string value is converted with ``str()``.

    Returns:
        The text without the listed phrases, with whitespace runs collapsed to
        single spaces and the ends stripped. Missing input yields ``''``.
    """
    # NaN is the only float that differs from itself; str(NaN) would be "nan".
    if text is None or (isinstance(text, float) and text != text):
        return ''

    # Longest-first matching leaves no partial-phrase leftovers, so there is
    # deliberately no blanket cleanup of "sehr" or of single letters: such a
    # cleanup would delete real speech content ("sehr wichtig", "A 7", "gibt's").
    stripped = _PHRASE_RE.sub('', str(text))
    return _PHRASE_WHITESPACE_RE.sub(' ', stripped).strip()

# Strip salutations and closing formulas from every speech, then report completion
df_test['speechContent'] = df_test['speechContent'].map(remove_phrases)
print("Updated phrase removal completed!")

## 7. Recompute Word Counts
Rebuilds `count_words` on the cleaned `speechContent`.

In [None]:
# Recount whitespace-separated tokens now that the speeches have been cleaned
df_test['count_words'] = (
    df_test['speechContent']
    .fillna('')
    .astype(str)
    .str.count(r'\S+')
)

## 8. Split by Electoral Term
Creates a separate dataset for each unique `electoralTerm` and exposes each one in the notebook's global scope
under the name `electoralTerm_<term>` (e.g., `electoralTerm_19`).

In [None]:
# Distinct electoral terms present in the analysis data, in ascending order
electoral_terms = sorted(df_test['electoralTerm'].unique())

# Publish one DataFrame per term as a notebook-level variable named electoralTerm_<term>
for term in electoral_terms:
    dataset_name = f'electoralTerm_{term}'
    _term_rows = df_test[df_test['electoralTerm'] == term]
    globals()[dataset_name] = _term_rows
    print(f"{dataset_name}: {len(_term_rows)} entries")

## 9. Save Outputs
Writes the cleaned master dataset and each electoral-term-specific dataset to CSV.
Paths are parameterized via `base_path`. 

In [None]:
## Save main dataset
# Join file names onto the output directory with pathlib so the separator is
# correct on every OS. A hard-coded backslash would, on POSIX systems, end up
# inside the file name (a file literally called "\df_test_cleaned.csv").
output_dir = Path(base_path)
df_test.to_csv(output_dir / "df_test_cleaned.csv", index=False)

## Save electoral term datasets
for term in sorted(df_test['electoralTerm'].unique()):
    dataset_name = f'electoralTerm_{term}'
    # Only write datasets that were actually published to the notebook scope
    if dataset_name in globals():
        globals()[dataset_name].to_csv(output_dir / f"{dataset_name}.csv", index=False)
        print(f"Saved {dataset_name}.csv")