## Processing: Lemmatization and Lowercasing

### What the Notebook Does

- **Load Context Data:**  
  Read the `context` table from the DWH database and focus on the text columns we want to normalize.

- **Normalize Text:**  
  Convert `pre_context`, `post_context`, `prefix`, and `suffix` to lowercase for consistent analysis.

- **Manual Suffix Lemmatization:**  
  Apply a suffix-to-lemma mapping, built from rule-based candidate groups that are meant to be checked manually, to normalize suffix variants into a common lemma.


In [None]:
import os
import sys
import pandas as pd

# Work out the directory this notebook/script lives in. When __file__ is not
# defined (e.g. in an interactive session), use the current working directory.
if "__file__" in globals():
    notebook_dir = os.path.dirname(os.path.abspath(__file__))
else:
    notebook_dir = os.getcwd()

# Make the helper modules in ../pylib importable.
pylib_dir = os.path.join(notebook_dir, "..", "pylib")
sys.path.append(pylib_dir)

import pandas as pd
from handle_sqlite import read_table_as_dataframe

# SQLite database file, addressed relative to the notebook directory.
data_output_dir = os.path.join(notebook_dir, "..", "data_output")
db_path = os.path.join(data_output_dir, "dwh_data.db")

### Load Context Data

We load the `context` table and focus on the text columns we want to normalize.


In [None]:
# Columns holding free text that the later cells normalize.
text_columns = ["pre_context", "post_context", "prefix", "suffix"]

context = read_table_as_dataframe("context", db_path)

# Cast each text column to pandas' nullable string dtype.
for text_column in text_columns:
    context[text_column] = context[text_column].astype("string")

context.head(5)

### Lowercase All Text Columns

This ensures consistent casing before any manual or automated normalization.


In [None]:
# Lowercase every text column in one column-wise pass.
context[text_columns] = context[text_columns].apply(
    lambda text_series: text_series.str.lower()
)

context[text_columns].head(5)

### Uniqueness and Class Shares (Before Lemmatization)

We summarize uniqueness and relative shares of suffix classes before lemmatization.


In [None]:
# Frequency of each suffix value (missing values counted as their own class).
suffix_counts_before = context['suffix'].value_counts(dropna=False)

suffix_stats_before = suffix_counts_before.rename_axis('suffix').reset_index(name='count')

# Relative share of each suffix class among all rows.
suffix_total_before = suffix_stats_before['count'].sum()
suffix_stats_before['share'] = suffix_stats_before['count'] / suffix_total_before

suffix_stats_before.head(20)


In [None]:
# One-row summary: number of rows and number of distinct suffix values
# (a missing value counts as one distinct value).
suffix_uniqueness_before = pd.DataFrame([
    {
        'total_rows': len(context),
        'unique_suffixes': context['suffix'].nunique(dropna=False),
    }
])

suffix_uniqueness_before

### Build Lemma Candidates from Prefixes and Suffixes

We generate candidate lemma groups using conservative German suffix rules.
Bases of up to three letters only allow a trailing +s, four-letter bases allow +s, +e, +n or +en,
while longer bases allow limited extra length (about 25% of the base length, rounded up).
Overlapping groups are merged so a key appearing in another group pulls the sets together.


In [None]:
import math

# Algorithm overview:
# 1) Build candidate groups by matching words that differ only by small German suffixes
#    (very short bases: only +s; 4-letter bases: +s/+e/+n/+en; longer bases: up to ~25% extra length).
# 2) Pick the shortest word as the lemma key for each group.
# 3) Merge overlapping groups so that if a variant is also a key, the groups are unified.
# This keeps pairs like 'fest'/'festen' and avoids over-broad stems like 'etf' -> 'ethik'.

def build_lemma_candidates_from_words(words, min_len=3):
    """Group words that differ only by a short trailing suffix.

    Words are stripped, lowercased and de-duplicated; missing values and
    words shorter than ``min_len`` are ignored. Two words are related when
    the shorter one is a prefix of the longer one and the remaining tail is
    small relative to the shorter word:

    * shorter word of up to 3 letters: tail must be exactly ``'s'``
    * shorter word of 4 letters: tail must be ``'s'``, ``'e'``, ``'n'`` or ``'en'``
    * longer words: tail length at most ``ceil(0.25 * len(shorter))``

    Args:
        words: Iterable (list, Series, ...) of strings; may contain missing values.
        min_len: Minimum word length to consider.

    Returns:
        dict mapping the shortest word of each group to the list of group
        members, sorted by (length, alphabetical). Words without any related
        word are omitted. A word can appear in more than one group; such
        overlaps are expected to be merged by the caller.
    """
    words = pd.Series(words, dtype='string')
    words = words.dropna().str.strip().str.lower()
    words = words[words.str.len() >= min_len]

    def by_length_then_alpha(word):
        return (len(word), word)

    unique_words = sorted(set(words.tolist()), key=by_length_then_alpha)
    assigned = set()
    candidates = {}

    def allowed_small_suffix(extra, base_len):
        if base_len <= 3:
            return extra == 's'
        if base_len == 4:
            return extra in ('s', 'e', 'n', 'en')
        # For bases of 5+ letters allow roughly 25% extra length.
        return len(extra) <= math.ceil(base_len * 0.25)

    def is_blocked_pair(base, word):
        # Prevent 'wand'/'wande' from pulling in 'wandel*' variants.
        return base in ('wand', 'wande') and word.startswith('wandel')

    for base in unique_words:
        # Words already placed in a group never start a group of their own.
        if base in assigned:
            continue
        base_len = len(base)
        group = {base}

        for word in unique_words:
            if word == base:
                continue
            if is_blocked_pair(base, word) or is_blocked_pair(word, base):
                continue
            if word.startswith(base):
                # `word` extends `base`: judge the tail against the base length.
                if allowed_small_suffix(word[base_len:], base_len):
                    group.add(word)
            elif base.startswith(word):
                # `base` extends `word`: judge the tail against the shorter word.
                if allowed_small_suffix(base[len(word):], len(word)):
                    group.add(word)

        if len(group) < 2:
            assigned.add(base)
            continue

        key = min(group, key=by_length_then_alpha)
        # Several bases can share the same shortest word. Merge into the
        # existing group instead of overwriting it, otherwise the variants
        # collected earlier under this key would be lost.
        group.update(candidates.get(key, ()))
        candidates[key] = sorted(group, key=by_length_then_alpha)
        assigned.update(group)

    return candidates

def build_lemma_candidates(prefixes, suffixes, min_len=3):
    """Build lemma candidate groups from the pooled prefix and suffix words.

    Both inputs are converted to string Series, concatenated, and handed to
    ``build_lemma_candidates_from_words`` together with ``min_len``.
    """
    string_series = [
        pd.Series(values, dtype='string')
        for values in (prefixes, suffixes)
    ]
    pooled_words = pd.concat(string_series)
    return build_lemma_candidates_from_words(pooled_words, min_len=min_len)

def merge_overlapping_candidates(candidates):
    """Merge candidate groups that share at least one word.

    Groups are connected whenever a word (key or variant) occurs in more than
    one of them; connectivity is transitive and independent of the order in
    which groups are visited. The returned groups are therefore pairwise
    disjoint, so a reverse lookup from variant to key is unambiguous.

    Args:
        candidates: dict mapping a key word to an iterable of group members.

    Returns:
        dict mapping the shortest word (ties broken alphabetically) of each
        merged group to the alphabetically sorted list of all its words.
    """
    # Union-find over every word seen in any group.
    parent = {}

    def find(word):
        root = word
        while parent[root] != root:
            root = parent[root]
        # Path compression: point every visited node straight at the root.
        while parent[word] != root:
            parent[word], word = root, parent[word]
        return root

    for key, variants in candidates.items():
        parent.setdefault(key, key)
        for variant in variants:
            parent.setdefault(variant, variant)
            key_root = find(key)
            variant_root = find(variant)
            if key_root != variant_root:
                parent[variant_root] = key_root

    # Collect the members of each connected component.
    components = {}
    for word in parent:
        components.setdefault(find(word), set()).add(word)

    merged = {}
    for members in components.values():
        all_words = sorted(members)
        merged_key = min(all_words, key=lambda w: (len(w), w))
        merged[merged_key] = all_words

    return merged

# Build raw groups from prefixes and suffixes, then merge overlapping groups.
lemma_candidates = merge_overlapping_candidates(
    build_lemma_candidates(context['prefix'], context['suffix'])
)
len(lemma_candidates)


In [None]:
# Display all candidate groups (as a plain dict copy) for manual cleanup
dict(lemma_candidates)

### Suffix Lemmatization

The dictionary maps each variant suffix to its canonical lemma; the result is written to the `suffix_lemma` column.


In [None]:
# Invert the groups: each variant becomes a lookup key whose value is the
# key (lemma) of the group it belongs to.
suffix_lemma_map = {}
for lemma, variants in lemma_candidates.items():
    for variant in variants:
        suffix_lemma_map[variant] = lemma

# Suffixes without an entry in the lookup keep their original value.
mapped_suffixes = context['suffix'].map(suffix_lemma_map)
context['suffix_lemma'] = mapped_suffixes.fillna(context['suffix'])

context[['suffix', 'suffix_lemma']].head(50)


### Uniqueness and Class Shares (After Lemmatization)

We recompute the same tables after lemmatization for easy charting.


In [None]:
# Frequency of each lemma value (missing values counted as their own class).
suffix_counts_after = context['suffix_lemma'].value_counts(dropna=False)

suffix_stats_after = suffix_counts_after.rename_axis('suffix_lemma').reset_index(name='count')

# Relative share of each lemma class among all rows.
suffix_total_after = suffix_stats_after['count'].sum()
suffix_stats_after['share'] = suffix_stats_after['count'] / suffix_total_after

suffix_stats_after.head(20)


In [None]:
# One-row summary: number of rows and number of distinct lemma values
# (a missing value counts as one distinct value).
suffix_uniqueness_after = pd.DataFrame([
    {
        'total_rows': len(context),
        'unique_suffix_lemmas': context['suffix_lemma'].nunique(dropna=False),
    }
])

suffix_uniqueness_after


If you want to reuse the normalized data later, you can save it as a CSV or store it in a new table.


In [None]:
# Write processed context to a new table in the same DB using pylib utils
from handle_sqlite import save_dataframe_to_db

# Write to a separate table name so the original 'context' table is not
# overwritten by the processed copy (which also carries 'suffix_lemma').
output_table = 'context_processed'

# if_exists='replace' is passed so that re-running this cell refreshes the
# processed table. NOTE(review): presumably this mirrors pandas' to_sql
# semantics (drop and recreate) - confirm in pylib/handle_sqlite.
save_dataframe_to_db(context, output_table, db_path, if_exists='replace')
# Report the target table and database path for the notebook log.
print(f"Wrote DataFrame to table '{output_table}' in {db_path}")