# Label Quality Check: Embedding-basierte Analyse
## UMAP Visualisierung + Outlier Detection pro Klasse

Dieses Notebook analysiert die Qualität der hand-gelabelten Daten:
1. Artikel mit Sentence-Transformer embedden
2. UMAP 2D-Projektion → Klassen-Overlap sichtbar machen
3. Outlier Detection (Centroid Distance + LOF) → verdächtige Samples finden
4. Interaktive Inspektion der Outlier

In [None]:
# === SETUP ===
import subprocess, sys

def _install(pkg):
    """Install ``pkg`` quietly with pip, using the interpreter running this code.

    ``subprocess.check_call`` raises ``CalledProcessError`` when pip exits with
    a non-zero status.
    """
    pip_command = [sys.executable, "-m", "pip", "install", "-q", pkg]
    subprocess.check_call(pip_command)

# Map each pip distribution name to the module name it is imported under.
# The two differ for several packages ("umap-learn" provides "umap",
# "scikit-learn" provides "sklearn"), so deriving the module name from the
# distribution name makes the import probe fail and re-runs pip on every
# execution. "pandas" is listed because it is imported below as well.
_REQUIRED_PACKAGES = {
    "sentence-transformers": "sentence_transformers",
    "umap-learn": "umap",
    "scikit-learn": "sklearn",
    "scipy": "scipy",
    "matplotlib": "matplotlib",
    "pandas": "pandas",
}

for pkg, module_name in _REQUIRED_PACKAGES.items():
    try:
        # Probe by importing; only a missing module triggers an install.
        __import__(module_name)
    except ImportError:
        print(f"Installing {pkg}...")
        _install(pkg)

import os
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from scipy.spatial.distance import cdist
from sentence_transformers import SentenceTransformer
from sklearn.neighbors import LocalOutlierFactor
from umap import UMAP

print("Imports OK.")

In [None]:
# === CONFIGURATION ===
# Adjust the parameters here.

# Embedding model (multilingual, works well for German)
MODEL_NAME = "sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2"
BATCH_SIZE = 64

# Outlier detection
CENTROID_PERCENTILE = 95  # samples above this percentile count as outliers
LOF_NEIGHBORS = 10        # k for the Local Outlier Factor

# UMAP
UMAP_NEIGHBORS, UMAP_MIN_DIST = 15, 0.1
RANDOM_SEED = 42

# Paths, all derived from the current working directory
BASE_DIR = Path.cwd().resolve()
DATA_DIR = BASE_DIR.joinpath("..", "..", "data", "articles")
LABELED_CSV = DATA_DIR.joinpath("cleaned_articles_labeled.csv")
OUTPUT_DIR = BASE_DIR.joinpath("label_quality_output")
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)  # created eagerly, no error if present

SKIP_LABELS = set(("skipped", "not_clean"))

print(
    f"Labeled CSV:  {LABELED_CSV}",
    f"Output:       {OUTPUT_DIR}",
    f"Modell:       {MODEL_NAME}",
    sep="\n",
)

In [None]:
# === LOAD DATA ===
df = pd.read_csv(LABELED_CSV, encoding="utf-8")

# Keep only rows with a usable label. Besides the explicitly skipped values,
# rows with a missing label (read as NaN) are dropped: NaN is a float, and
# mixing it with string labels makes later np.unique(labels) / sorted(...)
# calls raise a TypeError.
has_usable_label = df["label"].notna() & ~df["label"].isin(SKIP_LABELS)
df = df[has_usable_label].reset_index(drop=True)

print(f"Geladen: {len(df)} gelabelte Artikel, {df['label'].nunique()} Klassen\n")
print("Klassenverteilung:")
print(df["label"].value_counts().to_string())

In [None]:
# === COMPUTE EMBEDDINGS ===
print(f"Lade Modell: {MODEL_NAME}")
model = SentenceTransformer(MODEL_NAME)

# Combine headline and body text (the headline carries a strong topic signal);
# missing values are replaced by empty strings before concatenation.
headlines = df["headline"].fillna("")
bodies = df["text"].fillna("")
texts = (headlines + " \u2014 " + bodies).tolist()

print(f"Encode {len(texts)} Artikel (batch_size={BATCH_SIZE})...")
encode_options = {
    "batch_size": BATCH_SIZE,
    "show_progress_bar": True,
    "normalize_embeddings": True,
}
embeddings = model.encode(texts, **encode_options)
print(f"Embedding Shape: {embeddings.shape}")

In [None]:
# === UMAP PROJECTION ===
print("UMAP laeuft...")
umap_params = {
    "n_neighbors": UMAP_NEIGHBORS,
    "min_dist": UMAP_MIN_DIST,
    "n_components": 2,
    "metric": "cosine",
    "random_state": RANDOM_SEED,
}
reducer = UMAP(**umap_params)
coords = reducer.fit_transform(embeddings)

# Store both projected axes next to the articles for later plots and export.
df["umap_x"], df["umap_y"] = coords[:, 0], coords[:, 1]
print(f"UMAP fertig — Shape: {coords.shape}")

In [None]:
# === OUTLIER DETECTION: CENTROID DISTANCE ===
# For every class: measure each sample's cosine distance to the class centroid
# and flag samples beyond the class's own CENTROID_PERCENTILE threshold.
labels = df["label"].values
n_samples = len(labels)
is_centroid_outlier = np.zeros(n_samples, dtype=bool)
centroid_dists = np.zeros(n_samples)

for lbl in np.unique(labels):
    in_class = labels == lbl
    class_emb = embeddings[in_class]

    # Mean vector of the class, rescaled to unit length
    # (the epsilon guards against division by zero).
    centroid = class_emb.mean(axis=0, keepdims=True)
    centroid /= np.linalg.norm(centroid) + 1e-10

    # Cosine distance of every class member to the centroid
    dists = cdist(class_emb, centroid, metric="cosine").ravel()
    centroid_dists[in_class] = dists

    # Strictly greater than the percentile value counts as an outlier.
    threshold = np.percentile(dists, CENTROID_PERCENTILE)
    is_centroid_outlier[in_class] = dists > threshold

df["centroid_dist"] = centroid_dists
df["centroid_outlier"] = is_centroid_outlier

print(f"Centroid Outlier ({CENTROID_PERCENTILE}. Perzentil): {is_centroid_outlier.sum()} Artikel geflaggt")

In [None]:
# === OUTLIER DETECTION: LOCAL OUTLIER FACTOR ===
# LOF is fitted separately on the embeddings of each class.
is_lof_outlier = np.zeros(len(labels), dtype=bool)

for lbl in np.unique(labels):
    in_class = labels == lbl
    class_emb = embeddings[in_class]

    # The neighbour count cannot exceed the number of other class members;
    # classes that leave fewer than two neighbours are not evaluated.
    k = min(LOF_NEIGHBORS, len(class_emb) - 1)
    if k >= 2:
        lof = LocalOutlierFactor(n_neighbors=k, metric="cosine", contamination="auto")
        preds = lof.fit_predict(class_emb)  # -1 = outlier, 1 = inlier
        is_lof_outlier[in_class] = preds == -1

df["lof_outlier"] = is_lof_outlier

print(f"LOF Outlier (k={LOF_NEIGHBORS}): {is_lof_outlier.sum()} Artikel geflaggt")
print(f"Beide Methoden: {(is_centroid_outlier & is_lof_outlier).sum()} Artikel")

In [None]:
# === PLOT: UMAP OVERVIEW ===
# One scatter layer per class so that every class gets its own colour and
# legend entry.
unique_labels = np.unique(labels)
# `matplotlib.cm.get_cmap` was deprecated in Matplotlib 3.7 and removed in 3.9;
# `pyplot.get_cmap` accepts the same (name, lut) arguments.
cmap = plt.get_cmap("tab20", len(unique_labels))
color_map = {lbl: cmap(i) for i, lbl in enumerate(unique_labels)}

fig, ax = plt.subplots(figsize=(16, 12))
for lbl in unique_labels:
    mask = labels == lbl
    # A single RGBA colour is wrapped in a list so it is not mistaken for
    # a sequence of scalar values to be colour-mapped.
    ax.scatter(coords[mask, 0], coords[mask, 1], c=[color_map[lbl]], label=lbl, s=20, alpha=0.6)

# Legend is placed outside the axes on the right.
ax.legend(bbox_to_anchor=(1.02, 1), loc="upper left", fontsize=8, markerscale=2)
ax.set_title("UMAP — Alle gelabelten Artikel nach Klasse", fontsize=14)
ax.set_xlabel("UMAP 1")
ax.set_ylabel("UMAP 2")
plt.tight_layout()
fig.savefig(OUTPUT_DIR / "umap_overview.png", dpi=150, bbox_inches="tight")
plt.show()

In [None]:
# === PLOT: PER-KLASSE OUTLIER (Centroid + LOF) ===

def plot_class_outliers(coords, labels, outlier_mask, method_name):
    """Draw one panel per class with that class's flagged samples highlighted.

    Every panel shows the samples of all other classes as a faint gray
    background, the class's unflagged samples in blue and its flagged samples
    in red. The figure is saved into ``OUTPUT_DIR`` as
    ``outliers_per_class_<method_name lowercased, spaces -> underscores>.png``
    and then displayed.

    Args:
        coords: Per-sample 2-D coordinates; columns 0 and 1 are plotted.
        labels: Per-sample class labels, aligned with ``coords``.
        outlier_mask: Per-sample outlier flags, aligned with ``coords``.
            Coerced to a boolean array.
        method_name: Detection method name, used in the figure title and in
            the output file name.

    Returns:
        None. Returns immediately without plotting when ``labels`` is empty.
    """
    coords = np.asarray(coords)
    labels = np.asarray(labels)
    # Force a real boolean mask: on a 0/1 integer array `~` is a bitwise NOT
    # (giving -1/-2) and would silently select the wrong rows below.
    outlier_mask = np.asarray(outlier_mask, dtype=bool)

    unique_labels = sorted(np.unique(labels))
    n_classes = len(unique_labels)
    if n_classes == 0:
        # Nothing to draw; a grid with zero rows cannot be created.
        return

    cols = 4
    rows = (n_classes + cols - 1) // cols  # ceiling division

    # squeeze=False always yields a 2-D array of axes, whatever the grid shape.
    fig, axes = plt.subplots(rows, cols, figsize=(5 * cols, 4.5 * rows), squeeze=False)
    axes = axes.ravel()

    for ax, lbl in zip(axes, unique_labels):
        mask = labels == lbl
        class_coords = coords[mask]
        class_outlier = outlier_mask[mask]

        # Background: every sample that does not belong to this class.
        ax.scatter(coords[~mask, 0], coords[~mask, 1], c="lightgray", s=5, alpha=0.15)

        inlier = ~class_outlier
        ax.scatter(class_coords[inlier, 0], class_coords[inlier, 1],
                   c="steelblue", s=18, alpha=0.6, label="Inlier")

        n_out = class_outlier.sum()
        ax.scatter(class_coords[class_outlier, 0], class_coords[class_outlier, 1],
                   c="red", s=40, alpha=0.9, edgecolors="darkred", linewidths=0.5,
                   label=f"Outlier ({n_out})")

        ax.set_title(f"{lbl}", fontsize=10, fontweight="bold")
        ax.legend(fontsize=7, loc="upper right")
        ax.set_xticks([])
        ax.set_yticks([])

    # Hide the unused trailing cells of the grid.
    for ax in axes[n_classes:]:
        ax.set_visible(False)

    fig.suptitle(f"Per-Klasse Outlier ({method_name})", fontsize=14, y=1.01)
    plt.tight_layout()
    fig.savefig(OUTPUT_DIR / f"outliers_per_class_{method_name.lower().replace(' ', '_')}.png",
                dpi=150, bbox_inches="tight")
    plt.show()


# Render the per-class figure once for each detection method.
for banner, flag_column, method in (
    ("--- Centroid Distance Outlier ---", "centroid_outlier", "Centroid Distance"),
    ("\n--- LOF Outlier ---", "lof_outlier", "LOF"),
):
    print(banner)
    plot_class_outliers(coords, labels, df[flag_column].values, method)

In [None]:
# === PLOT: DISTANCE BOXPLOT PER CLASS ===
# Long-format table: one row per article, grouped class by class in sorted
# label order.
dist_df = pd.DataFrame([
    {"label": lbl, "cosine_dist": d}
    for lbl in sorted(np.unique(labels))
    for d in centroid_dists[labels == lbl]
])

fig, ax = plt.subplots(figsize=(14, 6))
dist_df.boxplot(column="cosine_dist", by="label", ax=ax, rot=45, grid=False)

# Set title and axis labels after boxplot() has drawn onto the axes.
threshold_note = f"Schwelle: {CENTROID_PERCENTILE}. Perzentil"
ax.set_title(f"Kosinus-Distanz zum Klassen-Zentroid ({threshold_note})", fontsize=12)
ax.set_ylabel("Cosine Distance")
ax.set_xlabel("")
fig.suptitle("")  # blank out the figure-level title

plt.tight_layout()
fig.savefig(OUTPUT_DIR / "centroid_distance_boxplot.png", dpi=150, bbox_inches="tight")
plt.show()

In [None]:
# === OUTLIER SUMMARY ===
# Per-class counts of flagged articles, followed by the overall totals.
rule = "=" * 75
print(rule)
print("OUTLIER SUMMARY")
print(rule)
print(f"{'Klasse':30s}  {'Total':>5s}  {'Centroid':>8s}  {'LOF':>5s}  {'Beide':>5s}")
print("-" * 75)

by_centroid = df["centroid_outlier"]
by_lof = df["lof_outlier"]
by_both = by_centroid & by_lof

for lbl in sorted(df["label"].unique()):
    in_class = df["label"] == lbl
    n_total = in_class.sum()
    n_centroid = by_centroid[in_class].sum()
    n_lof = by_lof[in_class].sum()
    n_both = by_both[in_class].sum()
    print(f"  {lbl:30s}  {n_total:4d}     {n_centroid:3d}       {n_lof:3d}    {n_both:3d}")

n_flagged = (by_centroid | by_lof).sum()
n_both_total = by_both.sum()
print(f"\nTotal geflaggt (mind. 1 Methode): {n_flagged}")
print(f"Geflaggt von BEIDEN Methoden:     {n_both_total}  <-- die verdaechtigsten")

In [None]:
# === SAVE OUTLIER REPORT ===
# Every article flagged by at least one method, ordered by class and then by
# descending centroid distance.
flagged_by_any = df["centroid_outlier"] | df["lof_outlier"]
outlier_df = df[flagged_by_any].copy()
outlier_df = outlier_df.sort_values(["label", "centroid_dist"], ascending=[True, False])
outlier_df["text_preview"] = outlier_df["text"].str.slice(stop=200) + "..."

# Keep only those report columns that are actually present in the data.
wanted_cols = ["id", "label", "headline", "text_preview", "domain",
               "centroid_dist", "centroid_outlier", "lof_outlier"]
report_cols = [col for col in wanted_cols if col in outlier_df.columns]
report = outlier_df[report_cols]

report_path = OUTPUT_DIR / "outlier_report.csv"
report.to_csv(report_path, index=False, encoding="utf-8")
print(f"Outlier Report gespeichert: {report_path}  ({len(report)} Artikel)")

# Complete DataFrame including all scores
full_path = OUTPUT_DIR / "labeled_with_quality_scores.csv"
df.to_csv(full_path, index=False, encoding="utf-8")
print(f"Vollstaendiger DataFrame:   {full_path}")

In [None]:
# === INTERACTIVE INSPECTION ===
# Lists the most suspicious articles (flagged by BOTH methods), largest
# centroid distance first.

both_outliers = df[df["centroid_outlier"] & df["lof_outlier"]].copy()
both_outliers = both_outliers.sort_values("centroid_dist", ascending=False)

print(f"{len(both_outliers)} Artikel von BEIDEN Methoden geflaggt:\n")

for i, (_, row) in enumerate(both_outliers.iterrows(), start=1):
    print(f"{'='*70}")
    print(f"[{i}/{len(both_outliers)}]  Label: {row['label']}  |  Centroid-Dist: {row['centroid_dist']:.4f}")
    # "id" and "domain" are only shown for reference and are not used by the
    # analysis; fall back to "N/A" instead of raising KeyError when the CSV
    # lacks one of them.
    print(f"ID: {row.get('id', 'N/A')}  |  Domain: {row.get('domain', 'N/A')}")
    print(f"Headline: {row['headline']}")
    # str() keeps the slice safe when the text cell is not a string (e.g. NaN).
    print(f"Text (200 Zeichen): {str(row['text'])[:200]}...")
    print()

In [None]:
# === INSPECT A SINGLE CLASS ===
# Change the class here to list the outliers of one particular class:

INSPECT_CLASS = "Andere"  # <-- change here

in_class = df["label"] == INSPECT_CLASS
flagged_by_any = df["centroid_outlier"] | df["lof_outlier"]
class_outliers = df[in_class & flagged_by_any].sort_values("centroid_dist", ascending=False)

print(f"Outlier in Klasse '{INSPECT_CLASS}': {len(class_outliers)}\n")

# Flag column -> short method name shown in the listing
method_names = (("centroid_outlier", "Centroid"), ("lof_outlier", "LOF"))

for rank, (_, row) in enumerate(class_outliers.iterrows(), start=1):
    methods = [name for column, name in method_names if row[column]]
    print(f"[{rank}] Dist={row['centroid_dist']:.4f}  Methode: {'+'.join(methods)}")
    print(f"    Headline: {row['headline']}")
    print(f"    Text: {str(row['text'])[:150]}...")
    print()