--- WiDiD: Incremental Word Sense Discovery for Parliamentary Speeches ---
Term 27, Year 1–2

In [31]:
# ## Imports
%pip install "elasticsearch==8.6.2" sentence-transformers scikit-learn pandas matplotlib
from elasticsearch import Elasticsearch
from sentence_transformers import SentenceTransformer
from sklearn.cluster import AffinityPropagation
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.manifold import TSNE
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import re, os



In [None]:
# ## Configuration
# Elasticsearch index that every query in this notebook targets.
INDEX_NAME = "parliament_speeches"
ES_URL = "https://assure-hammer-flooring-appreciate.trycloudflare.com"   # adjust if different
# Lower-case Turkish stems to analyse; extract_contexts() matches every token that
# starts with one of them. The Turkish letters (ı, ş, ö) must be real UTF-8
# characters: a mis-encoded literal can never match the speech text.
TARGET_WORDS = ["iklim","anaokulu","tatil","alışveriş","döviz","emekli","bayram","ayasofya","zam"]
# Cluster cap applied to the first slice of a word that yields clusters.
BASELINE_MAX_CLUSTERS = 30
# Cluster cap for all later slices; also the size of the global ID / colour pool.
MAX_CLUSTERS = 50
# Minimum cosine similarity for a slice-local cluster to reuse an existing global ID.
SIMILARITY_THRESHOLD = 0.7
# NOTE(review): not referenced by any cell visible in this notebook — confirm it is still needed.
TOP_K_CLUSTERS = 3  # Track top-3 clusters per year
OUTPUT_DIR = "./lorenz_results"
os.makedirs(OUTPUT_DIR, exist_ok=True)

# Create subdirectory for each word (for API compatibility)
# The API expects: src/widid_results/{word}/tsne_{word}.csv
WIDID_RESULTS_DIR = "./widid_results"
os.makedirs(WIDID_RESULTS_DIR, exist_ok=True)

In [33]:
# ## Connect to Elasticsearch
es = Elasticsearch(ES_URL)
# Ask the server for its info first, so the success message below is only printed
# once a request has actually succeeded (an unreachable URL raises here instead).
es_version = es.info().body["version"]["number"]
print("Connected to Elasticsearch ")
print(es_version)

Connected to Elasticsearch 
8.6.1


## Helper Functions

In [None]:
def fetch_speeches(term, year, size=10000):
    """Fetch speeches for a specific term and year with metadata.

    Runs one search against ``INDEX_NAME`` through the module-level ``es``
    client, filtering on exact ``term`` and ``year`` values.

    Args:
        term: Value matched against the ``term`` field.
        year: Value matched against the ``year`` field.
        size: Maximum number of hits requested in the single search call.

    Returns:
        A list of dicts with the keys ``content``, ``term``, ``year``,
        ``file`` and ``session_date``. Documents that carry no ``content``
        are skipped, because every consumer tokenises that field.
    """
    query = {
        "size": size,
        "_source": ["content", "term", "year", "file", "session_date"],
        "query": {
            "bool": {
                "must": [
                    {"term": {"term": term}},
                    {"term": {"year": year}}
                ]
            }
        }
    }
    res = es.search(index=INDEX_NAME, body=query)
    hits = res["hits"]["hits"]

    # Only one page is fetched, so make truncation visible instead of silently
    # analysing a subset. "total" is a {"value", "relation"} mapping; a bare
    # integer is tolerated as well.
    total = res["hits"].get("total") or {}
    if isinstance(total, dict):
        total_value = total.get("value", 0)
        total_relation = total.get("relation", "eq")
    else:
        total_value, total_relation = total, "eq"
    if total_relation == "gte" or total_value > len(hits):
        print(
            f"  Warning: term {term}, year {year} matched more speeches than were "
            f"returned ({len(hits)} fetched, size={size}); results are truncated."
        )

    speeches = []
    for hit in hits:
        source = hit.get("_source") or {}
        content = source.get("content")
        if content is None:
            # Nothing to tokenise; the original lookup would have raised KeyError here.
            continue
        speeches.append({
            "content": content,
            "term": source.get("term"),
            "year": source.get("year"),
            "file": source.get("file"),
            "session_date": source.get("session_date")
        })
    return speeches

In [None]:
def discover_term_year_combinations(es, index_name):
    """List every (term, year) pair present in an Elasticsearch index.

    A single zero-hit search with a ``term`` terms-aggregation (up to 100
    buckets) and a nested ``year`` terms-aggregation (up to 10 buckets per
    term) is issued; the bucket keys are paired up and returned.

    Args:
        es: Client object exposing ``search(index=..., body=...)``.
        index_name: Name of the index to aggregate over.

    Returns:
        A list of ``(term, year)`` tuples sorted by term, then year. Any
        exception is reported with ``print`` and an empty list is returned.
    """
    def _ascending_terms_agg(field, bucket_limit):
        # Terms aggregation on `field`, buckets ordered by ascending key.
        return {
            "terms": {
                "field": field,
                "size": bucket_limit,
                "order": {"_key": "asc"},
            }
        }

    term_agg = _ascending_terms_agg("term", 100)
    term_agg["aggs"] = {"by_year": _ascending_terms_agg("year", 10)}
    request_body = {"size": 0, "aggs": {"by_term": term_agg}}

    try:
        response = es.search(index=index_name, body=request_body)
        pairs = [
            (term_bucket["key"], year_bucket["key"])
            for term_bucket in response["aggregations"]["by_term"]["buckets"]
            for year_bucket in term_bucket["by_year"]["buckets"]
        ]
        # Order by term first, year second.
        return sorted(pairs, key=lambda pair: (pair[0], pair[1]))
    except Exception as e:
        print(f"Error discovering term-year combinations: {e}")
        return []

# Discover the available slices once; the main loop iterates over this list.
TERM_YEAR_TUPLES = discover_term_year_combinations(es, INDEX_NAME)
if TERM_YEAR_TUPLES:
    print(f"Discovered {len(TERM_YEAR_TUPLES)} term-year pairs from Elasticsearch")
    print(f"  Range: {TERM_YEAR_TUPLES[0]} to {TERM_YEAR_TUPLES[-1]}")
    print(f"  Terms: {sorted({term for term, _ in TERM_YEAR_TUPLES})}")
else:
    # An empty list also covers a failed query: the discovery helper returns [] on error.
    print("⚠️  No term-year combinations found in Elasticsearch!")

Processing 55 term-year pairs from (17, 1) to (27, 5)


In [None]:
def _lower_keep_dotted_i(text):
    """Lower-case text, mapping 'İ' (U+0130) to a plain 'i' first."""
    # str.lower() turns 'İ' into 'i' + U+0307 (combining dot above). The combining
    # mark is not matched by \w, so a word such as "İklim" would be split into
    # "i" and "klim" by the tokenizer and never match its target.
    return text.replace("\u0130", "i").lower()


def extract_contexts(speeches, target_word, window=10):
    """Extract short context windows around target word with metadata.

    Each speech is lower-cased and split into ``\\w+`` tokens. Every token that
    starts with ``target_word`` (the bare word or the word plus a suffix)
    produces one context made of up to ``window`` tokens on either side.

    Args:
        speeches: Iterable of dicts with a ``content`` string and optional
            ``term``, ``year``, ``file`` and ``session_date`` entries.
        target_word: Word stem to look for; matching is case-insensitive.
        window: Number of tokens kept before and after each occurrence.

    Returns:
        A list of dicts with the keys ``context`` (space-joined lower-case
        tokens), ``term``, ``year``, ``file`` and ``session_date``. Speeches
        whose ``content`` is missing or empty contribute nothing.
    """
    contexts = []
    # Create regex pattern to match target word and any Turkish suffixes
    pattern = re.compile(rf"\b{re.escape(_lower_keep_dotted_i(target_word))}\w*\b")

    for speech in speeches:
        text = speech.get("content")
        if not text:
            # No text to tokenise (None or empty string).
            continue
        term = speech.get("term")
        year = speech.get("year")
        file = speech.get("file")
        session_date = speech.get("session_date")

        # simple tokenization, one or more word characters
        tokens = re.findall(r"\w+", _lower_keep_dotted_i(text))
        for i, tok in enumerate(tokens):
            # match() anchors at the token start, so this is a prefix test on the token
            if pattern.match(tok):
                start = max(0, i - window)
                end = min(len(tokens), i + window + 1)
                contexts.append({
                    "context": " ".join(tokens[start:end]),
                    "term": term,
                    "year": year,
                    "file": file,
                    "session_date": session_date
                })
    return contexts

In [37]:
def compute_embeddings(model, contexts):
    """Encode context snippets into embedding vectors.

    Args:
        model: Object providing ``encode(texts, show_progress_bar=...)`` and
            ``get_sentence_embedding_dimension()``.
        contexts: Sequence of context strings.

    Returns:
        Whatever ``model.encode`` returns for a non-empty input; for an empty
        input, an empty array of shape ``(0, embedding_dimension)`` so callers
        still receive a 2-D result.
    """
    if len(contexts) > 0:
        return model.encode(contexts, show_progress_bar=True)
    embedding_dim = model.get_sentence_embedding_dimension()
    return np.empty((0, embedding_dim))

In [38]:
def get_cluster_prototypes(X, labels, return_label_ids=False):
    """Average the rows of ``X`` per cluster label.

    Args:
        X: Array of row vectors, indexable with a boolean mask.
        labels: Array with one cluster label per row of ``X``; ``-1`` marks
            rows that belong to no cluster and is ignored.
        return_label_ids: When true, also return the labels the centroids
            belong to, in the same order.

    Returns:
        An array with one centroid per remaining label (ascending label
        order), or ``(centroids, label_ids)`` when ``return_label_ids`` is set.
    """
    centroids = []
    kept_ids = []
    for cluster_id in np.unique(labels):
        if cluster_id != -1:
            rows = X[labels == cluster_id]
            if len(rows) > 0:
                centroids.append(np.mean(rows, axis=0))
                kept_ids.append(cluster_id)
    centroid_matrix = np.array(centroids)
    return (centroid_matrix, kept_ids) if return_label_ids else centroid_matrix

In [39]:
def show_top_contexts(contexts, labels, n=3):
    """Print representative contexts for each cluster.

    Args:
        contexts: Sequence of context strings.
        labels: Cluster label for each context, same length as ``contexts``;
            ``-1`` is shown as ``overflow/filtered``.
        n: Maximum number of examples printed per cluster.

    Each example is cut to 200 characters (with a trailing ``...`` when cut)
    and newlines inside it are replaced by spaces.
    """
    df = pd.DataFrame({"cluster": labels, "context": contexts})
    grouped = df.groupby("cluster")["context"].apply(list)
    for cluster, examples in grouped.items():
        cluster_name = "overflow/filtered" if cluster == -1 else cluster
        # Header and bullet glyphs were mis-encoded; restored to the intended characters.
        print(f"\n🌀 Cluster {cluster_name} ({len(examples)} examples):")
        for ex in examples[:n]:
            print("   •", ex[:200].replace("\n", " ") + ("..." if len(ex) > 200 else ""))

In [40]:
def limit_clusters(labels, max_clusters):
    """Keep only the largest max_clusters and map the rest to -1.

    Args:
        labels: Cluster label per item; ``-1`` means "no cluster" and is never
            counted as a cluster of its own.
        max_clusters: Number of clusters to keep, or ``None`` for no limit.

    Returns:
        ``(filtered_labels, kept)``: ``filtered_labels`` has every label that
        was not kept replaced by ``-1``; ``kept`` lists the surviving labels,
        largest cluster first (equally sized clusters in ascending label
        order). With ``max_clusters=None`` the labels are returned untouched
        and ``kept`` holds every label except ``-1``.
    """
    if max_clusters is None:
        # -1 is the "no cluster" marker, so it is not reported as a kept cluster
        # (consistent with the capped path below).
        return labels, [label for label in np.unique(labels).tolist() if label != -1]
    labels = np.asarray(labels)
    unique, counts = np.unique(labels, return_counts=True)
    # sorted() is stable, so ties keep np.unique's ascending label order.
    ranked = sorted(
        ((label, count) for label, count in zip(unique, counts) if label != -1),
        key=lambda item: item[1],
        reverse=True,
    )
    keep = [label for label, _ in ranked[:max_clusters]]
    if not keep:
        return np.full_like(labels, -1), []
    # Vectorised membership test instead of a per-element list lookup.
    filtered = np.where(np.isin(labels, keep), labels, -1).astype(labels.dtype, copy=False)
    return filtered, keep


class ClusterAligner:
    """Keeps global cluster IDs and assigns consistent colors over time.

    Every centroid passed to :meth:`align` is compared with the centroids that
    already own a global ID. If the best cosine similarity reaches
    ``similarity_threshold`` the existing ID is reused; otherwise a new ID is
    issued until ``max_clusters`` IDs exist, after which unmatched clusters
    are labelled ``-1``. Stored centroids are never updated after creation.
    """

    def __init__(self, max_clusters=100, similarity_threshold=0.8, cmap_name="gist_ncar"):
        """
        Args:
            max_clusters: Upper bound on the number of global IDs; also the
                number of colours requested from the colormap.
            similarity_threshold: Minimum cosine similarity for reusing an
                existing global ID.
            cmap_name: Name of the Matplotlib colormap the palette is taken from.
        """
        self.max_clusters = max_clusters
        self.similarity_threshold = similarity_threshold
        self.centroids = []   # one (1, dim) array per global ID, index-aligned with global_ids
        self.global_ids = []
        # matplotlib.cm.get_cmap was removed in Matplotlib 3.9;
        # pyplot.get_cmap(name, lut) provides the same resampled colormap.
        self.cmap = plt.get_cmap(cmap_name, max_clusters)
        self.palette = [self.cmap(i) for i in range(self.cmap.N)]
        self.overflow_color = (0.65, 0.65, 0.65, 1.0)  # grey for -1 / out-of-palette labels

    def _add_centroid(self, centroid):
        """Register a new global cluster; return its ID, or -1 when the pool is full."""
        if len(self.global_ids) >= self.max_clusters:
            return -1
        new_id = len(self.global_ids)
        self.centroids.append(centroid)
        self.global_ids.append(new_id)
        return new_id

    def _match_or_create(self, centroid):
        """Return the global ID of the most similar known centroid, or create one."""
        centroid = centroid.reshape(1, -1)
        if not self.centroids:
            return self._add_centroid(centroid)
        stacked = np.vstack(self.centroids)
        sims = cosine_similarity(stacked, centroid)[:, 0]
        best_idx = int(np.argmax(sims))
        if sims[best_idx] >= self.similarity_threshold:
            return self.global_ids[best_idx]
        return self._add_centroid(centroid)

    def align(self, raw_labels, centroid_map):
        """Translate slice-local labels into global IDs.

        Args:
            raw_labels: Array of local cluster labels.
            centroid_map: Mapping ``local_label -> centroid vector``.

        Returns:
            Array shaped like ``raw_labels``; items whose local label is not in
            ``centroid_map``, or whose cluster got no global ID, are ``-1``.
            Centroids are processed in mapping order, so a global ID created
            for one local cluster can be matched by a later one in the same call.
        """
        aligned = np.full_like(raw_labels, -1)
        for local_label, centroid in centroid_map.items():
            global_id = self._match_or_create(centroid)
            if global_id == -1:
                continue
            aligned[raw_labels == local_label] = global_id
        return aligned

    def colors_for(self, labels):
        """Return one palette colour per label (overflow colour when out of range)."""
        return [self.get_color(label) for label in labels]

    def get_color(self, label):
        """Get color for a single label."""
        if 0 <= label < len(self.palette):
            return self.palette[label]
        return self.overflow_color


def plot_tsne_with_coords(term, year, word, tsne_coords, aligned_labels, aligner):
    """Draw and save the t-SNE scatter plot of one term-year slice.

    Args:
        term, year: Slice identifiers, used in the title and the file name.
        word: Target word, used in the title and the file name.
        tsne_coords: ``(n, 2)`` coordinates of the slice.
        aligned_labels: Global cluster ID per point; ``-1`` marks filtered points.
        aligner: Object providing ``get_color(label)`` and ``overflow_color``.

    Points are drawn with their cluster number as marker, filtered points as
    faint crosses. The figure is written to
    ``OUTPUT_DIR/tsne_term{term}_year{year}_{word}.png``, shown, and then
    closed so repeated calls in a loop do not accumulate open figures.
    Nothing is drawn when fewer than two points are given.
    """
    if len(tsne_coords) < 2:
        print("  Skipping t-SNE (insufficient embeddings).")
        return

    # Boolean masks below need arrays; plain lists would compare as a whole.
    tsne_coords = np.asarray(tsne_coords)
    aligned_labels = np.asarray(aligned_labels)

    fig, ax = plt.subplots(figsize=(12, 10))
    try:
        # Plot by cluster using numeric markers (memory efficient)
        for cluster_id in np.unique(aligned_labels[aligned_labels >= 0]):
            cluster_coords = tsne_coords[aligned_labels == cluster_id]
            color = aligner.get_color(cluster_id)

            # Use cluster number as marker in scatter plot
            ax.scatter(cluster_coords[:, 0], cluster_coords[:, 1],
                       c=[color], marker=f'${cluster_id}$', s=200,
                       alpha=0.8, edgecolors='black', linewidths=0.5)

        # Plot filtered points if any
        overflow_mask = aligned_labels == -1
        if overflow_mask.any():
            overflow_coords = tsne_coords[overflow_mask]
            ax.scatter(overflow_coords[:, 0], overflow_coords[:, 1],
                       c=[aligner.overflow_color], marker='x', s=30,
                       alpha=0.3, linewidths=0.5)

        ax.set_title(f"'{word}' Term {term} Year {year} (t-SNE - Unified Projection)",
                     fontsize=14, fontweight='bold')
        ax.set_xlabel("Dim 1", fontsize=12)
        ax.set_ylabel("Dim 2", fontsize=12)
        ax.grid(alpha=0.3, linestyle='--')
        fig.tight_layout()
        tsne_path = os.path.join(OUTPUT_DIR, f"tsne_term{term}_year{year}_{word}.png")
        fig.savefig(tsne_path, dpi=150, bbox_inches="tight")
        plt.show()
        print(f"  Saved t-SNE plot to {tsne_path}")
    finally:
        # Release the figure even if drawing or saving fails.
        plt.close(fig)


In [41]:
def create_cluster_guide(cluster_contexts_map, target_word, output_dir, aligner, max_examples=15):
    """
    Create a cluster guide with summary CSV and detailed context file.
    Shows what each global cluster represents semantically.

    Args:
        cluster_contexts_map: Mapping ``global_id -> list`` of dicts with the
            keys ``term``, ``year`` and ``context``.
        target_word: Word the clusters belong to; used in file names and headings.
        output_dir: Existing directory that receives both files.
        aligner: Object providing ``get_color(global_id)`` returning an RGB(A)
            tuple of floats in 0..1.
        max_examples: Maximum number of contexts written per cluster in the
            text file (the first ``max_examples`` in stored order).

    Returns:
        The summary DataFrame (one row per cluster, most contexts first), or
        ``None`` when ``cluster_contexts_map`` is empty.

    Files written:
        ``cluster_guide_{target_word}_summary.csv`` and
        ``cluster_guide_{target_word}_contexts.txt`` inside ``output_dir``.
    """

    if not cluster_contexts_map:
        print("  No clusters to document.")
        return

    def _term_year_tags(contexts):
        # Distinct "T<term>Y<year>" tags of a cluster, sorted as strings.
        return sorted(set(f"T{ctx['term']}Y{ctx['year']}" for ctx in contexts))

    # Computed once per cluster and shared by the CSV and the text report.
    tags_by_cluster = {
        global_id: _term_year_tags(contexts)
        for global_id, contexts in cluster_contexts_map.items()
    }

    # Calculate statistics for each cluster
    guide_rows = [
        {
            'global_id': global_id,
            'color_index': global_id,
            'total_contexts': len(cluster_contexts_map[global_id]),
            'term_year_span': ', '.join(tags_by_cluster[global_id]),
            'num_appearances': len(tags_by_cluster[global_id])
        }
        for global_id in sorted(cluster_contexts_map.keys())
    ]

    # Create summary CSV
    df_summary = pd.DataFrame(guide_rows).sort_values('total_contexts', ascending=False)
    summary_path = os.path.join(output_dir, f"cluster_guide_{target_word}_summary.csv")
    df_summary.to_csv(summary_path, index=False)
    print(f"  Saved cluster summary to {summary_path}")

    # Create detailed context file
    context_file_path = os.path.join(output_dir, f"cluster_guide_{target_word}_contexts.txt")
    with open(context_file_path, 'w', encoding='utf-8') as f:
        f.write(f"{'='*80}\n")
        f.write(f"CLUSTER GUIDE FOR '{target_word.upper()}'\n")
        f.write(f"Generated: {pd.Timestamp.now()}\n")
        f.write(f"Total clusters: {len(cluster_contexts_map)}\n")
        f.write(f"{'='*80}\n\n")

        # Sort clusters by total contexts (most common first)
        for global_id in sorted(cluster_contexts_map.keys(),
                               key=lambda x: len(cluster_contexts_map[x]),
                               reverse=True):
            contexts = cluster_contexts_map[global_id]
            term_years = tags_by_cluster[global_id]

            # Get color info (floats in 0..1 truncated to 0..255 per channel)
            color = aligner.get_color(global_id)
            color_hex = '#{:02x}{:02x}{:02x}'.format(
                int(color[0]*255), int(color[1]*255), int(color[2]*255)
            )

            f.write(f"\n{'='*80}\n")
            f.write(f"CLUSTER {global_id} (Color: {color_hex})\n")
            f.write(f"{'-'*80}\n")
            f.write(f"Total contexts: {len(contexts)}\n")
            f.write(f"Appearances: {len(term_years)} term-years\n")
            f.write(f"Term-year span: {', '.join(term_years)}\n")
            f.write(f"\nREPRESENTATIVE CONTEXTS:\n")
            f.write(f"{'-'*80}\n")

            # Show the first max_examples stored contexts, each cut to 250 characters
            for ctx_item in contexts[:max_examples]:
                f.write(f"\n[{ctx_item['term']}-{ctx_item['year']}] ")
                f.write(ctx_item['context'][:250])
                if len(ctx_item['context']) > 250:
                    f.write("...")
                f.write("\n")

            if len(contexts) > max_examples:
                f.write(f"\n... and {len(contexts) - max_examples} more contexts\n")

    print(f"  Saved detailed contexts to {context_file_path}")
    print(f"  Total clusters documented: {len(cluster_contexts_map)}")

    return df_summary


In [42]:
# ## Load Sentence Transformer Model
# Loaded once at module level; the main loop passes this object to
# compute_embeddings() for every word and term-year slice.
# NOTE(review): "all-MiniLM-L6-v2" is, to my knowledge, an English-centric model,
# while TARGET_WORDS and the speeches are Turkish — confirm this choice is
# intentional or evaluate a multilingual checkpoint (TODO).
model = SentenceTransformer("all-MiniLM-L6-v2")
print("Model loaded ")

Model loaded 


In [43]:
# Create Color Reference and Mapping
def create_color_reference(cluster_contexts_map, target_word, output_dir, aligner):
    """Create color reference chart and mapping CSV.

    Args:
        cluster_contexts_map: Mapping keyed by global cluster ID; only the
            keys are used here.
        target_word: Word the clusters belong to; used in file names and the title.
        output_dir: Existing directory that receives both files.
        aligner: Object providing ``get_color(global_id)`` returning an RGB(A)
            tuple of floats in 0..1.

    Returns:
        DataFrame with the columns ``global_id``, ``hex_color`` and
        ``rgb_color`` (ascending ID), or ``None`` when the map is empty.

    Files written:
        ``cluster_colors_{target_word}.csv`` and
        ``color_reference_{target_word}.png`` inside ``output_dir``.
    """

    if not cluster_contexts_map:
        print("  No clusters to map.")
        return

    # Get all global IDs
    used_cluster_ids = sorted(cluster_contexts_map.keys())

    # Create color mapping CSV
    color_mapping = []
    for global_id in used_cluster_ids:
        color = aligner.get_color(global_id)
        # Convert each channel once; int() truncates the 0..1 float to 0..255.
        red, green, blue = (int(channel * 255) for channel in color[:3])
        color_mapping.append({
            'global_id': global_id,
            'hex_color': '#{:02x}{:02x}{:02x}'.format(red, green, blue),
            'rgb_color': f"({red}, {green}, {blue})"
        })

    df_colors = pd.DataFrame(color_mapping)
    color_csv_path = os.path.join(output_dir, f'cluster_colors_{target_word}.csv')
    df_colors.to_csv(color_csv_path, index=False)
    print(f"  Saved color mapping to {color_csv_path}")

    # Print console reference
    print("\n=== Color Reference ===")
    for _, row in df_colors.iterrows():
        print(f"Cluster {row['global_id']}: {row['hex_color']}")

    # Create visual color reference chart
    import matplotlib.patches as mpatches

    # Height grows by one inch per four clusters, never below six inches.
    fig, ax = plt.subplots(figsize=(12, max(6, len(used_cluster_ids) // 4)))
    try:
        patches = [
            mpatches.Patch(color=aligner.get_color(global_id), label=f'Cluster {global_id}')
            for global_id in used_cluster_ids
        ]

        ax.legend(handles=patches, loc='center', ncol=min(4, len(patches)), fontsize=10)
        ax.axis('off')
        ax.set_title(f"Color Reference for '{target_word}' Clusters", fontsize=14, fontweight='bold')
        fig.tight_layout()

        chart_path = os.path.join(output_dir, f'color_reference_{target_word}.png')
        fig.savefig(chart_path, dpi=150, bbox_inches='tight')
        plt.show()
        print(f"  Saved color reference chart to {chart_path}")
    finally:
        # This runs once per target word; close the figure so they do not pile up.
        plt.close(fig)

    return df_colors


## 6️⃣ Main Loop Over Words

This loop processes each target word and generates:
- **Individual CSVs**: One per term-year in `lorenz_results/` (includes session_date, file)
- **t-SNE plots**: PNGs saved to `lorenz_results/` (for visualization)
- **Consolidated t-SNE CSV**: Single CSV per word in `widid_results/{word}/tsne_{word}.csv` (for coordinate API endpoint)
- **Cluster guides**: Summary and context files in `lorenz_results/`
- **Color reference**: Color mapping for clusters

The consolidated CSV contains all t-SNE data points with columns:
- `target_word`, `term`, `year`, `tsne_x`, `tsne_y`, `cluster_id`, `context`, `session_date`, `file`

Each row represents a single point in the t-SNE visualization with its context and session date.

In [None]:
# Per-word pipeline. For each target word: walk the term-year slices in order,
# cluster the slice's contexts, map the slice-local clusters onto global IDs,
# write a per-slice CSV, then project all slices together with a single t-SNE
# and write the consolidated CSV, the cluster guide and the colour reference.
for target_word in TARGET_WORDS:
    print(f"\n\n===  Analyzing '{target_word}' across {len(TERM_YEAR_TUPLES)} term-year pairs ===")
    # A fresh aligner per word: global cluster IDs are only comparable within one word.
    aligner = ClusterAligner(max_clusters=MAX_CLUSTERS, similarity_threshold=SIMILARITY_THRESHOLD)
    # Becomes True once the first slice has produced at least one cluster.
    baseline_used = False
    cluster_contexts_map = {}  # Store contexts per global_id for cluster guide
    term_year_data = []  # Store embeddings and labels for unified t-SNE

    for term, year in TERM_YEAR_TUPLES:
        print(f"\n--- Term {term}, Year {year} ---")
        speeches = fetch_speeches(term, year)
        
        # Extract actual term/year from Elasticsearch data (use most common if multiple)
        if speeches:
            actual_terms = [s.get("term") for s in speeches if s.get("term") is not None]
            actual_years = [s.get("year") for s in speeches if s.get("year") is not None]
            if actual_terms:
                # Majority vote over the returned documents.
                actual_term = max(set(actual_terms), key=actual_terms.count)
            else:
                actual_term = term
            if actual_years:
                actual_year = max(set(actual_years), key=actual_years.count)
            else:
                actual_year = year
            print(f"  Actual data: Term {actual_term}, Year {actual_year}")
        else:
            # Nothing returned: fall back to the values that were queried.
            actual_term = term
            actual_year = year
        
        contexts = extract_contexts(speeches, target_word)
        print(f"  Contexts: {len(contexts)}")
        # Slices with fewer than 10 contexts are not clustered at all.
        if len(contexts) < 10:
            print("  Not enough contexts, skipping this slice.")
            continue

        embeddings = compute_embeddings(model, [c["context"] for c in contexts])
        # No cluster count is supplied; the number of clusters comes out of the fit.
        ap = AffinityPropagation(random_state=42)
        ap.fit(embeddings)
        local_labels = ap.labels_

        # The first slice that yields clusters is capped at BASELINE_MAX_CLUSTERS,
        # every later slice at MAX_CLUSTERS.
        cap = BASELINE_MAX_CLUSTERS if not baseline_used else MAX_CLUSTERS
        limited_labels, kept_clusters = limit_clusters(local_labels, cap)
        print(f"  Raw clusters: {len(np.unique(local_labels))}, kept: {len(kept_clusters)} (cap={cap})")

        # One centroid per surviving local cluster, keyed by its local label.
        prototypes, proto_labels = get_cluster_prototypes(embeddings, limited_labels, return_label_ids=True)
        centroid_map = dict(zip(proto_labels, prototypes))
        if not centroid_map:
            print("  No clusters survived filtering, skipping visualization.")
            continue

        baseline_used = True
        # Local labels -> global IDs; clusters that get no global ID become -1.
        aligned_labels = aligner.align(limited_labels, centroid_map)
        global_cluster_count = len(set(label for label in aligned_labels if label >= 0))
        print(f"  Global clusters represented: {global_cluster_count}")

        # Store context examples for cluster guide (up to 10 per global cluster per term-year)
        # Use actual term/year from contexts (which come from Elasticsearch)
        for global_id in set(label for label in aligned_labels if label >= 0):
            cluster_context_examples = [
                contexts[i] for i, label in enumerate(aligned_labels) if label == global_id
            ]
            if global_id not in cluster_contexts_map:
                cluster_contexts_map[global_id] = []
            cluster_contexts_map[global_id].extend([
                {'term': ctx.get("term", actual_term), 'year': ctx.get("year", actual_year), 'context': ctx["context"]}
                for ctx in cluster_context_examples[:10]
            ])

        print("\n=== Representative Contexts ===")
        show_top_contexts([c["context"] for c in contexts], aligned_labels)

        # Use actual term/year from Elasticsearch data
        # One row per context; the scalar term/year values are repeated on every row.
        df = pd.DataFrame(
            {
                "term": actual_term,
                "year": actual_year,
                "context": [c["context"] for c in contexts],
                "session_date": [c.get("session_date") for c in contexts],
                "file": [c.get("file") for c in contexts],
                "local_cluster": limited_labels,
                "global_cluster": aligned_labels,
            }
        )
        csv_path = os.path.join(OUTPUT_DIR, f"widid_term{actual_term}_year{actual_year}_{target_word}.csv")
        df.to_csv(csv_path, index=False)
        print(f"  Saved clusters to {csv_path}")

        # Store data for unified t-SNE (computed later) - use actual term/year from Elasticsearch
        term_year_data.append({
            "term": actual_term,
            "year": actual_year,
            "embeddings": embeddings,
            "aligned_labels": aligned_labels,
            "contexts": contexts
        })


    # Compute unified t-SNE across all term-years
    if term_year_data:
        print("\n=== Computing Unified t-SNE ===")
        # Stack the slices in processing order; the same order is used below to
        # cut the projected coordinates back into per-slice ranges.
        all_embeddings = np.vstack([item["embeddings"] for item in term_year_data])
        print(f"  Total embeddings: {len(all_embeddings)}")

        # Compute optimal perplexity for combined data
        # Result: a third of the sample count, at least 5, at most 30,
        # and never more than (number of samples - 1).
        desired = max(5, len(all_embeddings) // 3)
        max_valid = max(1, len(all_embeddings) - 1)
        perplexity = min(desired, max_valid, 30)

        tsne_unified = TSNE(n_components=2, random_state=42, perplexity=perplexity).fit_transform(all_embeddings)
        print(f"  t-SNE completed with perplexity={perplexity}")

        # Prepare consolidated data for API
        consolidated_data = []

        # Plot each term-year with its slice of unified coordinates
        start_idx = 0
        for item in term_year_data:
            end_idx = start_idx + len(item["embeddings"])
            tsne_coords = tsne_unified[start_idx:end_idx]
            # Per-slice PNG plotting is currently disabled; only the CSV rows are produced.
            #plot_tsne_with_coords(item["term"], item["year"], target_word,
                                #tsne_coords, item["aligned_labels"], aligner)

            # Collect data for consolidated CSV
            for i, (coord, label) in enumerate(zip(tsne_coords, item["aligned_labels"])):
                # Get the original context data from the stored data
                context_idx = i
                if context_idx < len(item.get("contexts", [])):
                    context_data = item["contexts"][context_idx]
                    context_text = context_data.get("context", "")
                    # Use term/year from context (Elasticsearch data), fallback to item values
                    context_term = context_data.get("term", item["term"])
                    context_year = context_data.get("year", item["year"])
                    session_date = context_data.get("session_date")
                    file = context_data.get("file")
                else:
                    # Defensive fallback for a point without a stored context.
                    context_text = ""
                    context_term = item["term"]
                    context_year = item["year"]
                    session_date = None
                    file = None

                consolidated_data.append({
                    "target_word": target_word,
                    "term": context_term,
                    "year": context_year,
                    "tsne_x": float(coord[0]),
                    "tsne_y": float(coord[1]),
                    "cluster_id": int(label),
                    "context": context_text,
                    "session_date": session_date,
                    "file": file
                })

            start_idx = end_idx

        # Save consolidated t-SNE data for API
        if consolidated_data:
            df_consolidated = pd.DataFrame(consolidated_data)

            # Save in word-specific subdirectory for API compatibility
            word_dir = os.path.join(WIDID_RESULTS_DIR, target_word)
            os.makedirs(word_dir, exist_ok=True)
            consolidated_csv_path = os.path.join(word_dir, f"tsne_{target_word}.csv")
            df_consolidated.to_csv(consolidated_csv_path, index=False, encoding='utf-8')

            print(f"\n=== Saved consolidated t-SNE data ===")
            print(f"  File: {consolidated_csv_path}")
            print(f"  Total data points: {len(consolidated_data)}")
            print(f"  Columns: {', '.join(df_consolidated.columns.tolist())}")
            print(f"  This file is ready for the interactive t-SNE API!")
    # Generate cluster guide after processing all term-years for this word
    # (both helpers print a notice and return early when the map is empty).
    print("\n=== Generating Cluster Guide ===")
    create_cluster_guide(cluster_contexts_map, target_word, OUTPUT_DIR, aligner)

    # Generate color reference
    print("\n=== Generating Color Reference ===")
    create_color_reference(cluster_contexts_map, target_word, OUTPUT_DIR, aligner)
    print("-----------------------------------------------------")

In [None]:
# Archive everything written to lorenz_results/ (OUTPUT_DIR) into a single zip file.
!zip -r lorenz_results.zip lorenz_results/

In [None]:
# Archive the per-word consolidated t-SNE CSVs in widid_results/ (WIDID_RESULTS_DIR).
!zip -r widid_results.zip widid_results/