# WiDiD with DBSCAN: Incremental Word Sense Discovery
## Using DBSCAN Clustering for Parliamentary Speeches

This notebook implements the same WiDiD approach as `widid.ipynb` but uses **DBSCAN** (Density-Based Spatial Clustering of Applications with Noise) instead of Affinity Propagation.

### Key Differences:
- **DBSCAN** automatically determines the number of clusters based on density
- Better at handling noise and outliers
- Requires tuning `eps` (neighborhood radius) and `min_samples` parameters
- Does not assume clusters are convex or have similar sizes


## 1️⃣ Imports


In [None]:
%pip install "elasticsearch==8.6.2" sentence-transformers scikit-learn pandas matplotlib
from elasticsearch import Elasticsearch
from sentence_transformers import SentenceTransformer
from sklearn.cluster import DBSCAN  # Changed from AffinityPropagation
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.manifold import TSNE
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import re, os


## 2️⃣ Configuration


In [None]:
# --- Data source and time range ---
INDEX_NAME = "parliament_speeches"  # Elasticsearch index searched by fetch_speeches
ES_URL = "http://localhost:9200"   # adjust if different
TARGET_WORDS = ["vergi"]  # each word is analysed independently by the main loop
START_TERM = 17  # first term processed (inclusive)
END_TERM = 27  # last term processed (inclusive)
# NOTE(review): not read by any code visible in this notebook;
# make_term_year_tuples currently uses its own value of 5 years per term.
YEARS_PER_TERM = 5

# --- DBSCAN specific parameters ---
# With metric='cosine', eps is a cosine *distance* (1 - cosine similarity).
DBSCAN_EPS = 0.3  # Maximum distance between neighbouring samples (lower = tighter clusters)
DBSCAN_MIN_SAMPLES = 3  # Minimum samples in neighborhood to form core point
DBSCAN_METRIC = 'cosine'  # Use cosine distance for semantic similarity

# --- Cluster capping and cross-slice alignment ---
BASELINE_MAX_CLUSTERS = 30  # cap on kept clusters until a slice first yields clusters
MAX_CLUSTERS = 50  # cap for later slices; also the aligner's global cluster limit
SIMILARITY_THRESHOLD = 0.7  # min cosine similarity to reuse an existing global cluster
# NOTE(review): TOP_K_CLUSTERS is not read by any code visible in this notebook.
TOP_K_CLUSTERS = 3

# --- Output ---
OUTPUT_DIR = "./dbscan_results"
os.makedirs(OUTPUT_DIR, exist_ok=True)  # created up front so every writer can assume it exists

print("📊 DBSCAN Configuration:")
print(f"   eps={DBSCAN_EPS}, min_samples={DBSCAN_MIN_SAMPLES}, metric='{DBSCAN_METRIC}'")


## 3️⃣ Connect to Elasticsearch


In [None]:
# Build the client used by fetch_speeches.
es = Elasticsearch(ES_URL)
# info() performs a real request, so success is only reported once it has
# returned; a failed connection no longer prints a misleading "Connected".
es_version = es.info().body["version"]["number"]
print("Connected to Elasticsearch ✅")
print(es_version)


## 4️⃣ Helper Functions


In [None]:
def fetch_speeches(term, year, size=10000):
    """Fetch the text of the speeches for a specific term and year.

    Args:
        term: Value matched exactly against the ``term`` field.
        year: Value matched exactly against the ``year`` field.
        size: Maximum number of hits requested in the single search call.

    Returns:
        A list holding the ``content`` field of every returned hit.

    Raises:
        KeyError: If a returned hit has no ``content`` field.
    """
    query = {
        "size": size,
        "_source": ["content", "term", "year"],
        "query": {
            "bool": {
                "must": [
                    {"term": {"term": term}},
                    {"term": {"year": year}}
                ]
            }
        }
    }
    res = es.search(index=INDEX_NAME, body=query)
    hits = res["hits"]["hits"]

    # A single un-paginated search returns at most `size` documents. Report
    # when more documents matched, instead of silently analysing a subset.
    total = res["hits"].get("total")
    if isinstance(total, dict):
        # relation == "gte" means the reported value is only a lower bound.
        truncated = total.get("relation") == "gte" or total.get("value", 0) > len(hits)
    else:
        truncated = isinstance(total, int) and total > len(hits)
    if truncated:
        print(
            f"  Warning: more speeches match term={term}, year={year} than were "
            f"fetched ({len(hits)} returned, size={size})."
        )

    return [hit["_source"]["content"] for hit in hits]


In [None]:
def make_term_year_tuples(start_term, end_term, years_per_term=5):
    """Build the ordered list of (term, year) pairs to process.

    Args:
        start_term: First term, inclusive.
        end_term: Last term, inclusive.
        years_per_term: Number of years generated for each term; years are
            numbered from 1. Defaults to 5, the value previously hard-coded.

    Returns:
        A list of ``(term, year)`` tuples ordered by term, then by year.
        Empty when ``start_term > end_term`` or ``years_per_term < 1``.
    """
    return [
        (term, year)
        for term in range(start_term, end_term + 1)
        for year in range(1, years_per_term + 1)
    ]

# Every (term, year) slice the main loop visits, ordered by term, then year.
TERM_YEAR_TUPLES = make_term_year_tuples(START_TERM, END_TERM)
# NOTE(review): the [0] / [-1] lookups raise IndexError if the list is empty
# (e.g. START_TERM > END_TERM).
print(f"Processing {len(TERM_YEAR_TUPLES)} term-year pairs from {TERM_YEAR_TUPLES[0]} to {TERM_YEAR_TUPLES[-1]}")


In [None]:
def extract_contexts(texts, target_word, window=10):
    """Extract short context windows around a target word and its inflected forms.

    Every text is lower-cased and split into runs of word characters. A token
    counts as an occurrence when it starts with the lower-cased target word,
    so suffixed forms of the word are included.

    Args:
        texts: Iterable of strings to search.
        target_word: Word (prefix) to look for; matched case-insensitively.
        window: Number of tokens kept on each side of the matching token.

    Returns:
        A list of snippets, one per occurrence, each being the matching token
        and up to ``window`` tokens on either side joined by single spaces.
    """
    # str.lower() turns U+0130 (capital dotted I) into "i" + U+0307. The
    # combining dot is not a word character, so the tokenizer below would cut
    # the word in two ("VERGİSİ" -> "vergi", "si"). Map it to "i" first.
    target = target_word.replace("\u0130", "i").lower()

    contexts = []
    for text in texts:
        tokens = re.findall(r"\w+", text.replace("\u0130", "i").lower())
        for i, tok in enumerate(tokens):
            # Tokens are pure word-character runs, so a prefix test matches
            # exactly what the former r"\b<target>\w*\b" pattern matched.
            if tok.startswith(target):
                start = max(0, i - window)
                end = min(len(tokens), i + window + 1)
                contexts.append(" ".join(tokens[start:end]))
    return contexts


In [None]:
def compute_embeddings(model, contexts):
    """Encode context snippets with ``model``.

    Returns whatever ``model.encode`` produces for a non-empty input, and an
    empty ``(0, dim)`` array when there is nothing to encode.
    """
    if len(contexts) > 0:
        return model.encode(contexts, show_progress_bar=True)

    # Nothing to encode: return an empty matrix with the model's embedding width.
    embedding_dim = model.get_sentence_embedding_dimension()
    return np.empty((0, embedding_dim))

def get_cluster_prototypes(X, labels, return_label_ids=False):
    """Return the mean vector of every non-noise cluster.

    Rows of ``X`` are grouped by the matching entry of ``labels``; the label
    ``-1`` is skipped. With ``return_label_ids=True`` the cluster labels are
    returned as a second value, in the same order as the centroids.
    """
    centroids = []
    kept_labels = []
    for cluster_label in np.unique(labels):
        if cluster_label == -1:
            continue
        rows = X[labels == cluster_label]
        if len(rows) > 0:
            centroids.append(np.mean(rows, axis=0))
            kept_labels.append(cluster_label)

    prototypes = np.array(centroids)
    return (prototypes, kept_labels) if return_label_ids else prototypes


In [None]:
def show_top_contexts(contexts, labels, n=3):
    """Print up to ``n`` example contexts for each cluster.

    Args:
        contexts: Context strings, one per sample.
        labels: Cluster label of each context; ``-1`` is reported as
            ``noise/outlier``.
        n: Maximum number of examples printed per cluster.

    Clusters are printed in ascending label order. Each example is cut to 200
    characters, with newlines replaced by spaces.
    """
    df = pd.DataFrame({"cluster": labels, "context": contexts})
    grouped = df.groupby("cluster")["context"].apply(list)
    for cluster, examples in grouped.items():
        cluster_name = "noise/outlier" if cluster == -1 else cluster
        # The emoji and bullet were stored as mis-decoded bytes; restored here.
        print(f"\n🌀 Cluster {cluster_name} ({len(examples)} examples):")
        for ex in examples[:n]:
            preview = ex[:200].replace("\n", " ")
            suffix = "..." if len(ex) > 200 else ""
            print("   •", preview + suffix)


In [None]:
def limit_clusters(labels, max_clusters):
    """Keep only the largest ``max_clusters`` clusters and map the rest to -1.

    Args:
        labels: Cluster label per sample; ``-1`` marks noise.
        max_clusters: Number of clusters to keep, ranked by size. ``None``
            keeps every cluster; values below 0 are treated as 0.

    Returns:
        ``(filtered, kept)``: an array shaped like ``labels`` in which every
        label outside the kept set is ``-1``, and the list of kept labels. The
        noise label ``-1`` is never reported in ``kept``.
    """
    labels = np.asarray(labels)
    unique, counts = np.unique(labels, return_counts=True)
    cluster_counts = [
        (label, count) for label, count in zip(unique, counts) if label != -1
    ]

    if max_clusters is None:
        # No cap. The noise label is excluded here too, so that `kept` means
        # the same thing on both paths.
        return labels, [label for label, _ in cluster_counts]

    # The sort is stable, so equally sized clusters keep ascending label order.
    cluster_counts.sort(key=lambda item: item[1], reverse=True)
    keep = [label for label, _ in cluster_counts[:max(0, max_clusters)]]
    if not keep:
        return np.full_like(labels, -1), []

    keep_set = set(keep)  # O(1) membership instead of scanning a list per sample
    filtered = np.array(
        [label if label in keep_set else -1 for label in labels],
        dtype=labels.dtype,
    )
    return filtered, keep


In [None]:
class ClusterAligner:
    """Keeps global cluster IDs and assigns consistent colors over time.

    Each call to :meth:`align` maps one slice's local cluster labels onto
    global IDs. A local centroid reuses the most similar stored centroid when
    their cosine similarity reaches ``similarity_threshold``; otherwise it is
    registered as a new global cluster, up to ``max_clusters``. Stored
    centroids are never updated after registration.
    """

    def __init__(self, max_clusters=100, similarity_threshold=0.8, cmap_name="gist_ncar"):
        """Create an empty aligner.

        Args:
            max_clusters: Maximum number of global clusters, and the number
                of colors sampled from the colormap.
            similarity_threshold: Minimum cosine similarity for a centroid to
                be merged into an existing global cluster.
            cmap_name: Name of the Matplotlib colormap to sample colors from.
        """
        self.max_clusters = max_clusters
        self.similarity_threshold = similarity_threshold
        self.centroids = []   # stored centroids, each with shape (1, dim)
        self.global_ids = []  # global_ids[i] is the ID of centroids[i]
        # matplotlib.cm.get_cmap was removed in Matplotlib 3.9;
        # pyplot.get_cmap takes the same (name, lut) arguments.
        self.cmap = plt.get_cmap(cmap_name, max_clusters)
        self.palette = [self.cmap(i) for i in range(self.cmap.N)]
        self.overflow_color = (0.65, 0.65, 0.65, 1.0)  # grey for noise / out-of-range IDs

    def _add_centroid(self, centroid):
        """Register ``centroid`` as a new global cluster; return its ID, or -1 if full."""
        if len(self.global_ids) >= self.max_clusters:
            return -1
        new_id = len(self.global_ids)
        self.centroids.append(centroid)
        self.global_ids.append(new_id)
        return new_id

    def _match_or_create(self, centroid):
        """Return the global ID for ``centroid``, registering a new one when nothing matches."""
        centroid = centroid.reshape(1, -1)
        if not self.centroids:
            return self._add_centroid(centroid)
        stacked = np.vstack(self.centroids)
        sims = cosine_similarity(stacked, centroid)[:, 0]
        best_idx = int(np.argmax(sims))
        if sims[best_idx] >= self.similarity_threshold:
            return self.global_ids[best_idx]
        return self._add_centroid(centroid)

    def align(self, raw_labels, centroid_map):
        """Translate local labels into global cluster IDs.

        Args:
            raw_labels: Local cluster label per sample.
            centroid_map: Mapping of local label to that cluster's centroid.

        Returns:
            An array shaped like ``raw_labels`` holding global IDs. Samples
            whose local label is not in ``centroid_map``, or whose cluster
            could not be registered because the aligner is full, get ``-1``.
        """
        # Boolean masking below needs an array; a plain list would compare
        # as a whole and never select anything.
        raw_labels = np.asarray(raw_labels)
        aligned = np.full_like(raw_labels, -1)
        for local_label, centroid in centroid_map.items():
            global_id = self._match_or_create(centroid)
            if global_id == -1:
                continue
            aligned[raw_labels == local_label] = global_id
        return aligned

    def colors_for(self, labels):
        """Return the list of colors for ``labels``, one per label."""
        return [self.get_color(label) for label in labels]

    def get_color(self, label):
        """Get color for a single label; IDs outside the palette get the overflow color."""
        if 0 <= label < len(self.palette):
            return self.palette[label]
        return self.overflow_color


In [None]:
def plot_tsne_with_coords(term, year, word, tsne_coords, aligned_labels, aligner):
    """Plot one term-year slice of pre-computed t-SNE coordinates and save it as PNG.

    Args:
        term: Term number, used in the title and file name.
        year: Year number, used in the title and file name.
        word: Target word, used in the title and file name.
        tsne_coords: 2-D coordinates, one row per sample.
        aligned_labels: Global cluster ID per sample; ``-1`` marks noise.
        aligner: Object providing ``get_color(label)`` and ``overflow_color``.

    Each cluster is drawn with its own ID as the marker; noise is drawn as
    faint crosses. Nothing is plotted when fewer than two points are given.
    The image is written to ``OUTPUT_DIR``.
    """
    if len(tsne_coords) < 2:
        print("  Skipping t-SNE (insufficient embeddings).")
        return

    # Boolean masks below require an array rather than a plain list.
    aligned_labels = np.asarray(aligned_labels)
    fig = plt.figure(figsize=(12, 10))

    # Plot by cluster using numeric markers
    unique_labels = sorted(set(label for label in aligned_labels if label >= 0))

    for cluster_id in unique_labels:
        mask = aligned_labels == cluster_id
        cluster_coords = tsne_coords[mask]
        color = aligner.get_color(cluster_id)

        plt.scatter(cluster_coords[:, 0], cluster_coords[:, 1],
                    c=[color], marker=f'${cluster_id}$', s=200,
                    alpha=0.8, edgecolors='black', linewidths=0.5)

    # Plot noise points if any
    noise_mask = aligned_labels == -1
    if noise_mask.any():
        noise_coords = tsne_coords[noise_mask]
        plt.scatter(noise_coords[:, 0], noise_coords[:, 1],
                    c=[aligner.overflow_color], marker='x', s=30,
                    alpha=0.3, linewidths=0.5, label='Noise')
        # The 'Noise' label only becomes visible once a legend is drawn.
        plt.legend(loc='best')

    plt.title(f"'{word}' Term {term} Year {year} (t-SNE - DBSCAN)", fontsize=14, fontweight='bold')
    plt.xlabel("Dim 1", fontsize=12)
    plt.ylabel("Dim 2", fontsize=12)
    plt.grid(alpha=0.3, linestyle='--')
    plt.tight_layout()
    tsne_path = os.path.join(OUTPUT_DIR, f"tsne_term{term}_year{year}_{word}.png")
    plt.savefig(tsne_path, dpi=150, bbox_inches="tight")
    plt.show()
    # This function runs once per slice; close the figure so they do not pile up.
    plt.close(fig)
    print(f"  Saved t-SNE plot to {tsne_path}")


In [None]:
def create_cluster_guide(cluster_contexts_map, target_word, output_dir, aligner):
    """Create a cluster guide with summary CSV and detailed context file.

    Args:
        cluster_contexts_map: Mapping of global cluster ID to a list of dicts
            with the keys ``'term'``, ``'year'`` and ``'context'``.
        target_word: Word being analysed; used in file names and the header.
        output_dir: Directory both files are written to.
        aligner: Object providing ``get_color(label)``.

    Returns:
        The summary ``DataFrame`` (one row per cluster, largest first), or
        ``None`` when ``cluster_contexts_map`` is empty.

    Writes ``cluster_guide_<word>_summary.csv`` and
    ``cluster_guide_<word>_contexts.txt``; the text file lists up to 15
    contexts per cluster, each cut to 250 characters.
    """

    if not cluster_contexts_map:
        print("  No clusters to document.")
        return

    guide_rows = []
    for global_id in sorted(cluster_contexts_map.keys()):
        contexts = cluster_contexts_map[global_id]
        term_years = sorted(set(f"T{ctx['term']}Y{ctx['year']}" for ctx in contexts))

        guide_rows.append({
            'global_id': global_id,
            'color_index': global_id,
            'total_contexts': len(contexts),
            'term_year_span': ', '.join(term_years),
            'num_appearances': len(term_years)
        })

    df_summary = pd.DataFrame(guide_rows).sort_values('total_contexts', ascending=False)
    summary_path = os.path.join(output_dir, f"cluster_guide_{target_word}_summary.csv")
    df_summary.to_csv(summary_path, index=False)
    print(f"  Saved cluster summary to {summary_path}")

    # The writes below previously used "\\n", which put a literal backslash-n
    # into the file instead of a line break.
    context_file_path = os.path.join(output_dir, f"cluster_guide_{target_word}_contexts.txt")
    with open(context_file_path, 'w', encoding='utf-8') as f:
        f.write(f"{'='*80}\n")
        f.write(f"CLUSTER GUIDE FOR '{target_word.upper()}' (DBSCAN)\n")
        f.write(f"Generated: {pd.Timestamp.now()}\n")
        f.write(f"Total clusters: {len(cluster_contexts_map)}\n")
        f.write(f"{'='*80}\n\n")

        # Largest clusters first.
        for global_id in sorted(cluster_contexts_map.keys(),
                                key=lambda x: len(cluster_contexts_map[x]),
                                reverse=True):
            contexts = cluster_contexts_map[global_id]
            term_years = sorted(set(f"T{ctx['term']}Y{ctx['year']}" for ctx in contexts))

            color = aligner.get_color(global_id)
            color_hex = '#{:02x}{:02x}{:02x}'.format(
                int(color[0]*255), int(color[1]*255), int(color[2]*255)
            )

            f.write(f"\n{'='*80}\n")
            f.write(f"CLUSTER {global_id} (Color: {color_hex})\n")
            f.write(f"{'-'*80}\n")
            f.write(f"Total contexts: {len(contexts)}\n")
            f.write(f"Appearances: {len(term_years)} term-years\n")
            f.write(f"Term-year span: {', '.join(term_years)}\n")
            f.write("\nREPRESENTATIVE CONTEXTS:\n")
            f.write(f"{'-'*80}\n")

            for ctx_item in contexts[:15]:
                f.write(f"\n[{ctx_item['term']}-{ctx_item['year']}] ")
                f.write(ctx_item['context'][:250])
                if len(ctx_item['context']) > 250:
                    f.write("...")
                f.write("\n")

            if len(contexts) > 15:
                f.write(f"\n... and {len(contexts) - 15} more contexts\n")

    print(f"  Saved detailed contexts to {context_file_path}")
    print(f"  Total clusters documented: {len(cluster_contexts_map)}")

    return df_summary


In [None]:
def create_color_reference(cluster_contexts_map, target_word, output_dir, aligner):
    """Create color reference chart and mapping CSV.

    Args:
        cluster_contexts_map: Mapping keyed by global cluster ID; only the
            keys are used here.
        target_word: Word being analysed; used in file names and the title.
        output_dir: Directory both files are written to.
        aligner: Object providing ``get_color(label)``.

    Returns:
        A ``DataFrame`` with ``global_id``, ``hex_color`` and ``rgb_color``
        columns, or ``None`` when ``cluster_contexts_map`` is empty.

    Writes ``cluster_colors_<word>.csv`` and ``color_reference_<word>.png``.
    """

    if not cluster_contexts_map:
        print("  No clusters to map.")
        return

    used_cluster_ids = sorted(cluster_contexts_map.keys())

    color_mapping = []
    for global_id in used_cluster_ids:
        color = aligner.get_color(global_id)
        # Color components are floats in [0, 1]; scale them to 0-255 integers.
        red, green, blue = (int(channel * 255) for channel in color[:3])

        color_mapping.append({
            'global_id': global_id,
            'hex_color': '#{:02x}{:02x}{:02x}'.format(red, green, blue),
            'rgb_color': f"({red}, {green}, {blue})"
        })

    df_colors = pd.DataFrame(color_mapping)
    color_csv_path = os.path.join(output_dir, f'cluster_colors_{target_word}.csv')
    df_colors.to_csv(color_csv_path, index=False)
    print(f"  Saved color mapping to {color_csv_path}")

    import matplotlib.patches as mpatches

    # The chart is only a legend; its height grows with the number of clusters.
    fig, ax = plt.subplots(figsize=(12, max(6, len(used_cluster_ids) // 4)))
    patches = [
        mpatches.Patch(color=aligner.get_color(global_id), label=f'Cluster {global_id}')
        for global_id in used_cluster_ids
    ]

    ax.legend(handles=patches, loc='center', ncol=min(4, len(patches)), fontsize=10)
    ax.axis('off')
    plt.title(f"Color Reference for '{target_word}' Clusters (DBSCAN)", fontsize=14, fontweight='bold')
    plt.tight_layout()

    chart_path = os.path.join(output_dir, f'color_reference_{target_word}.png')
    plt.savefig(chart_path, dpi=150, bbox_inches='tight')
    plt.show()
    # Close the figure so repeated calls (one per target word) do not keep it open.
    plt.close(fig)
    print(f"  Saved color reference chart to {chart_path}")

    return df_colors


## 5️⃣ Load Sentence Transformer Model


In [None]:
# Sentence encoder used by compute_embeddings to embed the context snippets.
# NOTE(review): "all-MiniLM-L6-v2" is presumably English-centric, while the
# target word suggests Turkish text; confirm the model suits the corpus.
model = SentenceTransformer("all-MiniLM-L6-v2")
print("Model loaded ✅")


## 6️⃣ Main Loop Over Words with DBSCAN Clustering


In [None]:
# For every target word: cluster each term-year slice with DBSCAN, align the
# slice's clusters to global IDs, save a CSV per slice, then plot all slices in
# one shared t-SNE space and write the cluster guide and color reference.
for target_word in TARGET_WORDS:
    print(f"\n\n=== üîç Analyzing '{target_word}' across {len(TERM_YEAR_TUPLES)} term-year pairs (DBSCAN) ===")
    # A fresh aligner per word, so global cluster IDs are not shared across words.
    aligner = ClusterAligner(max_clusters=MAX_CLUSTERS, similarity_threshold=SIMILARITY_THRESHOLD)
    baseline_used = False  # set to True once a slice has yielded at least one cluster
    cluster_contexts_map = {}  # global cluster ID -> list of {'term', 'year', 'context'} dicts
    term_year_data = []  # embeddings and aligned labels per slice, kept for the unified t-SNE
    
    for term, year in TERM_YEAR_TUPLES:
        print(f"\n--- Term {term}, Year {year} ---")
        texts = fetch_speeches(term, year)
        contexts = extract_contexts(texts, target_word)
        print(f"  Contexts: {len(contexts)}")
        
        # Slices with fewer than 10 occurrences are not clustered at all.
        if len(contexts) < 10:
            print("  Not enough contexts, skipping this slice.")
            continue
        
        embeddings = compute_embeddings(model, contexts)
        
        # DBSCAN Clustering (replaces Affinity Propagation)
        dbscan = DBSCAN(
            eps=DBSCAN_EPS,
            min_samples=DBSCAN_MIN_SAMPLES,
            metric=DBSCAN_METRIC
        )
        dbscan.fit(embeddings)
        local_labels = dbscan.labels_
        
        # Count clusters (excluding noise points labeled as -1)
        n_clusters = len(set(local_labels)) - (1 if -1 in local_labels else 0)
        n_noise = list(local_labels).count(-1)
        print(f"  DBSCAN found: {n_clusters} clusters, {n_noise} noise points")
        
        # BASELINE_MAX_CLUSTERS applies until a slice has produced clusters;
        # MAX_CLUSTERS applies to every slice after that.
        cap = BASELINE_MAX_CLUSTERS if not baseline_used else MAX_CLUSTERS
        limited_labels, kept_clusters = limit_clusters(local_labels, cap)
        print(f"  Kept: {len(kept_clusters)} clusters (cap={cap})")
        
        # One centroid per surviving local cluster, keyed by its local label.
        prototypes, proto_labels = get_cluster_prototypes(embeddings, limited_labels, return_label_ids=True)
        centroid_map = dict(zip(proto_labels, prototypes))
        
        if not centroid_map:
            print("  No clusters survived filtering, skipping visualization.")
            continue
        
        baseline_used = True
        # Map local labels to global IDs; samples without a global cluster become -1.
        aligned_labels = aligner.align(limited_labels, centroid_map)
        global_cluster_count = len(set(label for label in aligned_labels if label >= 0))
        print(f"  Global clusters represented: {global_cluster_count}")
        
        # Store context examples for cluster guide
        for global_id in set(label for label in aligned_labels if label >= 0):
            cluster_context_examples = [
                contexts[i] for i, label in enumerate(aligned_labels) if label == global_id
            ]
            if global_id not in cluster_contexts_map:
                cluster_contexts_map[global_id] = []
            # At most 10 examples per cluster are kept from each slice.
            cluster_contexts_map[global_id].extend([
                {'term': term, 'year': year, 'context': ctx}
                for ctx in cluster_context_examples[:10]
            ])
        
        print("\n=== Representative Contexts ===")
        show_top_contexts(contexts, aligned_labels)
        
        # Save CSV
        # One row per context; the scalar term/year values are repeated on every row.
        df = pd.DataFrame({
            "term": term,
            "year": year,
            "context": contexts,
            "local_cluster": limited_labels,
            "global_cluster": aligned_labels,
        })
        csv_path = os.path.join(OUTPUT_DIR, f"widid_term{term}_year{year}_{target_word}.csv")
        df.to_csv(csv_path, index=False)
        print(f"  Saved clusters to {csv_path}")
        
        # Store for unified t-SNE
        term_year_data.append({
            "term": term,
            "year": year,
            "embeddings": embeddings,
            "aligned_labels": aligned_labels
        })
    
    # Compute unified t-SNE across all term-years
    # A single projection is fitted on all slices together, so every per-slice
    # plot uses the same coordinate space.
    if term_year_data:
        print("\n=== Computing Unified t-SNE ===")
        all_embeddings = np.vstack([item["embeddings"] for item in term_year_data])
        print(f"  Total embeddings: {len(all_embeddings)}")
        
        # Perplexity: about a third of the samples (at least 5), capped at 30
        # and at one less than the number of samples.
        desired = max(5, len(all_embeddings) // 3)
        max_valid = max(1, len(all_embeddings) - 1)
        perplexity = min(desired, max_valid, 30)
        
        tsne_unified = TSNE(n_components=2, random_state=42, perplexity=perplexity).fit_transform(all_embeddings)
        print(f"  t-SNE completed with perplexity={perplexity}")
        
        # Plot each term-year
        # Rows of tsne_unified follow the stacking order above, so each slice
        # owns one contiguous row range.
        start_idx = 0
        for item in term_year_data:
            end_idx = start_idx + len(item["embeddings"])
            tsne_coords = tsne_unified[start_idx:end_idx]
            plot_tsne_with_coords(item["term"], item["year"], target_word,
                                tsne_coords, item["aligned_labels"], aligner)
            start_idx = end_idx
    
    # Generate cluster guide
    print("\n=== Generating Cluster Guide ===")
    create_cluster_guide(cluster_contexts_map, target_word, OUTPUT_DIR, aligner)
    
    # Generate color reference
    print("\n=== Generating Color Reference ===")
    create_color_reference(cluster_contexts_map, target_word, OUTPUT_DIR, aligner)
    print("-----------------------------------------------------")


In [None]:
!zip -r dbscan_results.zip dbscan_results/


## 📊 DBSCAN Parameter Tuning Guide

### Key Parameters:

1. **`eps` (epsilon)**: Maximum distance between two samples to be considered neighbors
   - **Lower values** (e.g., 0.2): Tighter, more clusters, more noise
   - **Higher values** (e.g., 0.5): Looser, fewer clusters, less noise
   - Default: 0.3 is a reasonable starting point with `metric='cosine'` (here `eps` is a cosine *distance*, i.e. 1 − cosine similarity)

2. **`min_samples`**: Minimum points in neighborhood to form a core point
   - **Lower values** (e.g., 2-3): More sensitive, more small clusters
   - **Higher values** (e.g., 5-10): More conservative, larger clusters
   - Default: 3 is a good starting point

3. **`metric`**: Distance metric
   - **'cosine'**: Best for semantic similarity (recommended)
   - **'euclidean'**: Standard distance metric

### Tuning Tips:
- If too many noise points: decrease `min_samples` or increase `eps`
- If too few clusters: decrease `eps` or `min_samples`
- If too many clusters: increase `eps` or `min_samples`

### Comparison with Affinity Propagation:
- **AP**: Automatically determines cluster count, can be slow
- **DBSCAN**: Also infers the cluster count (from density), but requires tuning `eps` and `min_samples`; faster, better with noise
