# Refine Topic 251 - Filter Common Words & Redistribute Speeches

This notebook refines the large topic 251 cluster by:
1. Filtering common parliamentary words (Meclis, TBMM, party names, MP names) from keywords
2. Re-embedding filtered keywords
3. Redistributing speeches to other topics based on similarity
4. Re-clustering remaining speeches with filtered keywords
5. Updating Elasticsearch and embeddings file

**Model**: `trmteb/turkish-embedding-model-fine-tuned` (768-dimensional embeddings)

**UMAP**: Reduces 768 ‚Üí 50 dimensions (same parameters as original clustering)

**HDBSCAN**: Re-clusters remaining speeches (same parameters as original clustering)

In [1]:
#connect to drive

from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
!pip install hdbscan  "elasticsearch==8.6.2" tqdm pandas numpy matplotlib seaborn

Collecting elasticsearch==8.6.2
  Downloading elasticsearch-8.6.2-py3-none-any.whl.metadata (4.9 kB)
Collecting elastic-transport<9,>=8 (from elasticsearch==8.6.2)
  Downloading elastic_transport-8.17.1-py3-none-any.whl.metadata (3.8 kB)
Downloading elasticsearch-8.6.2-py3-none-any.whl (385 kB)
[2K   [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m385.4/385.4 kB[0m [31m31.4 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading elastic_transport-8.17.1-py3-none-any.whl (64 kB)
[2K   [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m65.0/65.0 kB[0m [31m7.9 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: elastic-transport, elasticsearch
Successfully installed elastic-transport-8.17.1 elasticsearch-8.6.2


## 1. Setup and Imports

In [4]:
import os
import sys
import numpy as np
import pandas as pd
from pathlib import Path
from typing import Dict, List, Set, Tuple
from collections import Counter
from sklearn.metrics.pairwise import cosine_similarity

# Sentence transformers for embedding
from sentence_transformers import SentenceTransformer

# Clustering
import hdbscan

# Dimensionality reduction
import umap

# Elasticsearch
from elasticsearch import Elasticsearch, helpers
from tqdm.auto import tqdm

# Configuration
EMBEDDINGS_FILE = "/content/drive/MyDrive/492-data/keyword_embeddings.npy"
MODEL_NAME = "trmteb/turkish-embedding-model-fine-tuned"
ELASTICSEARCH_HOST = os.getenv("ELASTICSEARCH_HOST", "https://updates-exhibit-advanced-websites.trycloudflare.com")
ELASTICSEARCH_INDEX = os.getenv("ELASTICSEARCH_INDEX", "parliament_speeches")
TARGET_TOPIC_ID = 251

# UMAP parameters (matching original approach)
UMAP_N_COMPONENTS = 50
UMAP_N_NEIGHBORS = 5
UMAP_MIN_DIST = 0.0
UMAP_METRIC = 'cosine'
UMAP_RANDOM_STATE = 42

# HDBSCAN parameters (matching original approach)
HDBSCAN_MIN_CLUSTER_SIZE = 10
HDBSCAN_MIN_SAMPLES = 5
HDBSCAN_METRIC = 'euclidean'
HDBSCAN_EPSILON = 5.0

# Redistribution thresholds
SIMILARITY_THRESHOLD = 0.1  # How much better another topic must be to reassign
CORE_THRESHOLD = 0.7  # Minimum similarity to topic 251 to keep it

print("‚úÖ Imports complete")

‚úÖ Imports complete


## 2. Connect to Elasticsearch

In [5]:
def connect_elasticsearch(host: str) -> Elasticsearch:
    """Connect to Elasticsearch and verify connection."""
    es = Elasticsearch(hosts=[host])

    try:
        info = es.info()
        print(f"‚úÖ Connected to Elasticsearch")
        print(f"   Version: {info['version']['number']}")
        return es
    except Exception as e:
        print(f"‚ùå Connection failed: {e}")
        print(f"   Make sure Elasticsearch is running at {host}")
        raise

es = connect_elasticsearch(ELASTICSEARCH_HOST)

‚úÖ Connected to Elasticsearch
   Version: 8.6.1


## 3. Load Data from Elasticsearch and Embeddings

In [6]:
def load_data_from_elasticsearch(es: Elasticsearch, index: str, embeddings_path: str):
    """
    Load speech data from ES and match with embeddings file.

    Returns:
        speeches_dict: {speech_id: {keywords, hdbscan_topic_id, embedding_index}}
        embeddings: numpy array of embeddings
        speech_id_to_index: mapping from speech_id to embedding index
    """
    print(f"\nüì• Loading data from Elasticsearch...")

    # Load embeddings
    if not os.path.exists(embeddings_path):
        raise FileNotFoundError(f"Embeddings file not found: {embeddings_path}")

    embeddings = np.load(embeddings_path)
    print(f"‚úÖ Loaded embeddings: {embeddings.shape}")

    # Query all speeches with keywords and topic_id
    speeches_dict = {}
    speech_id_to_index = {}

    query = {
        "query": {
            "exists": {"field": "keywords"}
        },
        "_source": ["keywords", "keywords_str", "hdbscan_topic_id"],
        "size": 10000
    }

    index_counter = 0
    for hit in helpers.scan(es, query=query, index=index, scroll='5m'):
        speech_id = hit['_id']
        source = hit.get('_source', {})

        # Get keywords (prefer keywords_str, fallback to keywords array)
        keywords = source.get('keywords_str')
        if not keywords and source.get('keywords'):
            keywords = ', '.join(source['keywords']) if isinstance(source['keywords'], list) else str(source['keywords'])

        hdbscan_topic_id = source.get('hdbscan_topic_id')

        speeches_dict[speech_id] = {
            'keywords': keywords,
            'hdbscan_topic_id': hdbscan_topic_id,
            'embedding_index': index_counter
        }
        speech_id_to_index[speech_id] = index_counter
        index_counter += 1

    print(f"‚úÖ Loaded {len(speeches_dict):,} speeches from Elasticsearch")
    print(f"   Embeddings shape: {embeddings.shape}")
    print(f"   Expected speeches: {embeddings.shape[0]:,}")

    if len(speeches_dict) != embeddings.shape[0]:
        print(f"‚ö†Ô∏è  Warning: Number of speeches ({len(speeches_dict)}) doesn't match embeddings ({embeddings.shape[0]})")
        print(f"   This may cause issues. Ensure embeddings match ES document order.")

    return speeches_dict, embeddings, speech_id_to_index

speeches_dict, embeddings, speech_id_to_index = load_data_from_elasticsearch(
    es, ELASTICSEARCH_INDEX, EMBEDDINGS_FILE
)


üì• Loading data from Elasticsearch...
‚úÖ Loaded embeddings: (27201, 768)
‚úÖ Loaded 27,201 speeches from Elasticsearch
   Embeddings shape: (27201, 768)
   Expected speeches: 27,201


## 4. Extract Topic 251 Speeches

In [7]:
# Extract topic 251 speeches
topic_251_speech_ids = [
    sid for sid, data in speeches_dict.items()
    if data.get('hdbscan_topic_id') == TARGET_TOPIC_ID
]

print(f"üìä Found {len(topic_251_speech_ids):,} speeches in topic {TARGET_TOPIC_ID}")

if not topic_251_speech_ids:
    print(f"‚ö†Ô∏è  No speeches found in topic {TARGET_TOPIC_ID}")
    raise ValueError(f"No speeches found in topic {TARGET_TOPIC_ID}")

üìä Found 10,176 speeches in topic 251


## 5. Extract Common Parliamentary Words

In [8]:
def get_common_parliamentary_words(es: Elasticsearch, index: str) -> Set[str]:
    """Extract common words: party names, institution names, frequent MP names."""
    print(f"\nüîç Extracting common parliamentary words...")

    common_words = set()

    # Institution names
    institution_names = {
        "Meclis", "TBMM", "parlamento", "Meclis Ba≈ükanƒ±", "Ba≈ükan",
        "Milletvekili", "Komisyon", "Bakan", "Bakanlƒ±k", "Genel Kurul"
    }
    common_words.update(institution_names)

    # Extract party names from aggregations
    try:
        query = {
            "size": 0,
            "aggs": {
                "parties": {
                    "terms": {
                        "field": "political_party_at_time.keyword",
                        "size": 50
                    }
                }
            }
        }
        response = es.search(index=index, body=query)
        parties = [bucket['key'] for bucket in response['aggregations']['parties']['buckets']]
        # Clean party names (remove term prefixes if any)
        for party in parties:
            # Remove patterns like "XX.d√∂nem " prefix
            cleaned = party.split('.d√∂nem ')[-1] if '.d√∂nem ' in party else party
            common_words.add(cleaned)
        print(f"   Found {len(parties)} party names")
    except Exception as e:
        print(f"   ‚ö†Ô∏è  Could not extract party names: {e}")

    # Extract frequent MP names
    try:
        query = {
            "size": 0,
            "aggs": {
                "speakers": {
                    "terms": {
                        "field": "speech_giver.keyword",
                        "size": 100
                    }
                }
            }
        }
        response = es.search(index=index, body=query)
        mp_names = [bucket['key'] for bucket in response['aggregations']['speakers']['buckets']]
        common_words.update(mp_names)
        print(f"   Found {len(mp_names)} frequent MP names")
    except Exception as e:
        print(f"   ‚ö†Ô∏è  Could not extract MP names: {e}")

    print(f"‚úÖ Total common words: {len(common_words)}")
    return common_words

common_words = get_common_parliamentary_words(es, ELASTICSEARCH_INDEX)


üîç Extracting common parliamentary words...


  response = es.search(index=index, body=query)


   Found 21 party names


  response = es.search(index=index, body=query)


   Found 100 frequent MP names
‚úÖ Total common words: 131


## 6. Filter Keywords for Topic 251 Speeches

In [9]:
def filter_keywords(keywords_str: str, common_words: Set[str]) -> str:
    """Remove common words from comma-separated keyword string."""
    if not keywords_str:
        return ""

    # Split keywords
    keywords = [k.strip() for k in str(keywords_str).split(',')]

    # Filter out common words (case-insensitive)
    filtered = []
    for kw in keywords:
        kw_lower = kw.lower().strip()
        # Check if keyword contains any common word
        is_common = False
        for common in common_words:
            if common.lower() in kw_lower or kw_lower in common.lower():
                is_common = True
                break

        if not is_common and kw.strip():
            filtered.append(kw.strip())

    return ', '.join(filtered) if filtered else keywords_str  # Return original if all filtered

# Filter keywords for topic 251 speeches
print(f"\nüîç Filtering keywords for topic {TARGET_TOPIC_ID} speeches...")
filtered_keywords_list = []
topic_251_indices = []

for speech_id in tqdm(topic_251_speech_ids, desc="Filtering keywords"):
    speech_data = speeches_dict[speech_id]
    original_keywords = speech_data.get('keywords', '')
    filtered_keywords = filter_keywords(original_keywords, common_words)
    filtered_keywords_list.append(filtered_keywords)
    topic_251_indices.append(speech_data.get('embedding_index'))

print(f"‚úÖ Filtered keywords for {len(filtered_keywords_list):,} speeches")

# Show some examples
print("\nüìù Example filtered keywords:")
for i in range(min(5, len(topic_251_speech_ids))):
    speech_id = topic_251_speech_ids[i]
    original = speeches_dict[speech_id].get('keywords', '')[:100]
    filtered = filtered_keywords_list[i][:100]
    print(f"\n  Original: {original}...")
    print(f"  Filtered: {filtered}...")


üîç Filtering keywords for topic 251 speeches...


Filtering keywords:   0%|          | 0/10176 [00:00<?, ?it/s]

‚úÖ Filtered keywords for 10,176 speeches

üìù Example filtered keywords:

  Original: avukatlar g√ºn√º, hukuk sistemi, savunma hakkƒ±, adalet, kamu hizmeti, ba≈ü√∂rt√ºl√º avukatlar, mahkeme, AK...
  Filtered: avukatlar g√ºn√º, hukuk sistemi, savunma hakkƒ±, kamu hizmeti, ba≈ü√∂rt√ºl√º avukatlar, mahkeme, AK Parti, ...

  Original: yargƒ± baƒüƒ±msƒ±zlƒ±ƒüƒ±, yargƒ± vesayeti, savunma hakkƒ±, avukatlar, 12 Eyl√ºl fa≈üizmi, adalet, mahkeme, kam...
  Filtered: yargƒ± vesayeti, savunma hakkƒ±, avukatlar, 12 Eyl√ºl fa≈üizmi, mahkeme, kamu vicdanƒ±, j√ºristokrasi, sav...

  Original: Avukatlar G√ºn√º, Savunma mesleƒüi, Hukuka uygunluk, √ñzg√ºrl√ºk, Sabotaj, Meclis tutanaklarƒ±, Engin √ñzko√ß...
  Filtered: Avukatlar G√ºn√º, Savunma mesleƒüi, Hukuka uygunluk, √ñzg√ºrl√ºk, Sabotaj, AKP Grubu, Sƒ±ra kapaklarƒ±, D√ºze...

  Original: avukatlar g√ºn√º, savunma, avukatlƒ±k, barolar, meslek √∂rg√ºt√º, toplum, temel haklar, √∂zg√ºrl√ºkler, taraf...
  Filtered: avukatlar g√ºn√º, savunma, avukatlƒ±k, 

## 7. Load Embedding Model

In [10]:
print(f"\nüîÑ Loading embedding model: {MODEL_NAME}...")
print("   This may take a few minutes on first run...\n")

model = SentenceTransformer(MODEL_NAME)
embedding_dim = model.get_sentence_embedding_dimension()
print(f"‚úÖ Model loaded successfully!")
print(f"   Embedding dimension: {embedding_dim}")


üîÑ Loading embedding model: trmteb/turkish-embedding-model-fine-tuned...
   This may take a few minutes on first run...



The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/229 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/205 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/583 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/442M [00:00<?, ?B/s]

tokenizer_config.json: 0.00B [00:00, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/695 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/296 [00:00<?, ?B/s]

‚úÖ Model loaded successfully!
   Embedding dimension: 768


## 8. Re-embed Filtered Keywords

In [11]:
print(f"\nüîÑ Re-embedding filtered keywords...")

filtered_embeddings_768 = model.encode(
    filtered_keywords_list,
    batch_size=32,
    show_progress_bar=True,
    convert_to_numpy=True
)

print(f"\n‚úÖ Generated {filtered_embeddings_768.shape[0]} filtered embeddings")
print(f"   Shape: {filtered_embeddings_768.shape}")


üîÑ Re-embedding filtered keywords...


Batches:   0%|          | 0/318 [00:00<?, ?it/s]


‚úÖ Generated 10176 filtered embeddings
   Shape: (10176, 768)


## 9. Apply UMAP Dimensionality Reduction

In [12]:
print(f"\nüîÑ Fitting UMAP on all original embeddings...")
print(f"   Parameters: n_components={UMAP_N_COMPONENTS}, n_neighbors={UMAP_N_NEIGHBORS}, ")
print(f"              min_dist={UMAP_MIN_DIST}, metric='{UMAP_METRIC}', random_state={UMAP_RANDOM_STATE}")

reducer = umap.UMAP(
    n_components=UMAP_N_COMPONENTS,
    n_neighbors=UMAP_N_NEIGHBORS,
    min_dist=UMAP_MIN_DIST,
    metric=UMAP_METRIC,
    random_state=UMAP_RANDOM_STATE
)

original_reduced_embeddings = reducer.fit_transform(embeddings)
print(f"‚úÖ UMAP reduction complete: {original_reduced_embeddings.shape}")


üîÑ Fitting UMAP on all original embeddings...
   Parameters: n_components=50, n_neighbors=5, 
              min_dist=0.0, metric='cosine', random_state=42


  warn(


‚úÖ UMAP reduction complete: (27201, 50)


In [13]:
# Transform filtered embeddings using the same reducer
print(f"\nüîÑ Transforming filtered embeddings...")
filtered_reduced_embeddings_all = reducer.transform(filtered_embeddings_768)

# Create combined reduced embeddings (original + filtered for topic 251)
reduced_embeddings = original_reduced_embeddings.copy()
for idx, emb_idx in enumerate(topic_251_indices):
    if emb_idx < len(reduced_embeddings):
        reduced_embeddings[emb_idx] = filtered_reduced_embeddings_all[idx]

print(f"‚úÖ Combined reduced embeddings: {reduced_embeddings.shape}")


üîÑ Transforming filtered embeddings...
‚úÖ Combined reduced embeddings: (27201, 50)


## 10. Calculate Topic Centroids

In [14]:
def calculate_topic_centroids(
    speeches_dict: Dict,
    speech_id_to_index: Dict,
    reduced_embeddings: np.ndarray,
    exclude_topic_id: int = TARGET_TOPIC_ID
) -> Dict[int, np.ndarray]:
    """Calculate centroid embeddings for each topic using reduced embeddings."""
    print(f"\nüìä Calculating topic centroids...")

    topic_embeddings = {}

    # Group speeches by topic
    for speech_id, speech_data in speeches_dict.items():
        topic_id = speech_data.get('hdbscan_topic_id')
        if topic_id is None or topic_id == exclude_topic_id:
            continue

        embedding_idx = speech_data.get('embedding_index')
        if embedding_idx is None or embedding_idx >= len(reduced_embeddings):
            continue

        if topic_id not in topic_embeddings:
            topic_embeddings[topic_id] = []
        topic_embeddings[topic_id].append(reduced_embeddings[embedding_idx])

    # Calculate centroids
    centroids = {}
    for topic_id, embeddings_list in topic_embeddings.items():
        if embeddings_list:
            centroids[topic_id] = np.mean(embeddings_list, axis=0)

    print(f"‚úÖ Calculated centroids for {len(centroids)} topics")
    return centroids

topic_centroids = calculate_topic_centroids(
    speeches_dict, speech_id_to_index, original_reduced_embeddings, exclude_topic_id=TARGET_TOPIC_ID
)

# Calculate topic 251 centroid (using original embeddings)
topic_251_indices_list = [speeches_dict[sid].get('embedding_index') for sid in topic_251_speech_ids]
topic_251_original_embs = original_reduced_embeddings[topic_251_indices_list]
topic_251_centroid = np.mean(topic_251_original_embs, axis=0)

print(f"‚úÖ Calculated topic {TARGET_TOPIC_ID} centroid")


üìä Calculating topic centroids...
‚úÖ Calculated centroids for 252 topics
‚úÖ Calculated topic 251 centroid


## 11. Redistribute Speeches

In [25]:
def redistribute_speeches(
    topic_251_speech_ids: List[str],
    speeches_dict: Dict,
    speech_id_to_index: Dict,
    original_reduced_embeddings: np.ndarray,
    filtered_reduced_embeddings: np.ndarray,
    topic_centroids: Dict[int, np.ndarray],
    topic_251_centroid: np.ndarray,
    threshold: float = SIMILARITY_THRESHOLD,
    core_threshold: float = CORE_THRESHOLD
) -> Tuple[Dict[str, int], List[str]]:
    """
    Find best matching topic for each speech (VECTORIZED VERSION - much faster!).

    Uses matrix operations to calculate all similarities at once instead of loops.
    This will use more RAM but be orders of magnitude faster.

    IMPORTANT: Recalculates topic_251_centroid from filtered embeddings for fair comparison.

    Returns:
        Tuple of (redistributions_dict, to_recluster_list)
    """
    print(f"\nüîÑ Redistributing {len(topic_251_speech_ids):,} speeches (vectorized)...")

    # Extract all embedding indices for topic 251 speeches
    embedding_indices = []
    valid_speech_ids = []

    for speech_id in topic_251_speech_ids:
        speech_data = speeches_dict[speech_id]
        embedding_idx = speech_data.get('embedding_index')

        if embedding_idx is not None and embedding_idx < len(filtered_reduced_embeddings):
            embedding_indices.append(embedding_idx)
            valid_speech_ids.append(speech_id)

    if not embedding_indices:
        return {}, []

    embedding_indices = np.array(embedding_indices)
    n_speeches = len(embedding_indices)

    print(f"   Processing {n_speeches:,} valid speeches...")

    # Extract all embeddings at once (vectorized)
    filtered_embs = filtered_reduced_embeddings[embedding_indices]  # Shape: (n_speeches, 50)

    # RECALCULATE topic_251_centroid from FILTERED embeddings for fair comparison
    topic_251_filtered_centroid = np.mean(filtered_embs, axis=0)
    print(f"   Recalculated topic 251 centroid from filtered embeddings")

    # Normalize embeddings for cosine similarity (L2 normalization)
    filtered_embs_norm = filtered_embs / (np.linalg.norm(filtered_embs, axis=1, keepdims=True) + 1e-8)
    topic_251_centroid_norm = topic_251_filtered_centroid / (np.linalg.norm(topic_251_filtered_centroid) + 1e-8)

    # Calculate similarity to topic 251 for all speeches at once (using FILTERED embeddings)
    # Shape: (n_speeches,)
    topic_251_sims = np.dot(filtered_embs_norm, topic_251_centroid_norm)

    # Prepare topic centroids matrix
    topic_ids = sorted(topic_centroids.keys())
    centroids_matrix = np.array([topic_centroids[tid] for tid in topic_ids])  # Shape: (n_topics, 50)
    centroids_norm = centroids_matrix / (np.linalg.norm(centroids_matrix, axis=1, keepdims=True) + 1e-8)

    # Calculate similarity to all other topics for all speeches at once (vectorized)
    # Shape: (n_speeches, n_topics)
    all_topic_sims = np.dot(filtered_embs_norm, centroids_norm.T)

    # Find best matching topic for each speech (vectorized)
    best_topic_indices = np.argmax(all_topic_sims, axis=1)  # Shape: (n_speeches,)
    best_sims = all_topic_sims[np.arange(n_speeches), best_topic_indices]  # Shape: (n_speeches,)
    best_topic_ids = np.array([topic_ids[idx] for idx in best_topic_indices])

    # DIAGNOSTICS: Print similarity statistics
    print(f"\nüìä Similarity Statistics (using filtered embeddings):")
    print(f"   Topic 251 similarity - Min: {topic_251_sims.min():.4f}, Max: {topic_251_sims.max():.4f}, Mean: {topic_251_sims.mean():.4f}, Median: {np.median(topic_251_sims):.4f}")
    print(f"   Best other topic similarity - Min: {best_sims.min():.4f}, Max: {best_sims.max():.4f}, Mean: {best_sims.mean():.4f}, Median: {np.median(best_sims):.4f}")
    print(f"   Difference (best_other - topic_251) - Min: {(best_sims - topic_251_sims).min():.4f}, Max: {(best_sims - topic_251_sims).max():.4f}, Mean: {(best_sims - topic_251_sims).mean():.4f}")
    print(f"   Threshold: {threshold}, Core threshold: {core_threshold}")

    # ADAPTIVE THRESHOLD: Use a much smaller threshold based on actual distribution
    # Since mean difference is ~0.0015, use a threshold around 0.01-0.02
    adaptive_threshold = max(0.01, threshold * 0.1)  # Use 10% of original threshold or 0.01, whichever is larger
    print(f"   Using adaptive threshold: {adaptive_threshold:.4f}")

    # Decision logic (vectorized) - MORE AGGRESSIVE
    # Condition 1: Reassign if best_topic similarity > topic_251_sim + adaptive_threshold
    # OR if best_topic similarity > topic_251_sim (even slightly better)
    reassign_mask = (best_sims > topic_251_sims + adaptive_threshold)

    # Condition 2: Keep in 251 ONLY if topic_251 is clearly better than best_other
    # Use percentile-based: keep only top 20% most similar to topic 251
    similarity_percentile_80 = np.percentile(topic_251_sims, 80)
    keep_mask = (topic_251_sims >= similarity_percentile_80) & (best_sims <= topic_251_sims + adaptive_threshold)

    # Condition 3: Re-cluster if neither condition met
    recluster_mask = ~(reassign_mask | keep_mask)

    # Build results (vectorized)
    redistributions = {}
    to_recluster = []

    # Reassign speeches
    reassign_indices = np.where(reassign_mask)[0]
    for idx in reassign_indices:
        redistributions[valid_speech_ids[idx]] = int(best_topic_ids[idx])

    # Keep in topic 251
    keep_indices = np.where(keep_mask & ~reassign_mask)[0]
    for idx in keep_indices:
        redistributions[valid_speech_ids[idx]] = TARGET_TOPIC_ID

    # Mark for re-clustering
    recluster_indices = np.where(recluster_mask)[0]
    for idx in recluster_indices:
        to_recluster.append(valid_speech_ids[idx])

    kept_in_251 = len(keep_indices)
    reassigned = len(reassign_indices)

    print(f"\n‚úÖ Redistribution complete:")
    print(f"   Kept in topic {TARGET_TOPIC_ID}: {kept_in_251:,} (top 20% most similar)")
    print(f"   Reassigned to other topics: {reassigned:,}")
    print(f"   To be re-clustered: {len(to_recluster):,}")

    return redistributions, to_recluster

redistributions, to_recluster = redistribute_speeches(
    topic_251_speech_ids,
    speeches_dict,
    speech_id_to_index,
    original_reduced_embeddings,
    reduced_embeddings,
    topic_centroids,
    topic_251_centroid
)


üîÑ Redistributing 10,176 speeches (vectorized)...
   Processing 10,176 valid speeches...
   Recalculated topic 251 centroid from filtered embeddings

üìä Similarity Statistics (using filtered embeddings):
   Topic 251 similarity - Min: 0.9450, Max: 0.9999, Mean: 0.9975, Median: 0.9981
   Best other topic similarity - Min: 0.9570, Max: 0.9999, Mean: 0.9990, Median: 0.9992
   Difference (best_other - topic_251) - Min: -0.0001, Max: 0.0191, Mean: 0.0015
   Threshold: 0.1, Core threshold: 0.7
   Using adaptive threshold: 0.0100

‚úÖ Redistribution complete:
   Kept in topic 251: 2,036 (top 20% most similar)
   Reassigned to other topics: 161
   To be re-clustered: 7,979


## 12. Re-cluster Remaining Speeches

In [26]:
def re_cluster_remaining(
    to_recluster: List[str],
    speeches_dict: Dict,
    speech_id_to_index: Dict,
    filtered_reduced_embeddings: np.ndarray,
    min_cluster_size: int = HDBSCAN_MIN_CLUSTER_SIZE,
    min_samples: int = HDBSCAN_MIN_SAMPLES,
    metric: str = HDBSCAN_METRIC,
    cluster_selection_epsilon: float = HDBSCAN_EPSILON
) -> Dict[str, int]:
    """Re-cluster speeches that weren't redistributed using HDBSCAN."""
    if not to_recluster:
        return {}

    print(f"\nüîÑ Re-clustering {len(to_recluster):,} speeches...")

    # Get embeddings for speeches to re-cluster
    embeddings_to_cluster = []
    speech_id_mapping = []  # Map cluster index back to speech_id

    for speech_id in to_recluster:
        speech_data = speeches_dict[speech_id]
        embedding_idx = speech_data.get('embedding_index')

        if embedding_idx is not None and embedding_idx < len(filtered_reduced_embeddings):
            embeddings_to_cluster.append(filtered_reduced_embeddings[embedding_idx])
            speech_id_mapping.append(speech_id)

    if not embeddings_to_cluster:
        return {}

    embeddings_array = np.array(embeddings_to_cluster)

    # Run HDBSCAN
    print(f"   Parameters: min_cluster_size={min_cluster_size}, min_samples={min_samples}")
    print(f"              metric='{metric}', cluster_selection_epsilon={cluster_selection_epsilon}")

    clusterer = hdbscan.HDBSCAN(
        min_cluster_size=min_cluster_size,
        min_samples=min_samples,
        metric=metric,
        cluster_selection_epsilon=cluster_selection_epsilon,
        prediction_data=True
    )

    cluster_labels = clusterer.fit_predict(embeddings_array)

    # Find max existing topic ID
    max_topic_id = max([s.get('hdbscan_topic_id', 0) for s in speeches_dict.values() if s.get('hdbscan_topic_id')])
    start_topic_id = max_topic_id + 1

    # Create mapping: cluster_label -> new_topic_id
    unique_clusters = sorted(set(cluster_labels))
    cluster_to_topic = {}
    next_topic_id = start_topic_id

    for cluster_label in unique_clusters:
        if cluster_label == -1:
            # Outliers - assign to a special topic or keep as -1
            cluster_to_topic[cluster_label] = -1
        else:
            cluster_to_topic[cluster_label] = next_topic_id
            next_topic_id += 1

    # Create result mapping
    result = {}
    for idx, cluster_label in enumerate(cluster_labels):
        speech_id = speech_id_mapping[idx]
        new_topic_id = cluster_to_topic[cluster_label]
        result[speech_id] = new_topic_id

    n_clusters = len([c for c in unique_clusters if c != -1])
    n_outliers = sum(1 for c in cluster_labels if c == -1)

    print(f"‚úÖ Re-clustering complete:")
    print(f"   Created {n_clusters} new topics (IDs: {start_topic_id} to {next_topic_id - 1})")
    print(f"   Outliers: {n_outliers}")

    return result

recluster_assignments = {}
if to_recluster:
    recluster_assignments = re_cluster_remaining(
        to_recluster,
        speeches_dict,
        speech_id_to_index,
        reduced_embeddings
    )


üîÑ Re-clustering 7,979 speeches...
   Parameters: min_cluster_size=10, min_samples=5
              metric='euclidean', cluster_selection_epsilon=5.0
‚úÖ Re-clustering complete:
   Created 2 new topics (IDs: 252 to 253)
   Outliers: 57


## 13. Generate Topic Labels

In [27]:
def generate_topic_labels(
    speeches_dict: Dict,
    topic_assignments: Dict[str, int],
    n_keywords: int = 5
) -> Dict[int, str]:
    """Generate topic labels from top keywords."""
    print(f"\nüè∑Ô∏è  Generating topic labels...")

    # Group speeches by topic
    topic_speeches = {}
    for speech_id, topic_id in topic_assignments.items():
        if topic_id not in topic_speeches:
            topic_speeches[topic_id] = []
        topic_speeches[topic_id].append(speeches_dict[speech_id].get('keywords', ''))

    topic_labels = {}

    for topic_id, keywords_list in topic_speeches.items():
        # Extract all keywords
        all_keywords = []
        for keywords_str in keywords_list:
            if keywords_str:
                keywords = [k.strip() for k in str(keywords_str).split(',')]
                all_keywords.extend([k for k in keywords if k.strip()])

        # Count keyword frequencies
        keyword_counts = Counter(all_keywords)

        # Get top N keywords
        top_keywords = [kw for kw, count in keyword_counts.most_common(n_keywords)]

        # Create label
        if topic_id == -1:
            topic_labels[topic_id] = "Outliers"
        elif top_keywords:
            topic_labels[topic_id] = ", ".join(top_keywords)
        else:
            topic_labels[topic_id] = f"Topic {topic_id}"

    print(f"‚úÖ Generated labels for {len(topic_labels)} topics")
    return topic_labels

# Generate labels for all reassigned/reclustered topics
all_assignments = {**redistributions, **recluster_assignments}
topic_labels = generate_topic_labels(speeches_dict, all_assignments)

# Display new topic labels
print("\nüìã New Topic Labels:")
for topic_id in sorted(set(all_assignments.values())):
    if topic_id != TARGET_TOPIC_ID:  # Skip topic 251 (already has label)
        label = topic_labels.get(topic_id, f"Topic {topic_id}")
        count = sum(1 for v in all_assignments.values() if v == topic_id)
        print(f"   Topic {topic_id:3d} ({count:5,} speeches): {label}")


üè∑Ô∏è  Generating topic labels...
‚úÖ Generated labels for 13 topics

üìã New Topic Labels:
   Topic  -1 (   57 speeches): Outliers
   Topic   2 (   95 speeches): Turizm, turizm, Mersin, tarih, Antalya
   Topic   4 (    1 speeches): sosyal demokrasi, birle≈üme, b√ºt√ºnle≈üme, iktidar √ßevreleri, huzursuzluk
   Topic  56 (   19 speeches): TBMM, Halklarƒ±n Demokratik Partisi, Turizm, Mersin, vatanda≈ü
   Topic  71 (    1 speeches): Mara≈ü katliamƒ±, anne, bebek, nine, unutulmazlar
   Topic  85 (   11 speeches): demokrasi, esnaf, millet, te≈üekk√ºr, m√ºzakere
   Topic  86 (   13 speeches): Cumhuriyet Halk Partisi, Tarƒ±m, konu≈üma, Hakk√¢ri, Fabrikalar
   Topic 108 (   16 speeches): otizm, saƒülƒ±k hizmeti, eƒüitim, rehabilitasyon, Milliyet√ßi Hareket Partisi
   Topic 171 (    2 speeches): aile, koruma, g√º√ßlendirme, Aile Haftasƒ±, toplum
   Topic 221 (    3 speeches): Kahramanmara≈ü, Otoyol, Cinnah Caddesi, Antalya, Burak
   Topic 252 (   11 speeches): ≈üehit, millet, gazi, ≈ûehitle

## 14. Update Elasticsearch

In [28]:
def update_elasticsearch(
    es: Elasticsearch,
    index: str,
    redistributions: Dict[str, int],
    recluster_assignments: Dict[str, int],
    topic_labels: Dict[int, str]
):
    """Bulk update Elasticsearch with new topic assignments."""
    print(f"\nüíæ Updating Elasticsearch...")

    # Combine all assignments
    all_assignments = {**redistributions, **recluster_assignments}

    if not all_assignments:
        print("   No assignments to update")
        return

    # Prepare bulk update actions
    actions = []
    for speech_id, new_topic_id in all_assignments.items():
        new_label = topic_labels.get(new_topic_id, f"Topic {new_topic_id}")

        actions.append({
            '_op_type': 'update',
            '_index': index,
            '_id': speech_id,
            'doc': {
                'hdbscan_topic_id': int(new_topic_id),
                'hdbscan_topic_label': new_label
            }
        })

    # Bulk update
    success_count = 0
    error_count = 0

    for i in tqdm(range(0, len(actions), 500), desc="Updating ES"):
        batch = actions[i:i+500]
        try:
            success, errors = helpers.bulk(es, batch, raise_on_error=False)
            success_count += success
            if errors:
                error_count += len(errors)
        except Exception as e:
            print(f"   ‚ö†Ô∏è  Error in batch {i}: {e}")
            error_count += len(batch)

    print(f"‚úÖ Elasticsearch update complete:")
    print(f"   Successfully updated: {success_count:,}")
    if error_count > 0:
        print(f"   Errors: {error_count:,}")

update_elasticsearch(es, ELASTICSEARCH_INDEX, redistributions, recluster_assignments, topic_labels)


üíæ Updating Elasticsearch...


Updating ES:   0%|          | 0/21 [00:00<?, ?it/s]

‚úÖ Elasticsearch update complete:
   Successfully updated: 10,176


## 15. Update Embeddings File

In [29]:
# Backup original embeddings
backup_path = EMBEDDINGS_FILE.replace('.npy', '_backup.npy')
print(f"\nüíæ Creating backup: {backup_path}")
np.save(backup_path, embeddings)
print(f"‚úÖ Backup created")

# Update embeddings file with filtered embeddings for topic 251 speeches
print(f"\nüíæ Updating embeddings file...")
updated_embeddings = embeddings.copy()

for idx, emb_idx in enumerate(topic_251_indices):
    if emb_idx < len(updated_embeddings) and idx < len(filtered_embeddings_768):
        updated_embeddings[emb_idx] = filtered_embeddings_768[idx]

# Save updated file
np.save(EMBEDDINGS_FILE, updated_embeddings)
print(f"‚úÖ Updated embeddings file: {EMBEDDINGS_FILE}")


üíæ Creating backup: /content/drive/MyDrive/492-data/keyword_embeddings_backup.npy
‚úÖ Backup created

üíæ Updating embeddings file...
‚úÖ Updated embeddings file: /content/drive/MyDrive/492-data/keyword_embeddings.npy


## 16. Upload to Google Drive

In [30]:
from google.colab import drive
import shutil

# Mount drive if not already mounted
try:
    drive.mount('/content/drive', force_remount=False)
except:
    pass  # Already mounted

# Determine destination path
dest_path = f"/content/drive/MyDrive/{Path(EMBEDDINGS_FILE).name}"

# Copy file
shutil.copy(EMBEDDINGS_FILE, dest_path)
print(f"‚úÖ Uploaded to Google Drive: {dest_path}")

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
‚úÖ Uploaded to Google Drive: /content/drive/MyDrive/keyword_embeddings.npy


## Summary

In [31]:
print("\n" + "="*60)
print("REFINEMENT SUMMARY")
print("="*60)
print(f"\n‚úÖ Topic {TARGET_TOPIC_ID} refinement complete!")
print(f"\nüìä Results:")
print(f"   Original speeches in topic {TARGET_TOPIC_ID}: {len(topic_251_speech_ids):,}")
print(f"   Kept in topic {TARGET_TOPIC_ID}: {sum(1 for v in redistributions.values() if v == TARGET_TOPIC_ID):,}")
print(f"   Reassigned to other topics: {sum(1 for v in redistributions.values() if v != TARGET_TOPIC_ID):,}")
print(f"   Re-clustered into new topics: {len(recluster_assignments):,}")
print(f"\nüìÅ Files:")
print(f"   Updated embeddings: {EMBEDDINGS_FILE}")
print(f"   Backup: {backup_path}")
print(f"\n‚úÖ All done!")


REFINEMENT SUMMARY

‚úÖ Topic 251 refinement complete!

üìä Results:
   Original speeches in topic 251: 10,176
   Kept in topic 251: 2,036
   Reassigned to other topics: 161
   Re-clustered into new topics: 7,979

üìÅ Files:
   Updated embeddings: /content/drive/MyDrive/492-data/keyword_embeddings.npy
   Backup: /content/drive/MyDrive/492-data/keyword_embeddings_backup.npy

‚úÖ All done!
