# Parliament Speech Topic Analysis with BERTopic and Groq LLM

This notebook performs topic modeling on Turkish parliament speeches using:
1. **BERTopic** - For automatic topic discovery
2. **Groq LLM** - For generating human-readable topic names in Turkish
3. **Elasticsearch** - For storing and updating speech documents

## Workflow:
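1. Connect to Elasticsearch
2. Fetch all speeches
3. Train BERTopic model
4. Update Elasticsearch with topic assignments
5. Save topic details (keywords and representative documents) to CSV
6. Generate readable topic names with Groq LLM (optional)
7. Export the topic summary to CSV and print the top topics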

## 1. Installation & Setup

Install required packages (run this first in Google Colab):

In [None]:
# Install required packages
!pip install -q elasticsearch bertopic groq pandas plotly python-dotenv

## 2. Import Libraries

In [None]:
import os
import sys
import time
import csv
import re
from typing import List, Dict, Tuple, Optional
from elasticsearch import Elasticsearch, helpers
from elasticsearch.exceptions import ConnectionError, NotFoundError
from bertopic import BERTopic
import pandas as pd
import plotly.express as px
from groq import Groq

print("‚úÖ All libraries imported successfully!")

## 3. Configuration

Set your environment variables here:

In [None]:
# Elasticsearch Configuration
ELASTICSEARCH_HOST = "http://localhost:9200"  # Change this to your ES host
ELASTICSEARCH_INDEX = "parliament_speeches"

# File paths
MODEL_SAVE_PATH = "./bertopic_model"
TOPIC_SUMMARY_FILE = "./topic_summary.csv"
TOPIC_DETAILS_FILE = "./topic_details.csv"

# Processing configuration
BATCH_SIZE = 1000

# LLM Configuration
GROQ_API_KEY = ""  # Set your Groq API key here, or leave empty to use the environment
if not GROQ_API_KEY:
    # Fall back to a key that is already present in the environment instead
    # of overwriting it with an empty string further down. This also avoids
    # having to paste the secret into the notebook itself.
    GROQ_API_KEY = os.environ.get("GROQ_API_KEY", "")
GROQ_MODEL = "llama-3.1-70b-versatile"  # or your preferred model
USE_LLM_NAMING = True  # Set to False to skip LLM naming

# Export the effective configuration as environment variables
os.environ["ELASTICSEARCH_HOST"] = ELASTICSEARCH_HOST
os.environ["ELASTICSEARCH_INDEX"] = ELASTICSEARCH_INDEX
os.environ["GROQ_API_KEY"] = GROQ_API_KEY
os.environ["GROQ_MODEL"] = GROQ_MODEL

print("‚úÖ Configuration set!")
print(f"   Elasticsearch: {ELASTICSEARCH_HOST}")
print(f"   Index: {ELASTICSEARCH_INDEX}")
print(f"   LLM Naming: {'Enabled' if USE_LLM_NAMING and GROQ_API_KEY else 'Disabled'}")

## 4. Helper Functions

### 4.1 Elasticsearch Connection

In [None]:
def connect_to_elasticsearch() -> Elasticsearch:
    """
    Open a client for ELASTICSEARCH_HOST and check that the cluster answers.

    On success the document count of ELASTICSEARCH_INDEX is printed.

    Returns:
        Elasticsearch client instance

    Raises:
        Exception: if the ping fails or any client call raises; the error is
            printed and then re-raised unchanged.
    """
    print(f"üîå Connecting to Elasticsearch at {ELASTICSEARCH_HOST}...")

    try:
        client = Elasticsearch(hosts=[ELASTICSEARCH_HOST])

        # Guard clause: an unreachable cluster is reported through the same
        # handler as any other connection problem.
        if not client.ping():
            raise Exception("Ping failed")

        total_docs = client.count(index=ELASTICSEARCH_INDEX).get('count', 0)
        print(f"‚úÖ Connected to Elasticsearch")
        print(f"üìä Index: {ELASTICSEARCH_INDEX}")
        print(f"üìä Total documents: {total_docs:,}")
        return client

    except Exception as err:
        print(f"‚ùå Failed to connect to Elasticsearch: {err}")
        print(f"   Make sure Elasticsearch is running on {ELASTICSEARCH_HOST}")
        raise

### 4.2 Fetch Speeches from Elasticsearch

In [None]:
def fetch_all_speeches(es: Elasticsearch) -> List[Dict]:
    """
    Fetch all speeches from Elasticsearch using the scroll API.

    Only documents that have a non-blank ``content`` field are returned.

    Args:
        es: Elasticsearch client instance

    Returns:
        List of speech dictionaries with id, content, and metadata. An empty
        list is returned if any error occurs while fetching (the error is
        printed, not raised).
    """
    print(f"\nüì• Fetching speeches from Elasticsearch...")

    query = {
        "query": {
            "bool": {
                "must": [
                    {"exists": {"field": "content"}},
                ],
                "must_not": [
                    {"term": {"content": ""}},
                ]
            }
        },
        "size": BATCH_SIZE,
        "_source": [
            "content", "speech_giver", "term", "year",
            "session_date", "session_id", "speech_no",
            "province", "political_party", "speech_title"
        ]
    }

    speeches = []
    scroll_id = None
    batch_count = 0

    try:
        response = es.search(
            index=ELASTICSEARCH_INDEX,
            body=query,
            scroll='5m'
        )

        scroll_id = response['_scroll_id']
        hits = response['hits']['hits']

        # An empty page marks the end of the scroll.
        while hits:
            batch_count += 1
            print(f"Batch {batch_count}: Processing {len(hits)} speeches...")

            for hit in hits:
                source = hit['_source']
                content = source.get('content')

                # Skip documents whose content is missing or whitespace only.
                if not (content and content.strip()):
                    continue

                speeches.append({
                    'id': hit['_id'],
                    'content': content,
                    'speech_giver': source.get('speech_giver', ''),
                    'term': source.get('term'),
                    'year': source.get('year'),
                    'session_date': source.get('session_date'),
                    'session_id': source.get('session_id'),
                    'speech_no': source.get('speech_no'),
                    'province': source.get('province'),
                    'political_party': source.get('political_party'),
                    'speech_title': source.get('speech_title')
                })

            response = es.scroll(scroll_id=scroll_id, scroll='5m')
            scroll_id = response['_scroll_id']
            hits = response['hits']['hits']

        print(f"‚úÖ Successfully fetched {len(speeches):,} speeches with valid content")
        return speeches

    except Exception as e:
        print(f"‚ùå Error fetching speeches: {e}")
        return []

    finally:
        # Best-effort release of the server-side scroll context. Only
        # ordinary errors are suppressed so that KeyboardInterrupt and
        # SystemExit still propagate.
        if scroll_id:
            try:
                es.clear_scroll(scroll_id=scroll_id)
            except Exception as cleanup_error:
                print(f"‚ö†Ô∏è  Could not clear scroll context: {cleanup_error}")

### 4.3 Run BERTopic Modeling

In [None]:
def run_topic_modeling(speeches: List[Dict]) -> Tuple[List[int], BERTopic]:
    """
    Run BERTopic modeling on speech content.

    The fitted model is saved to MODEL_SAVE_PATH and a short summary
    (number of topics, number of outlier speeches) is printed.

    Args:
        speeches: List of speech dictionaries containing 'content' field

    Returns:
        Tuple of (topics, topic_model), where ``topics`` holds one topic id
        per input speech, in the same order as ``speeches``.
    """
    print(f"\n‚öôÔ∏è  Training BERTopic model on {len(speeches):,} speeches...")
    print("   This may take several minutes depending on your hardware...")

    contents = [speech['content'] for speech in speeches]

    topic_model = BERTopic(
        language="turkish",
        nr_topics=250,
        verbose=True,
        calculate_probabilities=False,
        min_topic_size=3,
    )

    topics, _ = topic_model.fit_transform(contents)

    topic_model.save(MODEL_SAVE_PATH)
    print(f"‚úÖ Model trained and saved to {MODEL_SAVE_PATH}")

    topic_info = topic_model.get_topic_info()
    num_topics = len(topic_info[topic_info['Topic'] != -1])
    # Count by iteration: `topics` may be a plain Python list, for which
    # `topics == -1` is a single bool and has no `.sum()`.
    outlier_count = sum(1 for topic_id in topics if topic_id == -1)
    print(f"üìä Discovered {num_topics} topics (excluding outliers)")
    print(f"üìä Outliers: {outlier_count} speeches")

    return topics, topic_model

### 4.4 Update Elasticsearch with Topic Assignments

In [None]:
def update_elasticsearch_with_topics(
    es: Elasticsearch, 
    speeches: List[Dict], 
    topics: List[int],
    topic_model: BERTopic
) -> Tuple[int, int]:
    """
    Write each speech's topic id and BERTopic label back to Elasticsearch.

    One partial-update action is built per (speech, topic) pair and sent
    through the bulk helper.

    Args:
        es: Elasticsearch client instance
        speeches: Speech dictionaries; each must carry the document 'id'
        topics: Topic id per speech, aligned with ``speeches``
        topic_model: Fitted model used to look up the label of each topic

    Returns:
        Tuple of (success_count, failure_count). If the bulk call itself
        raises, every action is reported as failed.
    """
    print(f"\nüíæ Updating Elasticsearch with topic assignments...")

    info = topic_model.get_topic_info()
    label_by_topic = {
        int(topic): name
        for topic, name in zip(info['Topic'], info['Name'])
    }

    actions = [
        {
            '_op_type': 'update',
            '_index': ELASTICSEARCH_INDEX,
            '_id': speech['id'],
            'doc': {
                'topic_id': int(topic_id),
                # Topics missing from the model info get a generic label.
                'topic_label': label_by_topic.get(topic_id, f"Topic_{topic_id}"),
                'topic_analyzed': True
            }
        }
        for speech, topic_id in zip(speeches, topics)
    ]

    print(f"   Updating {len(actions):,} documents...")

    try:
        success, errors = helpers.bulk(
            es,
            actions,
            raise_on_error=False,
            chunk_size=500
        )
    except Exception as e:
        print(f"‚ùå Error during bulk update: {e}")
        return 0, len(actions)

    failed_count = len(errors) if errors else 0

    print(f"‚úÖ Successfully updated {success:,} documents")
    if failed_count > 0:
        print(f"‚ö†Ô∏è  Failed to update {failed_count} documents")

    return success, failed_count

### 4.5 Save Topic Details for LLM Processing

In [None]:
def save_topic_details(topic_model: BERTopic, output_file: str = TOPIC_DETAILS_FILE):
    """
    Save detailed topic information to CSV including keywords and representative docs.

    Two columns are added to the model's topic info table:
    ``Keywords`` (the first ten topic words, comma separated) and
    ``Representative_Docs`` (a JSON array of the topic's representative
    documents). The outlier topic (-1) gets "Outliers" and an empty array.

    Args:
        topic_model: Fitted BERTopic model
        output_file: Destination CSV path; parent directories are created

    Any error is printed and swallowed so that a failed export does not
    abort the caller.
    """
    # Local import keeps this cell self-contained.
    import json

    print(f"\nüìä Saving topic details for LLM processing...")

    try:
        topic_info = topic_model.get_topic_info()

        detailed_keywords = []
        representative_docs_list = []

        for topic_id in topic_info['Topic']:
            if topic_id == -1:
                detailed_keywords.append("Outliers")
                representative_docs_list.append("[]")
                continue

            words = topic_model.get_topic(topic_id)
            detailed_keywords.append(', '.join(word for word, _ in words[:10]))

            # JSON is used instead of str(list): a Python repr is not a
            # stable interchange format and cannot be split back reliably.
            try:
                rep_docs = topic_model.get_representative_docs(topic_id)
                serialized = json.dumps(list(rep_docs or []), ensure_ascii=False)
            except Exception:
                # Representative docs are optional; keep the topic row.
                serialized = "[]"
            representative_docs_list.append(serialized)

        topic_info['Keywords'] = detailed_keywords
        topic_info['Representative_Docs'] = representative_docs_list

        # A bare filename has no directory part; fall back to the cwd.
        os.makedirs(os.path.dirname(output_file) or '.', exist_ok=True)

        topic_info.to_csv(output_file, index=False, encoding='utf-8')
        print(f"‚úÖ Topic details saved to {output_file}")
        print(f"   Total topics: {len(topic_info)} (including outliers)")

    except Exception as e:
        print(f"‚ö†Ô∏è  Error saving topic details: {e}")

### 4.6 Groq LLM Topic Namer

In [None]:
class GroqTopicNamer:
    """Service for generating readable topic names using Groq LLM."""

    # Turkish has dotted and dotless "i" as separate letters; Python's
    # locale-independent upper()/lower() map them incorrectly
    # (i -> I instead of U+0130, I -> i instead of U+0131).
    _TR_UPPER = str.maketrans({"i": "\u0130", "\u0131": "I"})
    _TR_LOWER = str.maketrans({"I": "\u0131", "\u0130": "i"})

    def __init__(self, api_key: str, model: str = "llama-3.1-70b-versatile"):
        """
        Create the Groq client.

        Args:
            api_key: Groq API key
            model: Name of the chat model used for naming
        """
        self.api_key = api_key
        self.model = model
        self.client = Groq(api_key=self.api_key)

    def _capitalize_tr(self, word: str) -> str:
        """Return *word* with its first letter upper-cased and the rest lower-cased, using Turkish casing rules."""
        if not word:
            return word
        head = word[0].translate(self._TR_UPPER).upper()
        tail = word[1:].translate(self._TR_LOWER).lower()
        return head + tail

    @staticmethod
    def _parse_representative_docs(raw: str) -> List[str]:
        """
        Turn the ``Representative_Docs`` CSV cell back into a list of documents.

        Accepts a JSON array or a Python list literal (what ``str(list)``
        produces). Text that is neither falls back to the legacy behaviour:
        truncate to 3000 characters and split on '", "'.

        Args:
            raw: Cell content; ``None`` or blank yields an empty list

        Returns:
            Non-empty document strings, in their original order
        """
        # Local imports keep this cell self-contained.
        import ast
        import json

        text = (raw or "").strip()
        if not text:
            return []

        # Parse the full text first: truncating beforehand would leave an
        # unterminated literal that no parser can read.
        for loader in (json.loads, ast.literal_eval):
            try:
                parsed = loader(text)
            except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
                continue
            if isinstance(parsed, (list, tuple)):
                docs = [str(doc).strip() for doc in parsed if doc]
                return [doc for doc in docs if doc]

        # Legacy fallback for text that is not a well-formed list.
        if len(text) > 3000:
            text = text[:3000]
        text = text.strip('[]')
        if not text:
            return []
        docs = [doc.strip(' "') for doc in text.split('", "')]
        return [doc for doc in docs if doc]

    def _build_prompt(self, keywords: str, representative_docs: List[str]) -> str:
        """
        Build the Turkish naming prompt.

        Args:
            keywords: Comma separated topic keywords
            representative_docs: Example speeches; only the first three are
                used, each cut to 200 characters

        Returns:
            The complete user prompt
        """
        docs_text = "".join(
            f"{i}. {doc[:200]}...\n\n"
            for i, doc in enumerate(representative_docs[:3], 1)
        )

        prompt = f"""Sen T√ºrkiye B√ºy√ºk Millet Meclisi konu≈ümalarƒ±nƒ± analiz eden bir uzmansƒ±n.
A≈üaƒüƒ±daki anahtar kelimeler ve √∂rnek konu≈üma metinlerinden yola √ßƒ±karak,
bu konuyu en iyi tanƒ±mlayan kƒ±sa ve anlamlƒ± bir ba≈ülƒ±k olu≈ütur. ba≈ülƒ±ƒüƒ± doƒürudan cevap olarak ver a√ßƒ±klama yapma. √ñrnek ba≈ülƒ±klar : Ekonomi ve B√ºt√ße Politikalarƒ±, Eƒüitim Sistemi ve √ñƒüretmenlik, Saƒülƒ±k Hizmetleri ve Tedavi, G√ºvenlik ve Ter√∂rle M√ºcadele, vb.

Anahtar Kelimeler: {keywords}

√ñrnek Konu≈ümalar:
{docs_text}

Sadece ba≈ülƒ±k ver, a√ßƒ±klama yapma. Ba≈ülƒ±k T√ºrk√ße olmalƒ± ve en fazla 5 kelime olmalƒ±.
Ba≈ülƒ±k:"""

        return prompt

    def _clean_topic_name(self, name: str) -> str:
        """
        Normalise a raw LLM answer into a title.

        Strips a leading label prefix and all quote characters, then
        capitalises every word with Turkish-aware casing.

        Args:
            name: Raw model output

        Returns:
            The cleaned, title-cased name (may be empty)
        """
        name = re.sub(r'^(Ba≈ülƒ±k:|Konu:|Topic:)\s*', '', name, flags=re.IGNORECASE)
        name = re.sub(r'["\']', '', name)
        name = name.strip()

        return ' '.join(self._capitalize_tr(word) for word in name.split())

    def generate_topic_name(
        self, 
        topic_id: int, 
        keywords: str, 
        representative_docs: List[str],
        max_retries: int = 3
    ) -> str:
        """
        Ask the LLM for a topic title, retrying on failure.

        Args:
            topic_id: Topic id, used only in log messages
            keywords: Comma separated topic keywords
            representative_docs: Example speeches for the prompt
            max_retries: Maximum number of attempts

        Returns:
            The cleaned LLM title if one longer than five characters was
            produced; otherwise a fallback built from the first keywords.
        """
        prompt = self._build_prompt(keywords, representative_docs)

        for attempt in range(max_retries):
            try:
                response = self.client.chat.completions.create(
                    model=self.model,
                    messages=[
                        {
                            "role": "system",
                            "content": "Sen T√ºrk√ße konu≈üan bir meclis uzmanƒ±sƒ±n. Kƒ±sa, a√ßƒ±k ve anlamlƒ± ba≈ülƒ±klar olu≈üturursun."
                        },
                        {
                            "role": "user",
                            "content": prompt
                        }
                    ],
                    temperature=0.3,
                    max_tokens=50,
                    top_p=1,
                    stream=False
                )

                generated_name = response.choices[0].message.content.strip()
                generated_name = self._clean_topic_name(generated_name)

                if generated_name and len(generated_name) > 5:
                    return generated_name
                print(f"   ‚ö†Ô∏è  Generated name too short for topic {topic_id}, retrying...")

            except Exception as e:
                print(f"   ‚ö†Ô∏è  Attempt {attempt + 1} failed for topic {topic_id}: {e}")
                # Exponential backoff, skipped after the final attempt.
                if attempt < max_retries - 1:
                    time.sleep(2 ** attempt)

        # Fallback: derive a name from the leading keywords.
        words = [self._capitalize_tr(w.strip()) for w in keywords.split(',')[:4]]
        return ' ve '.join(words) if len(words) <= 3 else ' '.join(words[:3])

    def process_topic_details_csv(self, csv_path: str) -> Dict[int, str]:
        """
        Generate a readable name for every topic listed in the details CSV.

        The CSV must provide the columns ``Topic``, ``Keywords`` and
        ``Representative_Docs``. The outlier topic (-1) is skipped.

        Args:
            csv_path: Path of the topic details CSV

        Returns:
            Mapping of topic id to generated name; empty if the file is
            missing or processing fails.
        """
        print(f"\nü§ñ Generating readable topic names with Groq LLM...")
        print(f"   Model: {self.model}")

        topic_mapping = {}

        try:
            # Representative docs can exceed the csv module's default field
            # limit; raise it as far as the platform's C long allows.
            max_field_size = sys.maxsize
            while True:
                try:
                    csv.field_size_limit(max_field_size)
                    break
                except OverflowError:
                    max_field_size //= 10

            with open(csv_path, 'r', encoding='utf-8') as f:
                topics = list(csv.DictReader(f))

            total = len(topics)
            print(f"   Processing {total} topics...")

            for idx, row in enumerate(topics, 1):
                topic_id = int(row['Topic'])

                if topic_id == -1:
                    print(f"   [{idx}/{total}] Skipping outlier topic -1")
                    continue

                # DictReader yields None for fields missing from a short row.
                keywords = row.get('Keywords') or ''
                rep_docs = self._parse_representative_docs(row.get('Representative_Docs'))

                print(f"   [{idx}/{total}] Topic {topic_id}: Generating name...")
                readable_name = self.generate_topic_name(topic_id, keywords, rep_docs)
                topic_mapping[topic_id] = readable_name

                print(f"   ‚úÖ Topic {topic_id}: \"{readable_name}\"")

                # Small pause between requests to stay under API rate limits.
                time.sleep(0.1)

            print(f"\n‚úÖ Successfully generated {len(topic_mapping)} topic names")
            return topic_mapping

        except FileNotFoundError:
            print(f"‚ùå Error: topic_details.csv not found at {csv_path}")
            return {}
        except Exception as e:
            print(f"‚ùå Error processing topics: {e}")
            return {}

### 4.7 Update Elasticsearch with Groq-Generated Names

In [None]:
def update_elasticsearch_topic_labels(
    es: Elasticsearch, 
    topic_mapping: Dict[int, str],
    index: str = ELASTICSEARCH_INDEX
) -> int:
    """
    Replace the stored topic label of every document with its readable name.

    One update-by-query request is issued per topic. A failure for one
    topic is printed and does not stop the remaining topics.

    Args:
        es: Elasticsearch client instance
        topic_mapping: Readable name per topic id
        index: Index to update

    Returns:
        Total number of documents reported as updated
    """
    print(f"\nüíæ Updating Elasticsearch with readable topic names...")

    total_updated = 0

    for topic_id, readable_name in topic_mapping.items():
        request_body = {
            "script": {
                "source": "ctx._source.topic_label = params.new_label",
                "lang": "painless",
                "params": {
                    "new_label": readable_name
                }
            },
            "query": {
                "term": {"topic_id": topic_id}
            }
        }

        try:
            result = es.update_by_query(
                index=index,
                body=request_body,
                conflicts='proceed',
                refresh=True
            )

            updated = result.get('updated', 0)
            total_updated += updated

            if updated > 0:
                print(f"   ‚úÖ Topic {topic_id}: Updated {updated:,} documents to \"{readable_name}\"")

        except Exception as e:
            print(f"   ‚ùå Error updating topic {topic_id}: {e}")

    print(f"\n‚úÖ Total documents updated: {total_updated:,}")
    return total_updated

### 4.8 Export Topic Summary to CSV

In [None]:
def export_topic_summary(
    speeches: List[Dict],
    topics: List[int],
    topic_model: BERTopic,
    exclude_outliers: bool = True,
    groq_topic_mapping: Optional[Dict[int, str]] = None
) -> pd.DataFrame:
    """
    Create and export topic summary CSV for backup/analysis.

    Speeches are grouped by speaker and topic; each group reports the
    number of speeches plus the distinct terms and years it covers. The
    result is written to TOPIC_SUMMARY_FILE.

    Args:
        speeches: Speech dictionaries as returned by the fetch step
        topics: Topic id per speech, aligned with ``speeches``
        topic_model: Fitted model used to look up BERTopic labels
        exclude_outliers: Drop speeches assigned to topic -1
        groq_topic_mapping: Optional readable name per topic id; adds a
            ``groq_topic_label`` column when given

    Returns:
        The summary DataFrame, sorted by speech count (descending)
    """
    print(f"\nüìä Creating topic summary...")

    df = pd.DataFrame(speeches)
    df['topic_id'] = topics

    if exclude_outliers:
        original_count = len(df)
        df = df[df['topic_id'] != -1].copy()
        excluded_count = original_count - len(df)
        if excluded_count > 0:
            print(f"   Excluding {excluded_count:,} outlier speeches (topic_id -1)")

    topic_info = topic_model.get_topic_info()
    topic_labels = {
        int(row['Topic']): row['Name'] 
        for _, row in topic_info.iterrows()
    }
    df['topic_label'] = df['topic_id'].map(topic_labels)

    # Add Groq-generated topic names if available
    if groq_topic_mapping:
        df['groq_topic_label'] = df['topic_id'].map(groq_topic_mapping)
        print(f"   Added Groq-generated topic names for {df['groq_topic_label'].notna().sum():,} speeches")

    # Create summary by MP and topic
    groupby_cols = ['speech_giver', 'topic_id', 'topic_label']
    if groq_topic_mapping:
        groupby_cols.append('groq_topic_label')

    # dropna=False keeps groups whose key is missing (e.g. outliers with no
    # Groq name, or speeches with a null speaker). The default would drop
    # those rows from the summary without any notice.
    summary = df.groupby(groupby_cols, dropna=False).agg({
        'id': 'count',
        'term': lambda x: list(set(x.dropna())),
        'year': lambda x: list(set(x.dropna()))
    }).reset_index()

    summary.rename(columns={'id': 'speech_count'}, inplace=True)
    summary = summary.sort_values('speech_count', ascending=False)

    summary.to_csv(TOPIC_SUMMARY_FILE, index=False)
    print(f"‚úÖ Topic summary saved to {TOPIC_SUMMARY_FILE}")
    print(f"   Total rows: {len(summary):,}")
    print(f"   Unique topics: {summary['topic_id'].nunique()}")
    print(f"   Unique MPs: {summary['speech_giver'].nunique()}")

    return summary

### 4.9 Visualize Top Topics

In [None]:
def visualize_top_topics(topic_model: BERTopic, n_topics: int = 10):
    """
    Print the largest topics by speech count.

    The outlier topic (-1) is left out. Any error is printed and swallowed.

    Args:
        topic_model: Fitted BERTopic model
        n_topics: Number of topics to list
    """
    print(f"\nüìà Generating topic visualization...")

    try:
        info = topic_model.get_topic_info()
        real_topics = info[info['Topic'] != -1]
        largest = real_topics.nlargest(n_topics, 'Count')

        print(f"\nüèÜ Top {n_topics} Topics:")
        print("=" * 80)
        for topic, name, count in zip(largest['Topic'], largest['Name'], largest['Count']):
            print(f"Topic {topic}: {name}")
            print(f"   Count: {count:,} speeches")
            print()

    except Exception as e:
        print(f"‚ö†Ô∏è  Could not generate visualization: {e}")

## 5. Main Execution

Run the complete pipeline:

In [None]:
def main():
    """
    Run the whole pipeline end to end.

    Steps: connect, fetch speeches, train the topic model, write topic
    assignments to Elasticsearch, save topic details, optionally generate
    LLM names, export the summary CSV and print the top topics.
    """
    banner = "=" * 80
    llm_enabled = USE_LLM_NAMING and GROQ_API_KEY

    print(banner)
    print("PARLIAMENT SPEECH TOPIC ANALYSIS")
    print(banner)

    # Step 1: Connect to Elasticsearch
    es = connect_to_elasticsearch()

    # Step 2: Fetch all speeches
    speeches = fetch_all_speeches(es)
    if not speeches:
        print("‚ùå No speeches found. Exiting.")
        return

    # Step 3: Run topic modeling
    topics, topic_model = run_topic_modeling(speeches)

    # Step 4: Update Elasticsearch with results
    success, _failed = update_elasticsearch_with_topics(
        es, speeches, topics, topic_model
    )

    # Step 5: Save detailed topic information for LLM
    save_topic_details(topic_model, TOPIC_DETAILS_FILE)

    # Step 6: Generate readable topic names with LLM (optional)
    topic_mapping = None
    if not llm_enabled:
        if not GROQ_API_KEY:
            print("\n‚ö†Ô∏è  Skipping LLM naming: GROQ_API_KEY not set")
            print("   Set GROQ_API_KEY variable to enable")
        else:
            print("\n‚ö†Ô∏è  Skipping LLM naming: USE_LLM_NAMING=False")
    else:
        try:
            print("\n" + banner)
            print("ü§ñ LLM TOPIC NAME GENERATION")
            print(banner)

            namer = GroqTopicNamer(api_key=GROQ_API_KEY, model=GROQ_MODEL)
            generated = namer.process_topic_details_csv(TOPIC_DETAILS_FILE)

            if generated:
                updated_count = update_elasticsearch_topic_labels(es, generated, ELASTICSEARCH_INDEX)

                print("\n‚úÖ LLM naming complete!")
                print(f"üìä Generated names for {len(generated)} topics")
                print(f"üìä Updated {updated_count:,} documents in Elasticsearch")
                # Only a fully processed mapping is handed to the export.
                topic_mapping = generated
            else:
                print("‚ö†Ô∏è  No topic mappings generated, skipping ES update")

        except Exception as e:
            print(f"‚ö†Ô∏è  LLM naming failed: {e}")
            print("   Continuing with keyword-based labels")
            topic_mapping = None

    # Step 7: Export summary to CSV (backup) - after LLM naming if enabled
    summary = export_topic_summary(speeches, topics, topic_model, groq_topic_mapping=topic_mapping)

    # Step 8: Show top topics
    visualize_top_topics(topic_model, n_topics=10)

    print("\n" + banner)
    print("‚úÖ TOPIC ANALYSIS COMPLETE!")
    print(banner)
    print(f"üìä Total speeches analyzed: {len(speeches):,}")
    print(f"üìä Documents updated in ES: {success:,}")
    print(f"üìä Model saved to: {MODEL_SAVE_PATH}")
    print(f"üìä Topic details saved to: {TOPIC_DETAILS_FILE}")
    print(f"üìä Summary saved to: {TOPIC_SUMMARY_FILE}")
    if llm_enabled:
        print(f"ü§ñ LLM-generated names: Enabled")
    print(banner)

# Run the main function
main()

## 6. View Results

Display the topic summary:

In [None]:
# Load the summary CSV written by the export step and show a preview.
summary_df = pd.read_csv(TOPIC_SUMMARY_FILE)
print(f"\nTotal rows in summary: {len(summary_df):,}")
print(f"Columns: {list(summary_df.columns)}")
print("\nFirst 10 rows:")
# Must remain the last expression of the cell: the notebook renders the
# value of a trailing bare expression as a table.
summary_df.head(10)

## 7. Download Files (for Google Colab)

Download the generated files to your local machine:

In [None]:
# Uncomment to download files in Google Colab
# from google.colab import files
# files.download(TOPIC_SUMMARY_FILE)
# files.download(TOPIC_DETAILS_FILE)