# Speech Keyword Extraction using Aya Expanse 8B

This notebook extracts 10 keywords from each parliament speech using the Aya Expanse 8B language model.
Keywords prioritize topic-related words and are saved to a CSV file with speech_id and keywords columns.

**Key Features:**
- ⚡ **Batch processing** for 10-30x speedup (optimized for 45GB GPU with batch_size=32)
- 💾 **Auto-saves to Elasticsearch** every 100 speeches (no data loss on interruption)
- 🔄 **Resume mode**: Automatically skips already processed speeches when re-run
- 🎯 **Topic-aware**: Uses topic labels to extract more relevant keywords

**Elasticsearch Fields Created:**
- `keywords`: Array of keyword strings
- `keywords_str`: Comma-separated keyword string

## Requirements:
- transformers library
- torch
- elasticsearch
- pandas
- tqdm (for progress bars)

In [None]:
# Check if keywords field already exists in Elasticsearch
#
# This cell sits above the setup cell, so it brings in what it needs itself
# instead of failing with a NameError that the broad `except` below would
# report as an Elasticsearch problem.
import os

from elasticsearch import Elasticsearch

# Reuse the notebook-wide settings when they are already defined; otherwise
# fall back to the same environment variables and defaults. Separate names are
# used so this cell never overwrites the notebook configuration.
check_host = globals().get("ELASTICSEARCH_HOST") or os.getenv("ELASTICSEARCH_HOST", "http://localhost:9200")
check_index = globals().get("ELASTICSEARCH_INDEX") or os.getenv("ELASTICSEARCH_INDEX", "parliament_speeches")

print("üîç Checking for existing keywords in Elasticsearch...\n")

try:
    es_check = Elasticsearch(hosts=[check_host])
    
    if es_check.ping():
        # Check total documents
        total_count = es_check.count(index=check_index)
        print(f"Total documents in index: {total_count['count']:,}\n")
        
        # Query for documents with keywords.
        # track_total_hits is required for an exact count: without it
        # Elasticsearch stops counting at 10,000 hits and the progress
        # figures below would be wrong on a large index.
        query_with_kw = {
            'query': {'exists': {'field': 'keywords'}},
            'size': 3,
            'track_total_hits': True,
            '_source': ['speech_giver', 'keywords', 'keywords_str', 'groq_topic_label', 'year']
        }
        
        result = es_check.search(index=check_index, body=query_with_kw)
        docs_with_kw = result['hits']['total']['value']
        
        print(f"üìä Documents WITH keywords: {docs_with_kw:,}")
        print(f"üìä Documents WITHOUT keywords: {total_count['count'] - docs_with_kw:,}\n")
        
        if docs_with_kw > 0:
            # docs_with_kw > 0 implies the index is non-empty, so the division is safe.
            percentage = (docs_with_kw / total_count['count']) * 100
            print(f"‚úÖ Progress: {percentage:.1f}% complete\n")
            print("üìã Example documents with keywords:\n" + "="*80)
            
            for i, hit in enumerate(result['hits']['hits'], 1):
                source = hit['_source']
                print(f"\nExample {i}:")
                print(f"  Speech ID: {hit['_id']}")
                print(f"  Speaker: {source.get('speech_giver', 'N/A')}")
                print(f"  Year: {source.get('year', 'N/A')}")
                print(f"  Topic: {source.get('groq_topic_label', 'N/A')}")
                
                if 'keywords' in source:
                    kw = source['keywords']
                    print(f"  Keywords (array): {kw}")
                    print(f"  Count: {len(kw)} keywords")
                
                if 'keywords_str' in source:
                    print(f"  Keywords (string): {source['keywords_str']}")
                print("-"*80)
        else:
            print("‚ùå No keywords found yet. Run the extraction process below.")
    else:
        print("‚ùå Cannot connect to Elasticsearch")
        
except Exception as e:
    # Purely informational cell: report the problem and let the notebook continue.
    print(f"‚ö†Ô∏è  Error checking Elasticsearch: {e}")
    print("   Will proceed with keyword extraction...")

## 1. Setup and Imports

In [None]:
import os
import sys
import pandas as pd
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM
from elasticsearch import Elasticsearch
from tqdm.auto import tqdm
from typing import List, Dict
import json
import time

# Notebook-wide settings. The Elasticsearch host and index can be overridden
# through environment variables of the same name.
ELASTICSEARCH_HOST = os.getenv("ELASTICSEARCH_HOST", "http://localhost:9200")
ELASTICSEARCH_INDEX = os.getenv("ELASTICSEARCH_INDEX", "parliament_speeches")
OUTPUT_CSV = "../data/speech_keywords.csv"
BATCH_SIZE = 1000  # Page size used when fetching speeches from Elasticsearch
MODEL_ID = "CohereLabs/aya-expanse-8b"

# Pick the compute device and report what was found.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

print(f"Using device: {device}")
if device == "cuda":
    # Device index 0 is the GPU being described.
    print(f"GPU: {torch.cuda.get_device_name(0)}")
    print(f"GPU Memory: {torch.cuda.get_device_properties(0).total_memory / 1e9:.2f} GB")

## 2. Load Aya Expanse 8B Model

In [None]:
print(f"Loading model: {MODEL_ID}...")
print("This may take a few minutes on first run...")

tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)

# Batched generation with a decoder-only model needs left padding and a pad
# token; fall back to the EOS token when the tokenizer defines none.
tokenizer.padding_side = 'left'
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

if device == "cuda":
    # Half precision on GPU, with automatic device placement.
    model = AutoModelForCausalLM.from_pretrained(
        MODEL_ID,
        torch_dtype=torch.float16,
        device_map="auto",
        low_cpu_mem_usage=True
    )
else:
    # Full precision without a device map, then an explicit move for the CPU case.
    model = AutoModelForCausalLM.from_pretrained(
        MODEL_ID,
        torch_dtype=torch.float32,
        device_map=None,
        low_cpu_mem_usage=True
    )
    if device == "cpu":
        model = model.to(device)

print("‚úÖ Model loaded successfully!")

## 3. Connect to Elasticsearch and Fetch Speeches

In [None]:
def connect_to_elasticsearch() -> Elasticsearch:
    """
    Open a client for ELASTICSEARCH_HOST and confirm the server responds.

    Prints the index name and its document count on success.

    Returns:
        The connected Elasticsearch client.

    Raises:
        Exception: If the ping fails or any client call raises; the error is
            printed and then re-raised.
    """
    print(f"üîå Connecting to Elasticsearch at {ELASTICSEARCH_HOST}...")

    try:
        client = Elasticsearch(hosts=[ELASTICSEARCH_HOST])

        # Guard clause: an unreachable server is handled by the except below.
        if not client.ping():
            raise Exception("Ping failed")

        total_docs = client.count(index=ELASTICSEARCH_INDEX).get('count', 0)
        print(f"‚úÖ Connected to Elasticsearch")
        print(f"üìä Index: {ELASTICSEARCH_INDEX}")
        print(f"üìä Total documents: {total_docs:,}")
        return client

    except Exception as e:
        print(f"‚ùå Failed to connect to Elasticsearch: {e}")
        print(f"   Make sure Elasticsearch is running on {ELASTICSEARCH_HOST}")
        raise

# Shared client used by the rest of the notebook
es = connect_to_elasticsearch()

In [None]:
def fetch_all_speeches(es: Elasticsearch, limit: int = None, skip_processed: bool = True) -> List[Dict]:
    """
    Fetch speeches from Elasticsearch using the scroll API.

    Only documents with a non-empty `content` field are returned. All matching
    speeches are accumulated in memory before returning.

    Args:
        es: Elasticsearch client
        limit: Optional limit on number of speeches to fetch (for testing).
            None or 0 means no limit.
        skip_processed: Skip speeches that already have a `keywords` field
            (for resuming an interrupted run)

    Returns:
        List of speech dictionaries with the keys speech_id, content,
        speech_giver, topic_label, groq_topic_label and year. An empty list is
        returned (after printing the error) if fetching fails.
    """
    print(f"\nüì• Fetching speeches from Elasticsearch...")
    if skip_processed:
        print("   Skipping speeches that already have keywords (resume mode)...")

    # Build query - optionally skip already processed speeches
    must_conditions = [{"exists": {"field": "content"}}]
    must_not_conditions = [{"term": {"content": ""}}]

    if skip_processed:
        # Skip speeches that already have keywords field
        must_not_conditions.append({"exists": {"field": "keywords"}})

    # A small test limit should not pull a full BATCH_SIZE page from the server.
    page_size = min(BATCH_SIZE, limit) if limit and limit > 0 else BATCH_SIZE

    query = {
        "query": {
            "bool": {
                "must": must_conditions,
                "must_not": must_not_conditions
            }
        },
        "size": page_size,
        "_source": [
            "content", "speech_giver", "term", "year",
            "session_date", "topic_label", "groq_topic_label"
        ]
    }

    speeches = []
    scroll_id = None
    batch_count = 0

    try:
        response = es.search(
            index=ELASTICSEARCH_INDEX,
            body=query,
            scroll='5m'
        )

        scroll_id = response['_scroll_id']
        hits = response['hits']['hits']

        while hits:
            batch_count += 1
            print(f"Batch {batch_count}: Processing {len(hits)} speeches...")

            for hit in hits:
                source = hit['_source']
                content = source.get('content')

                # Whitespace-only content is skipped here.
                if content and content.strip():
                    speeches.append({
                        'speech_id': hit['_id'],
                        'content': content,
                        'speech_giver': source.get('speech_giver', ''),
                        'topic_label': source.get('topic_label', ''),
                        'groq_topic_label': source.get('groq_topic_label', ''),
                        'year': source.get('year'),
                    })

            # Check if limit reached
            if limit and len(speeches) >= limit:
                speeches = speeches[:limit]
                break

            # Get next batch
            response = es.scroll(scroll_id=scroll_id, scroll='5m')
            scroll_id = response['_scroll_id']
            hits = response['hits']['hits']

        print(f"‚úÖ Successfully fetched {len(speeches):,} speeches")
        return speeches

    except Exception as e:
        print(f"‚ùå Error fetching speeches: {e}")
        return []

    finally:
        if scroll_id:
            try:
                es.clear_scroll(scroll_id=scroll_id)
            except Exception as cleanup_error:
                # Best effort only: the scroll context expires on its own
                # after the 5m keep-alive. A bare `except:` would also have
                # swallowed KeyboardInterrupt and SystemExit.
                print(f"   Note: could not clear scroll context: {cleanup_error}")

# Load the work list. Pass limit=10 for a quick test run; limit=None fetches
# everything. With skip_processed=True a re-run resumes where it left off.
speeches = fetch_all_speeches(es, limit=None, skip_processed=True)  # Change to limit=10 for testing
print(f"\nTotal speeches to process: {len(speeches):,}")

if not speeches:
    print("‚úÖ All speeches already have keywords! Nothing to process.")

## 4. Keyword Extraction Function

In [None]:
def extract_keywords_from_text(gen_text: str) -> str:
    """
    Turn raw model output into a clean, comma-separated keyword string.

    The answer is taken from the text after the last "Anahtar kelimeler:"
    marker (or from the whole text when the marker is absent). Chat special
    tokens are removed, only the first non-empty line is kept, and the
    comma-separated items are trimmed and re-joined with ", ".

    Args:
        gen_text: Decoded text produced by the model. It may be preceded by
            the echoed prompt.

    Returns:
        The cleaned keyword string, or a string starting with "ERROR:" when no
        usable keyword list (at least two comma-separated items) was found or
        the input could not be processed.
    """
    try:
        # List of special tokens to remove
        special_tokens = (
            '<|START_OF_TURN_TOKEN|>',
            '<|END_OF_TURN_TOKEN|>',
            '<|CHATBOT_TOKEN|>',
            '<|USER_TOKEN|>',
            '<|SYSTEM_TOKEN|>',
            '<BOS_TOKEN>',
            '<EOS_TOKEN>',
            '<s>',
            '</s>',
        )

        # The last occurrence of the marker is used because echoed input in
        # front of the answer may contain the marker as well. rpartition
        # yields the whole text as the tail when the marker is missing.
        _, _, answer = gen_text.rpartition("Anahtar kelimeler:")

        # Strip special tokens first so a line holding only such a token
        # cannot be mistaken for the keyword line.
        for token in special_tokens:
            answer = answer.replace(token, '')

        # Keep only the first non-empty line. splitlines() splits on real
        # line breaks; splitting on the two-character sequence backslash + n
        # never matched the model's actual newlines.
        keywords = next(
            (line.strip() for line in answer.splitlines() if line.strip()),
            '',
        )

        # Clean up extra whitespace and empty items between commas
        keywords = ', '.join(k.strip() for k in keywords.split(',') if k.strip())

        # Validate that we have actual content: a real list has a comma.
        if not keywords or len(keywords) < 3 or keywords.count(',') == 0:
            return "ERROR: No valid keywords generated"

        return keywords

    except Exception as e:
        # Deliberately never raises: callers store the error text as the result.
        return f"ERROR: Could not extract keywords - {str(e)}"

def extract_keywords_batch(speeches_batch: List[Dict], batch_size: int = 8) -> List[str]:
    """
    Extract keywords from multiple speeches at once (batch processing for speed).

    Relies on the notebook-level `tokenizer`, `model` and `device`.

    Args:
        speeches_batch: List of speech dictionaries. Each needs a 'content'
            key; 'groq_topic_label' is optional and, when set, is added to the
            prompt as topic context.
        batch_size: Maximum number of speeches sent through the model in one
            generate call. Longer inputs are processed in consecutive chunks.
            Values below 1 are treated as 1.

    Returns:
        List of comma-separated keyword strings, one per input speech and in
        the same order. Entries may be "ERROR: ..." strings when no keywords
        could be parsed. An empty input yields an empty list.
    """
    if not speeches_batch:
        return []

    max_chars = 2000  # long speeches are truncated to keep prompts bounded
    chunk_size = max(1, int(batch_size))
    pad_id = tokenizer.pad_token_id
    results = []

    for start in range(0, len(speeches_batch), chunk_size):
        chunk = speeches_batch[start:start + chunk_size]

        # Build and tokenize one chat-formatted prompt per speech.
        tokenized = []
        for speech in chunk:
            speech_content = speech['content'][:max_chars]
            topic_label = speech.get('groq_topic_label')
            topic_context = f" Konu: '{topic_label}'." if topic_label else ""

            # Turkish prompt: "Extract 10 keywords from the following TBMM
            # speech. List only the keywords, separated by commas."
            prompt = f"""Aşağıdaki TBMM konuşmasından 10 anahtar kelime çıkar. Sadece anahtar kelimeleri virgülle ayrılmış olarak listele.{topic_context}

Konuşma:
{speech_content}

Anahtar kelimeler:"""

            ids = tokenizer.apply_chat_template(
                [{"role": "user", "content": prompt}],
                tokenize=True,
                add_generation_prompt=True,
                return_tensors="pt"
            )
            tokenized.append(ids.squeeze(0))

        # Left-pad to a common length. pad_sequence pads on the right, which
        # puts padding between the prompt and the generated tokens of a
        # decoder-only model, so the batch is assembled by hand. The attention
        # mask comes from the true lengths rather than from comparing against
        # the pad id, because the pad token may be the same as the EOS token.
        max_len = max(ids.size(0) for ids in tokenized)
        input_ids_batch = torch.full((len(tokenized), max_len), pad_id, dtype=torch.long)
        attention_mask = torch.zeros((len(tokenized), max_len), dtype=torch.long)
        for row, ids in enumerate(tokenized):
            offset = max_len - ids.size(0)
            input_ids_batch[row, offset:] = ids
            attention_mask[row, offset:] = 1

        input_ids_batch = input_ids_batch.to(device)
        attention_mask = attention_mask.to(device)

        # Generate for the entire chunk in one call
        with torch.no_grad():
            gen_tokens = model.generate(
                input_ids_batch,
                attention_mask=attention_mask,
                max_new_tokens=50,
                do_sample=False,    # greedy decoding
                pad_token_id=pad_id,
            )

        # Decode only the newly generated tokens; with left padding every row
        # has the same prompt length, so one slice offset fits all rows.
        prompt_len = input_ids_batch.shape[1]
        for sequence in gen_tokens:
            gen_text = tokenizer.decode(sequence[prompt_len:], skip_special_tokens=True)
            results.append(extract_keywords_from_text(gen_text))

    return results

def extract_keywords(speech_content: str, topic_label: str = "", speech_giver: str = "") -> str:
    """
    Extract keywords from one speech by running it as a batch of one.

    Convenience wrapper around extract_keywords_batch(); prefer the batch
    function when several speeches are processed.

    Args:
        speech_content: Full text of the speech.
        topic_label: Optional topic label passed on as 'groq_topic_label'.
        speech_giver: Optional speaker name stored in the speech dictionary.

    Returns:
        The keyword string returned for the single speech.
    """
    single_batch = [
        dict(
            content=speech_content,
            groq_topic_label=topic_label,
            speech_giver=speech_giver,
        )
    ]
    keyword_strings = extract_keywords_batch(single_batch, batch_size=1)
    return keyword_strings[0]

# Smoke test: run the extractor on the first fetched speech, if there is one
if speeches:
    print("\nüß™ Testing keyword extraction with first speech...\n")
    sample = speeches[0]
    print(f"Speech ID: {sample['speech_id']}")
    print(f"Speaker: {sample['speech_giver']}")
    print(f"Topic: {sample.get('groq_topic_label', 'N/A')}")
    print(f"Content preview: {sample['content'][:200]}...\n")

    keywords = extract_keywords(
        sample['content'],
        sample.get('groq_topic_label', ''),
        sample['speech_giver'],
    )
    print(f"Extracted keywords: {keywords}")

## 5. Process All Speeches

In [None]:
def _upload_keyword_rows(es, rows: List[Dict]) -> int:
    """Write the keywords of successfully processed rows back to Elasticsearch.

    Rows whose keywords are missing or start with "ERROR" are skipped, so
    those speeches keep no `keywords` field in the index.

    Returns:
        Number of documents updated successfully.
    """
    from elasticsearch import helpers

    actions = []
    for row in rows:
        keywords = row['keywords']
        if not isinstance(keywords, str) or keywords.startswith('ERROR'):
            continue
        actions.append({
            '_op_type': 'update',
            '_index': ELASTICSEARCH_INDEX,
            '_id': row['speech_id'],
            'doc': {
                'keywords': [k.strip() for k in keywords.split(',') if k.strip()],
                'keywords_str': keywords,
            },
        })

    if not actions:
        return 0

    # With raise_on_error=False per-document failures are returned, not raised.
    success, failed = helpers.bulk(es, actions, raise_on_error=False)
    if failed:
        print(f"\nWarning: failed to update {len(failed)} documents in Elasticsearch")
    return success


def process_all_speeches(
    speeches: List[Dict],
    es: "Elasticsearch" = None,
    batch_size: int = 1,
    upload_every: int = 100,
) -> pd.DataFrame:
    """
    Process all speeches and extract keywords.

    Speeches are sent to extract_keywords_batch() in groups of `batch_size`.
    If a whole group fails, its speeches are retried one by one so a single
    bad speech does not cost the rest of the group. When an Elasticsearch
    client is given, finished results are written back periodically so an
    interrupted run loses little work.

    Args:
        speeches: List of speech dictionaries
        es: Optional Elasticsearch client. When None, nothing is uploaded.
        batch_size: Number of speeches per model call (values below 1 are
            treated as 1). The default of 1 processes speeches individually.
        upload_every: Upload to Elasticsearch once at least this many results
            are waiting. 0 or None uploads only once, at the end.

    Returns:
        DataFrame with the columns speech_id, keywords, speech_giver, year and
        topic_label. Failed speeches have 'ERROR' (or an "ERROR: ..." message)
        in the keywords column.
    """
    results = []
    pending = []  # rows produced since the last successful upload
    batch_size = max(1, int(batch_size))

    print(f"\nüîÑ Processing {len(speeches):,} speeches...")
    print("This will take some time...\n")

    def flush(rows: List[Dict]) -> List[Dict]:
        """Upload `rows`; return the rows that still need uploading."""
        if es is None or not rows:
            return []
        try:
            _upload_keyword_rows(es, rows)
            return []
        except Exception as e:
            # Keep the rows so the next flush retries them; results stay in
            # memory and still end up in the returned DataFrame.
            print(f"\nWarning: upload to Elasticsearch failed, will retry later: {e}")
            return rows

    with tqdm(total=len(speeches), desc="Extracting keywords") as progress:
        for start in range(0, len(speeches), batch_size):
            batch = speeches[start:start + batch_size]

            try:
                batch_keywords = extract_keywords_batch(batch, batch_size=batch_size)
                if len(batch_keywords) != len(batch):
                    raise ValueError(
                        f"expected {len(batch)} results, got {len(batch_keywords)}"
                    )
            except Exception as batch_error:
                if len(batch) > 1:
                    print(f"\nWarning: batch at offset {start} failed ({batch_error}); retrying one by one")
                batch_keywords = []
                for speech in batch:
                    try:
                        if len(batch) == 1:
                            # Already tried on its own; do not repeat the call.
                            raise batch_error
                        batch_keywords.append(extract_keywords(
                            speech['content'],
                            speech.get('groq_topic_label', ''),
                            speech['speech_giver']
                        ))
                    except Exception as e:
                        print(f"\n‚ö†Ô∏è  Error processing speech {speech['speech_id']}: {e}")
                        batch_keywords.append('ERROR')

            for speech, keywords in zip(batch, batch_keywords):
                row = {
                    'speech_id': speech['speech_id'],
                    'keywords': keywords,
                    'speech_giver': speech['speech_giver'],
                    'year': speech.get('year', ''),
                    'topic_label': speech.get('groq_topic_label', '')
                }
                results.append(row)
                pending.append(row)

            progress.update(len(batch))

            if upload_every and len(pending) >= upload_every:
                pending = flush(pending)

    # Upload whatever is left after the last full interval.
    flush(pending)

    df = pd.DataFrame(results)
    print(f"\n‚úÖ Processed {len(df):,} speeches")
    return df

# Run keyword extraction over every fetched speech.
#
# Suggested batch_size by available memory:
#   8GB GPU    -> 4-8
#   16GB GPU   -> 8-16
#   24GB GPU   -> 16-32
#   45GB+ GPU  -> 32-64
#   CPU        -> 1-2
if device == 'cuda':
    batch_size = 32  # 45GB GPU can handle 32-64
else:
    batch_size = 1

# upload_every=100 requests a write-back to Elasticsearch every 100 speeches
results_df = process_all_speeches(speeches, es, batch_size=batch_size, upload_every=100)

## 6. Save Results

In [None]:
# Save to CSV
# Create the target directory first: to_csv() does not create missing parent
# directories, and failing here would discard the results of the whole run.
output_dir = os.path.dirname(OUTPUT_CSV)
if output_dir:
    os.makedirs(output_dir, exist_ok=True)

results_df.to_csv(OUTPUT_CSV, index=False)
print(f"\nüíæ Results saved to: {OUTPUT_CSV}")
print(f"Total rows: {len(results_df):,}")

# Display sample results
print("\nüìä Sample results:")
print(results_df.head(10))

## 7. Statistics and Quality Check

In [None]:
# Summary statistics and a quick quality check of the extracted keywords
total_rows = len(results_df)

print(f"\nüìà Statistics:")
print(f"Total speeches processed: {total_rows:,}")

if total_rows == 0:
    # An empty result has no 'keywords' column and would divide by zero below.
    error_count = 0
    error_or_missing = 0
    print("No speeches were processed; nothing to analyse.")
else:
    # A row counts as failed when its keywords are missing or start with
    # "ERROR". Comparing against the exact string 'ERROR' would miss the
    # "ERROR: ..." messages and overstate the success rate.
    is_error = (
        results_df['keywords'].isna()
        | results_df['keywords'].astype(str).str.startswith('ERROR')
    )
    error_or_missing = int(is_error.sum())
    error_count = error_or_missing

    print(f"Errors: {error_count}")
    print(f"Success rate: {((total_rows - error_count) / total_rows * 100):.2f}%")

    # Sample keywords by topic
    if 'topic_label' in results_df.columns and results_df['topic_label'].notna().any():
        print("\nüìã Sample keywords by topic:")
        for topic in results_df['topic_label'].dropna().unique()[:5]:
            topic_df = results_df[results_df['topic_label'] == topic]
            if len(topic_df) > 0:
                print(f"\n{topic}:")
                print(f"  Sample: {topic_df.iloc[0]['keywords']}")

    # Keyword count distribution. The column is filled for every row, but the
    # summary covers successful rows only so error messages do not count as
    # one-keyword results.
    results_df['keyword_count'] = results_df['keywords'].astype(str).str.split(',').str.len()
    print(f"\nüî¢ Keyword count distribution:")
    print(results_df.loc[~is_error, 'keyword_count'].describe())

## 8. Verification (Keywords Already Uploaded)

In [None]:
def upload_keywords_to_elasticsearch(es: Elasticsearch, results_df: pd.DataFrame):
    """
    Upload extracted keywords back to Elasticsearch.

    Each successful row becomes a partial document update that sets
    `keywords` (list of strings) and `keywords_str` (the original
    comma-separated string). Rows whose keywords are missing or start with
    "ERROR" are skipped, so failed speeches keep no `keywords` field.

    Args:
        es: Elasticsearch client
        results_df: DataFrame with speech_id and keywords columns
    """
    print("\nüíæ Uploading keywords to Elasticsearch...")

    from elasticsearch import helpers

    actions = []
    # An empty DataFrame may have no columns at all, so guard before indexing.
    if not results_df.empty:
        for speech_id, keywords in zip(results_df['speech_id'], results_df['keywords']):
            # "ERROR: ..." messages must not be stored as keywords, and NaN
            # values have no .split().
            if not isinstance(keywords, str) or keywords.startswith('ERROR'):
                continue

            # Convert comma-separated string to list
            keywords_list = [k.strip() for k in keywords.split(',') if k.strip()]
            if not keywords_list:
                continue

            actions.append({
                '_op_type': 'update',
                '_index': ELASTICSEARCH_INDEX,
                '_id': speech_id,
                'doc': {
                    'keywords': keywords_list,
                    'keywords_str': keywords
                }
            })

    # Bulk update. With raise_on_error=False per-document failures come back
    # in `failed` instead of raising.
    if actions:
        success, failed = helpers.bulk(es, actions, raise_on_error=False)
    else:
        success, failed = 0, []

    print(f"‚úÖ Successfully updated {success:,} documents")
    if failed:
        print(f"‚ö†Ô∏è  Failed to update {len(failed)} documents")

# The processing step is expected to have written keywords to Elasticsearch
# already (every 100 speeches). Spot-check the first few results against the index.

if len(results_df) > 0:
    print("\nüîç Verifying keywords in Elasticsearch...\n")

    # Look at the first 3 speeches (or fewer if there are not that many)
    for _, row in results_df.head(3).iterrows():
        speech_id = row['speech_id']

        try:
            doc = es.get(index=ELASTICSEARCH_INDEX, id=speech_id)
            es_keywords = doc['_source'].get('keywords', [])

            print(f"Speech ID: {speech_id}")
            # Long keyword lists are shortened to the first five entries.
            if len(es_keywords) > 5:
                print(f"  Keywords in ES: {es_keywords[:5]}...")
            else:
                print(f"  Keywords in ES: {es_keywords}")
            print(f"  CSV keywords: {row['keywords'][:100]}...\n")
        except Exception as e:
            print(f"‚ö†Ô∏è  Could not verify speech {speech_id}: {e}\n")

    print("‚úÖ Keywords have been uploaded to Elasticsearch during processing!")

## Summary

This notebook:
1. ✅ Loaded the Aya Expanse 8B model with optimized settings
2. ✅ Fetched unprocessed speeches from Elasticsearch (resume mode)
3. ✅ Extracted 10 keywords using batch processing (10-30x faster)
4. ✅ Uploaded keywords to Elasticsearch every 100 speeches
5. ✅ Saved results to CSV with speech_id and keywords columns
6. ✅ Provided statistics and quality checks

**Output file:** `../data/speech_keywords.csv` (path is relative to the notebook's working directory)

**Columns:**
- `speech_id`: Unique identifier for each speech
- `keywords`: Comma-separated list of 10 keywords
- `speech_giver`: Speaker name (for reference)
- `year`: Speech year (for reference)
- `topic_label`: Topic label (for reference)

**Elasticsearch Fields Created:**
- `keywords`: Array of keyword strings
- `keywords_str`: Comma-separated keyword string

**Performance Notes:**
- Uses batch processing (32 speeches at once on 45GB GPU) for 10-30x speedup
- Greedy decoding (do_sample=False) for faster generation
- Left padding for decoder-only architecture compatibility
- Uploads every 100 speeches to prevent data loss
- Resume mode: Re-running skips already processed speeches

**Model Notes:**
- Topic-aware prompting for better keyword relevance
- Long speeches truncated to 2000 characters
- GPU acceleration (FP16) for speed

## 9. Download Results from Colab (Optional)

In [None]:
# Google Colab only: bundle the CSV into a zip archive and send it to the
# browser as a download. Outside Colab the import fails and a hint is printed.

try:
    from google.colab import files
    import zipfile
    import os

    # Name of the archive to create
    zip_filename = 'speech_keywords.zip'

    print(f"üì¶ Creating zip file: {zip_filename}...")

    with zipfile.ZipFile(zip_filename, 'w', zipfile.ZIP_DEFLATED) as archive:
        if not os.path.exists(OUTPUT_CSV):
            print(f"   ‚ö†Ô∏è  File not found: {OUTPUT_CSV}")
        else:
            # Store the file under its base name, without the directory part.
            archive.write(OUTPUT_CSV, os.path.basename(OUTPUT_CSV))
            print(f"   Added: {OUTPUT_CSV}")

    # Archive size in MB
    size_in_bytes = os.path.getsize(zip_filename)
    file_size = size_in_bytes / (1024 * 1024)
    print(f"\n‚úÖ Zip file created: {zip_filename} ({file_size:.2f} MB)")
    print(f"üì• Downloading to your computer...")

    # Hand the archive to the Colab download helper
    files.download(zip_filename)

    print(f"\n‚úÖ Download complete!")
    print(f"   Check your Downloads folder for: {zip_filename}")

except ImportError:
    print("‚ÑπÔ∏è  This cell only works in Google Colab.")
    print(f"   If you're running locally, the CSV is already saved at: {OUTPUT_CSV}")
except Exception as e:
    print(f"‚ùå Error: {e}")