# 04 - LDA Topic Modeling

This notebook trains the LDA (Latent Dirichlet Allocation) topic model.

## Steps
- Load processed corpus
- Create dictionary and corpus
- Find optimal number of topics (coherence-based)
- Train final model
- Evaluate and save model

In [None]:
# Import required libraries
import sys
import pickle
from pathlib import Path

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Treat the parent of the current working directory as the project root and
# put it at the front of the import search path (once), presumably so the
# `src` package imported in this cell can be resolved.
project_root = Path.cwd().parent
if not any(entry == str(project_root) for entry in sys.path):
    sys.path.insert(0, str(project_root))

from src.config import get_settings, ensure_directories
from src.lda_model import LDATopicModel, find_optimal_topics

In [None]:
# Resolve the project settings; ensure_directories presumably creates the
# folders those settings point at (confirm in src.config).
settings = get_settings()
ensure_directories(settings)

# Path of the pickled corpus, built from two settings fields
corpus_path = settings.processed_data_dir / settings.processed_corpus_file
print(f"Loading processed corpus from: {corpus_path}")

# NOTE(review): pickle.load can execute arbitrary code embedded in the file.
# Only load corpus files produced by this project's own pipeline.
with open(corpus_path, 'rb') as corpus_file:
    corpus_data = pickle.load(corpus_file)

# The pickle is a mapping holding the tokenised documents and their DataFrame
processed_docs, df = corpus_data['documents'], corpus_data['dataframe']

print(f"Loaded {len(processed_docs):,} documents")

## 1. Explore Corpus Statistics

In [None]:
# Corpus statistics
# Flatten every document into one token list, then de-duplicate it to get
# the vocabulary actually present in the corpus.
all_tokens = [t for doc in processed_docs for t in doc]
unique_tokens = set(all_tokens)

# Guard the average: an empty corpus would otherwise raise ZeroDivisionError.
num_docs = len(processed_docs)
avg_tokens_per_doc = len(all_tokens) / num_docs if num_docs else 0.0

print("Corpus Statistics:")
print("-" * 40)
print(f"Documents: {num_docs:,}")
print(f"Total tokens: {len(all_tokens):,}")
print(f"Unique tokens: {len(unique_tokens):,}")
print(f"Avg tokens/doc: {avg_tokens_per_doc:.1f}")

## 2. Find Optimal Number of Topics

We'll train one model for each candidate number of topics and compare the models using their coherence scores.

In [None]:
# Candidate topic counts for the search: 5, 7, 9, 11, 13, 15, 17, 19
TOPIC_RANGE = range(5, 21, 2)

# Switch for the coherence search, which can take a long time to run
RUN_TOPIC_SEARCH = True

# Pin the topic count here (e.g. 10) when the optimum is already known;
# leave as None to let the search decide.
FIXED_NUM_TOPICS = None

print(f"Topic range to test: {[*TOPIC_RANGE]}")

In [None]:
# Run the coherence search only when it is enabled AND no topic count has
# been pinned; otherwise fall back to the pinned value or the settings default.
if RUN_TOPIC_SEARCH and FIXED_NUM_TOPICS is None:
    print("Finding optimal number of topics...")
    print("This may take a while...")
    print("-" * 50)

    # Returns the selected topic count plus the per-candidate results that
    # the plotting cell reads ('num_topics' and 'coherence' entries).
    optimal_topics, search_results = find_optimal_topics(
        processed_docs,
        topic_range=TOPIC_RANGE,
        settings=settings,
        show_progress=True,
    )

    print(f"\n✅ Optimal number of topics: {optimal_topics}")
else:
    optimal_topics = FIXED_NUM_TOPICS or settings.lda_num_topics
    # No search was run, so there is nothing to plot or save later
    search_results = None
    print(f"Using fixed number of topics: {optimal_topics}")

In [None]:
# Plot coherence scores
# Skipped entirely when no search was run (search_results is None or empty).
if search_results:
    fig, ax = plt.subplots(figsize=(10, 6))

    # Named `topic_counts` rather than `topics` so this list of integers is
    # not confused with the list of topic objects built later in the notebook.
    topic_counts = [r['num_topics'] for r in search_results]
    coherences = [r['coherence'] for r in search_results]

    ax.plot(topic_counts, coherences, 'o-', markersize=8, linewidth=2)
    # Mark the topic count selected by the search
    ax.axvline(optimal_topics, color='red', linestyle='--',
               label=f'Optimal: {optimal_topics}')

    ax.set_xlabel('Number of Topics')
    ax.set_ylabel('Coherence Score (c_v)')
    ax.set_title('Topic Count vs Coherence Score', fontweight='bold')
    ax.legend()
    ax.grid(True, alpha=0.3)

    plt.tight_layout()
    plt.show()

    # Save results so the search does not have to be repeated to inspect them
    results_df = pd.DataFrame(search_results)
    results_path = settings.outputs_dir / 'topic_coherence_results.csv'
    results_df.to_csv(results_path, index=False)
    print(f"Saved search results to: {results_path}")

## 3. Train Final Model

In [None]:
# Hyper-parameters for the final training run
NUM_TOPICS = optimal_topics  # value chosen by the topic-count cell
PASSES = 15
ITERATIONS = 400
WORKERS = 4  # Number of CPU cores to use

# Echo the configuration, one indented "name: value" line per setting
config_lines = [
    f"  {name}: {value}"
    for name, value in (
        ("Topics", NUM_TOPICS),
        ("Passes", PASSES),
        ("Iterations", ITERATIONS),
        ("Workers", WORKERS),
    )
]
print("Training final model with configuration:")
print("\n".join(config_lines))

In [None]:
# Train model
print("\nTraining LDA model...")
print("-" * 50)

# Build the project wrapper from the shared settings, then fit it on the
# tokenised documents using the configuration constants defined earlier.
model = LDATopicModel(settings)
model.train(
    processed_docs,
    num_topics=NUM_TOPICS,
    passes=PASSES,
    iterations=ITERATIONS,
    workers=WORKERS,
    show_progress=True,
)

print("\n✅ Training complete!")

In [None]:
# Print the metadata recorded on the trained model, if it has any
meta = model.metadata
if meta:
    print("\nModel Metadata:", "=" * 50, sep="\n")
    print(f"Number of topics:    {meta.num_topics}")
    print(f"Number of documents: {meta.num_documents:,}")
    print(f"Vocabulary size:     {meta.vocabulary_size:,}")
    print(f"Coherence score:     {meta.coherence_score:.4f}")
    print(f"Training time:       {meta.training_time_seconds:.1f}s")

## 4. Explore Topics

In [None]:
# Ask the model for its topics, requesting 15 words per topic
topics = model.get_topics(num_words=15)

print(f"\n{len(topics)} Topics Discovered:")
print("=" * 70)

# Only the first ten words of each topic are listed in this overview
for topic in topics:
    print(
        f"\nTopic {topic.topic_id}:",
        f"  {', '.join(topic.top_words[:10])}",
        sep="\n",
    )

In [None]:
# Visualize topic words
# Size the grid from the topics actually returned, so that models with more
# than 10 topics are drawn in full instead of being silently truncated.
max_cols = 5
words_per_panel = 10
n_topics = len(topics)
n_cols = max(1, min(max_cols, n_topics))
n_rows = max(1, (n_topics + n_cols - 1) // n_cols)  # ceiling division

# squeeze=False always returns a 2-D array of Axes, even for a 1x1 grid,
# so the flatten() below is valid for any topic count.
fig, axes = plt.subplots(
    n_rows, n_cols, figsize=(4 * n_cols, 4 * n_rows), squeeze=False
)
axes = axes.flatten()

for i, topic in enumerate(topics):
    # topic.words holds (word, weight) pairs; plot the leading ones
    top_pairs = topic.words[:words_per_panel]
    words = [word for word, _ in top_pairs]
    weights = [weight for _, weight in top_pairs]

    ax = axes[i]
    # tab10 has ten colours; wrap around so later topics reuse the palette
    ax.barh(range(len(words)), weights, color=plt.cm.tab10(i % 10))
    ax.set_yticks(range(len(words)))
    ax.set_yticklabels(words)
    ax.invert_yaxis()  # largest weight at the top
    ax.set_title(f'Topic {topic.topic_id}', fontweight='bold')

# Hide unused subplots
for unused_ax in axes[n_topics:]:
    unused_ax.axis('off')

plt.tight_layout()
plt.show()

## 5. Document-Topic Distribution

In [None]:
# Get topic distribution for all documents
print("Computing document-topic distributions...")
topic_matrix = model.get_document_topic_matrix(processed_docs, show_progress=True)

# Fail fast on a malformed matrix: the following cells index rows by document
# position and columns by range(NUM_TOPICS), so both dimensions must match.
expected_shape = (len(processed_docs), NUM_TOPICS)
assert tuple(topic_matrix.shape) == expected_shape, (
    f"Unexpected document-topic matrix shape {topic_matrix.shape}; "
    f"expected {expected_shape}"
)

print(f"\nMatrix shape: {topic_matrix.shape}")

In [None]:
# Average each column of the document-topic matrix to get one mean
# probability per topic
topic_prevalence = topic_matrix.mean(axis=0)

topic_ids = range(NUM_TOPICS)
fig, ax = plt.subplots(figsize=(12, 6))
bars = ax.bar(
    topic_ids,
    topic_prevalence,
    color=plt.cm.tab10.colors[:NUM_TOPICS],
)
ax.set_title('Topic Prevalence Across Documents', fontweight='bold')
ax.set_xlabel('Topic')
ax.set_ylabel('Average Probability')
ax.set_xticks(topic_ids)  # one tick per topic index

plt.tight_layout()
plt.show()

In [None]:
# Sample document topic assignments
print("\nSample Document-Topic Assignments:")
print("-" * 60)

# Show at most five rows, and never index past the shorter of the metadata
# frame and the document-topic matrix.
sample_count = min(5, len(df), topic_matrix.shape[0])
for i in range(sample_count):
    # The dominant topic is the column with the highest probability in row i
    dominant_topic = np.argmax(topic_matrix[i])
    prob = topic_matrix[i, dominant_topic]
    # str() keeps the slice valid when a title is missing (NaN/None)
    title = str(df['title'].iloc[i])[:60]
    print(f"Doc {i}: Topic {dominant_topic} ({prob:.2f}) - {title}...")

## 6. Save Model

In [None]:
# Save model and artifacts
# save() returns the location it wrote to; the summary cell prints it again.
model_dir = model.save()

print(f"\n✅ Model saved to: {model_dir}")

In [None]:
# Also save topic-document matrix for dashboard
matrix_path = settings.processed_data_dir / settings.topic_document_matrix_file

# Work on a copy so the loaded `df` stays untouched, then add one
# probability column per topic (topic_0, topic_1, ...).
df_topics = df.copy()
for i in range(NUM_TOPICS):
    df_topics[f'topic_{i}'] = topic_matrix[:, i]

# Per-document summary: index of the strongest topic and its probability
df_topics['dominant_topic'] = np.argmax(topic_matrix, axis=1)
df_topics['dominant_prob'] = np.max(topic_matrix, axis=1)

# Remove tokens column (too large for CSV)
if 'tokens' in df_topics.columns:
    df_topics = df_topics.drop(columns=['tokens'])

df_topics.to_csv(matrix_path, index=False)
print(f"✅ Topic-document matrix saved to: {matrix_path}")

## Summary

In [None]:
# Final recap of the run. The metadata-dependent lines are guarded because
# the metadata display cell treats model.metadata as possibly empty.
summary_meta = model.metadata

print("\n" + "=" * 60)
print("LDA MODELING COMPLETE")
print("=" * 60)
print(f"\n📊 Topics: {NUM_TOPICS}")
if summary_meta:
    print(f"📈 Coherence: {summary_meta.coherence_score:.4f}")
print(f"📚 Documents: {len(processed_docs):,}")
if summary_meta:
    print(f"📝 Vocabulary: {summary_meta.vocabulary_size:,}")

print(f"\n📁 Model artifacts saved to: {model_dir}")

print("\n👉 Next: Run 05_analysis_visualization.ipynb for detailed analysis")
print("   Then: Launch the Streamlit dashboard with 'streamlit run dashboard/app.py'")