# 05 - Analysis and Visualization

This notebook provides comprehensive analysis and visualization of the trained LDA model.

## Contents
- Topic exploration (word clouds, top words)
- pyLDAvis interactive visualization
- Topic trends over time
- Document similarity
- Topic comparison
- Documents by topic
- Document-topic heatmap
- Saving all visualizations

In [None]:
# Import required libraries
import sys
import pickle
from pathlib import Path

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Put the parent of the current working directory at the front of sys.path
# so the `src.*` imports below resolve (assumes the notebook is run from a
# direct subdirectory of the project root).
project_root = Path.cwd().parent
root_str = str(project_root)
if root_str not in sys.path:
    sys.path.insert(0, root_str)

from src.config import get_settings
from src.lda_model import LDATopicModel
from src.analysis import TopicAnalyzer
from src.visualizations import TopicVisualizer

In [None]:
# Load settings
# get_settings() comes from src.config; the returned object is handed to the
# model, analyzer and visualizer and is also read for data/output paths.
settings = get_settings()

# Load model
# Construct the project's LDATopicModel wrapper and call its load() method.
# NOTE(review): load() takes no arguments here, so the model location is
# presumably resolved from `settings` — confirm in src.lda_model.
print("Loading model...")
model = LDATopicModel(settings)
model.load()

# `model.model` is the wrapped model object; its `num_topics` is reported.
print(f"Loaded model with {model.model.num_topics} topics")

In [None]:
# Read the pickled corpus bundle (presumably written by an earlier notebook).
# NOTE: pickle.load can execute arbitrary code embedded in the file, so only
# point this at corpus files generated by this project itself.
corpus_path = settings.processed_data_dir / settings.processed_corpus_file
with open(corpus_path, 'rb') as corpus_file:
    corpus_data = pickle.load(corpus_file)

# The bundle is indexed by 'documents' and 'dataframe'
processed_docs, df = corpus_data['documents'], corpus_data['dataframe']

print(f"Loaded {len(processed_docs):,} documents")

In [None]:
# Build the analysis helper and attach the document table to it
# (tokens are read from the 'tokens' column)
analyzer = TopicAnalyzer(model, settings)
analyzer.set_document_data(df, tokens_column='tokens')

# The plotting helper is constructed from the same model and settings
visualizer = TopicVisualizer(model, settings)

print("Analyzer and Visualizer initialized")

## 1. Topic Overview

In [None]:
# Display all topics
# Query the model wrapper for its topics with num_words=10.
topics = model.get_topics(num_words=10)

print(f"\n{len(topics)} Topics:")
print("=" * 70)

# One entry per topic: its id followed by a comma-separated word list.
# NOTE(review): 10 words are requested above but only the first 8 entries of
# `top_words` are printed — confirm whether the mismatch is intentional.
for topic in topics:
    print(f"\nTopic {topic.topic_id}: {', '.join(topic.top_words[:8])}")

In [None]:
# Topic prevalence
# The result is kept in `prevalence` because later cells pass it to
# plot_topic_prevalence() and save_all_visualizations().
prevalence = analyzer.compute_topic_prevalence()
# Bare last expression so Jupyter renders the result as the cell output.
prevalence

In [None]:
# Visualize prevalence
# Plot the `prevalence` result computed in the previous cell. The helper's
# return value is kept in `fig`; plt.show() renders the open figure(s).
fig = visualizer.plot_topic_prevalence(prevalence)
plt.show()

## 2. Word Clouds

In [None]:
# Destination for the combined word-cloud image
wc_path = settings.outputs_dir / 'wordclouds_all.png'

# Draw the all-topics word-cloud figure (num_words=40, cols=3)
fig = visualizer.plot_all_wordclouds(num_words=40, cols=3)

# Write the figure to disk, then show it inline
fig.savefig(wc_path, bbox_inches='tight', dpi=150)
plt.show()

print(f"Saved to: {wc_path}")

In [None]:
# Individual topic word cloud
# NOTE(review): valid ids are presumably 0 .. num_topics - 1 — confirm.
TOPIC_TO_VIEW = 0  # Change this to view different topic

# Draw the word cloud for the selected topic with num_words=50.
fig = visualizer.plot_wordcloud(TOPIC_TO_VIEW, num_words=50)
plt.show()

## 3. pyLDAvis Interactive Visualization

In [None]:
# Create and display pyLDAvis
# NOTE(review): `pyLDAvis.gensim_models` is not referenced directly anywhere
# in this notebook — presumably it is needed by
# TopicVisualizer.create_pyldavis; confirm before removing.
import pyLDAvis
import pyLDAvis.gensim_models

# Enable notebook mode
# (called once, before any pyLDAvis output is displayed in later cells)
pyLDAvis.enable_notebook()

In [None]:
# Prepare the pyLDAvis data from the processed documents and export it.
# The output path is defined once so the file that is written and the path
# that is reported cannot drift apart (same pattern as the other save cells).
pyldavis_path = settings.outputs_dir / 'pyldavis.html'

vis_data = visualizer.create_pyldavis(
    processed_docs,
    save_path=pyldavis_path,
)

print(f"\nSaved pyLDAvis to: {pyldavis_path}")

In [None]:
# Display interactive visualization
# Bare last expression: Jupyter renders whatever pyLDAvis.display() returns
# for the `vis_data` prepared in the previous cell.
pyLDAvis.display(vis_data)

## 4. Topic Trends Over Time

In [None]:
# Compute topic trends by year (only possible when the document table has a
# 'year' column; the plotting cell below is guarded by the same check).
if 'year' in df.columns:
    trends = analyzer.compute_topic_trends(date_column='year', freq='Y')

    print("Topic trends computed")
    # Jupyter auto-renders only the last *top-level* expression of a cell.
    # An expression nested inside `if` is silently discarded, so the preview
    # has to be displayed explicitly. `display` is provided by the IPython
    # kernel as a builtin.
    display(trends.head())
else:
    print("No 'year' column in the document data; skipping topic trends")

In [None]:
# Plot the topic trends and write the figure to disk. Guarded by the same
# 'year' check as the cell that creates `trends`.
if 'year' in df.columns:
    trends_path = settings.outputs_dir / 'topic_trends.png'

    # NOTE(review): '_date' is presumably the period column that
    # compute_topic_trends() adds to its result — confirm in src.analysis.
    fig = visualizer.plot_topic_trends(trends, date_column='_date')
    fig.savefig(trends_path, bbox_inches='tight', dpi=150)
    plt.show()

    print(f"Saved to: {trends_path}")

## 5. Document Similarity Search

In [None]:
# Pick the query document for the similarity search and show its title plus
# a preview of its abstract.
DOCUMENT_INDEX = 0  # Change to explore different documents
ABSTRACT_PREVIEW_CHARS = 300  # Maximum number of abstract characters shown

# Positional lookup in the document table
query_doc = df.iloc[DOCUMENT_INDEX]

# Only mark the abstract as truncated when something was actually cut off,
# matching how titles are shortened elsewhere in this notebook.
abstract = query_doc['abstract']
if len(abstract) > ABSTRACT_PREVIEW_CHARS:
    abstract_preview = f"{abstract[:ABSTRACT_PREVIEW_CHARS]}..."
else:
    abstract_preview = abstract

print("Query Document:")
print("=" * 60)
print(f"Title: {query_doc['title']}")
print(f"\nAbstract: {abstract_preview}")

In [None]:
# Retrieve the 5 documents the analyzer ranks as most similar to the query
similar_docs = analyzer.find_similar_documents(DOCUMENT_INDEX, top_n=5)

print("\nMost Similar Documents:")
print("-" * 60)
for i, (_, row) in enumerate(similar_docs.iterrows()):
    print(f"\n{i+1}. (Similarity: {row['similarity']:.3f})")
    # Titles longer than 70 characters are cut and marked with an ellipsis
    if len(row['title']) > 70:
        print(f"   Title: {row['title'][:70]}...")
    else:
        print(f"   Title: {row['title']}")

## 6. Topic Comparison

In [None]:
# Choose the pair of topics to contrast; the overlap-analysis cell that
# follows reads the same two constants.
TOPIC_A, TOPIC_B = 0, 1

# Comparison plot for the two topics, using num_words=12
fig = visualizer.plot_topic_comparison(TOPIC_A, TOPIC_B, num_words=12)
plt.show()

In [None]:
# Analyze topic overlap
# Uses the TOPIC_A / TOPIC_B constants set in the comparison cell, with
# num_words=20.
overlap = analyzer.get_topic_overlap(TOPIC_A, TOPIC_B, num_words=20)

# `overlap` is read as a mapping with the keys 'jaccard_similarity',
# 'shared_words', 'unique_to_a' and 'unique_to_b'.
# NOTE(review): the word collections are sliced below, so they are assumed to
# be ordered sequences (e.g. lists) rather than sets — confirm.
print(f"Topic {TOPIC_A} vs Topic {TOPIC_B} Overlap Analysis:")
print("=" * 50)
print(f"Jaccard Similarity: {overlap['jaccard_similarity']:.3f}")
# At most 10 shared words and 5 unique words per topic are listed.
print(f"\nShared words: {', '.join(overlap['shared_words'][:10])}")
print(f"\nUnique to Topic {TOPIC_A}: {', '.join(overlap['unique_to_a'][:5])}")
print(f"Unique to Topic {TOPIC_B}: {', '.join(overlap['unique_to_b'][:5])}")

In [None]:
# Destination for the topic-distance heatmap image
dist_path = settings.outputs_dir / 'topic_distances.png'

# Compute the topic distance matrix and hand it to the heatmap plotter
distance_matrix = analyzer.compute_topic_distance_matrix()
fig = visualizer.plot_topic_distance_heatmap(distance_matrix)

# Write the figure to disk, then show it inline
fig.savefig(dist_path, bbox_inches='tight', dpi=150)
plt.show()

print(f"Saved to: {dist_path}")

## 7. Documents by Topic

In [None]:
# List the documents the analyzer returns for one topic.
# The constants are named so the query and the printed header stay in sync.
TOPIC_ID = 0
MIN_TOPIC_PROB = 0.3  # passed as min_probability
TOP_N_DOCS = 10       # passed as top_n and shown in the header

topic_docs = analyzer.get_documents_by_topic(
    TOPIC_ID, min_probability=MIN_TOPIC_PROB, top_n=TOP_N_DOCS
)

print(f"\nTop {TOP_N_DOCS} Documents for Topic {TOPIC_ID}:")
print("=" * 60)

# Say so explicitly when nothing came back instead of printing a bare header
if topic_docs.empty:
    print(f"\nNo documents met min_probability={MIN_TOPIC_PROB} for Topic {TOPIC_ID}")

# Column holding this topic's probability in the returned frame
prob_col = f'topic_{TOPIC_ID}_prob'
for i, (_, row) in enumerate(topic_docs.iterrows()):
    print(f"\n{i+1}. (Prob: {row[prob_col]:.3f})")
    title = row['title']
    # Titles longer than 75 characters are cut and marked with an ellipsis
    print(f"   {title[:75]}..." if len(title) > 75 else f"   {title}")

## 8. Document-Topic Heatmap

In [None]:
# Destination for the document-topic heatmap image
heatmap_path = settings.outputs_dir / 'doc_topic_heatmap.png'

# Document-topic matrix for the processed documents (progress output off);
# kept in `topic_matrix` because the save-all cell reuses it.
topic_matrix = model.get_document_topic_matrix(processed_docs, show_progress=False)

# Heatmap limited via num_docs=50
fig = visualizer.plot_document_topic_heatmap(topic_matrix, num_docs=50)

# Write the figure to disk, then show it inline
fig.savefig(heatmap_path, bbox_inches='tight', dpi=150)
plt.show()

print(f"Saved to: {heatmap_path}")

## 9. Save All Visualizations

In [None]:
# Save all visualizations to outputs directory.
# Reuses `processed_docs`, `topic_matrix` and `prevalence` from earlier cells.
print("Saving all visualizations...")

viz_paths = visualizer.save_all_visualizations(
    processed_docs,
    topic_matrix,
    prevalence,
)

# The check-mark emoji was stored as mis-decoded bytes; restored to U+2705.
print("\n✅ All visualizations saved:")
# `viz_paths` is iterated as a mapping of visualization name -> saved path
for name, path in viz_paths.items():
    print(f"   - {name}: {path}")

## Summary

In [None]:
# Final recap: topic count, coherence score, corpus size and output location.
print("\n" + "=" * 60)
print("ANALYSIS AND VISUALIZATION COMPLETE")
print("=" * 60)

# The emoji in these messages were stored as mis-decoded bytes (mojibake);
# they are restored to the intended characters.
print(f"\n📊 Model: {model.model.num_topics} topics")
# NOTE(review): the :.4f format assumes coherence_score is a number, not
# None — confirm that the loaded metadata always carries a score.
print(f"📈 Coherence: {model.metadata.coherence_score:.4f}")
print(f"📚 Documents: {len(processed_docs):,}")

print(f"\n📁 Output files saved to: {settings.outputs_dir}")

# Plain strings: there are no placeholders, so no f-prefix is needed
print("\n🚀 Launch the dashboard with:")
print("   streamlit run dashboard/app.py")