# 07_topic_modeling.ipynb

**Topic Modeling with Latent Dirichlet Allocation (LDA)**

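Set the number of topics in advance, fit an LDA model on the raw corpus texts, and save the topic-term matrix, the document-topic matrix, and a bar chart of the top words for each topic.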

In [None]:
from pathlib import Path

def setup_project_paths():
    """Locate the project root and derive the corpus and results directories.

    The project root is the parent of the current working directory when
    that directory is named ``codigo``; otherwise it is the working
    directory itself.

    Returns:
        tuple: ``(base_path, input_path, output_path)`` where ``input_path``
        is ``<base>/corpus/tei`` and ``output_path`` is
        ``<base>/resultados/computational-analysis``.
    """
    here = Path.cwd()
    if here.name == 'codigo':
        # Running from inside the code folder: step up to the project root.
        root = here.parent
    else:
        root = here
    tei_dir = root.joinpath('corpus', 'tei')
    results_dir = root.joinpath('resultados', 'computational-analysis')
    return root, tei_dir, results_dir

# Resolve the project directories once, when this cell runs, so that later
# cells can build their input/output paths from these module-level names.
BASE_PATH, INPUT_PATH, OUTPUT_PATH = setup_project_paths()

In [None]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation
import matplotlib.pyplot as plt

# Parameters
n_topics = 5          # number of latent topics to fit
max_features = 1000   # cap on the vocabulary size kept by the vectorizer
n_top_words = 10      # number of highest-weighted words reported per topic

# scikit-learn only ships a built-in stop-word list for 'english'; any other
# string (e.g. 'spanish') is rejected when the vectorizer is fitted. Spanish
# function words are therefore supplied explicitly. This must be a list:
# recent scikit-learn versions validate the parameter type and reject sets.
SPANISH_STOP_WORDS = [
    'de', 'la', 'que', 'el', 'en', 'los', 'del', 'se', 'las', 'por',
    'un', 'para', 'con', 'no', 'una', 'su', 'al', 'lo', 'como', 'más',
    'pero', 'sus', 'le', 'ya', 'este', 'sí', 'porque', 'esta', 'entre',
    'cuando', 'muy', 'sin', 'sobre', 'también', 'me', 'hasta', 'hay',
    'donde', 'quien', 'desde', 'todo', 'nos', 'durante', 'todos', 'uno',
    'les', 'ni', 'contra', 'otros', 'ese', 'eso', 'ante', 'ellos', 'esto',
    'mí', 'antes', 'algunos', 'qué', 'unos', 'yo', 'otro', 'otras', 'otra',
    'él', 'tanto', 'esa', 'estos', 'mucho', 'quienes', 'nada', 'muchos',
    'cual', 'poco', 'ella', 'estar', 'estas', 'algunas', 'algo',
    'nosotros', 'mi', 'mis', 'tú', 'te', 'ti', 'tu', 'tus', 'ellas',
    'es', 'son', 'era', 'eran', 'fue', 'fueron', 'ser', 'ha', 'han',
    'había', 'habían', 'he', 'está', 'están', 'estaba', 'tiene', 'tienen',
]

# Load raw texts; missing text cells become empty strings so the vectorizer
# receives a string for every document.
texts_df = pd.read_csv(OUTPUT_PATH / 'corpus_summary' / 'csv' / 'raw_texts.csv')
documents = texts_df['text'].fillna('').tolist()

# Make sure the output folder exists before anything is written to it;
# otherwise to_csv/savefig fail on a fresh checkout.
extensions_dir = OUTPUT_PATH / 'extensions'
extensions_dir.mkdir(parents=True, exist_ok=True)

# Vectorize text into a document-term count matrix
vectorizer = CountVectorizer(max_features=max_features, stop_words=SPANISH_STOP_WORDS)
X = vectorizer.fit_transform(documents)

# Fit LDA (fixed random_state for reproducible topics)
lda = LatentDirichletAllocation(n_components=n_topics, random_state=42)
lda.fit(X)

# Topic-term matrix: one row per topic, one column per vocabulary term
terms = vectorizer.get_feature_names_out()
topic_term = pd.DataFrame(lda.components_, columns=terms)
topic_term.to_csv(extensions_dir / 'topic_term_matrix.csv', index=False)

# Document-topic matrix: one row per document, one column per topic,
# with the source filename as the first column
doc_topic = pd.DataFrame(lda.transform(X), columns=[f'Topic{i}' for i in range(1, n_topics+1)])
doc_topic.insert(0, 'filename', texts_df['filename'])
doc_topic.to_csv(extensions_dir / 'doc_topic_matrix.csv', index=False)

# Indices of each topic's top words, highest weight first. Computed once and
# shared by the text listing and the plots below.
top_indices_per_topic = [
    topic.argsort()[-n_top_words:][::-1] for topic in lda.components_
]

# Display top words per topic
for idx, top_indices in enumerate(top_indices_per_topic):
    top_words = [terms[i] for i in top_indices]
    print(f"Topic {idx+1}: {', '.join(top_words)}")

# Plot bar chart for each topic's top words
for idx, (topic, top_indices) in enumerate(zip(lda.components_, top_indices_per_topic)):
    values = topic[top_indices]
    words = [terms[i] for i in top_indices]
    plt.figure()
    # Reverse so the highest-weighted word is drawn at the top of the chart.
    plt.barh(words[::-1], values[::-1])
    plt.title(f"Top {n_top_words} Words for Topic {idx+1}")
    plt.xlabel("Word Weight")
    plt.tight_layout()
    plt.savefig(extensions_dir / f'topic_{idx+1}_top_words.png')
    plt.show()